Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: vmalloc: convert vread() to vread_iter()

Having previously laid the foundation for converting vread() to an
iterator function, pull the trigger and do so.

This patch attempts to provide minimal refactoring and to reflect the
existing logic as best we can, for example we continue to zero portions of
memory not read, as before.

Overall, there should be no functional difference other than a performance
improvement in /proc/kcore access to vmalloc regions.

Now that we have eliminated the need for a bounce buffer in read_kcore_iter(),
we dispense with it, and try to write to user memory optimistically but
with faults disabled via copy_page_to_iter_nofault(). We already have
preemption disabled by holding a spin lock. We continue faulting pages in
until the operation is complete.

Additionally, we must account for the fact that at any point a copy may
fail (most likely due to a fault not being able to occur); if this happens,
we exit, indicating fewer bytes retrieved than expected.

[sfr@canb.auug.org.au: fix sparc64 warning]
Link: https://lkml.kernel.org/r/20230320144721.663280c3@canb.auug.org.au
[lstoakes@gmail.com: redo Stephen's sparc build fix]
Link: https://lkml.kernel.org/r/8506cbc667c39205e65a323f750ff9c11a463798.1679566220.git.lstoakes@gmail.com
[akpm@linux-foundation.org: unbreak uio.h includes]
Link: https://lkml.kernel.org/r/941f88bc5ab928e6656e1e2593b91bf0f8c81e1b.1679511146.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Lorenzo Stoakes and committed by
Andrew Morton
4c91c07c 4f80818b

+178 -116
+23 -21
fs/proc/kcore.c
··· 307 307 *i = ALIGN(*i + descsz, 4); 308 308 } 309 309 310 - static ssize_t 311 - read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) 310 + static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) 312 311 { 313 - struct file *file = iocb->ki_filp; 314 - char *buf = file->private_data; 315 312 loff_t *fpos = &iocb->ki_pos; 316 - 317 313 size_t phdrs_offset, notes_offset, data_offset; 318 314 size_t page_offline_frozen = 1; 319 315 size_t phdrs_len, notes_len; ··· 503 507 504 508 switch (m->type) { 505 509 case KCORE_VMALLOC: 506 - vread(buf, (char *)start, tsz); 507 - /* we have to zero-fill user buffer even if no read */ 508 - if (copy_to_iter(buf, tsz, iter) != tsz) { 509 - ret = -EFAULT; 510 - goto out; 510 + { 511 + const char *src = (char *)start; 512 + size_t read = 0, left = tsz; 513 + 514 + /* 515 + * vmalloc uses spinlocks, so we optimistically try to 516 + * read memory. If this fails, fault pages in and try 517 + * again until we are done. 518 + */ 519 + while (true) { 520 + read += vread_iter(iter, src, left); 521 + if (read == tsz) 522 + break; 523 + 524 + src += read; 525 + left -= read; 526 + 527 + if (fault_in_iov_iter_writeable(iter, left)) { 528 + ret = -EFAULT; 529 + goto out; 530 + } 511 531 } 512 532 break; 533 + } 513 534 case KCORE_USER: 514 535 /* User page is handled prior to normal kernel page: */ 515 536 if (copy_to_iter((char *)start, tsz, iter) != tsz) { ··· 595 582 if (ret) 596 583 return ret; 597 584 598 - filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); 599 - if (!filp->private_data) 600 - return -ENOMEM; 601 - 602 585 if (kcore_need_update) 603 586 kcore_update_ram(); 604 587 if (i_size_read(inode) != proc_root_kcore->size) { ··· 605 596 return 0; 606 597 } 607 598 608 - static int release_kcore(struct inode *inode, struct file *file) 609 - { 610 - kfree(file->private_data); 611 - return 0; 612 - } 613 - 614 599 static const struct proc_ops kcore_proc_ops = { 615 600 .proc_read_iter = read_kcore_iter, 
616 601 .proc_open = open_kcore, 617 - .proc_release = release_kcore, 618 602 .proc_lseek = default_llseek, 619 603 }; 620 604
+2 -1
include/linux/vmalloc.h
··· 14 14 15 15 struct vm_area_struct; /* vma defining user mapping in mm_types.h */ 16 16 struct notifier_block; /* in notifier.h */ 17 + struct iov_iter; /* in uio.h */ 17 18 18 19 /* bits in flags of vmalloc's vm_struct below */ 19 20 #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ ··· 248 247 #endif 249 248 250 249 /* for /proc/kcore */ 251 - extern long vread(char *buf, char *addr, unsigned long count); 250 + extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count); 252 251 253 252 /* 254 253 * Internals. Don't use..
+5 -5
mm/nommu.c
··· 36 36 #include <linux/printk.h> 37 37 38 38 #include <linux/uaccess.h> 39 + #include <linux/uio.h> 39 40 #include <asm/tlb.h> 40 41 #include <asm/tlbflush.h> 41 42 #include <asm/mmu_context.h> ··· 199 198 } 200 199 EXPORT_SYMBOL(vmalloc_to_pfn); 201 200 202 - long vread(char *buf, char *addr, unsigned long count) 201 + long vread_iter(struct iov_iter *iter, const char *addr, size_t count) 203 202 { 204 203 /* Don't allow overflow */ 205 - if ((unsigned long) buf + count < count) 206 - count = -(unsigned long) buf; 204 + if ((unsigned long) addr + count < count) 205 + count = -(unsigned long) addr; 207 206 208 - memcpy(buf, addr, count); 209 - return count; 207 + return copy_to_iter(addr, count, iter); 210 208 } 211 209 212 210 /*
+148 -89
mm/vmalloc.c
··· 33 33 #include <linux/compiler.h> 34 34 #include <linux/memcontrol.h> 35 35 #include <linux/llist.h> 36 + #include <linux/uio.h> 36 37 #include <linux/bitops.h> 37 38 #include <linux/rbtree_augmented.h> 38 39 #include <linux/overflow.h> 39 40 #include <linux/pgtable.h> 40 - #include <linux/uaccess.h> 41 41 #include <linux/hugetlb.h> 42 42 #include <linux/sched/mm.h> 43 43 #include <asm/tlbflush.h> ··· 3442 3442 EXPORT_SYMBOL(vmalloc_32_user); 3443 3443 3444 3444 /* 3445 - * small helper routine , copy contents to buf from addr. 3446 - * If the page is not present, fill zero. 3445 + * Atomically zero bytes in the iterator. 3446 + * 3447 + * Returns the number of zeroed bytes. 3447 3448 */ 3448 - 3449 - static int aligned_vread(char *buf, char *addr, unsigned long count) 3449 + static size_t zero_iter(struct iov_iter *iter, size_t count) 3450 3450 { 3451 - struct page *p; 3452 - int copied = 0; 3451 + size_t remains = count; 3453 3452 3454 - while (count) { 3453 + while (remains > 0) { 3454 + size_t num, copied; 3455 + 3456 + num = remains < PAGE_SIZE ? remains : PAGE_SIZE; 3457 + copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter); 3458 + remains -= copied; 3459 + 3460 + if (copied < num) 3461 + break; 3462 + } 3463 + 3464 + return count - remains; 3465 + } 3466 + 3467 + /* 3468 + * small helper routine, copy contents to iter from addr. 3469 + * If the page is not present, fill zero. 3470 + * 3471 + * Returns the number of copied bytes. 
3472 + */ 3473 + static size_t aligned_vread_iter(struct iov_iter *iter, 3474 + const char *addr, size_t count) 3475 + { 3476 + size_t remains = count; 3477 + struct page *page; 3478 + 3479 + while (remains > 0) { 3455 3480 unsigned long offset, length; 3481 + size_t copied = 0; 3456 3482 3457 3483 offset = offset_in_page(addr); 3458 3484 length = PAGE_SIZE - offset; 3459 - if (length > count) 3460 - length = count; 3461 - p = vmalloc_to_page(addr); 3485 + if (length > remains) 3486 + length = remains; 3487 + page = vmalloc_to_page(addr); 3462 3488 /* 3463 - * To do safe access to this _mapped_ area, we need 3464 - * lock. But adding lock here means that we need to add 3465 - * overhead of vmalloc()/vfree() calls for this _debug_ 3466 - * interface, rarely used. Instead of that, we'll use 3467 - * kmap() and get small overhead in this access function. 3489 + * To do safe access to this _mapped_ area, we need lock. But 3490 + * adding lock here means that we need to add overhead of 3491 + * vmalloc()/vfree() calls for this _debug_ interface, rarely 3492 + * used. Instead of that, we'll use an local mapping via 3493 + * copy_page_to_iter_nofault() and accept a small overhead in 3494 + * this access function. 
3468 3495 */ 3469 - if (p) { 3470 - /* We can expect USER0 is not used -- see vread() */ 3471 - void *map = kmap_atomic(p); 3472 - memcpy(buf, map + offset, length); 3473 - kunmap_atomic(map); 3474 - } else 3475 - memset(buf, 0, length); 3496 + if (page) 3497 + copied = copy_page_to_iter_nofault(page, offset, 3498 + length, iter); 3499 + else 3500 + copied = zero_iter(iter, length); 3476 3501 3477 - addr += length; 3478 - buf += length; 3479 - copied += length; 3480 - count -= length; 3502 + addr += copied; 3503 + remains -= copied; 3504 + 3505 + if (copied != length) 3506 + break; 3481 3507 } 3482 - return copied; 3508 + 3509 + return count - remains; 3483 3510 } 3484 3511 3485 - static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags) 3512 + /* 3513 + * Read from a vm_map_ram region of memory. 3514 + * 3515 + * Returns the number of copied bytes. 3516 + */ 3517 + static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr, 3518 + size_t count, unsigned long flags) 3486 3519 { 3487 3520 char *start; 3488 3521 struct vmap_block *vb; 3489 3522 unsigned long offset; 3490 - unsigned int rs, re, n; 3523 + unsigned int rs, re; 3524 + size_t remains, n; 3491 3525 3492 3526 /* 3493 3527 * If it's area created by vm_map_ram() interface directly, but 3494 3528 * not further subdividing and delegating management to vmap_block, 3495 3529 * handle it here. 
3496 3530 */ 3497 - if (!(flags & VMAP_BLOCK)) { 3498 - aligned_vread(buf, addr, count); 3499 - return; 3500 - } 3531 + if (!(flags & VMAP_BLOCK)) 3532 + return aligned_vread_iter(iter, addr, count); 3533 + 3534 + remains = count; 3501 3535 3502 3536 /* 3503 3537 * Area is split into regions and tracked with vmap_block, read out ··· 3539 3505 */ 3540 3506 vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr)); 3541 3507 if (!vb) 3542 - goto finished; 3508 + goto finished_zero; 3543 3509 3544 3510 spin_lock(&vb->lock); 3545 3511 if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { 3546 3512 spin_unlock(&vb->lock); 3547 - goto finished; 3513 + goto finished_zero; 3548 3514 } 3515 + 3549 3516 for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { 3550 - if (!count) 3551 - break; 3517 + size_t copied; 3518 + 3519 + if (remains == 0) 3520 + goto finished; 3521 + 3552 3522 start = vmap_block_vaddr(vb->va->va_start, rs); 3553 - while (addr < start) { 3554 - if (count == 0) 3555 - goto unlock; 3556 - *buf = '\0'; 3557 - buf++; 3558 - addr++; 3559 - count--; 3523 + 3524 + if (addr < start) { 3525 + size_t to_zero = min_t(size_t, start - addr, remains); 3526 + size_t zeroed = zero_iter(iter, to_zero); 3527 + 3528 + addr += zeroed; 3529 + remains -= zeroed; 3530 + 3531 + if (remains == 0 || zeroed != to_zero) 3532 + goto finished; 3560 3533 } 3534 + 3561 3535 /*it could start reading from the middle of used region*/ 3562 3536 offset = offset_in_page(addr); 3563 3537 n = ((re - rs + 1) << PAGE_SHIFT) - offset; 3564 - if (n > count) 3565 - n = count; 3566 - aligned_vread(buf, start+offset, n); 3538 + if (n > remains) 3539 + n = remains; 3567 3540 3568 - buf += n; 3569 - addr += n; 3570 - count -= n; 3541 + copied = aligned_vread_iter(iter, start + offset, n); 3542 + 3543 + addr += copied; 3544 + remains -= copied; 3545 + 3546 + if (copied != n) 3547 + goto finished; 3571 3548 } 3572 - unlock: 3549 + 3573 3550 spin_unlock(&vb->lock); 3574 3551 3575 - finished: 
3552 + finished_zero: 3576 3553 /* zero-fill the left dirty or free regions */ 3577 - if (count) 3578 - memset(buf, 0, count); 3554 + return count - remains + zero_iter(iter, remains); 3555 + finished: 3556 + /* We couldn't copy/zero everything */ 3557 + spin_unlock(&vb->lock); 3558 + return count - remains; 3579 3559 } 3580 3560 3581 3561 /** 3582 - * vread() - read vmalloc area in a safe way. 3583 - * @buf: buffer for reading data 3584 - * @addr: vm address. 3585 - * @count: number of bytes to be read. 3562 + * vread_iter() - read vmalloc area in a safe way to an iterator. 3563 + * @iter: the iterator to which data should be written. 3564 + * @addr: vm address. 3565 + * @count: number of bytes to be read. 3586 3566 * 3587 3567 * This function checks that addr is a valid vmalloc'ed area, and 3588 3568 * copy data from that area to a given buffer. If the given memory range ··· 3616 3568 * (same number as @count) or %0 if [addr...addr+count) doesn't 3617 3569 * include any intersection with valid vmalloc area 3618 3570 */ 3619 - long vread(char *buf, char *addr, unsigned long count) 3571 + long vread_iter(struct iov_iter *iter, const char *addr, size_t count) 3620 3572 { 3621 3573 struct vmap_area *va; 3622 3574 struct vm_struct *vm; 3623 - char *vaddr, *buf_start = buf; 3624 - unsigned long buflen = count; 3625 - unsigned long n, size, flags; 3575 + char *vaddr; 3576 + size_t n, size, flags, remains; 3626 3577 3627 3578 addr = kasan_reset_tag(addr); 3628 3579 ··· 3629 3582 if ((unsigned long) addr + count < count) 3630 3583 count = -(unsigned long) addr; 3631 3584 3585 + remains = count; 3586 + 3632 3587 spin_lock(&vmap_area_lock); 3633 3588 va = find_vmap_area_exceed_addr((unsigned long)addr); 3634 3589 if (!va) 3635 - goto finished; 3590 + goto finished_zero; 3636 3591 3637 3592 /* no intersects with alive vmap_area */ 3638 - if ((unsigned long)addr + count <= va->va_start) 3639 - goto finished; 3593 + if ((unsigned long)addr + remains <= va->va_start) 3594 + 
goto finished_zero; 3640 3595 3641 3596 list_for_each_entry_from(va, &vmap_area_list, list) { 3642 - if (!count) 3643 - break; 3597 + size_t copied; 3598 + 3599 + if (remains == 0) 3600 + goto finished; 3644 3601 3645 3602 vm = va->vm; 3646 3603 flags = va->flags & VMAP_FLAGS_MASK; ··· 3659 3608 3660 3609 if (vm && (vm->flags & VM_UNINITIALIZED)) 3661 3610 continue; 3611 + 3662 3612 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ 3663 3613 smp_rmb(); 3664 3614 ··· 3668 3616 3669 3617 if (addr >= vaddr + size) 3670 3618 continue; 3671 - while (addr < vaddr) { 3672 - if (count == 0) 3619 + 3620 + if (addr < vaddr) { 3621 + size_t to_zero = min_t(size_t, vaddr - addr, remains); 3622 + size_t zeroed = zero_iter(iter, to_zero); 3623 + 3624 + addr += zeroed; 3625 + remains -= zeroed; 3626 + 3627 + if (remains == 0 || zeroed != to_zero) 3673 3628 goto finished; 3674 - *buf = '\0'; 3675 - buf++; 3676 - addr++; 3677 - count--; 3678 3629 } 3630 + 3679 3631 n = vaddr + size - addr; 3680 - if (n > count) 3681 - n = count; 3632 + if (n > remains) 3633 + n = remains; 3682 3634 3683 3635 if (flags & VMAP_RAM) 3684 - vmap_ram_vread(buf, addr, n, flags); 3636 + copied = vmap_ram_vread_iter(iter, addr, n, flags); 3685 3637 else if (!(vm->flags & VM_IOREMAP)) 3686 - aligned_vread(buf, addr, n); 3638 + copied = aligned_vread_iter(iter, addr, n); 3687 3639 else /* IOREMAP area is treated as memory hole */ 3688 - memset(buf, 0, n); 3689 - buf += n; 3690 - addr += n; 3691 - count -= n; 3640 + copied = zero_iter(iter, n); 3641 + 3642 + addr += copied; 3643 + remains -= copied; 3644 + 3645 + if (copied != n) 3646 + goto finished; 3692 3647 } 3648 + 3649 + finished_zero: 3650 + spin_unlock(&vmap_area_lock); 3651 + /* zero-fill memory holes */ 3652 + return count - remains + zero_iter(iter, remains); 3693 3653 finished: 3654 + /* Nothing remains, or We couldn't copy/zero everything. 
*/ 3694 3655 spin_unlock(&vmap_area_lock); 3695 3656 3696 - if (buf == buf_start) 3697 - return 0; 3698 - /* zero-fill memory holes */ 3699 - if (buf != buf_start + buflen) 3700 - memset(buf, 0, buflen - (buf - buf_start)); 3701 - 3702 - return buflen; 3657 + return count - remains; 3703 3658 } 3704 3659 3705 3660 /**