Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 vdso updates from Ingo Molnar:
"The main changes in this cycle centered around adding support for
32-bit compatible C/R of the vDSO on 64-bit kernels, by Dmitry
Safonov"

* 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/vdso: Use CONFIG_X86_X32_ABI to enable vdso prctl
x86/vdso: Only define map_vdso_randomized() if CONFIG_X86_64
x86/vdso: Only define prctl_map_vdso() if CONFIG_CHECKPOINT_RESTORE
x86/signal: Add SA_{X32,IA32}_ABI sa_flags
x86/ptrace: Down with test_thread_flag(TIF_IA32)
x86/coredump: Use pr_reg size, rather than TIF_IA32 flag
x86/arch_prctl/vdso: Add ARCH_MAP_VDSO_*
x86/vdso: Replace calculate_addr in map_vdso() with addr
x86/vdso: Unmap vdso blob on vvar mapping failure

+220 -106
+102 -73
arch/x86/entry/vdso/vma.c
··· 37 37 38 38 struct linux_binprm; 39 39 40 - /* 41 - * Put the vdso above the (randomized) stack with another randomized 42 - * offset. This way there is no hole in the middle of address space. 43 - * To save memory make sure it is still in the same PTE as the stack 44 - * top. This doesn't give that many random bits. 45 - * 46 - * Note that this algorithm is imperfect: the distribution of the vdso 47 - * start address within a PMD is biased toward the end. 48 - * 49 - * Only used for the 64-bit and x32 vdsos. 50 - */ 51 - static unsigned long vdso_addr(unsigned long start, unsigned len) 52 - { 53 - #ifdef CONFIG_X86_32 54 - return 0; 55 - #else 56 - unsigned long addr, end; 57 - unsigned offset; 58 - 59 - /* 60 - * Round up the start address. It can start out unaligned as a result 61 - * of stack start randomization. 62 - */ 63 - start = PAGE_ALIGN(start); 64 - 65 - /* Round the lowest possible end address up to a PMD boundary. */ 66 - end = (start + len + PMD_SIZE - 1) & PMD_MASK; 67 - if (end >= TASK_SIZE_MAX) 68 - end = TASK_SIZE_MAX; 69 - end -= len; 70 - 71 - if (end > start) { 72 - offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); 73 - addr = start + (offset << PAGE_SHIFT); 74 - } else { 75 - addr = start; 76 - } 77 - 78 - /* 79 - * Forcibly align the final address in case we have a hardware 80 - * issue that requires alignment for performance reasons. 
81 - */ 82 - addr = align_vdso_addr(addr); 83 - 84 - return addr; 85 - #endif 86 - } 87 - 88 40 static int vdso_fault(const struct vm_special_mapping *sm, 89 41 struct vm_area_struct *vma, struct vm_fault *vmf) 90 42 { ··· 128 176 return VM_FAULT_SIGBUS; 129 177 } 130 178 131 - static int map_vdso(const struct vdso_image *image, bool calculate_addr) 179 + static const struct vm_special_mapping vdso_mapping = { 180 + .name = "[vdso]", 181 + .fault = vdso_fault, 182 + .mremap = vdso_mremap, 183 + }; 184 + static const struct vm_special_mapping vvar_mapping = { 185 + .name = "[vvar]", 186 + .fault = vvar_fault, 187 + }; 188 + 189 + /* 190 + * Add vdso and vvar mappings to current process. 191 + * @image - blob to map 192 + * @addr - request a specific address (zero to map at free addr) 193 + */ 194 + static int map_vdso(const struct vdso_image *image, unsigned long addr) 132 195 { 133 196 struct mm_struct *mm = current->mm; 134 197 struct vm_area_struct *vma; 135 - unsigned long addr, text_start; 198 + unsigned long text_start; 136 199 int ret = 0; 137 - 138 - static const struct vm_special_mapping vdso_mapping = { 139 - .name = "[vdso]", 140 - .fault = vdso_fault, 141 - .mremap = vdso_mremap, 142 - }; 143 - static const struct vm_special_mapping vvar_mapping = { 144 - .name = "[vvar]", 145 - .fault = vvar_fault, 146 - }; 147 - 148 - if (calculate_addr) { 149 - addr = vdso_addr(current->mm->start_stack, 150 - image->size - image->sym_vvar_start); 151 - } else { 152 - addr = 0; 153 - } 154 200 155 201 if (down_write_killable(&mm->mmap_sem)) 156 202 return -EINTR; ··· 188 238 189 239 if (IS_ERR(vma)) { 190 240 ret = PTR_ERR(vma); 191 - goto up_fail; 241 + do_munmap(mm, text_start, image->size); 192 242 } 193 243 194 244 up_fail: 195 - if (ret) 245 + if (ret) { 196 246 current->mm->context.vdso = NULL; 247 + current->mm->context.vdso_image = NULL; 248 + } 197 249 198 250 up_write(&mm->mmap_sem); 199 251 return ret; 252 + } 253 + 254 + #ifdef CONFIG_X86_64 255 + /* 256 + 
* Put the vdso above the (randomized) stack with another randomized 257 + * offset. This way there is no hole in the middle of address space. 258 + * To save memory make sure it is still in the same PTE as the stack 259 + * top. This doesn't give that many random bits. 260 + * 261 + * Note that this algorithm is imperfect: the distribution of the vdso 262 + * start address within a PMD is biased toward the end. 263 + * 264 + * Only used for the 64-bit and x32 vdsos. 265 + */ 266 + static unsigned long vdso_addr(unsigned long start, unsigned len) 267 + { 268 + unsigned long addr, end; 269 + unsigned offset; 270 + 271 + /* 272 + * Round up the start address. It can start out unaligned as a result 273 + * of stack start randomization. 274 + */ 275 + start = PAGE_ALIGN(start); 276 + 277 + /* Round the lowest possible end address up to a PMD boundary. */ 278 + end = (start + len + PMD_SIZE - 1) & PMD_MASK; 279 + if (end >= TASK_SIZE_MAX) 280 + end = TASK_SIZE_MAX; 281 + end -= len; 282 + 283 + if (end > start) { 284 + offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); 285 + addr = start + (offset << PAGE_SHIFT); 286 + } else { 287 + addr = start; 288 + } 289 + 290 + /* 291 + * Forcibly align the final address in case we have a hardware 292 + * issue that requires alignment for performance reasons. 
293 + */ 294 + addr = align_vdso_addr(addr); 295 + 296 + return addr; 297 + } 298 + 299 + static int map_vdso_randomized(const struct vdso_image *image) 300 + { 301 + unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); 302 + 303 + return map_vdso(image, addr); 304 + } 305 + #endif 306 + 307 + int map_vdso_once(const struct vdso_image *image, unsigned long addr) 308 + { 309 + struct mm_struct *mm = current->mm; 310 + struct vm_area_struct *vma; 311 + 312 + down_write(&mm->mmap_sem); 313 + /* 314 + * Check if we have already mapped vdso blob - fail to prevent 315 + * abusing from userspace install_speciall_mapping, which may 316 + * not do accounting and rlimit right. 317 + * We could search vma near context.vdso, but it's a slowpath, 318 + * so let's explicitely check all VMAs to be completely sure. 319 + */ 320 + for (vma = mm->mmap; vma; vma = vma->vm_next) { 321 + if (vma_is_special_mapping(vma, &vdso_mapping) || 322 + vma_is_special_mapping(vma, &vvar_mapping)) { 323 + up_write(&mm->mmap_sem); 324 + return -EEXIST; 325 + } 326 + } 327 + up_write(&mm->mmap_sem); 328 + 329 + return map_vdso(image, addr); 200 330 } 201 331 202 332 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) ··· 285 255 if (vdso32_enabled != 1) /* Other values all mean "disabled" */ 286 256 return 0; 287 257 288 - return map_vdso(&vdso_image_32, false); 258 + return map_vdso(&vdso_image_32, 0); 289 259 } 290 260 #endif 291 261 ··· 295 265 if (!vdso64_enabled) 296 266 return 0; 297 267 298 - return map_vdso(&vdso_image_64, true); 268 + return map_vdso_randomized(&vdso_image_64); 299 269 } 300 270 301 271 #ifdef CONFIG_COMPAT ··· 306 276 if (test_thread_flag(TIF_X32)) { 307 277 if (!vdso64_enabled) 308 278 return 0; 309 - 310 - return map_vdso(&vdso_image_x32, true); 279 + return map_vdso_randomized(&vdso_image_x32); 311 280 } 312 281 #endif 313 282 #ifdef CONFIG_IA32_EMULATION
+1 -1
arch/x86/ia32/ia32_signal.c
··· 378 378 put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); 379 379 } put_user_catch(err); 380 380 381 - err |= copy_siginfo_to_user32(&frame->info, &ksig->info); 381 + err |= __copy_siginfo_to_user32(&frame->info, &ksig->info, false); 382 382 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, 383 383 regs, set->sig[0]); 384 384 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+4 -4
arch/x86/include/asm/compat.h
··· 275 275 #ifdef CONFIG_X86_X32_ABI 276 276 typedef struct user_regs_struct compat_elf_gregset_t; 277 277 278 - #define PR_REG_SIZE(S) (test_thread_flag(TIF_IA32) ? 68 : 216) 279 - #define PRSTATUS_SIZE(S) (test_thread_flag(TIF_IA32) ? 144 : 296) 280 - #define SET_PR_FPVALID(S,V) \ 281 - do { *(int *) (((void *) &((S)->pr_reg)) + PR_REG_SIZE(0)) = (V); } \ 278 + /* Full regset -- prstatus on x32, otherwise on ia32 */ 279 + #define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296) 280 + #define SET_PR_FPVALID(S, V, R) \ 281 + do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \ 282 282 while (0) 283 283 284 284 #define COMPAT_USE_64BIT_TIME \
+6
arch/x86/include/asm/fpu/signal.h
··· 19 19 # define ia32_setup_rt_frame __setup_rt_frame 20 20 #endif 21 21 22 + #ifdef CONFIG_COMPAT 23 + int __copy_siginfo_to_user32(compat_siginfo_t __user *to, 24 + const siginfo_t *from, bool x32_ABI); 25 + #endif 26 + 27 + 22 28 extern void convert_from_fxsr(struct user_i387_ia32_struct *env, 23 29 struct task_struct *tsk); 24 30 extern void convert_to_fxsr(struct task_struct *tsk,
+4
arch/x86/include/asm/signal.h
··· 23 23 unsigned long sig[_NSIG_WORDS]; 24 24 } sigset_t; 25 25 26 + /* non-uapi in-kernel SA_FLAGS for those indicates ABI for a signal frame */ 27 + #define SA_IA32_ABI 0x02000000u 28 + #define SA_X32_ABI 0x01000000u 29 + 26 30 #ifndef CONFIG_COMPAT 27 31 typedef sigset_t compat_sigset_t; 28 32 #endif
+2
arch/x86/include/asm/vdso.h
··· 41 41 42 42 extern void __init init_vdso_image(const struct vdso_image *image); 43 43 44 + extern int map_vdso_once(const struct vdso_image *image, unsigned long addr); 45 + 44 46 #endif /* __ASSEMBLER__ */ 45 47 46 48 #endif /* _ASM_X86_VDSO_H */
+6
arch/x86/include/uapi/asm/prctl.h
··· 6 6 #define ARCH_GET_FS 0x1003 7 7 #define ARCH_GET_GS 0x1004 8 8 9 + #ifdef CONFIG_CHECKPOINT_RESTORE 10 + # define ARCH_MAP_VDSO_X32 0x2001 11 + # define ARCH_MAP_VDSO_32 0x2002 12 + # define ARCH_MAP_VDSO_64 0x2003 13 + #endif 14 + 9 15 #endif /* _ASM_X86_PRCTL_H */
+27
arch/x86/kernel/process_64.c
··· 49 49 #include <asm/debugreg.h> 50 50 #include <asm/switch_to.h> 51 51 #include <asm/xen/hypervisor.h> 52 + #include <asm/vdso.h> 52 53 53 54 __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); 54 55 ··· 524 523 } 525 524 EXPORT_SYMBOL_GPL(set_personality_ia32); 526 525 526 + #ifdef CONFIG_CHECKPOINT_RESTORE 527 + static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) 528 + { 529 + int ret; 530 + 531 + ret = map_vdso_once(image, addr); 532 + if (ret) 533 + return ret; 534 + 535 + return (long)image->size; 536 + } 537 + #endif 538 + 527 539 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) 528 540 { 529 541 int ret = 0; ··· 589 575 ret = put_user(base, (unsigned long __user *)addr); 590 576 break; 591 577 } 578 + 579 + #ifdef CONFIG_CHECKPOINT_RESTORE 580 + # ifdef CONFIG_X86_X32_ABI 581 + case ARCH_MAP_VDSO_X32: 582 + return prctl_map_vdso(&vdso_image_x32, addr); 583 + # endif 584 + # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 585 + case ARCH_MAP_VDSO_32: 586 + return prctl_map_vdso(&vdso_image_32, addr); 587 + # endif 588 + case ARCH_MAP_VDSO_64: 589 + return prctl_map_vdso(&vdso_image_64, addr); 590 + #endif 592 591 593 592 default: 594 593 ret = -EINVAL;
+1 -1
arch/x86/kernel/ptrace.c
··· 1358 1358 const struct user_regset_view *task_user_regset_view(struct task_struct *task) 1359 1359 { 1360 1360 #ifdef CONFIG_IA32_EMULATION 1361 - if (test_tsk_thread_flag(task, TIF_IA32)) 1361 + if (!user_64bit_mode(task_pt_regs(task))) 1362 1362 #endif 1363 1363 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 1364 1364 return &user_x86_32_view;
+11 -9
arch/x86/kernel/signal.c
··· 42 42 #include <asm/syscalls.h> 43 43 44 44 #include <asm/sigframe.h> 45 + #include <asm/signal.h> 45 46 46 47 #define COPY(x) do { \ 47 48 get_user_ex(regs->x, &sc->x); \ ··· 548 547 return -EFAULT; 549 548 550 549 if (ksig->ka.sa.sa_flags & SA_SIGINFO) { 551 - if (copy_siginfo_to_user32(&frame->info, &ksig->info)) 550 + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) 552 551 return -EFAULT; 553 552 } 554 553 ··· 661 660 return 0; 662 661 } 663 662 664 - static inline int is_ia32_compat_frame(void) 663 + static inline int is_ia32_compat_frame(struct ksignal *ksig) 665 664 { 666 665 return IS_ENABLED(CONFIG_IA32_EMULATION) && 667 - test_thread_flag(TIF_IA32); 666 + ksig->ka.sa.sa_flags & SA_IA32_ABI; 668 667 } 669 668 670 - static inline int is_ia32_frame(void) 669 + static inline int is_ia32_frame(struct ksignal *ksig) 671 670 { 672 - return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(); 671 + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); 673 672 } 674 673 675 - static inline int is_x32_frame(void) 674 + static inline int is_x32_frame(struct ksignal *ksig) 676 675 { 677 - return IS_ENABLED(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); 676 + return IS_ENABLED(CONFIG_X86_X32_ABI) && 677 + ksig->ka.sa.sa_flags & SA_X32_ABI; 678 678 } 679 679 680 680 static int ··· 686 684 compat_sigset_t *cset = (compat_sigset_t *) set; 687 685 688 686 /* Set up the stack frame */ 689 - if (is_ia32_frame()) { 687 + if (is_ia32_frame(ksig)) { 690 688 if (ksig->ka.sa.sa_flags & SA_SIGINFO) 691 689 return ia32_setup_rt_frame(usig, ksig, cset, regs); 692 690 else 693 691 return ia32_setup_frame(usig, ksig, cset, regs); 694 - } else if (is_x32_frame()) { 692 + } else if (is_x32_frame(ksig)) { 695 693 return x32_setup_rt_frame(ksig, cset, regs); 696 694 } else { 697 695 return __setup_rt_frame(ksig->sig, ksig, set, regs);
+31 -3
arch/x86/kernel/signal_compat.c
··· 1 1 #include <linux/compat.h> 2 2 #include <linux/uaccess.h> 3 + #include <linux/ptrace.h> 3 4 4 5 /* 5 6 * The compat_siginfo_t structure and handing code is very easy ··· 93 92 /* any new si_fields should be added here */ 94 93 } 95 94 96 - int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) 95 + void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) 96 + { 97 + /* Don't leak in-kernel non-uapi flags to user-space */ 98 + if (oact) 99 + oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); 100 + 101 + if (!act) 102 + return; 103 + 104 + /* Don't let flags to be set from userspace */ 105 + act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); 106 + 107 + if (user_64bit_mode(current_pt_regs())) 108 + return; 109 + 110 + if (in_ia32_syscall()) 111 + act->sa.sa_flags |= SA_IA32_ABI; 112 + if (in_x32_syscall()) 113 + act->sa.sa_flags |= SA_X32_ABI; 114 + } 115 + 116 + int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, 117 + bool x32_ABI) 97 118 { 98 119 int err = 0; 99 - bool ia32 = test_thread_flag(TIF_IA32); 100 120 101 121 signal_compat_build_tests(); 102 122 ··· 168 146 put_user_ex(from->si_arch, &to->si_arch); 169 147 break; 170 148 case __SI_CHLD >> 16: 171 - if (ia32) { 149 + if (!x32_ABI) { 172 150 put_user_ex(from->si_utime, &to->si_utime); 173 151 put_user_ex(from->si_stime, &to->si_stime); 174 152 } else { ··· 200 178 } put_user_catch(err); 201 179 202 180 return err; 181 + } 182 + 183 + /* from syscall's path, where we know the ABI */ 184 + int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) 185 + { 186 + return __copy_siginfo_to_user32(to, from, in_x32_syscall()); 203 187 } 204 188 205 189 int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
+8 -15
fs/binfmt_elf.c
··· 1624 1624 regset->writeback(task, regset, 1); 1625 1625 } 1626 1626 1627 - #ifndef PR_REG_SIZE 1628 - #define PR_REG_SIZE(S) sizeof(S) 1629 - #endif 1630 - 1631 1627 #ifndef PRSTATUS_SIZE 1632 - #define PRSTATUS_SIZE(S) sizeof(S) 1633 - #endif 1634 - 1635 - #ifndef PR_REG_PTR 1636 - #define PR_REG_PTR(S) (&((S)->pr_reg)) 1628 + #define PRSTATUS_SIZE(S, R) sizeof(S) 1637 1629 #endif 1638 1630 1639 1631 #ifndef SET_PR_FPVALID 1640 - #define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V)) 1632 + #define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V)) 1641 1633 #endif 1642 1634 1643 1635 static int fill_thread_core_info(struct elf_thread_core_info *t, ··· 1637 1645 long signr, size_t *total) 1638 1646 { 1639 1647 unsigned int i; 1648 + unsigned int regset_size = view->regsets[0].n * view->regsets[0].size; 1640 1649 1641 1650 /* 1642 1651 * NT_PRSTATUS is the one special case, because the regset data ··· 1646 1653 * We assume that regset 0 is NT_PRSTATUS. 1647 1654 */ 1648 1655 fill_prstatus(&t->prstatus, t->task, signr); 1649 - (void) view->regsets[0].get(t->task, &view->regsets[0], 1650 - 0, PR_REG_SIZE(t->prstatus.pr_reg), 1651 - PR_REG_PTR(&t->prstatus), NULL); 1656 + (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size, 1657 + &t->prstatus.pr_reg, NULL); 1652 1658 1653 1659 fill_note(&t->notes[0], "CORE", NT_PRSTATUS, 1654 - PRSTATUS_SIZE(t->prstatus), &t->prstatus); 1660 + PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus); 1655 1661 *total += notesize(&t->notes[0]); 1656 1662 1657 1663 do_thread_regset_writeback(t->task, &view->regsets[0]); ··· 1680 1688 regset->core_note_type, 1681 1689 size, data); 1682 1690 else { 1683 - SET_PR_FPVALID(&t->prstatus, 1); 1691 + SET_PR_FPVALID(&t->prstatus, 1692 + 1, regset_size); 1684 1693 fill_note(&t->notes[i], "CORE", 1685 1694 NT_PRFPREG, size, data); 1686 1695 }
+2
include/linux/mm.h
··· 2019 2019 extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); 2020 2020 extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); 2021 2021 2022 + extern bool vma_is_special_mapping(const struct vm_area_struct *vma, 2023 + const struct vm_special_mapping *sm); 2022 2024 extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, 2023 2025 unsigned long addr, unsigned long len, 2024 2026 unsigned long flags,
+7
kernel/signal.c
··· 3044 3044 } 3045 3045 EXPORT_SYMBOL(kernel_sigaction); 3046 3046 3047 + void __weak sigaction_compat_abi(struct k_sigaction *act, 3048 + struct k_sigaction *oact) 3049 + { 3050 + } 3051 + 3047 3052 int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 3048 3053 { 3049 3054 struct task_struct *p = current, *t; ··· 3063 3058 spin_lock_irq(&p->sighand->siglock); 3064 3059 if (oact) 3065 3060 *oact = *k; 3061 + 3062 + sigaction_compat_abi(act, oact); 3066 3063 3067 3064 if (act) { 3068 3065 sigdelsetmask(&act->sa.sa_mask,
+8
mm/mmap.c
··· 3068 3068 return ERR_PTR(ret); 3069 3069 } 3070 3070 3071 + bool vma_is_special_mapping(const struct vm_area_struct *vma, 3072 + const struct vm_special_mapping *sm) 3073 + { 3074 + return vma->vm_private_data == sm && 3075 + (vma->vm_ops == &special_mapping_vmops || 3076 + vma->vm_ops == &legacy_special_mapping_vmops); 3077 + } 3078 + 3071 3079 /* 3072 3080 * Called with mm->mmap_sem held for writing. 3073 3081 * Insert a new vma covering the given region, with the given flags.