fs/exec.c at v4.20-rc4 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / exec.c
at v4.20-rc4 2001 lines 48 kB view raw
   1/*
   2 *  linux/fs/exec.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 * #!-checking implemented by tytso.
   9 */
  10/*
  11 * Demand-loading implemented 01.12.91 - no need to read anything but
  12 * the header into memory. The inode of the executable is put into
  13 * "current->executable", and page faults do the actual loading. Clean.
  14 *
  15 * Once more I can proudly say that linux stood up to being changed: it
  16 * was less than 2 hours work to get demand-loading completely implemented.
  17 *
  18 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
  19 * current->executable is only used by the procfs.  This allows a dispatch
  20 * table to check for several different types  of binary formats.  We keep
  21 * trying until we recognize the file or we run out of supported binary
  22 * formats.
  23 */
  24
  25#include <linux/slab.h>
  26#include <linux/file.h>
  27#include <linux/fdtable.h>
  28#include <linux/mm.h>
  29#include <linux/vmacache.h>
  30#include <linux/stat.h>
  31#include <linux/fcntl.h>
  32#include <linux/swap.h>
  33#include <linux/string.h>
  34#include <linux/init.h>
  35#include <linux/sched/mm.h>
  36#include <linux/sched/coredump.h>
  37#include <linux/sched/signal.h>
  38#include <linux/sched/numa_balancing.h>
  39#include <linux/sched/task.h>
  40#include <linux/pagemap.h>
  41#include <linux/perf_event.h>
  42#include <linux/highmem.h>
  43#include <linux/spinlock.h>
  44#include <linux/key.h>
  45#include <linux/personality.h>
  46#include <linux/binfmts.h>
  47#include <linux/utsname.h>
  48#include <linux/pid_namespace.h>
  49#include <linux/module.h>
  50#include <linux/namei.h>
  51#include <linux/mount.h>
  52#include <linux/security.h>
  53#include <linux/syscalls.h>
  54#include <linux/tsacct_kern.h>
  55#include <linux/cn_proc.h>
  56#include <linux/audit.h>
  57#include <linux/tracehook.h>
  58#include <linux/kmod.h>
  59#include <linux/fsnotify.h>
  60#include <linux/fs_struct.h>
  61#include <linux/pipe_fs_i.h>
  62#include <linux/oom.h>
  63#include <linux/compat.h>
  64#include <linux/vmalloc.h>
  65#include <linux/freezer.h>
  66
  67#include <linux/uaccess.h>
  68#include <asm/mmu_context.h>
  69#include <asm/tlb.h>
  70
  71#include <trace/events/task.h>
  72#include "internal.h"
  73
  74#include <trace/events/sched.h>
  75
/*
 * NOTE(review): presumably the core-dump policy knob for set-uid style
 * processes (fs.suid_dumpable) -- its users are not visible here; confirm.
 */
int suid_dumpable = 0;

/* Registered binary format handlers; walked and modified under binfmt_lock. */
static LIST_HEAD(formats);
/* Readers walk 'formats' under the read lock; (un)registration takes it for write. */
static DEFINE_RWLOCK(binfmt_lock);
  80
  81void __register_binfmt(struct linux_binfmt * fmt, int insert)
  82{
  83	BUG_ON(!fmt);
  84	if (WARN_ON(!fmt->load_binary))
  85		return;
  86	write_lock(&binfmt_lock);
  87	insert ? list_add(&fmt->lh, &formats) :
  88		 list_add_tail(&fmt->lh, &formats);
  89	write_unlock(&binfmt_lock);
  90}
  91
  92EXPORT_SYMBOL(__register_binfmt);
  93
  94void unregister_binfmt(struct linux_binfmt * fmt)
  95{
  96	write_lock(&binfmt_lock);
  97	list_del(&fmt->lh);
  98	write_unlock(&binfmt_lock);
  99}
 100
 101EXPORT_SYMBOL(unregister_binfmt);
 102
 103static inline void put_binfmt(struct linux_binfmt * fmt)
 104{
 105	module_put(fmt->module);
 106}
 107
 108bool path_noexec(const struct path *path)
 109{
 110	return (path->mnt->mnt_flags & MNT_NOEXEC) ||
 111	       (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
 112}
 113
 114#ifdef CONFIG_USELIB
 115/*
 116 * Note that a shared library must be both readable and executable due to
 117 * security reasons.
 118 *
 119 * Also note that we take the address to load from from the file itself.
 120 */
 121SYSCALL_DEFINE1(uselib, const char __user *, library)
 122{
 123	struct linux_binfmt *fmt;
 124	struct file *file;
 125	struct filename *tmp = getname(library);
 126	int error = PTR_ERR(tmp);
 127	static const struct open_flags uselib_flags = {
 128		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 129		.acc_mode = MAY_READ | MAY_EXEC,
 130		.intent = LOOKUP_OPEN,
 131		.lookup_flags = LOOKUP_FOLLOW,
 132	};
 133
 134	if (IS_ERR(tmp))
 135		goto out;
 136
 137	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
 138	putname(tmp);
 139	error = PTR_ERR(file);
 140	if (IS_ERR(file))
 141		goto out;
 142
 143	error = -EINVAL;
 144	if (!S_ISREG(file_inode(file)->i_mode))
 145		goto exit;
 146
 147	error = -EACCES;
 148	if (path_noexec(&file->f_path))
 149		goto exit;
 150
 151	fsnotify_open(file);
 152
 153	error = -ENOEXEC;
 154
 155	read_lock(&binfmt_lock);
 156	list_for_each_entry(fmt, &formats, lh) {
 157		if (!fmt->load_shlib)
 158			continue;
 159		if (!try_module_get(fmt->module))
 160			continue;
 161		read_unlock(&binfmt_lock);
 162		error = fmt->load_shlib(file);
 163		read_lock(&binfmt_lock);
 164		put_binfmt(fmt);
 165		if (error != -ENOEXEC)
 166			break;
 167	}
 168	read_unlock(&binfmt_lock);
 169exit:
 170	fput(file);
 171out:
 172  	return error;
 173}
 174#endif /* #ifdef CONFIG_USELIB */
 175
 176#ifdef CONFIG_MMU
 177/*
 178 * The nascent bprm->mm is not visible until exec_mmap() but it can
 179 * use a lot of memory, account these pages in current->mm temporary
 180 * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
 181 * change the counter back via acct_arg_size(0).
 182 */
 183static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 184{
 185	struct mm_struct *mm = current->mm;
 186	long diff = (long)(pages - bprm->vma_pages);
 187
 188	if (!mm || !diff)
 189		return;
 190
 191	bprm->vma_pages = pages;
 192	add_mm_counter(mm, MM_ANONPAGES, diff);
 193}
 194
 195static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 196		int write)
 197{
 198	struct page *page;
 199	int ret;
 200	unsigned int gup_flags = FOLL_FORCE;
 201
 202#ifdef CONFIG_STACK_GROWSUP
 203	if (write) {
 204		ret = expand_downwards(bprm->vma, pos);
 205		if (ret < 0)
 206			return NULL;
 207	}
 208#endif
 209
 210	if (write)
 211		gup_flags |= FOLL_WRITE;
 212
 213	/*
 214	 * We are doing an exec().  'current' is the process
 215	 * doing the exec and bprm->mm is the new process's mm.
 216	 */
 217	ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
 218			&page, NULL, NULL);
 219	if (ret <= 0)
 220		return NULL;
 221
 222	if (write) {
 223		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
 224		unsigned long ptr_size, limit;
 225
 226		/*
 227		 * Since the stack will hold pointers to the strings, we
 228		 * must account for them as well.
 229		 *
 230		 * The size calculation is the entire vma while each arg page is
 231		 * built, so each time we get here it's calculating how far it
 232		 * is currently (rather than each call being just the newly
 233		 * added size from the arg page).  As a result, we need to
 234		 * always add the entire size of the pointers, so that on the
 235		 * last call to get_arg_page() we'll actually have the entire
 236		 * correct size.
 237		 */
 238		ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
 239		if (ptr_size > ULONG_MAX - size)
 240			goto fail;
 241		size += ptr_size;
 242
 243		acct_arg_size(bprm, size / PAGE_SIZE);
 244
 245		/*
 246		 * We've historically supported up to 32 pages (ARG_MAX)
 247		 * of argument strings even with small stacks
 248		 */
 249		if (size <= ARG_MAX)
 250			return page;
 251
 252		/*
 253		 * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
 254		 * (whichever is smaller) for the argv+env strings.
 255		 * This ensures that:
 256		 *  - the remaining binfmt code will not run out of stack space,
 257		 *  - the program will have a reasonable amount of stack left
 258		 *    to work from.
 259		 */
 260		limit = _STK_LIM / 4 * 3;
 261		limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
 262		if (size > limit)
 263			goto fail;
 264	}
 265
 266	return page;
 267
 268fail:
 269	put_page(page);
 270	return NULL;
 271}
 272
 273static void put_arg_page(struct page *page)
 274{
 275	put_page(page);
 276}
 277
 278static void free_arg_pages(struct linux_binprm *bprm)
 279{
 280}
 281
 282static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 283		struct page *page)
 284{
 285	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
 286}
 287
 288static int __bprm_mm_init(struct linux_binprm *bprm)
 289{
 290	int err;
 291	struct vm_area_struct *vma = NULL;
 292	struct mm_struct *mm = bprm->mm;
 293
 294	bprm->vma = vma = vm_area_alloc(mm);
 295	if (!vma)
 296		return -ENOMEM;
 297	vma_set_anonymous(vma);
 298
 299	if (down_write_killable(&mm->mmap_sem)) {
 300		err = -EINTR;
 301		goto err_free;
 302	}
 303
 304	/*
 305	 * Place the stack at the largest stack address the architecture
 306	 * supports. Later, we'll move this to an appropriate place. We don't
 307	 * use STACK_TOP because that can depend on attributes which aren't
 308	 * configured yet.
 309	 */
 310	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 311	vma->vm_end = STACK_TOP_MAX;
 312	vma->vm_start = vma->vm_end - PAGE_SIZE;
 313	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 314	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 315
 316	err = insert_vm_struct(mm, vma);
 317	if (err)
 318		goto err;
 319
 320	mm->stack_vm = mm->total_vm = 1;
 321	arch_bprm_mm_init(mm, vma);
 322	up_write(&mm->mmap_sem);
 323	bprm->p = vma->vm_end - sizeof(void *);
 324	return 0;
 325err:
 326	up_write(&mm->mmap_sem);
 327err_free:
 328	bprm->vma = NULL;
 329	vm_area_free(vma);
 330	return err;
 331}
 332
 333static bool valid_arg_len(struct linux_binprm *bprm, long len)
 334{
 335	return len <= MAX_ARG_STRLEN;
 336}
 337
 338#else
 339
 340static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 341{
 342}
 343
 344static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 345		int write)
 346{
 347	struct page *page;
 348
 349	page = bprm->page[pos / PAGE_SIZE];
 350	if (!page && write) {
 351		page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
 352		if (!page)
 353			return NULL;
 354		bprm->page[pos / PAGE_SIZE] = page;
 355	}
 356
 357	return page;
 358}
 359
 360static void put_arg_page(struct page *page)
 361{
 362}
 363
 364static void free_arg_page(struct linux_binprm *bprm, int i)
 365{
 366	if (bprm->page[i]) {
 367		__free_page(bprm->page[i]);
 368		bprm->page[i] = NULL;
 369	}
 370}
 371
 372static void free_arg_pages(struct linux_binprm *bprm)
 373{
 374	int i;
 375
 376	for (i = 0; i < MAX_ARG_PAGES; i++)
 377		free_arg_page(bprm, i);
 378}
 379
 380static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 381		struct page *page)
 382{
 383}
 384
 385static int __bprm_mm_init(struct linux_binprm *bprm)
 386{
 387	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
 388	return 0;
 389}
 390
 391static bool valid_arg_len(struct linux_binprm *bprm, long len)
 392{
 393	return len <= bprm->p;
 394}
 395
 396#endif /* CONFIG_MMU */
 397
 398/*
 399 * Create a new mm_struct and populate it with a temporary stack
 400 * vm_area_struct.  We don't have enough context at this point to set the stack
 401 * flags, permissions, and offset, so we use temporary values.  We'll update
 402 * them later in setup_arg_pages().
 403 */
 404static int bprm_mm_init(struct linux_binprm *bprm)
 405{
 406	int err;
 407	struct mm_struct *mm = NULL;
 408
 409	bprm->mm = mm = mm_alloc();
 410	err = -ENOMEM;
 411	if (!mm)
 412		goto err;
 413
 414	/* Save current stack limit for all calculations made during exec. */
 415	task_lock(current->group_leader);
 416	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
 417	task_unlock(current->group_leader);
 418
 419	err = __bprm_mm_init(bprm);
 420	if (err)
 421		goto err;
 422
 423	return 0;
 424
 425err:
 426	if (mm) {
 427		bprm->mm = NULL;
 428		mmdrop(mm);
 429	}
 430
 431	return err;
 432}
 433
/*
 * A user-space argv/envp array pointer, in either native or compat layout.
 * get_user_arg_ptr() reads entries through whichever member is_compat selects.
 */
struct user_arg_ptr {
#ifdef CONFIG_COMPAT
	/* true: use ptr.compat; false: use ptr.native */
	bool is_compat;
#endif
	union {
		/* array of native user pointers to strings */
		const char __user *const __user *native;
#ifdef CONFIG_COMPAT
		/* array of compat_uptr_t entries, converted with compat_ptr() */
		const compat_uptr_t __user *compat;
#endif
	} ptr;
};
 445
 446static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
 447{
 448	const char __user *native;
 449
 450#ifdef CONFIG_COMPAT
 451	if (unlikely(argv.is_compat)) {
 452		compat_uptr_t compat;
 453
 454		if (get_user(compat, argv.ptr.compat + nr))
 455			return ERR_PTR(-EFAULT);
 456
 457		return compat_ptr(compat);
 458	}
 459#endif
 460
 461	if (get_user(native, argv.ptr.native + nr))
 462		return ERR_PTR(-EFAULT);
 463
 464	return native;
 465}
 466
 467/*
 468 * count() counts the number of strings in array ARGV.
 469 */
 470static int count(struct user_arg_ptr argv, int max)
 471{
 472	int i = 0;
 473
 474	if (argv.ptr.native != NULL) {
 475		for (;;) {
 476			const char __user *p = get_user_arg_ptr(argv, i);
 477
 478			if (!p)
 479				break;
 480
 481			if (IS_ERR(p))
 482				return -EFAULT;
 483
 484			if (i >= max)
 485				return -E2BIG;
 486			++i;
 487
 488			if (fatal_signal_pending(current))
 489				return -ERESTARTNOHAND;
 490			cond_resched();
 491		}
 492	}
 493	return i;
 494}
 495
 496/*
 497 * 'copy_strings()' copies argument/environment strings from the old
 498 * processes's memory to the new process's stack.  The call to get_user_pages()
 499 * ensures the destination page is created and not swapped out.
 500 */
 501static int copy_strings(int argc, struct user_arg_ptr argv,
 502			struct linux_binprm *bprm)
 503{
 504	struct page *kmapped_page = NULL;
 505	char *kaddr = NULL;
 506	unsigned long kpos = 0;
 507	int ret;
 508
 509	while (argc-- > 0) {
 510		const char __user *str;
 511		int len;
 512		unsigned long pos;
 513
 514		ret = -EFAULT;
 515		str = get_user_arg_ptr(argv, argc);
 516		if (IS_ERR(str))
 517			goto out;
 518
 519		len = strnlen_user(str, MAX_ARG_STRLEN);
 520		if (!len)
 521			goto out;
 522
 523		ret = -E2BIG;
 524		if (!valid_arg_len(bprm, len))
 525			goto out;
 526
 527		/* We're going to work our way backwords. */
 528		pos = bprm->p;
 529		str += len;
 530		bprm->p -= len;
 531
 532		while (len > 0) {
 533			int offset, bytes_to_copy;
 534
 535			if (fatal_signal_pending(current)) {
 536				ret = -ERESTARTNOHAND;
 537				goto out;
 538			}
 539			cond_resched();
 540
 541			offset = pos % PAGE_SIZE;
 542			if (offset == 0)
 543				offset = PAGE_SIZE;
 544
 545			bytes_to_copy = offset;
 546			if (bytes_to_copy > len)
 547				bytes_to_copy = len;
 548
 549			offset -= bytes_to_copy;
 550			pos -= bytes_to_copy;
 551			str -= bytes_to_copy;
 552			len -= bytes_to_copy;
 553
 554			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
 555				struct page *page;
 556
 557				page = get_arg_page(bprm, pos, 1);
 558				if (!page) {
 559					ret = -E2BIG;
 560					goto out;
 561				}
 562
 563				if (kmapped_page) {
 564					flush_kernel_dcache_page(kmapped_page);
 565					kunmap(kmapped_page);
 566					put_arg_page(kmapped_page);
 567				}
 568				kmapped_page = page;
 569				kaddr = kmap(kmapped_page);
 570				kpos = pos & PAGE_MASK;
 571				flush_arg_page(bprm, kpos, kmapped_page);
 572			}
 573			if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
 574				ret = -EFAULT;
 575				goto out;
 576			}
 577		}
 578	}
 579	ret = 0;
 580out:
 581	if (kmapped_page) {
 582		flush_kernel_dcache_page(kmapped_page);
 583		kunmap(kmapped_page);
 584		put_arg_page(kmapped_page);
 585	}
 586	return ret;
 587}
 588
 589/*
 590 * Like copy_strings, but get argv and its values from kernel memory.
 591 */
 592int copy_strings_kernel(int argc, const char *const *__argv,
 593			struct linux_binprm *bprm)
 594{
 595	int r;
 596	mm_segment_t oldfs = get_fs();
 597	struct user_arg_ptr argv = {
 598		.ptr.native = (const char __user *const  __user *)__argv,
 599	};
 600
 601	set_fs(KERNEL_DS);
 602	r = copy_strings(argc, argv, bprm);
 603	set_fs(oldfs);
 604
 605	return r;
 606}
 607EXPORT_SYMBOL(copy_strings_kernel);
 608
 609#ifdef CONFIG_MMU
 610
 611/*
 612 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
 613 * the binfmt code determines where the new stack should reside, we shift it to
 614 * its final location.  The process proceeds as follows:
 615 *
 616 * 1) Use shift to calculate the new vma endpoints.
 617 * 2) Extend vma to cover both the old and new ranges.  This ensures the
 618 *    arguments passed to subsequent functions are consistent.
 619 * 3) Move vma's page tables to the new range.
 620 * 4) Free up any cleared pgd range.
 621 * 5) Shrink the vma to cover only the new range.
 622 */
 623static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 624{
 625	struct mm_struct *mm = vma->vm_mm;
 626	unsigned long old_start = vma->vm_start;
 627	unsigned long old_end = vma->vm_end;
 628	unsigned long length = old_end - old_start;
 629	unsigned long new_start = old_start - shift;
 630	unsigned long new_end = old_end - shift;
 631	struct mmu_gather tlb;
 632
 633	BUG_ON(new_start > new_end);
 634
 635	/*
 636	 * ensure there are no vmas between where we want to go
 637	 * and where we are
 638	 */
 639	if (vma != find_vma(mm, new_start))
 640		return -EFAULT;
 641
 642	/*
 643	 * cover the whole range: [new_start, old_end)
 644	 */
 645	if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
 646		return -ENOMEM;
 647
 648	/*
 649	 * move the page tables downwards, on failure we rely on
 650	 * process cleanup to remove whatever mess we made.
 651	 */
 652	if (length != move_page_tables(vma, old_start,
 653				       vma, new_start, length, false))
 654		return -ENOMEM;
 655
 656	lru_add_drain();
 657	tlb_gather_mmu(&tlb, mm, old_start, old_end);
 658	if (new_end > old_start) {
 659		/*
 660		 * when the old and new regions overlap clear from new_end.
 661		 */
 662		free_pgd_range(&tlb, new_end, old_end, new_end,
 663			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
 664	} else {
 665		/*
 666		 * otherwise, clean from old_start; this is done to not touch
 667		 * the address space in [new_end, old_start) some architectures
 668		 * have constraints on va-space that make this illegal (IA64) -
 669		 * for the others its just a little faster.
 670		 */
 671		free_pgd_range(&tlb, old_start, old_end, new_end,
 672			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
 673	}
 674	tlb_finish_mmu(&tlb, old_start, old_end);
 675
 676	/*
 677	 * Shrink the vma to just the new range.  Always succeeds.
 678	 */
 679	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
 680
 681	return 0;
 682}
 683
 684/*
 685 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 686 * the stack is optionally relocated, and some extra space is added.
 687 */
 688int setup_arg_pages(struct linux_binprm *bprm,
 689		    unsigned long stack_top,
 690		    int executable_stack)
 691{
 692	unsigned long ret;
 693	unsigned long stack_shift;
 694	struct mm_struct *mm = current->mm;
 695	struct vm_area_struct *vma = bprm->vma;
 696	struct vm_area_struct *prev = NULL;
 697	unsigned long vm_flags;
 698	unsigned long stack_base;
 699	unsigned long stack_size;
 700	unsigned long stack_expand;
 701	unsigned long rlim_stack;
 702
 703#ifdef CONFIG_STACK_GROWSUP
 704	/* Limit stack size */
 705	stack_base = bprm->rlim_stack.rlim_max;
 706	if (stack_base > STACK_SIZE_MAX)
 707		stack_base = STACK_SIZE_MAX;
 708
 709	/* Add space for stack randomization. */
 710	stack_base += (STACK_RND_MASK << PAGE_SHIFT);
 711
 712	/* Make sure we didn't let the argument array grow too large. */
 713	if (vma->vm_end - vma->vm_start > stack_base)
 714		return -ENOMEM;
 715
 716	stack_base = PAGE_ALIGN(stack_top - stack_base);
 717
 718	stack_shift = vma->vm_start - stack_base;
 719	mm->arg_start = bprm->p - stack_shift;
 720	bprm->p = vma->vm_end - stack_shift;
 721#else
 722	stack_top = arch_align_stack(stack_top);
 723	stack_top = PAGE_ALIGN(stack_top);
 724
 725	if (unlikely(stack_top < mmap_min_addr) ||
 726	    unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
 727		return -ENOMEM;
 728
 729	stack_shift = vma->vm_end - stack_top;
 730
 731	bprm->p -= stack_shift;
 732	mm->arg_start = bprm->p;
 733#endif
 734
 735	if (bprm->loader)
 736		bprm->loader -= stack_shift;
 737	bprm->exec -= stack_shift;
 738
 739	if (down_write_killable(&mm->mmap_sem))
 740		return -EINTR;
 741
 742	vm_flags = VM_STACK_FLAGS;
 743
 744	/*
 745	 * Adjust stack execute permissions; explicitly enable for
 746	 * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
 747	 * (arch default) otherwise.
 748	 */
 749	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
 750		vm_flags |= VM_EXEC;
 751	else if (executable_stack == EXSTACK_DISABLE_X)
 752		vm_flags &= ~VM_EXEC;
 753	vm_flags |= mm->def_flags;
 754	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
 755
 756	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
 757			vm_flags);
 758	if (ret)
 759		goto out_unlock;
 760	BUG_ON(prev != vma);
 761
 762	/* Move stack pages down in memory. */
 763	if (stack_shift) {
 764		ret = shift_arg_pages(vma, stack_shift);
 765		if (ret)
 766			goto out_unlock;
 767	}
 768
 769	/* mprotect_fixup is overkill to remove the temporary stack flags */
 770	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
 771
 772	stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
 773	stack_size = vma->vm_end - vma->vm_start;
 774	/*
 775	 * Align this down to a page boundary as expand_stack
 776	 * will align it up.
 777	 */
 778	rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
 779#ifdef CONFIG_STACK_GROWSUP
 780	if (stack_size + stack_expand > rlim_stack)
 781		stack_base = vma->vm_start + rlim_stack;
 782	else
 783		stack_base = vma->vm_end + stack_expand;
 784#else
 785	if (stack_size + stack_expand > rlim_stack)
 786		stack_base = vma->vm_end - rlim_stack;
 787	else
 788		stack_base = vma->vm_start - stack_expand;
 789#endif
 790	current->mm->start_stack = bprm->p;
 791	ret = expand_stack(vma, stack_base);
 792	if (ret)
 793		ret = -EFAULT;
 794
 795out_unlock:
 796	up_write(&mm->mmap_sem);
 797	return ret;
 798}
 799EXPORT_SYMBOL(setup_arg_pages);
 800
 801#else
 802
 803/*
 804 * Transfer the program arguments and environment from the holding pages
 805 * onto the stack. The provided stack pointer is adjusted accordingly.
 806 */
 807int transfer_args_to_stack(struct linux_binprm *bprm,
 808			   unsigned long *sp_location)
 809{
 810	unsigned long index, stop, sp;
 811	int ret = 0;
 812
 813	stop = bprm->p >> PAGE_SHIFT;
 814	sp = *sp_location;
 815
 816	for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
 817		unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
 818		char *src = kmap(bprm->page[index]) + offset;
 819		sp -= PAGE_SIZE - offset;
 820		if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
 821			ret = -EFAULT;
 822		kunmap(bprm->page[index]);
 823		if (ret)
 824			goto out;
 825	}
 826
 827	*sp_location = sp;
 828
 829out:
 830	return ret;
 831}
 832EXPORT_SYMBOL(transfer_args_to_stack);
 833
 834#endif /* CONFIG_MMU */
 835
 836static struct file *do_open_execat(int fd, struct filename *name, int flags)
 837{
 838	struct file *file;
 839	int err;
 840	struct open_flags open_exec_flags = {
 841		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 842		.acc_mode = MAY_EXEC,
 843		.intent = LOOKUP_OPEN,
 844		.lookup_flags = LOOKUP_FOLLOW,
 845	};
 846
 847	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
 848		return ERR_PTR(-EINVAL);
 849	if (flags & AT_SYMLINK_NOFOLLOW)
 850		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
 851	if (flags & AT_EMPTY_PATH)
 852		open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
 853
 854	file = do_filp_open(fd, name, &open_exec_flags);
 855	if (IS_ERR(file))
 856		goto out;
 857
 858	err = -EACCES;
 859	if (!S_ISREG(file_inode(file)->i_mode))
 860		goto exit;
 861
 862	if (path_noexec(&file->f_path))
 863		goto exit;
 864
 865	err = deny_write_access(file);
 866	if (err)
 867		goto exit;
 868
 869	if (name->name[0] != '\0')
 870		fsnotify_open(file);
 871
 872out:
 873	return file;
 874
 875exit:
 876	fput(file);
 877	return ERR_PTR(err);
 878}
 879
 880struct file *open_exec(const char *name)
 881{
 882	struct filename *filename = getname_kernel(name);
 883	struct file *f = ERR_CAST(filename);
 884
 885	if (!IS_ERR(filename)) {
 886		f = do_open_execat(AT_FDCWD, filename, 0);
 887		putname(filename);
 888	}
 889	return f;
 890}
 891EXPORT_SYMBOL(open_exec);
 892
 893int kernel_read_file(struct file *file, void **buf, loff_t *size,
 894		     loff_t max_size, enum kernel_read_file_id id)
 895{
 896	loff_t i_size, pos;
 897	ssize_t bytes = 0;
 898	int ret;
 899
 900	if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
 901		return -EINVAL;
 902
 903	ret = deny_write_access(file);
 904	if (ret)
 905		return ret;
 906
 907	ret = security_kernel_read_file(file, id);
 908	if (ret)
 909		goto out;
 910
 911	i_size = i_size_read(file_inode(file));
 912	if (i_size <= 0) {
 913		ret = -EINVAL;
 914		goto out;
 915	}
 916	if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
 917		ret = -EFBIG;
 918		goto out;
 919	}
 920
 921	if (id != READING_FIRMWARE_PREALLOC_BUFFER)
 922		*buf = vmalloc(i_size);
 923	if (!*buf) {
 924		ret = -ENOMEM;
 925		goto out;
 926	}
 927
 928	pos = 0;
 929	while (pos < i_size) {
 930		bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
 931		if (bytes < 0) {
 932			ret = bytes;
 933			goto out;
 934		}
 935
 936		if (bytes == 0)
 937			break;
 938	}
 939
 940	if (pos != i_size) {
 941		ret = -EIO;
 942		goto out_free;
 943	}
 944
 945	ret = security_kernel_post_read_file(file, *buf, i_size, id);
 946	if (!ret)
 947		*size = pos;
 948
 949out_free:
 950	if (ret < 0) {
 951		if (id != READING_FIRMWARE_PREALLOC_BUFFER) {
 952			vfree(*buf);
 953			*buf = NULL;
 954		}
 955	}
 956
 957out:
 958	allow_write_access(file);
 959	return ret;
 960}
 961EXPORT_SYMBOL_GPL(kernel_read_file);
 962
 963int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
 964			       loff_t max_size, enum kernel_read_file_id id)
 965{
 966	struct file *file;
 967	int ret;
 968
 969	if (!path || !*path)
 970		return -EINVAL;
 971
 972	file = filp_open(path, O_RDONLY, 0);
 973	if (IS_ERR(file))
 974		return PTR_ERR(file);
 975
 976	ret = kernel_read_file(file, buf, size, max_size, id);
 977	fput(file);
 978	return ret;
 979}
 980EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
 981
 982int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
 983			     enum kernel_read_file_id id)
 984{
 985	struct fd f = fdget(fd);
 986	int ret = -EBADF;
 987
 988	if (!f.file)
 989		goto out;
 990
 991	ret = kernel_read_file(f.file, buf, size, max_size, id);
 992out:
 993	fdput(f);
 994	return ret;
 995}
 996EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
 997
 998ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
 999{
1000	ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
1001	if (res > 0)
1002		flush_icache_range(addr, addr + len);
1003	return res;
1004}
1005EXPORT_SYMBOL(read_code);
1006
1007static int exec_mmap(struct mm_struct *mm)
1008{
1009	struct task_struct *tsk;
1010	struct mm_struct *old_mm, *active_mm;
1011
1012	/* Notify parent that we're no longer interested in the old VM */
1013	tsk = current;
1014	old_mm = current->mm;
1015	mm_release(tsk, old_mm);
1016
1017	if (old_mm) {
1018		sync_mm_rss(old_mm);
1019		/*
1020		 * Make sure that if there is a core dump in progress
1021		 * for the old mm, we get out and die instead of going
1022		 * through with the exec.  We must hold mmap_sem around
1023		 * checking core_state and changing tsk->mm.
1024		 */
1025		down_read(&old_mm->mmap_sem);
1026		if (unlikely(old_mm->core_state)) {
1027			up_read(&old_mm->mmap_sem);
1028			return -EINTR;
1029		}
1030	}
1031	task_lock(tsk);
1032	active_mm = tsk->active_mm;
1033	tsk->mm = mm;
1034	tsk->active_mm = mm;
1035	activate_mm(active_mm, mm);
1036	tsk->mm->vmacache_seqnum = 0;
1037	vmacache_flush(tsk);
1038	task_unlock(tsk);
1039	if (old_mm) {
1040		up_read(&old_mm->mmap_sem);
1041		BUG_ON(active_mm != old_mm);
1042		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
1043		mm_update_next_owner(old_mm);
1044		mmput(old_mm);
1045		return 0;
1046	}
1047	mmdrop(active_mm);
1048	return 0;
1049}
1050
1051/*
1052 * This function makes sure the current process has its own signal table,
1053 * so that flush_signal_handlers can later reset the handlers without
1054 * disturbing other processes.  (Other processes might share the signal
1055 * table via the CLONE_SIGHAND option to clone().)
1056 */
1057static int de_thread(struct task_struct *tsk)
1058{
1059	struct signal_struct *sig = tsk->signal;
1060	struct sighand_struct *oldsighand = tsk->sighand;
1061	spinlock_t *lock = &oldsighand->siglock;
1062
1063	if (thread_group_empty(tsk))
1064		goto no_thread_group;
1065
1066	/*
1067	 * Kill all other threads in the thread group.
1068	 */
1069	spin_lock_irq(lock);
1070	if (signal_group_exit(sig)) {
1071		/*
1072		 * Another group action in progress, just
1073		 * return so that the signal is processed.
1074		 */
1075		spin_unlock_irq(lock);
1076		return -EAGAIN;
1077	}
1078
1079	sig->group_exit_task = tsk;
1080	sig->notify_count = zap_other_threads(tsk);
1081	if (!thread_group_leader(tsk))
1082		sig->notify_count--;
1083
1084	while (sig->notify_count) {
1085		__set_current_state(TASK_KILLABLE);
1086		spin_unlock_irq(lock);
1087		freezable_schedule();
1088		if (unlikely(__fatal_signal_pending(tsk)))
1089			goto killed;
1090		spin_lock_irq(lock);
1091	}
1092	spin_unlock_irq(lock);
1093
1094	/*
1095	 * At this point all other threads have exited, all we have to
1096	 * do is to wait for the thread group leader to become inactive,
1097	 * and to assume its PID:
1098	 */
1099	if (!thread_group_leader(tsk)) {
1100		struct task_struct *leader = tsk->group_leader;
1101
1102		for (;;) {
1103			cgroup_threadgroup_change_begin(tsk);
1104			write_lock_irq(&tasklist_lock);
1105			/*
1106			 * Do this under tasklist_lock to ensure that
1107			 * exit_notify() can't miss ->group_exit_task
1108			 */
1109			sig->notify_count = -1;
1110			if (likely(leader->exit_state))
1111				break;
1112			__set_current_state(TASK_KILLABLE);
1113			write_unlock_irq(&tasklist_lock);
1114			cgroup_threadgroup_change_end(tsk);
1115			freezable_schedule();
1116			if (unlikely(__fatal_signal_pending(tsk)))
1117				goto killed;
1118		}
1119
1120		/*
1121		 * The only record we have of the real-time age of a
1122		 * process, regardless of execs it's done, is start_time.
1123		 * All the past CPU time is accumulated in signal_struct
1124		 * from sister threads now dead.  But in this non-leader
1125		 * exec, nothing survives from the original leader thread,
1126		 * whose birth marks the true age of this process now.
1127		 * When we take on its identity by switching to its PID, we
1128		 * also take its birthdate (always earlier than our own).
1129		 */
1130		tsk->start_time = leader->start_time;
1131		tsk->real_start_time = leader->real_start_time;
1132
1133		BUG_ON(!same_thread_group(leader, tsk));
1134		BUG_ON(has_group_leader_pid(tsk));
1135		/*
1136		 * An exec() starts a new thread group with the
1137		 * TGID of the previous thread group. Rehash the
1138		 * two threads with a switched PID, and release
1139		 * the former thread group leader:
1140		 */
1141
1142		/* Become a process group leader with the old leader's pid.
 1143		 * The old leader becomes a thread of this thread group.
1144		 * Note: The old leader also uses this pid until release_task
1145		 *       is called.  Odd but simple and correct.
1146		 */
1147		tsk->pid = leader->pid;
1148		change_pid(tsk, PIDTYPE_PID, task_pid(leader));
1149		transfer_pid(leader, tsk, PIDTYPE_TGID);
1150		transfer_pid(leader, tsk, PIDTYPE_PGID);
1151		transfer_pid(leader, tsk, PIDTYPE_SID);
1152
1153		list_replace_rcu(&leader->tasks, &tsk->tasks);
1154		list_replace_init(&leader->sibling, &tsk->sibling);
1155
1156		tsk->group_leader = tsk;
1157		leader->group_leader = tsk;
1158
1159		tsk->exit_signal = SIGCHLD;
1160		leader->exit_signal = -1;
1161
1162		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
1163		leader->exit_state = EXIT_DEAD;
1164
1165		/*
1166		 * We are going to release_task()->ptrace_unlink() silently,
1167		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
1168		 * the tracer wont't block again waiting for this thread.
1169		 */
1170		if (unlikely(leader->ptrace))
1171			__wake_up_parent(leader, leader->parent);
1172		write_unlock_irq(&tasklist_lock);
1173		cgroup_threadgroup_change_end(tsk);
1174
1175		release_task(leader);
1176	}
1177
1178	sig->group_exit_task = NULL;
1179	sig->notify_count = 0;
1180
1181no_thread_group:
1182	/* we have changed execution domain */
1183	tsk->exit_signal = SIGCHLD;
1184
1185#ifdef CONFIG_POSIX_TIMERS
1186	exit_itimers(sig);
1187	flush_itimer_signals();
1188#endif
1189
1190	if (atomic_read(&oldsighand->count) != 1) {
1191		struct sighand_struct *newsighand;
1192		/*
1193		 * This ->sighand is shared with the CLONE_SIGHAND
1194		 * but not CLONE_THREAD task, switch to the new one.
1195		 */
1196		newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1197		if (!newsighand)
1198			return -ENOMEM;
1199
1200		atomic_set(&newsighand->count, 1);
1201		memcpy(newsighand->action, oldsighand->action,
1202		       sizeof(newsighand->action));
1203
1204		write_lock_irq(&tasklist_lock);
1205		spin_lock(&oldsighand->siglock);
1206		rcu_assign_pointer(tsk->sighand, newsighand);
1207		spin_unlock(&oldsighand->siglock);
1208		write_unlock_irq(&tasklist_lock);
1209
1210		__cleanup_sighand(oldsighand);
1211	}
1212
1213	BUG_ON(!thread_group_leader(tsk));
1214	return 0;
1215
1216killed:
1217	/* protects against exit_notify() and __exit_signal() */
1218	read_lock(&tasklist_lock);
1219	sig->group_exit_task = NULL;
1220	sig->notify_count = 0;
1221	read_unlock(&tasklist_lock);
1222	return -EAGAIN;
1223}
1224
1225char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
1226{
1227	task_lock(tsk);
1228	strncpy(buf, tsk->comm, buf_size);
1229	task_unlock(tsk);
1230	return buf;
1231}
1232EXPORT_SYMBOL_GPL(__get_task_comm);
1233
/*
 * These functions flush out all traces of the currently running executable
 * so that a new one can be started.
 */
1238
1239void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
1240{
1241	task_lock(tsk);
1242	trace_task_rename(tsk, buf);
1243	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
1244	task_unlock(tsk);
1245	perf_event_comm(tsk, exec);
1246}
1247
1248/*
1249 * Calling this is the point of no return. None of the failures will be
1250 * seen by userspace since either the process is already taking a fatal
1251 * signal (via de_thread() or coredump), or will have SEGV raised
1252 * (after exec_mmap()) by search_binary_handlers (see below).
1253 */
1254int flush_old_exec(struct linux_binprm * bprm)
1255{
1256	int retval;
1257
1258	/*
1259	 * Make sure we have a private signal table and that
1260	 * we are unassociated from the previous thread group.
1261	 */
1262	retval = de_thread(current);
1263	if (retval)
1264		goto out;
1265
1266	/*
1267	 * Must be called _before_ exec_mmap() as bprm->mm is
1268	 * not visibile until then. This also enables the update
1269	 * to be lockless.
1270	 */
1271	set_mm_exe_file(bprm->mm, bprm->file);
1272
1273	/*
1274	 * Release all of the old mmap stuff
1275	 */
1276	acct_arg_size(bprm, 0);
1277	retval = exec_mmap(bprm->mm);
1278	if (retval)
1279		goto out;
1280
1281	/*
1282	 * After clearing bprm->mm (to mark that current is using the
1283	 * prepared mm now), we have nothing left of the original
1284	 * process. If anything from here on returns an error, the check
1285	 * in search_binary_handler() will SEGV current.
1286	 */
1287	bprm->mm = NULL;
1288
1289	set_fs(USER_DS);
1290	current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
1291					PF_NOFREEZE | PF_NO_SETAFFINITY);
1292	flush_thread();
1293	current->personality &= ~bprm->per_clear;
1294
1295	/*
1296	 * We have to apply CLOEXEC before we change whether the process is
1297	 * dumpable (in setup_new_exec) to avoid a race with a process in userspace
1298	 * trying to access the should-be-closed file descriptors of a process
1299	 * undergoing exec(2).
1300	 */
1301	do_close_on_exec(current->files);
1302	return 0;
1303
1304out:
1305	return retval;
1306}
1307EXPORT_SYMBOL(flush_old_exec);
1308
1309void would_dump(struct linux_binprm *bprm, struct file *file)
1310{
1311	struct inode *inode = file_inode(file);
1312	if (inode_permission(inode, MAY_READ) < 0) {
1313		struct user_namespace *old, *user_ns;
1314		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
1315
1316		/* Ensure mm->user_ns contains the executable */
1317		user_ns = old = bprm->mm->user_ns;
1318		while ((user_ns != &init_user_ns) &&
1319		       !privileged_wrt_inode_uidgid(user_ns, inode))
1320			user_ns = user_ns->parent;
1321
1322		if (old != user_ns) {
1323			bprm->mm->user_ns = get_user_ns(user_ns);
1324			put_user_ns(old);
1325		}
1326	}
1327}
1328EXPORT_SYMBOL(would_dump);
1329
1330void setup_new_exec(struct linux_binprm * bprm)
1331{
1332	/*
1333	 * Once here, prepare_binrpm() will not be called any more, so
1334	 * the final state of setuid/setgid/fscaps can be merged into the
1335	 * secureexec flag.
1336	 */
1337	bprm->secureexec |= bprm->cap_elevated;
1338
1339	if (bprm->secureexec) {
1340		/* Make sure parent cannot signal privileged process. */
1341		current->pdeath_signal = 0;
1342
1343		/*
1344		 * For secureexec, reset the stack limit to sane default to
1345		 * avoid bad behavior from the prior rlimits. This has to
1346		 * happen before arch_pick_mmap_layout(), which examines
1347		 * RLIMIT_STACK, but after the point of no return to avoid
1348		 * needing to clean up the change on failure.
1349		 */
1350		if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1351			bprm->rlim_stack.rlim_cur = _STK_LIM;
1352	}
1353
1354	arch_pick_mmap_layout(current->mm, &bprm->rlim_stack);
1355
1356	current->sas_ss_sp = current->sas_ss_size = 0;
1357
1358	/*
1359	 * Figure out dumpability. Note that this checking only of current
1360	 * is wrong, but userspace depends on it. This should be testing
1361	 * bprm->secureexec instead.
1362	 */
1363	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
1364	    !(uid_eq(current_euid(), current_uid()) &&
1365	      gid_eq(current_egid(), current_gid())))
1366		set_dumpable(current->mm, suid_dumpable);
1367	else
1368		set_dumpable(current->mm, SUID_DUMP_USER);
1369
1370	arch_setup_new_exec();
1371	perf_event_exec();
1372	__set_task_comm(current, kbasename(bprm->filename), true);
1373
1374	/* Set the new mm task size. We have to do that late because it may
1375	 * depend on TIF_32BIT which is only updated in flush_thread() on
1376	 * some architectures like powerpc
1377	 */
1378	current->mm->task_size = TASK_SIZE;
1379
1380	/* An exec changes our domain. We are no longer part of the thread
1381	   group */
1382	current->self_exec_id++;
1383	flush_signal_handlers(current, 0);
1384}
1385EXPORT_SYMBOL(setup_new_exec);
1386
1387/* Runs immediately before start_thread() takes over. */
1388void finalize_exec(struct linux_binprm *bprm)
1389{
1390	/* Store any stack rlimit changes before starting thread. */
1391	task_lock(current->group_leader);
1392	current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
1393	task_unlock(current->group_leader);
1394}
1395EXPORT_SYMBOL(finalize_exec);
1396
1397/*
1398 * Prepare credentials and lock ->cred_guard_mutex.
1399 * install_exec_creds() commits the new creds and drops the lock.
1400 * Or, if exec fails before, free_bprm() should release ->cred and
1401 * and unlock.
1402 */
1403int prepare_bprm_creds(struct linux_binprm *bprm)
1404{
1405	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1406		return -ERESTARTNOINTR;
1407
1408	bprm->cred = prepare_exec_creds();
1409	if (likely(bprm->cred))
1410		return 0;
1411
1412	mutex_unlock(&current->signal->cred_guard_mutex);
1413	return -ENOMEM;
1414}
1415
1416static void free_bprm(struct linux_binprm *bprm)
1417{
1418	free_arg_pages(bprm);
1419	if (bprm->cred) {
1420		mutex_unlock(&current->signal->cred_guard_mutex);
1421		abort_creds(bprm->cred);
1422	}
1423	if (bprm->file) {
1424		allow_write_access(bprm->file);
1425		fput(bprm->file);
1426	}
1427	/* If a binfmt changed the interp, free it. */
1428	if (bprm->interp != bprm->filename)
1429		kfree(bprm->interp);
1430	kfree(bprm);
1431}
1432
1433int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
1434{
1435	/* If a binfmt changed the interp, free it first. */
1436	if (bprm->interp != bprm->filename)
1437		kfree(bprm->interp);
1438	bprm->interp = kstrdup(interp, GFP_KERNEL);
1439	if (!bprm->interp)
1440		return -ENOMEM;
1441	return 0;
1442}
1443EXPORT_SYMBOL(bprm_change_interp);
1444
1445/*
1446 * install the new credentials for this executable
1447 */
1448void install_exec_creds(struct linux_binprm *bprm)
1449{
1450	security_bprm_committing_creds(bprm);
1451
1452	commit_creds(bprm->cred);
1453	bprm->cred = NULL;
1454
1455	/*
1456	 * Disable monitoring for regular users
1457	 * when executing setuid binaries. Must
1458	 * wait until new credentials are committed
1459	 * by commit_creds() above
1460	 */
1461	if (get_dumpable(current->mm) != SUID_DUMP_USER)
1462		perf_event_exit_task(current);
1463	/*
1464	 * cred_guard_mutex must be held at least to this point to prevent
1465	 * ptrace_attach() from altering our determination of the task's
1466	 * credentials; any time after this it may be unlocked.
1467	 */
1468	security_bprm_committed_creds(bprm);
1469	mutex_unlock(&current->signal->cred_guard_mutex);
1470}
1471EXPORT_SYMBOL(install_exec_creds);
1472
1473/*
1474 * determine how safe it is to execute the proposed program
1475 * - the caller must hold ->cred_guard_mutex to protect against
1476 *   PTRACE_ATTACH or seccomp thread-sync
1477 */
1478static void check_unsafe_exec(struct linux_binprm *bprm)
1479{
1480	struct task_struct *p = current, *t;
1481	unsigned n_fs;
1482
1483	if (p->ptrace)
1484		bprm->unsafe |= LSM_UNSAFE_PTRACE;
1485
1486	/*
1487	 * This isn't strictly necessary, but it makes it harder for LSMs to
1488	 * mess up.
1489	 */
1490	if (task_no_new_privs(current))
1491		bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
1492
1493	t = p;
1494	n_fs = 1;
1495	spin_lock(&p->fs->lock);
1496	rcu_read_lock();
1497	while_each_thread(p, t) {
1498		if (t->fs == p->fs)
1499			n_fs++;
1500	}
1501	rcu_read_unlock();
1502
1503	if (p->fs->users > n_fs)
1504		bprm->unsafe |= LSM_UNSAFE_SHARE;
1505	else
1506		p->fs->in_exec = 1;
1507	spin_unlock(&p->fs->lock);
1508}
1509
1510static void bprm_fill_uid(struct linux_binprm *bprm)
1511{
1512	struct inode *inode;
1513	unsigned int mode;
1514	kuid_t uid;
1515	kgid_t gid;
1516
1517	/*
1518	 * Since this can be called multiple times (via prepare_binprm),
1519	 * we must clear any previous work done when setting set[ug]id
1520	 * bits from any earlier bprm->file uses (for example when run
1521	 * first for a setuid script then again for its interpreter).
1522	 */
1523	bprm->cred->euid = current_euid();
1524	bprm->cred->egid = current_egid();
1525
1526	if (!mnt_may_suid(bprm->file->f_path.mnt))
1527		return;
1528
1529	if (task_no_new_privs(current))
1530		return;
1531
1532	inode = bprm->file->f_path.dentry->d_inode;
1533	mode = READ_ONCE(inode->i_mode);
1534	if (!(mode & (S_ISUID|S_ISGID)))
1535		return;
1536
1537	/* Be careful if suid/sgid is set */
1538	inode_lock(inode);
1539
1540	/* reload atomically mode/uid/gid now that lock held */
1541	mode = inode->i_mode;
1542	uid = inode->i_uid;
1543	gid = inode->i_gid;
1544	inode_unlock(inode);
1545
1546	/* We ignore suid/sgid if there are no mappings for them in the ns */
1547	if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
1548		 !kgid_has_mapping(bprm->cred->user_ns, gid))
1549		return;
1550
1551	if (mode & S_ISUID) {
1552		bprm->per_clear |= PER_CLEAR_ON_SETID;
1553		bprm->cred->euid = uid;
1554	}
1555
1556	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1557		bprm->per_clear |= PER_CLEAR_ON_SETID;
1558		bprm->cred->egid = gid;
1559	}
1560}
1561
1562/*
1563 * Fill the binprm structure from the inode.
1564 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1565 *
1566 * This may be called multiple times for binary chains (scripts for example).
1567 */
1568int prepare_binprm(struct linux_binprm *bprm)
1569{
1570	int retval;
1571	loff_t pos = 0;
1572
1573	bprm_fill_uid(bprm);
1574
1575	/* fill in binprm security blob */
1576	retval = security_bprm_set_creds(bprm);
1577	if (retval)
1578		return retval;
1579	bprm->called_set_creds = 1;
1580
1581	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
1582	return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
1583}
1584
1585EXPORT_SYMBOL(prepare_binprm);
1586
1587/*
1588 * Arguments are '\0' separated strings found at the location bprm->p
1589 * points to; chop off the first by relocating brpm->p to right after
1590 * the first '\0' encountered.
1591 */
1592int remove_arg_zero(struct linux_binprm *bprm)
1593{
1594	int ret = 0;
1595	unsigned long offset;
1596	char *kaddr;
1597	struct page *page;
1598
1599	if (!bprm->argc)
1600		return 0;
1601
1602	do {
1603		offset = bprm->p & ~PAGE_MASK;
1604		page = get_arg_page(bprm, bprm->p, 0);
1605		if (!page) {
1606			ret = -EFAULT;
1607			goto out;
1608		}
1609		kaddr = kmap_atomic(page);
1610
1611		for (; offset < PAGE_SIZE && kaddr[offset];
1612				offset++, bprm->p++)
1613			;
1614
1615		kunmap_atomic(kaddr);
1616		put_arg_page(page);
1617	} while (offset == PAGE_SIZE);
1618
1619	bprm->p++;
1620	bprm->argc--;
1621	ret = 0;
1622
1623out:
1624	return ret;
1625}
1626EXPORT_SYMBOL(remove_arg_zero);
1627
1628#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1629/*
1630 * cycle the list of binary formats handler, until one recognizes the image
1631 */
1632int search_binary_handler(struct linux_binprm *bprm)
1633{
1634	bool need_retry = IS_ENABLED(CONFIG_MODULES);
1635	struct linux_binfmt *fmt;
1636	int retval;
1637
1638	/* This allows 4 levels of binfmt rewrites before failing hard. */
1639	if (bprm->recursion_depth > 5)
1640		return -ELOOP;
1641
1642	retval = security_bprm_check(bprm);
1643	if (retval)
1644		return retval;
1645
1646	retval = -ENOENT;
1647 retry:
1648	read_lock(&binfmt_lock);
1649	list_for_each_entry(fmt, &formats, lh) {
1650		if (!try_module_get(fmt->module))
1651			continue;
1652		read_unlock(&binfmt_lock);
1653		bprm->recursion_depth++;
1654		retval = fmt->load_binary(bprm);
1655		read_lock(&binfmt_lock);
1656		put_binfmt(fmt);
1657		bprm->recursion_depth--;
1658		if (retval < 0 && !bprm->mm) {
1659			/* we got to flush_old_exec() and failed after it */
1660			read_unlock(&binfmt_lock);
1661			force_sigsegv(SIGSEGV, current);
1662			return retval;
1663		}
1664		if (retval != -ENOEXEC || !bprm->file) {
1665			read_unlock(&binfmt_lock);
1666			return retval;
1667		}
1668	}
1669	read_unlock(&binfmt_lock);
1670
1671	if (need_retry) {
1672		if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
1673		    printable(bprm->buf[2]) && printable(bprm->buf[3]))
1674			return retval;
1675		if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
1676			return retval;
1677		need_retry = false;
1678		goto retry;
1679	}
1680
1681	return retval;
1682}
1683EXPORT_SYMBOL(search_binary_handler);
1684
1685static int exec_binprm(struct linux_binprm *bprm)
1686{
1687	pid_t old_pid, old_vpid;
1688	int ret;
1689
1690	/* Need to fetch pid before load_binary changes it */
1691	old_pid = current->pid;
1692	rcu_read_lock();
1693	old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
1694	rcu_read_unlock();
1695
1696	ret = search_binary_handler(bprm);
1697	if (ret >= 0) {
1698		audit_bprm(bprm);
1699		trace_sched_process_exec(current, old_pid, bprm);
1700		ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1701		proc_exec_connector(current);
1702	}
1703
1704	return ret;
1705}
1706
1707/*
1708 * sys_execve() executes a new program.
1709 */
1710static int __do_execve_file(int fd, struct filename *filename,
1711			    struct user_arg_ptr argv,
1712			    struct user_arg_ptr envp,
1713			    int flags, struct file *file)
1714{
1715	char *pathbuf = NULL;
1716	struct linux_binprm *bprm;
1717	struct files_struct *displaced;
1718	int retval;
1719
1720	if (IS_ERR(filename))
1721		return PTR_ERR(filename);
1722
1723	/*
1724	 * We move the actual failure in case of RLIMIT_NPROC excess from
1725	 * set*uid() to execve() because too many poorly written programs
1726	 * don't check setuid() return code.  Here we additionally recheck
1727	 * whether NPROC limit is still exceeded.
1728	 */
1729	if ((current->flags & PF_NPROC_EXCEEDED) &&
1730	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
1731		retval = -EAGAIN;
1732		goto out_ret;
1733	}
1734
1735	/* We're below the limit (still or again), so we don't want to make
1736	 * further execve() calls fail. */
1737	current->flags &= ~PF_NPROC_EXCEEDED;
1738
1739	retval = unshare_files(&displaced);
1740	if (retval)
1741		goto out_ret;
1742
1743	retval = -ENOMEM;
1744	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1745	if (!bprm)
1746		goto out_files;
1747
1748	retval = prepare_bprm_creds(bprm);
1749	if (retval)
1750		goto out_free;
1751
1752	check_unsafe_exec(bprm);
1753	current->in_execve = 1;
1754
1755	if (!file)
1756		file = do_open_execat(fd, filename, flags);
1757	retval = PTR_ERR(file);
1758	if (IS_ERR(file))
1759		goto out_unmark;
1760
1761	sched_exec();
1762
1763	bprm->file = file;
1764	if (!filename) {
1765		bprm->filename = "none";
1766	} else if (fd == AT_FDCWD || filename->name[0] == '/') {
1767		bprm->filename = filename->name;
1768	} else {
1769		if (filename->name[0] == '\0')
1770			pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
1771		else
1772			pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
1773					    fd, filename->name);
1774		if (!pathbuf) {
1775			retval = -ENOMEM;
1776			goto out_unmark;
1777		}
1778		/*
1779		 * Record that a name derived from an O_CLOEXEC fd will be
1780		 * inaccessible after exec. Relies on having exclusive access to
1781		 * current->files (due to unshare_files above).
1782		 */
1783		if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
1784			bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
1785		bprm->filename = pathbuf;
1786	}
1787	bprm->interp = bprm->filename;
1788
1789	retval = bprm_mm_init(bprm);
1790	if (retval)
1791		goto out_unmark;
1792
1793	bprm->argc = count(argv, MAX_ARG_STRINGS);
1794	if ((retval = bprm->argc) < 0)
1795		goto out;
1796
1797	bprm->envc = count(envp, MAX_ARG_STRINGS);
1798	if ((retval = bprm->envc) < 0)
1799		goto out;
1800
1801	retval = prepare_binprm(bprm);
1802	if (retval < 0)
1803		goto out;
1804
1805	retval = copy_strings_kernel(1, &bprm->filename, bprm);
1806	if (retval < 0)
1807		goto out;
1808
1809	bprm->exec = bprm->p;
1810	retval = copy_strings(bprm->envc, envp, bprm);
1811	if (retval < 0)
1812		goto out;
1813
1814	retval = copy_strings(bprm->argc, argv, bprm);
1815	if (retval < 0)
1816		goto out;
1817
1818	would_dump(bprm, bprm->file);
1819
1820	retval = exec_binprm(bprm);
1821	if (retval < 0)
1822		goto out;
1823
1824	/* execve succeeded */
1825	current->fs->in_exec = 0;
1826	current->in_execve = 0;
1827	membarrier_execve(current);
1828	rseq_execve(current);
1829	acct_update_integrals(current);
1830	task_numa_free(current);
1831	free_bprm(bprm);
1832	kfree(pathbuf);
1833	if (filename)
1834		putname(filename);
1835	if (displaced)
1836		put_files_struct(displaced);
1837	return retval;
1838
1839out:
1840	if (bprm->mm) {
1841		acct_arg_size(bprm, 0);
1842		mmput(bprm->mm);
1843	}
1844
1845out_unmark:
1846	current->fs->in_exec = 0;
1847	current->in_execve = 0;
1848
1849out_free:
1850	free_bprm(bprm);
1851	kfree(pathbuf);
1852
1853out_files:
1854	if (displaced)
1855		reset_files_struct(displaced);
1856out_ret:
1857	if (filename)
1858		putname(filename);
1859	return retval;
1860}
1861
1862static int do_execveat_common(int fd, struct filename *filename,
1863			      struct user_arg_ptr argv,
1864			      struct user_arg_ptr envp,
1865			      int flags)
1866{
1867	return __do_execve_file(fd, filename, argv, envp, flags, NULL);
1868}
1869
1870int do_execve_file(struct file *file, void *__argv, void *__envp)
1871{
1872	struct user_arg_ptr argv = { .ptr.native = __argv };
1873	struct user_arg_ptr envp = { .ptr.native = __envp };
1874
1875	return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
1876}
1877
1878int do_execve(struct filename *filename,
1879	const char __user *const __user *__argv,
1880	const char __user *const __user *__envp)
1881{
1882	struct user_arg_ptr argv = { .ptr.native = __argv };
1883	struct user_arg_ptr envp = { .ptr.native = __envp };
1884	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1885}
1886
1887int do_execveat(int fd, struct filename *filename,
1888		const char __user *const __user *__argv,
1889		const char __user *const __user *__envp,
1890		int flags)
1891{
1892	struct user_arg_ptr argv = { .ptr.native = __argv };
1893	struct user_arg_ptr envp = { .ptr.native = __envp };
1894
1895	return do_execveat_common(fd, filename, argv, envp, flags);
1896}
1897
1898#ifdef CONFIG_COMPAT
1899static int compat_do_execve(struct filename *filename,
1900	const compat_uptr_t __user *__argv,
1901	const compat_uptr_t __user *__envp)
1902{
1903	struct user_arg_ptr argv = {
1904		.is_compat = true,
1905		.ptr.compat = __argv,
1906	};
1907	struct user_arg_ptr envp = {
1908		.is_compat = true,
1909		.ptr.compat = __envp,
1910	};
1911	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1912}
1913
1914static int compat_do_execveat(int fd, struct filename *filename,
1915			      const compat_uptr_t __user *__argv,
1916			      const compat_uptr_t __user *__envp,
1917			      int flags)
1918{
1919	struct user_arg_ptr argv = {
1920		.is_compat = true,
1921		.ptr.compat = __argv,
1922	};
1923	struct user_arg_ptr envp = {
1924		.is_compat = true,
1925		.ptr.compat = __envp,
1926	};
1927	return do_execveat_common(fd, filename, argv, envp, flags);
1928}
1929#endif
1930
1931void set_binfmt(struct linux_binfmt *new)
1932{
1933	struct mm_struct *mm = current->mm;
1934
1935	if (mm->binfmt)
1936		module_put(mm->binfmt->module);
1937
1938	mm->binfmt = new;
1939	if (new)
1940		__module_get(new->module);
1941}
1942EXPORT_SYMBOL(set_binfmt);
1943
1944/*
1945 * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
1946 */
1947void set_dumpable(struct mm_struct *mm, int value)
1948{
1949	unsigned long old, new;
1950
1951	if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
1952		return;
1953
1954	do {
1955		old = READ_ONCE(mm->flags);
1956		new = (old & ~MMF_DUMPABLE_MASK) | value;
1957	} while (cmpxchg(&mm->flags, old, new) != old);
1958}
1959
1960SYSCALL_DEFINE3(execve,
1961		const char __user *, filename,
1962		const char __user *const __user *, argv,
1963		const char __user *const __user *, envp)
1964{
1965	return do_execve(getname(filename), argv, envp);
1966}
1967
1968SYSCALL_DEFINE5(execveat,
1969		int, fd, const char __user *, filename,
1970		const char __user *const __user *, argv,
1971		const char __user *const __user *, envp,
1972		int, flags)
1973{
1974	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
1975
1976	return do_execveat(fd,
1977			   getname_flags(filename, lookup_flags, NULL),
1978			   argv, envp, flags);
1979}
1980
1981#ifdef CONFIG_COMPAT
1982COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
1983	const compat_uptr_t __user *, argv,
1984	const compat_uptr_t __user *, envp)
1985{
1986	return compat_do_execve(getname(filename), argv, envp);
1987}
1988
1989COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
1990		       const char __user *, filename,
1991		       const compat_uptr_t __user *, argv,
1992		       const compat_uptr_t __user *, envp,
1993		       int,  flags)
1994{
1995	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
1996
1997	return compat_do_execveat(fd,
1998				  getname_flags(filename, lookup_flags, NULL),
1999				  argv, envp, flags);
2000}
2001#endif