fs/exec.c at 609ee4679b8a0831257552dd2b0e54f509ba0c77 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / exec.c
at 609ee4679b8a0831257552dd2b0e54f509ba0c77 1810 lines 42 kB view raw
   1/*
   2 *  linux/fs/exec.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 * #!-checking implemented by tytso.
   9 */
  10/*
  11 * Demand-loading implemented 01.12.91 - no need to read anything but
  12 * the header into memory. The inode of the executable is put into
  13 * "current->executable", and page faults do the actual loading. Clean.
  14 *
  15 * Once more I can proudly say that linux stood up to being changed: it
  16 * was less than 2 hours work to get demand-loading completely implemented.
  17 *
  18 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
  19 * current->executable is only used by the procfs.  This allows a dispatch
  20 * table to check for several different types  of binary formats.  We keep
  21 * trying until we recognize the file or we run out of supported binary
  22 * formats. 
  23 */
  24
  25#include <linux/slab.h>
  26#include <linux/file.h>
  27#include <linux/mman.h>
  28#include <linux/a.out.h>
  29#include <linux/stat.h>
  30#include <linux/fcntl.h>
  31#include <linux/smp_lock.h>
  32#include <linux/init.h>
  33#include <linux/pagemap.h>
  34#include <linux/highmem.h>
  35#include <linux/spinlock.h>
  36#include <linux/key.h>
  37#include <linux/personality.h>
  38#include <linux/binfmts.h>
  39#include <linux/swap.h>
  40#include <linux/utsname.h>
  41#include <linux/pid_namespace.h>
  42#include <linux/module.h>
  43#include <linux/namei.h>
  44#include <linux/proc_fs.h>
  45#include <linux/ptrace.h>
  46#include <linux/mount.h>
  47#include <linux/security.h>
  48#include <linux/syscalls.h>
  49#include <linux/rmap.h>
  50#include <linux/tsacct_kern.h>
  51#include <linux/cn_proc.h>
  52#include <linux/audit.h>
  53#include <linux/signalfd.h>
  54
  55#include <asm/uaccess.h>
  56#include <asm/mmu_context.h>
  57#include <asm/tlb.h>
  58
  59#ifdef CONFIG_KMOD
  60#include <linux/kmod.h>
  61#endif
  62
  63int core_uses_pid;
  64char core_pattern[CORENAME_MAX_SIZE] = "core";
  65int suid_dumpable = 0;
  66
  67EXPORT_SYMBOL(suid_dumpable);
  68/* The maximal length of core_pattern is also specified in sysctl.c */
  69
  70static struct linux_binfmt *formats;
  71static DEFINE_RWLOCK(binfmt_lock);
  72
  73int register_binfmt(struct linux_binfmt * fmt)
  74{
  75	struct linux_binfmt ** tmp = &formats;
  76
  77	if (!fmt)
  78		return -EINVAL;
  79	if (fmt->next)
  80		return -EBUSY;
  81	write_lock(&binfmt_lock);
  82	while (*tmp) {
  83		if (fmt == *tmp) {
  84			write_unlock(&binfmt_lock);
  85			return -EBUSY;
  86		}
  87		tmp = &(*tmp)->next;
  88	}
  89	fmt->next = formats;
  90	formats = fmt;
  91	write_unlock(&binfmt_lock);
  92	return 0;	
  93}
  94
  95EXPORT_SYMBOL(register_binfmt);
  96
  97int unregister_binfmt(struct linux_binfmt * fmt)
  98{
  99	struct linux_binfmt ** tmp = &formats;
 100
 101	write_lock(&binfmt_lock);
 102	while (*tmp) {
 103		if (fmt == *tmp) {
 104			*tmp = fmt->next;
 105			fmt->next = NULL;
 106			write_unlock(&binfmt_lock);
 107			return 0;
 108		}
 109		tmp = &(*tmp)->next;
 110	}
 111	write_unlock(&binfmt_lock);
 112	return -EINVAL;
 113}
 114
 115EXPORT_SYMBOL(unregister_binfmt);
 116
 117static inline void put_binfmt(struct linux_binfmt * fmt)
 118{
 119	module_put(fmt->module);
 120}
 121
 122/*
 123 * Note that a shared library must be both readable and executable due to
 124 * security reasons.
 125 *
 126 * Also note that we take the address to load from from the file itself.
 127 */
 128asmlinkage long sys_uselib(const char __user * library)
 129{
 130	struct file * file;
 131	struct nameidata nd;
 132	int error;
 133
 134	error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
 135	if (error)
 136		goto out;
 137
 138	error = -EACCES;
 139	if (nd.mnt->mnt_flags & MNT_NOEXEC)
 140		goto exit;
 141	error = -EINVAL;
 142	if (!S_ISREG(nd.dentry->d_inode->i_mode))
 143		goto exit;
 144
 145	error = vfs_permission(&nd, MAY_READ | MAY_EXEC);
 146	if (error)
 147		goto exit;
 148
 149	file = nameidata_to_filp(&nd, O_RDONLY);
 150	error = PTR_ERR(file);
 151	if (IS_ERR(file))
 152		goto out;
 153
 154	error = -ENOEXEC;
 155	if(file->f_op) {
 156		struct linux_binfmt * fmt;
 157
 158		read_lock(&binfmt_lock);
 159		for (fmt = formats ; fmt ; fmt = fmt->next) {
 160			if (!fmt->load_shlib)
 161				continue;
 162			if (!try_module_get(fmt->module))
 163				continue;
 164			read_unlock(&binfmt_lock);
 165			error = fmt->load_shlib(file);
 166			read_lock(&binfmt_lock);
 167			put_binfmt(fmt);
 168			if (error != -ENOEXEC)
 169				break;
 170		}
 171		read_unlock(&binfmt_lock);
 172	}
 173	fput(file);
 174out:
 175  	return error;
 176exit:
 177	release_open_intent(&nd);
 178	path_release(&nd);
 179	goto out;
 180}
 181
 182#ifdef CONFIG_MMU
 183
 184static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 185		int write)
 186{
 187	struct page *page;
 188	int ret;
 189
 190#ifdef CONFIG_STACK_GROWSUP
 191	if (write) {
 192		ret = expand_stack_downwards(bprm->vma, pos);
 193		if (ret < 0)
 194			return NULL;
 195	}
 196#endif
 197	ret = get_user_pages(current, bprm->mm, pos,
 198			1, write, 1, &page, NULL);
 199	if (ret <= 0)
 200		return NULL;
 201
 202	if (write) {
 203		struct rlimit *rlim = current->signal->rlim;
 204		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
 205
 206		/*
 207		 * Limit to 1/4-th the stack size for the argv+env strings.
 208		 * This ensures that:
 209		 *  - the remaining binfmt code will not run out of stack space,
 210		 *  - the program will have a reasonable amount of stack left
 211		 *    to work from.
 212		 */
 213		if (size > rlim[RLIMIT_STACK].rlim_cur / 4) {
 214			put_page(page);
 215			return NULL;
 216		}
 217	}
 218
 219	return page;
 220}
 221
 222static void put_arg_page(struct page *page)
 223{
 224	put_page(page);
 225}
 226
 227static void free_arg_page(struct linux_binprm *bprm, int i)
 228{
 229}
 230
 231static void free_arg_pages(struct linux_binprm *bprm)
 232{
 233}
 234
 235static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 236		struct page *page)
 237{
 238	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
 239}
 240
 241static int __bprm_mm_init(struct linux_binprm *bprm)
 242{
 243	int err = -ENOMEM;
 244	struct vm_area_struct *vma = NULL;
 245	struct mm_struct *mm = bprm->mm;
 246
 247	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 248	if (!vma)
 249		goto err;
 250
 251	down_write(&mm->mmap_sem);
 252	vma->vm_mm = mm;
 253
 254	/*
 255	 * Place the stack at the largest stack address the architecture
 256	 * supports. Later, we'll move this to an appropriate place. We don't
 257	 * use STACK_TOP because that can depend on attributes which aren't
 258	 * configured yet.
 259	 */
 260	vma->vm_end = STACK_TOP_MAX;
 261	vma->vm_start = vma->vm_end - PAGE_SIZE;
 262
 263	vma->vm_flags = VM_STACK_FLAGS;
 264	vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
 265	err = insert_vm_struct(mm, vma);
 266	if (err) {
 267		up_write(&mm->mmap_sem);
 268		goto err;
 269	}
 270
 271	mm->stack_vm = mm->total_vm = 1;
 272	up_write(&mm->mmap_sem);
 273
 274	bprm->p = vma->vm_end - sizeof(void *);
 275
 276	return 0;
 277
 278err:
 279	if (vma) {
 280		bprm->vma = NULL;
 281		kmem_cache_free(vm_area_cachep, vma);
 282	}
 283
 284	return err;
 285}
 286
 287static bool valid_arg_len(struct linux_binprm *bprm, long len)
 288{
 289	return len <= MAX_ARG_STRLEN;
 290}
 291
 292#else
 293
 294static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 295		int write)
 296{
 297	struct page *page;
 298
 299	page = bprm->page[pos / PAGE_SIZE];
 300	if (!page && write) {
 301		page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
 302		if (!page)
 303			return NULL;
 304		bprm->page[pos / PAGE_SIZE] = page;
 305	}
 306
 307	return page;
 308}
 309
 310static void put_arg_page(struct page *page)
 311{
 312}
 313
 314static void free_arg_page(struct linux_binprm *bprm, int i)
 315{
 316	if (bprm->page[i]) {
 317		__free_page(bprm->page[i]);
 318		bprm->page[i] = NULL;
 319	}
 320}
 321
 322static void free_arg_pages(struct linux_binprm *bprm)
 323{
 324	int i;
 325
 326	for (i = 0; i < MAX_ARG_PAGES; i++)
 327		free_arg_page(bprm, i);
 328}
 329
 330static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 331		struct page *page)
 332{
 333}
 334
 335static int __bprm_mm_init(struct linux_binprm *bprm)
 336{
 337	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
 338	return 0;
 339}
 340
 341static bool valid_arg_len(struct linux_binprm *bprm, long len)
 342{
 343	return len <= bprm->p;
 344}
 345
 346#endif /* CONFIG_MMU */
 347
 348/*
 349 * Create a new mm_struct and populate it with a temporary stack
 350 * vm_area_struct.  We don't have enough context at this point to set the stack
 351 * flags, permissions, and offset, so we use temporary values.  We'll update
 352 * them later in setup_arg_pages().
 353 */
 354int bprm_mm_init(struct linux_binprm *bprm)
 355{
 356	int err;
 357	struct mm_struct *mm = NULL;
 358
 359	bprm->mm = mm = mm_alloc();
 360	err = -ENOMEM;
 361	if (!mm)
 362		goto err;
 363
 364	err = init_new_context(current, mm);
 365	if (err)
 366		goto err;
 367
 368	err = __bprm_mm_init(bprm);
 369	if (err)
 370		goto err;
 371
 372	return 0;
 373
 374err:
 375	if (mm) {
 376		bprm->mm = NULL;
 377		mmdrop(mm);
 378	}
 379
 380	return err;
 381}
 382
 383/*
 384 * count() counts the number of strings in array ARGV.
 385 */
 386static int count(char __user * __user * argv, int max)
 387{
 388	int i = 0;
 389
 390	if (argv != NULL) {
 391		for (;;) {
 392			char __user * p;
 393
 394			if (get_user(p, argv))
 395				return -EFAULT;
 396			if (!p)
 397				break;
 398			argv++;
 399			if(++i > max)
 400				return -E2BIG;
 401			cond_resched();
 402		}
 403	}
 404	return i;
 405}
 406
 407/*
 408 * 'copy_strings()' copies argument/environment strings from the old
 409 * processes's memory to the new process's stack.  The call to get_user_pages()
 410 * ensures the destination page is created and not swapped out.
 411 */
 412static int copy_strings(int argc, char __user * __user * argv,
 413			struct linux_binprm *bprm)
 414{
 415	struct page *kmapped_page = NULL;
 416	char *kaddr = NULL;
 417	unsigned long kpos = 0;
 418	int ret;
 419
 420	while (argc-- > 0) {
 421		char __user *str;
 422		int len;
 423		unsigned long pos;
 424
 425		if (get_user(str, argv+argc) ||
 426				!(len = strnlen_user(str, MAX_ARG_STRLEN))) {
 427			ret = -EFAULT;
 428			goto out;
 429		}
 430
 431		if (!valid_arg_len(bprm, len)) {
 432			ret = -E2BIG;
 433			goto out;
 434		}
 435
 436		/* We're going to work our way backwords. */
 437		pos = bprm->p;
 438		str += len;
 439		bprm->p -= len;
 440
 441		while (len > 0) {
 442			int offset, bytes_to_copy;
 443
 444			offset = pos % PAGE_SIZE;
 445			if (offset == 0)
 446				offset = PAGE_SIZE;
 447
 448			bytes_to_copy = offset;
 449			if (bytes_to_copy > len)
 450				bytes_to_copy = len;
 451
 452			offset -= bytes_to_copy;
 453			pos -= bytes_to_copy;
 454			str -= bytes_to_copy;
 455			len -= bytes_to_copy;
 456
 457			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
 458				struct page *page;
 459
 460				page = get_arg_page(bprm, pos, 1);
 461				if (!page) {
 462					ret = -E2BIG;
 463					goto out;
 464				}
 465
 466				if (kmapped_page) {
 467					flush_kernel_dcache_page(kmapped_page);
 468					kunmap(kmapped_page);
 469					put_arg_page(kmapped_page);
 470				}
 471				kmapped_page = page;
 472				kaddr = kmap(kmapped_page);
 473				kpos = pos & PAGE_MASK;
 474				flush_arg_page(bprm, kpos, kmapped_page);
 475			}
 476			if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
 477				ret = -EFAULT;
 478				goto out;
 479			}
 480		}
 481	}
 482	ret = 0;
 483out:
 484	if (kmapped_page) {
 485		flush_kernel_dcache_page(kmapped_page);
 486		kunmap(kmapped_page);
 487		put_arg_page(kmapped_page);
 488	}
 489	return ret;
 490}
 491
 492/*
 493 * Like copy_strings, but get argv and its values from kernel memory.
 494 */
 495int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
 496{
 497	int r;
 498	mm_segment_t oldfs = get_fs();
 499	set_fs(KERNEL_DS);
 500	r = copy_strings(argc, (char __user * __user *)argv, bprm);
 501	set_fs(oldfs);
 502	return r;
 503}
 504EXPORT_SYMBOL(copy_strings_kernel);
 505
 506#ifdef CONFIG_MMU
 507
 508/*
 509 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
 510 * the binfmt code determines where the new stack should reside, we shift it to
 511 * its final location.  The process proceeds as follows:
 512 *
 513 * 1) Use shift to calculate the new vma endpoints.
 514 * 2) Extend vma to cover both the old and new ranges.  This ensures the
 515 *    arguments passed to subsequent functions are consistent.
 516 * 3) Move vma's page tables to the new range.
 517 * 4) Free up any cleared pgd range.
 518 * 5) Shrink the vma to cover only the new range.
 519 */
 520static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 521{
 522	struct mm_struct *mm = vma->vm_mm;
 523	unsigned long old_start = vma->vm_start;
 524	unsigned long old_end = vma->vm_end;
 525	unsigned long length = old_end - old_start;
 526	unsigned long new_start = old_start - shift;
 527	unsigned long new_end = old_end - shift;
 528	struct mmu_gather *tlb;
 529
 530	BUG_ON(new_start > new_end);
 531
 532	/*
 533	 * ensure there are no vmas between where we want to go
 534	 * and where we are
 535	 */
 536	if (vma != find_vma(mm, new_start))
 537		return -EFAULT;
 538
 539	/*
 540	 * cover the whole range: [new_start, old_end)
 541	 */
 542	vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL);
 543
 544	/*
 545	 * move the page tables downwards, on failure we rely on
 546	 * process cleanup to remove whatever mess we made.
 547	 */
 548	if (length != move_page_tables(vma, old_start,
 549				       vma, new_start, length))
 550		return -ENOMEM;
 551
 552	lru_add_drain();
 553	tlb = tlb_gather_mmu(mm, 0);
 554	if (new_end > old_start) {
 555		/*
 556		 * when the old and new regions overlap clear from new_end.
 557		 */
 558		free_pgd_range(&tlb, new_end, old_end, new_end,
 559			vma->vm_next ? vma->vm_next->vm_start : 0);
 560	} else {
 561		/*
 562		 * otherwise, clean from old_start; this is done to not touch
 563		 * the address space in [new_end, old_start) some architectures
 564		 * have constraints on va-space that make this illegal (IA64) -
 565		 * for the others its just a little faster.
 566		 */
 567		free_pgd_range(&tlb, old_start, old_end, new_end,
 568			vma->vm_next ? vma->vm_next->vm_start : 0);
 569	}
 570	tlb_finish_mmu(tlb, new_end, old_end);
 571
 572	/*
 573	 * shrink the vma to just the new range.
 574	 */
 575	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
 576
 577	return 0;
 578}
 579
 580#define EXTRA_STACK_VM_PAGES	20	/* random */
 581
 582/*
 583 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 584 * the stack is optionally relocated, and some extra space is added.
 585 */
 586int setup_arg_pages(struct linux_binprm *bprm,
 587		    unsigned long stack_top,
 588		    int executable_stack)
 589{
 590	unsigned long ret;
 591	unsigned long stack_shift;
 592	struct mm_struct *mm = current->mm;
 593	struct vm_area_struct *vma = bprm->vma;
 594	struct vm_area_struct *prev = NULL;
 595	unsigned long vm_flags;
 596	unsigned long stack_base;
 597
 598#ifdef CONFIG_STACK_GROWSUP
 599	/* Limit stack size to 1GB */
 600	stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max;
 601	if (stack_base > (1 << 30))
 602		stack_base = 1 << 30;
 603
 604	/* Make sure we didn't let the argument array grow too large. */
 605	if (vma->vm_end - vma->vm_start > stack_base)
 606		return -ENOMEM;
 607
 608	stack_base = PAGE_ALIGN(stack_top - stack_base);
 609
 610	stack_shift = vma->vm_start - stack_base;
 611	mm->arg_start = bprm->p - stack_shift;
 612	bprm->p = vma->vm_end - stack_shift;
 613#else
 614	stack_top = arch_align_stack(stack_top);
 615	stack_top = PAGE_ALIGN(stack_top);
 616	stack_shift = vma->vm_end - stack_top;
 617
 618	bprm->p -= stack_shift;
 619	mm->arg_start = bprm->p;
 620#endif
 621
 622	if (bprm->loader)
 623		bprm->loader -= stack_shift;
 624	bprm->exec -= stack_shift;
 625
 626	down_write(&mm->mmap_sem);
 627	vm_flags = vma->vm_flags;
 628
 629	/*
 630	 * Adjust stack execute permissions; explicitly enable for
 631	 * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
 632	 * (arch default) otherwise.
 633	 */
 634	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
 635		vm_flags |= VM_EXEC;
 636	else if (executable_stack == EXSTACK_DISABLE_X)
 637		vm_flags &= ~VM_EXEC;
 638	vm_flags |= mm->def_flags;
 639
 640	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
 641			vm_flags);
 642	if (ret)
 643		goto out_unlock;
 644	BUG_ON(prev != vma);
 645
 646	/* Move stack pages down in memory. */
 647	if (stack_shift) {
 648		ret = shift_arg_pages(vma, stack_shift);
 649		if (ret) {
 650			up_write(&mm->mmap_sem);
 651			return ret;
 652		}
 653	}
 654
 655#ifdef CONFIG_STACK_GROWSUP
 656	stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
 657#else
 658	stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
 659#endif
 660	ret = expand_stack(vma, stack_base);
 661	if (ret)
 662		ret = -EFAULT;
 663
 664out_unlock:
 665	up_write(&mm->mmap_sem);
 666	return 0;
 667}
 668EXPORT_SYMBOL(setup_arg_pages);
 669
 670#endif /* CONFIG_MMU */
 671
 672struct file *open_exec(const char *name)
 673{
 674	struct nameidata nd;
 675	int err;
 676	struct file *file;
 677
 678	err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
 679	file = ERR_PTR(err);
 680
 681	if (!err) {
 682		struct inode *inode = nd.dentry->d_inode;
 683		file = ERR_PTR(-EACCES);
 684		if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
 685		    S_ISREG(inode->i_mode)) {
 686			int err = vfs_permission(&nd, MAY_EXEC);
 687			file = ERR_PTR(err);
 688			if (!err) {
 689				file = nameidata_to_filp(&nd, O_RDONLY);
 690				if (!IS_ERR(file)) {
 691					err = deny_write_access(file);
 692					if (err) {
 693						fput(file);
 694						file = ERR_PTR(err);
 695					}
 696				}
 697out:
 698				return file;
 699			}
 700		}
 701		release_open_intent(&nd);
 702		path_release(&nd);
 703	}
 704	goto out;
 705}
 706
 707EXPORT_SYMBOL(open_exec);
 708
 709int kernel_read(struct file *file, unsigned long offset,
 710	char *addr, unsigned long count)
 711{
 712	mm_segment_t old_fs;
 713	loff_t pos = offset;
 714	int result;
 715
 716	old_fs = get_fs();
 717	set_fs(get_ds());
 718	/* The cast to a user pointer is valid due to the set_fs() */
 719	result = vfs_read(file, (void __user *)addr, count, &pos);
 720	set_fs(old_fs);
 721	return result;
 722}
 723
 724EXPORT_SYMBOL(kernel_read);
 725
 726static int exec_mmap(struct mm_struct *mm)
 727{
 728	struct task_struct *tsk;
 729	struct mm_struct * old_mm, *active_mm;
 730
 731	/* Notify parent that we're no longer interested in the old VM */
 732	tsk = current;
 733	old_mm = current->mm;
 734	mm_release(tsk, old_mm);
 735
 736	if (old_mm) {
 737		/*
 738		 * Make sure that if there is a core dump in progress
 739		 * for the old mm, we get out and die instead of going
 740		 * through with the exec.  We must hold mmap_sem around
 741		 * checking core_waiters and changing tsk->mm.  The
 742		 * core-inducing thread will increment core_waiters for
 743		 * each thread whose ->mm == old_mm.
 744		 */
 745		down_read(&old_mm->mmap_sem);
 746		if (unlikely(old_mm->core_waiters)) {
 747			up_read(&old_mm->mmap_sem);
 748			return -EINTR;
 749		}
 750	}
 751	task_lock(tsk);
 752	active_mm = tsk->active_mm;
 753	tsk->mm = mm;
 754	tsk->active_mm = mm;
 755	activate_mm(active_mm, mm);
 756	task_unlock(tsk);
 757	arch_pick_mmap_layout(mm);
 758	if (old_mm) {
 759		up_read(&old_mm->mmap_sem);
 760		BUG_ON(active_mm != old_mm);
 761		mmput(old_mm);
 762		return 0;
 763	}
 764	mmdrop(active_mm);
 765	return 0;
 766}
 767
 768/*
 769 * This function makes sure the current process has its own signal table,
 770 * so that flush_signal_handlers can later reset the handlers without
 771 * disturbing other processes.  (Other processes might share the signal
 772 * table via the CLONE_SIGHAND option to clone().)
 773 */
 774static int de_thread(struct task_struct *tsk)
 775{
 776	struct signal_struct *sig = tsk->signal;
 777	struct sighand_struct *newsighand, *oldsighand = tsk->sighand;
 778	spinlock_t *lock = &oldsighand->siglock;
 779	struct task_struct *leader = NULL;
 780	int count;
 781
 782	/*
 783	 * If we don't share sighandlers, then we aren't sharing anything
 784	 * and we can just re-use it all.
 785	 */
 786	if (atomic_read(&oldsighand->count) <= 1) {
 787		signalfd_detach(tsk);
 788		exit_itimers(sig);
 789		return 0;
 790	}
 791
 792	newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
 793	if (!newsighand)
 794		return -ENOMEM;
 795
 796	if (thread_group_empty(tsk))
 797		goto no_thread_group;
 798
 799	/*
 800	 * Kill all other threads in the thread group.
 801	 * We must hold tasklist_lock to call zap_other_threads.
 802	 */
 803	read_lock(&tasklist_lock);
 804	spin_lock_irq(lock);
 805	if (sig->flags & SIGNAL_GROUP_EXIT) {
 806		/*
 807		 * Another group action in progress, just
 808		 * return so that the signal is processed.
 809		 */
 810		spin_unlock_irq(lock);
 811		read_unlock(&tasklist_lock);
 812		kmem_cache_free(sighand_cachep, newsighand);
 813		return -EAGAIN;
 814	}
 815
 816	/*
 817	 * child_reaper ignores SIGKILL, change it now.
 818	 * Reparenting needs write_lock on tasklist_lock,
 819	 * so it is safe to do it under read_lock.
 820	 */
 821	if (unlikely(tsk->group_leader == child_reaper(tsk)))
 822		tsk->nsproxy->pid_ns->child_reaper = tsk;
 823
 824	zap_other_threads(tsk);
 825	read_unlock(&tasklist_lock);
 826
 827	/*
 828	 * Account for the thread group leader hanging around:
 829	 */
 830	count = 1;
 831	if (!thread_group_leader(tsk)) {
 832		count = 2;
 833		/*
 834		 * The SIGALRM timer survives the exec, but needs to point
 835		 * at us as the new group leader now.  We have a race with
 836		 * a timer firing now getting the old leader, so we need to
 837		 * synchronize with any firing (by calling del_timer_sync)
 838		 * before we can safely let the old group leader die.
 839		 */
 840		sig->tsk = tsk;
 841		spin_unlock_irq(lock);
 842		if (hrtimer_cancel(&sig->real_timer))
 843			hrtimer_restart(&sig->real_timer);
 844		spin_lock_irq(lock);
 845	}
 846	while (atomic_read(&sig->count) > count) {
 847		sig->group_exit_task = tsk;
 848		sig->notify_count = count;
 849		__set_current_state(TASK_UNINTERRUPTIBLE);
 850		spin_unlock_irq(lock);
 851		schedule();
 852		spin_lock_irq(lock);
 853	}
 854	sig->group_exit_task = NULL;
 855	sig->notify_count = 0;
 856	spin_unlock_irq(lock);
 857
 858	/*
 859	 * At this point all other threads have exited, all we have to
 860	 * do is to wait for the thread group leader to become inactive,
 861	 * and to assume its PID:
 862	 */
 863	if (!thread_group_leader(tsk)) {
 864		/*
 865		 * Wait for the thread group leader to be a zombie.
 866		 * It should already be zombie at this point, most
 867		 * of the time.
 868		 */
 869		leader = tsk->group_leader;
 870		while (leader->exit_state != EXIT_ZOMBIE)
 871			yield();
 872
 873		/*
 874		 * The only record we have of the real-time age of a
 875		 * process, regardless of execs it's done, is start_time.
 876		 * All the past CPU time is accumulated in signal_struct
 877		 * from sister threads now dead.  But in this non-leader
 878		 * exec, nothing survives from the original leader thread,
 879		 * whose birth marks the true age of this process now.
 880		 * When we take on its identity by switching to its PID, we
 881		 * also take its birthdate (always earlier than our own).
 882		 */
 883		tsk->start_time = leader->start_time;
 884
 885		write_lock_irq(&tasklist_lock);
 886
 887		BUG_ON(leader->tgid != tsk->tgid);
 888		BUG_ON(tsk->pid == tsk->tgid);
 889		/*
 890		 * An exec() starts a new thread group with the
 891		 * TGID of the previous thread group. Rehash the
 892		 * two threads with a switched PID, and release
 893		 * the former thread group leader:
 894		 */
 895
 896		/* Become a process group leader with the old leader's pid.
 897		 * The old leader becomes a thread of the this thread group.
 898		 * Note: The old leader also uses this pid until release_task
 899		 *       is called.  Odd but simple and correct.
 900		 */
 901		detach_pid(tsk, PIDTYPE_PID);
 902		tsk->pid = leader->pid;
 903		attach_pid(tsk, PIDTYPE_PID,  find_pid(tsk->pid));
 904		transfer_pid(leader, tsk, PIDTYPE_PGID);
 905		transfer_pid(leader, tsk, PIDTYPE_SID);
 906		list_replace_rcu(&leader->tasks, &tsk->tasks);
 907
 908		tsk->group_leader = tsk;
 909		leader->group_leader = tsk;
 910
 911		tsk->exit_signal = SIGCHLD;
 912
 913		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 914		leader->exit_state = EXIT_DEAD;
 915
 916		write_unlock_irq(&tasklist_lock);
 917        }
 918
 919	/*
 920	 * There may be one thread left which is just exiting,
 921	 * but it's safe to stop telling the group to kill themselves.
 922	 */
 923	sig->flags = 0;
 924
 925no_thread_group:
 926	signalfd_detach(tsk);
 927	exit_itimers(sig);
 928	if (leader)
 929		release_task(leader);
 930
 931	if (atomic_read(&oldsighand->count) == 1) {
 932		/*
 933		 * Now that we nuked the rest of the thread group,
 934		 * it turns out we are not sharing sighand any more either.
 935		 * So we can just keep it.
 936		 */
 937		kmem_cache_free(sighand_cachep, newsighand);
 938	} else {
 939		/*
 940		 * Move our state over to newsighand and switch it in.
 941		 */
 942		atomic_set(&newsighand->count, 1);
 943		memcpy(newsighand->action, oldsighand->action,
 944		       sizeof(newsighand->action));
 945
 946		write_lock_irq(&tasklist_lock);
 947		spin_lock(&oldsighand->siglock);
 948		spin_lock_nested(&newsighand->siglock, SINGLE_DEPTH_NESTING);
 949
 950		rcu_assign_pointer(tsk->sighand, newsighand);
 951		recalc_sigpending();
 952
 953		spin_unlock(&newsighand->siglock);
 954		spin_unlock(&oldsighand->siglock);
 955		write_unlock_irq(&tasklist_lock);
 956
 957		__cleanup_sighand(oldsighand);
 958	}
 959
 960	BUG_ON(!thread_group_leader(tsk));
 961	return 0;
 962}
 963	
/*
 * These functions flush out all traces of the currently running executable
 * so that a new one can be started.
 */
 968
 969static void flush_old_files(struct files_struct * files)
 970{
 971	long j = -1;
 972	struct fdtable *fdt;
 973
 974	spin_lock(&files->file_lock);
 975	for (;;) {
 976		unsigned long set, i;
 977
 978		j++;
 979		i = j * __NFDBITS;
 980		fdt = files_fdtable(files);
 981		if (i >= fdt->max_fds)
 982			break;
 983		set = fdt->close_on_exec->fds_bits[j];
 984		if (!set)
 985			continue;
 986		fdt->close_on_exec->fds_bits[j] = 0;
 987		spin_unlock(&files->file_lock);
 988		for ( ; set ; i++,set >>= 1) {
 989			if (set & 1) {
 990				sys_close(i);
 991			}
 992		}
 993		spin_lock(&files->file_lock);
 994
 995	}
 996	spin_unlock(&files->file_lock);
 997}
 998
 999void get_task_comm(char *buf, struct task_struct *tsk)
1000{
1001	/* buf must be at least sizeof(tsk->comm) in size */
1002	task_lock(tsk);
1003	strncpy(buf, tsk->comm, sizeof(tsk->comm));
1004	task_unlock(tsk);
1005}
1006
1007void set_task_comm(struct task_struct *tsk, char *buf)
1008{
1009	task_lock(tsk);
1010	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
1011	task_unlock(tsk);
1012}
1013
1014int flush_old_exec(struct linux_binprm * bprm)
1015{
1016	char * name;
1017	int i, ch, retval;
1018	struct files_struct *files;
1019	char tcomm[sizeof(current->comm)];
1020
1021	/*
1022	 * Make sure we have a private signal table and that
1023	 * we are unassociated from the previous thread group.
1024	 */
1025	retval = de_thread(current);
1026	if (retval)
1027		goto out;
1028
1029	/*
1030	 * Make sure we have private file handles. Ask the
1031	 * fork helper to do the work for us and the exit
1032	 * helper to do the cleanup of the old one.
1033	 */
1034	files = current->files;		/* refcounted so safe to hold */
1035	retval = unshare_files();
1036	if (retval)
1037		goto out;
1038	/*
1039	 * Release all of the old mmap stuff
1040	 */
1041	retval = exec_mmap(bprm->mm);
1042	if (retval)
1043		goto mmap_failed;
1044
1045	bprm->mm = NULL;		/* We're using it now */
1046
1047	/* This is the point of no return */
1048	put_files_struct(files);
1049
1050	current->sas_ss_sp = current->sas_ss_size = 0;
1051
1052	if (current->euid == current->uid && current->egid == current->gid)
1053		set_dumpable(current->mm, 1);
1054	else
1055		set_dumpable(current->mm, suid_dumpable);
1056
1057	name = bprm->filename;
1058
1059	/* Copies the binary name from after last slash */
1060	for (i=0; (ch = *(name++)) != '\0';) {
1061		if (ch == '/')
1062			i = 0; /* overwrite what we wrote */
1063		else
1064			if (i < (sizeof(tcomm) - 1))
1065				tcomm[i++] = ch;
1066	}
1067	tcomm[i] = '\0';
1068	set_task_comm(current, tcomm);
1069
1070	current->flags &= ~PF_RANDOMIZE;
1071	flush_thread();
1072
1073	/* Set the new mm task size. We have to do that late because it may
1074	 * depend on TIF_32BIT which is only updated in flush_thread() on
1075	 * some architectures like powerpc
1076	 */
1077	current->mm->task_size = TASK_SIZE;
1078
1079	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid) {
1080		suid_keys(current);
1081		set_dumpable(current->mm, suid_dumpable);
1082		current->pdeath_signal = 0;
1083	} else if (file_permission(bprm->file, MAY_READ) ||
1084			(bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
1085		suid_keys(current);
1086		set_dumpable(current->mm, suid_dumpable);
1087	}
1088
1089	/* An exec changes our domain. We are no longer part of the thread
1090	   group */
1091
1092	current->self_exec_id++;
1093			
1094	flush_signal_handlers(current, 0);
1095	flush_old_files(current->files);
1096
1097	return 0;
1098
1099mmap_failed:
1100	reset_files_struct(current, files);
1101out:
1102	return retval;
1103}
1104
1105EXPORT_SYMBOL(flush_old_exec);
1106
1107/* 
1108 * Fill the binprm structure from the inode. 
1109 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1110 */
1111int prepare_binprm(struct linux_binprm *bprm)
1112{
1113	int mode;
1114	struct inode * inode = bprm->file->f_path.dentry->d_inode;
1115	int retval;
1116
1117	mode = inode->i_mode;
1118	if (bprm->file->f_op == NULL)
1119		return -EACCES;
1120
1121	bprm->e_uid = current->euid;
1122	bprm->e_gid = current->egid;
1123
1124	if(!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
1125		/* Set-uid? */
1126		if (mode & S_ISUID) {
1127			current->personality &= ~PER_CLEAR_ON_SETID;
1128			bprm->e_uid = inode->i_uid;
1129		}
1130
1131		/* Set-gid? */
1132		/*
1133		 * If setgid is set but no group execute bit then this
1134		 * is a candidate for mandatory locking, not a setgid
1135		 * executable.
1136		 */
1137		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1138			current->personality &= ~PER_CLEAR_ON_SETID;
1139			bprm->e_gid = inode->i_gid;
1140		}
1141	}
1142
1143	/* fill in binprm security blob */
1144	retval = security_bprm_set(bprm);
1145	if (retval)
1146		return retval;
1147
1148	memset(bprm->buf,0,BINPRM_BUF_SIZE);
1149	return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE);
1150}
1151
1152EXPORT_SYMBOL(prepare_binprm);
1153
1154static int unsafe_exec(struct task_struct *p)
1155{
1156	int unsafe = 0;
1157	if (p->ptrace & PT_PTRACED) {
1158		if (p->ptrace & PT_PTRACE_CAP)
1159			unsafe |= LSM_UNSAFE_PTRACE_CAP;
1160		else
1161			unsafe |= LSM_UNSAFE_PTRACE;
1162	}
1163	if (atomic_read(&p->fs->count) > 1 ||
1164	    atomic_read(&p->files->count) > 1 ||
1165	    atomic_read(&p->sighand->count) > 1)
1166		unsafe |= LSM_UNSAFE_SHARE;
1167
1168	return unsafe;
1169}
1170
1171void compute_creds(struct linux_binprm *bprm)
1172{
1173	int unsafe;
1174
1175	if (bprm->e_uid != current->uid) {
1176		suid_keys(current);
1177		current->pdeath_signal = 0;
1178	}
1179	exec_keys(current);
1180
1181	task_lock(current);
1182	unsafe = unsafe_exec(current);
1183	security_bprm_apply_creds(bprm, unsafe);
1184	task_unlock(current);
1185	security_bprm_post_apply_creds(bprm);
1186}
1187EXPORT_SYMBOL(compute_creds);
1188
1189/*
1190 * Arguments are '\0' separated strings found at the location bprm->p
1191 * points to; chop off the first by relocating brpm->p to right after
1192 * the first '\0' encountered.
1193 */
1194int remove_arg_zero(struct linux_binprm *bprm)
1195{
1196	int ret = 0;
1197	unsigned long offset;
1198	char *kaddr;
1199	struct page *page;
1200
1201	if (!bprm->argc)
1202		return 0;
1203
1204	do {
1205		offset = bprm->p & ~PAGE_MASK;
1206		page = get_arg_page(bprm, bprm->p, 0);
1207		if (!page) {
1208			ret = -EFAULT;
1209			goto out;
1210		}
1211		kaddr = kmap_atomic(page, KM_USER0);
1212
1213		for (; offset < PAGE_SIZE && kaddr[offset];
1214				offset++, bprm->p++)
1215			;
1216
1217		kunmap_atomic(kaddr, KM_USER0);
1218		put_arg_page(page);
1219
1220		if (offset == PAGE_SIZE)
1221			free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
1222	} while (offset == PAGE_SIZE);
1223
1224	bprm->p++;
1225	bprm->argc--;
1226	ret = 0;
1227
1228out:
1229	return ret;
1230}
1231EXPORT_SYMBOL(remove_arg_zero);
1232
1233/*
1234 * cycle the list of binary formats handler, until one recognizes the image
1235 */
1236int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1237{
1238	int try,retval;
1239	struct linux_binfmt *fmt;
1240#ifdef __alpha__
1241	/* handle /sbin/loader.. */
1242	{
1243	    struct exec * eh = (struct exec *) bprm->buf;
1244
1245	    if (!bprm->loader && eh->fh.f_magic == 0x183 &&
1246		(eh->fh.f_flags & 0x3000) == 0x3000)
1247	    {
1248		struct file * file;
1249		unsigned long loader;
1250
1251		allow_write_access(bprm->file);
1252		fput(bprm->file);
1253		bprm->file = NULL;
1254
1255		loader = bprm->vma->vm_end - sizeof(void *);
1256
1257		file = open_exec("/sbin/loader");
1258		retval = PTR_ERR(file);
1259		if (IS_ERR(file))
1260			return retval;
1261
1262		/* Remember if the application is TASO.  */
1263		bprm->sh_bang = eh->ah.entry < 0x100000000UL;
1264
1265		bprm->file = file;
1266		bprm->loader = loader;
1267		retval = prepare_binprm(bprm);
1268		if (retval<0)
1269			return retval;
1270		/* should call search_binary_handler recursively here,
1271		   but it does not matter */
1272	    }
1273	}
1274#endif
1275	retval = security_bprm_check(bprm);
1276	if (retval)
1277		return retval;
1278
1279	/* kernel module loader fixup */
1280	/* so we don't try to load run modprobe in kernel space. */
1281	set_fs(USER_DS);
1282
1283	retval = audit_bprm(bprm);
1284	if (retval)
1285		return retval;
1286
1287	retval = -ENOENT;
1288	for (try=0; try<2; try++) {
1289		read_lock(&binfmt_lock);
1290		for (fmt = formats ; fmt ; fmt = fmt->next) {
1291			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
1292			if (!fn)
1293				continue;
1294			if (!try_module_get(fmt->module))
1295				continue;
1296			read_unlock(&binfmt_lock);
1297			retval = fn(bprm, regs);
1298			if (retval >= 0) {
1299				put_binfmt(fmt);
1300				allow_write_access(bprm->file);
1301				if (bprm->file)
1302					fput(bprm->file);
1303				bprm->file = NULL;
1304				current->did_exec = 1;
1305				proc_exec_connector(current);
1306				return retval;
1307			}
1308			read_lock(&binfmt_lock);
1309			put_binfmt(fmt);
1310			if (retval != -ENOEXEC || bprm->mm == NULL)
1311				break;
1312			if (!bprm->file) {
1313				read_unlock(&binfmt_lock);
1314				return retval;
1315			}
1316		}
1317		read_unlock(&binfmt_lock);
1318		if (retval != -ENOEXEC || bprm->mm == NULL) {
1319			break;
1320#ifdef CONFIG_KMOD
1321		}else{
1322#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1323			if (printable(bprm->buf[0]) &&
1324			    printable(bprm->buf[1]) &&
1325			    printable(bprm->buf[2]) &&
1326			    printable(bprm->buf[3]))
1327				break; /* -ENOEXEC */
1328			request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
1329#endif
1330		}
1331	}
1332	return retval;
1333}
1334
1335EXPORT_SYMBOL(search_binary_handler);
1336
1337/*
1338 * sys_execve() executes a new program.
1339 */
1340int do_execve(char * filename,
1341	char __user *__user *argv,
1342	char __user *__user *envp,
1343	struct pt_regs * regs)
1344{
1345	struct linux_binprm *bprm;
1346	struct file *file;
1347	unsigned long env_p;
1348	int retval;
1349
1350	retval = -ENOMEM;
1351	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1352	if (!bprm)
1353		goto out_ret;
1354
1355	file = open_exec(filename);
1356	retval = PTR_ERR(file);
1357	if (IS_ERR(file))
1358		goto out_kfree;
1359
1360	sched_exec();
1361
1362	bprm->file = file;
1363	bprm->filename = filename;
1364	bprm->interp = filename;
1365
1366	retval = bprm_mm_init(bprm);
1367	if (retval)
1368		goto out_file;
1369
1370	bprm->argc = count(argv, MAX_ARG_STRINGS);
1371	if ((retval = bprm->argc) < 0)
1372		goto out_mm;
1373
1374	bprm->envc = count(envp, MAX_ARG_STRINGS);
1375	if ((retval = bprm->envc) < 0)
1376		goto out_mm;
1377
1378	retval = security_bprm_alloc(bprm);
1379	if (retval)
1380		goto out;
1381
1382	retval = prepare_binprm(bprm);
1383	if (retval < 0)
1384		goto out;
1385
1386	retval = copy_strings_kernel(1, &bprm->filename, bprm);
1387	if (retval < 0)
1388		goto out;
1389
1390	bprm->exec = bprm->p;
1391	retval = copy_strings(bprm->envc, envp, bprm);
1392	if (retval < 0)
1393		goto out;
1394
1395	env_p = bprm->p;
1396	retval = copy_strings(bprm->argc, argv, bprm);
1397	if (retval < 0)
1398		goto out;
1399	bprm->argv_len = env_p - bprm->p;
1400
1401	retval = search_binary_handler(bprm,regs);
1402	if (retval >= 0) {
1403		/* execve success */
1404		free_arg_pages(bprm);
1405		security_bprm_free(bprm);
1406		acct_update_integrals(current);
1407		kfree(bprm);
1408		return retval;
1409	}
1410
1411out:
1412	free_arg_pages(bprm);
1413	if (bprm->security)
1414		security_bprm_free(bprm);
1415
1416out_mm:
1417	if (bprm->mm)
1418		mmput (bprm->mm);
1419
1420out_file:
1421	if (bprm->file) {
1422		allow_write_access(bprm->file);
1423		fput(bprm->file);
1424	}
1425out_kfree:
1426	kfree(bprm);
1427
1428out_ret:
1429	return retval;
1430}
1431
1432int set_binfmt(struct linux_binfmt *new)
1433{
1434	struct linux_binfmt *old = current->binfmt;
1435
1436	if (new) {
1437		if (!try_module_get(new->module))
1438			return -1;
1439	}
1440	current->binfmt = new;
1441	if (old)
1442		module_put(old->module);
1443	return 0;
1444}
1445
1446EXPORT_SYMBOL(set_binfmt);
1447
1448/* format_corename will inspect the pattern parameter, and output a
1449 * name into corename, which must have space for at least
1450 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
1451 */
1452static int format_corename(char *corename, const char *pattern, long signr)
1453{
1454	const char *pat_ptr = pattern;
1455	char *out_ptr = corename;
1456	char *const out_end = corename + CORENAME_MAX_SIZE;
1457	int rc;
1458	int pid_in_pattern = 0;
1459	int ispipe = 0;
1460
1461	if (*pattern == '|')
1462		ispipe = 1;
1463
1464	/* Repeat as long as we have more pattern to process and more output
1465	   space */
1466	while (*pat_ptr) {
1467		if (*pat_ptr != '%') {
1468			if (out_ptr == out_end)
1469				goto out;
1470			*out_ptr++ = *pat_ptr++;
1471		} else {
1472			switch (*++pat_ptr) {
1473			case 0:
1474				goto out;
1475			/* Double percent, output one percent */
1476			case '%':
1477				if (out_ptr == out_end)
1478					goto out;
1479				*out_ptr++ = '%';
1480				break;
1481			/* pid */
1482			case 'p':
1483				pid_in_pattern = 1;
1484				rc = snprintf(out_ptr, out_end - out_ptr,
1485					      "%d", current->tgid);
1486				if (rc > out_end - out_ptr)
1487					goto out;
1488				out_ptr += rc;
1489				break;
1490			/* uid */
1491			case 'u':
1492				rc = snprintf(out_ptr, out_end - out_ptr,
1493					      "%d", current->uid);
1494				if (rc > out_end - out_ptr)
1495					goto out;
1496				out_ptr += rc;
1497				break;
1498			/* gid */
1499			case 'g':
1500				rc = snprintf(out_ptr, out_end - out_ptr,
1501					      "%d", current->gid);
1502				if (rc > out_end - out_ptr)
1503					goto out;
1504				out_ptr += rc;
1505				break;
1506			/* signal that caused the coredump */
1507			case 's':
1508				rc = snprintf(out_ptr, out_end - out_ptr,
1509					      "%ld", signr);
1510				if (rc > out_end - out_ptr)
1511					goto out;
1512				out_ptr += rc;
1513				break;
1514			/* UNIX time of coredump */
1515			case 't': {
1516				struct timeval tv;
1517				do_gettimeofday(&tv);
1518				rc = snprintf(out_ptr, out_end - out_ptr,
1519					      "%lu", tv.tv_sec);
1520				if (rc > out_end - out_ptr)
1521					goto out;
1522				out_ptr += rc;
1523				break;
1524			}
1525			/* hostname */
1526			case 'h':
1527				down_read(&uts_sem);
1528				rc = snprintf(out_ptr, out_end - out_ptr,
1529					      "%s", utsname()->nodename);
1530				up_read(&uts_sem);
1531				if (rc > out_end - out_ptr)
1532					goto out;
1533				out_ptr += rc;
1534				break;
1535			/* executable */
1536			case 'e':
1537				rc = snprintf(out_ptr, out_end - out_ptr,
1538					      "%s", current->comm);
1539				if (rc > out_end - out_ptr)
1540					goto out;
1541				out_ptr += rc;
1542				break;
1543			default:
1544				break;
1545			}
1546			++pat_ptr;
1547		}
1548	}
1549	/* Backward compatibility with core_uses_pid:
1550	 *
1551	 * If core_pattern does not include a %p (as is the default)
1552	 * and core_uses_pid is set, then .%pid will be appended to
1553	 * the filename. Do not do this for piped commands. */
1554	if (!ispipe && !pid_in_pattern
1555            && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
1556		rc = snprintf(out_ptr, out_end - out_ptr,
1557			      ".%d", current->tgid);
1558		if (rc > out_end - out_ptr)
1559			goto out;
1560		out_ptr += rc;
1561	}
1562out:
1563	*out_ptr = 0;
1564	return ispipe;
1565}
1566
1567static void zap_process(struct task_struct *start)
1568{
1569	struct task_struct *t;
1570
1571	start->signal->flags = SIGNAL_GROUP_EXIT;
1572	start->signal->group_stop_count = 0;
1573
1574	t = start;
1575	do {
1576		if (t != current && t->mm) {
1577			t->mm->core_waiters++;
1578			sigaddset(&t->pending.signal, SIGKILL);
1579			signal_wake_up(t, 1);
1580		}
1581	} while ((t = next_thread(t)) != start);
1582}
1583
1584static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1585				int exit_code)
1586{
1587	struct task_struct *g, *p;
1588	unsigned long flags;
1589	int err = -EAGAIN;
1590
1591	spin_lock_irq(&tsk->sighand->siglock);
1592	if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
1593		tsk->signal->group_exit_code = exit_code;
1594		zap_process(tsk);
1595		err = 0;
1596	}
1597	spin_unlock_irq(&tsk->sighand->siglock);
1598	if (err)
1599		return err;
1600
1601	if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
1602		goto done;
1603
1604	rcu_read_lock();
1605	for_each_process(g) {
1606		if (g == tsk->group_leader)
1607			continue;
1608
1609		p = g;
1610		do {
1611			if (p->mm) {
1612				if (p->mm == mm) {
1613					/*
1614					 * p->sighand can't disappear, but
1615					 * may be changed by de_thread()
1616					 */
1617					lock_task_sighand(p, &flags);
1618					zap_process(p);
1619					unlock_task_sighand(p, &flags);
1620				}
1621				break;
1622			}
1623		} while ((p = next_thread(p)) != g);
1624	}
1625	rcu_read_unlock();
1626done:
1627	return mm->core_waiters;
1628}
1629
1630static int coredump_wait(int exit_code)
1631{
1632	struct task_struct *tsk = current;
1633	struct mm_struct *mm = tsk->mm;
1634	struct completion startup_done;
1635	struct completion *vfork_done;
1636	int core_waiters;
1637
1638	init_completion(&mm->core_done);
1639	init_completion(&startup_done);
1640	mm->core_startup_done = &startup_done;
1641
1642	core_waiters = zap_threads(tsk, mm, exit_code);
1643	up_write(&mm->mmap_sem);
1644
1645	if (unlikely(core_waiters < 0))
1646		goto fail;
1647
1648	/*
1649	 * Make sure nobody is waiting for us to release the VM,
1650	 * otherwise we can deadlock when we wait on each other
1651	 */
1652	vfork_done = tsk->vfork_done;
1653	if (vfork_done) {
1654		tsk->vfork_done = NULL;
1655		complete(vfork_done);
1656	}
1657
1658	if (core_waiters)
1659		wait_for_completion(&startup_done);
1660fail:
1661	BUG_ON(mm->core_waiters);
1662	return core_waiters;
1663}
1664
1665/*
1666 * set_dumpable converts traditional three-value dumpable to two flags and
1667 * stores them into mm->flags.  It modifies lower two bits of mm->flags, but
1668 * these bits are not changed atomically.  So get_dumpable can observe the
1669 * intermediate state.  To avoid doing unexpected behavior, get get_dumpable
1670 * return either old dumpable or new one by paying attention to the order of
1671 * modifying the bits.
1672 *
1673 * dumpable |   mm->flags (binary)
1674 * old  new | initial interim  final
1675 * ---------+-----------------------
1676 *  0    1  |   00      01      01
1677 *  0    2  |   00      10(*)   11
1678 *  1    0  |   01      00      00
1679 *  1    2  |   01      11      11
1680 *  2    0  |   11      10(*)   00
1681 *  2    1  |   11      11      01
1682 *
1683 * (*) get_dumpable regards interim value of 10 as 11.
1684 */
1685void set_dumpable(struct mm_struct *mm, int value)
1686{
1687	switch (value) {
1688	case 0:
1689		clear_bit(MMF_DUMPABLE, &mm->flags);
1690		smp_wmb();
1691		clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1692		break;
1693	case 1:
1694		set_bit(MMF_DUMPABLE, &mm->flags);
1695		smp_wmb();
1696		clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1697		break;
1698	case 2:
1699		set_bit(MMF_DUMP_SECURELY, &mm->flags);
1700		smp_wmb();
1701		set_bit(MMF_DUMPABLE, &mm->flags);
1702		break;
1703	}
1704}
1705EXPORT_SYMBOL_GPL(set_dumpable);
1706
1707int get_dumpable(struct mm_struct *mm)
1708{
1709	int ret;
1710
1711	ret = mm->flags & 0x3;
1712	return (ret >= 2) ? 2 : ret;
1713}
1714
1715int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1716{
1717	char corename[CORENAME_MAX_SIZE + 1];
1718	struct mm_struct *mm = current->mm;
1719	struct linux_binfmt * binfmt;
1720	struct inode * inode;
1721	struct file * file;
1722	int retval = 0;
1723	int fsuid = current->fsuid;
1724	int flag = 0;
1725	int ispipe = 0;
1726
1727	audit_core_dumps(signr);
1728
1729	binfmt = current->binfmt;
1730	if (!binfmt || !binfmt->core_dump)
1731		goto fail;
1732	down_write(&mm->mmap_sem);
1733	if (!get_dumpable(mm)) {
1734		up_write(&mm->mmap_sem);
1735		goto fail;
1736	}
1737
1738	/*
1739	 *	We cannot trust fsuid as being the "true" uid of the
1740	 *	process nor do we know its entire history. We only know it
1741	 *	was tainted so we dump it as root in mode 2.
1742	 */
1743	if (get_dumpable(mm) == 2) {	/* Setuid core dump mode */
1744		flag = O_EXCL;		/* Stop rewrite attacks */
1745		current->fsuid = 0;	/* Dump root private */
1746	}
1747	set_dumpable(mm, 0);
1748
1749	retval = coredump_wait(exit_code);
1750	if (retval < 0)
1751		goto fail;
1752
1753	/*
1754	 * Clear any false indication of pending signals that might
1755	 * be seen by the filesystem code called to write the core file.
1756	 */
1757	clear_thread_flag(TIF_SIGPENDING);
1758
1759	if (current->signal->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
1760		goto fail_unlock;
1761
1762	/*
1763	 * lock_kernel() because format_corename() is controlled by sysctl, which
1764	 * uses lock_kernel()
1765	 */
1766 	lock_kernel();
1767	ispipe = format_corename(corename, core_pattern, signr);
1768	unlock_kernel();
1769 	if (ispipe) {
1770		/* SIGPIPE can happen, but it's just never processed */
1771 		if(call_usermodehelper_pipe(corename+1, NULL, NULL, &file)) {
1772 			printk(KERN_INFO "Core dump to %s pipe failed\n",
1773			       corename);
1774 			goto fail_unlock;
1775 		}
1776 	} else
1777 		file = filp_open(corename,
1778				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1779				 0600);
1780	if (IS_ERR(file))
1781		goto fail_unlock;
1782	inode = file->f_path.dentry->d_inode;
1783	if (inode->i_nlink > 1)
1784		goto close_fail;	/* multiple links - don't dump */
1785	if (!ispipe && d_unhashed(file->f_path.dentry))
1786		goto close_fail;
1787
1788	/* AK: actually i see no reason to not allow this for named pipes etc.,
1789	   but keep the previous behaviour for now. */
1790	if (!ispipe && !S_ISREG(inode->i_mode))
1791		goto close_fail;
1792	if (!file->f_op)
1793		goto close_fail;
1794	if (!file->f_op->write)
1795		goto close_fail;
1796	if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0)
1797		goto close_fail;
1798
1799	retval = binfmt->core_dump(signr, regs, file);
1800
1801	if (retval)
1802		current->signal->group_exit_code |= 0x80;
1803close_fail:
1804	filp_close(file, NULL);
1805fail_unlock:
1806	current->fsuid = fsuid;
1807	complete_all(&mm->core_done);
1808fail:
1809	return retval;
1810}