mm: Fix boot crash in mm_alloc()

Thomas Gleixner reports that we now have a boot crash triggered by
CONFIG_CPUMASK_OFFSTACK=y:

BUG: unable to handle kernel NULL pointer dereference at (null)
IP: [<c11ae035>] find_next_bit+0x55/0xb0
Call Trace:
[<c11addda>] cpumask_any_but+0x2a/0x70
[<c102396b>] flush_tlb_mm+0x2b/0x80
[<c1022705>] pud_populate+0x35/0x50
[<c10227ba>] pgd_alloc+0x9a/0xf0
[<c103a3fc>] mm_init+0xec/0x120
[<c103a7a3>] mm_alloc+0x53/0xd0

which was introduced by commit de03c72cfce5 ("mm: convert
mm->cpu_vm_cpumask into cpumask_var_t"), and is due to wrong ordering of
mm_init() vs mm_init_cpumask().

Thomas wrote a patch to just fix the ordering of initialization, but I
hate the new double allocation in the fork() path, so I ended up instead
doing some more radical surgery to clean it all up: for the OFFSTACK
case the mm_struct now embeds a 'struct cpumask' of its own, and
mm_init_cpumask() simply points cpu_vm_mask_var at it. The separate
allocation (and its failure path, and the ordering constraint) is gone
entirely.
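
In outline (a minimal user-space model of the OFFSTACK case, compilable
on its own; the field and function names mirror the kernel's, everything
else is a simplified stand-in):

#include <stdio.h>
#include <string.h>

struct cpumask { unsigned long bits[4]; };

struct mm_struct {
	/* ... many other fields elided ... */
	struct cpumask *cpu_vm_mask_var;	/* what mm_cpumask() returns */
	struct cpumask cpumask_allocation;	/* embedded backing store */
};

static void mm_init_cpumask(struct mm_struct *mm)
{
	mm->cpu_vm_mask_var = &mm->cpumask_allocation;
}

static struct cpumask *mm_cpumask(struct mm_struct *mm)
{
	return mm->cpu_vm_mask_var;
}

int main(void)
{
	struct mm_struct mm;

	memset(&mm, 0, sizeof(mm));
	mm_init_cpumask(&mm);		/* cannot fail, safe right after memset() */
	mm_cpumask(&mm)->bits[0] = 1;	/* no NULL dereference possible */
	printf("first mask word: %lu\n", mm_cpumask(&mm)->bits[0]);
	return 0;
}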

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

---
 include/linux/mm_types.h |   14 ++++++++++++--
 include/linux/sched.h    |    1 -
 init/main.c              |    2 +-
 kernel/fork.c            |   42 ++++++++++--------------------------------
 4 files changed, 23 insertions(+), 36 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -264,6 +264,8 @@ struct mm_struct {
 
 	struct linux_binfmt *binfmt;
 
+	cpumask_var_t cpu_vm_mask_var;
+
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
@@ -313,9 +315,17 @@ struct mm_struct {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
 #endif
-
-	cpumask_var_t cpu_vm_mask_var;
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	struct cpumask cpumask_allocation;
+#endif
 };
 
+static inline void mm_init_cpumask(struct mm_struct *mm)
+{
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	mm->cpu_vm_mask_var = &mm->cpumask_allocation;
+#endif
+}
+
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
 static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
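
For reference, cpumask_var_t comes in two flavours (abbreviated from
<linux/cpumask.h>; the comments are mine). With CONFIG_CPUMASK_OFFSTACK=n
it is a one-element array, so the mask storage always travels with the
variable and mm_init_cpumask() above compiles away to nothing:

#ifdef CONFIG_CPUMASK_OFFSTACK
typedef struct cpumask *cpumask_var_t;		/* must be pointed at storage before use */
#else
typedef struct cpumask cpumask_var_t[1];	/* storage is part of the variable */
#endif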
diff --git a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2194,7 +2194,6 @@ static inline void mmdrop(struct mm_struct * mm)
 	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
 		__mmdrop(mm);
 }
-extern int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm);
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
diff --git a/init/main.c b/init/main.c
--- a/init/main.c
+++ b/init/main.c
@@ -487,6 +487,7 @@ asmlinkage void __init start_kernel(void)
 	printk(KERN_NOTICE "%s", linux_banner);
 	setup_arch(&command_line);
 	mm_init_owner(&init_mm, &init_task);
+	mm_init_cpumask(&init_mm);
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
@@ -511,7 +512,6 @@ asmlinkage void __init start_kernel(void)
 	sort_main_extable();
 	trap_init();
 	mm_init();
-	BUG_ON(mm_init_cpumask(&init_mm, 0));
 
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -484,20 +484,6 @@
 #endif
 }
 
-int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
-		return -ENOMEM;
-
-	if (oldmm)
-		cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
-	else
-		memset(mm_cpumask(mm), 0, cpumask_size());
-#endif
-	return 0;
-}
-
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
@@ -538,17 +524,8 @@ struct mm_struct * mm_alloc(void)
 		return NULL;
 
 	memset(mm, 0, sizeof(*mm));
-	mm = mm_init(mm, current);
-	if (!mm)
-		return NULL;
-
-	if (mm_init_cpumask(mm, NULL)) {
-		mm_free_pgd(mm);
-		free_mm(mm);
-		return NULL;
-	}
-
-	return mm;
+	mm_init_cpumask(mm);
+	return mm_init(mm, current);
 }
 
 /*
@@ -559,7 +536,6 @@
 void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
-	free_cpumask_var(mm->cpu_vm_mask_var);
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
@@ -753,6 +729,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 		goto fail_nomem;
 
 	memcpy(mm, oldmm, sizeof(*mm));
+	mm_init_cpumask(mm);
 
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
@@ -764,9 +741,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
-
-	if (mm_init_cpumask(mm, oldmm))
-		goto fail_nocpumask;
 
 	if (init_new_context(tsk, mm))
 		goto fail_nocontext;
@@ -794,9 +768,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	return NULL;
 
 fail_nocontext:
-	free_cpumask_var(mm->cpu_vm_mask_var);
-
-fail_nocpumask:
 	/*
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
@@ -1591,6 +1562,13 @@ void __init proc_caches_init(void)
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+	/*
+	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
+	 * whole struct cpumask for the OFFSTACK case. We could change
+	 * this to *only* allocate as much of it as required by the
+	 * maximum number of CPU's we can ever have. The cpumask_allocation
+	 * is at the end of the structure, exactly for that reason.
+	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);