Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86-32, percpu: Correct the ordering of the percpu readmostly section
x86, mm: Enable ARCH_DMA_ADDR_T_64BIT with X86_64 || HIGHMEM64G
x86: Spread tlb flush vector between nodes
percpu: Introduce a read-mostly percpu API
x86, mm: Fix incorrect data type in vmalloc_sync_all()
x86, mm: Hold mm->page_table_lock while doing vmalloc_sync
x86, mm: Fix bogus whitespace in sync_global_pgds()
x86-32: Fix sparse warning for the __PHYSICAL_MASK calculation
x86, mm: Add RESERVE_BRK_ARRAY() helper
mm, x86: Saving vmcore with non-lazy freeing of vmas
x86, kdump: Change copy_oldmem_page() to use cached addressing
x86, mm: fix uninitialized addr in kernel_physical_mapping_init()
x86, kmemcheck: Remove double test
x86, mm: Make spurious_fault check explicitly check the PRESENT bit
x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes
x86, mm: Separate x86_64 vmalloc_sync_all() into separate functions
x86, mm: Avoid unnecessary TLB flush

+174 -34
+3
arch/x86/Kconfig
···
 config ARCH_PHYS_ADDR_T_64BIT
         def_bool X86_64 || X86_PAE
 
+config ARCH_DMA_ADDR_T_64BIT
+        def_bool X86_64 || HIGHMEM64G
+
 config DIRECT_GBPAGES
         bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
         default y
+1
arch/x86/include/asm/io.h
···
 
 extern void iounmap(volatile void __iomem *addr);
 
+extern void set_iounmap_nonlazy(void);
 
 #ifdef __KERNEL__
 
+1 -1
arch/x86/include/asm/page_types.h
···
 #define PAGE_SIZE       (_AC(1,UL) << PAGE_SHIFT)
 #define PAGE_MASK       (~(PAGE_SIZE-1))
 
-#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
 #define __VIRTUAL_MASK  ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
 
 /* Cast PAGE_MASK to a signed type so that it is sign-extended if
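For context (not part of the diff): on a 32-bit non-PAE build, where __PHYSICAL_MASK_SHIFT is 32 and phys_addr_t is only 32 bits wide, the old form casts 1ULL << 32 down to phys_addr_t before subtracting, and that truncating cast of a non-zero constant is the kind of thing sparse warns about. Doing the subtraction in 64 bits first and casting the finished mask gives the same value without the warning. A worked illustration (the non-PAE widths here are an assumption stated for the example):

        /*
         * old: (phys_addr_t)(1ULL << 32) - 1   ->  (u32)0x100000000 - 1
         *      = 0 - 1 = 0xffffffff            (right value, but the cast drops bits)
         * new: (phys_addr_t)((1ULL << 32) - 1) ->  (u32)0xffffffff
         *      = 0xffffffff                    (no set bits lost in the cast)
         */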
+4
arch/x86/include/asm/pgtable.h
···
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
···
         clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
         pte_update(mm, addr, ptep);
 }
+
+#define flush_tlb_fix_spurious_fault(vma, address)
 
 /*
  * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+2
arch/x86/include/asm/pgtable_64.h
···
         native_set_pgd(pgd, native_make_pgd(0));
 }
 
+extern void sync_global_pgds(unsigned long start, unsigned long end);
+
 /*
  * Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
+5
arch/x86/include/asm/setup.h
··· 93 93 : : "i" (sz)); \ 94 94 } 95 95 96 + /* Helper for reserving space for arrays of things */ 97 + #define RESERVE_BRK_ARRAY(type, name, entries) \ 98 + type *name; \ 99 + RESERVE_BRK(name, sizeof(type) * entries) 100 + 96 101 #ifdef __i386__ 97 102 98 103 void __init i386_start_kernel(void);
+2 -1
arch/x86/kernel/crash_dump_64.c
···
         if (!csize)
                 return 0;
 
-        vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+        vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
         if (!vaddr)
                 return -ENOMEM;
 
···
         } else
                 memcpy(buf, vaddr + offset, csize);
 
+        set_iounmap_nonlazy();
         iounmap(vaddr);
         return csize;
 }
+18 -25
arch/x86/mm/fault.c
···
 
         spin_lock_irqsave(&pgd_lock, flags);
         list_for_each_entry(page, &pgd_list, lru) {
-                if (!vmalloc_sync_one(page_address(page), address))
+                spinlock_t *pgt_lock;
+                pmd_t *ret;
+
+                pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+                spin_lock(pgt_lock);
+                ret = vmalloc_sync_one(page_address(page), address);
+                spin_unlock(pgt_lock);
+
+                if (!ret)
                         break;
         }
         spin_unlock_irqrestore(&pgd_lock, flags);
···
 
 void vmalloc_sync_all(void)
 {
-        unsigned long address;
-
-        for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
-             address += PGDIR_SIZE) {
-
-                const pgd_t *pgd_ref = pgd_offset_k(address);
-                unsigned long flags;
-                struct page *page;
-
-                if (pgd_none(*pgd_ref))
-                        continue;
-
-                spin_lock_irqsave(&pgd_lock, flags);
-                list_for_each_entry(page, &pgd_list, lru) {
-                        pgd_t *pgd;
-                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
-                        if (pgd_none(*pgd))
-                                set_pgd(pgd, *pgd_ref);
-                        else
-                                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-                }
-                spin_unlock_irqrestore(&pgd_lock, flags);
-        }
+        sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
 }
 
 /*
···
         if (pmd_large(*pmd))
                 return spurious_fault_check(error_code, (pte_t *) pmd);
 
+        /*
+         * Note: don't use pte_present() here, since it returns true
+         * if the _PAGE_PROTNONE bit is set.  However, this aliases the
+         * _PAGE_GLOBAL bit, which for kernel pages give false positives
+         * when CONFIG_DEBUG_PAGEALLOC is used.
+         */
         pte = pte_offset_kernel(pmd, address);
-        if (!pte_present(*pte))
+        if (!(pte_flags(*pte) & _PAGE_PRESENT))
                 return 0;
 
         ret = spurious_fault_check(error_code, pte);
+46 -1
arch/x86/mm/init_64.c
··· 98 98 __setup("noexec32=", nonx32_setup); 99 99 100 100 /* 101 + * When memory was added/removed make sure all the processes MM have 102 + * suitable PGD entries in the local PGD level page. 103 + */ 104 + void sync_global_pgds(unsigned long start, unsigned long end) 105 + { 106 + unsigned long address; 107 + 108 + for (address = start; address <= end; address += PGDIR_SIZE) { 109 + const pgd_t *pgd_ref = pgd_offset_k(address); 110 + unsigned long flags; 111 + struct page *page; 112 + 113 + if (pgd_none(*pgd_ref)) 114 + continue; 115 + 116 + spin_lock_irqsave(&pgd_lock, flags); 117 + list_for_each_entry(page, &pgd_list, lru) { 118 + pgd_t *pgd; 119 + spinlock_t *pgt_lock; 120 + 121 + pgd = (pgd_t *)page_address(page) + pgd_index(address); 122 + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; 123 + spin_lock(pgt_lock); 124 + 125 + if (pgd_none(*pgd)) 126 + set_pgd(pgd, *pgd_ref); 127 + else 128 + BUG_ON(pgd_page_vaddr(*pgd) 129 + != pgd_page_vaddr(*pgd_ref)); 130 + 131 + spin_unlock(pgt_lock); 132 + } 133 + spin_unlock_irqrestore(&pgd_lock, flags); 134 + } 135 + } 136 + 137 + /* 101 138 * NOTE: This function is marked __ref because it calls __init function 102 139 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 103 140 */ ··· 571 534 unsigned long end, 572 535 unsigned long page_size_mask) 573 536 { 574 - 537 + bool pgd_changed = false; 575 538 unsigned long next, last_map_addr = end; 539 + unsigned long addr; 576 540 577 541 start = (unsigned long)__va(start); 578 542 end = (unsigned long)__va(end); 543 + addr = start; 579 544 580 545 for (; start < end; start = next) { 581 546 pgd_t *pgd = pgd_offset_k(start); ··· 602 563 spin_lock(&init_mm.page_table_lock); 603 564 pgd_populate(&init_mm, pgd, __va(pud_phys)); 604 565 spin_unlock(&init_mm.page_table_lock); 566 + pgd_changed = true; 605 567 } 568 + 569 + if (pgd_changed) 570 + sync_global_pgds(addr, end); 571 + 606 572 __flush_tlb_all(); 607 573 608 574 return last_map_addr; ··· 1047 1003 } 1048 1004 1049 1005 } 1006 + sync_global_pgds((unsigned long)start_page, end); 1050 1007 return 0; 1051 1008 } 1052 1009
+1 -1
arch/x86/mm/kmemcheck/opcode.c
···
                 b == 0xf0 || b == 0xf2 || b == 0xf3
                 /* Group 2 */
                 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
-                || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
+                || b == 0x64 || b == 0x65
                 /* Group 3 */
                 || b == 0x66
                 /* Group 4 */
+17 -3
arch/x86/mm/pgtable.c
···
 #define UNSHARED_PTRS_PER_PGD                           \
         (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-static void pgd_ctor(pgd_t *pgd)
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+        BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+        virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+        return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 {
         /* If the pgd points to a shared pagetable level (either the
            ptes in non-PAE, or shared PMD in PAE), then just copy the
···
         }
 
         /* list required to sync kernel mapping updates */
-        if (!SHARED_KERNEL_PMD)
+        if (!SHARED_KERNEL_PMD) {
+                pgd_set_mm(pgd, mm);
                 pgd_list_add(pgd);
+        }
 }
 
 static void pgd_dtor(pgd_t *pgd)
···
          */
         spin_lock_irqsave(&pgd_lock, flags);
 
-        pgd_ctor(pgd);
+        pgd_ctor(mm, pgd);
         pgd_prepopulate_pmd(mm, pgd, pmds);
 
         spin_unlock_irqrestore(&pgd_lock, flags);
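The mm pointer stashed in the pgd page's ->index field is what lets the sync code find the right mm's page_table_lock; the fault.c and init_64.c hunks in this merge use the new accessor like this (condensed from those hunks):

        spinlock_t *pgt_lock = &pgd_page_get_mm(page)->page_table_lock;

        spin_lock(pgt_lock);
        /* ... sync or validate this mm's kernel PGD entry ... */
        spin_unlock(pgt_lock);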
+47 -1
arch/x86/mm/tlb.c
··· 5 5 #include <linux/smp.h> 6 6 #include <linux/interrupt.h> 7 7 #include <linux/module.h> 8 + #include <linux/cpu.h> 8 9 9 10 #include <asm/tlbflush.h> 10 11 #include <asm/mmu_context.h> ··· 52 51 to a full cache line because other CPUs can access it and we don't 53 52 want false sharing in the per cpu data segment. */ 54 53 static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; 54 + 55 + static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); 55 56 56 57 /* 57 58 * We cannot call mmdrop() because we are in interrupt context, ··· 176 173 union smp_flush_state *f; 177 174 178 175 /* Caller has disabled preemption */ 179 - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 176 + sender = this_cpu_read(tlb_vector_offset); 180 177 f = &flush_state[sender]; 181 178 182 179 /* ··· 221 218 flush_tlb_others_ipi(cpumask, mm, va); 222 219 } 223 220 221 + static void __cpuinit calculate_tlb_offset(void) 222 + { 223 + int cpu, node, nr_node_vecs; 224 + /* 225 + * we are changing tlb_vector_offset for each CPU in runtime, but this 226 + * will not cause inconsistency, as the write is atomic under X86. we 227 + * might see more lock contentions in a short time, but after all CPU's 228 + * tlb_vector_offset are changed, everything should go normal 229 + * 230 + * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might 231 + * waste some vectors. 232 + **/ 233 + if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) 234 + nr_node_vecs = 1; 235 + else 236 + nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; 237 + 238 + for_each_online_node(node) { 239 + int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * 240 + nr_node_vecs; 241 + int cpu_offset = 0; 242 + for_each_cpu(cpu, cpumask_of_node(node)) { 243 + per_cpu(tlb_vector_offset, cpu) = node_offset + 244 + cpu_offset; 245 + cpu_offset++; 246 + cpu_offset = cpu_offset % nr_node_vecs; 247 + } 248 + } 249 + } 250 + 251 + static int tlb_cpuhp_notify(struct notifier_block *n, 252 + unsigned long action, void *hcpu) 253 + { 254 + switch (action & 0xf) { 255 + case CPU_ONLINE: 256 + case CPU_DEAD: 257 + calculate_tlb_offset(); 258 + } 259 + return NOTIFY_OK; 260 + } 261 + 224 262 static int __cpuinit init_smp_flush(void) 225 263 { 226 264 int i; ··· 269 225 for (i = 0; i < ARRAY_SIZE(flush_state); i++) 270 226 raw_spin_lock_init(&flush_state[i].tlbstate_lock); 271 227 228 + calculate_tlb_offset(); 229 + hotcpu_notifier(tlb_cpuhp_notify, 0); 272 230 return 0; 273 231 } 274 232 core_initcall(init_smp_flush);
+4
include/asm-generic/pgtable.h
···
 #define move_pte(pte, prot, old_addr, new_addr) (pte)
 #endif
 
+#ifndef flush_tlb_fix_spurious_fault
+#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
+#endif
+
 #ifndef pgprot_noncached
 #define pgprot_noncached(prot) (prot)
 #endif
+4
include/asm-generic/vmlinux.lds.h
···
                 - LOAD_OFFSET) {                                \
                 VMLINUX_SYMBOL(__per_cpu_start) = .;            \
                 *(.data..percpu..first)                         \
+                . = ALIGN(PAGE_SIZE);                           \
                 *(.data..percpu..page_aligned)                  \
+                *(.data..percpu..readmostly)                    \
                 *(.data..percpu)                                \
                 *(.data..percpu..shared_aligned)                \
                 VMLINUX_SYMBOL(__per_cpu_end) = .;              \
···
         VMLINUX_SYMBOL(__per_cpu_load) = .;                     \
         VMLINUX_SYMBOL(__per_cpu_start) = .;                    \
         *(.data..percpu..first)                                 \
+        . = ALIGN(PAGE_SIZE);                                   \
         *(.data..percpu..page_aligned)                          \
+        *(.data..percpu..readmostly)                            \
         *(.data..percpu)                                        \
         *(.data..percpu..shared_aligned)                        \
         VMLINUX_SYMBOL(__per_cpu_end) = .;                      \
+9
include/linux/percpu-defs.h
···
         __aligned(PAGE_SIZE)
 
 /*
+ * Declaration/definition used for per-CPU variables that must be read mostly.
+ */
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                 \
+        DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
+
+#define DEFINE_PER_CPU_READ_MOSTLY(type, name)                  \
+        DEFINE_PER_CPU_SECTION(type, name, "..readmostly")
+
+/*
  * Intermodule exports for per-CPU variables.  sparse forgets about
  * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to
  * noop if __CHECKER__.
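The new macros are used like the existing per-cpu definition macros; the first user is the TLB flush vector added in arch/x86/mm/tlb.c above (both lines below come from that hunk):

        /* Definition: the variable lands in .data..percpu..readmostly, keeping
         * it away from frequently written per-cpu data that readers on other
         * CPUs would otherwise false-share. */
        static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);

        /* Reads go through the normal per-cpu accessors: */
        sender = this_cpu_read(tlb_vector_offset);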
+1 -1
mm/memory.c
···
                  * with threads.
                  */
                 if (flags & FAULT_FLAG_WRITE)
-                        flush_tlb_page(vma, address);
+                        flush_tlb_fix_spurious_fault(vma, address);
         }
 unlock:
         pte_unmap_unlock(pte, ptl);
+9
mm/vmalloc.c
···
 static void purge_fragmented_blocks_allcpus(void);
 
 /*
+ * called before a call to iounmap() if the caller wants vm_area_struct's
+ * immediately freed.
+ */
+void set_iounmap_nonlazy(void)
+{
+        atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+}
+
+/*
  * Purges all lazily-freed vmap areas.
  *
  * If sync is 0 then don't purge if there is already a purge in progress.
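set_iounmap_nonlazy() works by pushing vmap_lazy_nr past the lazy_max_pages() threshold, so the next lazy free purges the vmap area immediately instead of batching it. The only caller added in this merge is the kdump copy path (see the arch/x86/kernel/crash_dump_64.c hunk above), roughly:

        set_iounmap_nonlazy();  /* force the following unmap to purge right away */
        iounmap(vaddr);         /* kdump unmaps one page per copy_oldmem_page() call */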