x86: construct 32-bit boot time page tables in native format.

Specifically, the boot-time page tables in a CONFIG_X86_PAE=y kernel are now
built in PAE format, rather than in the 2-level format used previously.
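For readers unfamiliar with the two formats, here is a minimal sketch (not part of the patch; the type and macro names are illustrative only) of how they differ in shape: the classic 2-level i386 layout uses 1024 32-bit entries per table, whereas PAE adds a 4-entry page directory pointer table and switches to 512 64-bit entries per page directory and page table.

/*
 * Illustrative sketch only -- not from this patch. The names below are
 * hypothetical; they just spell out the entry sizes and table geometry.
 */
#include <stdint.h>

/* Classic 2-level i386 format: 1024 x 32-bit entries per table. */
typedef uint32_t legacy_pde_t;		/* page directory entry */
typedef uint32_t legacy_pte_t;		/* page table entry */
#define LEGACY_PTRS_PER_TABLE	1024

/* PAE 3-level format: 4 x 64-bit PDPT entries at the top, then
 * 512 x 64-bit entries per page directory (PMD) and per page table. */
typedef uint64_t pae_pgd_t;		/* PDPT entry */
typedef uint64_t pae_pmd_t;		/* page directory entry */
typedef uint64_t pae_pte_t;		/* page table entry */
#define PAE_PTRS_PER_PGD	4
#define PAE_PTRS_PER_TABLE	512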

early_ioremap is updated to use the standard page table accessors.
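A minimal sketch, assuming the standard accessor API of this era (pgd_offset_k(), pud_offset(), pmd_offset(), pte_offset_kernel()), of the kind of walk early_ioremap now performs; the helper name is hypothetical. The point is that the pud/pmd levels fold away on non-PAE builds, so the same code handles both the 2-level and PAE layouts. Compare early_ioremap_pmd() in the arch/x86/mm/ioremap.c hunk below.

/*
 * Sketch (kernel context assumed, not a standalone program): walking from
 * the kernel page directory down to a PTE with the standard accessors
 * instead of open-coded ">> 22" arithmetic.
 */
static pte_t *walk_to_kernel_pte(unsigned long addr)
{
	pgd_t *pgd = pgd_offset_k(addr);	/* swapper_pg_dir + pgd_index(addr) */
	pud_t *pud = pud_offset(pgd, addr);	/* folds away without a PUD level */
	pmd_t *pmd = pmd_offset(pud, addr);	/* folds away on 2-level builds */

	return pte_offset_kernel(pmd, addr);
}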

Clear any mappings beyond max_low_pfn from the boot page tables in
native_pagetable_setup_start(), because the initial mappings can extend
beyond the end of physical memory and into the vmalloc area.
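A condensed C sketch of that clearing step (kernel context assumed; the helper name is hypothetical and the body mirrors the native_pagetable_setup_start() hunk in arch/x86/mm/init_32.c below):

/*
 * Condensed sketch of the clearing loop added to
 * native_pagetable_setup_start(); see the init_32.c hunk below.
 * "clear_mappings_past_ram" is a hypothetical name used only here.
 */
static void clear_mappings_past_ram(pgd_t *base)
{
	unsigned long pfn, va;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;

	for (pfn = max_low_pfn + 1; pfn < 1 << (32 - PAGE_SHIFT); pfn++) {
		va = PAGE_OFFSET + (pfn << PAGE_SHIFT);	/* lowmem virtual address */

		pgd = base + pgd_index(va);
		if (!pgd_present(*pgd))
			break;			/* ran past the boot-time mappings */

		pmd = pmd_offset(pud_offset(pgd, va), va);
		if (!pmd_present(*pmd))
			break;

		pte = pte_offset_kernel(pmd, va);
		if (!pte_present(*pte))
			break;

		pte_clear(NULL, va, pte);	/* unmap the page beyond max_low_pfn */
	}
}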

Derived from patches by Eric Biederman and H. Peter Anvin.

[ jeremy@goop.org: fix: PAE swapper_pg_dir needs to be page-sized ]

Signed-off-by: Ian Campbell <ijc@hellion.org.uk>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Mika Penttilä <mika.penttila@kolumbus.fi>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Authored by Ian Campbell and committed by Thomas Gleixner (551889a6, 185c045c)

6 files changed: +177 -108

arch/x86/kernel/head_32.S  (+116 -35)
···
 #include <asm/thread_info.h>
 #include <asm/asm-offsets.h>
 #include <asm/setup.h>
+#include <asm/processor-flags.h>
+
+/* Physical address */
+#define pa(X) ((X) - __PAGE_OFFSET)
 
 /*
  * References to members of the new_cpu_data structure.
···
  */
 .section .text.head,"ax",@progbits
 ENTRY(startup_32)
-	/* check to see if KEEP_SEGMENTS flag is meaningful */
-	cmpw $0x207, BP_version(%esi)
-	jb 1f
-
 	/* test KEEP_SEGMENTS flag to see if the bootloader is asking
 	   us to not reload segments */
 	testb $(1<<6), BP_loadflags(%esi)
···
 /*
  * Set segments to known values.
  */
-1:	lgdt boot_gdt_descr - __PAGE_OFFSET
+	lgdt pa(boot_gdt_descr)
 	movl $(__BOOT_DS),%eax
 	movl %eax,%ds
 	movl %eax,%es
···
  */
 	cld
 	xorl %eax,%eax
-	movl $__bss_start - __PAGE_OFFSET,%edi
-	movl $__bss_stop - __PAGE_OFFSET,%ecx
+	movl $pa(__bss_start),%edi
+	movl $pa(__bss_stop),%ecx
 	subl %edi,%ecx
 	shrl $2,%ecx
 	rep ; stosl
···
  * (kexec on panic case). Hence copy out the parameters before initializing
  * page tables.
  */
-	movl $(boot_params - __PAGE_OFFSET),%edi
+	movl $pa(boot_params),%edi
 	movl $(PARAM_SIZE/4),%ecx
 	cld
 	rep
 	movsl
-	movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
+	movl pa(boot_params) + NEW_CL_POINTER,%esi
 	andl %esi,%esi
 	jz 1f			# No comand line
-	movl $(boot_command_line - __PAGE_OFFSET),%edi
+	movl $pa(boot_command_line),%edi
 	movl $(COMMAND_LINE_SIZE/4),%ecx
 	rep
 	movsl
 1:
 
 #ifdef CONFIG_PARAVIRT
-	cmpw $0x207, (boot_params + BP_version - __PAGE_OFFSET)
+	/* This is can only trip for a broken bootloader... */
+	cmpw $0x207, pa(boot_params + BP_version)
 	jb default_entry
 
 	/* Paravirt-compatible boot parameters.  Look to see what architecture
 	   we're booting under. */
-	movl (boot_params + BP_hardware_subarch - __PAGE_OFFSET), %eax
+	movl pa(boot_params + BP_hardware_subarch), %eax
 	cmpl $num_subarch_entries, %eax
 	jae bad_subarch
 
-	movl subarch_entries - __PAGE_OFFSET(,%eax,4), %eax
+	movl pa(subarch_entries)(,%eax,4), %eax
 	subl $__PAGE_OFFSET, %eax
 	jmp *%eax
 
···
  * Mappings are created both at virtual address 0 (identity mapping)
  * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
  *
- * Warning: don't use %esi or the stack in this code.  However, %esp
- * can be used as a GPR if you really need it...
+ * Note that the stack is not yet set up!
  */
-page_pde_offset = (__PAGE_OFFSET >> 20);
+#define PTE_ATTR	0x007		/* PRESENT+RW+USER */
+#define PDE_ATTR	0x067		/* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PGD_ATTR	0x001		/* PRESENT (no other attributes) */
 
 default_entry:
-	movl $(pg0 - __PAGE_OFFSET), %edi
-	movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
-	movl $0x007, %eax		/* 0x007 = PRESENT+RW+USER */
+#ifdef CONFIG_X86_PAE
+
+	/*
+	 * In PAE mode swapper_pg_dir is statically defined to contain enough
+	 * entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+	 * entries). The identity mapping is handled by pointing two PGD
+	 * entries to the first kernel PMD.
+	 *
+	 * Note the upper half of each PMD or PTE are always zero at
+	 * this stage.
+	 */
+
+#define KPMDS ((0x100000000-__PAGE_OFFSET) >> 30)	/* Number of kernel PMDs */
+
+	xorl %ebx,%ebx				/* %ebx is kept at zero */
+
+	movl $pa(pg0), %edi
+	movl $pa(swapper_pg_pmd), %edx
+	movl $PTE_ATTR, %eax
 10:
-	leal 0x007(%edi),%ecx			/* Create PDE entry */
+	leal PDE_ATTR(%edi),%ecx		/* Create PMD entry */
+	movl %ecx,(%edx)			/* Store PMD entry */
+						/* Upper half already zero */
+	addl $8,%edx
+	movl $512,%ecx
+11:
+	stosl
+	xchgl %eax,%ebx
+	stosl
+	xchgl %eax,%ebx
+	addl $0x1000,%eax
+	loop 11b
+
+	/*
+	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
+	 * bytes beyond the end of our own page tables.
+	 */
+	leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
+	cmpl %ebp,%eax
+	jb 10b
+1:
+	movl %edi,pa(init_pg_tables_end)
+
+	/* Do early initialization of the fixmap area */
+	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+	movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
+#else	/* Not PAE */
+
+page_pde_offset = (__PAGE_OFFSET >> 20);
+
+	movl $pa(pg0), %edi
+	movl $pa(swapper_pg_dir), %edx
+	movl $PTE_ATTR, %eax
+10:
+	leal PDE_ATTR(%edi),%ecx		/* Create PDE entry */
 	movl %ecx,(%edx)			/* Store identity PDE entry */
 	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
 	addl $4,%edx
···
 	stosl
 	addl $0x1000,%eax
 	loop 11b
-	/* End condition: we must map up to and including INIT_MAP_BEYOND_END */
-	/* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
-	leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
+	/*
+	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
+	 * bytes beyond the end of our own page tables; the +0x007 is
+	 * the attribute bits
+	 */
+	leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
 	cmpl %ebp,%eax
 	jb 10b
-	movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
+	movl %edi,pa(init_pg_tables_end)
 
-	/* Do an early initialization of the fixmap area */
-	movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
-	movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
-	addl $0x67, %eax			/* 0x67 == _PAGE_TABLE */
-	movl %eax, 4092(%edx)
-
+	/* Do early initialization of the fixmap area */
+	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+	movl %eax,pa(swapper_pg_dir+0xffc)
+#endif
 	jmp 3f
 /*
  * Non-boot CPU entry point; entered from trampoline.S
···
  * NOTE! We have to correct for the fact that we're
  * not yet offset PAGE_OFFSET..
  */
-#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
+#define cr4_bits pa(mmu_cr4_features)
 	movl cr4_bits,%edx
 	andl %edx,%edx
 	jz 6f
···
 /*
  * Enable paging
  */
-	movl $swapper_pg_dir-__PAGE_OFFSET,%eax
+	movl $pa(swapper_pg_dir),%eax
 	movl %eax,%cr3		/* set the page table pointer.. */
 	movl %cr0,%eax
-	orl $0x80000000,%eax
+	orl $X86_CR0_PG,%eax
 	movl %eax,%cr0		/* ..and set paging (PG) bit */
 	ljmp $__BOOT_CS,$1f	/* Clear prefetch and normalize %eip */
 1:
···
  */
 .section ".bss.page_aligned","wa"
 	.align PAGE_SIZE_asm
+#ifdef CONFIG_X86_PAE
+ENTRY(swapper_pg_pmd)
+	.fill 1024*KPMDS,4,0
+#else
 ENTRY(swapper_pg_dir)
 	.fill 1024,4,0
-ENTRY(swapper_pg_pmd)
+#endif
+ENTRY(swapper_pg_fixmap)
 	.fill 1024,4,0
 ENTRY(empty_zero_page)
 	.fill 4096,1,0
-
 /*
  * This starts the data section.
  */
+#ifdef CONFIG_X86_PAE
+.section ".data.page_aligned","wa"
+	/* Page-aligned for the benefit of paravirt? */
+	.align PAGE_SIZE_asm
+ENTRY(swapper_pg_dir)
+	.long	pa(swapper_pg_pmd+PGD_ATTR),0		/* low identity map */
+# if KPMDS == 3
+	.long	pa(swapper_pg_pmd+PGD_ATTR),0
+	.long	pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
+	.long	pa(swapper_pg_pmd+PGD_ATTR+0x2000),0
+# elif KPMDS == 2
+	.long	0,0
+	.long	pa(swapper_pg_pmd+PGD_ATTR),0
+	.long	pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
+# elif KPMDS == 1
+	.long	0,0
+	.long	0,0
+	.long	pa(swapper_pg_pmd+PGD_ATTR),0
+# else
+# error "Kernel PMDs should be 1, 2 or 3"
+# endif
+	.align PAGE_SIZE_asm		/* needs to be page-sized too */
+#endif
+
 .data
 ENTRY(stack_start)
 	.long init_thread_union+THREAD_SIZE
arch/x86/kernel/setup_32.c  (+4)
···
 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
 EXPORT_SYMBOL(boot_cpu_data);
 
+#ifndef CONFIG_X86_PAE
 unsigned long mmu_cr4_features;
+#else
+unsigned long mmu_cr4_features = X86_CR4_PAE;
+#endif
 
 /* for MCA, but anyone else can use it if they want */
 unsigned int machine_id;
arch/x86/mm/init_32.c  (+26 -44)
···
 #include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/paravirt.h>
+#include <asm/setup.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
···
 
 void __init native_pagetable_setup_start(pgd_t *base)
 {
-#ifdef CONFIG_X86_PAE
-	int i;
+	unsigned long pfn, va;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
 
 	/*
-	 * Init entries of the first-level page table to the
-	 * zero page, if they haven't already been set up.
-	 *
-	 * In a normal native boot, we'll be running on a
-	 * pagetable rooted in swapper_pg_dir, but not in PAE
-	 * mode, so this will end up clobbering the mappings
-	 * for the lower 24Mbytes of the address space,
-	 * without affecting the kernel address space.
+	 * Remove any mappings which extend past the end of physical
+	 * memory from the boot time page table:
 	 */
-	for (i = 0; i < USER_PTRS_PER_PGD; i++)
-		set_pgd(&base[i],
-			__pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+	for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+		va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
+		pgd = base + pgd_index(va);
+		if (!pgd_present(*pgd))
+			break;
 
-	/* Make sure kernel address space is empty so that a pagetable
-	   will be allocated for it. */
-	memset(&base[USER_PTRS_PER_PGD], 0,
-	       KERNEL_PGD_PTRS * sizeof(pgd_t));
-#else
+		pud = pud_offset(pgd, va);
+		pmd = pmd_offset(pud, va);
+		if (!pmd_present(*pmd))
+			break;
+
+		pte = pte_offset_kernel(pmd, va);
+		if (!pte_present(*pte))
+			break;
+
+		pte_clear(NULL, va, pte);
+	}
 	paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
-#endif
 }
 
 void __init native_pagetable_setup_done(pgd_t *base)
 {
-#ifdef CONFIG_X86_PAE
-	/*
-	 * Add low memory identity-mappings - SMP needs it when
-	 * starting up on an AP from real-mode. In the non-PAE
-	 * case we already have these mappings through head.S.
-	 * All user-space mappings are explicitly cleared after
-	 * SMP startup.
-	 */
-	set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
-#endif
 }
 
 /*
···
  * the boot process.
  *
  * If we're booting on native hardware, this will be a pagetable
- * constructed in arch/i386/kernel/head.S, and not running in PAE mode
- * (even if we'll end up running in PAE). The root of the pagetable
- * will be swapper_pg_dir.
+ * constructed in arch/x86/kernel/head_32.S. The root of the
+ * pagetable will be swapper_pg_dir.
  *
  * If we're booting paravirtualized under a hypervisor, then there are
  * more options: we may already be running PAE, and the pagetable may
···
 
 	load_cr3(swapper_pg_dir);
 
-#ifdef CONFIG_X86_PAE
-	/*
-	 * We will bail out later - printk doesn't work right now so
-	 * the user would just see a hanging kernel.
-	 */
-	if (cpu_has_pae)
-		set_in_cr4(X86_CR4_PAE);
-#endif
 	__flush_tlb_all();
 
 	kmap_init();
···
 	BUG_ON((unsigned long)high_memory > VMALLOC_START);
 #endif /* double-sanity-check paranoia */
 
-#ifdef CONFIG_X86_PAE
-	if (!cpu_has_pae)
-		panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
-#endif
 	if (boot_cpu_data.wp_works_ok < 0)
 		test_wp_bit();
 
arch/x86/mm/ioremap.c  (+31 -24)
···
 early_param("early_ioremap_debug", early_ioremap_debug_setup);
 
 static __initdata int after_paging_init;
-static __initdata unsigned long bm_pte[1024]
+static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
 				__attribute__((aligned(PAGE_SIZE)));
 
-static inline unsigned long * __init early_ioremap_pgd(unsigned long addr)
+static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 {
-	return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023);
+	pgd_t *pgd = &swapper_pg_dir[pgd_index(addr)];
+	pud_t *pud = pud_offset(pgd, addr);
+	pmd_t *pmd = pmd_offset(pud, addr);
+
+	return pmd;
 }
 
-static inline unsigned long * __init early_ioremap_pte(unsigned long addr)
+static inline pte_t * __init early_ioremap_pte(unsigned long addr)
 {
-	return bm_pte + ((addr >> PAGE_SHIFT) & 1023);
+	return &bm_pte[pte_index(addr)];
 }
 
 void __init early_ioremap_init(void)
 {
-	unsigned long *pgd;
+	pmd_t *pmd;
 
 	if (early_ioremap_debug)
 		printk(KERN_INFO "early_ioremap_init()\n");
 
-	pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
-	*pgd = __pa(bm_pte) | _PAGE_TABLE;
+	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
 	memset(bm_pte, 0, sizeof(bm_pte));
+	set_pmd(pmd, __pmd(__pa(bm_pte) | _PAGE_TABLE));
+
 	/*
-	 * The boot-ioremap range spans multiple pgds, for which
+	 * The boot-ioremap range spans multiple pmds, for which
 	 * we are not prepared:
 	 */
-	if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) {
+	if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
 		WARN_ON(1);
-		printk(KERN_WARNING "pgd %p != %p\n",
-			pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END)));
+		printk(KERN_WARNING "pmd %p != %p\n",
+		       pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
 		printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
-			fix_to_virt(FIX_BTMAP_BEGIN));
+		       fix_to_virt(FIX_BTMAP_BEGIN));
 		printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
-			fix_to_virt(FIX_BTMAP_END));
+		       fix_to_virt(FIX_BTMAP_END));
 
 		printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
 		printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
···
 
 void __init early_ioremap_clear(void)
 {
-	unsigned long *pgd;
+	pmd_t *pmd;
 
 	if (early_ioremap_debug)
 		printk(KERN_INFO "early_ioremap_clear()\n");
 
-	pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
-	*pgd = 0;
-	paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT);
+	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+	pmd_clear(pmd);
+	paravirt_release_pt(__pa(pmd) >> PAGE_SHIFT);
 	__flush_tlb_all();
 }
 
 void __init early_ioremap_reset(void)
 {
 	enum fixed_addresses idx;
-	unsigned long *pte, phys, addr;
+	unsigned long addr, phys;
+	pte_t *pte;
 
 	after_paging_init = 1;
 	for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
 		addr = fix_to_virt(idx);
 		pte = early_ioremap_pte(addr);
-		if (*pte & _PAGE_PRESENT) {
-			phys = *pte & PAGE_MASK;
+		if (pte_present(*pte)) {
+			phys = pte_val(*pte) & PAGE_MASK;
 			set_fixmap(idx, phys);
 		}
 	}
···
 static void __init __early_set_fixmap(enum fixed_addresses idx,
 				   unsigned long phys, pgprot_t flags)
 {
-	unsigned long *pte, addr = __fix_to_virt(idx);
+	unsigned long addr = __fix_to_virt(idx);
+	pte_t *pte;
 
 	if (idx >= __end_of_fixed_addresses) {
 		BUG();
···
 	}
 	pte = early_ioremap_pte(addr);
 	if (pgprot_val(flags))
-		*pte = (phys & PAGE_MASK) | pgprot_val(flags);
+		set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
 	else
-		*pte = 0;
+		pte_clear(NULL, addr, pte);
 	__flush_tlb_one(addr);
 }
 
include/asm-x86/page_32.h  (-1)
···
 typedef unsigned long phys_addr_t;
 
 typedef union { pteval_t pte, pte_low; } pte_t;
-typedef pte_t boot_pte_t;
 
 #endif	/* __ASSEMBLY__ */
 #endif	/* CONFIG_X86_PAE */
include/asm-x86/pgtable_32.h  (-4)
···
 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
 #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
 
-#define TWOLEVEL_PGDIR_SHIFT	22
-#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
-#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
-
 /* Just any arbitrary offset to the start of the vmalloc VM area: the
  * current 8MB value just means that there will be a 8MB "hole" after the
  * physical memory until the kernel virtual memory starts. That means that