Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Revert "xen/mmu: Add workaround "x86-64, mm: Put early page table high""

This reverts commit a38647837a411f7df79623128421eef2118b5884.

The workaround does not work on certain AMD machines, where booting as a Xen guest crashes early with the following oops:

last_pfn = 0x100000 max_arch_pfn = 0x400000000
initial memory mapped : 0 - 02c3a000
Base memory trampoline at [ffff88000009b000] 9b000 size 20480
init_memory_mapping: 0000000000000000-0000000100000000
0000000000 - 0100000000 page 4k
kernel direct mapping tables up to 100000000 @ ff7fb000-100000000
init_memory_mapping: 0000000100000000-00000001e0800000
0100000000 - 01e0800000 page 4k
kernel direct mapping tables up to 1e0800000 @ 1df0f3000-1e0000000
xen: setting RW the range fffdc000 - 100000000
RAMDISK: 0203b000 - 02c3a000
No NUMA configuration found
Faking a node at 0000000000000000-00000001e0800000
NUMA: Using 63 for the hash shift.
Initmem setup node 0 0000000000000000-00000001e0800000
NODE_DATA [00000001dfffb000 - 00000001dfffffff]
BUG: unable to handle kernel NULL pointer dereference at (null)
IP: [<ffffffff81cf6a75>] setup_node_bootmem+0x18a/0x1ea
PGD 0
Oops: 0003 [#1] SMP
last sysfs file:
CPU 0
Modules linked in:

Pid: 0, comm: swapper Not tainted 2.6.39-0-virtual #6~smb1
RIP: e030:[<ffffffff81cf6a75>] [<ffffffff81cf6a75>] setup_node_bootmem+0x18a/0x1ea
RSP: e02b:ffffffff81c01e38 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 00000001e0800000 RCX: 0000000000001040
RDX: 0000000000004100 RSI: 0000000000000000 RDI: ffff8801dfffb000
RBP: ffffffff81c01e58 R08: 0000000000000020 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000bfe400
FS: 0000000000000000(0000) GS:ffffffff81cca000(0000) knlGS:0000000000000000
CS: e033 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000001c03000 CR4: 0000000000000660
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81c00000, task ffffffff81c0b020)
Stack:
0000000000000040 0000000000000001 0000000000000000 ffffffffffffffff
ffffffff81c01e88 ffffffff81cf6c25 0000000000000000 0000000000000000
ffffffff81cf687f 0000000000000000 ffffffff81c01ea8 ffffffff81cf6e45
Call Trace:
[<ffffffff81cf6c25>] numa_register_memblks.constprop.3+0x150/0x181
[<ffffffff81cf687f>] ? numa_add_memblk+0x7c/0x7c
[<ffffffff81cf6e45>] numa_init.part.2+0x1c/0x7c
[<ffffffff81cf687f>] ? numa_add_memblk+0x7c/0x7c
[<ffffffff81cf6f67>] numa_init+0x6c/0x70
[<ffffffff81cf7057>] initmem_init+0x39/0x3b
[<ffffffff81ce5865>] setup_arch+0x64e/0x769
[<ffffffff815e43c1>] ? printk+0x51/0x53
[<ffffffff81cdf92b>] start_kernel+0xd4/0x3f3
[<ffffffff81cdf388>] x86_64_start_reservations+0x132/0x136
[<ffffffff81ce2ed4>] xen_start_kernel+0x588/0x58f
Code: 41 00 00 48 8b 3c c5 a0 24 cc 81 31 c0 40 f6 c7 01 74 05 aa 66 ba ff 40 40 f6 c7 02 74 05 66 ab 83 ea 02 89 d1 c1 e9 02 f6 c2 02 <f3> ab 74 02 66 ab 80 e2 01 74 01 aa 49 63 c4 48 c1 eb 0c 44 89
RIP [<ffffffff81cf6a75>] setup_node_bootmem+0x18a/0x1ea
RSP <ffffffff81c01e38>
CR2: 0000000000000000
---[ end trace a7919e7f17c0a725 ]---
Kernel panic - not syncing: Attempted to kill the idle task!
Pid: 0, comm: swapper Tainted: G D 2.6.39-0-virtual #6~smb1

Reported-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

-123
-123
arch/x86/xen/mmu.c
··· 1463 1463 return ret; 1464 1464 } 1465 1465 1466 - #ifdef CONFIG_X86_64 1467 - static __initdata u64 __last_pgt_set_rw = 0; 1468 - static __initdata u64 __pgt_buf_start = 0; 1469 - static __initdata u64 __pgt_buf_end = 0; 1470 - static __initdata u64 __pgt_buf_top = 0; 1471 - /* 1472 - * As a consequence of the commit: 1473 - * 1474 - * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e 1475 - * Author: Yinghai Lu <yinghai@kernel.org> 1476 - * Date: Fri Dec 17 16:58:28 2010 -0800 1477 - * 1478 - * x86-64, mm: Put early page table high 1479 - * 1480 - * at some point init_memory_mapping is going to reach the pagetable pages 1481 - * area and map those pages too (mapping them as normal memory that falls 1482 - * in the range of addresses passed to init_memory_mapping as argument). 1483 - * Some of those pages are already pagetable pages (they are in the range 1484 - * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and 1485 - * everything is fine. 1486 - * Some of these pages are not pagetable pages yet (they fall in the range 1487 - * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they 1488 - * are going to be mapped RW. When these pages become pagetable pages and 1489 - * are hooked into the pagetable, xen will find that the guest has already 1490 - * a RW mapping of them somewhere and fail the operation. 1491 - * The reason Xen requires pagetables to be RO is that the hypervisor needs 1492 - * to verify that the pagetables are valid before using them. The validation 1493 - * operations are called "pinning". 1494 - * 1495 - * In order to fix the issue we mark all the pages in the entire range 1496 - * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation 1497 - * is completed only the range pgt_buf_start-pgt_buf_end is reserved by 1498 - * init_memory_mapping. 
Hence the kernel is going to crash as soon as one 1499 - * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those 1500 - * ranges are RO). 1501 - * 1502 - * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_ 1503 - * the init_memory_mapping has completed (in a perfect world we would 1504 - * call this function from init_memory_mapping, but lets ignore that). 1505 - * 1506 - * Because we are called _after_ init_memory_mapping the pgt_buf_[start, 1507 - * end,top] have all changed to new values (b/c init_memory_mapping 1508 - * is called and setting up another new page-table). Hence, the first time 1509 - * we enter this function, we save away the pgt_buf_start value and update 1510 - * the pgt_buf_[end,top]. 1511 - * 1512 - * When we detect that the "old" pgt_buf_start through pgt_buf_end 1513 - * PFNs have been reserved (so memblock_x86_reserve_range has been called), 1514 - * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. 1515 - * 1516 - * And then we update those "old" pgt_buf_[end|top] with the new ones 1517 - * so that we can redo this on the next pagetable. 1518 - */ 1519 - static __init void mark_rw_past_pgt(void) { 1520 - 1521 - if (pgt_buf_end > pgt_buf_start) { 1522 - u64 addr, size; 1523 - 1524 - /* Save it away. */ 1525 - if (!__pgt_buf_start) { 1526 - __pgt_buf_start = pgt_buf_start; 1527 - __pgt_buf_end = pgt_buf_end; 1528 - __pgt_buf_top = pgt_buf_top; 1529 - return; 1530 - } 1531 - /* If we get the range that starts at __pgt_buf_end that means 1532 - * the range is reserved, and that in 'init_memory_mapping' 1533 - * the 'memblock_x86_reserve_range' has been called with the 1534 - * outdated __pgt_buf_start, __pgt_buf_end (the "new" 1535 - * pgt_buf_[start|end|top] refer now to a new pagetable. 1536 - * Note: we are called _after_ the pgt_buf_[..] 
have been 1537 - * updated.*/ 1538 - 1539 - addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start), 1540 - &size, PAGE_SIZE); 1541 - 1542 - /* Still not reserved, meaning 'memblock_x86_reserve_range' 1543 - * hasn't been called yet. Update the _end and _top.*/ 1544 - if (addr == PFN_PHYS(__pgt_buf_start)) { 1545 - __pgt_buf_end = pgt_buf_end; 1546 - __pgt_buf_top = pgt_buf_top; 1547 - return; 1548 - } 1549 - 1550 - /* OK, the area is reserved, meaning it is time for us to 1551 - * set RW for the old end->top PFNs. */ 1552 - 1553 - /* ..unless we had already done this. */ 1554 - if (__pgt_buf_end == __last_pgt_set_rw) 1555 - return; 1556 - 1557 - addr = PFN_PHYS(__pgt_buf_end); 1558 - 1559 - /* set as RW the rest */ 1560 - printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", 1561 - PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top)); 1562 - 1563 - while (addr < PFN_PHYS(__pgt_buf_top)) { 1564 - make_lowmem_page_readwrite(__va(addr)); 1565 - addr += PAGE_SIZE; 1566 - } 1567 - /* And update everything so that we are ready for the next 1568 - * pagetable (the one created for regions past 4GB) */ 1569 - __last_pgt_set_rw = __pgt_buf_end; 1570 - __pgt_buf_start = pgt_buf_start; 1571 - __pgt_buf_end = pgt_buf_end; 1572 - __pgt_buf_top = pgt_buf_top; 1573 - } 1574 - return; 1575 - } 1576 - #else 1577 - static __init void mark_rw_past_pgt(void) { } 1578 - #endif 1579 1466 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) 1580 1467 { 1581 1468 #ifdef CONFIG_X86_64 ··· 1488 1601 { 1489 1602 unsigned long pfn = pte_pfn(pte); 1490 1603 1491 - /* 1492 - * A bit of optimization. We do not need to call the workaround 1493 - * when xen_set_pte_init is called with a PTE with 0 as PFN. 
1494 - * That is b/c the pagetable at that point are just being populated 1495 - * with empty values and we can save some cycles by not calling 1496 - * the 'memblock' code.*/ 1497 - if (pfn) 1498 - mark_rw_past_pgt(); 1499 1604 /* 1500 1605 * If the new pfn is within the range of the newly allocated 1501 1606 * kernel pagetable, and it isn't being mapped into an ··· 1997 2118 1998 2119 static __init void xen_post_allocator_init(void) 1999 2120 { 2000 - mark_rw_past_pgt(); 2001 - 2002 2121 #ifdef CONFIG_XEN_DEBUG 2003 2122 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); 2004 2123 #endif