Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86,xen: introduce x86_init.mapping.pagetable_reserve

Introduce a new x86_init hook called pagetable_reserve that at the end
of init_memory_mapping is used to reserve a range of memory addresses for
the kernel pagetable pages we used and free the other ones.

On native it just calls memblock_x86_reserve_range, while on Xen it also
takes care of setting the spare memory previously allocated
for kernel pagetable pages from RO to RW, so that it can be used for
other purposes.

A detailed explanation of the reason why this hook is needed follows.

As a consequence of the commit:

commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
Author: Yinghai Lu <yinghai@kernel.org>
Date: Fri Dec 17 16:58:28 2010 -0800

x86-64, mm: Put early page table high

at some point init_memory_mapping is going to reach the pagetable pages
area and map those pages too (mapping them as normal memory that falls
in the range of addresses passed to init_memory_mapping as argument).
Some of those pages are already pagetable pages (they are in the range
pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
everything is fine.
Some of these pages are not pagetable pages yet (they fall in the range
pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
are going to be mapped RW. When these pages become pagetable pages and
are hooked into the pagetable, xen will find that the guest has already
a RW mapping of them somewhere and fail the operation.
The reason Xen requires pagetables to be RO is that the hypervisor needs
to verify that the pagetables are valid before using them. The validation
operations are called "pinning" (more details in arch/x86/xen/mmu.c).

In order to fix the issue we mark all the pages in the entire range
pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
is completed only the range pgt_buf_start-pgt_buf_end is reserved by
init_memory_mapping. Hence the kernel is going to crash as soon as one
of the pages in the range pgt_buf_end-pgt_buf_top is reused (because
those pages are mapped RO).

For this reason we need a hook to reserve the kernel pagetable pages we
used and free the other ones so that they can be reused for other
purposes.
On native it just means calling memblock_x86_reserve_range, on Xen it
also means marking RW the pagetable pages that we allocated before but
that haven't been used before.

Another way to fix this without using the hook is by adding an 'if
(xen_pv_domain)' in the 'init_memory_mapping' code and calling the Xen
counterpart, but that is just nasty.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

authored by

Stefano Stabellini and committed by
Konrad Rzeszutek Wilk
279b706b 92bdaef7

+54 -2
+1
arch/x86/include/asm/pgtable_types.h
··· 299 299 /* Install a pte for a particular vaddr in kernel space. */ 300 300 void set_pte_vaddr(unsigned long vaddr, pte_t pte); 301 301 302 + extern void native_pagetable_reserve(u64 start, u64 end); 302 303 #ifdef CONFIG_X86_32 303 304 extern void native_pagetable_setup_start(pgd_t *base); 304 305 extern void native_pagetable_setup_done(pgd_t *base);
+12
arch/x86/include/asm/x86_init.h
··· 68 68 }; 69 69 70 70 /** 71 + * struct x86_init_mapping - platform specific initial kernel pagetable setup 72 + * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage 73 + * 74 + * For more details on the purpose of this hook, look in 75 + * init_memory_mapping and the commit that added it. 76 + */ 77 + struct x86_init_mapping { 78 + void (*pagetable_reserve)(u64 start, u64 end); 79 + }; 80 + 81 + /** 71 82 * struct x86_init_paging - platform specific paging functions 72 83 * @pagetable_setup_start: platform specific pre paging_init() call 73 84 * @pagetable_setup_done: platform specific post paging_init() call ··· 134 123 struct x86_init_mpparse mpparse; 135 124 struct x86_init_irqs irqs; 136 125 struct x86_init_oem oem; 126 + struct x86_init_mapping mapping; 137 127 struct x86_init_paging paging; 138 128 struct x86_init_timers timers; 139 129 struct x86_init_iommu iommu;
+4
arch/x86/kernel/x86_init.c
··· 61 61 .banner = default_banner, 62 62 }, 63 63 64 + .mapping = { 65 + .pagetable_reserve = native_pagetable_reserve, 66 + }, 67 + 64 68 .paging = { 65 69 .pagetable_setup_start = native_pagetable_setup_start, 66 70 .pagetable_setup_done = native_pagetable_setup_done,
+22 -2
arch/x86/mm/init.c
··· 81 81 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); 82 82 } 83 83 84 + void native_pagetable_reserve(u64 start, u64 end) 85 + { 86 + memblock_x86_reserve_range(start, end, "PGTABLE"); 87 + } 88 + 84 89 struct map_range { 85 90 unsigned long start; 86 91 unsigned long end; ··· 277 272 278 273 __flush_tlb_all(); 279 274 275 + /* 276 + * Reserve the kernel pagetable pages we used (pgt_buf_start - 277 + * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) 278 + * so that they can be reused for other purposes. 279 + * 280 + * On native it just means calling memblock_x86_reserve_range, on Xen it 281 + * also means marking RW the pagetable pages that we allocated before 282 + * but that haven't been used. 283 + * 284 + * In fact on xen we mark RO the whole range pgt_buf_start - 285 + * pgt_buf_top, because we have to make sure that when 286 + * init_memory_mapping reaches the pagetable pages area, it maps 287 + * RO all the pagetable pages, including the ones that are beyond 288 + * pgt_buf_end at that time. 289 + */ 280 290 if (!after_bootmem && pgt_buf_end > pgt_buf_start) 281 - memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, 282 - pgt_buf_end << PAGE_SHIFT, "PGTABLE"); 291 + x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), 292 + PFN_PHYS(pgt_buf_end)); 283 293 284 294 if (!after_bootmem) 285 295 early_memtest(start, end);
+15
arch/x86/xen/mmu.c
··· 1275 1275 { 1276 1276 } 1277 1277 1278 + static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) 1279 + { 1280 + /* reserve the range used */ 1281 + native_pagetable_reserve(start, end); 1282 + 1283 + /* set as RW the rest */ 1284 + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, 1285 + PFN_PHYS(pgt_buf_top)); 1286 + while (end < PFN_PHYS(pgt_buf_top)) { 1287 + make_lowmem_page_readwrite(__va(end)); 1288 + end += PAGE_SIZE; 1289 + } 1290 + } 1291 + 1278 1292 static void xen_post_allocator_init(void); 1279 1293 1280 1294 static __init void xen_pagetable_setup_done(pgd_t *base) ··· 2119 2105 2120 2106 void __init xen_init_mmu_ops(void) 2121 2107 { 2108 + x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; 2122 2109 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; 2123 2110 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; 2124 2111 pv_mmu_ops = xen_mmu_ops;