Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/lguest: Remove lguest support

Lguest seems to be largely unused these days. For the last two years it
has seen only patches ensuring that it still builds, and its official
status in MAINTAINERS is "Odd Fixes".

Remove it in order to be able to clean up the paravirt code.

Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: boris.ostrovsky@oracle.com
Cc: lguest@lists.ozlabs.org
Cc: rusty@rustcorp.com.au
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/20170816173157.8633-3-jgross@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

authored by

Juergen Gross and committed by
Ingo Molnar
ecda85e7 edcb5cf8

+16 -10502
-11
MAINTAINERS
··· 7640 7640 S: Maintained 7641 7641 F: drivers/media/dvb-frontends/lgdt3305.* 7642 7642 7643 - LGUEST 7644 - M: Rusty Russell <rusty@rustcorp.com.au> 7645 - L: lguest@lists.ozlabs.org 7646 - W: http://lguest.ozlabs.org/ 7647 - S: Odd Fixes 7648 - F: arch/x86/include/asm/lguest*.h 7649 - F: arch/x86/lguest/ 7650 - F: drivers/lguest/ 7651 - F: include/linux/lguest*.h 7652 - F: tools/lguest/ 7653 - 7654 7643 LIBATA PATA ARASAN COMPACT FLASH CONTROLLER 7655 7644 M: Viresh Kumar <vireshk@kernel.org> 7656 7645 L: linux-ide@vger.kernel.org
-3
arch/x86/Kbuild
··· 10 10 # Hyper-V paravirtualization support 11 11 obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ 12 12 13 - # lguest paravirtualization support 14 - obj-$(CONFIG_LGUEST_GUEST) += lguest/ 15 - 16 13 obj-y += realmode/ 17 14 obj-y += kernel/ 18 15 obj-y += mm/
-2
arch/x86/Kconfig
··· 777 777 Statistics are displayed in debugfs filesystem. Enabling this option 778 778 may incur significant overhead. 779 779 780 - source "arch/x86/lguest/Kconfig" 781 - 782 780 config PARAVIRT_TIME_ACCOUNTING 783 781 bool "Paravirtual steal time accounting" 784 782 depends on PARAVIRT
-91
arch/x86/include/asm/lguest.h
··· 1 - #ifndef _ASM_X86_LGUEST_H 2 - #define _ASM_X86_LGUEST_H 3 - 4 - #define GDT_ENTRY_LGUEST_CS 10 5 - #define GDT_ENTRY_LGUEST_DS 11 6 - #define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) 7 - #define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) 8 - 9 - #ifndef __ASSEMBLY__ 10 - #include <asm/desc.h> 11 - 12 - #define GUEST_PL 1 13 - 14 - /* Page for Switcher text itself, then two pages per cpu */ 15 - #define SWITCHER_TEXT_PAGES (1) 16 - #define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) 17 - #define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) 18 - 19 - /* Where we map the Switcher, in both Host and Guest. */ 20 - extern unsigned long switcher_addr; 21 - 22 - /* Found in switcher.S */ 23 - extern unsigned long default_idt_entries[]; 24 - 25 - /* Declarations for definitions in arch/x86/lguest/head_32.S */ 26 - extern char lguest_noirq_iret[]; 27 - extern const char lgstart_cli[], lgend_cli[]; 28 - extern const char lgstart_pushf[], lgend_pushf[]; 29 - 30 - extern void lguest_iret(void); 31 - extern void lguest_init(void); 32 - 33 - struct lguest_regs { 34 - /* Manually saved part. */ 35 - unsigned long eax, ebx, ecx, edx; 36 - unsigned long esi, edi, ebp; 37 - unsigned long gs; 38 - unsigned long fs, ds, es; 39 - unsigned long trapnum, errcode; 40 - /* Trap pushed part */ 41 - unsigned long eip; 42 - unsigned long cs; 43 - unsigned long eflags; 44 - unsigned long esp; 45 - unsigned long ss; 46 - }; 47 - 48 - /* This is a guest-specific page (mapped ro) into the guest. */ 49 - struct lguest_ro_state { 50 - /* Host information we need to restore when we switch back. */ 51 - u32 host_cr3; 52 - struct desc_ptr host_idt_desc; 53 - struct desc_ptr host_gdt_desc; 54 - u32 host_sp; 55 - 56 - /* Fields which are used when guest is running. 
*/ 57 - struct desc_ptr guest_idt_desc; 58 - struct desc_ptr guest_gdt_desc; 59 - struct x86_hw_tss guest_tss; 60 - struct desc_struct guest_idt[IDT_ENTRIES]; 61 - struct desc_struct guest_gdt[GDT_ENTRIES]; 62 - }; 63 - 64 - struct lg_cpu_arch { 65 - /* The GDT entries copied into lguest_ro_state when running. */ 66 - struct desc_struct gdt[GDT_ENTRIES]; 67 - 68 - /* The IDT entries: some copied into lguest_ro_state when running. */ 69 - struct desc_struct idt[IDT_ENTRIES]; 70 - 71 - /* The address of the last guest-visible pagefault (ie. cr2). */ 72 - unsigned long last_pagefault; 73 - }; 74 - 75 - static inline void lguest_set_ts(void) 76 - { 77 - u32 cr0; 78 - 79 - cr0 = read_cr0(); 80 - if (!(cr0 & 8)) 81 - write_cr0(cr0 | 8); 82 - } 83 - 84 - /* Full 4G segment descriptors, suitable for CS and DS. */ 85 - #define FULL_EXEC_SEGMENT \ 86 - ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) 87 - #define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) 88 - 89 - #endif /* __ASSEMBLY__ */ 90 - 91 - #endif /* _ASM_X86_LGUEST_H */
-74
arch/x86/include/asm/lguest_hcall.h
··· 1 - /* Architecture specific portion of the lguest hypercalls */ 2 - #ifndef _ASM_X86_LGUEST_HCALL_H 3 - #define _ASM_X86_LGUEST_HCALL_H 4 - 5 - #define LHCALL_FLUSH_ASYNC 0 6 - #define LHCALL_LGUEST_INIT 1 7 - #define LHCALL_SHUTDOWN 2 8 - #define LHCALL_NEW_PGTABLE 4 9 - #define LHCALL_FLUSH_TLB 5 10 - #define LHCALL_LOAD_IDT_ENTRY 6 11 - #define LHCALL_SET_STACK 7 12 - #define LHCALL_SET_CLOCKEVENT 9 13 - #define LHCALL_HALT 10 14 - #define LHCALL_SET_PMD 13 15 - #define LHCALL_SET_PTE 14 16 - #define LHCALL_SET_PGD 15 17 - #define LHCALL_LOAD_TLS 16 18 - #define LHCALL_LOAD_GDT_ENTRY 18 19 - #define LHCALL_SEND_INTERRUPTS 19 20 - 21 - #define LGUEST_TRAP_ENTRY 0x1F 22 - 23 - /* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ 24 - #define LGUEST_SHUTDOWN_POWEROFF 1 25 - #define LGUEST_SHUTDOWN_RESTART 2 26 - 27 - #ifndef __ASSEMBLY__ 28 - #include <asm/hw_irq.h> 29 - 30 - /*G:030 31 - * But first, how does our Guest contact the Host to ask for privileged 32 - * operations? There are two ways: the direct way is to make a "hypercall", 33 - * to make requests of the Host Itself. 34 - * 35 - * Our hypercall mechanism uses the highest unused trap code (traps 32 and 36 - * above are used by real hardware interrupts). Seventeen hypercalls are 37 - * available: the hypercall number is put in the %eax register, and the 38 - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. 39 - * If a return value makes sense, it's returned in %eax. 40 - * 41 - * Grossly invalid calls result in Sudden Death at the hands of the vengeful 42 - * Host, rather than returning failure. This reflects Winston Churchill's 43 - * definition of a gentleman: "someone who is only rude intentionally". 44 - */ 45 - static inline unsigned long 46 - hcall(unsigned long call, 47 - unsigned long arg1, unsigned long arg2, unsigned long arg3, 48 - unsigned long arg4) 49 - { 50 - /* "int" is the Intel instruction to trigger a trap. 
*/ 51 - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) 52 - /* The call in %eax (aka "a") might be overwritten */ 53 - : "=a"(call) 54 - /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ 55 - : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) 56 - /* "memory" means this might write somewhere in memory. 57 - * This isn't true for all calls, but it's safe to tell 58 - * gcc that it might happen so it doesn't get clever. */ 59 - : "memory"); 60 - return call; 61 - } 62 - /*:*/ 63 - 64 - /* Can't use our min() macro here: needs to be a constant */ 65 - #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 66 - 67 - #define LHCALL_RING_SIZE 64 68 - struct hcall_args { 69 - /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ 70 - unsigned long arg0, arg1, arg2, arg3, arg4; 71 - }; 72 - 73 - #endif /* !__ASSEMBLY__ */ 74 - #endif /* _ASM_X86_LGUEST_HCALL_H */
+1 -1
arch/x86/include/asm/processor.h
··· 662 662 * In case NMI unmasking or performance ever becomes a problem, 663 663 * the next best option appears to be MOV-to-CR2 and an 664 664 * unconditional jump. That sequence also works on all CPUs, 665 - * but it will fault at CPL3 (i.e. Xen PV and lguest). 665 + * but it will fault at CPL3 (i.e. Xen PV). 666 666 * 667 667 * CPUID is the conventional way, but it's nasty: it doesn't 668 668 * exist on some 486-like CPUs, and it usually exits to a
+1 -1
arch/x86/include/uapi/asm/bootparam.h
··· 201 201 * 202 202 * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard 203 203 * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. 204 - * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest 204 + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated 205 205 * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, 206 206 * which start at asm startup_xen() entry point and later jump to the C 207 207 * xen_start_kernel() entry point. Both domU and dom0 type of guests are
-20
arch/x86/kernel/asm-offsets_32.c
··· 4 4 5 5 #include <asm/ucontext.h> 6 6 7 - #include <linux/lguest.h> 8 - #include "../../../drivers/lguest/lg.h" 9 - 10 7 #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, 11 8 static char syscalls[] = { 12 9 #include <asm/syscalls_32.h> ··· 59 62 OFFSET(stack_canary_offset, stack_canary, canary); 60 63 #endif 61 64 62 - #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 63 - BLANK(); 64 - OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 65 - OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); 66 - 67 - BLANK(); 68 - OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 69 - OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); 70 - OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); 71 - OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); 72 - OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); 73 - OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); 74 - OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); 75 - OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); 76 - OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 77 - OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 78 - #endif 79 65 BLANK(); 80 66 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); 81 67 DEFINE(NR_syscalls, sizeof(syscalls));
-2
arch/x86/kernel/head_32.S
··· 155 155 jmp *%eax 156 156 157 157 .Lbad_subarch: 158 - WEAK(lguest_entry) 159 158 WEAK(xen_entry) 160 159 /* Unknown implementation; there's really 161 160 nothing we can do at this point. */ ··· 164 165 165 166 subarch_entries: 166 167 .long .Ldefault_entry /* normal x86/PC */ 167 - .long lguest_entry /* lguest hypervisor */ 168 168 .long xen_entry /* Xen hypervisor */ 169 169 .long .Ldefault_entry /* Moorestown MID */ 170 170 num_subarch_entries = (. - subarch_entries) / 4
-1
arch/x86/kernel/platform-quirks.c
··· 16 16 x86_platform.legacy.reserve_bios_regions = 1; 17 17 break; 18 18 case X86_SUBARCH_XEN: 19 - case X86_SUBARCH_LGUEST: 20 19 x86_platform.legacy.devices.pnpbios = 0; 21 20 x86_platform.legacy.rtc = 0; 22 21 break;
-1
arch/x86/kvm/Kconfig
··· 89 89 # OK, it's a little counter-intuitive to do this, but it puts it neatly under 90 90 # the virtualization menu. 91 91 source drivers/vhost/Kconfig 92 - source drivers/lguest/Kconfig 93 92 94 93 endif # VIRTUALIZATION
-14
arch/x86/lguest/Kconfig
··· 1 - config LGUEST_GUEST 2 - bool "Lguest guest support" 3 - depends on X86_32 && PARAVIRT && PCI 4 - select TTY 5 - select VIRTUALIZATION 6 - select VIRTIO 7 - select VIRTIO_CONSOLE 8 - help 9 - Lguest is a tiny in-kernel hypervisor. Selecting this will 10 - allow your kernel to boot under lguest. This option will increase 11 - your kernel size by about 10k. If in doubt, say N. 12 - 13 - If you say Y here, make sure you say Y (or M) to the virtio block 14 - and net drivers which lguest needs.
-2
arch/x86/lguest/Makefile
··· 1 - obj-y := head_32.o boot.o 2 - CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
-1558
arch/x86/lguest/boot.c
··· 1 - /*P:010 2 - * A hypervisor allows multiple Operating Systems to run on a single machine. 3 - * To quote David Wheeler: "Any problem in computer science can be solved with 4 - * another layer of indirection." 5 - * 6 - * We keep things simple in two ways. First, we start with a normal Linux 7 - * kernel and insert a module (lg.ko) which allows us to run other Linux 8 - * kernels the same way we'd run processes. We call the first kernel the Host, 9 - * and the others the Guests. The program which sets up and configures Guests 10 - * (such as the example in tools/lguest/lguest.c) is called the Launcher. 11 - * 12 - * Secondly, we only run specially modified Guests, not normal kernels: setting 13 - * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows 14 - * how to be a Guest at boot time. This means that you can use the same kernel 15 - * you boot normally (ie. as a Host) as a Guest. 16 - * 17 - * These Guests know that they cannot do privileged operations, such as disable 18 - * interrupts, and that they have to ask the Host to do such things explicitly. 19 - * This file consists of all the replacements for such low-level native 20 - * hardware operations: these special Guest versions call the Host. 21 - * 22 - * So how does the kernel know it's a Guest? We'll see that later, but let's 23 - * just say that we end up here where we replace the native functions various 24 - * "paravirt" structures with our Guest versions, then boot like normal. 25 - :*/ 26 - 27 - /* 28 - * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 29 - * 30 - * This program is free software; you can redistribute it and/or modify 31 - * it under the terms of the GNU General Public License as published by 32 - * the Free Software Foundation; either version 2 of the License, or 33 - * (at your option) any later version. 
34 - * 35 - * This program is distributed in the hope that it will be useful, but 36 - * WITHOUT ANY WARRANTY; without even the implied warranty of 37 - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 38 - * NON INFRINGEMENT. See the GNU General Public License for more 39 - * details. 40 - * 41 - * You should have received a copy of the GNU General Public License 42 - * along with this program; if not, write to the Free Software 43 - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 44 - */ 45 - #include <linux/kernel.h> 46 - #include <linux/start_kernel.h> 47 - #include <linux/string.h> 48 - #include <linux/console.h> 49 - #include <linux/screen_info.h> 50 - #include <linux/irq.h> 51 - #include <linux/interrupt.h> 52 - #include <linux/clocksource.h> 53 - #include <linux/clockchips.h> 54 - #include <linux/lguest.h> 55 - #include <linux/lguest_launcher.h> 56 - #include <linux/virtio_console.h> 57 - #include <linux/pm.h> 58 - #include <linux/export.h> 59 - #include <linux/pci.h> 60 - #include <linux/virtio_pci.h> 61 - #include <asm/acpi.h> 62 - #include <asm/apic.h> 63 - #include <asm/lguest.h> 64 - #include <asm/paravirt.h> 65 - #include <asm/param.h> 66 - #include <asm/page.h> 67 - #include <asm/pgtable.h> 68 - #include <asm/desc.h> 69 - #include <asm/setup.h> 70 - #include <asm/e820/api.h> 71 - #include <asm/mce.h> 72 - #include <asm/io.h> 73 - #include <asm/fpu/api.h> 74 - #include <asm/stackprotector.h> 75 - #include <asm/reboot.h> /* for struct machine_ops */ 76 - #include <asm/kvm_para.h> 77 - #include <asm/pci_x86.h> 78 - #include <asm/pci-direct.h> 79 - 80 - /*G:010 81 - * Welcome to the Guest! 82 - * 83 - * The Guest in our tale is a simple creature: identical to the Host but 84 - * behaving in simplified but equivalent ways. In particular, the Guest is the 85 - * same kernel as the Host (or at least, built from the same source code). 86 - :*/ 87 - 88 - struct lguest_data lguest_data = { 89 - .hcall_status = { [0 ... 
LHCALL_RING_SIZE-1] = 0xFF }, 90 - .noirq_iret = (u32)lguest_noirq_iret, 91 - .kernel_address = PAGE_OFFSET, 92 - .blocked_interrupts = { 1 }, /* Block timer interrupts */ 93 - .syscall_vec = IA32_SYSCALL_VECTOR, 94 - }; 95 - 96 - /*G:037 97 - * async_hcall() is pretty simple: I'm quite proud of it really. We have a 98 - * ring buffer of stored hypercalls which the Host will run though next time we 99 - * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall 100 - * arguments, and a "hcall_status" word which is 0 if the call is ready to go, 101 - * and 255 once the Host has finished with it. 102 - * 103 - * If we come around to a slot which hasn't been finished, then the table is 104 - * full and we just make the hypercall directly. This has the nice side 105 - * effect of causing the Host to run all the stored calls in the ring buffer 106 - * which empties it for next time! 107 - */ 108 - static void async_hcall(unsigned long call, unsigned long arg1, 109 - unsigned long arg2, unsigned long arg3, 110 - unsigned long arg4) 111 - { 112 - /* Note: This code assumes we're uniprocessor. */ 113 - static unsigned int next_call; 114 - unsigned long flags; 115 - 116 - /* 117 - * Disable interrupts if not already disabled: we don't want an 118 - * interrupt handler making a hypercall while we're already doing 119 - * one! 120 - */ 121 - local_irq_save(flags); 122 - if (lguest_data.hcall_status[next_call] != 0xFF) { 123 - /* Table full, so do normal hcall which will flush table. 
*/ 124 - hcall(call, arg1, arg2, arg3, arg4); 125 - } else { 126 - lguest_data.hcalls[next_call].arg0 = call; 127 - lguest_data.hcalls[next_call].arg1 = arg1; 128 - lguest_data.hcalls[next_call].arg2 = arg2; 129 - lguest_data.hcalls[next_call].arg3 = arg3; 130 - lguest_data.hcalls[next_call].arg4 = arg4; 131 - /* Arguments must all be written before we mark it to go */ 132 - wmb(); 133 - lguest_data.hcall_status[next_call] = 0; 134 - if (++next_call == LHCALL_RING_SIZE) 135 - next_call = 0; 136 - } 137 - local_irq_restore(flags); 138 - } 139 - 140 - /*G:035 141 - * Notice the lazy_hcall() above, rather than hcall(). This is our first real 142 - * optimization trick! 143 - * 144 - * When lazy_mode is set, it means we're allowed to defer all hypercalls and do 145 - * them as a batch when lazy_mode is eventually turned off. Because hypercalls 146 - * are reasonably expensive, batching them up makes sense. For example, a 147 - * large munmap might update dozens of page table entries: that code calls 148 - * paravirt_enter_lazy_mmu(), does the dozen updates, then calls 149 - * lguest_leave_lazy_mode(). 150 - * 151 - * So, when we're in lazy mode, we call async_hcall() to store the call for 152 - * future processing: 153 - */ 154 - static void lazy_hcall1(unsigned long call, unsigned long arg1) 155 - { 156 - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 157 - hcall(call, arg1, 0, 0, 0); 158 - else 159 - async_hcall(call, arg1, 0, 0, 0); 160 - } 161 - 162 - /* You can imagine what lazy_hcall2, 3 and 4 look like. 
:*/ 163 - static void lazy_hcall2(unsigned long call, 164 - unsigned long arg1, 165 - unsigned long arg2) 166 - { 167 - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 168 - hcall(call, arg1, arg2, 0, 0); 169 - else 170 - async_hcall(call, arg1, arg2, 0, 0); 171 - } 172 - 173 - static void lazy_hcall3(unsigned long call, 174 - unsigned long arg1, 175 - unsigned long arg2, 176 - unsigned long arg3) 177 - { 178 - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 179 - hcall(call, arg1, arg2, arg3, 0); 180 - else 181 - async_hcall(call, arg1, arg2, arg3, 0); 182 - } 183 - 184 - #ifdef CONFIG_X86_PAE 185 - static void lazy_hcall4(unsigned long call, 186 - unsigned long arg1, 187 - unsigned long arg2, 188 - unsigned long arg3, 189 - unsigned long arg4) 190 - { 191 - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 192 - hcall(call, arg1, arg2, arg3, arg4); 193 - else 194 - async_hcall(call, arg1, arg2, arg3, arg4); 195 - } 196 - #endif 197 - 198 - /*G:036 199 - * When lazy mode is turned off, we issue the do-nothing hypercall to 200 - * flush any stored calls, and call the generic helper to reset the 201 - * per-cpu lazy mode variable. 202 - */ 203 - static void lguest_leave_lazy_mmu_mode(void) 204 - { 205 - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); 206 - paravirt_leave_lazy_mmu(); 207 - } 208 - 209 - /* 210 - * We also catch the end of context switch; we enter lazy mode for much of 211 - * that too, so again we need to flush here. 212 - * 213 - * (Technically, this is lazy CPU mode, and normally we're in lazy MMU 214 - * mode, but unlike Xen, lguest doesn't care about the difference). 215 - */ 216 - static void lguest_end_context_switch(struct task_struct *next) 217 - { 218 - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); 219 - paravirt_end_context_switch(next); 220 - } 221 - 222 - /*G:032 223 - * After that diversion we return to our first native-instruction 224 - * replacements: four functions for interrupt control. 
225 - * 226 - * The simplest way of implementing these would be to have "turn interrupts 227 - * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: 228 - * these are by far the most commonly called functions of those we override. 229 - * 230 - * So instead we keep an "irq_enabled" field inside our "struct lguest_data", 231 - * which the Guest can update with a single instruction. The Host knows to 232 - * check there before it tries to deliver an interrupt. 233 - */ 234 - 235 - /* 236 - * save_flags() is expected to return the processor state (ie. "flags"). The 237 - * flags word contains all kind of stuff, but in practice Linux only cares 238 - * about the interrupt flag. Our "save_flags()" just returns that. 239 - */ 240 - asmlinkage __visible unsigned long lguest_save_fl(void) 241 - { 242 - return lguest_data.irq_enabled; 243 - } 244 - 245 - /* Interrupts go off... */ 246 - asmlinkage __visible void lguest_irq_disable(void) 247 - { 248 - lguest_data.irq_enabled = 0; 249 - } 250 - 251 - /* 252 - * Let's pause a moment. Remember how I said these are called so often? 253 - * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to 254 - * break some rules. In particular, these functions are assumed to save their 255 - * own registers if they need to: normal C functions assume they can trash the 256 - * eax register. To use normal C functions, we use 257 - * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the 258 - * C function, then restores it. 259 - */ 260 - PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); 261 - PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); 262 - /*:*/ 263 - 264 - /* These are in head_32.S */ 265 - extern void lg_irq_enable(void); 266 - extern void lg_restore_fl(unsigned long flags); 267 - 268 - /*M:003 269 - * We could be more efficient in our checking of outstanding interrupts, rather 270 - * than using a branch. 
One way would be to put the "irq_enabled" field in a 271 - * page by itself, and have the Host write-protect it when an interrupt comes 272 - * in when irqs are disabled. There will then be a page fault as soon as 273 - * interrupts are re-enabled. 274 - * 275 - * A better method is to implement soft interrupt disable generally for x86: 276 - * instead of disabling interrupts, we set a flag. If an interrupt does come 277 - * in, we then disable them for real. This is uncommon, so we could simply use 278 - * a hypercall for interrupt control and not worry about efficiency. 279 - :*/ 280 - 281 - /*G:034 282 - * The Interrupt Descriptor Table (IDT). 283 - * 284 - * The IDT tells the processor what to do when an interrupt comes in. Each 285 - * entry in the table is a 64-bit descriptor: this holds the privilege level, 286 - * address of the handler, and... well, who cares? The Guest just asks the 287 - * Host to make the change anyway, because the Host controls the real IDT. 288 - */ 289 - static void lguest_write_idt_entry(gate_desc *dt, 290 - int entrynum, const gate_desc *g) 291 - { 292 - /* 293 - * The gate_desc structure is 8 bytes long: we hand it to the Host in 294 - * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors 295 - * around like this; typesafety wasn't a big concern in Linux's early 296 - * years. 297 - */ 298 - u32 *desc = (u32 *)g; 299 - /* Keep the local copy up to date. */ 300 - native_write_idt_entry(dt, entrynum, g); 301 - /* Tell Host about this new entry. */ 302 - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); 303 - } 304 - 305 - /* 306 - * Changing to a different IDT is very rare: we keep the IDT up-to-date every 307 - * time it is written, so we can simply loop through all entries and tell the 308 - * Host about them. 
309 - */ 310 - static void lguest_load_idt(const struct desc_ptr *desc) 311 - { 312 - unsigned int i; 313 - struct desc_struct *idt = (void *)desc->address; 314 - 315 - for (i = 0; i < (desc->size+1)/8; i++) 316 - hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); 317 - } 318 - 319 - /* 320 - * The Global Descriptor Table. 321 - * 322 - * The Intel architecture defines another table, called the Global Descriptor 323 - * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" 324 - * instruction, and then several other instructions refer to entries in the 325 - * table. There are three entries which the Switcher needs, so the Host simply 326 - * controls the entire thing and the Guest asks it to make changes using the 327 - * LOAD_GDT hypercall. 328 - * 329 - * This is the exactly like the IDT code. 330 - */ 331 - static void lguest_load_gdt(const struct desc_ptr *desc) 332 - { 333 - unsigned int i; 334 - struct desc_struct *gdt = (void *)desc->address; 335 - 336 - for (i = 0; i < (desc->size+1)/8; i++) 337 - hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); 338 - } 339 - 340 - /* 341 - * For a single GDT entry which changes, we simply change our copy and 342 - * then tell the host about it. 343 - */ 344 - static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, 345 - const void *desc, int type) 346 - { 347 - native_write_gdt_entry(dt, entrynum, desc, type); 348 - /* Tell Host about this new entry. */ 349 - hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, 350 - dt[entrynum].a, dt[entrynum].b, 0); 351 - } 352 - 353 - /* 354 - * There are three "thread local storage" GDT entries which change 355 - * on every context switch (these three entries are how glibc implements 356 - * __thread variables). As an optimization, we have a hypercall 357 - * specifically for this case. 358 - * 359 - * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall 360 - * which took a range of entries? 
361 - */ 362 - static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) 363 - { 364 - /* 365 - * There's one problem which normal hardware doesn't have: the Host 366 - * can't handle us removing entries we're currently using. So we clear 367 - * the GS register here: if it's needed it'll be reloaded anyway. 368 - */ 369 - lazy_load_gs(0); 370 - lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); 371 - } 372 - 373 - /*G:038 374 - * That's enough excitement for now, back to ploughing through each of the 375 - * different pv_ops structures (we're about 1/3 of the way through). 376 - * 377 - * This is the Local Descriptor Table, another weird Intel thingy. Linux only 378 - * uses this for some strange applications like Wine. We don't do anything 379 - * here, so they'll get an informative and friendly Segmentation Fault. 380 - */ 381 - static void lguest_set_ldt(const void *addr, unsigned entries) 382 - { 383 - } 384 - 385 - /* 386 - * This loads a GDT entry into the "Task Register": that entry points to a 387 - * structure called the Task State Segment. Some comments scattered though the 388 - * kernel code indicate that this used for task switching in ages past, along 389 - * with blood sacrifice and astrology. 390 - * 391 - * Now there's nothing interesting in here that we don't get told elsewhere. 392 - * But the native version uses the "ltr" instruction, which makes the Host 393 - * complain to the Guest about a Segmentation Fault and it'll oops. So we 394 - * override the native version with a do-nothing version. 395 - */ 396 - static void lguest_load_tr_desc(void) 397 - { 398 - } 399 - 400 - /* 401 - * The "cpuid" instruction is a way of querying both the CPU identity 402 - * (manufacturer, model, etc) and its features. It was introduced before the 403 - * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. 404 - * As you might imagine, after a decade and a half this treatment, it is now a 405 - * giant ball of hair. 
Its entry in the current Intel manual runs to 28 pages. 406 - * 407 - * This instruction even it has its own Wikipedia entry. The Wikipedia entry 408 - * has been translated into 6 languages. I am not making this up! 409 - * 410 - * We could get funky here and identify ourselves as "GenuineLguest", but 411 - * instead we just use the real "cpuid" instruction. Then I pretty much turned 412 - * off feature bits until the Guest booted. (Don't say that: you'll damage 413 - * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is 414 - * hardly future proof.) No one's listening! They don't like you anyway, 415 - * parenthetic weirdo! 416 - * 417 - * Replacing the cpuid so we can turn features off is great for the kernel, but 418 - * anyone (including userspace) can just use the raw "cpuid" instruction and 419 - * the Host won't even notice since it isn't privileged. So we try not to get 420 - * too worked up about it. 421 - */ 422 - static void lguest_cpuid(unsigned int *ax, unsigned int *bx, 423 - unsigned int *cx, unsigned int *dx) 424 - { 425 - int function = *ax; 426 - 427 - native_cpuid(ax, bx, cx, dx); 428 - switch (function) { 429 - /* 430 - * CPUID 0 gives the highest legal CPUID number (and the ID string). 431 - * We futureproof our code a little by sticking to known CPUID values. 432 - */ 433 - case 0: 434 - if (*ax > 5) 435 - *ax = 5; 436 - break; 437 - 438 - /* 439 - * CPUID 1 is a basic feature request. 440 - * 441 - * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 442 - * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. 443 - */ 444 - case 1: 445 - *cx &= 0x00002201; 446 - *dx &= 0x07808151; 447 - /* 448 - * The Host can do a nice optimization if it knows that the 449 - * kernel mappings (addresses above 0xC0000000 or whatever 450 - * PAGE_OFFSET is set to) haven't changed. But Linux calls 451 - * flush_tlb_user() for both user and kernel mappings unless 452 - * the Page Global Enable (PGE) feature bit is set. 
453 - */ 454 - *dx |= 0x00002000; 455 - /* 456 - * We also lie, and say we're family id 5. 6 or greater 457 - * leads to a rdmsr in early_init_intel which we can't handle. 458 - * Family ID is returned as bits 8-12 in ax. 459 - */ 460 - *ax &= 0xFFFFF0FF; 461 - *ax |= 0x00000500; 462 - break; 463 - 464 - /* 465 - * This is used to detect if we're running under KVM. We might be, 466 - * but that's a Host matter, not us. So say we're not. 467 - */ 468 - case KVM_CPUID_SIGNATURE: 469 - *bx = *cx = *dx = 0; 470 - break; 471 - 472 - /* 473 - * 0x80000000 returns the highest Extended Function, so we futureproof 474 - * like we do above by limiting it to known fields. 475 - */ 476 - case 0x80000000: 477 - if (*ax > 0x80000008) 478 - *ax = 0x80000008; 479 - break; 480 - 481 - /* 482 - * PAE systems can mark pages as non-executable. Linux calls this the 483 - * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced 484 - * Virus Protection). We just switch it off here, since we don't 485 - * support it. 486 - */ 487 - case 0x80000001: 488 - *dx &= ~(1 << 20); 489 - break; 490 - } 491 - } 492 - 493 - /* 494 - * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. 495 - * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother 496 - * it. The Host needs to know when the Guest wants to change them, so we have 497 - * a whole series of functions like read_cr0() and write_cr0(). 498 - * 499 - * We start with cr0. cr0 allows you to turn on and off all kinds of basic 500 - * features, but the only cr0 bit that Linux ever used at runtime was the 501 - * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) 502 - * 503 - * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if 504 - * the floating point unit is used. Which allows us to restore FPU state 505 - * lazily after a task switch if we wanted to, but wouldn't a name like 506 - * "FPUTRAP bit" be a little less cryptic? 
507 - * 508 - * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore 509 - * cr0. 510 - */ 511 - static void lguest_write_cr0(unsigned long val) 512 - { 513 - } 514 - 515 - static unsigned long lguest_read_cr0(void) 516 - { 517 - return 0; 518 - } 519 - 520 - /* 521 - * cr2 is the virtual address of the last page fault, which the Guest only ever 522 - * reads. The Host kindly writes this into our "struct lguest_data", so we 523 - * just read it out of there. 524 - */ 525 - static unsigned long lguest_read_cr2(void) 526 - { 527 - return lguest_data.cr2; 528 - } 529 - 530 - /* See lguest_set_pte() below. */ 531 - static bool cr3_changed = false; 532 - static unsigned long current_cr3; 533 - 534 - /* 535 - * cr3 is the current toplevel pagetable page: the principle is the same as 536 - * cr0. Keep a local copy, and tell the Host when it changes. 537 - */ 538 - static void lguest_write_cr3(unsigned long cr3) 539 - { 540 - lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); 541 - current_cr3 = cr3; 542 - 543 - /* These two page tables are simple, linear, and used during boot */ 544 - if (cr3 != __pa_symbol(swapper_pg_dir) && 545 - cr3 != __pa_symbol(initial_page_table)) 546 - cr3_changed = true; 547 - } 548 - 549 - static unsigned long lguest_read_cr3(void) 550 - { 551 - return current_cr3; 552 - } 553 - 554 - /* cr4 is used to enable and disable PGE, but we don't care. */ 555 - static unsigned long lguest_read_cr4(void) 556 - { 557 - return 0; 558 - } 559 - 560 - static void lguest_write_cr4(unsigned long val) 561 - { 562 - } 563 - 564 - /* 565 - * Page Table Handling. 566 - * 567 - * Now would be a good time to take a rest and grab a coffee or similarly 568 - * relaxing stimulant. The easy parts are behind us, and the trek gradually 569 - * winds uphill from here. 570 - * 571 - * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU 572 - * maps virtual addresses to physical addresses using "page tables". 
We could 573 - * use one huge index of 1 million entries: each address is 4 bytes, so that's 574 - * 1024 pages just to hold the page tables. But since most virtual addresses 575 - * are unused, we use a two level index which saves space. The cr3 register 576 - * contains the physical address of the top level "page directory" page, which 577 - * contains physical addresses of up to 1024 second-level pages. Each of these 578 - * second level pages contains up to 1024 physical addresses of actual pages, 579 - * or Page Table Entries (PTEs). 580 - * 581 - * Here's a diagram, where arrows indicate physical addresses: 582 - * 583 - * cr3 ---> +---------+ 584 - * | --------->+---------+ 585 - * | | | PADDR1 | 586 - * Mid-level | | PADDR2 | 587 - * (PMD) page | | | 588 - * | | Lower-level | 589 - * | | (PTE) page | 590 - * | | | | 591 - * .... .... 592 - * 593 - * So to convert a virtual address to a physical address, we look up the top 594 - * level, which points us to the second level, which gives us the physical 595 - * address of that page. If the top level entry was not present, or the second 596 - * level entry was not present, then the virtual address is invalid (we 597 - * say "the page was not mapped"). 598 - * 599 - * Put another way, a 32-bit virtual address is divided up like so: 600 - * 601 - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 602 - * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| 603 - * Index into top Index into second Offset within page 604 - * page directory page pagetable page 605 - * 606 - * Now, unfortunately, this isn't the whole story: Intel added Physical Address 607 - * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). 608 - * These are held in 64-bit page table entries, so we can now only fit 512 609 - * entries in a page, and the neat three-level tree breaks down. 
610 - * 611 - * The result is a four level page table: 612 - * 613 - * cr3 --> [ 4 Upper ] 614 - * [ Level ] 615 - * [ Entries ] 616 - * [(PUD Page)]---> +---------+ 617 - * | --------->+---------+ 618 - * | | | PADDR1 | 619 - * Mid-level | | PADDR2 | 620 - * (PMD) page | | | 621 - * | | Lower-level | 622 - * | | (PTE) page | 623 - * | | | | 624 - * .... .... 625 - * 626 - * 627 - * And the virtual address is decoded as: 628 - * 629 - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 630 - * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| 631 - * Index into Index into mid Index into lower Offset within page 632 - * top entries directory page pagetable page 633 - * 634 - * It's too hard to switch between these two formats at runtime, so Linux only 635 - * supports one or the other depending on whether CONFIG_X86_PAE is set. Many 636 - * distributions turn it on, and not just for people with silly amounts of 637 - * memory: the larger PTE entries allow room for the NX bit, which lets the 638 - * kernel disable execution of pages and increase security. 639 - * 640 - * This was a problem for lguest, which couldn't run on these distributions; 641 - * then Matias Zabaljauregui figured it all out and implemented it, and only a 642 - * handful of puppies were crushed in the process! 643 - * 644 - * Back to our point: the kernel spends a lot of time changing both the 645 - * top-level page directory and lower-level pagetable pages. The Guest doesn't 646 - * know physical addresses, so while it maintains these page tables exactly 647 - * like normal, it also needs to keep the Host informed whenever it makes a 648 - * change: the Host will create the real page tables based on the Guests'. 649 - */ 650 - 651 - /* 652 - * The Guest calls this after it has set a second-level entry (pte), ie. to map 653 - * a page into a process' address space. We tell the Host the toplevel and 654 - * address this corresponds to. 
The Guest uses one pagetable per process, so 655 - * we need to tell the Host which one we're changing (mm->pgd). 656 - */ 657 - static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 658 - pte_t *ptep) 659 - { 660 - #ifdef CONFIG_X86_PAE 661 - /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ 662 - lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, 663 - ptep->pte_low, ptep->pte_high); 664 - #else 665 - lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); 666 - #endif 667 - } 668 - 669 - /* This is the "set and update" combo-meal-deal version. */ 670 - static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 671 - pte_t *ptep, pte_t pteval) 672 - { 673 - native_set_pte(ptep, pteval); 674 - lguest_pte_update(mm, addr, ptep); 675 - } 676 - 677 - /* 678 - * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd 679 - * to set a middle-level entry when PAE is activated. 680 - * 681 - * Again, we set the entry then tell the Host which page we changed, 682 - * and the index of the entry we changed. 683 - */ 684 - #ifdef CONFIG_X86_PAE 685 - static void lguest_set_pud(pud_t *pudp, pud_t pudval) 686 - { 687 - native_set_pud(pudp, pudval); 688 - 689 - /* 32 bytes aligned pdpt address and the index. */ 690 - lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, 691 - (__pa(pudp) & 0x1F) / sizeof(pud_t)); 692 - } 693 - 694 - static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 695 - { 696 - native_set_pmd(pmdp, pmdval); 697 - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, 698 - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); 699 - } 700 - #else 701 - 702 - /* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. 
*/ 703 - static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 704 - { 705 - native_set_pmd(pmdp, pmdval); 706 - lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, 707 - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); 708 - } 709 - #endif 710 - 711 - /* 712 - * There are a couple of legacy places where the kernel sets a PTE, but we 713 - * don't know the top level any more. This is useless for us, since we don't 714 - * know which pagetable is changing or what address, so we just tell the Host 715 - * to forget all of them. Fortunately, this is very rare. 716 - * 717 - * ... except in early boot when the kernel sets up the initial pagetables, 718 - * which makes booting astonishingly slow: 48 seconds! So we don't even tell 719 - * the Host anything changed until we've done the first real page table switch, 720 - * which brings boot back to 4.3 seconds. 721 - */ 722 - static void lguest_set_pte(pte_t *ptep, pte_t pteval) 723 - { 724 - native_set_pte(ptep, pteval); 725 - if (cr3_changed) 726 - lazy_hcall1(LHCALL_FLUSH_TLB, 1); 727 - } 728 - 729 - #ifdef CONFIG_X86_PAE 730 - /* 731 - * With 64-bit PTE values, we need to be careful setting them: if we set 32 732 - * bits at a time, the hardware could see a weird half-set entry. These 733 - * versions ensure we update all 64 bits at once. 734 - */ 735 - static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) 736 - { 737 - native_set_pte_atomic(ptep, pte); 738 - if (cr3_changed) 739 - lazy_hcall1(LHCALL_FLUSH_TLB, 1); 740 - } 741 - 742 - static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, 743 - pte_t *ptep) 744 - { 745 - native_pte_clear(mm, addr, ptep); 746 - lguest_pte_update(mm, addr, ptep); 747 - } 748 - 749 - static void lguest_pmd_clear(pmd_t *pmdp) 750 - { 751 - lguest_set_pmd(pmdp, __pmd(0)); 752 - } 753 - #endif 754 - 755 - /* 756 - * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 757 - * native page table operations. 
On native hardware you can set a new page 758 - * table entry whenever you want, but if you want to remove one you have to do 759 - * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). 760 - * 761 - * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only 762 - * called when a valid entry is written, not when it's removed (ie. marked not 763 - * present). Instead, this is where we come when the Guest wants to remove a 764 - * page table entry: we tell the Host to set that entry to 0 (ie. the present 765 - * bit is zero). 766 - */ 767 - static void lguest_flush_tlb_single(unsigned long addr) 768 - { 769 - /* Simply set it to zero: if it was not, it will fault back in. */ 770 - lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); 771 - } 772 - 773 - /* 774 - * This is what happens after the Guest has removed a large number of entries. 775 - * This tells the Host that any of the page table entries for userspace might 776 - * have changed, ie. virtual addresses below PAGE_OFFSET. 777 - */ 778 - static void lguest_flush_tlb_user(void) 779 - { 780 - lazy_hcall1(LHCALL_FLUSH_TLB, 0); 781 - } 782 - 783 - /* 784 - * This is called when the kernel page tables have changed. That's not very 785 - * common (unless the Guest is using highmem, which makes the Guest extremely 786 - * slow), so it's worth separating this from the user flushing above. 787 - */ 788 - static void lguest_flush_tlb_kernel(void) 789 - { 790 - lazy_hcall1(LHCALL_FLUSH_TLB, 1); 791 - } 792 - 793 - /* 794 - * The Unadvanced Programmable Interrupt Controller. 795 - * 796 - * This is an attempt to implement the simplest possible interrupt controller. 797 - * I spent some time looking though routines like set_irq_chip_and_handler, 798 - * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and 799 - * I *think* this is as simple as it gets. 
800 - * 801 - * We can tell the Host what interrupts we want blocked ready for using the 802 - * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as 803 - * simple as setting a bit. We don't actually "ack" interrupts as such, we 804 - * just mask and unmask them. I wonder if we should be cleverer? 805 - */ 806 - static void disable_lguest_irq(struct irq_data *data) 807 - { 808 - set_bit(data->irq, lguest_data.blocked_interrupts); 809 - } 810 - 811 - static void enable_lguest_irq(struct irq_data *data) 812 - { 813 - clear_bit(data->irq, lguest_data.blocked_interrupts); 814 - } 815 - 816 - /* This structure describes the lguest IRQ controller. */ 817 - static struct irq_chip lguest_irq_controller = { 818 - .name = "lguest", 819 - .irq_mask = disable_lguest_irq, 820 - .irq_mask_ack = disable_lguest_irq, 821 - .irq_unmask = enable_lguest_irq, 822 - }; 823 - 824 - /* 825 - * Interrupt descriptors are allocated as-needed, but low-numbered ones are 826 - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it 827 - * tells us the irq is already used: other errors (ie. ENOMEM) we take 828 - * seriously. 829 - */ 830 - static int lguest_setup_irq(unsigned int irq) 831 - { 832 - struct irq_desc *desc; 833 - int err; 834 - 835 - /* Returns -ve error or vector number. */ 836 - err = irq_alloc_desc_at(irq, 0); 837 - if (err < 0 && err != -EEXIST) 838 - return err; 839 - 840 - /* 841 - * Tell the Linux infrastructure that the interrupt is 842 - * controlled by our level-based lguest interrupt controller. 843 - */ 844 - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, 845 - handle_level_irq, "level"); 846 - 847 - /* Some systems map "vectors" to interrupts weirdly. Not us! 
*/ 848 - desc = irq_to_desc(irq); 849 - __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); 850 - return 0; 851 - } 852 - 853 - static int lguest_enable_irq(struct pci_dev *dev) 854 - { 855 - int err; 856 - u8 line = 0; 857 - 858 - /* We literally use the PCI interrupt line as the irq number. */ 859 - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); 860 - err = lguest_setup_irq(line); 861 - if (!err) 862 - dev->irq = line; 863 - return err; 864 - } 865 - 866 - /* We don't do hotplug PCI, so this shouldn't be called. */ 867 - static void lguest_disable_irq(struct pci_dev *dev) 868 - { 869 - WARN_ON(1); 870 - } 871 - 872 - /* 873 - * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware 874 - * interrupt (except 128, which is used for system calls). 875 - */ 876 - static void __init lguest_init_IRQ(void) 877 - { 878 - unsigned int i; 879 - 880 - for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { 881 - if (i != IA32_SYSCALL_VECTOR) 882 - set_intr_gate(i, irq_entries_start + 883 - 8 * (i - FIRST_EXTERNAL_VECTOR)); 884 - } 885 - 886 - /* 887 - * This call is required to set up for 4k stacks, where we have 888 - * separate stacks for hard and soft interrupts. 889 - */ 890 - irq_ctx_init(smp_processor_id()); 891 - } 892 - 893 - /* 894 - * Time. 895 - * 896 - * It would be far better for everyone if the Guest had its own clock, but 897 - * until then the Host gives us the time on every interrupt. 898 - */ 899 - static void lguest_get_wallclock(struct timespec *now) 900 - { 901 - *now = lguest_data.time; 902 - } 903 - 904 - /* 905 - * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 906 - * what speed it runs at, or 0 if it's unusable as a reliable clock source. 907 - * This matches what we want here: if we return 0 from this function, the x86 908 - * TSC clock will give up and not register itself. 
909 - */ 910 - static unsigned long lguest_tsc_khz(void) 911 - { 912 - return lguest_data.tsc_khz; 913 - } 914 - 915 - /* 916 - * If we can't use the TSC, the kernel falls back to our lower-priority 917 - * "lguest_clock", where we read the time value given to us by the Host. 918 - */ 919 - static u64 lguest_clock_read(struct clocksource *cs) 920 - { 921 - unsigned long sec, nsec; 922 - 923 - /* 924 - * Since the time is in two parts (seconds and nanoseconds), we risk 925 - * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, 926 - * and getting 99 and 0. As Linux tends to come apart under the stress 927 - * of time travel, we must be careful: 928 - */ 929 - do { 930 - /* First we read the seconds part. */ 931 - sec = lguest_data.time.tv_sec; 932 - /* 933 - * This read memory barrier tells the compiler and the CPU that 934 - * this can't be reordered: we have to complete the above 935 - * before going on. 936 - */ 937 - rmb(); 938 - /* Now we read the nanoseconds part. */ 939 - nsec = lguest_data.time.tv_nsec; 940 - /* Make sure we've done that. */ 941 - rmb(); 942 - /* Now if the seconds part has changed, try again. */ 943 - } while (unlikely(lguest_data.time.tv_sec != sec)); 944 - 945 - /* Our lguest clock is in real nanoseconds. */ 946 - return sec*1000000000ULL + nsec; 947 - } 948 - 949 - /* This is the fallback clocksource: lower priority than the TSC clocksource. */ 950 - static struct clocksource lguest_clock = { 951 - .name = "lguest", 952 - .rating = 200, 953 - .read = lguest_clock_read, 954 - .mask = CLOCKSOURCE_MASK(64), 955 - .flags = CLOCK_SOURCE_IS_CONTINUOUS, 956 - }; 957 - 958 - /* 959 - * We also need a "struct clock_event_device": Linux asks us to set it to go 960 - * off some time in the future. Actually, James Morris figured all this out, I 961 - * just applied the patch. 
962 - */ 963 - static int lguest_clockevent_set_next_event(unsigned long delta, 964 - struct clock_event_device *evt) 965 - { 966 - /* FIXME: I don't think this can ever happen, but James tells me he had 967 - * to put this code in. Maybe we should remove it now. Anyone? */ 968 - if (delta < LG_CLOCK_MIN_DELTA) { 969 - if (printk_ratelimit()) 970 - printk(KERN_DEBUG "%s: small delta %lu ns\n", 971 - __func__, delta); 972 - return -ETIME; 973 - } 974 - 975 - /* Please wake us this far in the future. */ 976 - hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); 977 - return 0; 978 - } 979 - 980 - static int lguest_clockevent_shutdown(struct clock_event_device *evt) 981 - { 982 - /* A 0 argument shuts the clock down. */ 983 - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); 984 - return 0; 985 - } 986 - 987 - /* This describes our primitive timer chip. */ 988 - static struct clock_event_device lguest_clockevent = { 989 - .name = "lguest", 990 - .features = CLOCK_EVT_FEAT_ONESHOT, 991 - .set_next_event = lguest_clockevent_set_next_event, 992 - .set_state_shutdown = lguest_clockevent_shutdown, 993 - .rating = INT_MAX, 994 - .mult = 1, 995 - .shift = 0, 996 - .min_delta_ns = LG_CLOCK_MIN_DELTA, 997 - .min_delta_ticks = LG_CLOCK_MIN_DELTA, 998 - .max_delta_ns = LG_CLOCK_MAX_DELTA, 999 - .max_delta_ticks = LG_CLOCK_MAX_DELTA, 1000 - }; 1001 - 1002 - /* 1003 - * This is the Guest timer interrupt handler (hardware interrupt 0). We just 1004 - * call the clockevent infrastructure and it does whatever needs doing. 1005 - */ 1006 - static void lguest_time_irq(struct irq_desc *desc) 1007 - { 1008 - unsigned long flags; 1009 - 1010 - /* Don't interrupt us while this is running. */ 1011 - local_irq_save(flags); 1012 - lguest_clockevent.event_handler(&lguest_clockevent); 1013 - local_irq_restore(flags); 1014 - } 1015 - 1016 - /* 1017 - * At some point in the boot process, we get asked to set up our timing 1018 - * infrastructure. 
The kernel doesn't expect timer interrupts before this, but 1019 - * we cleverly initialized the "blocked_interrupts" field of "struct 1020 - * lguest_data" so that timer interrupts were blocked until now. 1021 - */ 1022 - static void lguest_time_init(void) 1023 - { 1024 - /* Set up the timer interrupt (0) to go to our simple timer routine */ 1025 - if (lguest_setup_irq(0) != 0) 1026 - panic("Could not set up timer irq"); 1027 - irq_set_handler(0, lguest_time_irq); 1028 - 1029 - clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); 1030 - 1031 - /* We can't set cpumask in the initializer: damn C limitations! Set it 1032 - * here and register our timer device. */ 1033 - lguest_clockevent.cpumask = cpumask_of(0); 1034 - clockevents_register_device(&lguest_clockevent); 1035 - 1036 - /* Finally, we unblock the timer interrupt. */ 1037 - clear_bit(0, lguest_data.blocked_interrupts); 1038 - } 1039 - 1040 - /* 1041 - * Miscellaneous bits and pieces. 1042 - * 1043 - * Here is an oddball collection of functions which the Guest needs for things 1044 - * to work. They're pretty simple. 1045 - */ 1046 - 1047 - /* 1048 - * The Guest needs to tell the Host what stack it expects traps to use. For 1049 - * native hardware, this is part of the Task State Segment mentioned above in 1050 - * lguest_load_tr_desc(), but to help hypervisors there's this special call. 1051 - * 1052 - * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data 1053 - * segment), the privilege level (we're privilege level 1, the Host is 0 and 1054 - * will not tolerate us trying to use that), the stack pointer, and the number 1055 - * of pages in the stack. 1056 - */ 1057 - static void lguest_load_sp0(struct tss_struct *tss, 1058 - struct thread_struct *thread) 1059 - { 1060 - lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, 1061 - THREAD_SIZE / PAGE_SIZE); 1062 - tss->x86_tss.sp0 = thread->sp0; 1063 - } 1064 - 1065 - /* Let's just say, I wouldn't do debugging under a Guest. 
*/ 1066 - static unsigned long lguest_get_debugreg(int regno) 1067 - { 1068 - /* FIXME: Implement */ 1069 - return 0; 1070 - } 1071 - 1072 - static void lguest_set_debugreg(int regno, unsigned long value) 1073 - { 1074 - /* FIXME: Implement */ 1075 - } 1076 - 1077 - /* 1078 - * There are times when the kernel wants to make sure that no memory writes are 1079 - * caught in the cache (that they've all reached real hardware devices). This 1080 - * doesn't matter for the Guest which has virtual hardware. 1081 - * 1082 - * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush 1083 - * (clflush) instruction is available and the kernel uses that. Otherwise, it 1084 - * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. 1085 - * Unlike clflush, wbinvd can only be run at privilege level 0. So we can 1086 - * ignore clflush, but replace wbinvd. 1087 - */ 1088 - static void lguest_wbinvd(void) 1089 - { 1090 - } 1091 - 1092 - /* 1093 - * If the Guest expects to have an Advanced Programmable Interrupt Controller, 1094 - * we play dumb by ignoring writes and returning 0 for reads. So it's no 1095 - * longer Programmable nor Controlling anything, and I don't think 8 lines of 1096 - * code qualifies for Advanced. It will also never interrupt anything. It 1097 - * does, however, allow us to get through the Linux boot code. 
1098 - */ 1099 - #ifdef CONFIG_X86_LOCAL_APIC 1100 - static void lguest_apic_write(u32 reg, u32 v) 1101 - { 1102 - } 1103 - 1104 - static u32 lguest_apic_read(u32 reg) 1105 - { 1106 - return 0; 1107 - } 1108 - 1109 - static u64 lguest_apic_icr_read(void) 1110 - { 1111 - return 0; 1112 - } 1113 - 1114 - static void lguest_apic_icr_write(u32 low, u32 id) 1115 - { 1116 - /* Warn to see if there's any stray references */ 1117 - WARN_ON(1); 1118 - } 1119 - 1120 - static void lguest_apic_wait_icr_idle(void) 1121 - { 1122 - return; 1123 - } 1124 - 1125 - static u32 lguest_apic_safe_wait_icr_idle(void) 1126 - { 1127 - return 0; 1128 - } 1129 - 1130 - static void set_lguest_basic_apic_ops(void) 1131 - { 1132 - apic->read = lguest_apic_read; 1133 - apic->write = lguest_apic_write; 1134 - apic->icr_read = lguest_apic_icr_read; 1135 - apic->icr_write = lguest_apic_icr_write; 1136 - apic->wait_icr_idle = lguest_apic_wait_icr_idle; 1137 - apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle; 1138 - }; 1139 - #endif 1140 - 1141 - /* STOP! Until an interrupt comes in. */ 1142 - static void lguest_safe_halt(void) 1143 - { 1144 - hcall(LHCALL_HALT, 0, 0, 0, 0); 1145 - } 1146 - 1147 - /* 1148 - * The SHUTDOWN hypercall takes a string to describe what's happening, and 1149 - * an argument which says whether this to restart (reboot) the Guest or not. 1150 - * 1151 - * Note that the Host always prefers that the Guest speak in physical addresses 1152 - * rather than virtual addresses, so we use __pa() here. 1153 - */ 1154 - static void lguest_power_off(void) 1155 - { 1156 - hcall(LHCALL_SHUTDOWN, __pa("Power down"), 1157 - LGUEST_SHUTDOWN_POWEROFF, 0, 0); 1158 - } 1159 - 1160 - /* 1161 - * Panicing. 1162 - * 1163 - * Don't. But if you did, this is what happens. 
1164 - */ 1165 - static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) 1166 - { 1167 - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); 1168 - /* The hcall won't return, but to keep gcc happy, we're "done". */ 1169 - return NOTIFY_DONE; 1170 - } 1171 - 1172 - static struct notifier_block paniced = { 1173 - .notifier_call = lguest_panic 1174 - }; 1175 - 1176 - /* Setting up memory is fairly easy. */ 1177 - static __init char *lguest_memory_setup(void) 1178 - { 1179 - /* 1180 - * The Linux bootloader header contains an "e820" memory map: the 1181 - * Launcher populated the first entry with our memory limit. 1182 - */ 1183 - e820__range_add(boot_params.e820_table[0].addr, 1184 - boot_params.e820_table[0].size, 1185 - boot_params.e820_table[0].type); 1186 - 1187 - /* This string is for the boot messages. */ 1188 - return "LGUEST"; 1189 - } 1190 - 1191 - /* Offset within PCI config space of BAR access capability. */ 1192 - static int console_cfg_offset = 0; 1193 - static int console_access_cap; 1194 - 1195 - /* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ 1196 - static void set_cfg_window(u32 cfg_offset, u32 off) 1197 - { 1198 - write_pci_config_byte(0, 1, 0, 1199 - cfg_offset + offsetof(struct virtio_pci_cap, bar), 1200 - 0); 1201 - write_pci_config(0, 1, 0, 1202 - cfg_offset + offsetof(struct virtio_pci_cap, length), 1203 - 4); 1204 - write_pci_config(0, 1, 0, 1205 - cfg_offset + offsetof(struct virtio_pci_cap, offset), 1206 - off); 1207 - } 1208 - 1209 - static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) 1210 - { 1211 - /* 1212 - * We could set this up once, then leave it; nothing else in the * 1213 - * kernel should touch these registers. But if it went wrong, that 1214 - * would be a horrible bug to find. 
1215 - */ 1216 - set_cfg_window(cfg_offset, off); 1217 - write_pci_config(0, 1, 0, 1218 - cfg_offset + sizeof(struct virtio_pci_cap), val); 1219 - } 1220 - 1221 - static void probe_pci_console(void) 1222 - { 1223 - u8 cap, common_cap = 0, device_cap = 0; 1224 - u32 device_len; 1225 - 1226 - /* Avoid recursive printk into here. */ 1227 - console_cfg_offset = -1; 1228 - 1229 - if (!early_pci_allowed()) { 1230 - printk(KERN_ERR "lguest: early PCI access not allowed!\n"); 1231 - return; 1232 - } 1233 - 1234 - /* We expect a console PCI device at BUS0, slot 1. */ 1235 - if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { 1236 - printk(KERN_ERR "lguest: PCI device is %#x!\n", 1237 - read_pci_config(0, 1, 0, 0)); 1238 - return; 1239 - } 1240 - 1241 - /* Find the capabilities we need (must be in bar0) */ 1242 - cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); 1243 - while (cap) { 1244 - u8 vndr = read_pci_config_byte(0, 1, 0, cap); 1245 - if (vndr == PCI_CAP_ID_VNDR) { 1246 - u8 type, bar; 1247 - 1248 - type = read_pci_config_byte(0, 1, 0, 1249 - cap + offsetof(struct virtio_pci_cap, cfg_type)); 1250 - bar = read_pci_config_byte(0, 1, 0, 1251 - cap + offsetof(struct virtio_pci_cap, bar)); 1252 - 1253 - switch (type) { 1254 - case VIRTIO_PCI_CAP_DEVICE_CFG: 1255 - if (bar == 0) 1256 - device_cap = cap; 1257 - break; 1258 - case VIRTIO_PCI_CAP_PCI_CFG: 1259 - console_access_cap = cap; 1260 - break; 1261 - } 1262 - } 1263 - cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); 1264 - } 1265 - if (!device_cap || !console_access_cap) { 1266 - printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", 1267 - common_cap, device_cap, console_access_cap); 1268 - return; 1269 - } 1270 - 1271 - /* 1272 - * Note that we can't check features, until we've set the DRIVER 1273 - * status bit. We don't want to do that until we have a real driver, 1274 - * so we just check that the device-specific config has room for 1275 - * emerg_wr. 
If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE 1276 - * it should ignore the access. 1277 - */ 1278 - device_len = read_pci_config(0, 1, 0, 1279 - device_cap + offsetof(struct virtio_pci_cap, length)); 1280 - if (device_len < (offsetof(struct virtio_console_config, emerg_wr) 1281 - + sizeof(u32))) { 1282 - printk(KERN_ERR "lguest: console missing emerg_wr field\n"); 1283 - return; 1284 - } 1285 - 1286 - console_cfg_offset = read_pci_config(0, 1, 0, 1287 - device_cap + offsetof(struct virtio_pci_cap, offset)); 1288 - printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); 1289 - } 1290 - 1291 - /* 1292 - * We will eventually use the virtio console device to produce console output, 1293 - * but before that is set up we use the virtio PCI console's backdoor mmio 1294 - * access and the "emergency" write facility (which is legal even before the 1295 - * device is configured). 1296 - */ 1297 - static __init int early_put_chars(u32 vtermno, const char *buf, int count) 1298 - { 1299 - /* If we couldn't find PCI console, forget it. */ 1300 - if (console_cfg_offset < 0) 1301 - return count; 1302 - 1303 - if (unlikely(!console_cfg_offset)) { 1304 - probe_pci_console(); 1305 - if (console_cfg_offset < 0) 1306 - return count; 1307 - } 1308 - 1309 - write_bar_via_cfg(console_access_cap, 1310 - console_cfg_offset 1311 - + offsetof(struct virtio_console_config, emerg_wr), 1312 - buf[0]); 1313 - return 1; 1314 - } 1315 - 1316 - /* 1317 - * Rebooting also tells the Host we're finished, but the RESTART flag tells the 1318 - * Launcher to reboot us. 1319 - */ 1320 - static void lguest_restart(char *reason) 1321 - { 1322 - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); 1323 - } 1324 - 1325 - /*G:050 1326 - * Patching (Powerfully Placating Performance Pedants) 1327 - * 1328 - * We have already seen that pv_ops structures let us replace simple native 1329 - * instructions with calls to the appropriate back end all throughout the 1330 - * kernel. 
This allows the same kernel to run as a Guest and as a native 1331 - * kernel, but it's slow because of all the indirect branches. 1332 - * 1333 - * Remember that David Wheeler quote about "Any problem in computer science can 1334 - * be solved with another layer of indirection"? The rest of that quote is 1335 - * "... But that usually will create another problem." This is the first of 1336 - * those problems. 1337 - * 1338 - * Our current solution is to allow the paravirt back end to optionally patch 1339 - * over the indirect calls to replace them with something more efficient. We 1340 - * patch two of the simplest of the most commonly called functions: disable 1341 - * interrupts and save interrupts. We usually have 6 or 10 bytes to patch 1342 - * into: the Guest versions of these operations are small enough that we can 1343 - * fit comfortably. 1344 - * 1345 - * First we need assembly templates of each of the patchable Guest operations, 1346 - * and these are in head_32.S. 1347 - */ 1348 - 1349 - /*G:060 We construct a table from the assembler templates: */ 1350 - static const struct lguest_insns 1351 - { 1352 - const char *start, *end; 1353 - } lguest_insns[] = { 1354 - [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, 1355 - [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1356 - }; 1357 - 1358 - /* 1359 - * Now our patch routine is fairly simple (based on the native one in 1360 - * paravirt.c). If we have a replacement, we copy it in and return how much of 1361 - * the available space we used. 
1362 - */ 1363 - static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, 1364 - unsigned long addr, unsigned len) 1365 - { 1366 - unsigned int insn_len; 1367 - 1368 - /* Don't do anything special if we don't have a replacement */ 1369 - if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) 1370 - return paravirt_patch_default(type, clobber, ibuf, addr, len); 1371 - 1372 - insn_len = lguest_insns[type].end - lguest_insns[type].start; 1373 - 1374 - /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ 1375 - if (len < insn_len) 1376 - return paravirt_patch_default(type, clobber, ibuf, addr, len); 1377 - 1378 - /* Copy in our instructions. */ 1379 - memcpy(ibuf, lguest_insns[type].start, insn_len); 1380 - return insn_len; 1381 - } 1382 - 1383 - /*G:029 1384 - * Once we get to lguest_init(), we know we're a Guest. The various 1385 - * pv_ops structures in the kernel provide points for (almost) every routine we 1386 - * have to override to avoid privileged instructions. 1387 - */ 1388 - __init void lguest_init(void) 1389 - { 1390 - /* We're under lguest. */ 1391 - pv_info.name = "lguest"; 1392 - /* We're running at privilege level 1, not 0 as normal. */ 1393 - pv_info.kernel_rpl = 1; 1394 - /* Everyone except Xen runs with this set. */ 1395 - pv_info.shared_kernel_pmd = 1; 1396 - 1397 - /* 1398 - * We set up all the lguest overrides for sensitive operations. These 1399 - * are detailed with the operations themselves. 
1400 - */ 1401 - 1402 - /* Interrupt-related operations */ 1403 - pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl); 1404 - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); 1405 - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable); 1406 - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); 1407 - pv_irq_ops.safe_halt = lguest_safe_halt; 1408 - 1409 - /* Setup operations */ 1410 - pv_init_ops.patch = lguest_patch; 1411 - 1412 - /* Intercepts of various CPU instructions */ 1413 - pv_cpu_ops.load_gdt = lguest_load_gdt; 1414 - pv_cpu_ops.cpuid = lguest_cpuid; 1415 - pv_cpu_ops.load_idt = lguest_load_idt; 1416 - pv_cpu_ops.iret = lguest_iret; 1417 - pv_cpu_ops.load_sp0 = lguest_load_sp0; 1418 - pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; 1419 - pv_cpu_ops.set_ldt = lguest_set_ldt; 1420 - pv_cpu_ops.load_tls = lguest_load_tls; 1421 - pv_cpu_ops.get_debugreg = lguest_get_debugreg; 1422 - pv_cpu_ops.set_debugreg = lguest_set_debugreg; 1423 - pv_cpu_ops.read_cr0 = lguest_read_cr0; 1424 - pv_cpu_ops.write_cr0 = lguest_write_cr0; 1425 - pv_cpu_ops.read_cr4 = lguest_read_cr4; 1426 - pv_cpu_ops.write_cr4 = lguest_write_cr4; 1427 - pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; 1428 - pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; 1429 - pv_cpu_ops.wbinvd = lguest_wbinvd; 1430 - pv_cpu_ops.start_context_switch = paravirt_start_context_switch; 1431 - pv_cpu_ops.end_context_switch = lguest_end_context_switch; 1432 - 1433 - /* Pagetable management */ 1434 - pv_mmu_ops.write_cr3 = lguest_write_cr3; 1435 - pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; 1436 - pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; 1437 - pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; 1438 - pv_mmu_ops.set_pte = lguest_set_pte; 1439 - pv_mmu_ops.set_pte_at = lguest_set_pte_at; 1440 - pv_mmu_ops.set_pmd = lguest_set_pmd; 1441 - #ifdef CONFIG_X86_PAE 1442 - pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; 1443 - pv_mmu_ops.pte_clear = 
lguest_pte_clear; 1444 - pv_mmu_ops.pmd_clear = lguest_pmd_clear; 1445 - pv_mmu_ops.set_pud = lguest_set_pud; 1446 - #endif 1447 - pv_mmu_ops.read_cr2 = lguest_read_cr2; 1448 - pv_mmu_ops.read_cr3 = lguest_read_cr3; 1449 - pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; 1450 - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; 1451 - pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; 1452 - pv_mmu_ops.pte_update = lguest_pte_update; 1453 - 1454 - #ifdef CONFIG_X86_LOCAL_APIC 1455 - /* APIC read/write intercepts */ 1456 - set_lguest_basic_apic_ops(); 1457 - #endif 1458 - 1459 - x86_init.resources.memory_setup = lguest_memory_setup; 1460 - x86_init.irqs.intr_init = lguest_init_IRQ; 1461 - x86_init.timers.timer_init = lguest_time_init; 1462 - x86_platform.calibrate_tsc = lguest_tsc_khz; 1463 - x86_platform.get_wallclock = lguest_get_wallclock; 1464 - 1465 - /* 1466 - * Now is a good time to look at the implementations of these functions 1467 - * before returning to the rest of lguest_init(). 1468 - */ 1469 - 1470 - /*G:070 1471 - * Now we've seen all the paravirt_ops, we return to 1472 - * lguest_init() where the rest of the fairly chaotic boot setup 1473 - * occurs. 1474 - */ 1475 - 1476 - /* 1477 - * The stack protector is a weird thing where gcc places a canary 1478 - * value on the stack and then checks it on return. This file is 1479 - * compiled with -fno-stack-protector, so we got this far without 1480 - * problems. The value of the canary is kept at offset 20 from the 1481 - * %gs register, so we need to set that up before calling C functions 1482 - * in other files. 1483 - */ 1484 - setup_stack_canary_segment(0); 1485 - 1486 - /* 1487 - * We could just call load_stack_canary_segment(), but we might as well 1488 - * call switch_to_new_gdt() which loads the whole table and sets up the 1489 - * per-cpu segment descriptor register %fs as well. 
1490 - */ 1491 - switch_to_new_gdt(0); 1492 - 1493 - /* 1494 - * The Host<->Guest Switcher lives at the top of our address space, and 1495 - * the Host told us how big it is when we made LGUEST_INIT hypercall: 1496 - * it put the answer in lguest_data.reserve_mem 1497 - */ 1498 - reserve_top_address(lguest_data.reserve_mem); 1499 - 1500 - /* Hook in our special panic hypercall code. */ 1501 - atomic_notifier_chain_register(&panic_notifier_list, &paniced); 1502 - 1503 - /* 1504 - * This is messy CPU setup stuff which the native boot code does before 1505 - * start_kernel, so we have to do it, too: 1506 - */ 1507 - cpu_detect(&new_cpu_data); 1508 - /* head.S usually sets up the first capability word, so do it here. */ 1509 - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); 1510 - 1511 - /* Math is always hard! */ 1512 - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); 1513 - 1514 - /* We don't have features. We have puppies! Puppies! */ 1515 - #ifdef CONFIG_X86_MCE 1516 - mca_cfg.disabled = true; 1517 - #endif 1518 - #ifdef CONFIG_ACPI 1519 - acpi_disabled = 1; 1520 - #endif 1521 - 1522 - /* 1523 - * We set the preferred console to "hvc". This is the "hypervisor 1524 - * virtual console" driver written by the PowerPC people, which we also 1525 - * adapted for lguest's use. 1526 - */ 1527 - add_preferred_console("hvc", 0, NULL); 1528 - 1529 - /* Register our very early console. */ 1530 - virtio_cons_early_init(early_put_chars); 1531 - 1532 - /* Don't let ACPI try to control our PCI interrupts. */ 1533 - disable_acpi(); 1534 - 1535 - /* We control them ourselves, by overriding these two hooks. */ 1536 - pcibios_enable_irq = lguest_enable_irq; 1537 - pcibios_disable_irq = lguest_disable_irq; 1538 - 1539 - /* 1540 - * Last of all, we set the power management poweroff hook to point to 1541 - * the Guest routine to power off, and the reboot hook to our restart 1542 - * routine. 
1543 - */ 1544 - pm_power_off = lguest_power_off; 1545 - machine_ops.restart = lguest_restart; 1546 - 1547 - /* 1548 - * Now we're set up, call i386_start_kernel() in head32.c and we proceed 1549 - * to boot as normal. It never returns. 1550 - */ 1551 - i386_start_kernel(); 1552 - } 1553 - /* 1554 - * This marks the end of stage II of our journey, The Guest. 1555 - * 1556 - * It is now time for us to explore the layer of virtual drivers and complete 1557 - * our understanding of the Guest in "make Drivers". 1558 - */
-192
arch/x86/lguest/head_32.S
··· 1 - #include <linux/linkage.h> 2 - #include <linux/lguest.h> 3 - #include <asm/lguest_hcall.h> 4 - #include <asm/asm-offsets.h> 5 - #include <asm/thread_info.h> 6 - #include <asm/processor-flags.h> 7 - 8 - /*G:020 9 - 10 - * Our story starts with the bzImage: booting starts at startup_32 in 11 - * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real 12 - * kernel in place and then jumps into it: startup_32 in 13 - * arch/x86/kernel/head_32.S. Both routines expect a boot header in the %esi 14 - * register, which is created by the bootloader (the Launcher in our case). 15 - * 16 - * The startup_32 function does very little: it clears the uninitialized global 17 - * C variables which we expect to be zero (ie. BSS) and then copies the boot 18 - * header and kernel command line somewhere safe, and populates some initial 19 - * page tables. Finally it checks the 'hardware_subarch' field. This was 20 - * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's 21 - * assigned number), then it calls us here. 22 - * 23 - * WARNING: be very careful here! We're running at addresses equal to physical 24 - * addresses (around 0), not above PAGE_OFFSET as most code expects 25 - * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any 26 - * data without remembering to subtract __PAGE_OFFSET! 27 - * 28 - * The .section line puts this code in .init.text so it will be discarded after 29 - * boot. 30 - */ 31 - .section .init.text, "ax", @progbits 32 - ENTRY(lguest_entry) 33 - /* 34 - * We make the "initialization" hypercall now to tell the Host where 35 - * our lguest_data struct is. 36 - */ 37 - movl $LHCALL_LGUEST_INIT, %eax 38 - movl $lguest_data - __PAGE_OFFSET, %ebx 39 - int $LGUEST_TRAP_ENTRY 40 - 41 - /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. 
*/ 42 - movl $LHCALL_NEW_PGTABLE, %eax 43 - movl $(initial_page_table - __PAGE_OFFSET), %ebx 44 - int $LGUEST_TRAP_ENTRY 45 - 46 - /* Set up the initial stack so we can run C code. */ 47 - movl $(init_thread_union+THREAD_SIZE),%esp 48 - 49 - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ 50 - jmp lguest_init+__PAGE_OFFSET 51 - 52 - /*G:055 53 - * We create a macro which puts the assembler code between lgstart_ and lgend_ 54 - * markers. These templates are put in the .text section: they can't be 55 - * discarded after boot as we may need to patch modules, too. 56 - */ 57 - .text 58 - #define LGUEST_PATCH(name, insns...) \ 59 - lgstart_##name: insns; lgend_##name:; \ 60 - .globl lgstart_##name; .globl lgend_##name 61 - 62 - LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 63 - LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 64 - 65 - /*G:033 66 - * But using those wrappers is inefficient (we'll see why that doesn't matter 67 - * for save_fl and irq_disable later). If we write our routines carefully in 68 - * assembler, we can avoid clobbering any registers and avoid jumping through 69 - * the wrapper functions. 70 - * 71 - * I skipped over our first piece of assembler, but this one is worth studying 72 - * in a bit more detail so I'll describe in easy stages. First, the routine to 73 - * enable interrupts: 74 - */ 75 - ENTRY(lg_irq_enable) 76 - /* 77 - * The reverse of irq_disable, this sets lguest_data.irq_enabled to 78 - * X86_EFLAGS_IF (ie. "Interrupts enabled"). 79 - */ 80 - movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled 81 - /* 82 - * But now we need to check if the Host wants to know: there might have 83 - * been interrupts waiting to be delivered, in which case it will have 84 - * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we 85 - * jump to send_interrupts, otherwise we're done. 
86 - */ 87 - cmpl $0, lguest_data+LGUEST_DATA_irq_pending 88 - jnz send_interrupts 89 - /* 90 - * One cool thing about x86 is that you can do many things without using 91 - * a register. In this case, the normal path hasn't needed to save or 92 - * restore any registers at all! 93 - */ 94 - ret 95 - send_interrupts: 96 - /* 97 - * OK, now we need a register: eax is used for the hypercall number, 98 - * which is LHCALL_SEND_INTERRUPTS. 99 - * 100 - * We used not to bother with this pending detection at all, which was 101 - * much simpler. Sooner or later the Host would realize it had to 102 - * send us an interrupt. But that turns out to make performance 7 103 - * times worse on a simple tcp benchmark. So now we do this the hard 104 - * way. 105 - */ 106 - pushl %eax 107 - movl $LHCALL_SEND_INTERRUPTS, %eax 108 - /* This is the actual hypercall trap. */ 109 - int $LGUEST_TRAP_ENTRY 110 - /* Put eax back the way we found it. */ 111 - popl %eax 112 - ret 113 - 114 - /* 115 - * Finally, the "popf" or "restore flags" routine. The %eax register holds the 116 - * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're 117 - * enabling interrupts again, if it's 0 we're leaving them off. 118 - */ 119 - ENTRY(lg_restore_fl) 120 - /* This is just "lguest_data.irq_enabled = flags;" */ 121 - movl %eax, lguest_data+LGUEST_DATA_irq_enabled 122 - /* 123 - * Now, if the %eax value has enabled interrupts and 124 - * lguest_data.irq_pending is set, we want to tell the Host so it can 125 - * deliver any outstanding interrupts. Fortunately, both values will 126 - * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" 127 - * instruction will AND them together for us. If both are set, we 128 - * jump to send_interrupts. 129 - */ 130 - testl lguest_data+LGUEST_DATA_irq_pending, %eax 131 - jnz send_interrupts 132 - /* Again, the normal path has used no extra registers. Clever, huh? 
*/ 133 - ret 134 - /*:*/ 135 - 136 - /* This marks the EIP where the Host should never deliver interrupts. */ 137 - .global lguest_noirq_iret 138 - 139 - /*M:004 140 - * When the Host reflects a trap or injects an interrupt into the Guest, it 141 - * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, 142 - * so the Guest iret logic does the right thing when restoring it. However, 143 - * when the Host sets the Guest up for direct traps, such as system calls, the 144 - * processor is the one to push eflags onto the stack, and the interrupt bit 145 - * will be 1 (in reality, interrupts are always enabled in the Guest). 146 - * 147 - * This turns out to be harmless: the only trap which should happen under Linux 148 - * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc 149 - * regions), which has to be reflected through the Host anyway. If another 150 - * trap *does* go off when interrupts are disabled, the Guest will panic, and 151 - * we'll never get to this iret! 152 - :*/ 153 - 154 - /*G:045 155 - * There is one final paravirt_op that the Guest implements, and glancing at it 156 - * you can see why I left it to last. It's *cool*! It's in *assembler*! 157 - * 158 - * The "iret" instruction is used to return from an interrupt or trap. The 159 - * stack looks like this: 160 - * old address 161 - * old code segment & privilege level 162 - * old processor flags ("eflags") 163 - * 164 - * The "iret" instruction pops those values off the stack and restores them all 165 - * at once. The only problem is that eflags includes the Interrupt Flag which 166 - * the Guest can't change: the CPU will simply ignore it when we do an "iret". 167 - * So we have to copy eflags from the stack to lguest_data.irq_enabled before 168 - * we do the "iret". 169 - * 170 - * There are two problems with this: firstly, we can't clobber any registers 171 - * and secondly, the whole thing needs to be atomic. 
The first problem 172 - * is solved by using "push memory"/"pop memory" instruction pair for copying. 173 - * 174 - * The second is harder: copying eflags to lguest_data.irq_enabled will turn 175 - * interrupts on before we're finished, so we could be interrupted before we 176 - * return to userspace or wherever. Our solution to this is to tell the 177 - * Host that it is *never* to interrupt us there, even if interrupts seem to be 178 - * enabled. (It's not necessary to protect the pop instruction, since 179 - * data gets updated only after it completes, so we only need to protect 180 - * one instruction, iret). 181 - */ 182 - ENTRY(lguest_iret) 183 - pushl 2*4(%esp) 184 - /* 185 - * Note the %ss: segment prefix here. Normal data accesses use the 186 - * "ds" segment, but that will have already been restored for whatever 187 - * we're returning to (such as userspace): we can't trust it. The %ss: 188 - * prefix makes sure we use the stack segment, which is still valid. 189 - */ 190 - popl %ss:lguest_data+LGUEST_DATA_irq_enabled 191 - lguest_noirq_iret: 192 - iret
-1
drivers/Makefile
··· 125 125 obj-$(CONFIG_ISDN) += isdn/ 126 126 obj-$(CONFIG_EDAC) += edac/ 127 127 obj-$(CONFIG_EISA) += eisa/ 128 - obj-y += lguest/ 129 128 obj-$(CONFIG_CPU_FREQ) += cpufreq/ 130 129 obj-$(CONFIG_CPU_IDLE) += cpuidle/ 131 130 obj-y += mmc/
+1 -1
drivers/block/Kconfig
··· 470 470 depends on VIRTIO 471 471 ---help--- 472 472 This is the virtual block driver for virtio. It can be used with 473 - lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. 473 + QEMU based VMMs (like KVM or Xen). Say Y or M. 474 474 475 475 config VIRTIO_BLK_SCSI 476 476 bool "SCSI passthrough request for the Virtio block driver"
+1 -1
drivers/char/Kconfig
··· 161 161 depends on VIRTIO && TTY 162 162 select HVC_DRIVER 163 163 help 164 - Virtio console for use with lguest and other hypervisors. 164 + Virtio console for use with hypervisors. 165 165 166 166 Also serves as a general-purpose serial device for data 167 167 transfer between the guest and host. Character devices at
+1 -1
drivers/char/virtio_console.c
··· 1130 1130 * We turn the characters into a scatter-gather list, add it to the 1131 1131 * output queue and then kick the Host. Then we sit here waiting for 1132 1132 * it to finish: inefficient in theory, but in practice 1133 - * implementations will do it immediately (lguest's Launcher does). 1133 + * implementations will do it immediately. 1134 1134 */ 1135 1135 static int put_chars(u32 vtermno, const char *buf, int count) 1136 1136 {
-13
drivers/lguest/Kconfig
··· 1 - config LGUEST 2 - tristate "Linux hypervisor example code" 3 - depends on X86_32 && EVENTFD && TTY && PCI_DIRECT 4 - select HVC_DRIVER 5 - ---help--- 6 - This is a very simple module which allows you to run 7 - multiple instances of the same Linux kernel, using the 8 - "lguest" command found in the tools/lguest directory. 9 - 10 - Note that "lguest" is pronounced to rhyme with "fell quest", 11 - not "rustyvisor". See tools/lguest/lguest.txt. 12 - 13 - If unsure, say N. If curious, say M. If masochistic, say Y.
-26
drivers/lguest/Makefile
··· 1 - # Host requires the other files, which can be a module. 2 - obj-$(CONFIG_LGUEST) += lg.o 3 - lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \ 4 - segments.o lguest_user.o 5 - 6 - lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o 7 - 8 - Preparation Preparation!: PREFIX=P 9 - Guest: PREFIX=G 10 - Drivers: PREFIX=D 11 - Launcher: PREFIX=L 12 - Host: PREFIX=H 13 - Switcher: PREFIX=S 14 - Mastery: PREFIX=M 15 - Beer: 16 - @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}" 17 - Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery: 18 - @sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` 19 - Puppy: 20 - @clear 21 - @printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n" 22 - @sleep 2; clear; printf "\n\n Sit!\n\n"; sleep 1; clear 23 - @printf " __ \n ()'\`; \n /\\|\` \n / | \n(/_)_|_ \n" 24 - @sleep 2; clear; printf "\n\n Stand!\n\n"; sleep 1; clear 25 - @printf " __ \n ()'\`; \n /\\|\` \n /._.= \n /| / \n(_\_)_ \n" 26 - @sleep 2; clear; printf "\n\n Good puppy!\n\n"; sleep 1; clear
-47
drivers/lguest/README
··· 1 - Welcome, friend reader, to lguest. 2 - 3 - Lguest is an adventure, with you, the reader, as Hero. I can't think of many 4 - 5000-line projects which offer both such capability and glimpses of future 5 - potential; it is an exciting time to be delving into the source! 6 - 7 - But be warned; this is an arduous journey of several hours or more! And as we 8 - know, all true Heroes are driven by a Noble Goal. Thus I offer a Beer (or 9 - equivalent) to anyone I meet who has completed this documentation. 10 - 11 - So get comfortable and keep your wits about you (both quick and humorous). 12 - Along your way to the Noble Goal, you will also gain masterly insight into 13 - lguest, and hypervisors and x86 virtualization in general. 14 - 15 - Our Quest is in seven parts: (best read with C highlighting turned on) 16 - 17 - I) Preparation 18 - - In which our potential hero is flown quickly over the landscape for a 19 - taste of its scope. Suitable for the armchair coders and other such 20 - persons of faint constitution. 21 - 22 - II) Guest 23 - - Where we encounter the first tantalising wisps of code, and come to 24 - understand the details of the life of a Guest kernel. 25 - 26 - III) Drivers 27 - - Whereby the Guest finds its voice and becomes useful, and our 28 - understanding of the Guest is completed. 29 - 30 - IV) Launcher 31 - - Where we trace back to the creation of the Guest, and thus begin our 32 - understanding of the Host. 33 - 34 - V) Host 35 - - Where we master the Host code, through a long and tortuous journey. 36 - Indeed, it is here that our hero is tested in the Bit of Despair. 37 - 38 - VI) Switcher 39 - - Where our understanding of the intertwined nature of Guests and Hosts 40 - is completed. 41 - 42 - VII) Mastery 43 - - Where our fully fledged hero grapples with the Great Question: 44 - "What next?" 45 - 46 - make Preparation! 47 - Rusty Russell.
-398
drivers/lguest/core.c
··· 1 - /*P:400 2 - * This contains run_guest() which actually calls into the Host<->Guest 3 - * Switcher and analyzes the return, such as determining if the Guest wants the 4 - * Host to do something. This file also contains useful helper routines. 5 - :*/ 6 - #include <linux/module.h> 7 - #include <linux/stringify.h> 8 - #include <linux/stddef.h> 9 - #include <linux/io.h> 10 - #include <linux/mm.h> 11 - #include <linux/sched/signal.h> 12 - #include <linux/vmalloc.h> 13 - #include <linux/cpu.h> 14 - #include <linux/freezer.h> 15 - #include <linux/highmem.h> 16 - #include <linux/slab.h> 17 - #include <asm/paravirt.h> 18 - #include <asm/pgtable.h> 19 - #include <linux/uaccess.h> 20 - #include <asm/poll.h> 21 - #include <asm/asm-offsets.h> 22 - #include "lg.h" 23 - 24 - unsigned long switcher_addr; 25 - struct page **lg_switcher_pages; 26 - static struct vm_struct *switcher_text_vma; 27 - static struct vm_struct *switcher_stacks_vma; 28 - 29 - /* This One Big lock protects all inter-guest data structures. */ 30 - DEFINE_MUTEX(lguest_lock); 31 - 32 - /*H:010 33 - * We need to set up the Switcher at a high virtual address. Remember the 34 - * Switcher is a few hundred bytes of assembler code which actually changes the 35 - * CPU to run the Guest, and then changes back to the Host when a trap or 36 - * interrupt happens. 37 - * 38 - * The Switcher code must be at the same virtual address in the Guest as the 39 - * Host since it will be running as the switchover occurs. 40 - * 41 - * Trying to map memory at a particular address is an unusual thing to do, so 42 - * it's not a simple one-liner. 43 - */ 44 - static __init int map_switcher(void) 45 - { 46 - int i, err; 47 - 48 - /* 49 - * Map the Switcher in to high memory. 50 - * 51 - * It turns out that if we choose the address 0xFFC00000 (4MB under the 52 - * top virtual address), it makes setting up the page tables really 53 - * easy. 54 - */ 55 - 56 - /* We assume Switcher text fits into a single page. 
*/ 57 - if (end_switcher_text - start_switcher_text > PAGE_SIZE) { 58 - printk(KERN_ERR "lguest: switcher text too large (%zu)\n", 59 - end_switcher_text - start_switcher_text); 60 - return -EINVAL; 61 - } 62 - 63 - /* 64 - * We allocate an array of struct page pointers. map_vm_area() wants 65 - * this, rather than just an array of pages. 66 - */ 67 - lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0]) 68 - * TOTAL_SWITCHER_PAGES, 69 - GFP_KERNEL); 70 - if (!lg_switcher_pages) { 71 - err = -ENOMEM; 72 - goto out; 73 - } 74 - 75 - /* 76 - * Now we actually allocate the pages. The Guest will see these pages, 77 - * so we make sure they're zeroed. 78 - */ 79 - for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { 80 - lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); 81 - if (!lg_switcher_pages[i]) { 82 - err = -ENOMEM; 83 - goto free_some_pages; 84 - } 85 - } 86 - 87 - /* 88 - * Copy in the compiled-in Switcher code (from x86/switcher_32.S). 89 - * It goes in the first page, which we map in momentarily. 90 - */ 91 - memcpy(kmap(lg_switcher_pages[0]), start_switcher_text, 92 - end_switcher_text - start_switcher_text); 93 - kunmap(lg_switcher_pages[0]); 94 - 95 - /* 96 - * We place the Switcher underneath the fixmap area, which is the 97 - * highest virtual address we can get. This is important, since we 98 - * tell the Guest it can't access this memory, so we want its ceiling 99 - * as high as possible. 100 - */ 101 - switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE; 102 - 103 - /* 104 - * Now we reserve the "virtual memory area"s we want. We might 105 - * not get them in theory, but in practice it's worked so far. 106 - * 107 - * We want the switcher text to be read-only and executable, and 108 - * the stacks to be read-write and non-executable. 
109 - */ 110 - switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD, 111 - switcher_addr, 112 - switcher_addr + PAGE_SIZE); 113 - 114 - if (!switcher_text_vma) { 115 - err = -ENOMEM; 116 - printk("lguest: could not map switcher pages high\n"); 117 - goto free_pages; 118 - } 119 - 120 - switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE, 121 - VM_ALLOC|VM_NO_GUARD, 122 - switcher_addr + PAGE_SIZE, 123 - switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE); 124 - if (!switcher_stacks_vma) { 125 - err = -ENOMEM; 126 - printk("lguest: could not map switcher pages high\n"); 127 - goto free_text_vma; 128 - } 129 - 130 - /* 131 - * This code actually sets up the pages we've allocated to appear at 132 - * switcher_addr. map_vm_area() takes the vma we allocated above, the 133 - * kind of pages we're mapping (kernel text pages and kernel writable 134 - * pages respectively), and a pointer to our array of struct pages. 135 - */ 136 - err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages); 137 - if (err) { 138 - printk("lguest: text map_vm_area failed: %i\n", err); 139 - goto free_vmas; 140 - } 141 - 142 - err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL, 143 - lg_switcher_pages + SWITCHER_TEXT_PAGES); 144 - if (err) { 145 - printk("lguest: stacks map_vm_area failed: %i\n", err); 146 - goto free_vmas; 147 - } 148 - 149 - /* 150 - * Now the Switcher is mapped at the right address, we can't fail! 151 - */ 152 - printk(KERN_INFO "lguest: mapped switcher at %p\n", 153 - switcher_text_vma->addr); 154 - /* And we succeeded... 
*/ 155 - return 0; 156 - 157 - free_vmas: 158 - /* Undoes map_vm_area and __get_vm_area */ 159 - vunmap(switcher_stacks_vma->addr); 160 - free_text_vma: 161 - vunmap(switcher_text_vma->addr); 162 - free_pages: 163 - i = TOTAL_SWITCHER_PAGES; 164 - free_some_pages: 165 - for (--i; i >= 0; i--) 166 - __free_pages(lg_switcher_pages[i], 0); 167 - kfree(lg_switcher_pages); 168 - out: 169 - return err; 170 - } 171 - /*:*/ 172 - 173 - /* Cleaning up the mapping when the module is unloaded is almost... too easy. */ 174 - static void unmap_switcher(void) 175 - { 176 - unsigned int i; 177 - 178 - /* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */ 179 - vunmap(switcher_text_vma->addr); 180 - vunmap(switcher_stacks_vma->addr); 181 - /* Now we just need to free the pages we copied the switcher into */ 182 - for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) 183 - __free_pages(lg_switcher_pages[i], 0); 184 - kfree(lg_switcher_pages); 185 - } 186 - 187 - /*H:032 188 - * Dealing With Guest Memory. 189 - * 190 - * Before we go too much further into the Host, we need to grok the routines 191 - * we use to deal with Guest memory. 192 - * 193 - * When the Guest gives us (what it thinks is) a physical address, we can use 194 - * the normal copy_from_user() & copy_to_user() on the corresponding place in 195 - * the memory region allocated by the Launcher. 196 - * 197 - * But we can't trust the Guest: it might be trying to access the Launcher 198 - * code. We have to check that the range is below the pfn_limit the Launcher 199 - * gave us. We have to make sure that addr + len doesn't give us a false 200 - * positive by overflowing, too. 201 - */ 202 - bool lguest_address_ok(const struct lguest *lg, 203 - unsigned long addr, unsigned long len) 204 - { 205 - return addr+len <= lg->pfn_limit * PAGE_SIZE && (addr+len >= addr); 206 - } 207 - 208 - /* 209 - * This routine copies memory from the Guest. 
Here we can see how useful the 210 - * kill_lguest() routine we met in the Launcher can be: we return a random 211 - * value (all zeroes) instead of needing to return an error. 212 - */ 213 - void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) 214 - { 215 - if (!lguest_address_ok(cpu->lg, addr, bytes) 216 - || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) { 217 - /* copy_from_user should do this, but as we rely on it... */ 218 - memset(b, 0, bytes); 219 - kill_guest(cpu, "bad read address %#lx len %u", addr, bytes); 220 - } 221 - } 222 - 223 - /* This is the write (copy into Guest) version. */ 224 - void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, 225 - unsigned bytes) 226 - { 227 - if (!lguest_address_ok(cpu->lg, addr, bytes) 228 - || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0) 229 - kill_guest(cpu, "bad write address %#lx len %u", addr, bytes); 230 - } 231 - /*:*/ 232 - 233 - /*H:030 234 - * Let's jump straight to the main loop which runs the Guest. 235 - * Remember, this is called by the Launcher reading /dev/lguest, and we keep 236 - * going around and around until something interesting happens. 237 - */ 238 - int run_guest(struct lg_cpu *cpu, unsigned long __user *user) 239 - { 240 - /* If the launcher asked for a register with LHREQ_GETREG */ 241 - if (cpu->reg_read) { 242 - if (put_user(*cpu->reg_read, user)) 243 - return -EFAULT; 244 - cpu->reg_read = NULL; 245 - return sizeof(*cpu->reg_read); 246 - } 247 - 248 - /* We stop running once the Guest is dead. */ 249 - while (!cpu->lg->dead) { 250 - unsigned int irq; 251 - bool more; 252 - 253 - /* First we run any hypercalls the Guest wants done. */ 254 - if (cpu->hcall) 255 - do_hypercalls(cpu); 256 - 257 - /* Do we have to tell the Launcher about a trap? 
*/ 258 - if (cpu->pending.trap) { 259 - if (copy_to_user(user, &cpu->pending, 260 - sizeof(cpu->pending))) 261 - return -EFAULT; 262 - return sizeof(cpu->pending); 263 - } 264 - 265 - /* 266 - * All long-lived kernel loops need to check with this horrible 267 - * thing called the freezer. If the Host is trying to suspend, 268 - * it stops us. 269 - */ 270 - try_to_freeze(); 271 - 272 - /* Check for signals */ 273 - if (signal_pending(current)) 274 - return -ERESTARTSYS; 275 - 276 - /* 277 - * Check if there are any interrupts which can be delivered now: 278 - * if so, this sets up the handler to be executed when we next 279 - * run the Guest. 280 - */ 281 - irq = interrupt_pending(cpu, &more); 282 - if (irq < LGUEST_IRQS) 283 - try_deliver_interrupt(cpu, irq, more); 284 - 285 - /* 286 - * Just make absolutely sure the Guest is still alive. One of 287 - * those hypercalls could have been fatal, for example. 288 - */ 289 - if (cpu->lg->dead) 290 - break; 291 - 292 - /* 293 - * If the Guest asked to be stopped, we sleep. The Guest's 294 - * clock timer will wake us. 295 - */ 296 - if (cpu->halted) { 297 - set_current_state(TASK_INTERRUPTIBLE); 298 - /* 299 - * Just before we sleep, make sure no interrupt snuck in 300 - * which we should be doing. 301 - */ 302 - if (interrupt_pending(cpu, &more) < LGUEST_IRQS) 303 - set_current_state(TASK_RUNNING); 304 - else 305 - schedule(); 306 - continue; 307 - } 308 - 309 - /* 310 - * OK, now we're ready to jump into the Guest. First we put up 311 - * the "Do Not Disturb" sign: 312 - */ 313 - local_irq_disable(); 314 - 315 - /* Actually run the Guest until something happens. */ 316 - lguest_arch_run_guest(cpu); 317 - 318 - /* Now we're ready to be interrupted or moved to other CPUs */ 319 - local_irq_enable(); 320 - 321 - /* Now we deal with whatever happened to the Guest. */ 322 - lguest_arch_handle_trap(cpu); 323 - } 324 - 325 - /* Special case: Guest is 'dead' but wants a reboot. 
*/ 326 - if (cpu->lg->dead == ERR_PTR(-ERESTART)) 327 - return -ERESTART; 328 - 329 - /* The Guest is dead => "No such file or directory" */ 330 - return -ENOENT; 331 - } 332 - 333 - /*H:000 334 - * Welcome to the Host! 335 - * 336 - * By this point your brain has been tickled by the Guest code and numbed by 337 - * the Launcher code; prepare for it to be stretched by the Host code. This is 338 - * the heart. Let's begin at the initialization routine for the Host's lg 339 - * module. 340 - */ 341 - static int __init init(void) 342 - { 343 - int err; 344 - 345 - /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ 346 - if (get_kernel_rpl() != 0) { 347 - printk("lguest is afraid of being a guest\n"); 348 - return -EPERM; 349 - } 350 - 351 - /* First we put the Switcher up in very high virtual memory. */ 352 - err = map_switcher(); 353 - if (err) 354 - goto out; 355 - 356 - /* We might need to reserve an interrupt vector. */ 357 - err = init_interrupts(); 358 - if (err) 359 - goto unmap; 360 - 361 - /* /dev/lguest needs to be registered. */ 362 - err = lguest_device_init(); 363 - if (err) 364 - goto free_interrupts; 365 - 366 - /* Finally we do some architecture-specific setup. */ 367 - lguest_arch_host_init(); 368 - 369 - /* All good! */ 370 - return 0; 371 - 372 - free_interrupts: 373 - free_interrupts(); 374 - unmap: 375 - unmap_switcher(); 376 - out: 377 - return err; 378 - } 379 - 380 - /* Cleaning up is just the same code, backwards. With a little French. */ 381 - static void __exit fini(void) 382 - { 383 - lguest_device_remove(); 384 - free_interrupts(); 385 - unmap_switcher(); 386 - 387 - lguest_arch_host_fini(); 388 - } 389 - /*:*/ 390 - 391 - /* 392 - * The Host side of lguest can be a module. This is a nice way for people to 393 - * play with it. 394 - */ 395 - module_init(init); 396 - module_exit(fini); 397 - MODULE_LICENSE("GPL"); 398 - MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
-304
drivers/lguest/hypercalls.c
··· 1 - /*P:500 2 - * Just as userspace programs request kernel operations through a system 3 - * call, the Guest requests Host operations through a "hypercall". You might 4 - * notice this nomenclature doesn't really follow any logic, but the name has 5 - * been around for long enough that we're stuck with it. As you'd expect, this 6 - * code is basically one big switch statement. 7 - :*/ 8 - 9 - /* Copyright (C) 2006 Rusty Russell IBM Corporation 10 - 11 - This program is free software; you can redistribute it and/or modify 12 - it under the terms of the GNU General Public License as published by 13 - the Free Software Foundation; either version 2 of the License, or 14 - (at your option) any later version. 15 - 16 - This program is distributed in the hope that it will be useful, 17 - but WITHOUT ANY WARRANTY; without even the implied warranty of 18 - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 - GNU General Public License for more details. 20 - 21 - You should have received a copy of the GNU General Public License 22 - along with this program; if not, write to the Free Software 23 - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 24 - */ 25 - #include <linux/uaccess.h> 26 - #include <linux/syscalls.h> 27 - #include <linux/mm.h> 28 - #include <linux/ktime.h> 29 - #include <asm/page.h> 30 - #include <asm/pgtable.h> 31 - #include "lg.h" 32 - 33 - /*H:120 34 - * This is the core hypercall routine: where the Guest gets what it wants. 35 - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. 36 - */ 37 - static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) 38 - { 39 - switch (args->arg0) { 40 - case LHCALL_FLUSH_ASYNC: 41 - /* 42 - * This call does nothing, except by breaking out of the Guest 43 - * it makes us process all the asynchronous hypercalls. 
44 - */ 45 - break; 46 - case LHCALL_SEND_INTERRUPTS: 47 - /* 48 - * This call does nothing too, but by breaking out of the Guest 49 - * it makes us process any pending interrupts. 50 - */ 51 - break; 52 - case LHCALL_LGUEST_INIT: 53 - /* 54 - * You can't get here unless you're already initialized. Don't 55 - * do that. 56 - */ 57 - kill_guest(cpu, "already have lguest_data"); 58 - break; 59 - case LHCALL_SHUTDOWN: { 60 - char msg[128]; 61 - /* 62 - * Shutdown is such a trivial hypercall that we do it in five 63 - * lines right here. 64 - * 65 - * If the lgread fails, it will call kill_guest() itself; the 66 - * kill_guest() with the message will be ignored. 67 - */ 68 - __lgread(cpu, msg, args->arg1, sizeof(msg)); 69 - msg[sizeof(msg)-1] = '\0'; 70 - kill_guest(cpu, "CRASH: %s", msg); 71 - if (args->arg2 == LGUEST_SHUTDOWN_RESTART) 72 - cpu->lg->dead = ERR_PTR(-ERESTART); 73 - break; 74 - } 75 - case LHCALL_FLUSH_TLB: 76 - /* FLUSH_TLB comes in two flavors, depending on the argument: */ 77 - if (args->arg1) 78 - guest_pagetable_clear_all(cpu); 79 - else 80 - guest_pagetable_flush_user(cpu); 81 - break; 82 - 83 - /* 84 - * All these calls simply pass the arguments through to the right 85 - * routines. 
86 - */ 87 - case LHCALL_NEW_PGTABLE: 88 - guest_new_pagetable(cpu, args->arg1); 89 - break; 90 - case LHCALL_SET_STACK: 91 - guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); 92 - break; 93 - case LHCALL_SET_PTE: 94 - #ifdef CONFIG_X86_PAE 95 - guest_set_pte(cpu, args->arg1, args->arg2, 96 - __pte(args->arg3 | (u64)args->arg4 << 32)); 97 - #else 98 - guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); 99 - #endif 100 - break; 101 - case LHCALL_SET_PGD: 102 - guest_set_pgd(cpu->lg, args->arg1, args->arg2); 103 - break; 104 - #ifdef CONFIG_X86_PAE 105 - case LHCALL_SET_PMD: 106 - guest_set_pmd(cpu->lg, args->arg1, args->arg2); 107 - break; 108 - #endif 109 - case LHCALL_SET_CLOCKEVENT: 110 - guest_set_clockevent(cpu, args->arg1); 111 - break; 112 - case LHCALL_HALT: 113 - /* Similarly, this sets the halted flag for run_guest(). */ 114 - cpu->halted = 1; 115 - break; 116 - default: 117 - /* It should be an architecture-specific hypercall. */ 118 - if (lguest_arch_do_hcall(cpu, args)) 119 - kill_guest(cpu, "Bad hypercall %li\n", args->arg0); 120 - } 121 - } 122 - 123 - /*H:124 124 - * Asynchronous hypercalls are easy: we just look in the array in the 125 - * Guest's "struct lguest_data" to see if any new ones are marked "ready". 126 - * 127 - * We are careful to do these in order: obviously we respect the order the 128 - * Guest put them in the ring, but we also promise the Guest that they will 129 - * happen before any normal hypercall (which is why we check this before 130 - * checking for a normal hcall). 131 - */ 132 - static void do_async_hcalls(struct lg_cpu *cpu) 133 - { 134 - unsigned int i; 135 - u8 st[LHCALL_RING_SIZE]; 136 - 137 - /* For simplicity, we copy the entire call status array in at once. */ 138 - if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st))) 139 - return; 140 - 141 - /* We process "struct lguest_data"s hcalls[] ring once. 
*/ 142 - for (i = 0; i < ARRAY_SIZE(st); i++) { 143 - struct hcall_args args; 144 - /* 145 - * We remember where we were up to from last time. This makes 146 - * sure that the hypercalls are done in the order the Guest 147 - * places them in the ring. 148 - */ 149 - unsigned int n = cpu->next_hcall; 150 - 151 - /* 0xFF means there's no call here (yet). */ 152 - if (st[n] == 0xFF) 153 - break; 154 - 155 - /* 156 - * OK, we have hypercall. Increment the "next_hcall" cursor, 157 - * and wrap back to 0 if we reach the end. 158 - */ 159 - if (++cpu->next_hcall == LHCALL_RING_SIZE) 160 - cpu->next_hcall = 0; 161 - 162 - /* 163 - * Copy the hypercall arguments into a local copy of the 164 - * hcall_args struct. 165 - */ 166 - if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], 167 - sizeof(struct hcall_args))) { 168 - kill_guest(cpu, "Fetching async hypercalls"); 169 - break; 170 - } 171 - 172 - /* Do the hypercall, same as a normal one. */ 173 - do_hcall(cpu, &args); 174 - 175 - /* Mark the hypercall done. */ 176 - if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) { 177 - kill_guest(cpu, "Writing result for async hypercall"); 178 - break; 179 - } 180 - 181 - /* 182 - * Stop doing hypercalls if they want to notify the Launcher: 183 - * it needs to service this first. 184 - */ 185 - if (cpu->pending.trap) 186 - break; 187 - } 188 - } 189 - 190 - /* 191 - * Last of all, we look at what happens first of all. The very first time the 192 - * Guest makes a hypercall, we end up here to set things up: 193 - */ 194 - static void initialize(struct lg_cpu *cpu) 195 - { 196 - /* 197 - * You can't do anything until you're initialized. The Guest knows the 198 - * rules, so we're unforgiving here. 
199 - */ 200 - if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { 201 - kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); 202 - return; 203 - } 204 - 205 - if (lguest_arch_init_hypercalls(cpu)) 206 - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 207 - 208 - /* 209 - * The Guest tells us where we're not to deliver interrupts by putting 210 - * the instruction address into "struct lguest_data". 211 - */ 212 - if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret)) 213 - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 214 - 215 - /* 216 - * We write the current time into the Guest's data page once so it can 217 - * set its clock. 218 - */ 219 - write_timestamp(cpu); 220 - 221 - /* page_tables.c will also do some setup. */ 222 - page_table_guest_data_init(cpu); 223 - 224 - /* 225 - * This is the one case where the above accesses might have been the 226 - * first write to a Guest page. This may have caused a copy-on-write 227 - * fault, but the old page might be (read-only) in the Guest 228 - * pagetable. 229 - */ 230 - guest_pagetable_clear_all(cpu); 231 - } 232 - /*:*/ 233 - 234 - /*M:013 235 - * If a Guest reads from a page (so creates a mapping) that it has never 236 - * written to, and then the Launcher writes to it (ie. the output of a virtual 237 - * device), the Guest will still see the old page. In practice, this never 238 - * happens: why would the Guest read a page which it has never written to? But 239 - * a similar scenario might one day bite us, so it's worth mentioning. 240 - * 241 - * Note that if we used a shared anonymous mapping in the Launcher instead of 242 - * mapping /dev/zero private, we wouldn't worry about copy-on-write. And we 243 - * need that to switch the Launcher to processes (away from threads) anyway. 244 - :*/ 245 - 246 - /*H:100 247 - * Hypercalls 248 - * 249 - * Remember from the Guest, hypercalls come in two flavors: normal and 250 - * asynchronous. This file handles both types. 
251 - */ 252 - void do_hypercalls(struct lg_cpu *cpu) 253 - { 254 - /* Not initialized yet? This hypercall must do it. */ 255 - if (unlikely(!cpu->lg->lguest_data)) { 256 - /* Set up the "struct lguest_data" */ 257 - initialize(cpu); 258 - /* Hcall is done. */ 259 - cpu->hcall = NULL; 260 - return; 261 - } 262 - 263 - /* 264 - * The Guest has initialized. 265 - * 266 - * Look in the hypercall ring for the async hypercalls: 267 - */ 268 - do_async_hcalls(cpu); 269 - 270 - /* 271 - * If we stopped reading the hypercall ring because the Guest did a 272 - * NOTIFY to the Launcher, we want to return now. Otherwise we do 273 - * the hypercall. 274 - */ 275 - if (!cpu->pending.trap) { 276 - do_hcall(cpu, cpu->hcall); 277 - /* 278 - * Tricky point: we reset the hcall pointer to mark the 279 - * hypercall as "done". We use the hcall pointer rather than 280 - * the trap number to indicate a hypercall is pending. 281 - * Normally it doesn't matter: the Guest will run again and 282 - * update the trap number before we come back here. 283 - * 284 - * However, if we are signalled or the Guest sends I/O to the 285 - * Launcher, the run_guest() loop will exit without running the 286 - * Guest. When it comes back it would try to re-run the 287 - * hypercall. Finding that bug sucked. 288 - */ 289 - cpu->hcall = NULL; 290 - } 291 - } 292 - 293 - /* 294 - * This routine supplies the Guest with time: it's used for wallclock time at 295 - * initial boot and as a rough time source if the TSC isn't available. 296 - */ 297 - void write_timestamp(struct lg_cpu *cpu) 298 - { 299 - struct timespec now; 300 - ktime_get_real_ts(&now); 301 - if (copy_to_user(&cpu->lg->lguest_data->time, 302 - &now, sizeof(struct timespec))) 303 - kill_guest(cpu, "Writing timestamp"); 304 - }
-706
drivers/lguest/interrupts_and_traps.c
··· 1 - /*P:800 2 - * Interrupts (traps) are complicated enough to earn their own file. 3 - * There are three classes of interrupts: 4 - * 5 - * 1) Real hardware interrupts which occur while we're running the Guest, 6 - * 2) Interrupts for virtual devices attached to the Guest, and 7 - * 3) Traps and faults from the Guest. 8 - * 9 - * Real hardware interrupts must be delivered to the Host, not the Guest. 10 - * Virtual interrupts must be delivered to the Guest, but we make them look 11 - * just like real hardware would deliver them. Traps from the Guest can be set 12 - * up to go directly back into the Guest, but sometimes the Host wants to see 13 - * them first, so we also have a way of "reflecting" them into the Guest as if 14 - * they had been delivered to it directly. 15 - :*/ 16 - #include <linux/uaccess.h> 17 - #include <linux/interrupt.h> 18 - #include <linux/module.h> 19 - #include <linux/sched.h> 20 - #include "lg.h" 21 - 22 - /* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */ 23 - static unsigned int syscall_vector = IA32_SYSCALL_VECTOR; 24 - module_param(syscall_vector, uint, 0444); 25 - 26 - /* The address of the interrupt handler is split into two bits: */ 27 - static unsigned long idt_address(u32 lo, u32 hi) 28 - { 29 - return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); 30 - } 31 - 32 - /* 33 - * The "type" of the interrupt handler is a 4 bit field: we only support a 34 - * couple of types. 35 - */ 36 - static int idt_type(u32 lo, u32 hi) 37 - { 38 - return (hi >> 8) & 0xF; 39 - } 40 - 41 - /* An IDT entry can't be used unless the "present" bit is set. */ 42 - static bool idt_present(u32 lo, u32 hi) 43 - { 44 - return (hi & 0x8000); 45 - } 46 - 47 - /* 48 - * We need a helper to "push" a value onto the Guest's stack, since that's a 49 - * big part of what delivering an interrupt does. 50 - */ 51 - static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) 52 - { 53 - /* Stack grows downwards: move stack then write value. 
*/ 54 - *gstack -= 4; 55 - lgwrite(cpu, *gstack, u32, val); 56 - } 57 - 58 - /*H:210 59 - * The push_guest_interrupt_stack() routine saves Guest state on the stack for 60 - * an interrupt or trap. The mechanics of delivering traps and interrupts to 61 - * the Guest are the same, except some traps have an "error code" which gets 62 - * pushed onto the stack as well: the caller tells us if this is one. 63 - * 64 - * We set up the stack just like the CPU does for a real interrupt, so it's 65 - * identical for the Guest (and the standard "iret" instruction will undo 66 - * it). 67 - */ 68 - static void push_guest_interrupt_stack(struct lg_cpu *cpu, bool has_err) 69 - { 70 - unsigned long gstack, origstack; 71 - u32 eflags, ss, irq_enable; 72 - unsigned long virtstack; 73 - 74 - /* 75 - * There are two cases for interrupts: one where the Guest is already 76 - * in the kernel, and a more complex one where the Guest is in 77 - * userspace. We check the privilege level to find out. 78 - */ 79 - if ((cpu->regs->ss&0x3) != GUEST_PL) { 80 - /* 81 - * The Guest told us their kernel stack with the SET_STACK 82 - * hypercall: both the virtual address and the segment. 83 - */ 84 - virtstack = cpu->esp1; 85 - ss = cpu->ss1; 86 - 87 - origstack = gstack = guest_pa(cpu, virtstack); 88 - /* 89 - * We push the old stack segment and pointer onto the new 90 - * stack: when the Guest does an "iret" back from the interrupt 91 - * handler the CPU will notice they're dropping privilege 92 - * levels and expect these here. 93 - */ 94 - push_guest_stack(cpu, &gstack, cpu->regs->ss); 95 - push_guest_stack(cpu, &gstack, cpu->regs->esp); 96 - } else { 97 - /* We're staying on the same Guest (kernel) stack. */ 98 - virtstack = cpu->regs->esp; 99 - ss = cpu->regs->ss; 100 - 101 - origstack = gstack = guest_pa(cpu, virtstack); 102 - } 103 - 104 - /* 105 - * Remember that we never let the Guest actually disable interrupts, so 106 - * the "Interrupt Flag" bit is always set. 
We copy that bit from the 107 - * Guest's "irq_enabled" field into the eflags word: we saw the Guest 108 - * copy it back in "lguest_iret". 109 - */ 110 - eflags = cpu->regs->eflags; 111 - if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 112 - && !(irq_enable & X86_EFLAGS_IF)) 113 - eflags &= ~X86_EFLAGS_IF; 114 - 115 - /* 116 - * An interrupt is expected to push three things on the stack: the old 117 - * "eflags" word, the old code segment, and the old instruction 118 - * pointer. 119 - */ 120 - push_guest_stack(cpu, &gstack, eflags); 121 - push_guest_stack(cpu, &gstack, cpu->regs->cs); 122 - push_guest_stack(cpu, &gstack, cpu->regs->eip); 123 - 124 - /* For the six traps which supply an error code, we push that, too. */ 125 - if (has_err) 126 - push_guest_stack(cpu, &gstack, cpu->regs->errcode); 127 - 128 - /* Adjust the stack pointer and stack segment. */ 129 - cpu->regs->ss = ss; 130 - cpu->regs->esp = virtstack + (gstack - origstack); 131 - } 132 - 133 - /* 134 - * This actually makes the Guest start executing the given interrupt/trap 135 - * handler. 136 - * 137 - * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this 138 - * interrupt or trap. It's split into two parts for traditional reasons: gcc 139 - * on i386 used to be frightened by 64 bit numbers. 140 - */ 141 - static void guest_run_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi) 142 - { 143 - /* If we're already in the kernel, we don't change stacks. */ 144 - if ((cpu->regs->ss&0x3) != GUEST_PL) 145 - cpu->regs->ss = cpu->esp1; 146 - 147 - /* 148 - * Set the code segment and the address to execute. 149 - */ 150 - cpu->regs->cs = (__KERNEL_CS|GUEST_PL); 151 - cpu->regs->eip = idt_address(lo, hi); 152 - 153 - /* 154 - * Trapping always clears these flags: 155 - * TF: Trap flag 156 - * VM: Virtual 8086 mode 157 - * RF: Resume 158 - * NT: Nested task. 
159 - */ 160 - cpu->regs->eflags &= 161 - ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT); 162 - 163 - /* 164 - * There are two kinds of interrupt handlers: 0xE is an "interrupt 165 - * gate" which expects interrupts to be disabled on entry. 166 - */ 167 - if (idt_type(lo, hi) == 0xE) 168 - if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) 169 - kill_guest(cpu, "Disabling interrupts"); 170 - } 171 - 172 - /* This restores the eflags word which was pushed on the stack by a trap */ 173 - static void restore_eflags(struct lg_cpu *cpu) 174 - { 175 - /* This is the physical address of the stack. */ 176 - unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp); 177 - 178 - /* 179 - * Stack looks like this: 180 - * Address Contents 181 - * esp EIP 182 - * esp + 4 CS 183 - * esp + 8 EFLAGS 184 - */ 185 - cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32); 186 - cpu->regs->eflags &= 187 - ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT); 188 - } 189 - 190 - /*H:205 191 - * Virtual Interrupts. 192 - * 193 - * interrupt_pending() returns the first pending interrupt which isn't blocked 194 - * by the Guest. It is called before every entry to the Guest, and just before 195 - * we go to sleep when the Guest has halted itself. 196 - */ 197 - unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) 198 - { 199 - unsigned int irq; 200 - DECLARE_BITMAP(blk, LGUEST_IRQS); 201 - 202 - /* If the Guest hasn't even initialized yet, we can do nothing. */ 203 - if (!cpu->lg->lguest_data) 204 - return LGUEST_IRQS; 205 - 206 - /* 207 - * Take our "irqs_pending" array and remove any interrupts the Guest 208 - * wants blocked: the result ends up in "blk". 209 - */ 210 - if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, 211 - sizeof(blk))) 212 - return LGUEST_IRQS; 213 - bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); 214 - 215 - /* Find the first interrupt. 
*/ 216 - irq = find_first_bit(blk, LGUEST_IRQS); 217 - *more = find_next_bit(blk, LGUEST_IRQS, irq+1); 218 - 219 - return irq; 220 - } 221 - 222 - /* 223 - * This actually diverts the Guest to running an interrupt handler, once an 224 - * interrupt has been identified by interrupt_pending(). 225 - */ 226 - void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) 227 - { 228 - struct desc_struct *idt; 229 - 230 - BUG_ON(irq >= LGUEST_IRQS); 231 - 232 - /* If they're halted, interrupts restart them. */ 233 - if (cpu->halted) { 234 - /* Re-enable interrupts. */ 235 - if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled)) 236 - kill_guest(cpu, "Re-enabling interrupts"); 237 - cpu->halted = 0; 238 - } else { 239 - /* Otherwise we check if they have interrupts disabled. */ 240 - u32 irq_enabled; 241 - if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) 242 - irq_enabled = 0; 243 - if (!irq_enabled) { 244 - /* Make sure they know an IRQ is pending. */ 245 - put_user(X86_EFLAGS_IF, 246 - &cpu->lg->lguest_data->irq_pending); 247 - return; 248 - } 249 - } 250 - 251 - /* 252 - * Look at the IDT entry the Guest gave us for this interrupt. The 253 - * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip 254 - * over them. 255 - */ 256 - idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; 257 - /* If they don't have a handler (yet?), we just ignore it */ 258 - if (idt_present(idt->a, idt->b)) { 259 - /* OK, mark it no longer pending and deliver it. */ 260 - clear_bit(irq, cpu->irqs_pending); 261 - 262 - /* 263 - * They may be about to iret, where they asked us never to 264 - * deliver interrupts. In this case, we can emulate that iret 265 - * then immediately deliver the interrupt. This is basically 266 - * a noop: the iret would pop the interrupt frame and restore 267 - * eflags, and then we'd set it up again. So just restore the 268 - * eflags word and jump straight to the handler in this case. 
269 - * 270 - * Denys Vlasenko points out that this isn't quite right: if 271 - * the iret was returning to userspace, then that interrupt 272 - * would reset the stack pointer (which the Guest told us 273 - * about via LHCALL_SET_STACK). But unless the Guest is being 274 - * *really* weird, that will be the same as the current stack 275 - * anyway. 276 - */ 277 - if (cpu->regs->eip == cpu->lg->noirq_iret) { 278 - restore_eflags(cpu); 279 - } else { 280 - /* 281 - * push_guest_interrupt_stack() takes a flag to say whether 282 - * this interrupt pushes an error code onto the stack 283 - * as well: virtual interrupts never do. 284 - */ 285 - push_guest_interrupt_stack(cpu, false); 286 - } 287 - /* Actually make Guest cpu jump to handler. */ 288 - guest_run_interrupt(cpu, idt->a, idt->b); 289 - } 290 - 291 - /* 292 - * Every time we deliver an interrupt, we update the timestamp in the 293 - * Guest's lguest_data struct. It would be better for the Guest if we 294 - * did this more often, but it can actually be quite slow: doing it 295 - * here is a compromise which means at least it gets updated every 296 - * timer interrupt. 297 - */ 298 - write_timestamp(cpu); 299 - 300 - /* 301 - * If there are no other interrupts we want to deliver, clear 302 - * the pending flag. 303 - */ 304 - if (!more) 305 - put_user(0, &cpu->lg->lguest_data->irq_pending); 306 - } 307 - 308 - /* And this is the routine when we want to set an interrupt for the Guest. */ 309 - void set_interrupt(struct lg_cpu *cpu, unsigned int irq) 310 - { 311 - /* 312 - * Next time the Guest runs, the core code will see if it can deliver 313 - * this interrupt. 314 - */ 315 - set_bit(irq, cpu->irqs_pending); 316 - 317 - /* 318 - * Make sure it sees it; it might be asleep (eg. halted), or running 319 - * the Guest right now, in which case kick_process() will knock it out. 
320 - */ 321 - if (!wake_up_process(cpu->tsk)) 322 - kick_process(cpu->tsk); 323 - } 324 - /*:*/ 325 - 326 - /* 327 - * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent 328 - * me a patch, so we support that too. It'd be a big step for lguest if half 329 - * the Plan 9 user base were to start using it. 330 - * 331 - * Actually now I think of it, it's possible that Ron *is* half the Plan 9 332 - * userbase. Oh well. 333 - */ 334 - bool could_be_syscall(unsigned int num) 335 - { 336 - /* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */ 337 - return num == IA32_SYSCALL_VECTOR || num == syscall_vector; 338 - } 339 - 340 - /* The syscall vector it wants must be unused by Host. */ 341 - bool check_syscall_vector(struct lguest *lg) 342 - { 343 - u32 vector; 344 - 345 - if (get_user(vector, &lg->lguest_data->syscall_vec)) 346 - return false; 347 - 348 - return could_be_syscall(vector); 349 - } 350 - 351 - int init_interrupts(void) 352 - { 353 - /* If they want some strange system call vector, reserve it now */ 354 - if (syscall_vector != IA32_SYSCALL_VECTOR) { 355 - if (test_bit(syscall_vector, used_vectors) || 356 - vector_used_by_percpu_irq(syscall_vector)) { 357 - printk(KERN_ERR "lg: couldn't reserve syscall %u\n", 358 - syscall_vector); 359 - return -EBUSY; 360 - } 361 - set_bit(syscall_vector, used_vectors); 362 - } 363 - 364 - return 0; 365 - } 366 - 367 - void free_interrupts(void) 368 - { 369 - if (syscall_vector != IA32_SYSCALL_VECTOR) 370 - clear_bit(syscall_vector, used_vectors); 371 - } 372 - 373 - /*H:220 374 - * Now we've got the routines to deliver interrupts, delivering traps like 375 - * page fault is easy. The only trick is that Intel decided that some traps 376 - * should have error codes: 377 - */ 378 - static bool has_err(unsigned int trap) 379 - { 380 - return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); 381 - } 382 - 383 - /* deliver_trap() returns true if it could deliver the trap. 
*/ 384 - bool deliver_trap(struct lg_cpu *cpu, unsigned int num) 385 - { 386 - /* 387 - * Trap numbers are always 8 bit, but we set an impossible trap number 388 - * for traps inside the Switcher, so check that here. 389 - */ 390 - if (num >= ARRAY_SIZE(cpu->arch.idt)) 391 - return false; 392 - 393 - /* 394 - * Early on the Guest hasn't set the IDT entries (or maybe it put a 395 - * bogus one in): if we fail here, the Guest will be killed. 396 - */ 397 - if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) 398 - return false; 399 - push_guest_interrupt_stack(cpu, has_err(num)); 400 - guest_run_interrupt(cpu, cpu->arch.idt[num].a, 401 - cpu->arch.idt[num].b); 402 - return true; 403 - } 404 - 405 - /*H:250 406 - * Here's the hard part: returning to the Host every time a trap happens 407 - * and then calling deliver_trap() and re-entering the Guest is slow. 408 - * Particularly because Guest userspace system calls are traps (usually trap 409 - * 128). 410 - * 411 - * So we'd like to set up the IDT to tell the CPU to deliver traps directly 412 - * into the Guest. This is possible, but the complexities cause the size of 413 - * this file to double! However, 150 lines of code is worth writing for taking 414 - * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all 415 - * the other hypervisors would beat it up at lunchtime. 416 - * 417 - * This routine indicates if a particular trap number could be delivered 418 - * directly. 419 - * 420 - * Unfortunately, Linux 4.6 started using an interrupt gate instead of a 421 - * trap gate for syscalls, so this trick is ineffective. See Mastery for 422 - * how we could do this anyway... 423 - */ 424 - static bool direct_trap(unsigned int num) 425 - { 426 - /* 427 - * Hardware interrupts don't go to the Guest at all (except system 428 - * call). 
429 - */ 430 - if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) 431 - return false; 432 - 433 - /* 434 - * The Host needs to see page faults (for shadow paging and to save the 435 - * fault address), general protection faults (in/out emulation) and 436 - * device not available (TS handling) and of course, the hypercall trap. 437 - */ 438 - return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY; 439 - } 440 - /*:*/ 441 - 442 - /*M:005 443 - * The Guest has the ability to turn its interrupt gates into trap gates, 444 - * if it is careful. The Host will let trap gates go directly to the 445 - * Guest, but the Guest needs the interrupts atomically disabled for an 446 - * interrupt gate. The Host could provide a mechanism to register more 447 - * "no-interrupt" regions, and the Guest could point the trap gate at 448 - * instructions within that region, where it can safely disable interrupts. 449 - */ 450 - 451 - /*M:006 452 - * The Guests do not use the sysenter (fast system call) instruction, 453 - * because it's hardcoded to enter privilege level 0 and so can't go direct. 454 - * It's about twice as fast as the older "int 0x80" system call, so it might 455 - * still be worthwhile to handle it in the Switcher and lcall down to the 456 - * Guest. The sysenter semantics are hairy tho: search for that keyword in 457 - * entry.S 458 - :*/ 459 - 460 - /*H:260 461 - * When we make traps go directly into the Guest, we need to make sure 462 - * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the 463 - * CPU trying to deliver the trap will fault while trying to push the interrupt 464 - * words on the stack: this is called a double fault, and it forces us to kill 465 - * the Guest. 466 - * 467 - * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. 
468 - */ 469 - void pin_stack_pages(struct lg_cpu *cpu) 470 - { 471 - unsigned int i; 472 - 473 - /* 474 - * Depending on the CONFIG_4KSTACKS option, the Guest can have one or 475 - * two pages of stack space. 476 - */ 477 - for (i = 0; i < cpu->lg->stack_pages; i++) 478 - /* 479 - * The stack grows *downwards*, so the address we're given is the 480 - * start of the page after the kernel stack. Subtract one to 481 - * get back onto the first stack page, and keep subtracting to 482 - * get to the rest of the stack pages. 483 - */ 484 - pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); 485 - } 486 - 487 - /* 488 - * Direct traps also mean that we need to know whenever the Guest wants to use 489 - * a different kernel stack, so we can change the guest TSS to use that 490 - * stack. The TSS entries expect a virtual address, so unlike most addresses 491 - * the Guest gives us, the "esp" (stack pointer) value here is virtual, not 492 - * physical. 493 - * 494 - * In Linux each process has its own kernel stack, so this happens a lot: we 495 - * change stacks on each context switch. 496 - */ 497 - void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) 498 - { 499 - /* 500 - * You're not allowed a stack segment with privilege level 0: bad Guest! 501 - */ 502 - if ((seg & 0x3) != GUEST_PL) 503 - kill_guest(cpu, "bad stack segment %i", seg); 504 - /* We only expect one or two stack pages. */ 505 - if (pages > 2) 506 - kill_guest(cpu, "bad stack pages %u", pages); 507 - /* Save where the stack is, and how many pages */ 508 - cpu->ss1 = seg; 509 - cpu->esp1 = esp; 510 - cpu->lg->stack_pages = pages; 511 - /* Make sure the new stack pages are mapped */ 512 - pin_stack_pages(cpu); 513 - } 514 - 515 - /* 516 - * All this reference to mapping stacks leads us neatly into the other complex 517 - * part of the Host: page table handling. 
518 - */ 519 - 520 - /*H:235 521 - * This is the routine which actually checks the Guest's IDT entry and 522 - * transfers it into the entry in "struct lguest": 523 - */ 524 - static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, 525 - unsigned int num, u32 lo, u32 hi) 526 - { 527 - u8 type = idt_type(lo, hi); 528 - 529 - /* We zero-out a not-present entry */ 530 - if (!idt_present(lo, hi)) { 531 - trap->a = trap->b = 0; 532 - return; 533 - } 534 - 535 - /* We only support interrupt and trap gates. */ 536 - if (type != 0xE && type != 0xF) 537 - kill_guest(cpu, "bad IDT type %i", type); 538 - 539 - /* 540 - * We only copy the handler address, present bit, privilege level and 541 - * type. The privilege level controls where the trap can be triggered 542 - * manually with an "int" instruction. This is usually GUEST_PL, 543 - * except for system calls which userspace can use. 544 - */ 545 - trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); 546 - trap->b = (hi&0xFFFFEF00); 547 - } 548 - 549 - /*H:230 550 - * While we're here, dealing with delivering traps and interrupts to the 551 - * Guest, we might as well complete the picture: how the Guest tells us where 552 - * it wants them to go. This would be simple, except making traps fast 553 - * requires some tricks. 554 - * 555 - * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the 556 - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. 557 - */ 558 - void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) 559 - { 560 - /* 561 - * Guest never handles: NMI, doublefault, spurious interrupt or 562 - * hypercall. We ignore when it tries to set them. 563 - */ 564 - if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) 565 - return; 566 - 567 - /* 568 - * Mark the IDT as changed: next time the Guest runs we'll know we have 569 - * to copy this again. 
570 - */ 571 - cpu->changed |= CHANGED_IDT; 572 - 573 - /* Check that the Guest doesn't try to step outside the bounds. */ 574 - if (num >= ARRAY_SIZE(cpu->arch.idt)) 575 - kill_guest(cpu, "Setting idt entry %u", num); 576 - else 577 - set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); 578 - } 579 - 580 - /* 581 - * The default entry for each interrupt points into the Switcher routines which 582 - * simply return to the Host. The run_guest() loop will then call 583 - * deliver_trap() to bounce it back into the Guest. 584 - */ 585 - static void default_idt_entry(struct desc_struct *idt, 586 - int trap, 587 - const unsigned long handler, 588 - const struct desc_struct *base) 589 - { 590 - /* A present interrupt gate. */ 591 - u32 flags = 0x8e00; 592 - 593 - /* 594 - * Set the privilege level on the entry for the hypercall: this allows 595 - * the Guest to use the "int" instruction to trigger it. 596 - */ 597 - if (trap == LGUEST_TRAP_ENTRY) 598 - flags |= (GUEST_PL << 13); 599 - else if (base) 600 - /* 601 - * Copy privilege level from what Guest asked for. This allows 602 - * debug (int 3) traps from Guest userspace, for example. 603 - */ 604 - flags |= (base->b & 0x6000); 605 - 606 - /* Now pack it into the IDT entry in its weird format. */ 607 - idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); 608 - idt->b = (handler&0xFFFF0000) | flags; 609 - } 610 - 611 - /* When the Guest first starts, we put default entries into the IDT. */ 612 - void setup_default_idt_entries(struct lguest_ro_state *state, 613 - const unsigned long *def) 614 - { 615 - unsigned int i; 616 - 617 - for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++) 618 - default_idt_entry(&state->guest_idt[i], i, def[i], NULL); 619 - } 620 - 621 - /*H:240 622 - * We don't use the IDT entries in the "struct lguest" directly, instead 623 - * we copy them into the IDT which we've set up for Guests on this CPU, just 624 - * before we run the Guest. This routine does that copy. 
625 - */ 626 - void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, 627 - const unsigned long *def) 628 - { 629 - unsigned int i; 630 - 631 - /* 632 - * We can simply copy the direct traps, otherwise we use the default 633 - * ones in the Switcher: they will return to the Host. 634 - */ 635 - for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { 636 - const struct desc_struct *gidt = &cpu->arch.idt[i]; 637 - 638 - /* If no Guest can ever override this trap, leave it alone. */ 639 - if (!direct_trap(i)) 640 - continue; 641 - 642 - /* 643 - * Only trap gates (type 15) can go direct to the Guest. 644 - * Interrupt gates (type 14) disable interrupts as they are 645 - * entered, which we never let the Guest do. Not present 646 - * entries (type 0x0) also can't go direct, of course. 647 - * 648 - * If it can't go direct, we still need to copy the priv. level: 649 - * they might want to give userspace access to a software 650 - * interrupt. 651 - */ 652 - if (idt_type(gidt->a, gidt->b) == 0xF) 653 - idt[i] = *gidt; 654 - else 655 - default_idt_entry(&idt[i], i, def[i], gidt); 656 - } 657 - } 658 - 659 - /*H:200 660 - * The Guest Clock. 661 - * 662 - * There are two sources of virtual interrupts. We saw one in lguest_user.c: 663 - * the Launcher sending interrupts for virtual devices. The other is the Guest 664 - * timer interrupt. 665 - * 666 - * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to 667 - * the next timer interrupt (in nanoseconds). We use the high-resolution timer 668 - * infrastructure to set a callback at that time. 669 - * 670 - * 0 means "turn off the clock". 671 - */ 672 - void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) 673 - { 674 - ktime_t expires; 675 - 676 - if (unlikely(delta == 0)) { 677 - /* Clock event device is shutting down. 
*/ 678 - hrtimer_cancel(&cpu->hrt); 679 - return; 680 - } 681 - 682 - /* 683 - * We use wallclock time here, so the Guest might not be running for 684 - * all the time between now and the timer interrupt it asked for. This 685 - * is almost always the right thing to do. 686 - */ 687 - expires = ktime_add_ns(ktime_get_real(), delta); 688 - hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); 689 - } 690 - 691 - /* This is the function called when the Guest's timer expires. */ 692 - static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) 693 - { 694 - struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); 695 - 696 - /* Remember the first interrupt is the timer interrupt. */ 697 - set_interrupt(cpu, 0); 698 - return HRTIMER_NORESTART; 699 - } 700 - 701 - /* This sets up the timer for this Guest. */ 702 - void init_clockdev(struct lg_cpu *cpu) 703 - { 704 - hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); 705 - cpu->hrt.function = clockdev_fn; 706 - }
-258
drivers/lguest/lg.h
··· 1 - #ifndef _LGUEST_H 2 - #define _LGUEST_H 3 - 4 - #ifndef __ASSEMBLY__ 5 - #include <linux/types.h> 6 - #include <linux/init.h> 7 - #include <linux/stringify.h> 8 - #include <linux/lguest.h> 9 - #include <linux/lguest_launcher.h> 10 - #include <linux/wait.h> 11 - #include <linux/hrtimer.h> 12 - #include <linux/err.h> 13 - #include <linux/slab.h> 14 - 15 - #include <asm/lguest.h> 16 - 17 - struct pgdir { 18 - unsigned long gpgdir; 19 - bool switcher_mapped; 20 - int last_host_cpu; 21 - pgd_t *pgdir; 22 - }; 23 - 24 - /* We have two pages shared with guests, per cpu. */ 25 - struct lguest_pages { 26 - /* This is the stack page mapped rw in guest */ 27 - char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; 28 - struct lguest_regs regs; 29 - 30 - /* This is the host state & guest descriptor page, ro in guest */ 31 - struct lguest_ro_state state; 32 - } __attribute__((aligned(PAGE_SIZE))); 33 - 34 - #define CHANGED_IDT 1 35 - #define CHANGED_GDT 2 36 - #define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ 37 - #define CHANGED_ALL 3 38 - 39 - struct lg_cpu { 40 - unsigned int id; 41 - struct lguest *lg; 42 - struct task_struct *tsk; 43 - struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ 44 - 45 - u32 cr2; 46 - u32 esp1; 47 - u16 ss1; 48 - 49 - /* Bitmap of what has changed: see CHANGED_* above. */ 50 - int changed; 51 - 52 - /* Pending operation. */ 53 - struct lguest_pending pending; 54 - 55 - unsigned long *reg_read; /* register from LHREQ_GETREG */ 56 - 57 - /* At end of a page shared mapped over lguest_pages in guest. */ 58 - unsigned long regs_page; 59 - struct lguest_regs *regs; 60 - 61 - struct lguest_pages *last_pages; 62 - 63 - /* Initialization mode: linear map everything. */ 64 - bool linear_pages; 65 - int cpu_pgd; /* Which pgd this cpu is currently using */ 66 - 67 - /* If a hypercall was asked for, this points to the arguments. 
*/ 68 - struct hcall_args *hcall; 69 - u32 next_hcall; 70 - 71 - /* Virtual clock device */ 72 - struct hrtimer hrt; 73 - 74 - /* Did the Guest tell us to halt? */ 75 - int halted; 76 - 77 - /* Pending virtual interrupts */ 78 - DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); 79 - 80 - struct lg_cpu_arch arch; 81 - }; 82 - 83 - /* The private info the thread maintains about the guest. */ 84 - struct lguest { 85 - struct lguest_data __user *lguest_data; 86 - struct lg_cpu cpus[NR_CPUS]; 87 - unsigned int nr_cpus; 88 - 89 - /* Valid guest memory pages must be < this. */ 90 - u32 pfn_limit; 91 - 92 - /* Device memory is >= pfn_limit and < device_limit. */ 93 - u32 device_limit; 94 - 95 - /* 96 - * This provides the offset to the base of guest-physical memory in the 97 - * Launcher. 98 - */ 99 - void __user *mem_base; 100 - unsigned long kernel_address; 101 - 102 - struct pgdir pgdirs[4]; 103 - 104 - unsigned long noirq_iret; 105 - 106 - unsigned int stack_pages; 107 - u32 tsc_khz; 108 - 109 - /* Dead? */ 110 - const char *dead; 111 - }; 112 - 113 - extern struct mutex lguest_lock; 114 - 115 - /* core.c: */ 116 - bool lguest_address_ok(const struct lguest *lg, 117 - unsigned long addr, unsigned long len); 118 - void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); 119 - void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); 120 - extern struct page **lg_switcher_pages; 121 - 122 - /*H:035 123 - * Using memory-copy operations like that is usually inconvient, so we 124 - * have the following helper macros which read and write a specific type (often 125 - * an unsigned long). 126 - * 127 - * This reads into a variable of the given type then returns that. 128 - */ 129 - #define lgread(cpu, addr, type) \ 130 - ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) 131 - 132 - /* This checks that the variable is of the given type, then writes it out. 
*/ 133 - #define lgwrite(cpu, addr, type, val) \ 134 - do { \ 135 - typecheck(type, val); \ 136 - __lgwrite((cpu), (addr), &(val), sizeof(val)); \ 137 - } while(0) 138 - /* (end of memory access helper routines) :*/ 139 - 140 - int run_guest(struct lg_cpu *cpu, unsigned long __user *user); 141 - 142 - /* 143 - * Helper macros to obtain the first 12 or the last 20 bits, this is only the 144 - * first step in the migration to the kernel types. pte_pfn is already defined 145 - * in the kernel. 146 - */ 147 - #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) 148 - #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) 149 - #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) 150 - #define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT) 151 - 152 - /* interrupts_and_traps.c: */ 153 - unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); 154 - void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); 155 - void set_interrupt(struct lg_cpu *cpu, unsigned int irq); 156 - bool deliver_trap(struct lg_cpu *cpu, unsigned int num); 157 - void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, 158 - u32 low, u32 hi); 159 - void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages); 160 - void pin_stack_pages(struct lg_cpu *cpu); 161 - void setup_default_idt_entries(struct lguest_ro_state *state, 162 - const unsigned long *def); 163 - void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, 164 - const unsigned long *def); 165 - void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); 166 - bool send_notify_to_eventfd(struct lg_cpu *cpu); 167 - void init_clockdev(struct lg_cpu *cpu); 168 - bool check_syscall_vector(struct lguest *lg); 169 - bool could_be_syscall(unsigned int num); 170 - int init_interrupts(void); 171 - void free_interrupts(void); 172 - 173 - /* segments.c: */ 174 - void setup_default_gdt_entries(struct lguest_ro_state *state); 175 - void setup_guest_gdt(struct lg_cpu *cpu); 176 - void load_guest_gdt_entry(struct 
lg_cpu *cpu, unsigned int i, 177 - u32 low, u32 hi); 178 - void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array); 179 - void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt); 180 - void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); 181 - 182 - /* page_tables.c: */ 183 - int init_guest_pagetable(struct lguest *lg); 184 - void free_guest_pagetable(struct lguest *lg); 185 - void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); 186 - void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); 187 - #ifdef CONFIG_X86_PAE 188 - void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); 189 - #endif 190 - void guest_pagetable_clear_all(struct lg_cpu *cpu); 191 - void guest_pagetable_flush_user(struct lg_cpu *cpu); 192 - void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, 193 - unsigned long vaddr, pte_t val); 194 - void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); 195 - bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode, 196 - unsigned long *iomem); 197 - void pin_page(struct lg_cpu *cpu, unsigned long vaddr); 198 - bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr); 199 - unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); 200 - void page_table_guest_data_init(struct lg_cpu *cpu); 201 - 202 - /* <arch>/core.c: */ 203 - void lguest_arch_host_init(void); 204 - void lguest_arch_host_fini(void); 205 - void lguest_arch_run_guest(struct lg_cpu *cpu); 206 - void lguest_arch_handle_trap(struct lg_cpu *cpu); 207 - int lguest_arch_init_hypercalls(struct lg_cpu *cpu); 208 - int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); 209 - void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); 210 - unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any); 211 - 212 - /* <arch>/switcher.S: */ 213 - extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; 
214 - 215 - /* lguest_user.c: */ 216 - int lguest_device_init(void); 217 - void lguest_device_remove(void); 218 - 219 - /* hypercalls.c: */ 220 - void do_hypercalls(struct lg_cpu *cpu); 221 - void write_timestamp(struct lg_cpu *cpu); 222 - 223 - /*L:035 224 - * Let's step aside for the moment, to study one important routine that's used 225 - * widely in the Host code. 226 - * 227 - * There are many cases where the Guest can do something invalid, like pass crap 228 - * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite 229 - * acceptable to simply terminate the Guest and give the Launcher a nicely 230 - * formatted reason. It's also simpler for the Guest itself, which doesn't 231 - * need to check most hypercalls for "success"; if you're still running, it 232 - * succeeded. 233 - * 234 - * Once this is called, the Guest will never run again, so most Host code can 235 - * call this then continue as if nothing had happened. This means many 236 - * functions don't have to explicitly return an error code, which keeps the 237 - * code simple. 238 - * 239 - * It also means that this can be called more than once: only the first one is 240 - * remembered. The only trick is that we still need to kill the Guest even if 241 - * we can't allocate memory to store the reason. Linux has a neat way of 242 - * packing error codes into invalid pointers, so we use that here. 243 - * 244 - * Like any macro which uses an "if", it is safely wrapped in a run-once "do { 245 - * } while(0)". 246 - */ 247 - #define kill_guest(cpu, fmt...) \ 248 - do { \ 249 - if (!(cpu)->lg->dead) { \ 250 - (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \ 251 - if (!(cpu)->lg->dead) \ 252 - (cpu)->lg->dead = ERR_PTR(-ENOMEM); \ 253 - } \ 254 - } while(0) 255 - /* (End of aside) :*/ 256 - 257 - #endif /* __ASSEMBLY__ */ 258 - #endif /* _LGUEST_H */
-446
drivers/lguest/lguest_user.c
··· 1 - /*P:200 This contains all the /dev/lguest code, whereby the userspace 2 - * launcher controls and communicates with the Guest. For example, 3 - * the first write will tell us the Guest's memory layout and entry 4 - * point. A read will run the Guest until something happens, such as 5 - * a signal or the Guest accessing a device. 6 - :*/ 7 - #include <linux/uaccess.h> 8 - #include <linux/miscdevice.h> 9 - #include <linux/fs.h> 10 - #include <linux/sched.h> 11 - #include <linux/sched/mm.h> 12 - #include <linux/file.h> 13 - #include <linux/slab.h> 14 - #include <linux/export.h> 15 - #include "lg.h" 16 - 17 - /*L:052 18 - The Launcher can get the registers, and also set some of them. 19 - */ 20 - static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input) 21 - { 22 - unsigned long which; 23 - 24 - /* We re-use the ptrace structure to specify which register to read. */ 25 - if (get_user(which, input) != 0) 26 - return -EFAULT; 27 - 28 - /* 29 - * We set up the cpu register pointer, and their next read will 30 - * actually get the value (instead of running the guest). 31 - * 32 - * The last argument 'true' says we can access any register. 33 - */ 34 - cpu->reg_read = lguest_arch_regptr(cpu, which, true); 35 - if (!cpu->reg_read) 36 - return -ENOENT; 37 - 38 - /* And because this is a write() call, we return the length used. */ 39 - return sizeof(unsigned long) * 2; 40 - } 41 - 42 - static int setreg(struct lg_cpu *cpu, const unsigned long __user *input) 43 - { 44 - unsigned long which, value, *reg; 45 - 46 - /* We re-use the ptrace structure to specify which register to read. */ 47 - if (get_user(which, input) != 0) 48 - return -EFAULT; 49 - input++; 50 - if (get_user(value, input) != 0) 51 - return -EFAULT; 52 - 53 - /* The last argument 'false' means we can't access all registers. 
*/ 54 - reg = lguest_arch_regptr(cpu, which, false); 55 - if (!reg) 56 - return -ENOENT; 57 - 58 - *reg = value; 59 - 60 - /* And because this is a write() call, we return the length used. */ 61 - return sizeof(unsigned long) * 3; 62 - } 63 - 64 - /*L:050 65 - * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 66 - * number to /dev/lguest. 67 - */ 68 - static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) 69 - { 70 - unsigned long irq; 71 - 72 - if (get_user(irq, input) != 0) 73 - return -EFAULT; 74 - if (irq >= LGUEST_IRQS) 75 - return -EINVAL; 76 - 77 - /* 78 - * Next time the Guest runs, the core code will see if it can deliver 79 - * this interrupt. 80 - */ 81 - set_interrupt(cpu, irq); 82 - return 0; 83 - } 84 - 85 - /*L:053 86 - * Deliver a trap: this is used by the Launcher if it can't emulate 87 - * an instruction. 88 - */ 89 - static int trap(struct lg_cpu *cpu, const unsigned long __user *input) 90 - { 91 - unsigned long trapnum; 92 - 93 - if (get_user(trapnum, input) != 0) 94 - return -EFAULT; 95 - 96 - if (!deliver_trap(cpu, trapnum)) 97 - return -EINVAL; 98 - 99 - return 0; 100 - } 101 - 102 - /*L:040 103 - * Once our Guest is initialized, the Launcher makes it run by reading 104 - * from /dev/lguest. 105 - */ 106 - static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) 107 - { 108 - struct lguest *lg = file->private_data; 109 - struct lg_cpu *cpu; 110 - unsigned int cpu_id = *o; 111 - 112 - /* You must write LHREQ_INITIALIZE first! */ 113 - if (!lg) 114 - return -EINVAL; 115 - 116 - /* Watch out for arbitrary vcpu indexes! */ 117 - if (cpu_id >= lg->nr_cpus) 118 - return -EINVAL; 119 - 120 - cpu = &lg->cpus[cpu_id]; 121 - 122 - /* If you're not the task which owns the Guest, go away. 
*/ 123 - if (current != cpu->tsk) 124 - return -EPERM; 125 - 126 - /* If the Guest is already dead, we indicate why */ 127 - if (lg->dead) { 128 - size_t len; 129 - 130 - /* lg->dead either contains an error code, or a string. */ 131 - if (IS_ERR(lg->dead)) 132 - return PTR_ERR(lg->dead); 133 - 134 - /* We can only return as much as the buffer they read with. */ 135 - len = min(size, strlen(lg->dead)+1); 136 - if (copy_to_user(user, lg->dead, len) != 0) 137 - return -EFAULT; 138 - return len; 139 - } 140 - 141 - /* 142 - * If we returned from read() last time because the Guest sent I/O, 143 - * clear the flag. 144 - */ 145 - if (cpu->pending.trap) 146 - cpu->pending.trap = 0; 147 - 148 - /* Run the Guest until something interesting happens. */ 149 - return run_guest(cpu, (unsigned long __user *)user); 150 - } 151 - 152 - /*L:025 153 - * This actually initializes a CPU. For the moment, a Guest is only 154 - * uniprocessor, so "id" is always 0. 155 - */ 156 - static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) 157 - { 158 - /* We have a limited number of CPUs in the lguest struct. */ 159 - if (id >= ARRAY_SIZE(cpu->lg->cpus)) 160 - return -EINVAL; 161 - 162 - /* Set up this CPU's id, and pointer back to the lguest struct. */ 163 - cpu->id = id; 164 - cpu->lg = container_of(cpu, struct lguest, cpus[id]); 165 - cpu->lg->nr_cpus++; 166 - 167 - /* Each CPU has a timer it can set. */ 168 - init_clockdev(cpu); 169 - 170 - /* 171 - * We need a complete page for the Guest registers: they are accessible 172 - * to the Guest and we can only grant it access to whole pages. 173 - */ 174 - cpu->regs_page = get_zeroed_page(GFP_KERNEL); 175 - if (!cpu->regs_page) 176 - return -ENOMEM; 177 - 178 - /* We actually put the registers at the end of the page. */ 179 - cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); 180 - 181 - /* 182 - * Now we initialize the Guest's registers, handing it the start 183 - * address. 
184 - */ 185 - lguest_arch_setup_regs(cpu, start_ip); 186 - 187 - /* 188 - * We keep a pointer to the Launcher task (ie. current task) for when 189 - * other Guests want to wake this one (eg. console input). 190 - */ 191 - cpu->tsk = current; 192 - 193 - /* 194 - * We need to keep a pointer to the Launcher's memory map, because if 195 - * the Launcher dies we need to clean it up. If we don't keep a 196 - * reference, it is destroyed before close() is called. 197 - */ 198 - cpu->mm = get_task_mm(cpu->tsk); 199 - 200 - /* 201 - * We remember which CPU's pages this Guest used last, for optimization 202 - * when the same Guest runs on the same CPU twice. 203 - */ 204 - cpu->last_pages = NULL; 205 - 206 - /* No error == success. */ 207 - return 0; 208 - } 209 - 210 - /*L:020 211 - * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in 212 - * addition to the LHREQ_INITIALIZE value). These are: 213 - * 214 - * base: The start of the Guest-physical memory inside the Launcher memory. 215 - * 216 - * pfnlimit: The highest (Guest-physical) page number the Guest should be 217 - * allowed to access. The Guest memory lives inside the Launcher, so it sets 218 - * this to ensure the Guest can only reach its own memory. 219 - * 220 - * start: The first instruction to execute ("eip" in x86-speak). 221 - */ 222 - static int initialize(struct file *file, const unsigned long __user *input) 223 - { 224 - /* "struct lguest" contains all we (the Host) know about a Guest. */ 225 - struct lguest *lg; 226 - int err; 227 - unsigned long args[4]; 228 - 229 - /* 230 - * We grab the Big Lguest lock, which protects against multiple 231 - * simultaneous initializations. 232 - */ 233 - mutex_lock(&lguest_lock); 234 - /* You can't initialize twice! Close the device and start again... 
*/ 235 - if (file->private_data) { 236 - err = -EBUSY; 237 - goto unlock; 238 - } 239 - 240 - if (copy_from_user(args, input, sizeof(args)) != 0) { 241 - err = -EFAULT; 242 - goto unlock; 243 - } 244 - 245 - lg = kzalloc(sizeof(*lg), GFP_KERNEL); 246 - if (!lg) { 247 - err = -ENOMEM; 248 - goto unlock; 249 - } 250 - 251 - /* Populate the easy fields of our "struct lguest" */ 252 - lg->mem_base = (void __user *)args[0]; 253 - lg->pfn_limit = args[1]; 254 - lg->device_limit = args[3]; 255 - 256 - /* This is the first cpu (cpu 0) and it will start booting at args[2] */ 257 - err = lg_cpu_start(&lg->cpus[0], 0, args[2]); 258 - if (err) 259 - goto free_lg; 260 - 261 - /* 262 - * Initialize the Guest's shadow page tables. This allocates 263 - * memory, so can fail. 264 - */ 265 - err = init_guest_pagetable(lg); 266 - if (err) 267 - goto free_regs; 268 - 269 - /* We keep our "struct lguest" in the file's private_data. */ 270 - file->private_data = lg; 271 - 272 - mutex_unlock(&lguest_lock); 273 - 274 - /* And because this is a write() call, we return the length used. */ 275 - return sizeof(args); 276 - 277 - free_regs: 278 - /* FIXME: This should be in free_vcpu */ 279 - free_page(lg->cpus[0].regs_page); 280 - free_lg: 281 - kfree(lg); 282 - unlock: 283 - mutex_unlock(&lguest_lock); 284 - return err; 285 - } 286 - 287 - /*L:010 288 - * The first operation the Launcher does must be a write. All writes 289 - * start with an unsigned long number: for the first write this must be 290 - * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 291 - * writes of other values to send interrupts or set up receipt of notifications. 292 - * 293 - * Note that we overload the "offset" in the /dev/lguest file to indicate what 294 - * CPU number we're dealing with. Currently this is always 0 since we only 295 - * support uniprocessor Guests, but you can see the beginnings of SMP support 296 - * here. 
297 - */ 298 - static ssize_t write(struct file *file, const char __user *in, 299 - size_t size, loff_t *off) 300 - { 301 - /* 302 - * Once the Guest is initialized, we hold the "struct lguest" in the 303 - * file private data. 304 - */ 305 - struct lguest *lg = file->private_data; 306 - const unsigned long __user *input = (const unsigned long __user *)in; 307 - unsigned long req; 308 - struct lg_cpu *uninitialized_var(cpu); 309 - unsigned int cpu_id = *off; 310 - 311 - /* The first value tells us what this request is. */ 312 - if (get_user(req, input) != 0) 313 - return -EFAULT; 314 - input++; 315 - 316 - /* If you haven't initialized, you must do that first. */ 317 - if (req != LHREQ_INITIALIZE) { 318 - if (!lg || (cpu_id >= lg->nr_cpus)) 319 - return -EINVAL; 320 - cpu = &lg->cpus[cpu_id]; 321 - 322 - /* Once the Guest is dead, you can only read() why it died. */ 323 - if (lg->dead) 324 - return -ENOENT; 325 - } 326 - 327 - switch (req) { 328 - case LHREQ_INITIALIZE: 329 - return initialize(file, input); 330 - case LHREQ_IRQ: 331 - return user_send_irq(cpu, input); 332 - case LHREQ_GETREG: 333 - return getreg_setup(cpu, input); 334 - case LHREQ_SETREG: 335 - return setreg(cpu, input); 336 - case LHREQ_TRAP: 337 - return trap(cpu, input); 338 - default: 339 - return -EINVAL; 340 - } 341 - } 342 - 343 - static int open(struct inode *inode, struct file *file) 344 - { 345 - file->private_data = NULL; 346 - 347 - return 0; 348 - } 349 - 350 - /*L:060 351 - * The final piece of interface code is the close() routine. It reverses 352 - * everything done in initialize(). This is usually called because the 353 - * Launcher exited. 354 - * 355 - * Note that the close routine returns 0 or a negative error number: it can't 356 - * really fail, but it can whine. I blame Sun for this wart, and K&R C for 357 - * letting them do it. 
358 - :*/ 359 - static int close(struct inode *inode, struct file *file) 360 - { 361 - struct lguest *lg = file->private_data; 362 - unsigned int i; 363 - 364 - /* If we never successfully initialized, there's nothing to clean up */ 365 - if (!lg) 366 - return 0; 367 - 368 - /* 369 - * We need the big lock, to protect from inter-guest I/O and other 370 - * Launchers initializing guests. 371 - */ 372 - mutex_lock(&lguest_lock); 373 - 374 - /* Free up the shadow page tables for the Guest. */ 375 - free_guest_pagetable(lg); 376 - 377 - for (i = 0; i < lg->nr_cpus; i++) { 378 - /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ 379 - hrtimer_cancel(&lg->cpus[i].hrt); 380 - /* We can free up the register page we allocated. */ 381 - free_page(lg->cpus[i].regs_page); 382 - /* 383 - * Now all the memory cleanups are done, it's safe to release 384 - * the Launcher's memory management structure. 385 - */ 386 - mmput(lg->cpus[i].mm); 387 - } 388 - 389 - /* 390 - * If lg->dead doesn't contain an error code it will be NULL or a 391 - * kmalloc()ed string, either of which is ok to hand to kfree(). 392 - */ 393 - if (!IS_ERR(lg->dead)) 394 - kfree(lg->dead); 395 - /* Free the memory allocated to the lguest_struct */ 396 - kfree(lg); 397 - /* Release lock and exit. */ 398 - mutex_unlock(&lguest_lock); 399 - 400 - return 0; 401 - } 402 - 403 - /*L:000 404 - * Welcome to our journey through the Launcher! 405 - * 406 - * The Launcher is the Host userspace program which sets up, runs and services 407 - * the Guest. In fact, many comments in the Drivers which refer to "the Host" 408 - * doing things are inaccurate: the Launcher does all the device handling for 409 - * the Guest, but the Guest can't know that. 410 - * 411 - * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we 412 - * shall see more of that later. 
413 - * 414 - * We begin our understanding with the Host kernel interface which the Launcher 415 - * uses: reading and writing a character device called /dev/lguest. All the 416 - * work happens in the read(), write() and close() routines: 417 - */ 418 - static const struct file_operations lguest_fops = { 419 - .owner = THIS_MODULE, 420 - .open = open, 421 - .release = close, 422 - .write = write, 423 - .read = read, 424 - .llseek = default_llseek, 425 - }; 426 - /*:*/ 427 - 428 - /* 429 - * This is a textbook example of a "misc" character device. Populate a "struct 430 - * miscdevice" and register it with misc_register(). 431 - */ 432 - static struct miscdevice lguest_dev = { 433 - .minor = MISC_DYNAMIC_MINOR, 434 - .name = "lguest", 435 - .fops = &lguest_fops, 436 - }; 437 - 438 - int __init lguest_device_init(void) 439 - { 440 - return misc_register(&lguest_dev); 441 - } 442 - 443 - void __exit lguest_device_remove(void) 444 - { 445 - misc_deregister(&lguest_dev); 446 - }
-1239
drivers/lguest/page_tables.c
··· 1 - /*P:700 2 - * The pagetable code, on the other hand, still shows the scars of 3 - * previous encounters. It's functional, and as neat as it can be in the 4 - * circumstances, but be wary, for these things are subtle and break easily. 5 - * The Guest provides a virtual to physical mapping, but we can neither trust 6 - * it nor use it: we verify and convert it here then point the CPU to the 7 - * converted Guest pages when running the Guest. 8 - :*/ 9 - 10 - /* Copyright (C) Rusty Russell IBM Corporation 2013. 11 - * GPL v2 and any later version */ 12 - #include <linux/mm.h> 13 - #include <linux/gfp.h> 14 - #include <linux/types.h> 15 - #include <linux/spinlock.h> 16 - #include <linux/random.h> 17 - #include <linux/percpu.h> 18 - #include <asm/tlbflush.h> 19 - #include <linux/uaccess.h> 20 - #include "lg.h" 21 - 22 - /*M:008 23 - * We hold reference to pages, which prevents them from being swapped. 24 - * It'd be nice to have a callback in the "struct mm_struct" when Linux wants 25 - * to swap out. If we had this, and a shrinker callback to trim PTE pages, we 26 - * could probably consider launching Guests as non-root. 27 - :*/ 28 - 29 - /*H:300 30 - * The Page Table Code 31 - * 32 - * We use two-level page tables for the Guest, or three-level with PAE. If 33 - * you're not entirely comfortable with virtual addresses, physical addresses 34 - * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page 35 - * Table Handling" (with diagrams!). 36 - * 37 - * The Guest keeps page tables, but we maintain the actual ones here: these are 38 - * called "shadow" page tables. Which is a very Guest-centric name: these are 39 - * the real page tables the CPU uses, although we keep them up to date to 40 - * reflect the Guest's. (See what I mean about weird naming? Since when do 41 - * shadows reflect anything?) 42 - * 43 - * Anyway, this is the most complicated part of the Host code. 
There are seven 44 - * parts to this: 45 - * (i) Looking up a page table entry when the Guest faults, 46 - * (ii) Making sure the Guest stack is mapped, 47 - * (iii) Setting up a page table entry when the Guest tells us one has changed, 48 - * (iv) Switching page tables, 49 - * (v) Flushing (throwing away) page tables, 50 - * (vi) Mapping the Switcher when the Guest is about to run, 51 - * (vii) Setting up the page tables initially. 52 - :*/ 53 - 54 - /* 55 - * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) 56 - * or 512 PTE entries with PAE (2MB). 57 - */ 58 - #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 59 - 60 - /* 61 - * For PAE we need the PMD index as well. We use the last 2MB, so we 62 - * will need the last pmd entry of the last pmd page. 63 - */ 64 - #ifdef CONFIG_X86_PAE 65 - #define CHECK_GPGD_MASK _PAGE_PRESENT 66 - #else 67 - #define CHECK_GPGD_MASK _PAGE_TABLE 68 - #endif 69 - 70 - /*H:320 71 - * The page table code is curly enough to need helper functions to keep it 72 - * clear and clean. The kernel itself provides many of them; one advantage 73 - * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting. 74 - * 75 - * There are two functions which return pointers to the shadow (aka "real") 76 - * page tables. 77 - * 78 - * spgd_addr() takes the virtual address and returns a pointer to the top-level 79 - * page directory entry (PGD) for that address. Since we keep track of several 80 - * page tables, the "i" argument tells us which one we're interested in (it's 81 - * usually the current one). 82 - */ 83 - static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) 84 - { 85 - unsigned int index = pgd_index(vaddr); 86 - 87 - /* Return a pointer index'th pgd entry for the i'th page table. */ 88 - return &cpu->lg->pgdirs[i].pgdir[index]; 89 - } 90 - 91 - #ifdef CONFIG_X86_PAE 92 - /* 93 - * This routine then takes the PGD entry given above, which contains the 94 - * address of the PMD page. 
It then returns a pointer to the PMD entry for the 95 - * given address. 96 - */ 97 - static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 98 - { 99 - unsigned int index = pmd_index(vaddr); 100 - pmd_t *page; 101 - 102 - /* You should never call this if the PGD entry wasn't valid */ 103 - BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); 104 - page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 105 - 106 - return &page[index]; 107 - } 108 - #endif 109 - 110 - /* 111 - * This routine then takes the page directory entry returned above, which 112 - * contains the address of the page table entry (PTE) page. It then returns a 113 - * pointer to the PTE entry for the given address. 114 - */ 115 - static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 116 - { 117 - #ifdef CONFIG_X86_PAE 118 - pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); 119 - pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); 120 - 121 - /* You should never call this if the PMD entry wasn't valid */ 122 - BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); 123 - #else 124 - pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 125 - /* You should never call this if the PGD entry wasn't valid */ 126 - BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); 127 - #endif 128 - 129 - return &page[pte_index(vaddr)]; 130 - } 131 - 132 - /* 133 - * These functions are just like the above, except they access the Guest 134 - * page tables. Hence they return a Guest address. 135 - */ 136 - static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) 137 - { 138 - unsigned int index = vaddr >> (PGDIR_SHIFT); 139 - return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); 140 - } 141 - 142 - #ifdef CONFIG_X86_PAE 143 - /* Follow the PGD to the PMD. 
*/ 144 - static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) 145 - { 146 - unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 147 - BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); 148 - return gpage + pmd_index(vaddr) * sizeof(pmd_t); 149 - } 150 - 151 - /* Follow the PMD to the PTE. */ 152 - static unsigned long gpte_addr(struct lg_cpu *cpu, 153 - pmd_t gpmd, unsigned long vaddr) 154 - { 155 - unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT; 156 - 157 - BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); 158 - return gpage + pte_index(vaddr) * sizeof(pte_t); 159 - } 160 - #else 161 - /* Follow the PGD to the PTE (no mid-level for !PAE). */ 162 - static unsigned long gpte_addr(struct lg_cpu *cpu, 163 - pgd_t gpgd, unsigned long vaddr) 164 - { 165 - unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 166 - 167 - BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); 168 - return gpage + pte_index(vaddr) * sizeof(pte_t); 169 - } 170 - #endif 171 - /*:*/ 172 - 173 - /*M:007 174 - * get_pfn is slow: we could probably try to grab batches of pages here as 175 - * an optimization (ie. pre-faulting). 176 - :*/ 177 - 178 - /*H:350 179 - * This routine takes a page number given by the Guest and converts it to 180 - * an actual, physical page number. It can fail for several reasons: the 181 - * virtual address might not be mapped by the Launcher, the write flag is set 182 - * and the page is read-only, or the write flag was set and the page was 183 - * shared so had to be copied, but we ran out of memory. 184 - * 185 - * This holds a reference to the page, so release_pte() is careful to put that 186 - * back. 187 - */ 188 - static unsigned long get_pfn(unsigned long virtpfn, int write) 189 - { 190 - struct page *page; 191 - 192 - /* gup me one page at this address please! */ 193 - if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1) 194 - return page_to_pfn(page); 195 - 196 - /* This value indicates failure. 
*/ 197 - return -1UL; 198 - } 199 - 200 - /*H:340 201 - * Converting a Guest page table entry to a shadow (ie. real) page table 202 - * entry can be a little tricky. The flags are (almost) the same, but the 203 - * Guest PTE contains a virtual page number: the CPU needs the real page 204 - * number. 205 - */ 206 - static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) 207 - { 208 - unsigned long pfn, base, flags; 209 - 210 - /* 211 - * The Guest sets the global flag, because it thinks that it is using 212 - * PGE. We only told it to use PGE so it would tell us whether it was 213 - * flushing a kernel mapping or a userspace mapping. We don't actually 214 - * use the global bit, so throw it away. 215 - */ 216 - flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); 217 - 218 - /* The Guest's pages are offset inside the Launcher. */ 219 - base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; 220 - 221 - /* 222 - * We need a temporary "unsigned long" variable to hold the answer from 223 - * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 224 - * fit in spte.pfn. get_pfn() finds the real physical number of the 225 - * page, given the virtual number. 226 - */ 227 - pfn = get_pfn(base + pte_pfn(gpte), write); 228 - if (pfn == -1UL) { 229 - kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); 230 - /* 231 - * When we destroy the Guest, we'll go through the shadow page 232 - * tables and release_pte() them. Make sure we don't think 233 - * this one is valid! 234 - */ 235 - flags = 0; 236 - } 237 - /* Now we assemble our shadow PTE from the page number and flags. */ 238 - return pfn_pte(pfn, __pgprot(flags)); 239 - } 240 - 241 - /*H:460 And to complete the chain, release_pte() looks like this: */ 242 - static void release_pte(pte_t pte) 243 - { 244 - /* 245 - * Remember that get_user_pages_fast() took a reference to the page, in 246 - * get_pfn()? We have to put it back now. 
247 - */ 248 - if (pte_flags(pte) & _PAGE_PRESENT) 249 - put_page(pte_page(pte)); 250 - } 251 - /*:*/ 252 - 253 - static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte) 254 - { 255 - /* We don't handle large pages. */ 256 - if (pte_flags(gpte) & _PAGE_PSE) 257 - return false; 258 - 259 - return (pte_pfn(gpte) >= cpu->lg->pfn_limit 260 - && pte_pfn(gpte) < cpu->lg->device_limit); 261 - } 262 - 263 - static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) 264 - { 265 - if ((pte_flags(gpte) & _PAGE_PSE) || 266 - pte_pfn(gpte) >= cpu->lg->pfn_limit) { 267 - kill_guest(cpu, "bad page table entry"); 268 - return false; 269 - } 270 - return true; 271 - } 272 - 273 - static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) 274 - { 275 - if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || 276 - (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { 277 - kill_guest(cpu, "bad page directory entry"); 278 - return false; 279 - } 280 - return true; 281 - } 282 - 283 - #ifdef CONFIG_X86_PAE 284 - static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) 285 - { 286 - if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || 287 - (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { 288 - kill_guest(cpu, "bad page middle directory entry"); 289 - return false; 290 - } 291 - return true; 292 - } 293 - #endif 294 - 295 - /*H:331 296 - * This is the core routine to walk the shadow page tables and find the page 297 - * table entry for a specific address. 298 - * 299 - * If allocate is set, then we allocate any missing levels, setting the flags 300 - * on the new page directory and mid-level directories using the arguments 301 - * (which are copied from the Guest's page table entries). 302 - */ 303 - static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, 304 - int pgd_flags, int pmd_flags) 305 - { 306 - pgd_t *spgd; 307 - /* Mid level for PAE. */ 308 - #ifdef CONFIG_X86_PAE 309 - pmd_t *spmd; 310 - #endif 311 - 312 - /* Get top level entry. 
*/ 313 - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); 314 - if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 315 - /* No shadow entry: allocate a new shadow PTE page. */ 316 - unsigned long ptepage; 317 - 318 - /* If they didn't want us to allocate anything, stop. */ 319 - if (!allocate) 320 - return NULL; 321 - 322 - ptepage = get_zeroed_page(GFP_KERNEL); 323 - /* 324 - * This is not really the Guest's fault, but killing it is 325 - * simple for this corner case. 326 - */ 327 - if (!ptepage) { 328 - kill_guest(cpu, "out of memory allocating pte page"); 329 - return NULL; 330 - } 331 - /* 332 - * And we copy the flags to the shadow PGD entry. The page 333 - * number in the shadow PGD is the page we just allocated. 334 - */ 335 - set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); 336 - } 337 - 338 - /* 339 - * Intel's Physical Address Extension actually uses three levels of 340 - * page tables, so we need to look in the mid-level. 341 - */ 342 - #ifdef CONFIG_X86_PAE 343 - /* Now look at the mid-level shadow entry. */ 344 - spmd = spmd_addr(cpu, *spgd, vaddr); 345 - 346 - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { 347 - /* No shadow entry: allocate a new shadow PTE page. */ 348 - unsigned long ptepage; 349 - 350 - /* If they didn't want us to allocate anything, stop. */ 351 - if (!allocate) 352 - return NULL; 353 - 354 - ptepage = get_zeroed_page(GFP_KERNEL); 355 - 356 - /* 357 - * This is not really the Guest's fault, but killing it is 358 - * simple for this corner case. 359 - */ 360 - if (!ptepage) { 361 - kill_guest(cpu, "out of memory allocating pmd page"); 362 - return NULL; 363 - } 364 - 365 - /* 366 - * And we copy the flags to the shadow PMD entry. The page 367 - * number in the shadow PMD is the page we just allocated. 368 - */ 369 - set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); 370 - } 371 - #endif 372 - 373 - /* Get the pointer to the shadow PTE entry we're going to set. 
*/ 374 - return spte_addr(cpu, *spgd, vaddr); 375 - } 376 - 377 - /*H:330 378 - * (i) Looking up a page table entry when the Guest faults. 379 - * 380 - * We saw this call in run_guest(): when we see a page fault in the Guest, we 381 - * come here. That's because we only set up the shadow page tables lazily as 382 - * they're needed, so we get page faults all the time and quietly fix them up 383 - * and return to the Guest without it knowing. 384 - * 385 - * If we fixed up the fault (ie. we mapped the address), this routine returns 386 - * true. Otherwise, it was a real fault and we need to tell the Guest. 387 - * 388 - * There's a corner case: they're trying to access memory between 389 - * pfn_limit and device_limit, which is I/O memory. In this case, we 390 - * return false and set @iomem to the physical address, so the the 391 - * Launcher can handle the instruction manually. 392 - */ 393 - bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode, 394 - unsigned long *iomem) 395 - { 396 - unsigned long gpte_ptr; 397 - pte_t gpte; 398 - pte_t *spte; 399 - pmd_t gpmd; 400 - pgd_t gpgd; 401 - 402 - *iomem = 0; 403 - 404 - /* We never demand page the Switcher, so trying is a mistake. */ 405 - if (vaddr >= switcher_addr) 406 - return false; 407 - 408 - /* First step: get the top-level Guest page table entry. */ 409 - if (unlikely(cpu->linear_pages)) { 410 - /* Faking up a linear mapping. */ 411 - gpgd = __pgd(CHECK_GPGD_MASK); 412 - } else { 413 - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); 414 - /* Toplevel not present? We can't map it in. */ 415 - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) 416 - return false; 417 - 418 - /* 419 - * This kills the Guest if it has weird flags or tries to 420 - * refer to a "physical" address outside the bounds. 421 - */ 422 - if (!check_gpgd(cpu, gpgd)) 423 - return false; 424 - } 425 - 426 - /* This "mid-level" entry is only used for non-linear, PAE mode. 
*/ 427 - gpmd = __pmd(_PAGE_TABLE); 428 - 429 - #ifdef CONFIG_X86_PAE 430 - if (likely(!cpu->linear_pages)) { 431 - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); 432 - /* Middle level not present? We can't map it in. */ 433 - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) 434 - return false; 435 - 436 - /* 437 - * This kills the Guest if it has weird flags or tries to 438 - * refer to a "physical" address outside the bounds. 439 - */ 440 - if (!check_gpmd(cpu, gpmd)) 441 - return false; 442 - } 443 - 444 - /* 445 - * OK, now we look at the lower level in the Guest page table: keep its 446 - * address, because we might update it later. 447 - */ 448 - gpte_ptr = gpte_addr(cpu, gpmd, vaddr); 449 - #else 450 - /* 451 - * OK, now we look at the lower level in the Guest page table: keep its 452 - * address, because we might update it later. 453 - */ 454 - gpte_ptr = gpte_addr(cpu, gpgd, vaddr); 455 - #endif 456 - 457 - if (unlikely(cpu->linear_pages)) { 458 - /* Linear? Make up a PTE which points to same page. */ 459 - gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); 460 - } else { 461 - /* Read the actual PTE value. */ 462 - gpte = lgread(cpu, gpte_ptr, pte_t); 463 - } 464 - 465 - /* If this page isn't in the Guest page tables, we can't page it in. */ 466 - if (!(pte_flags(gpte) & _PAGE_PRESENT)) 467 - return false; 468 - 469 - /* 470 - * Check they're not trying to write to a page the Guest wants 471 - * read-only (bit 2 of errcode == write). 472 - */ 473 - if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 474 - return false; 475 - 476 - /* User access to a kernel-only page? (bit 3 == user access) */ 477 - if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 478 - return false; 479 - 480 - /* If they're accessing io memory, we expect a fault. 
*/ 481 - if (gpte_in_iomem(cpu, gpte)) { 482 - *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); 483 - return false; 484 - } 485 - 486 - /* 487 - * Check that the Guest PTE flags are OK, and the page number is below 488 - * the pfn_limit (ie. not mapping the Launcher binary). 489 - */ 490 - if (!check_gpte(cpu, gpte)) 491 - return false; 492 - 493 - /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 494 - gpte = pte_mkyoung(gpte); 495 - if (errcode & 2) 496 - gpte = pte_mkdirty(gpte); 497 - 498 - /* Get the pointer to the shadow PTE entry we're going to set. */ 499 - spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); 500 - if (!spte) 501 - return false; 502 - 503 - /* 504 - * If there was a valid shadow PTE entry here before, we release it. 505 - * This can happen with a write to a previously read-only entry. 506 - */ 507 - release_pte(*spte); 508 - 509 - /* 510 - * If this is a write, we insist that the Guest page is writable (the 511 - * final arg to gpte_to_spte()). 512 - */ 513 - if (pte_dirty(gpte)) 514 - *spte = gpte_to_spte(cpu, gpte, 1); 515 - else 516 - /* 517 - * If this is a read, don't set the "writable" bit in the page 518 - * table entry, even if the Guest says it's writable. That way 519 - * we will come back here when a write does actually occur, so 520 - * we can update the Guest's _PAGE_DIRTY flag. 521 - */ 522 - set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); 523 - 524 - /* 525 - * Finally, we write the Guest PTE entry back: we've set the 526 - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. 527 - */ 528 - if (likely(!cpu->linear_pages)) 529 - lgwrite(cpu, gpte_ptr, pte_t, gpte); 530 - 531 - /* 532 - * The fault is fixed, the page table is populated, the mapping 533 - * manipulated, the result returned and the code complete. A small 534 - * delay and a trace of alliteration are the only indications the Guest 535 - * has that a page fault occurred at all. 
536 - */ 537 - return true; 538 - } 539 - 540 - /*H:360 541 - * (ii) Making sure the Guest stack is mapped. 542 - * 543 - * Remember that direct traps into the Guest need a mapped Guest kernel stack. 544 - * pin_stack_pages() calls us here: we could simply call demand_page(), but as 545 - * we've seen that logic is quite long, and usually the stack pages are already 546 - * mapped, so it's overkill. 547 - * 548 - * This is a quick version which answers the question: is this virtual address 549 - * mapped by the shadow page tables, and is it writable? 550 - */ 551 - static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) 552 - { 553 - pte_t *spte; 554 - unsigned long flags; 555 - 556 - /* You can't put your stack in the Switcher! */ 557 - if (vaddr >= switcher_addr) 558 - return false; 559 - 560 - /* If there's no shadow PTE, it's not writable. */ 561 - spte = find_spte(cpu, vaddr, false, 0, 0); 562 - if (!spte) 563 - return false; 564 - 565 - /* 566 - * Check the flags on the pte entry itself: it must be present and 567 - * writable. 568 - */ 569 - flags = pte_flags(*spte); 570 - return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 571 - } 572 - 573 - /* 574 - * So, when pin_stack_pages() asks us to pin a page, we check if it's already 575 - * in the page tables, and if not, we call demand_page() with error code 2 576 - * (meaning "write"). 577 - */ 578 - void pin_page(struct lg_cpu *cpu, unsigned long vaddr) 579 - { 580 - unsigned long iomem; 581 - 582 - if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem)) 583 - kill_guest(cpu, "bad stack page %#lx", vaddr); 584 - } 585 - /*:*/ 586 - 587 - #ifdef CONFIG_X86_PAE 588 - static void release_pmd(pmd_t *spmd) 589 - { 590 - /* If the entry's not present, there's nothing to release. */ 591 - if (pmd_flags(*spmd) & _PAGE_PRESENT) { 592 - unsigned int i; 593 - pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); 594 - /* For each entry in the page, we might need to release it. 
*/ 595 - for (i = 0; i < PTRS_PER_PTE; i++) 596 - release_pte(ptepage[i]); 597 - /* Now we can free the page of PTEs */ 598 - free_page((long)ptepage); 599 - /* And zero out the PMD entry so we never release it twice. */ 600 - set_pmd(spmd, __pmd(0)); 601 - } 602 - } 603 - 604 - static void release_pgd(pgd_t *spgd) 605 - { 606 - /* If the entry's not present, there's nothing to release. */ 607 - if (pgd_flags(*spgd) & _PAGE_PRESENT) { 608 - unsigned int i; 609 - pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 610 - 611 - for (i = 0; i < PTRS_PER_PMD; i++) 612 - release_pmd(&pmdpage[i]); 613 - 614 - /* Now we can free the page of PMDs */ 615 - free_page((long)pmdpage); 616 - /* And zero out the PGD entry so we never release it twice. */ 617 - set_pgd(spgd, __pgd(0)); 618 - } 619 - } 620 - 621 - #else /* !CONFIG_X86_PAE */ 622 - /*H:450 623 - * If we chase down the release_pgd() code, the non-PAE version looks like 624 - * this. The PAE version is almost identical, but instead of calling 625 - * release_pte it calls release_pmd(), which looks much like this. 626 - */ 627 - static void release_pgd(pgd_t *spgd) 628 - { 629 - /* If the entry's not present, there's nothing to release. */ 630 - if (pgd_flags(*spgd) & _PAGE_PRESENT) { 631 - unsigned int i; 632 - /* 633 - * Converting the pfn to find the actual PTE page is easy: turn 634 - * the page number into a physical address, then convert to a 635 - * virtual address (easy for kernel pages like this one). 636 - */ 637 - pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 638 - /* For each entry in the page, we might need to release it. */ 639 - for (i = 0; i < PTRS_PER_PTE; i++) 640 - release_pte(ptepage[i]); 641 - /* Now we can free the page of PTEs */ 642 - free_page((long)ptepage); 643 - /* And zero out the PGD entry so we never release it twice. 
*/ 644 - *spgd = __pgd(0); 645 - } 646 - } 647 - #endif 648 - 649 - /*H:445 650 - * We saw flush_user_mappings() twice: once from the flush_user_mappings() 651 - * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. 652 - * It simply releases every PTE page from 0 up to the Guest's kernel address. 653 - */ 654 - static void flush_user_mappings(struct lguest *lg, int idx) 655 - { 656 - unsigned int i; 657 - /* Release every pgd entry up to the kernel's address. */ 658 - for (i = 0; i < pgd_index(lg->kernel_address); i++) 659 - release_pgd(lg->pgdirs[idx].pgdir + i); 660 - } 661 - 662 - /*H:440 663 - * (v) Flushing (throwing away) page tables, 664 - * 665 - * The Guest has a hypercall to throw away the page tables: it's used when a 666 - * large number of mappings have been changed. 667 - */ 668 - void guest_pagetable_flush_user(struct lg_cpu *cpu) 669 - { 670 - /* Drop the userspace part of the current page table. */ 671 - flush_user_mappings(cpu->lg, cpu->cpu_pgd); 672 - } 673 - /*:*/ 674 - 675 - /* We walk down the guest page tables to get a guest-physical address */ 676 - bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr) 677 - { 678 - pgd_t gpgd; 679 - pte_t gpte; 680 - #ifdef CONFIG_X86_PAE 681 - pmd_t gpmd; 682 - #endif 683 - 684 - /* Still not set up? Just map 1:1. */ 685 - if (unlikely(cpu->linear_pages)) { 686 - *paddr = vaddr; 687 - return true; 688 - } 689 - 690 - /* First step: get the top-level Guest page table entry. */ 691 - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); 692 - /* Toplevel not present? We can't map it in. 
*/ 693 - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) 694 - goto fail; 695 - 696 - #ifdef CONFIG_X86_PAE 697 - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); 698 - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) 699 - goto fail; 700 - gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); 701 - #else 702 - gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); 703 - #endif 704 - if (!(pte_flags(gpte) & _PAGE_PRESENT)) 705 - goto fail; 706 - 707 - *paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 708 - return true; 709 - 710 - fail: 711 - *paddr = -1UL; 712 - return false; 713 - } 714 - 715 - /* 716 - * This is the version we normally use: kills the Guest if it uses a 717 - * bad address 718 - */ 719 - unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) 720 - { 721 - unsigned long paddr; 722 - 723 - if (!__guest_pa(cpu, vaddr, &paddr)) 724 - kill_guest(cpu, "Bad address %#lx", vaddr); 725 - return paddr; 726 - } 727 - 728 - /* 729 - * We keep several page tables. This is a simple routine to find the page 730 - * table (if any) corresponding to this top-level address the Guest has given 731 - * us. 732 - */ 733 - static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) 734 - { 735 - unsigned int i; 736 - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 737 - if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable) 738 - break; 739 - return i; 740 - } 741 - 742 - /*H:435 743 - * And this is us, creating the new page directory. If we really do 744 - * allocate a new one (and so the kernel parts are not there), we set 745 - * blank_pgdir. 746 - */ 747 - static unsigned int new_pgdir(struct lg_cpu *cpu, 748 - unsigned long gpgdir, 749 - int *blank_pgdir) 750 - { 751 - unsigned int next; 752 - 753 - /* 754 - * We pick one entry at random to throw out. Choosing the Least 755 - * Recently Used might be better, but this is easy. 
756 - */ 757 - next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs); 758 - /* If it's never been allocated at all before, try now. */ 759 - if (!cpu->lg->pgdirs[next].pgdir) { 760 - cpu->lg->pgdirs[next].pgdir = 761 - (pgd_t *)get_zeroed_page(GFP_KERNEL); 762 - /* If the allocation fails, just keep using the one we have */ 763 - if (!cpu->lg->pgdirs[next].pgdir) 764 - next = cpu->cpu_pgd; 765 - else { 766 - /* 767 - * This is a blank page, so there are no kernel 768 - * mappings: caller must map the stack! 769 - */ 770 - *blank_pgdir = 1; 771 - } 772 - } 773 - /* Record which Guest toplevel this shadows. */ 774 - cpu->lg->pgdirs[next].gpgdir = gpgdir; 775 - /* Release all the non-kernel mappings. */ 776 - flush_user_mappings(cpu->lg, next); 777 - 778 - /* This hasn't run on any CPU at all. */ 779 - cpu->lg->pgdirs[next].last_host_cpu = -1; 780 - 781 - return next; 782 - } 783 - 784 - /*H:501 785 - * We do need the Switcher code mapped at all times, so we allocate that 786 - * part of the Guest page table here. We map the Switcher code immediately, 787 - * but defer mapping of the guest register page and IDT/LDT etc page until 788 - * just before we run the guest in map_switcher_in_guest(). 789 - * 790 - * We *could* do this setup in map_switcher_in_guest(), but at that point 791 - * we've interrupts disabled, and allocating pages like that is fraught: we 792 - * can't sleep if we need to free up some memory. 793 - */ 794 - static bool allocate_switcher_mapping(struct lg_cpu *cpu) 795 - { 796 - int i; 797 - 798 - for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { 799 - pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, 800 - CHECK_GPGD_MASK, _PAGE_TABLE); 801 - if (!pte) 802 - return false; 803 - 804 - /* 805 - * Map the switcher page if not already there. It might 806 - * already be there because we call allocate_switcher_mapping() 807 - * in guest_set_pgd() just in case it did discard our Switcher 808 - * mapping, but it probably didn't. 
809 - */ 810 - if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { 811 - /* Get a reference to the Switcher page. */ 812 - get_page(lg_switcher_pages[0]); 813 - /* Create a read-only, exectuable, kernel-style PTE */ 814 - set_pte(pte, 815 - mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); 816 - } 817 - } 818 - cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; 819 - return true; 820 - } 821 - 822 - /*H:470 823 - * Finally, a routine which throws away everything: all PGD entries in all 824 - * the shadow page tables, including the Guest's kernel mappings. This is used 825 - * when we destroy the Guest. 826 - */ 827 - static void release_all_pagetables(struct lguest *lg) 828 - { 829 - unsigned int i, j; 830 - 831 - /* Every shadow pagetable this Guest has */ 832 - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { 833 - if (!lg->pgdirs[i].pgdir) 834 - continue; 835 - 836 - /* Every PGD entry. */ 837 - for (j = 0; j < PTRS_PER_PGD; j++) 838 - release_pgd(lg->pgdirs[i].pgdir + j); 839 - lg->pgdirs[i].switcher_mapped = false; 840 - lg->pgdirs[i].last_host_cpu = -1; 841 - } 842 - } 843 - 844 - /* 845 - * We also throw away everything when a Guest tells us it's changed a kernel 846 - * mapping. Since kernel mappings are in every page table, it's easiest to 847 - * throw them all away. This traps the Guest in amber for a while as 848 - * everything faults back in, but it's rare. 849 - */ 850 - void guest_pagetable_clear_all(struct lg_cpu *cpu) 851 - { 852 - release_all_pagetables(cpu->lg); 853 - /* We need the Guest kernel stack mapped again. */ 854 - pin_stack_pages(cpu); 855 - /* And we need Switcher allocated. */ 856 - if (!allocate_switcher_mapping(cpu)) 857 - kill_guest(cpu, "Cannot populate switcher mapping"); 858 - } 859 - 860 - /*H:430 861 - * (iv) Switching page tables 862 - * 863 - * Now we've seen all the page table setting and manipulation, let's see 864 - * what happens when the Guest changes page tables (ie. changes the top-level 865 - * pgdir). 
This occurs on almost every context switch. 866 - */ 867 - void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) 868 - { 869 - int newpgdir, repin = 0; 870 - 871 - /* 872 - * The very first time they call this, we're actually running without 873 - * any page tables; we've been making it up. Throw them away now. 874 - */ 875 - if (unlikely(cpu->linear_pages)) { 876 - release_all_pagetables(cpu->lg); 877 - cpu->linear_pages = false; 878 - /* Force allocation of a new pgdir. */ 879 - newpgdir = ARRAY_SIZE(cpu->lg->pgdirs); 880 - } else { 881 - /* Look to see if we have this one already. */ 882 - newpgdir = find_pgdir(cpu->lg, pgtable); 883 - } 884 - 885 - /* 886 - * If not, we allocate or mug an existing one: if it's a fresh one, 887 - * repin gets set to 1. 888 - */ 889 - if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) 890 - newpgdir = new_pgdir(cpu, pgtable, &repin); 891 - /* Change the current pgd index to the new one. */ 892 - cpu->cpu_pgd = newpgdir; 893 - /* 894 - * If it was completely blank, we map in the Guest kernel stack and 895 - * the Switcher. 896 - */ 897 - if (repin) 898 - pin_stack_pages(cpu); 899 - 900 - if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { 901 - if (!allocate_switcher_mapping(cpu)) 902 - kill_guest(cpu, "Cannot populate switcher mapping"); 903 - } 904 - } 905 - /*:*/ 906 - 907 - /*M:009 908 - * Since we throw away all mappings when a kernel mapping changes, our 909 - * performance sucks for guests using highmem. In fact, a guest with 910 - * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is 911 - * usually slower than a Guest with less memory. 912 - * 913 - * This, of course, cannot be fixed. It would take some kind of... well, I 914 - * don't know, but the term "puissant code-fu" comes to mind. 915 - :*/ 916 - 917 - /*H:420 918 - * This is the routine which actually sets the page table entry for then 919 - * "idx"'th shadow page table. 
920 - * 921 - * Normally, we can just throw out the old entry and replace it with 0: if they 922 - * use it demand_page() will put the new entry in. We need to do this anyway: 923 - * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page 924 - * is read from, and _PAGE_DIRTY when it's written to. 925 - * 926 - * But Avi Kivity pointed out that most Operating Systems (Linux included) set 927 - * these bits on PTEs immediately anyway. This is done to save the CPU from 928 - * having to update them, but it helps us the same way: if they set 929 - * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if 930 - * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. 931 - */ 932 - static void __guest_set_pte(struct lg_cpu *cpu, int idx, 933 - unsigned long vaddr, pte_t gpte) 934 - { 935 - /* Look up the matching shadow page directory entry. */ 936 - pgd_t *spgd = spgd_addr(cpu, idx, vaddr); 937 - #ifdef CONFIG_X86_PAE 938 - pmd_t *spmd; 939 - #endif 940 - 941 - /* If the top level isn't present, there's no entry to update. */ 942 - if (pgd_flags(*spgd) & _PAGE_PRESENT) { 943 - #ifdef CONFIG_X86_PAE 944 - spmd = spmd_addr(cpu, *spgd, vaddr); 945 - if (pmd_flags(*spmd) & _PAGE_PRESENT) { 946 - #endif 947 - /* Otherwise, start by releasing the existing entry. */ 948 - pte_t *spte = spte_addr(cpu, *spgd, vaddr); 949 - release_pte(*spte); 950 - 951 - /* 952 - * If they're setting this entry as dirty or accessed, 953 - * we might as well put that entry they've given us in 954 - * now. This shaves 10% off a copy-on-write 955 - * micro-benchmark. 956 - */ 957 - if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) 958 - && !gpte_in_iomem(cpu, gpte)) { 959 - if (!check_gpte(cpu, gpte)) 960 - return; 961 - set_pte(spte, 962 - gpte_to_spte(cpu, gpte, 963 - pte_flags(gpte) & _PAGE_DIRTY)); 964 - } else { 965 - /* 966 - * Otherwise kill it and we can demand_page() 967 - * it in later. 
968 - */ 969 - set_pte(spte, __pte(0)); 970 - } 971 - #ifdef CONFIG_X86_PAE 972 - } 973 - #endif 974 - } 975 - } 976 - 977 - /*H:410 978 - * Updating a PTE entry is a little trickier. 979 - * 980 - * We keep track of several different page tables (the Guest uses one for each 981 - * process, so it makes sense to cache at least a few). Each of these have 982 - * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for 983 - * all processes. So when the page table above that address changes, we update 984 - * all the page tables, not just the current one. This is rare. 985 - * 986 - * The benefit is that when we have to track a new page table, we can keep all 987 - * the kernel mappings. This speeds up context switch immensely. 988 - */ 989 - void guest_set_pte(struct lg_cpu *cpu, 990 - unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 991 - { 992 - /* We don't let you remap the Switcher; we need it to get back! */ 993 - if (vaddr >= switcher_addr) { 994 - kill_guest(cpu, "attempt to set pte into Switcher pages"); 995 - return; 996 - } 997 - 998 - /* 999 - * Kernel mappings must be changed on all top levels. Slow, but doesn't 1000 - * happen often. 1001 - */ 1002 - if (vaddr >= cpu->lg->kernel_address) { 1003 - unsigned int i; 1004 - for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) 1005 - if (cpu->lg->pgdirs[i].pgdir) 1006 - __guest_set_pte(cpu, i, vaddr, gpte); 1007 - } else { 1008 - /* Is this page table one we have a shadow for? */ 1009 - int pgdir = find_pgdir(cpu->lg, gpgdir); 1010 - if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs)) 1011 - /* If so, do the update. */ 1012 - __guest_set_pte(cpu, pgdir, vaddr, gpte); 1013 - } 1014 - } 1015 - 1016 - /*H:400 1017 - * (iii) Setting up a page table entry when the Guest tells us one has changed. 
1018 - * 1019 - * Just like we did in interrupts_and_traps.c, it makes sense for us to deal 1020 - * with the other side of page tables while we're here: what happens when the 1021 - * Guest asks for a page table to be updated? 1022 - * 1023 - * We already saw that demand_page() will fill in the shadow page tables when 1024 - * needed, so we can simply remove shadow page table entries whenever the Guest 1025 - * tells us they've changed. When the Guest tries to use the new entry it will 1026 - * fault and demand_page() will fix it up. 1027 - * 1028 - * So with that in mind here's our code to update a (top-level) PGD entry: 1029 - */ 1030 - void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) 1031 - { 1032 - int pgdir; 1033 - 1034 - if (idx > PTRS_PER_PGD) { 1035 - kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", 1036 - idx, PTRS_PER_PGD); 1037 - return; 1038 - } 1039 - 1040 - /* If they're talking about a page table we have a shadow for... */ 1041 - pgdir = find_pgdir(lg, gpgdir); 1042 - if (pgdir < ARRAY_SIZE(lg->pgdirs)) { 1043 - /* ... throw it away. */ 1044 - release_pgd(lg->pgdirs[pgdir].pgdir + idx); 1045 - /* That might have been the Switcher mapping, remap it. */ 1046 - if (!allocate_switcher_mapping(&lg->cpus[0])) { 1047 - kill_guest(&lg->cpus[0], 1048 - "Cannot populate switcher mapping"); 1049 - } 1050 - lg->pgdirs[pgdir].last_host_cpu = -1; 1051 - } 1052 - } 1053 - 1054 - #ifdef CONFIG_X86_PAE 1055 - /* For setting a mid-level, we just throw everything away. It's easy. */ 1056 - void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) 1057 - { 1058 - guest_pagetable_clear_all(&lg->cpus[0]); 1059 - } 1060 - #endif 1061 - 1062 - /*H:500 1063 - * (vii) Setting up the page tables initially. 1064 - * 1065 - * When a Guest is first created, set initialize a shadow page table which 1066 - * we will populate on future faults. 
The Guest doesn't have any actual 1067 - * pagetables yet, so we set linear_pages to tell demand_page() to fake it 1068 - * for the moment. 1069 - * 1070 - * We do need the Switcher to be mapped at all times, so we allocate that 1071 - * part of the Guest page table here. 1072 - * Returns 0 on success, or -ENOMEM if either allocation fails. */ 1073 - int init_guest_pagetable(struct lguest *lg) 1074 - { 1075 - struct lg_cpu *cpu = &lg->cpus[0]; 1076 - int allocated = 0; 1077 - 1078 - /* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */ 1079 - cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated); 1080 - if (!allocated) 1081 - return -ENOMEM; 1082 - 1083 - /* We start with a linear mapping until the Guest has page tables of its own. */ 1084 - cpu->linear_pages = true; 1085 - 1086 - /* Allocate the page tables for the Switcher. */ 1087 - if (!allocate_switcher_mapping(cpu)) { 1088 - release_all_pagetables(lg); 1089 - return -ENOMEM; 1090 - } 1091 - 1092 - return 0; 1093 - } 1094 - 1095 - /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1096 - void page_table_guest_data_init(struct lg_cpu *cpu) 1097 - { 1098 - /* 1099 - * We tell the Guest that it can't use the virtual addresses 1100 - * used by the Switcher. This trick is equivalent to 4GB - 1101 - * switcher_addr. 1102 - */ 1103 - u32 top = ~switcher_addr + 1; 1104 - 1105 - /* We get the kernel address: above this is all kernel memory. */ 1106 - if (get_user(cpu->lg->kernel_address, 1107 - &cpu->lg->lguest_data->kernel_address) 1108 - /* 1109 - * We tell the Guest that it can't use the top virtual 1110 - * addresses (used by the Switcher). 1111 - */ 1112 - || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { 1113 - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 1114 - return; 1115 - } 1116 - 1117 - /* 1118 - * In flush_user_mappings() we loop from 0 to 1119 - * "pgd_index(lg->kernel_address)". This assumes it won't hit the 1120 - * Switcher mappings, so check that now. 
1121 - */ 1122 - if (cpu->lg->kernel_address >= switcher_addr) 1123 - kill_guest(cpu, "bad kernel address %#lx", 1124 - cpu->lg->kernel_address); 1125 - } 1126 - 1127 - /* When a Guest dies, our cleanup is fairly simple. */ 1128 - void free_guest_pagetable(struct lguest *lg) 1129 - { 1130 - unsigned int i; 1131 - 1132 - /* Throw away all page table pages. */ 1133 - release_all_pagetables(lg); 1134 - /* Now free the top levels: free_page() can handle 0 just fine. */ 1135 - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 1136 - free_page((long)lg->pgdirs[i].pgdir); 1137 - } 1138 - 1139 - /*H:481 1140 - * This clears the Switcher mappings for cpu #i. 1141 - */ 1142 - static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) 1143 - { 1144 - unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; 1145 - pte_t *pte; 1146 - 1147 - /* Clear the mappings for both pages. */ 1148 - pte = find_spte(cpu, base, false, 0, 0); 1149 - release_pte(*pte); 1150 - set_pte(pte, __pte(0)); 1151 - 1152 - pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); 1153 - release_pte(*pte); 1154 - set_pte(pte, __pte(0)); 1155 - } 1156 - 1157 - /*H:480 1158 - * (vi) Mapping the Switcher when the Guest is about to run. 1159 - * 1160 - * The Switcher and the two pages for this CPU need to be visible in the Guest 1161 - * (and not the pages for other CPUs). 1162 - * 1163 - * The pages for the pagetables have all been allocated before: we just need 1164 - * to make sure the actual PTEs are up-to-date for the CPU we're about to run 1165 - * on. 1166 - */ 1167 - void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) 1168 - { 1169 - unsigned long base; 1170 - struct page *percpu_switcher_page, *regs_page; 1171 - pte_t *pte; 1172 - struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; 1173 - 1174 - /* Switcher page should always be mapped by now! 
*/ 1175 - BUG_ON(!pgdir->switcher_mapped); 1176 - 1177 - /* 1178 - * Remember that we have two pages for each Host CPU, so we can run a 1179 - * Guest on each CPU without them interfering. We need to make sure 1180 - * those pages are mapped correctly in the Guest, but since we usually 1181 - * run on the same CPU, we cache that, and only update the mappings 1182 - * when we move. 1183 - */ 1184 - if (pgdir->last_host_cpu == raw_smp_processor_id()) 1185 - return; 1186 - 1187 - /* -1 means unknown so we remove everything. */ 1188 - if (pgdir->last_host_cpu == -1) { 1189 - unsigned int i; 1190 - for_each_possible_cpu(i) 1191 - remove_switcher_percpu_map(cpu, i); 1192 - } else { 1193 - /* We know exactly what CPU mapping to remove. */ 1194 - remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); 1195 - } 1196 - 1197 - /* 1198 - * When we're running the Guest, we want the Guest's "regs" page to 1199 - * appear where the first Switcher page for this CPU is. This is an 1200 - * optimization: when the Switcher saves the Guest registers, it saves 1201 - * them into the first page of this CPU's "struct lguest_pages": if we 1202 - * make sure the Guest's register page is already mapped there, we 1203 - * don't have to copy them out again. 1204 - */ 1205 - /* Find the shadow PTE for this regs page. */ 1206 - base = switcher_addr + PAGE_SIZE 1207 - + raw_smp_processor_id() * sizeof(struct lguest_pages); 1208 - pte = find_spte(cpu, base, false, 0, 0); 1209 - regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); 1210 - get_page(regs_page); 1211 - set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL))); 1212 - 1213 - /* 1214 - * We map the second page of the struct lguest_pages read-only in 1215 - * the Guest: the IDT, GDT and other things it's not supposed to 1216 - * change. 
1217 - */ 1218 - pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); 1219 - percpu_switcher_page 1220 - = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; 1221 - get_page(percpu_switcher_page); 1222 - set_pte(pte, mk_pte(percpu_switcher_page, 1223 - __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); 1224 - 1225 - pgdir->last_host_cpu = raw_smp_processor_id(); 1226 - } 1227 - 1228 - /*H:490 1229 - * We've made it through the page table code. Perhaps our tired brains are 1230 - * still processing the details, or perhaps we're simply glad it's over. 1231 - * 1232 - * If nothing else, note that all this complexity in juggling shadow page tables 1233 - * in sync with the Guest's page tables is for one reason: for most Guests this 1234 - * page table dance determines how bad performance will be. This is why Xen 1235 - * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD 1236 - * have implemented shadow page table support directly into hardware. 1237 - * 1238 - * There is just one file remaining in the Host. 1239 - */
-228
drivers/lguest/segments.c
··· 1 - /*P:600 2 - * The x86 architecture has segments, which involve a table of descriptors 3 - * which can be used to do funky things with virtual address interpretation. 4 - * We originally used to use segments so the Guest couldn't alter the 5 - * Guest<->Host Switcher, and then we had to trim Guest segments, and restore 6 - * for userspace per-thread segments, but trim again on userspace->kernel 7 - * transitions... This nightmarish creation was contained within this file, 8 - * where we knew not to tread without heavy armament and a change of underwear. 9 - * 10 - * In these modern times, the segment handling code consists of simple sanity 11 - * checks, and the worst you'll experience reading this code is butterfly-rash 12 - * from frolicking through its parklike serenity. 13 - :*/ 14 - #include "lg.h" 15 - 16 - /*H:600 17 - * Segments & The Global Descriptor Table 18 - * 19 - * (That title sounds like a bad Nerdcore group. Not to suggest that there are 20 - * any good Nerdcore groups, but in high school a friend of mine had a band 21 - * called Joe Fish and the Chips, so there are definitely worse band names). 22 - * 23 - * To refresh: the GDT is a table of 8-byte values describing segments. Once 24 - * set up, these segments can be loaded into one of the 6 "segment registers". 25 - * 26 - * GDT entries are passed around as "struct desc_struct"s, which like IDT 27 - * entries are split into two 32-bit members, "a" and "b". One day, someone 28 - * will clean that up, and be declared a Hero. (No pressure, I'm just saying). 29 - * 30 - * Anyway, the GDT entry contains a base (the start address of the segment), a 31 - * limit (the size of the segment - 1), and some flags. Sounds simple, and it 32 - * would be, except those zany Intel engineers decided that it was too boring 33 - * to put the base at one end, the limit at the other, and the flags in 34 - * between. 
They decided to shotgun the bits at random throughout the 8 bytes, 35 - * like so: 36 - * 37 - * 0 16 40 48 52 56 63 38 - * [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ] 39 - * mit ags part 2 40 - * part 2 41 - * 42 - * As a result, this file contains a certain amount of magic numeracy. Let's 43 - * begin. 44 - */ 45 - 46 - /* 47 - * There are several entries we don't let the Guest set. The TSS entry is the 48 - * "Task State Segment" which controls all kinds of delicate things. The 49 - * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and 50 - * the Guest can't be trusted to deal with double faults (so the double-fault TSS entry is ignored too). 51 - */ 52 - static bool ignored_gdt(unsigned int num) 53 - { 54 - return (num == GDT_ENTRY_TSS 55 - || num == GDT_ENTRY_LGUEST_CS 56 - || num == GDT_ENTRY_LGUEST_DS 57 - || num == GDT_ENTRY_DOUBLEFAULT_TSS); 58 - } 59 - 60 - /*H:630 61 - * Once the Guest gave us new GDT entries, we fix them up a little. We 62 - * don't care if they're invalid: the worst that can happen is a General 63 - * Protection Fault in the Switcher when it restores a Guest segment register 64 - * which tries to use that entry. Then we kill the Guest for causing such a 65 - * mess: the message will be "unhandled trap 256". 66 - * Entries in [start, end) are fixed up; end itself is excluded. */ 67 - static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) 68 - { 69 - unsigned int i; 70 - 71 - for (i = start; i < end; i++) { 72 - /* 73 - * We never copy these ones to real GDT, so we don't care what 74 - * they say 75 - */ 76 - if (ignored_gdt(i)) 77 - continue; 78 - 79 - /* 80 - * Segment descriptors contain a privilege level: the Guest is 81 - * sometimes careless and leaves this as 0, even though it's 82 - * running at privilege level 1. If so, we fix it here. 83 - */ 84 - if (cpu->arch.gdt[i].dpl == 0) 85 - cpu->arch.gdt[i].dpl |= GUEST_PL; 86 - 87 - /* 88 - * Each descriptor has an "accessed" bit. 
If we don't set it 89 - * now, the CPU will try to set it when the Guest first loads 90 - * that entry into a segment register. But the GDT isn't 91 - * writable by the Guest, so bad things can happen. 92 - */ 93 - cpu->arch.gdt[i].type |= 0x1; 94 - } 95 - } 96 - 97 - /*H:610 98 - * Like the IDT, we never simply use the GDT the Guest gives us. We keep 99 - * a GDT for each CPU, and copy across the Guest's entries each time we want to 100 - * run the Guest on that CPU. 101 - * 102 - * This routine is called at boot or modprobe time for each CPU to set up the 103 - * constant GDT entries: the ones which are the same no matter what Guest we're 104 - * running. 105 - */ 106 - void setup_default_gdt_entries(struct lguest_ro_state *state) 107 - { 108 - struct desc_struct *gdt = state->guest_gdt; 109 - unsigned long tss = (unsigned long)&state->guest_tss; 110 - 111 - /* The Switcher segments are full 0-4G segments, privilege level 0 */ 112 - gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 113 - gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 114 - 115 - /* 116 - * The TSS segment refers to the TSS entry for this particular CPU. 117 - */ 118 - gdt[GDT_ENTRY_TSS].a = 0; 119 - gdt[GDT_ENTRY_TSS].b = 0; 120 - 121 - gdt[GDT_ENTRY_TSS].limit0 = 0x67; 122 - gdt[GDT_ENTRY_TSS].base0 = tss & 0xFFFF; 123 - gdt[GDT_ENTRY_TSS].base1 = (tss >> 16) & 0xFF; 124 - gdt[GDT_ENTRY_TSS].base2 = tss >> 24; 125 - gdt[GDT_ENTRY_TSS].type = 0x9; /* 32-bit TSS (available) */ 126 - gdt[GDT_ENTRY_TSS].p = 0x1; /* Entry is present */ 127 - gdt[GDT_ENTRY_TSS].dpl = 0x0; /* Privilege level 0 */ 128 - gdt[GDT_ENTRY_TSS].s = 0x0; /* system segment */ 129 - 130 - } 131 - 132 - /* 133 - * This routine sets up the initial Guest GDT for booting. All entries start 134 - * as 0 (unusable). 135 - */ 136 - void setup_guest_gdt(struct lg_cpu *cpu) 137 - { 138 - /* 139 - * Start with full 0-4G segments...except the Guest is allowed to use 140 - * them, so set the privilege level appropriately in the flags. 
141 - */ 142 - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 143 - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 144 - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL; 145 - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL; 146 - } 147 - 148 - /*H:650 149 - * An optimization of copy_gdt(), for just the three "thread-local storage" 150 - * entries. 151 - */ 152 - void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) 153 - { 154 - unsigned int i; 155 - 156 - for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) 157 - gdt[i] = cpu->arch.gdt[i]; 158 - } 159 - 160 - /*H:640 161 - * When the Guest is run on a different CPU, or the GDT entries have changed, 162 - * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's 163 - * GDT. 164 - */ 165 - void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) 166 - { 167 - unsigned int i; 168 - 169 - /* 170 - * The default entries from setup_default_gdt_entries() are not 171 - * replaced. See ignored_gdt() above. 172 - */ 173 - for (i = 0; i < GDT_ENTRIES; i++) 174 - if (!ignored_gdt(i)) 175 - gdt[i] = cpu->arch.gdt[i]; 176 - } 177 - 178 - /*H:620 179 - * This is where the Guest asks us to load a new GDT entry 180 - * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. 181 - */ 182 - void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) 183 - { 184 - /* 185 - * We assume the Guest has the same number of GDT entries as the 186 - * Host, otherwise we'd have to dynamically allocate the Guest GDT. 187 - * num is unsigned, so it is reported with %u rather than %i. */ 188 - if (num >= ARRAY_SIZE(cpu->arch.gdt)) { 189 - kill_guest(cpu, "too many gdt entries %u", num); 190 - return; 191 - } 192 - 193 - /* Set it up, then fix it. */ 194 - cpu->arch.gdt[num].a = lo; 195 - cpu->arch.gdt[num].b = hi; 196 - fixup_gdt_table(cpu, num, num+1); 197 - /* 198 - * Mark that the GDT changed so the core knows it has to copy it again, 199 - * even if the Guest is run on the same CPU. 
200 - */ 201 - cpu->changed |= CHANGED_GDT; 202 - } 203 - 204 - /* 205 - * This is the fast-track version for just changing the three TLS entries. 206 - * Remember that this happens on every context switch, so it's worth 207 - * optimizing. But wouldn't it be neater to have a single hypercall to cover 208 - * both cases? 209 - */ 210 - void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) 211 - { 212 - struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; 213 - 214 - __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 215 - fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 216 - /* Note that just the TLS entries have changed. */ 217 - cpu->changed |= CHANGED_GDT_TLS; 218 - } 219 - 220 - /*H:660 221 - * With this, we have finished the Host. 222 - * 223 - * Five of the seven parts of our task are complete. You have made it through 224 - * the Bit of Despair (I think that's somewhere in the page table code, 225 - * myself). 226 - * 227 - * Next, we examine "make Switcher". It's short, but intense. 228 - */
-724
drivers/lguest/x86/core.c
··· 1 - /* 2 - * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 3 - * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI. 4 - * 5 - * This program is free software; you can redistribute it and/or modify 6 - * it under the terms of the GNU General Public License as published by 7 - * the Free Software Foundation; either version 2 of the License, or 8 - * (at your option) any later version. 9 - * 10 - * This program is distributed in the hope that it will be useful, but 11 - * WITHOUT ANY WARRANTY; without even the implied warranty of 12 - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 13 - * NON INFRINGEMENT. See the GNU General Public License for more 14 - * details. 15 - * 16 - * You should have received a copy of the GNU General Public License 17 - * along with this program; if not, write to the Free Software 18 - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 - */ 20 - /*P:450 21 - * This file contains the x86-specific lguest code. It used to be all 22 - * mixed in with drivers/lguest/core.c but several foolhardy code slashers 23 - * wrestled most of the dependencies out to here in preparation for porting 24 - * lguest to other architectures (see what I mean by foolhardy?). 25 - * 26 - * This also contains a couple of non-obvious setup and teardown pieces which 27 - * were implemented after days of debugging pain. 
28 - :*/ 29 - #include <linux/kernel.h> 30 - #include <linux/start_kernel.h> 31 - #include <linux/string.h> 32 - #include <linux/console.h> 33 - #include <linux/screen_info.h> 34 - #include <linux/irq.h> 35 - #include <linux/interrupt.h> 36 - #include <linux/clocksource.h> 37 - #include <linux/clockchips.h> 38 - #include <linux/cpu.h> 39 - #include <linux/lguest.h> 40 - #include <linux/lguest_launcher.h> 41 - #include <asm/paravirt.h> 42 - #include <asm/param.h> 43 - #include <asm/page.h> 44 - #include <asm/pgtable.h> 45 - #include <asm/desc.h> 46 - #include <asm/setup.h> 47 - #include <asm/lguest.h> 48 - #include <linux/uaccess.h> 49 - #include <asm/fpu/internal.h> 50 - #include <asm/tlbflush.h> 51 - #include "../lg.h" 52 - 53 - static int cpu_had_pge; 54 - 55 - static struct { 56 - unsigned long offset; 57 - unsigned short segment; 58 - } lguest_entry; 59 - 60 - /* Offset from where switcher.S was compiled to where we've copied it */ 61 - static unsigned long switcher_offset(void) 62 - { 63 - return switcher_addr - (unsigned long)start_switcher_text; 64 - } 65 - 66 - /* This cpu's struct lguest_pages (after the Switcher text page) */ 67 - static struct lguest_pages *lguest_pages(unsigned int cpu) 68 - { 69 - return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]); 70 - } 71 - 72 - static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); 73 - 74 - /*S:010 75 - * We approach the Switcher. 76 - * 77 - * Remember that each CPU has two pages which are visible to the Guest when it 78 - * runs on that CPU. This has to contain the state for that Guest: we copy the 79 - * state in just before we run the Guest. 80 - * 81 - * Each Guest has "changed" flags which indicate what has changed in the Guest 82 - * since it last ran. We saw this set in interrupts_and_traps.c and 83 - * segments.c. 84 - */ 85 - static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) 86 - { 87 - /* 88 - * Copying all this data can be quite expensive. 
We usually run the 89 - * same Guest we ran last time (and that Guest hasn't run anywhere else 90 - * meanwhile). If that's not the case, we pretend everything in the 91 - * Guest has changed. 92 - */ 93 - if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) { 94 - __this_cpu_write(lg_last_cpu, cpu); 95 - cpu->last_pages = pages; 96 - cpu->changed = CHANGED_ALL; 97 - } 98 - 99 - /* 100 - * These copies are pretty cheap, so we do them unconditionally: */ 101 - /* Save the current Host top-level page directory. 102 - */ 103 - pages->state.host_cr3 = __pa(current->mm->pgd); 104 - /* 105 - * Set up the Guest's page tables to see this CPU's pages (and no 106 - * other CPU's pages). 107 - */ 108 - map_switcher_in_guest(cpu, pages); 109 - /* 110 - * Set up the two "TSS" members which tell the CPU what stack to use 111 - * for traps which go directly into the Guest (i.e. traps at privilege 112 - * level 1). 113 - */ 114 - pages->state.guest_tss.sp1 = cpu->esp1; 115 - pages->state.guest_tss.ss1 = cpu->ss1; 116 - 117 - /* Copy direct-to-Guest trap entries. */ 118 - if (cpu->changed & CHANGED_IDT) 119 - copy_traps(cpu, pages->state.guest_idt, default_idt_entries); 120 - 121 - /* Copy all GDT entries which the Guest can change. */ 122 - if (cpu->changed & CHANGED_GDT) 123 - copy_gdt(cpu, pages->state.guest_gdt); 124 - /* If only the TLS entries have changed, copy them. */ 125 - else if (cpu->changed & CHANGED_GDT_TLS) 126 - copy_gdt_tls(cpu, pages->state.guest_gdt); 127 - 128 - /* Mark the Guest as unchanged for next time. */ 129 - cpu->changed = 0; 130 - } 131 - 132 - /* Finally: the code to actually call into the Switcher to run the Guest. */ 133 - static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) 134 - { 135 - /* This is a dummy value we need for GCC's sake. */ 136 - unsigned int clobber; 137 - 138 - /* 139 - * Copy the guest-specific information into this CPU's "struct 140 - * lguest_pages". 
141 - */ 142 - copy_in_guest_info(cpu, pages); 143 - 144 - /* 145 - * Set the trap number to 256 (impossible value). If we fault while 146 - * switching to the Guest (bad segment registers or bug), this will 147 - * cause us to abort the Guest. 148 - */ 149 - cpu->regs->trapnum = 256; 150 - 151 - /* 152 - * Now: we push the "eflags" register on the stack, then do an "lcall". 153 - * This is how we change from using the kernel code segment to using 154 - * the dedicated lguest code segment, as well as jumping into the 155 - * Switcher. 156 - * 157 - * The lcall also pushes the old code segment (KERNEL_CS) onto the 158 - * stack, then the address of this call. This stack layout happens to 159 - * exactly match the stack layout created by an interrupt... 160 - */ 161 - asm volatile("pushf; lcall *%4" 162 - /* 163 - * This is how we tell GCC that %eax ("a") and %ebx ("b") 164 - * are changed by this routine. The "=" means output. 165 - */ 166 - : "=a"(clobber), "=b"(clobber) 167 - /* 168 - * %eax contains the pages pointer. ("0" refers to the 169 - * 0-th argument above, ie "a"). %ebx contains the 170 - * physical address of the Guest's top-level page 171 - * directory. 172 - */ 173 - : "0"(pages), 174 - "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)), 175 - "m"(lguest_entry) 176 - /* 177 - * We tell gcc that all these registers could change, 178 - * which means we don't have to save and restore them in 179 - * the Switcher. 
180 - */ 181 - : "memory", "%edx", "%ecx", "%edi", "%esi"); 182 - } 183 - /*:*/ 184 - 185 - unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any) 186 - { 187 - switch (reg_off) { 188 - case offsetof(struct pt_regs, bx): 189 - return &cpu->regs->ebx; 190 - case offsetof(struct pt_regs, cx): 191 - return &cpu->regs->ecx; 192 - case offsetof(struct pt_regs, dx): 193 - return &cpu->regs->edx; 194 - case offsetof(struct pt_regs, si): 195 - return &cpu->regs->esi; 196 - case offsetof(struct pt_regs, di): 197 - return &cpu->regs->edi; 198 - case offsetof(struct pt_regs, bp): 199 - return &cpu->regs->ebp; 200 - case offsetof(struct pt_regs, ax): 201 - return &cpu->regs->eax; 202 - case offsetof(struct pt_regs, ip): 203 - return &cpu->regs->eip; 204 - case offsetof(struct pt_regs, sp): 205 - return &cpu->regs->esp; 206 - } 207 - 208 - /* Launcher can read these, but we don't allow any setting. */ 209 - if (any) { 210 - switch (reg_off) { 211 - case offsetof(struct pt_regs, ds): 212 - return &cpu->regs->ds; 213 - case offsetof(struct pt_regs, es): 214 - return &cpu->regs->es; 215 - case offsetof(struct pt_regs, fs): 216 - return &cpu->regs->fs; 217 - case offsetof(struct pt_regs, gs): 218 - return &cpu->regs->gs; 219 - case offsetof(struct pt_regs, cs): 220 - return &cpu->regs->cs; 221 - case offsetof(struct pt_regs, flags): 222 - return &cpu->regs->eflags; 223 - case offsetof(struct pt_regs, ss): 224 - return &cpu->regs->ss; 225 - } 226 - } 227 - 228 - return NULL; 229 - } 230 - 231 - /*M:002 232 - * There are hooks in the scheduler which we can register to tell when we 233 - * get kicked off the CPU (preempt_notifier_register()). This would allow us 234 - * to lazily disable SYSENTER which would regain some performance, and should 235 - * also simplify copy_in_guest_info(). Note that we'd still need to restore 236 - * things when we exit to Launcher userspace, but that's fairly easy. 
237 - * 238 - * We could also try using these hooks for PGE, but that might be too expensive. 239 - * 240 - * The hooks were designed for KVM, but we can also put them to good use. 241 - :*/ 242 - 243 - /*H:040 244 - * This is the i386-specific code to setup and run the Guest. Interrupts 245 - * are disabled: we own the CPU. 246 - */ 247 - void lguest_arch_run_guest(struct lg_cpu *cpu) 248 - { 249 - /* 250 - * SYSENTER is an optimized way of doing system calls. We can't allow 251 - * it because it always jumps to privilege level 0. A normal Guest 252 - * won't try it because we don't advertise it in CPUID, but a malicious 253 - * Guest (or malicious Guest userspace program) could, so we tell the 254 - * CPU to disable it before running the Guest. 255 - */ 256 - if (boot_cpu_has(X86_FEATURE_SEP)) 257 - wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 258 - 259 - /* 260 - * Now we actually run the Guest. It will return when something 261 - * interesting happens, and we can examine its registers to see what it 262 - * was doing. 263 - */ 264 - run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); 265 - 266 - /* 267 - * Note that the "regs" structure contains two extra entries which are 268 - * not really registers: a trap number which says what interrupt or 269 - * trap made the switcher code come back, and an error code which some 270 - * traps set. 271 - */ 272 - 273 - /* Restore SYSENTER if it's supposed to be on. */ 274 - if (boot_cpu_has(X86_FEATURE_SEP)) 275 - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 276 - 277 - /* 278 - * If the Guest page faulted, then the cr2 register will tell us the 279 - * bad virtual address. We have to grab this now, because once we 280 - * re-enable interrupts an interrupt could fault and thus overwrite 281 - * cr2, or we could even move off to a different CPU. 
282 - */ 283 - if (cpu->regs->trapnum == 14) 284 - cpu->arch.last_pagefault = read_cr2(); 285 - /* 286 - * Similarly, if we took a trap because the Guest used the FPU, 287 - * we have to restore the FPU it expects to see. 288 - * fpu__restore() may sleep and we may even move off to 289 - * a different CPU. So all the critical stuff should be done 290 - * before this. 291 - */ 292 - else if (cpu->regs->trapnum == 7 && !fpregs_active()) 293 - fpu__restore(&current->thread.fpu); 294 - } 295 - 296 - /*H:130 297 - * Now we've examined the hypercall code; our Guest can make requests. 298 - * Our Guest is usually so well behaved; it never tries to do things it isn't 299 - * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual 300 - * infrastructure isn't quite complete, because it doesn't contain replacements 301 - * for the Intel I/O instructions. As a result, the Guest sometimes fumbles 302 - * across one during the boot process as it probes for various things which are 303 - * usually attached to a PC. 304 - * 305 - * When the Guest uses one of these instructions, we get a trap (General 306 - * Protection Fault) and come here. We queue this to be sent out to the 307 - * Launcher to handle. 308 - */ 309 - 310 - /* 311 - * The eip contains the *virtual* address of the Guest's instruction: 312 - * we copy the instruction here so the Launcher doesn't have to walk 313 - * the page tables to decode it. We handle the case (eg. in a kernel 314 - * module) where the instruction is over two pages, and the pages are 315 - * virtually but not physically contiguous. 316 - * 317 - * The longest possible x86 instruction is 15 bytes, but we don't handle 318 - * anything that strange. 
319 - */ 320 - static void copy_from_guest(struct lg_cpu *cpu, 321 - void *dst, unsigned long vaddr, size_t len) 322 - { 323 - size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE); 324 - unsigned long paddr; 325 - 326 - BUG_ON(len > PAGE_SIZE); 327 - 328 - /* If it goes over a page, copy in two parts. */ 329 - if (len > to_page_end) { 330 - /* But make sure the next page is mapped! */ 331 - if (__guest_pa(cpu, vaddr + to_page_end, &paddr)) 332 - copy_from_guest(cpu, dst + to_page_end, 333 - vaddr + to_page_end, 334 - len - to_page_end); 335 - else 336 - /* Otherwise fill with zeroes. */ 337 - memset(dst + to_page_end, 0, len - to_page_end); 338 - len = to_page_end; 339 - } 340 - 341 - /* This will kill the guest if it isn't mapped, but that 342 - * shouldn't happen. */ 343 - __lgread(cpu, dst, guest_pa(cpu, vaddr), len); 344 - } 345 - 346 - 347 - static void setup_emulate_insn(struct lg_cpu *cpu) 348 - { 349 - cpu->pending.trap = 13; 350 - copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, 351 - sizeof(cpu->pending.insn)); 352 - } 353 - 354 - static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr) 355 - { 356 - cpu->pending.trap = 14; 357 - cpu->pending.addr = iomem_addr; 358 - copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, 359 - sizeof(cpu->pending.insn)); 360 - } 361 - 362 - /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ 363 - void lguest_arch_handle_trap(struct lg_cpu *cpu) 364 - { 365 - unsigned long iomem_addr; 366 - 367 - switch (cpu->regs->trapnum) { 368 - case 13: /* We've intercepted a General Protection Fault. */ 369 - /* Hand to Launcher to emulate those pesky IN and OUT insns */ 370 - if (cpu->regs->errcode == 0) { 371 - setup_emulate_insn(cpu); 372 - return; 373 - } 374 - break; 375 - case 14: /* We've intercepted a Page Fault. */ 376 - /* 377 - * The Guest accessed a virtual address that wasn't mapped. 
378 - * This happens a lot: we don't actually set up most of the page 379 - * tables for the Guest at all when we start: as it runs it asks 380 - * for more and more, and we set them up as required. In this 381 - * case, we don't even tell the Guest that the fault happened. 382 - * 383 - * The errcode tells whether this was a read or a write, and 384 - * whether kernel or userspace code. 385 - */ 386 - if (demand_page(cpu, cpu->arch.last_pagefault, 387 - cpu->regs->errcode, &iomem_addr)) 388 - return; 389 - 390 - /* Was this an access to memory mapped IO? */ 391 - if (iomem_addr) { 392 - /* Tell Launcher, let it handle it. */ 393 - setup_iomem_insn(cpu, iomem_addr); 394 - return; 395 - } 396 - 397 - /* 398 - * OK, it's really not there (or not OK): the Guest needs to 399 - * know. We write out the cr2 value so it knows where the 400 - * fault occurred. 401 - * 402 - * Note that if the Guest were really messed up, this could 403 - * happen before it's done the LHCALL_LGUEST_INIT hypercall, so 404 - * lg->lguest_data could be NULL 405 - */ 406 - if (cpu->lg->lguest_data && 407 - put_user(cpu->arch.last_pagefault, 408 - &cpu->lg->lguest_data->cr2)) 409 - kill_guest(cpu, "Writing cr2"); 410 - break; 411 - case 7: /* We've intercepted a Device Not Available fault. */ 412 - /* No special handling is needed here. */ 413 - break; 414 - case 32 ... 255: 415 - /* This might be a syscall. */ 416 - if (could_be_syscall(cpu->regs->trapnum)) 417 - break; 418 - 419 - /* 420 - * Other values mean a real interrupt occurred, in which case 421 - * the Host handler has already been run. We just do a 422 - * friendly check if another process should now be run, then 423 - * return to run the Guest again. 424 - */ 425 - cond_resched(); 426 - return; 427 - case LGUEST_TRAP_ENTRY: 428 - /* 429 - * Our 'struct hcall_args' maps directly over our regs: we set 430 - * up the pointer now to indicate a hypercall is pending. 
431 - */ 432 - cpu->hcall = (struct hcall_args *)cpu->regs; 433 - return; 434 - } 435 - 436 - /* We didn't handle the trap, so it needs to go to the Guest. */ 437 - if (!deliver_trap(cpu, cpu->regs->trapnum)) 438 - /* 439 - * If the Guest doesn't have a handler (either it hasn't 440 - * registered any yet, or it's one of the faults we don't let 441 - * it handle), it dies with this cryptic error message. 442 - */ 443 - kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", 444 - cpu->regs->trapnum, cpu->regs->eip, 445 - cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault 446 - : cpu->regs->errcode); 447 - } 448 - 449 - /* 450 - * Now we can look at each of the routines this calls, in increasing order of 451 - * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), 452 - * deliver_trap() and demand_page(). After all those, we'll be ready to 453 - * examine the Switcher, and our philosophical understanding of the Host/Guest 454 - * duality will be complete. 455 - :*/ 456 - static void adjust_pge(void *on) 457 - { 458 - if (on) 459 - cr4_set_bits(X86_CR4_PGE); 460 - else 461 - cr4_clear_bits(X86_CR4_PGE); 462 - } 463 - 464 - /*H:020 465 - * Now the Switcher is mapped and every thing else is ready, we need to do 466 - * some more i386-specific initialization. 467 - */ 468 - void __init lguest_arch_host_init(void) 469 - { 470 - int i; 471 - 472 - /* 473 - * Most of the x86/switcher_32.S doesn't care that it's been moved; on 474 - * Intel, jumps are relative, and it doesn't access any references to 475 - * external code or data. 476 - * 477 - * The only exception is the interrupt handlers in switcher.S: their 478 - * addresses are placed in a table (default_idt_entries), so we need to 479 - * update the table with the new addresses. switcher_offset() is a 480 - * convenience function which returns the distance between the 481 - * compiled-in switcher code and the high-mapped copy we just made. 
482 - */ 483 - for (i = 0; i < IDT_ENTRIES; i++) 484 - default_idt_entries[i] += switcher_offset(); 485 - 486 - /* 487 - * Set up the Switcher's per-cpu areas. 488 - * 489 - * Each CPU gets two pages of its own within the high-mapped region 490 - * (aka. "struct lguest_pages"). Much of this can be initialized now, 491 - * but some depends on what Guest we are running (which is set up in 492 - * copy_in_guest_info()). 493 - */ 494 - for_each_possible_cpu(i) { 495 - /* lguest_pages() returns this CPU's two pages. */ 496 - struct lguest_pages *pages = lguest_pages(i); 497 - /* This is a convenience pointer to make the code neater. */ 498 - struct lguest_ro_state *state = &pages->state; 499 - 500 - /* 501 - * The Global Descriptor Table: the Host has a different one 502 - * for each CPU. We keep a descriptor for the GDT which says 503 - * where it is and how big it is (the size is actually the last 504 - * byte, not the size, hence the "-1"). 505 - */ 506 - state->host_gdt_desc.size = GDT_SIZE-1; 507 - state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i); 508 - 509 - /* 510 - * All CPUs on the Host use the same Interrupt Descriptor 511 - * Table, so we just use store_idt(), which gets this CPU's IDT 512 - * descriptor. 513 - */ 514 - store_idt(&state->host_idt_desc); 515 - 516 - /* 517 - * The descriptors for the Guest's GDT and IDT can be filled 518 - * out now, too. We copy the GDT & IDT into ->guest_gdt and 519 - * ->guest_idt before actually running the Guest. 520 - */ 521 - state->guest_idt_desc.size = sizeof(state->guest_idt)-1; 522 - state->guest_idt_desc.address = (long)&state->guest_idt; 523 - state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; 524 - state->guest_gdt_desc.address = (long)&state->guest_gdt; 525 - 526 - /* 527 - * We know where we want the stack to be when the Guest enters 528 - * the Switcher: in pages->regs. The stack grows downwards, so 529 - * we start it at the end of that structure. 
530 - */ 531 - state->guest_tss.sp0 = (long)(&pages->regs + 1); 532 - /* 533 - * And this is the GDT entry to use for the stack: we keep a 534 - * couple of special LGUEST entries. 535 - */ 536 - state->guest_tss.ss0 = LGUEST_DS; 537 - 538 - /* 539 - * x86 can have a finegrained bitmap which indicates what I/O 540 - * ports the process can use. We set it to the end of our 541 - * structure, meaning "none". 542 - */ 543 - state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); 544 - 545 - /* 546 - * Some GDT entries are the same across all Guests, so we can 547 - * set them up now. 548 - */ 549 - setup_default_gdt_entries(state); 550 - /* Most IDT entries are the same for all Guests, too.*/ 551 - setup_default_idt_entries(state, default_idt_entries); 552 - 553 - /* 554 - * The Host needs to be able to use the LGUEST segments on this 555 - * CPU, too, so put them in the Host GDT. 556 - */ 557 - get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 558 - get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 559 - } 560 - 561 - /* 562 - * In the Switcher, we want the %cs segment register to use the 563 - * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so 564 - * it will be undisturbed when we switch. To change %cs and jump we 565 - * need this structure to feed to Intel's "lcall" instruction. 566 - */ 567 - lguest_entry.offset = (long)switch_to_guest + switcher_offset(); 568 - lguest_entry.segment = LGUEST_CS; 569 - 570 - /* 571 - * Finally, we need to turn off "Page Global Enable". PGE is an 572 - * optimization where page table entries are specially marked to show 573 - * they never change. The Host kernel marks all the kernel pages this 574 - * way because it's always present, even when userspace is running. 575 - * 576 - * Lguest breaks this: unbeknownst to the rest of the Host kernel, we 577 - * switch to the Guest kernel. If you don't disable this on all CPUs, 578 - * you'll get really weird bugs that you'll chase for two days. 
579 - * 580 - * I used to turn PGE off every time we switched to the Guest and back 581 - * on when we return, but that slowed the Switcher down noticeably. 582 - */ 583 - 584 - /* 585 - * We don't need the complexity of CPUs coming and going while we're 586 - * doing this. 587 - */ 588 - get_online_cpus(); 589 - if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */ 590 - /* Remember that this was originally set (for cleanup). */ 591 - cpu_had_pge = 1; 592 - /* 593 - * adjust_pge is a helper function which sets or unsets the PGE 594 - * bit on its CPU, depending on the argument (0 == unset). 595 - */ 596 - on_each_cpu(adjust_pge, (void *)0, 1); 597 - /* Turn off the feature in the global feature set. */ 598 - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); 599 - } 600 - put_online_cpus(); 601 - } 602 - /*:*/ 603 - 604 - void __exit lguest_arch_host_fini(void) 605 - { 606 - /* If we had PGE before we started, turn it back on now. */ 607 - get_online_cpus(); 608 - if (cpu_had_pge) { 609 - set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); 610 - /* adjust_pge's argument "1" means set PGE. */ 611 - on_each_cpu(adjust_pge, (void *)1, 1); 612 - } 613 - put_online_cpus(); 614 - } 615 - 616 - 617 - /*H:122 The i386-specific hypercalls simply farm out to the right functions. */ 618 - int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args) 619 - { 620 - switch (args->arg0) { 621 - case LHCALL_LOAD_GDT_ENTRY: 622 - load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3); 623 - break; 624 - case LHCALL_LOAD_IDT_ENTRY: 625 - load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3); 626 - break; 627 - case LHCALL_LOAD_TLS: 628 - guest_load_tls(cpu, args->arg1); 629 - break; 630 - default: 631 - /* Bad Guest. Bad! 
*/ 632 - return -EIO; 633 - } 634 - return 0; 635 - } 636 - 637 - /*H:126 i386-specific hypercall initialization: */ 638 - int lguest_arch_init_hypercalls(struct lg_cpu *cpu) 639 - { 640 - u32 tsc_speed; 641 - 642 - /* 643 - * The pointer to the Guest's "struct lguest_data" is the only argument. 644 - * We check that address now. 645 - */ 646 - if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, 647 - sizeof(*cpu->lg->lguest_data))) 648 - return -EFAULT; 649 - 650 - /* 651 - * Having checked it, we simply set lg->lguest_data to point straight 652 - * into the Launcher's memory at the right place and then use 653 - * copy_to_user/from_user from now on, instead of lgread/write. I put 654 - * this in to show that I'm not immune to writing stupid 655 - * optimizations. 656 - */ 657 - cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; 658 - 659 - /* 660 - * We insist that the Time Stamp Counter exist and doesn't change with 661 - * cpu frequency. Some devious chip manufacturers decided that TSC 662 - * changes could be handled in software. I decided that time going 663 - * backwards might be good for benchmarks, but it's bad for users. 664 - * 665 - * We also insist that the TSC be stable: the kernel detects unreliable 666 - * TSCs for its own purposes, and we use that here. 667 - */ 668 - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) 669 - tsc_speed = tsc_khz; 670 - else 671 - tsc_speed = 0; 672 - if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz)) 673 - return -EFAULT; 674 - 675 - /* The interrupt code might not like the system call vector. */ 676 - if (!check_syscall_vector(cpu->lg)) 677 - kill_guest(cpu, "bad syscall vector"); 678 - 679 - return 0; 680 - } 681 - /*:*/ 682 - 683 - /*L:030 684 - * Most of the Guest's registers are left alone: we used get_zeroed_page() to 685 - * allocate the structure, so they will be 0. 
686 - */ 687 - void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) 688 - { 689 - struct lguest_regs *regs = cpu->regs; 690 - 691 - /* 692 - * There are four "segment" registers which the Guest needs to boot: 693 - * The "code segment" register (cs) refers to the kernel code segment 694 - * __KERNEL_CS, and the "data", "extra" and "stack" segment registers 695 - * refer to the kernel data segment __KERNEL_DS. 696 - * 697 - * The privilege level is packed into the lower bits. The Guest runs 698 - * at privilege level 1 (GUEST_PL). 699 - */ 700 - regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; 701 - regs->cs = __KERNEL_CS|GUEST_PL; 702 - 703 - /* 704 - * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) 705 - * is supposed to always be "1". Bit 9 (0x200) controls whether 706 - * interrupts are enabled. We always leave interrupts enabled while 707 - * running the Guest. 708 - */ 709 - regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; 710 - 711 - /* 712 - * The "Extended Instruction Pointer" register says where the Guest is 713 - * running. 714 - */ 715 - regs->eip = start; 716 - 717 - /* 718 - * %esi points to our boot information, at physical address 0, so don't 719 - * touch it. 720 - */ 721 - 722 - /* There are a couple of GDT entries the Guest expects at boot. */ 723 - setup_guest_gdt(cpu); 724 - }
-388
drivers/lguest/x86/switcher_32.S
··· 1 - /*P:900 2 - * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride 3 - * both the Host and Guest to do the low-level Guest<->Host switch. It is as 4 - * simple as it can be made, but it's naturally very specific to x86. 5 - * 6 - * You have now completed Preparation. If this has whet your appetite; if you 7 - * are feeling invigorated and refreshed then the next, more challenging stage 8 - * can be found in "make Guest". 9 - :*/ 10 - 11 - /*M:012 12 - * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must 13 - * gain at least 1% more performance. Since neither LOC nor performance can be 14 - * measured beforehand, it generally means implementing a feature then deciding 15 - * if it's worth it. And once it's implemented, who can say no? 16 - * 17 - * This is why I haven't implemented this idea myself. I want to, but I 18 - * haven't. You could, though. 19 - * 20 - * The main place where lguest performance sucks is Guest page faulting. When 21 - * a Guest userspace process hits an unmapped page we switch back to the Host, 22 - * walk the page tables, find it's not mapped, switch back to the Guest page 23 - * fault handler, which calls a hypercall to set the page table entry, then 24 - * finally returns to userspace. That's two round-trips. 25 - * 26 - * If we had a small walker in the Switcher, we could quickly check the Guest 27 - * page table and if the page isn't mapped, immediately reflect the fault back 28 - * into the Guest. This means the Switcher would have to know the top of the 29 - * Guest page table and the page fault handler address. 30 - * 31 - * For simplicity, the Guest should only handle the case where the privilege 32 - * level of the fault is 3 and probably only not present or write faults. It 33 - * should also detect recursive faults, and hand the original fault to the 34 - * Host (which is actually really easy). 35 - * 36 - * Two questions remain. 
Would the performance gain outweigh the complexity? 37 - * And who would write the verse documenting it? 38 - :*/ 39 - 40 - /*M:011 41 - * Lguest64 handles NMI. This gave me NMI envy (until I looked at their 42 - * code). It's worth doing though, since it would let us use oprofile in the 43 - * Host when a Guest is running. 44 - :*/ 45 - 46 - /*S:100 47 - * Welcome to the Switcher itself! 48 - * 49 - * This file contains the low-level code which changes the CPU to run the Guest 50 - * code, and returns to the Host when something happens. Understand this, and 51 - * you understand the heart of our journey. 52 - * 53 - * Because this is in assembler rather than C, our tale switches from prose to 54 - * verse. First I tried limericks: 55 - * 56 - * There once was an eax reg, 57 - * To which our pointer was fed, 58 - * It needed an add, 59 - * Which asm-offsets.h had 60 - * But this limerick is hurting my head. 61 - * 62 - * Next I tried haikus, but fitting the required reference to the seasons in 63 - * every stanza was quickly becoming tiresome: 64 - * 65 - * The %eax reg 66 - * Holds "struct lguest_pages" now: 67 - * Cherry blossoms fall. 68 - * 69 - * Then I started with Heroic Verse, but the rhyming requirement leeched away 70 - * the content density and led to some uniquely awful oblique rhymes: 71 - * 72 - * These constants are coming from struct offsets 73 - * For use within the asm switcher text. 74 - * 75 - * Finally, I settled for something between heroic hexameter, and normal prose 76 - * with inappropriate linebreaks. Anyway, it aint no Shakespeare. 77 - */ 78 - 79 - // Not all kernel headers work from assembler 80 - // But these ones are needed: the ENTRY() define 81 - // And constants extracted from struct offsets 82 - // To avoid magic numbers and breakage: 83 - // Should they change the compiler can't save us 84 - // Down here in the depths of assembler code. 
85 - #include <linux/linkage.h> 86 - #include <asm/asm-offsets.h> 87 - #include <asm/page.h> 88 - #include <asm/segment.h> 89 - #include <asm/lguest.h> 90 - 91 - // We mark the start of the code to copy 92 - // It's placed in .text tho it's never run here 93 - // You'll see the trick macro at the end 94 - // Which interleaves data and text to effect. 95 - .text 96 - ENTRY(start_switcher_text) 97 - 98 - // When we reach switch_to_guest we have just left 99 - // The safe and comforting shores of C code 100 - // %eax has the "struct lguest_pages" to use 101 - // Where we save state and still see it from the Guest 102 - // And %ebx holds the Guest shadow pagetable: 103 - // Once set we have truly left Host behind. 104 - ENTRY(switch_to_guest) 105 - // We told gcc all its regs could fade, 106 - // Clobbered by our journey into the Guest 107 - // We could have saved them, if we tried 108 - // But time is our master and cycles count. 109 - 110 - // Segment registers must be saved for the Host 111 - // We push them on the Host stack for later 112 - pushl %es 113 - pushl %ds 114 - pushl %gs 115 - pushl %fs 116 - // But the compiler is fickle, and heeds 117 - // No warning of %ebp clobbers 118 - // When frame pointers are used. That register 119 - // Must be saved and restored or chaos strikes. 120 - pushl %ebp 121 - // The Host's stack is done, now save it away 122 - // In our "struct lguest_pages" at offset 123 - // Distilled into asm-offsets.h 124 - movl %esp, LGUEST_PAGES_host_sp(%eax) 125 - 126 - // All saved and there's now five steps before us: 127 - // Stack, GDT, IDT, TSS 128 - // Then last of all the page tables are flipped. 129 - 130 - // Yet beware that our stack pointer must be 131 - // Always valid lest an NMI hits 132 - // %edx does the duty here as we juggle 133 - // %eax is lguest_pages: our stack lies within. 
134 - movl %eax, %edx 135 - addl $LGUEST_PAGES_regs, %edx 136 - movl %edx, %esp 137 - 138 - // The Guest's GDT we so carefully 139 - // Placed in the "struct lguest_pages" before 140 - lgdt LGUEST_PAGES_guest_gdt_desc(%eax) 141 - 142 - // The Guest's IDT we did partially 143 - // Copy to "struct lguest_pages" as well. 144 - lidt LGUEST_PAGES_guest_idt_desc(%eax) 145 - 146 - // The TSS entry which controls traps 147 - // Must be loaded up with "ltr" now: 148 - // The GDT entry that TSS uses 149 - // Changes type when we load it: damn Intel! 150 - // For after we switch over our page tables 151 - // That entry will be read-only: we'd crash. 152 - movl $(GDT_ENTRY_TSS*8), %edx 153 - ltr %dx 154 - 155 - // Look back now, before we take this last step! 156 - // The Host's TSS entry was also marked used; 157 - // Let's clear it again for our return. 158 - // The GDT descriptor of the Host 159 - // Points to the table after two "size" bytes 160 - movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx 161 - // Clear "used" from type field (byte 5, bit 2) 162 - andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) 163 - 164 - // Once our page table's switched, the Guest is live! 165 - // The Host fades as we run this final step. 166 - // Our "struct lguest_pages" is now read-only. 167 - movl %ebx, %cr3 168 - 169 - // The page table change did one tricky thing: 170 - // The Guest's register page has been mapped 171 - // Writable under our %esp (stack) -- 172 - // We can simply pop off all Guest regs. 173 - popl %eax 174 - popl %ebx 175 - popl %ecx 176 - popl %edx 177 - popl %esi 178 - popl %edi 179 - popl %ebp 180 - popl %gs 181 - popl %fs 182 - popl %ds 183 - popl %es 184 - 185 - // Near the base of the stack lurk two strange fields 186 - // Which we fill as we exit the Guest 187 - // These are the trap number and its error 188 - // We can simply step past them on our way. 
189 - addl $8, %esp 190 - 191 - // The last five stack slots hold return address 192 - // And everything needed to switch privilege 193 - // From Switcher's level 0 to Guest's 1, 194 - // And the stack where the Guest had last left it. 195 - // Interrupts are turned back on: we are Guest. 196 - iret 197 - 198 - // We tread two paths to switch back to the Host 199 - // Yet both must save Guest state and restore Host 200 - // So we put the routine in a macro. 201 - #define SWITCH_TO_HOST \ 202 - /* We save the Guest state: all registers first \ 203 - * Laid out just as "struct lguest_regs" defines */ \ 204 - pushl %es; \ 205 - pushl %ds; \ 206 - pushl %fs; \ 207 - pushl %gs; \ 208 - pushl %ebp; \ 209 - pushl %edi; \ 210 - pushl %esi; \ 211 - pushl %edx; \ 212 - pushl %ecx; \ 213 - pushl %ebx; \ 214 - pushl %eax; \ 215 - /* Our stack and our code are using segments \ 216 - * Set in the TSS and IDT \ 217 - * Yet if we were to touch data we'd use \ 218 - * Whatever data segment the Guest had. \ 219 - * Load the lguest ds segment for now. */ \ 220 - movl $(LGUEST_DS), %eax; \ 221 - movl %eax, %ds; \ 222 - /* So where are we? Which CPU, which struct? \ 223 - * The stack is our clue: our TSS starts \ 224 - * It at the end of "struct lguest_pages". \ 225 - * Or we may have stumbled while restoring \ 226 - * Our Guest segment regs while in switch_to_guest, \ 227 - * The fault pushed atop that part-unwound stack. \ 228 - * If we round the stack down to the page start \ 229 - * We're at the start of "struct lguest_pages". */ \ 230 - movl %esp, %eax; \ 231 - andl $(~(1 << PAGE_SHIFT - 1)), %eax; \ 232 - /* Save our trap number: the switch will obscure it \ 233 - * (In the Host the Guest regs are not mapped here) \ 234 - * %ebx holds it safe for deliver_to_host */ \ 235 - movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ 236 - /* The Host GDT, IDT and stack! 
\ 237 - * All these lie safely hidden from the Guest: \ 238 - * We must return to the Host page tables \ 239 - * (Hence that was saved in struct lguest_pages) */ \ 240 - movl LGUEST_PAGES_host_cr3(%eax), %edx; \ 241 - movl %edx, %cr3; \ 242 - /* As before, when we looked back at the Host \ 243 - * As we left and marked TSS unused \ 244 - * So must we now for the Guest left behind. */ \ 245 - andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \ 246 - /* Switch to Host's GDT, IDT. */ \ 247 - lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ 248 - lidt LGUEST_PAGES_host_idt_desc(%eax); \ 249 - /* Restore the Host's stack where its saved regs lie */ \ 250 - movl LGUEST_PAGES_host_sp(%eax), %esp; \ 251 - /* Last the TSS: our Host is returned */ \ 252 - movl $(GDT_ENTRY_TSS*8), %edx; \ 253 - ltr %dx; \ 254 - /* Restore now the regs saved right at the first. */ \ 255 - popl %ebp; \ 256 - popl %fs; \ 257 - popl %gs; \ 258 - popl %ds; \ 259 - popl %es 260 - 261 - // The first path is trod when the Guest has trapped: 262 - // (Which trap it was has been pushed on the stack). 263 - // We need only switch back, and the Host will decode 264 - // Why we came home, and what needs to be done. 265 - return_to_host: 266 - SWITCH_TO_HOST 267 - iret 268 - 269 - // We are lead to the second path like so: 270 - // An interrupt, with some cause external 271 - // Has ajerked us rudely from the Guest's code 272 - // Again we must return home to the Host 273 - deliver_to_host: 274 - SWITCH_TO_HOST 275 - // But now we must go home via that place 276 - // Where that interrupt was supposed to go 277 - // Had we not been ensconced, running the Guest. 278 - // Here we see the trickness of run_guest_once(): 279 - // The Host stack is formed like an interrupt 280 - // With EIP, CS and EFLAGS layered. 281 - // Interrupt handlers end with "iret" 282 - // And that will take us home at long long last. 283 - 284 - // But first we must find the handler to call! 
285 - // The IDT descriptor for the Host 286 - // Has two bytes for size, and four for address: 287 - // %edx will hold it for us for now. 288 - movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx 289 - // We now know the table address we need, 290 - // And saved the trap's number inside %ebx. 291 - // Yet the pointer to the handler is smeared 292 - // Across the bits of the table entry. 293 - // What oracle can tell us how to extract 294 - // From such a convoluted encoding? 295 - // I consulted gcc, and it gave 296 - // These instructions, which I gladly credit: 297 - leal (%edx,%ebx,8), %eax 298 - movzwl (%eax),%edx 299 - movl 4(%eax), %eax 300 - xorw %ax, %ax 301 - orl %eax, %edx 302 - // Now the address of the handler's in %edx 303 - // We call it now: its "iret" drops us home. 304 - jmp *%edx 305 - 306 - // Every interrupt can come to us here 307 - // But we must truly tell each apart. 308 - // They number two hundred and fifty six 309 - // And each must land in a different spot, 310 - // Push its number on stack, and join the stream. 311 - 312 - // And worse, a mere six of the traps stand apart 313 - // And push on their stack an addition: 314 - // An error number, thirty two bits long 315 - // So we punish the other two fifty 316 - // And make them push a zero so they match. 317 - 318 - // Yet two fifty six entries is long 319 - // And all will look most the same as the last 320 - // So we create a macro which can make 321 - // As many entries as we need to fill. 322 - 323 - // Note the change to .data then .text: 324 - // We plant the address of each entry 325 - // Into a (data) table for the Host 326 - // To know where each Guest interrupt should go. 327 - .macro IRQ_STUB N TARGET 328 - .data; .long 1f; .text; 1: 329 - // Trap eight, ten through fourteen and seventeen 330 - // Supply an error number. Else zero. 
331 - .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) 332 - pushl $0 333 - .endif 334 - pushl $\N 335 - jmp \TARGET 336 - ALIGN 337 - .endm 338 - 339 - // This macro creates numerous entries 340 - // Using GAS macros which out-power C's. 341 - .macro IRQ_STUBS FIRST LAST TARGET 342 - irq=\FIRST 343 - .rept \LAST-\FIRST+1 344 - IRQ_STUB irq \TARGET 345 - irq=irq+1 346 - .endr 347 - .endm 348 - 349 - // Here's the marker for our pointer table 350 - // Laid in the data section just before 351 - // Each macro places the address of code 352 - // Forming an array: each one points to text 353 - // Which handles interrupt in its turn. 354 - .data 355 - .global default_idt_entries 356 - default_idt_entries: 357 - .text 358 - // The first two traps go straight back to the Host 359 - IRQ_STUBS 0 1 return_to_host 360 - // We'll say nothing, yet, about NMI 361 - IRQ_STUB 2 handle_nmi 362 - // Other traps also return to the Host 363 - IRQ_STUBS 3 31 return_to_host 364 - // All interrupts go via their handlers 365 - IRQ_STUBS 32 127 deliver_to_host 366 - // 'Cept system calls coming from userspace 367 - // Are to go to the Guest, never the Host. 368 - IRQ_STUB 128 return_to_host 369 - IRQ_STUBS 129 255 deliver_to_host 370 - 371 - // The NMI, what a fabulous beast 372 - // Which swoops in and stops us no matter that 373 - // We're suspended between heaven and hell, 374 - // (Or more likely between the Host and Guest) 375 - // When in it comes! We are dazed and confused 376 - // So we do the simplest thing which one can. 377 - // Though we've pushed the trap number and zero 378 - // We discard them, return, and hope we live. 379 - handle_nmi: 380 - addl $8, %esp 381 - iret 382 - 383 - // We are done; all that's left is Mastery 384 - // And "make Mastery" is a journey long 385 - // Designed to make your fingers itch to code. 386 - 387 - // Here ends the text, the file and poem. 388 - ENTRY(end_switcher_text)
+1 -1
drivers/net/Kconfig
··· 333 333 depends on VIRTIO 334 334 ---help--- 335 335 This is the virtual network driver for virtio. It can be used with 336 - lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. 336 + QEMU based VMMs (like KVM or Xen). Say Y or M. 337 337 338 338 config NLMON 339 339 tristate "Virtual netlink monitoring device"
+1 -1
drivers/tty/hvc/Kconfig
··· 4 4 bool 5 5 help 6 6 Generic "hypervisor virtual console" infrastructure for various 7 - hypervisors (pSeries, iSeries, Xen, lguest). 7 + hypervisors (pSeries, iSeries, Xen). 8 8 It will automatically be selected if one of the back-end console drivers 9 9 is selected. 10 10
+2 -2
drivers/virtio/Kconfig
··· 2 2 tristate 3 3 ---help--- 4 4 This option is selected by any driver which implements the virtio 5 - bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST, 6 - CONFIG_RPMSG or CONFIG_S390_GUEST. 5 + bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG 6 + or CONFIG_S390_GUEST. 7 7 8 8 menu "Virtio drivers" 9 9
-73
include/linux/lguest.h
··· 1 - /* 2 - * Things the lguest guest needs to know. Note: like all lguest interfaces, 3 - * this is subject to wild and random change between versions. 4 - */ 5 - #ifndef _LINUX_LGUEST_H 6 - #define _LINUX_LGUEST_H 7 - 8 - #ifndef __ASSEMBLY__ 9 - #include <linux/time.h> 10 - #include <asm/irq.h> 11 - #include <asm/lguest_hcall.h> 12 - 13 - #define LG_CLOCK_MIN_DELTA 100UL 14 - #define LG_CLOCK_MAX_DELTA ULONG_MAX 15 - 16 - /*G:031 17 - * The second method of communicating with the Host is via "struct 18 - * lguest_data". Once the Guest's initialization hypercall tells the Host where 19 - * this is, the Guest and Host both publish information in it. 20 - :*/ 21 - struct lguest_data { 22 - /* 23 - * 512 == enabled (same as eflags in normal hardware). The Guest 24 - * changes interrupts so often that a hypercall is too slow. 25 - */ 26 - unsigned int irq_enabled; 27 - /* Fine-grained interrupt disabling by the Guest */ 28 - DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); 29 - 30 - /* 31 - * The Host writes the virtual address of the last page fault here, 32 - * which saves the Guest a hypercall. CR2 is the native register where 33 - * this address would normally be found. 34 - */ 35 - unsigned long cr2; 36 - 37 - /* Wallclock time set by the Host. */ 38 - struct timespec time; 39 - 40 - /* 41 - * Interrupt pending set by the Host. The Guest should do a hypercall 42 - * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). 43 - */ 44 - int irq_pending; 45 - 46 - /* 47 - * Async hypercall ring. Instead of directly making hypercalls, we can 48 - * place them in here for processing the next time the Host wants. 49 - * This batching can be quite efficient. 50 - */ 51 - 52 - /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ 53 - u8 hcall_status[LHCALL_RING_SIZE]; 54 - /* The actual registers for the hypercalls. 
*/ 55 - struct hcall_args hcalls[LHCALL_RING_SIZE]; 56 - 57 - /* Fields initialized by the Host at boot: */ 58 - /* Memory not to try to access */ 59 - unsigned long reserve_mem; 60 - /* KHz for the TSC clock. */ 61 - u32 tsc_khz; 62 - 63 - /* Fields initialized by the Guest at boot: */ 64 - /* Instruction to suppress interrupts even if enabled */ 65 - unsigned long noirq_iret; 66 - /* Address above which page tables are all identical. */ 67 - unsigned long kernel_address; 68 - /* The vector to try to use for system calls (0x40 or 0x80). */ 69 - unsigned int syscall_vec; 70 - }; 71 - extern struct lguest_data lguest_data; 72 - #endif /* __ASSEMBLY__ */ 73 - #endif /* _LINUX_LGUEST_H */
-44
include/linux/lguest_launcher.h
··· 1 - #ifndef _LINUX_LGUEST_LAUNCHER 2 - #define _LINUX_LGUEST_LAUNCHER 3 - /* Everything the "lguest" userspace program needs to know. */ 4 - #include <linux/types.h> 5 - 6 - /*D:010 7 - * Drivers 8 - * 9 - * The Guest needs devices to do anything useful. Since we don't let it touch 10 - * real devices (think of the damage it could do!) we provide virtual devices. 11 - * We emulate a PCI bus with virtio devices on it; we used to have our own 12 - * lguest bus which was far simpler, but this tests the virtio 1.0 standard. 13 - * 14 - * Virtio devices are also used by kvm, so we can simply reuse their optimized 15 - * device drivers. And one day when everyone uses virtio, my plan will be 16 - * complete. Bwahahahah! 17 - */ 18 - 19 - /* Write command first word is a request. */ 20 - enum lguest_req 21 - { 22 - LHREQ_INITIALIZE, /* + base, pfnlimit, start */ 23 - LHREQ_GETDMA, /* No longer used */ 24 - LHREQ_IRQ, /* + irq */ 25 - LHREQ_BREAK, /* No longer used */ 26 - LHREQ_EVENTFD, /* No longer used. */ 27 - LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */ 28 - LHREQ_SETREG, /* + offset within struct pt_regs, value. */ 29 - LHREQ_TRAP, /* + trap number to deliver to guest. */ 30 - }; 31 - 32 - /* 33 - * This is what read() of the lguest fd populates. trap == 34 - * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the 35 - * argument), 14 for a page fault in the MMIO region (addr is 36 - * the trap address, insn is the instruction), or 13 for a GPF 37 - * (insn is the instruction). 38 - */ 39 - struct lguest_pending { 40 - __u8 trap; 41 - __u8 insn[7]; 42 - __u32 addr; 43 - }; 44 - #endif /* _LINUX_LGUEST_LAUNCHER */
+2 -2
include/uapi/linux/virtio_ring.h
··· 1 1 #ifndef _UAPI_LINUX_VIRTIO_RING_H 2 2 #define _UAPI_LINUX_VIRTIO_RING_H 3 - /* An interface for efficient virtio implementation, currently for use by KVM 4 - * and lguest, but hopefully others soon. Do NOT change this since it will 3 + /* An interface for efficient virtio implementation, currently for use by KVM, 4 + * but hopefully others soon. Do NOT change this since it will 5 5 * break existing servers and clients. 6 6 * 7 7 * This header is BSD licensed so anyone can use the definitions to implement
+5 -6
tools/Makefile
··· 18 18 @echo ' iio - IIO tools' 19 19 @echo ' kvm_stat - top-like utility for displaying kvm statistics' 20 20 @echo ' leds - LEDs tools' 21 - @echo ' lguest - a minimal 32-bit x86 hypervisor' 22 21 @echo ' liblockdep - user-space wrapper for kernel locking-validator' 23 22 @echo ' net - misc networking tools' 24 23 @echo ' perf - Linux performance measurement and analysis tool' ··· 89 90 kvm_stat: FORCE 90 91 $(call descend,kvm/$@) 91 92 92 - all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \ 93 + all: acpi cgroup cpupower gpio hv firewire liblockdep \ 93 94 perf selftests turbostat usb \ 94 95 virtio vm net x86_energy_perf_policy \ 95 96 tmon freefall objtool kvm_stat ··· 100 101 cpupower_install: 101 102 $(call descend,power/$(@:_install=),install) 102 103 103 - cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install: 104 + cgroup_install firewire_install gpio_install hv_install perf_install usb_install virtio_install vm_install net_install objtool_install: 104 105 $(call descend,$(@:_install=),install) 105 106 106 107 liblockdep_install: ··· 122 123 $(call descend,kvm/$(@:_install=),install) 123 124 124 125 install: acpi_install cgroup_install cpupower_install gpio_install \ 125 - hv_install firewire_install lguest_install liblockdep_install \ 126 + hv_install firewire_install liblockdep_install \ 126 127 perf_install selftests_install turbostat_install usb_install \ 127 128 virtio_install vm_install net_install x86_energy_perf_policy_install \ 128 129 tmon_install freefall_install objtool_install kvm_stat_install ··· 133 134 cpupower_clean: 134 135 $(call descend,power/cpupower,clean) 135 136 136 - cgroup_clean hv_clean firewire_clean lguest_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean: 137 + cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean 
gpio_clean objtool_clean leds_clean: 137 138 $(call descend,$(@:_clean=),clean) 138 139 139 140 liblockdep_clean: ··· 167 168 build_clean: 168 169 $(call descend,build,clean) 169 170 170 - clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean lguest_clean \ 171 + clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \ 171 172 perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \ 172 173 vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ 173 174 freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \
-2
tools/lguest/.gitignore
··· 1 - lguest 2 - include
-14
tools/lguest/Makefile
··· 1 - # This creates the demonstration utility "lguest" which runs a Linux guest. 2 - CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude 3 - 4 - all: lguest 5 - 6 - include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h 7 - mkdir -p include/linux 2>&1 || true 8 - ln -sf ../../../../include/uapi/linux/virtio_types.h $@ 9 - 10 - lguest: include/linux/virtio_types.h 11 - 12 - clean: 13 - rm -f lguest 14 - rm -rf include
-58
tools/lguest/extract
··· 1 - #! /bin/sh 2 - 3 - set -e 4 - 5 - PREFIX=$1 6 - shift 7 - 8 - trap 'rm -r $TMPDIR' 0 9 - TMPDIR=`mktemp -d` 10 - 11 - exec 3>/dev/null 12 - for f; do 13 - while IFS=" 14 - " read -r LINE; do 15 - case "$LINE" in 16 - *$PREFIX:[0-9]*:\**) 17 - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` 18 - if [ -f $TMPDIR/$NUM ]; then 19 - echo "$TMPDIR/$NUM already exits prior to $f" 20 - exit 1 21 - fi 22 - exec 3>>$TMPDIR/$NUM 23 - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM 24 - /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3 25 - ;; 26 - *$PREFIX:[0-9]*) 27 - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` 28 - if [ -f $TMPDIR/$NUM ]; then 29 - echo "$TMPDIR/$NUM already exits prior to $f" 30 - exit 1 31 - fi 32 - exec 3>>$TMPDIR/$NUM 33 - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM 34 - /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3 35 - ;; 36 - *:\**) 37 - /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3 38 - echo >&3 39 - exec 3>/dev/null 40 - ;; 41 - *) 42 - /bin/echo "$LINE" >&3 43 - ;; 44 - esac 45 - done < $f 46 - echo >&3 47 - exec 3>/dev/null 48 - done 49 - 50 - LASTFILE="" 51 - for f in $TMPDIR/*; do 52 - if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then 53 - LASTFILE=$(cat $TMPDIR/.$(basename $f) ) 54 - echo "[ $LASTFILE ]" 55 - fi 56 - cat $f 57 - done 58 -
-3420
tools/lguest/lguest.c
··· 1 - /*P:100 2 - * This is the Launcher code, a simple program which lays out the "physical" 3 - * memory for the new Guest by mapping the kernel image and the virtual 4 - * devices, then opens /dev/lguest to tell the kernel about the Guest and 5 - * control it. 6 - :*/ 7 - #define _LARGEFILE64_SOURCE 8 - #define _GNU_SOURCE 9 - #include <stdio.h> 10 - #include <string.h> 11 - #include <unistd.h> 12 - #include <err.h> 13 - #include <stdint.h> 14 - #include <stdlib.h> 15 - #include <elf.h> 16 - #include <sys/mman.h> 17 - #include <sys/param.h> 18 - #include <sys/types.h> 19 - #include <sys/stat.h> 20 - #include <sys/wait.h> 21 - #include <sys/eventfd.h> 22 - #include <fcntl.h> 23 - #include <stdbool.h> 24 - #include <errno.h> 25 - #include <ctype.h> 26 - #include <sys/socket.h> 27 - #include <sys/ioctl.h> 28 - #include <sys/time.h> 29 - #include <time.h> 30 - #include <netinet/in.h> 31 - #include <net/if.h> 32 - #include <linux/sockios.h> 33 - #include <linux/if_tun.h> 34 - #include <sys/uio.h> 35 - #include <termios.h> 36 - #include <getopt.h> 37 - #include <assert.h> 38 - #include <sched.h> 39 - #include <limits.h> 40 - #include <stddef.h> 41 - #include <signal.h> 42 - #include <pwd.h> 43 - #include <grp.h> 44 - #include <sys/user.h> 45 - #include <linux/pci_regs.h> 46 - 47 - #ifndef VIRTIO_F_ANY_LAYOUT 48 - #define VIRTIO_F_ANY_LAYOUT 27 49 - #endif 50 - 51 - /*L:110 52 - * We can ignore the 43 include files we need for this program, but I do want 53 - * to draw attention to the use of kernel-style types. 54 - * 55 - * As Linus said, "C is a Spartan language, and so should your naming be." I 56 - * like these abbreviations, so we define them here. Note that u64 is always 57 - * unsigned long long, which works on all Linux systems: this means that we can 58 - * use %llu in printf for any u64. 
59 - */ 60 - typedef unsigned long long u64; 61 - typedef uint32_t u32; 62 - typedef uint16_t u16; 63 - typedef uint8_t u8; 64 - /*:*/ 65 - 66 - #define VIRTIO_CONFIG_NO_LEGACY 67 - #define VIRTIO_PCI_NO_LEGACY 68 - #define VIRTIO_BLK_NO_LEGACY 69 - #define VIRTIO_NET_NO_LEGACY 70 - 71 - /* Use in-kernel ones, which defines VIRTIO_F_VERSION_1 */ 72 - #include "../../include/uapi/linux/virtio_config.h" 73 - #include "../../include/uapi/linux/virtio_net.h" 74 - #include "../../include/uapi/linux/virtio_blk.h" 75 - #include "../../include/uapi/linux/virtio_console.h" 76 - #include "../../include/uapi/linux/virtio_rng.h" 77 - #include <linux/virtio_ring.h> 78 - #include "../../include/uapi/linux/virtio_pci.h" 79 - #include <asm/bootparam.h> 80 - #include "../../include/linux/lguest_launcher.h" 81 - 82 - #define BRIDGE_PFX "bridge:" 83 - #ifndef SIOCBRADDIF 84 - #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 85 - #endif 86 - /* We can have up to 256 pages for devices. */ 87 - #define DEVICE_PAGES 256 88 - /* This will occupy 3 pages: it must be a power of 2. */ 89 - #define VIRTQUEUE_NUM 256 90 - 91 - /*L:120 92 - * verbose is both a global flag and a macro. The C preprocessor allows 93 - * this, and although I wouldn't recommend it, it works quite nicely here. 94 - */ 95 - static bool verbose; 96 - #define verbose(args...) \ 97 - do { if (verbose) printf(args); } while(0) 98 - /*:*/ 99 - 100 - /* The pointer to the start of guest memory. */ 101 - static void *guest_base; 102 - /* The maximum guest physical address allowed, and maximum possible. */ 103 - static unsigned long guest_limit, guest_max, guest_mmio; 104 - /* The /dev/lguest file descriptor. */ 105 - static int lguest_fd; 106 - 107 - /* a per-cpu variable indicating whose vcpu is currently running */ 108 - static unsigned int __thread cpu_id; 109 - 110 - /* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */ 111 - #define MAX_PCI_DEVICES 32 112 - 113 - /* This is our list of devices. 
*/ 114 - struct device_list { 115 - /* Counter to assign interrupt numbers. */ 116 - unsigned int next_irq; 117 - 118 - /* Counter to print out convenient device numbers. */ 119 - unsigned int device_num; 120 - 121 - /* PCI devices. */ 122 - struct device *pci[MAX_PCI_DEVICES]; 123 - }; 124 - 125 - /* The list of Guest devices, based on command line arguments. */ 126 - static struct device_list devices; 127 - 128 - /* 129 - * Just like struct virtio_pci_cfg_cap in uapi/linux/virtio_pci.h, 130 - * but uses a u32 explicitly for the data. 131 - */ 132 - struct virtio_pci_cfg_cap_u32 { 133 - struct virtio_pci_cap cap; 134 - u32 pci_cfg_data; /* Data for BAR access. */ 135 - }; 136 - 137 - struct virtio_pci_mmio { 138 - struct virtio_pci_common_cfg cfg; 139 - u16 notify; 140 - u8 isr; 141 - u8 padding; 142 - /* Device-specific configuration follows this. */ 143 - }; 144 - 145 - /* This is the layout (little-endian) of the PCI config space. */ 146 - struct pci_config { 147 - u16 vendor_id, device_id; 148 - u16 command, status; 149 - u8 revid, prog_if, subclass, class; 150 - u8 cacheline_size, lat_timer, header_type, bist; 151 - u32 bar[6]; 152 - u32 cardbus_cis_ptr; 153 - u16 subsystem_vendor_id, subsystem_device_id; 154 - u32 expansion_rom_addr; 155 - u8 capabilities, reserved1[3]; 156 - u32 reserved2; 157 - u8 irq_line, irq_pin, min_grant, max_latency; 158 - 159 - /* Now, this is the linked capability list. */ 160 - struct virtio_pci_cap common; 161 - struct virtio_pci_notify_cap notify; 162 - struct virtio_pci_cap isr; 163 - struct virtio_pci_cap device; 164 - struct virtio_pci_cfg_cap_u32 cfg_access; 165 - }; 166 - 167 - /* The device structure describes a single device. */ 168 - struct device { 169 - /* The name of this device, for --verbose. 
*/ 170 - const char *name; 171 - 172 - /* Any queues attached to this device */ 173 - struct virtqueue *vq; 174 - 175 - /* Is it operational */ 176 - bool running; 177 - 178 - /* Has it written FEATURES_OK but not re-checked it? */ 179 - bool wrote_features_ok; 180 - 181 - /* PCI configuration */ 182 - union { 183 - struct pci_config config; 184 - u32 config_words[sizeof(struct pci_config) / sizeof(u32)]; 185 - }; 186 - 187 - /* Features we offer, and those accepted. */ 188 - u64 features, features_accepted; 189 - 190 - /* Device-specific config hangs off the end of this. */ 191 - struct virtio_pci_mmio *mmio; 192 - 193 - /* PCI MMIO resources (all in BAR0) */ 194 - size_t mmio_size; 195 - u32 mmio_addr; 196 - 197 - /* Device-specific data. */ 198 - void *priv; 199 - }; 200 - 201 - /* The virtqueue structure describes a queue attached to a device. */ 202 - struct virtqueue { 203 - struct virtqueue *next; 204 - 205 - /* Which device owns me. */ 206 - struct device *dev; 207 - 208 - /* Name for printing errors. */ 209 - const char *name; 210 - 211 - /* The actual ring of buffers. */ 212 - struct vring vring; 213 - 214 - /* The information about this virtqueue (we only use queue_size on) */ 215 - struct virtio_pci_common_cfg pci_config; 216 - 217 - /* Last available index we saw. */ 218 - u16 last_avail_idx; 219 - 220 - /* How many are used since we sent last irq? */ 221 - unsigned int pending_used; 222 - 223 - /* Eventfd where Guest notifications arrive. */ 224 - int eventfd; 225 - 226 - /* Function for the thread which is servicing this virtqueue. */ 227 - void (*service)(struct virtqueue *vq); 228 - pid_t thread; 229 - }; 230 - 231 - /* Remember the arguments to the program so we can "reboot" */ 232 - static char **main_args; 233 - 234 - /* The original tty settings to restore on exit. 
*/ 235 - static struct termios orig_term; 236 - 237 - /* 238 - * We have to be careful with barriers: our devices are all run in separate 239 - * threads and so we need to make sure that changes visible to the Guest happen 240 - * in precise order. 241 - */ 242 - #define wmb() __asm__ __volatile__("" : : : "memory") 243 - #define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory") 244 - #define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory") 245 - 246 - /* Wrapper for the last available index. Makes it easier to change. */ 247 - #define lg_last_avail(vq) ((vq)->last_avail_idx) 248 - 249 - /* 250 - * The virtio configuration space is defined to be little-endian. x86 is 251 - * little-endian too, but it's nice to be explicit so we have these helpers. 252 - */ 253 - #define cpu_to_le16(v16) (v16) 254 - #define cpu_to_le32(v32) (v32) 255 - #define cpu_to_le64(v64) (v64) 256 - #define le16_to_cpu(v16) (v16) 257 - #define le32_to_cpu(v32) (v32) 258 - #define le64_to_cpu(v64) (v64) 259 - 260 - /* 261 - * A real device would ignore weird/non-compliant driver behaviour. We 262 - * stop and flag it, to help debugging Linux problems. 263 - */ 264 - #define bad_driver(d, fmt, ...) \ 265 - errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__) 266 - #define bad_driver_vq(vq, fmt, ...) \ 267 - errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \ 268 - vq->name, ## __VA_ARGS__) 269 - 270 - /* Is this iovec empty? */ 271 - static bool iov_empty(const struct iovec iov[], unsigned int num_iov) 272 - { 273 - unsigned int i; 274 - 275 - for (i = 0; i < num_iov; i++) 276 - if (iov[i].iov_len) 277 - return false; 278 - return true; 279 - } 280 - 281 - /* Take len bytes from the front of this iovec. 
*/ 282 - static void iov_consume(struct device *d, 283 - struct iovec iov[], unsigned num_iov, 284 - void *dest, unsigned len) 285 - { 286 - unsigned int i; 287 - 288 - for (i = 0; i < num_iov; i++) { 289 - unsigned int used; 290 - 291 - used = iov[i].iov_len < len ? iov[i].iov_len : len; 292 - if (dest) { 293 - memcpy(dest, iov[i].iov_base, used); 294 - dest += used; 295 - } 296 - iov[i].iov_base += used; 297 - iov[i].iov_len -= used; 298 - len -= used; 299 - } 300 - if (len != 0) 301 - bad_driver(d, "iovec too short!"); 302 - } 303 - 304 - /*L:100 305 - * The Launcher code itself takes us out into userspace, that scary place where 306 - * pointers run wild and free! Unfortunately, like most userspace programs, 307 - * it's quite boring (which is why everyone likes to hack on the kernel!). 308 - * Perhaps if you make up an Lguest Drinking Game at this point, it will get 309 - * you through this section. Or, maybe not. 310 - * 311 - * The Launcher sets up a big chunk of memory to be the Guest's "physical" 312 - * memory and stores it in "guest_base". In other words, Guest physical == 313 - * Launcher virtual with an offset. 314 - * 315 - * This can be tough to get your head around, but usually it just means that we 316 - * use these trivial conversion functions when the Guest gives us its 317 - * "physical" addresses: 318 - */ 319 - static void *from_guest_phys(unsigned long addr) 320 - { 321 - return guest_base + addr; 322 - } 323 - 324 - static unsigned long to_guest_phys(const void *addr) 325 - { 326 - return (addr - guest_base); 327 - } 328 - 329 - /*L:130 330 - * Loading the Kernel. 331 - * 332 - * We start with couple of simple helper routines. 
open_or_die() avoids 333 - * error-checking code cluttering the callers: 334 - */ 335 - static int open_or_die(const char *name, int flags) 336 - { 337 - int fd = open(name, flags); 338 - if (fd < 0) 339 - err(1, "Failed to open %s", name); 340 - return fd; 341 - } 342 - 343 - /* map_zeroed_pages() takes a number of pages. */ 344 - static void *map_zeroed_pages(unsigned int num) 345 - { 346 - int fd = open_or_die("/dev/zero", O_RDONLY); 347 - void *addr; 348 - 349 - /* 350 - * We use a private mapping (ie. if we write to the page, it will be 351 - * copied). We allocate an extra two pages PROT_NONE to act as guard 352 - * pages against read/write attempts that exceed allocated space. 353 - */ 354 - addr = mmap(NULL, getpagesize() * (num+2), 355 - PROT_NONE, MAP_PRIVATE, fd, 0); 356 - 357 - if (addr == MAP_FAILED) 358 - err(1, "Mmapping %u pages of /dev/zero", num); 359 - 360 - if (mprotect(addr + getpagesize(), getpagesize() * num, 361 - PROT_READ|PROT_WRITE) == -1) 362 - err(1, "mprotect rw %u pages failed", num); 363 - 364 - /* 365 - * One neat mmap feature is that you can close the fd, and it 366 - * stays mapped. 367 - */ 368 - close(fd); 369 - 370 - /* Return address after PROT_NONE page */ 371 - return addr + getpagesize(); 372 - } 373 - 374 - /* Get some bytes which won't be mapped into the guest. */ 375 - static unsigned long get_mmio_region(size_t size) 376 - { 377 - unsigned long addr = guest_mmio; 378 - size_t i; 379 - 380 - if (!size) 381 - return addr; 382 - 383 - /* Size has to be a power of 2 (and multiple of 16) */ 384 - for (i = 1; i < size; i <<= 1); 385 - 386 - guest_mmio += i; 387 - 388 - return addr; 389 - } 390 - 391 - /* 392 - * This routine is used to load the kernel or initrd. It tries mmap, but if 393 - * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), 394 - * it falls back to reading the memory in. 
395 - */ 396 - static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) 397 - { 398 - ssize_t r; 399 - 400 - /* 401 - * We map writable even though for some segments are marked read-only. 402 - * The kernel really wants to be writable: it patches its own 403 - * instructions. 404 - * 405 - * MAP_PRIVATE means that the page won't be copied until a write is 406 - * done to it. This allows us to share untouched memory between 407 - * Guests. 408 - */ 409 - if (mmap(addr, len, PROT_READ|PROT_WRITE, 410 - MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) 411 - return; 412 - 413 - /* pread does a seek and a read in one shot: saves a few lines. */ 414 - r = pread(fd, addr, len, offset); 415 - if (r != len) 416 - err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); 417 - } 418 - 419 - /* 420 - * This routine takes an open vmlinux image, which is in ELF, and maps it into 421 - * the Guest memory. ELF = Embedded Linking Format, which is the format used 422 - * by all modern binaries on Linux including the kernel. 423 - * 424 - * The ELF headers give *two* addresses: a physical address, and a virtual 425 - * address. We use the physical address; the Guest will map itself to the 426 - * virtual address. 427 - * 428 - * We return the starting address. 429 - */ 430 - static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) 431 - { 432 - Elf32_Phdr phdr[ehdr->e_phnum]; 433 - unsigned int i; 434 - 435 - /* 436 - * Sanity checks on the main ELF header: an x86 executable with a 437 - * reasonable number of correctly-sized program headers. 
438 - */ 439 - if (ehdr->e_type != ET_EXEC 440 - || ehdr->e_machine != EM_386 441 - || ehdr->e_phentsize != sizeof(Elf32_Phdr) 442 - || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) 443 - errx(1, "Malformed elf header"); 444 - 445 - /* 446 - * An ELF executable contains an ELF header and a number of "program" 447 - * headers which indicate which parts ("segments") of the program to 448 - * load where. 449 - */ 450 - 451 - /* We read in all the program headers at once: */ 452 - if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) 453 - err(1, "Seeking to program headers"); 454 - if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 455 - err(1, "Reading program headers"); 456 - 457 - /* 458 - * Try all the headers: there are usually only three. A read-only one, 459 - * a read-write one, and a "note" section which we don't load. 460 - */ 461 - for (i = 0; i < ehdr->e_phnum; i++) { 462 - /* If this isn't a loadable segment, we ignore it */ 463 - if (phdr[i].p_type != PT_LOAD) 464 - continue; 465 - 466 - verbose("Section %i: size %i addr %p\n", 467 - i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); 468 - 469 - /* We map this section of the file at its physical address. */ 470 - map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), 471 - phdr[i].p_offset, phdr[i].p_filesz); 472 - } 473 - 474 - /* The entry point is given in the ELF header. */ 475 - return ehdr->e_entry; 476 - } 477 - 478 - /*L:150 479 - * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed 480 - * to jump into it and it will unpack itself. We used to have to perform some 481 - * hairy magic because the unpacking code scared me. 482 - * 483 - * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote 484 - * a small patch to jump over the tricky bits in the Guest, so now we just read 485 - * the funky header so we know where in the file to load, and away we go! 
486 - */ 487 - static unsigned long load_bzimage(int fd) 488 - { 489 - struct boot_params boot; 490 - int r; 491 - /* Modern bzImages get loaded at 1M. */ 492 - void *p = from_guest_phys(0x100000); 493 - 494 - /* 495 - * Go back to the start of the file and read the header. It should be 496 - * a Linux boot header (see Documentation/x86/boot.txt) 497 - */ 498 - lseek(fd, 0, SEEK_SET); 499 - read(fd, &boot, sizeof(boot)); 500 - 501 - /* Inside the setup_hdr, we expect the magic "HdrS" */ 502 - if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) 503 - errx(1, "This doesn't look like a bzImage to me"); 504 - 505 - /* Skip over the extra sectors of the header. */ 506 - lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); 507 - 508 - /* Now read everything into memory. in nice big chunks. */ 509 - while ((r = read(fd, p, 65536)) > 0) 510 - p += r; 511 - 512 - /* Finally, code32_start tells us where to enter the kernel. */ 513 - return boot.hdr.code32_start; 514 - } 515 - 516 - /*L:140 517 - * Loading the kernel is easy when it's a "vmlinux", but most kernels 518 - * come wrapped up in the self-decompressing "bzImage" format. With a little 519 - * work, we can load those, too. 520 - */ 521 - static unsigned long load_kernel(int fd) 522 - { 523 - Elf32_Ehdr hdr; 524 - 525 - /* Read in the first few bytes. */ 526 - if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) 527 - err(1, "Reading kernel"); 528 - 529 - /* If it's an ELF file, it starts with "\177ELF" */ 530 - if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) 531 - return map_elf(fd, &hdr); 532 - 533 - /* Otherwise we assume it's a bzImage, and try to load it. */ 534 - return load_bzimage(fd); 535 - } 536 - 537 - /* 538 - * This is a trivial little helper to align pages. Andi Kleen hated it because 539 - * it calls getpagesize() twice: "it's dumb code." 540 - * 541 - * Kernel guys get really het up about optimization, even when it's not 542 - * necessary. I leave this code as a reaction against that. 
543 - */ 544 - static inline unsigned long page_align(unsigned long addr) 545 - { 546 - /* Add upwards and truncate downwards. */ 547 - return ((addr + getpagesize()-1) & ~(getpagesize()-1)); 548 - } 549 - 550 - /*L:180 551 - * An "initial ram disk" is a disk image loaded into memory along with the 552 - * kernel which the kernel can use to boot from without needing any drivers. 553 - * Most distributions now use this as standard: the initrd contains the code to 554 - * load the appropriate driver modules for the current machine. 555 - * 556 - * Importantly, James Morris works for RedHat, and Fedora uses initrds for its 557 - * kernels. He sent me this (and tells me when I break it). 558 - */ 559 - static unsigned long load_initrd(const char *name, unsigned long mem) 560 - { 561 - int ifd; 562 - struct stat st; 563 - unsigned long len; 564 - 565 - ifd = open_or_die(name, O_RDONLY); 566 - /* fstat() is needed to get the file size. */ 567 - if (fstat(ifd, &st) < 0) 568 - err(1, "fstat() on initrd '%s'", name); 569 - 570 - /* 571 - * We map the initrd at the top of memory, but mmap wants it to be 572 - * page-aligned, so we round the size up for that. 573 - */ 574 - len = page_align(st.st_size); 575 - map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); 576 - /* 577 - * Once a file is mapped, you can close the file descriptor. It's a 578 - * little odd, but quite useful. 579 - */ 580 - close(ifd); 581 - verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); 582 - 583 - /* We return the initrd size. */ 584 - return len; 585 - } 586 - /*:*/ 587 - 588 - /* 589 - * Simple routine to roll all the commandline arguments together with spaces 590 - * between them. 
591 - */ 592 - static void concat(char *dst, char *args[]) 593 - { 594 - unsigned int i, len = 0; 595 - 596 - for (i = 0; args[i]; i++) { 597 - if (i) { 598 - strcat(dst+len, " "); 599 - len++; 600 - } 601 - strcpy(dst+len, args[i]); 602 - len += strlen(args[i]); 603 - } 604 - /* In case it's empty. */ 605 - dst[len] = '\0'; 606 - } 607 - 608 - /*L:185 609 - * This is where we actually tell the kernel to initialize the Guest. We 610 - * saw the arguments it expects when we looked at initialize() in lguest_user.c: 611 - * the base of Guest "physical" memory, the top physical page to allow and the 612 - * entry point for the Guest. 613 - */ 614 - static void tell_kernel(unsigned long start) 615 - { 616 - unsigned long args[] = { LHREQ_INITIALIZE, 617 - (unsigned long)guest_base, 618 - guest_limit / getpagesize(), start, 619 - (guest_mmio+getpagesize()-1) / getpagesize() }; 620 - verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n", 621 - guest_base, guest_base + guest_limit, 622 - guest_limit, guest_mmio); 623 - lguest_fd = open_or_die("/dev/lguest", O_RDWR); 624 - if (write(lguest_fd, args, sizeof(args)) < 0) 625 - err(1, "Writing to /dev/lguest"); 626 - } 627 - /*:*/ 628 - 629 - /*L:200 630 - * Device Handling. 631 - * 632 - * When the Guest gives us a buffer, it sends an array of addresses and sizes. 633 - * We need to make sure it's not trying to reach into the Launcher itself, so 634 - * we have a convenient routine which checks it and exits with an error message 635 - * if something funny is going on: 636 - */ 637 - static void *_check_pointer(struct device *d, 638 - unsigned long addr, unsigned int size, 639 - unsigned int line) 640 - { 641 - /* 642 - * Check if the requested address and size exceeds the allocated memory, 643 - * or addr + size wraps around. 
644 - */ 645 - if ((addr + size) > guest_limit || (addr + size) < addr) 646 - bad_driver(d, "%s:%i: Invalid address %#lx", 647 - __FILE__, line, addr); 648 - /* 649 - * We return a pointer for the caller's convenience, now we know it's 650 - * safe to use. 651 - */ 652 - return from_guest_phys(addr); 653 - } 654 - /* A macro which transparently hands the line number to the real function. */ 655 - #define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__) 656 - 657 - /* 658 - * Each buffer in the virtqueues is actually a chain of descriptors. This 659 - * function returns the next descriptor in the chain, or vq->vring.num if we're 660 - * at the end. 661 - */ 662 - static unsigned next_desc(struct device *d, struct vring_desc *desc, 663 - unsigned int i, unsigned int max) 664 - { 665 - unsigned int next; 666 - 667 - /* If this descriptor says it doesn't chain, we're done. */ 668 - if (!(desc[i].flags & VRING_DESC_F_NEXT)) 669 - return max; 670 - 671 - /* Check they're not leading us off end of descriptors. */ 672 - next = desc[i].next; 673 - /* Make sure compiler knows to grab that: we don't want it changing! */ 674 - wmb(); 675 - 676 - if (next >= max) 677 - bad_driver(d, "Desc next is %u", next); 678 - 679 - return next; 680 - } 681 - 682 - /* 683 - * This actually sends the interrupt for this virtqueue, if we've used a 684 - * buffer. 685 - */ 686 - static void trigger_irq(struct virtqueue *vq) 687 - { 688 - unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line }; 689 - 690 - /* Don't inform them if nothing used. */ 691 - if (!vq->pending_used) 692 - return; 693 - vq->pending_used = 0; 694 - 695 - /* 696 - * 2.4.7.1: 697 - * 698 - * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated: 699 - * The driver MUST set flags to 0 or 1. 
700 - */ 701 - if (vq->vring.avail->flags > 1) 702 - bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags); 703 - 704 - /* 705 - * 2.4.7.2: 706 - * 707 - * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated: 708 - * 709 - * - The device MUST ignore the used_event value. 710 - * - After the device writes a descriptor index into the used ring: 711 - * - If flags is 1, the device SHOULD NOT send an interrupt. 712 - * - If flags is 0, the device MUST send an interrupt. 713 - */ 714 - if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { 715 - return; 716 - } 717 - 718 - /* 719 - * 4.1.4.5.1: 720 - * 721 - * If MSI-X capability is disabled, the device MUST set the Queue 722 - * Interrupt bit in ISR status before sending a virtqueue notification 723 - * to the driver. 724 - */ 725 - vq->dev->mmio->isr = 0x1; 726 - 727 - /* Send the Guest an interrupt tell them we used something up. */ 728 - if (write(lguest_fd, buf, sizeof(buf)) != 0) 729 - err(1, "Triggering irq %i", vq->dev->config.irq_line); 730 - } 731 - 732 - /* 733 - * This looks in the virtqueue for the first available buffer, and converts 734 - * it to an iovec for convenient access. Since descriptors consist of some 735 - * number of output then some number of input descriptors, it's actually two 736 - * iovecs, but we pack them into one and note how many of each there were. 737 - * 738 - * This function waits if necessary, and returns the descriptor number found. 739 - */ 740 - static unsigned wait_for_vq_desc(struct virtqueue *vq, 741 - struct iovec iov[], 742 - unsigned int *out_num, unsigned int *in_num) 743 - { 744 - unsigned int i, head, max; 745 - struct vring_desc *desc; 746 - u16 last_avail = lg_last_avail(vq); 747 - 748 - /* 749 - * 2.4.7.1: 750 - * 751 - * The driver MUST handle spurious interrupts from the device. 752 - * 753 - * That's why this is a while loop. 754 - */ 755 - 756 - /* There's nothing available? 
*/ 757 - while (last_avail == vq->vring.avail->idx) { 758 - u64 event; 759 - 760 - /* 761 - * Since we're about to sleep, now is a good time to tell the 762 - * Guest about what we've used up to now. 763 - */ 764 - trigger_irq(vq); 765 - 766 - /* OK, now we need to know about added descriptors. */ 767 - vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; 768 - 769 - /* 770 - * They could have slipped one in as we were doing that: make 771 - * sure it's written, then check again. 772 - */ 773 - mb(); 774 - if (last_avail != vq->vring.avail->idx) { 775 - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 776 - break; 777 - } 778 - 779 - /* Nothing new? Wait for eventfd to tell us they refilled. */ 780 - if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) 781 - errx(1, "Event read failed?"); 782 - 783 - /* We don't need to be notified again. */ 784 - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 785 - } 786 - 787 - /* Check it isn't doing very strange things with descriptor numbers. */ 788 - if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) 789 - bad_driver_vq(vq, "Guest moved used index from %u to %u", 790 - last_avail, vq->vring.avail->idx); 791 - 792 - /* 793 - * Make sure we read the descriptor number *after* we read the ring 794 - * update; don't let the cpu or compiler change the order. 795 - */ 796 - rmb(); 797 - 798 - /* 799 - * Grab the next descriptor number they're advertising, and increment 800 - * the index we've seen. 801 - */ 802 - head = vq->vring.avail->ring[last_avail % vq->vring.num]; 803 - lg_last_avail(vq)++; 804 - 805 - /* If their number is silly, that's a fatal mistake. */ 806 - if (head >= vq->vring.num) 807 - bad_driver_vq(vq, "Guest says index %u is available", head); 808 - 809 - /* When we start there are none of either input nor output. 
*/ 810 - *out_num = *in_num = 0; 811 - 812 - max = vq->vring.num; 813 - desc = vq->vring.desc; 814 - i = head; 815 - 816 - /* 817 - * We have to read the descriptor after we read the descriptor number, 818 - * but there's a data dependency there so the CPU shouldn't reorder 819 - * that: no rmb() required. 820 - */ 821 - 822 - do { 823 - /* 824 - * If this is an indirect entry, then this buffer contains a 825 - * descriptor table which we handle as if it's any normal 826 - * descriptor chain. 827 - */ 828 - if (desc[i].flags & VRING_DESC_F_INDIRECT) { 829 - /* 2.4.5.3.1: 830 - * 831 - * The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT 832 - * flag unless the VIRTIO_F_INDIRECT_DESC feature was 833 - * negotiated. 834 - */ 835 - if (!(vq->dev->features_accepted & 836 - (1<<VIRTIO_RING_F_INDIRECT_DESC))) 837 - bad_driver_vq(vq, "vq indirect not negotiated"); 838 - 839 - /* 840 - * 2.4.5.3.1: 841 - * 842 - * The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT 843 - * flag within an indirect descriptor (ie. only one 844 - * table per descriptor). 845 - */ 846 - if (desc != vq->vring.desc) 847 - bad_driver_vq(vq, "Indirect within indirect"); 848 - 849 - /* 850 - * Proposed update VIRTIO-134 spells this out: 851 - * 852 - * A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT 853 - * and VIRTQ_DESC_F_NEXT in flags. 854 - */ 855 - if (desc[i].flags & VRING_DESC_F_NEXT) 856 - bad_driver_vq(vq, "indirect and next together"); 857 - 858 - if (desc[i].len % sizeof(struct vring_desc)) 859 - bad_driver_vq(vq, 860 - "Invalid size for indirect table"); 861 - /* 862 - * 2.4.5.3.2: 863 - * 864 - * The device MUST ignore the write-only flag 865 - * (flags&VIRTQ_DESC_F_WRITE) in the descriptor that 866 - * refers to an indirect table. 
867 - * 868 - * We ignore it here: :) 869 - */ 870 - 871 - max = desc[i].len / sizeof(struct vring_desc); 872 - desc = check_pointer(vq->dev, desc[i].addr, desc[i].len); 873 - i = 0; 874 - 875 - /* 2.4.5.3.1: 876 - * 877 - * A driver MUST NOT create a descriptor chain longer 878 - * than the Queue Size of the device. 879 - */ 880 - if (max > vq->pci_config.queue_size) 881 - bad_driver_vq(vq, 882 - "indirect has too many entries"); 883 - } 884 - 885 - /* Grab the first descriptor, and check it's OK. */ 886 - iov[*out_num + *in_num].iov_len = desc[i].len; 887 - iov[*out_num + *in_num].iov_base 888 - = check_pointer(vq->dev, desc[i].addr, desc[i].len); 889 - /* If this is an input descriptor, increment that count. */ 890 - if (desc[i].flags & VRING_DESC_F_WRITE) 891 - (*in_num)++; 892 - else { 893 - /* 894 - * If it's an output descriptor, they're all supposed 895 - * to come before any input descriptors. 896 - */ 897 - if (*in_num) 898 - bad_driver_vq(vq, 899 - "Descriptor has out after in"); 900 - (*out_num)++; 901 - } 902 - 903 - /* If we've got too many, that implies a descriptor loop. */ 904 - if (*out_num + *in_num > max) 905 - bad_driver_vq(vq, "Looped descriptor"); 906 - } while ((i = next_desc(vq->dev, desc, i, max)) != max); 907 - 908 - return head; 909 - } 910 - 911 - /* 912 - * After we've used one of their buffers, we tell the Guest about it. Sometime 913 - * later we'll want to send them an interrupt using trigger_irq(); note that 914 - * wait_for_vq_desc() does that for us if it has to wait. 915 - */ 916 - static void add_used(struct virtqueue *vq, unsigned int head, int len) 917 - { 918 - struct vring_used_elem *used; 919 - 920 - /* 921 - * The virtqueue contains a ring of used buffers. Get a pointer to the 922 - * next entry in that used ring. 923 - */ 924 - used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; 925 - used->id = head; 926 - used->len = len; 927 - /* Make sure buffer is written before we update index. 
*/ 928 - wmb(); 929 - vq->vring.used->idx++; 930 - vq->pending_used++; 931 - } 932 - 933 - /* And here's the combo meal deal. Supersize me! */ 934 - static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) 935 - { 936 - add_used(vq, head, len); 937 - trigger_irq(vq); 938 - } 939 - 940 - /* 941 - * The Console 942 - * 943 - * We associate some data with the console for our exit hack. 944 - */ 945 - struct console_abort { 946 - /* How many times have they hit ^C? */ 947 - int count; 948 - /* When did they start? */ 949 - struct timeval start; 950 - }; 951 - 952 - /* This is the routine which handles console input (ie. stdin). */ 953 - static void console_input(struct virtqueue *vq) 954 - { 955 - int len; 956 - unsigned int head, in_num, out_num; 957 - struct console_abort *abort = vq->dev->priv; 958 - struct iovec iov[vq->vring.num]; 959 - 960 - /* Make sure there's a descriptor available. */ 961 - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 962 - if (out_num) 963 - bad_driver_vq(vq, "Output buffers in console in queue?"); 964 - 965 - /* Read into it. This is where we usually wait. */ 966 - len = readv(STDIN_FILENO, iov, in_num); 967 - if (len <= 0) { 968 - /* Ran out of input? */ 969 - warnx("Failed to get console input, ignoring console."); 970 - /* 971 - * For simplicity, dying threads kill the whole Launcher. So 972 - * just nap here. 973 - */ 974 - for (;;) 975 - pause(); 976 - } 977 - 978 - /* Tell the Guest we used a buffer. */ 979 - add_used_and_trigger(vq, head, len); 980 - 981 - /* 982 - * Three ^C within one second? Exit. 983 - * 984 - * This is such a hack, but works surprisingly well. Each ^C has to 985 - * be in a buffer by itself, so they can't be too fast. But we check 986 - * that we get three within about a second, so they can't be too 987 - * slow. 
988 - */ 989 - if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { 990 - abort->count = 0; 991 - return; 992 - } 993 - 994 - abort->count++; 995 - if (abort->count == 1) 996 - gettimeofday(&abort->start, NULL); 997 - else if (abort->count == 3) { 998 - struct timeval now; 999 - gettimeofday(&now, NULL); 1000 - /* Kill all Launcher processes with SIGINT, like normal ^C */ 1001 - if (now.tv_sec <= abort->start.tv_sec+1) 1002 - kill(0, SIGINT); 1003 - abort->count = 0; 1004 - } 1005 - } 1006 - 1007 - /* This is the routine which handles console output (ie. stdout). */ 1008 - static void console_output(struct virtqueue *vq) 1009 - { 1010 - unsigned int head, out, in; 1011 - struct iovec iov[vq->vring.num]; 1012 - 1013 - /* We usually wait in here, for the Guest to give us something. */ 1014 - head = wait_for_vq_desc(vq, iov, &out, &in); 1015 - if (in) 1016 - bad_driver_vq(vq, "Input buffers in console output queue?"); 1017 - 1018 - /* writev can return a partial write, so we loop here. */ 1019 - while (!iov_empty(iov, out)) { 1020 - int len = writev(STDOUT_FILENO, iov, out); 1021 - if (len <= 0) { 1022 - warn("Write to stdout gave %i (%d)", len, errno); 1023 - break; 1024 - } 1025 - iov_consume(vq->dev, iov, out, NULL, len); 1026 - } 1027 - 1028 - /* 1029 - * We're finished with that buffer: if we're going to sleep, 1030 - * wait_for_vq_desc() will prod the Guest with an interrupt. 1031 - */ 1032 - add_used(vq, head, 0); 1033 - } 1034 - 1035 - /* 1036 - * The Network 1037 - * 1038 - * Handling output for network is also simple: we get all the output buffers 1039 - * and write them to /dev/net/tun. 1040 - */ 1041 - struct net_info { 1042 - int tunfd; 1043 - }; 1044 - 1045 - static void net_output(struct virtqueue *vq) 1046 - { 1047 - struct net_info *net_info = vq->dev->priv; 1048 - unsigned int head, out, in; 1049 - struct iovec iov[vq->vring.num]; 1050 - 1051 - /* We usually wait in here for the Guest to give us a packet. 
*/ 1052 - head = wait_for_vq_desc(vq, iov, &out, &in); 1053 - if (in) 1054 - bad_driver_vq(vq, "Input buffers in net output queue?"); 1055 - /* 1056 - * Send the whole thing through to /dev/net/tun. It expects the exact 1057 - * same format: what a coincidence! 1058 - */ 1059 - if (writev(net_info->tunfd, iov, out) < 0) 1060 - warnx("Write to tun failed (%d)?", errno); 1061 - 1062 - /* 1063 - * Done with that one; wait_for_vq_desc() will send the interrupt if 1064 - * all packets are processed. 1065 - */ 1066 - add_used(vq, head, 0); 1067 - } 1068 - 1069 - /* 1070 - * Handling network input is a bit trickier, because I've tried to optimize it. 1071 - * 1072 - * First we have a helper routine which tells is if from this file descriptor 1073 - * (ie. the /dev/net/tun device) will block: 1074 - */ 1075 - static bool will_block(int fd) 1076 - { 1077 - fd_set fdset; 1078 - struct timeval zero = { 0, 0 }; 1079 - FD_ZERO(&fdset); 1080 - FD_SET(fd, &fdset); 1081 - return select(fd+1, &fdset, NULL, NULL, &zero) != 1; 1082 - } 1083 - 1084 - /* 1085 - * This handles packets coming in from the tun device to our Guest. Like all 1086 - * service routines, it gets called again as soon as it returns, so you don't 1087 - * see a while(1) loop here. 1088 - */ 1089 - static void net_input(struct virtqueue *vq) 1090 - { 1091 - int len; 1092 - unsigned int head, out, in; 1093 - struct iovec iov[vq->vring.num]; 1094 - struct net_info *net_info = vq->dev->priv; 1095 - 1096 - /* 1097 - * Get a descriptor to write an incoming packet into. This will also 1098 - * send an interrupt if they're out of descriptors. 1099 - */ 1100 - head = wait_for_vq_desc(vq, iov, &out, &in); 1101 - if (out) 1102 - bad_driver_vq(vq, "Output buffers in net input queue?"); 1103 - 1104 - /* 1105 - * If it looks like we'll block reading from the tun device, send them 1106 - * an interrupt. 
1107 - */ 1108 - if (vq->pending_used && will_block(net_info->tunfd)) 1109 - trigger_irq(vq); 1110 - 1111 - /* 1112 - * Read in the packet. This is where we normally wait (when there's no 1113 - * incoming network traffic). 1114 - */ 1115 - len = readv(net_info->tunfd, iov, in); 1116 - if (len <= 0) 1117 - warn("Failed to read from tun (%d).", errno); 1118 - 1119 - /* 1120 - * Mark that packet buffer as used, but don't interrupt here. We want 1121 - * to wait until we've done as much work as we can. 1122 - */ 1123 - add_used(vq, head, len); 1124 - } 1125 - /*:*/ 1126 - 1127 - /* This is the helper to create threads: run the service routine in a loop. */ 1128 - static int do_thread(void *_vq) 1129 - { 1130 - struct virtqueue *vq = _vq; 1131 - 1132 - for (;;) 1133 - vq->service(vq); 1134 - return 0; 1135 - } 1136 - 1137 - /* 1138 - * When a child dies, we kill our entire process group with SIGTERM. This 1139 - * also has the side effect that the shell restores the console for us! 1140 - */ 1141 - static void kill_launcher(int signal) 1142 - { 1143 - kill(0, SIGTERM); 1144 - } 1145 - 1146 - static void reset_vq_pci_config(struct virtqueue *vq) 1147 - { 1148 - vq->pci_config.queue_size = VIRTQUEUE_NUM; 1149 - vq->pci_config.queue_enable = 0; 1150 - } 1151 - 1152 - static void reset_device(struct device *dev) 1153 - { 1154 - struct virtqueue *vq; 1155 - 1156 - verbose("Resetting device %s\n", dev->name); 1157 - 1158 - /* Clear any features they've acked. */ 1159 - dev->features_accepted = 0; 1160 - 1161 - /* We're going to be explicitly killing threads, so ignore them. */ 1162 - signal(SIGCHLD, SIG_IGN); 1163 - 1164 - /* 1165 - * 4.1.4.3.1: 1166 - * 1167 - * The device MUST present a 0 in queue_enable on reset. 1168 - * 1169 - * This means we set it here, and reset the saved ones in every vq. 
1170 - */ 1171 - dev->mmio->cfg.queue_enable = 0; 1172 - 1173 - /* Get rid of the virtqueue threads */ 1174 - for (vq = dev->vq; vq; vq = vq->next) { 1175 - vq->last_avail_idx = 0; 1176 - reset_vq_pci_config(vq); 1177 - if (vq->thread != (pid_t)-1) { 1178 - kill(vq->thread, SIGTERM); 1179 - waitpid(vq->thread, NULL, 0); 1180 - vq->thread = (pid_t)-1; 1181 - } 1182 - } 1183 - dev->running = false; 1184 - dev->wrote_features_ok = false; 1185 - 1186 - /* Now we care if threads die. */ 1187 - signal(SIGCHLD, (void *)kill_launcher); 1188 - } 1189 - 1190 - static void cleanup_devices(void) 1191 - { 1192 - unsigned int i; 1193 - 1194 - for (i = 1; i < MAX_PCI_DEVICES; i++) { 1195 - struct device *d = devices.pci[i]; 1196 - if (!d) 1197 - continue; 1198 - reset_device(d); 1199 - } 1200 - 1201 - /* If we saved off the original terminal settings, restore them now. */ 1202 - if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) 1203 - tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); 1204 - } 1205 - 1206 - /*L:217 1207 - * We do PCI. This is mainly done to let us test the kernel virtio PCI 1208 - * code. 1209 - */ 1210 - 1211 - /* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */ 1212 - static struct device pci_host_bridge; 1213 - 1214 - static void init_pci_host_bridge(void) 1215 - { 1216 - pci_host_bridge.name = "PCI Host Bridge"; 1217 - pci_host_bridge.config.class = 0x06; /* bridge */ 1218 - pci_host_bridge.config.subclass = 0; /* host bridge */ 1219 - devices.pci[0] = &pci_host_bridge; 1220 - } 1221 - 1222 - /* The IO ports used to read the PCI config space. */ 1223 - #define PCI_CONFIG_ADDR 0xCF8 1224 - #define PCI_CONFIG_DATA 0xCFC 1225 - 1226 - /* 1227 - * Not really portable, but does help readability: this is what the Guest 1228 - * writes to the PCI_CONFIG_ADDR IO port. 
1229 - */ 1230 - union pci_config_addr { 1231 - struct { 1232 - unsigned mbz: 2; 1233 - unsigned offset: 6; 1234 - unsigned funcnum: 3; 1235 - unsigned devnum: 5; 1236 - unsigned busnum: 8; 1237 - unsigned reserved: 7; 1238 - unsigned enabled : 1; 1239 - } bits; 1240 - u32 val; 1241 - }; 1242 - 1243 - /* 1244 - * We cache what they wrote to the address port, so we know what they're 1245 - * talking about when they access the data port. 1246 - */ 1247 - static union pci_config_addr pci_config_addr; 1248 - 1249 - static struct device *find_pci_device(unsigned int index) 1250 - { 1251 - return devices.pci[index]; 1252 - } 1253 - 1254 - /* PCI can do 1, 2 and 4 byte reads; we handle that here. */ 1255 - static void ioread(u16 off, u32 v, u32 mask, u32 *val) 1256 - { 1257 - assert(off < 4); 1258 - assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF); 1259 - *val = (v >> (off * 8)) & mask; 1260 - } 1261 - 1262 - /* PCI can do 1, 2 and 4 byte writes; we handle that here. */ 1263 - static void iowrite(u16 off, u32 v, u32 mask, u32 *dst) 1264 - { 1265 - assert(off < 4); 1266 - assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF); 1267 - *dst &= ~(mask << (off * 8)); 1268 - *dst |= (v & mask) << (off * 8); 1269 - } 1270 - 1271 - /* 1272 - * Where PCI_CONFIG_DATA accesses depends on the previous write to 1273 - * PCI_CONFIG_ADDR. 
1274 - */ 1275 - static struct device *dev_and_reg(u32 *reg) 1276 - { 1277 - if (!pci_config_addr.bits.enabled) 1278 - return NULL; 1279 - 1280 - if (pci_config_addr.bits.funcnum != 0) 1281 - return NULL; 1282 - 1283 - if (pci_config_addr.bits.busnum != 0) 1284 - return NULL; 1285 - 1286 - if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config)) 1287 - return NULL; 1288 - 1289 - *reg = pci_config_addr.bits.offset; 1290 - return find_pci_device(pci_config_addr.bits.devnum); 1291 - } 1292 - 1293 - /* 1294 - * We can get invalid combinations of values while they're writing, so we 1295 - * only fault if they try to write with some invalid bar/offset/length. 1296 - */ 1297 - static bool valid_bar_access(struct device *d, 1298 - struct virtio_pci_cfg_cap_u32 *cfg_access) 1299 - { 1300 - /* We only have 1 bar (BAR0) */ 1301 - if (cfg_access->cap.bar != 0) 1302 - return false; 1303 - 1304 - /* Check it's within BAR0. */ 1305 - if (cfg_access->cap.offset >= d->mmio_size 1306 - || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size) 1307 - return false; 1308 - 1309 - /* Check length is 1, 2 or 4. */ 1310 - if (cfg_access->cap.length != 1 1311 - && cfg_access->cap.length != 2 1312 - && cfg_access->cap.length != 4) 1313 - return false; 1314 - 1315 - /* 1316 - * 4.1.4.7.2: 1317 - * 1318 - * The driver MUST NOT write a cap.offset which is not a multiple of 1319 - * cap.length (ie. all accesses MUST be aligned). 1320 - */ 1321 - if (cfg_access->cap.offset % cfg_access->cap.length != 0) 1322 - return false; 1323 - 1324 - /* Return pointer into word in BAR0. */ 1325 - return true; 1326 - } 1327 - 1328 - /* Is this accessing the PCI config address port?. 
*/ 1329 - static bool is_pci_addr_port(u16 port) 1330 - { 1331 - return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4; 1332 - } 1333 - 1334 - static bool pci_addr_iowrite(u16 port, u32 mask, u32 val) 1335 - { 1336 - iowrite(port - PCI_CONFIG_ADDR, val, mask, 1337 - &pci_config_addr.val); 1338 - verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n", 1339 - pci_config_addr.bits.enabled ? "" : " DISABLED", 1340 - val, mask, 1341 - pci_config_addr.bits.busnum, 1342 - pci_config_addr.bits.devnum, 1343 - pci_config_addr.bits.funcnum, 1344 - pci_config_addr.bits.offset); 1345 - return true; 1346 - } 1347 - 1348 - static void pci_addr_ioread(u16 port, u32 mask, u32 *val) 1349 - { 1350 - ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val); 1351 - } 1352 - 1353 - /* Is this accessing the PCI config data port?. */ 1354 - static bool is_pci_data_port(u16 port) 1355 - { 1356 - return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4; 1357 - } 1358 - 1359 - static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask); 1360 - 1361 - static bool pci_data_iowrite(u16 port, u32 mask, u32 val) 1362 - { 1363 - u32 reg, portoff; 1364 - struct device *d = dev_and_reg(&reg); 1365 - 1366 - /* Complain if they don't belong to a device. */ 1367 - if (!d) 1368 - return false; 1369 - 1370 - /* They can do 1 byte writes, etc. */ 1371 - portoff = port - PCI_CONFIG_DATA; 1372 - 1373 - /* 1374 - * PCI uses a weird way to determine the BAR size: the OS 1375 - * writes all 1's, and sees which ones stick. 
1376 - */ 1377 - if (&d->config_words[reg] == &d->config.bar[0]) { 1378 - int i; 1379 - 1380 - iowrite(portoff, val, mask, &d->config.bar[0]); 1381 - for (i = 0; (1 << i) < d->mmio_size; i++) 1382 - d->config.bar[0] &= ~(1 << i); 1383 - return true; 1384 - } else if ((&d->config_words[reg] > &d->config.bar[0] 1385 - && &d->config_words[reg] <= &d->config.bar[6]) 1386 - || &d->config_words[reg] == &d->config.expansion_rom_addr) { 1387 - /* Allow writing to any other BAR, or expansion ROM */ 1388 - iowrite(portoff, val, mask, &d->config_words[reg]); 1389 - return true; 1390 - /* We let them override latency timer and cacheline size */ 1391 - } else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) { 1392 - /* Only let them change the first two fields. */ 1393 - if (mask == 0xFFFFFFFF) 1394 - mask = 0xFFFF; 1395 - iowrite(portoff, val, mask, &d->config_words[reg]); 1396 - return true; 1397 - } else if (&d->config_words[reg] == (void *)&d->config.command 1398 - && mask == 0xFFFF) { 1399 - /* Ignore command writes. */ 1400 - return true; 1401 - } else if (&d->config_words[reg] 1402 - == (void *)&d->config.cfg_access.cap.bar 1403 - || &d->config_words[reg] 1404 - == &d->config.cfg_access.cap.length 1405 - || &d->config_words[reg] 1406 - == &d->config.cfg_access.cap.offset) { 1407 - 1408 - /* 1409 - * The VIRTIO_PCI_CAP_PCI_CFG capability 1410 - * provides a backdoor to access the MMIO 1411 - * regions without mapping them. Weird, but 1412 - * useful. 1413 - */ 1414 - iowrite(portoff, val, mask, &d->config_words[reg]); 1415 - return true; 1416 - } else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) { 1417 - u32 write_mask; 1418 - 1419 - /* 1420 - * 4.1.4.7.1: 1421 - * 1422 - * Upon detecting driver write access to pci_cfg_data, the 1423 - * device MUST execute a write access at offset cap.offset at 1424 - * BAR selected by cap.bar using the first cap.length bytes 1425 - * from pci_cfg_data. 
1426 - */ 1427 - 1428 - /* Must be bar 0 */ 1429 - if (!valid_bar_access(d, &d->config.cfg_access)) 1430 - return false; 1431 - 1432 - iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data); 1433 - 1434 - /* 1435 - * Now emulate a write. The mask we use is set by 1436 - * len, *not* this write! 1437 - */ 1438 - write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1; 1439 - verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n", 1440 - d->config.cfg_access.pci_cfg_data, write_mask, 1441 - d->config.cfg_access.cap.bar, 1442 - d->config.cfg_access.cap.offset, 1443 - d->config.cfg_access.cap.length); 1444 - 1445 - emulate_mmio_write(d, d->config.cfg_access.cap.offset, 1446 - d->config.cfg_access.pci_cfg_data, 1447 - write_mask); 1448 - return true; 1449 - } 1450 - 1451 - /* 1452 - * 4.1.4.1: 1453 - * 1454 - * The driver MUST NOT write into any field of the capability 1455 - * structure, with the exception of those with cap_type 1456 - * VIRTIO_PCI_CAP_PCI_CFG... 1457 - */ 1458 - return false; 1459 - } 1460 - 1461 - static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask); 1462 - 1463 - static void pci_data_ioread(u16 port, u32 mask, u32 *val) 1464 - { 1465 - u32 reg; 1466 - struct device *d = dev_and_reg(&reg); 1467 - 1468 - if (!d) 1469 - return; 1470 - 1471 - /* Read through the PCI MMIO access window is special */ 1472 - if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) { 1473 - u32 read_mask; 1474 - 1475 - /* 1476 - * 4.1.4.7.1: 1477 - * 1478 - * Upon detecting driver read access to pci_cfg_data, the 1479 - * device MUST execute a read access of length cap.length at 1480 - * offset cap.offset at BAR selected by cap.bar and store the 1481 - * first cap.length bytes in pci_cfg_data. 
1482 - */ 1483 - /* Must be bar 0 */ 1484 - if (!valid_bar_access(d, &d->config.cfg_access)) 1485 - bad_driver(d, 1486 - "Invalid cfg_access to bar%u, offset %u len %u", 1487 - d->config.cfg_access.cap.bar, 1488 - d->config.cfg_access.cap.offset, 1489 - d->config.cfg_access.cap.length); 1490 - 1491 - /* 1492 - * Read into the window. The mask we use is set by 1493 - * len, *not* this read! 1494 - */ 1495 - read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1; 1496 - d->config.cfg_access.pci_cfg_data 1497 - = emulate_mmio_read(d, 1498 - d->config.cfg_access.cap.offset, 1499 - read_mask); 1500 - verbose("Window read %#x/%#x from bar %u, offset %u len %u\n", 1501 - d->config.cfg_access.pci_cfg_data, read_mask, 1502 - d->config.cfg_access.cap.bar, 1503 - d->config.cfg_access.cap.offset, 1504 - d->config.cfg_access.cap.length); 1505 - } 1506 - ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val); 1507 - } 1508 - 1509 - /*L:216 1510 - * This is where we emulate a handful of Guest instructions. It's ugly 1511 - * and we used to do it in the kernel but it grew over time. 1512 - */ 1513 - 1514 - /* 1515 - * We use the ptrace syscall's pt_regs struct to talk about registers 1516 - * to lguest: these macros convert the names to the offsets. 
1517 - */ 1518 - #define getreg(name) getreg_off(offsetof(struct user_regs_struct, name)) 1519 - #define setreg(name, val) \ 1520 - setreg_off(offsetof(struct user_regs_struct, name), (val)) 1521 - 1522 - static u32 getreg_off(size_t offset) 1523 - { 1524 - u32 r; 1525 - unsigned long args[] = { LHREQ_GETREG, offset }; 1526 - 1527 - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) 1528 - err(1, "Getting register %u", offset); 1529 - if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r)) 1530 - err(1, "Reading register %u", offset); 1531 - 1532 - return r; 1533 - } 1534 - 1535 - static void setreg_off(size_t offset, u32 val) 1536 - { 1537 - unsigned long args[] = { LHREQ_SETREG, offset, val }; 1538 - 1539 - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) 1540 - err(1, "Setting register %u", offset); 1541 - } 1542 - 1543 - /* Get register by instruction encoding */ 1544 - static u32 getreg_num(unsigned regnum, u32 mask) 1545 - { 1546 - /* 8 bit ops use regnums 4-7 for high parts of word */ 1547 - if (mask == 0xFF && (regnum & 0x4)) 1548 - return getreg_num(regnum & 0x3, 0xFFFF) >> 8; 1549 - 1550 - switch (regnum) { 1551 - case 0: return getreg(eax) & mask; 1552 - case 1: return getreg(ecx) & mask; 1553 - case 2: return getreg(edx) & mask; 1554 - case 3: return getreg(ebx) & mask; 1555 - case 4: return getreg(esp) & mask; 1556 - case 5: return getreg(ebp) & mask; 1557 - case 6: return getreg(esi) & mask; 1558 - case 7: return getreg(edi) & mask; 1559 - } 1560 - abort(); 1561 - } 1562 - 1563 - /* Set register by instruction encoding */ 1564 - static void setreg_num(unsigned regnum, u32 val, u32 mask) 1565 - { 1566 - /* Don't try to set bits out of range */ 1567 - assert(~(val & ~mask)); 1568 - 1569 - /* 8 bit ops use regnums 4-7 for high parts of word */ 1570 - if (mask == 0xFF && (regnum & 0x4)) { 1571 - /* Construct the 16 bits we want. 
*/ 1572 - val = (val << 8) | getreg_num(regnum & 0x3, 0xFF); 1573 - setreg_num(regnum & 0x3, val, 0xFFFF); 1574 - return; 1575 - } 1576 - 1577 - switch (regnum) { 1578 - case 0: setreg(eax, val | (getreg(eax) & ~mask)); return; 1579 - case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return; 1580 - case 2: setreg(edx, val | (getreg(edx) & ~mask)); return; 1581 - case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return; 1582 - case 4: setreg(esp, val | (getreg(esp) & ~mask)); return; 1583 - case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return; 1584 - case 6: setreg(esi, val | (getreg(esi) & ~mask)); return; 1585 - case 7: setreg(edi, val | (getreg(edi) & ~mask)); return; 1586 - } 1587 - abort(); 1588 - } 1589 - 1590 - /* Get bytes of displacement appended to instruction, from r/m encoding */ 1591 - static u32 insn_displacement_len(u8 mod_reg_rm) 1592 - { 1593 - /* Switch on the mod bits */ 1594 - switch (mod_reg_rm >> 6) { 1595 - case 0: 1596 - /* If mod == 0, and r/m == 101, 16-bit displacement follows */ 1597 - if ((mod_reg_rm & 0x7) == 0x5) 1598 - return 2; 1599 - /* Normally, mod == 0 means no literal displacement */ 1600 - return 0; 1601 - case 1: 1602 - /* One byte displacement */ 1603 - return 1; 1604 - case 2: 1605 - /* Four byte displacement */ 1606 - return 4; 1607 - case 3: 1608 - /* Register mode */ 1609 - return 0; 1610 - } 1611 - abort(); 1612 - } 1613 - 1614 - static void emulate_insn(const u8 insn[]) 1615 - { 1616 - unsigned long args[] = { LHREQ_TRAP, 13 }; 1617 - unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access; 1618 - unsigned int eax, port, mask; 1619 - /* 1620 - * Default is to return all-ones on IO port reads, which traditionally 1621 - * means "there's nothing there". 1622 - */ 1623 - u32 val = 0xFFFFFFFF; 1624 - 1625 - /* 1626 - * This must be the Guest kernel trying to do something, not userspace! 1627 - * The bottom two bits of the CS segment register are the privilege 1628 - * level. 
1629 - */ 1630 - if ((getreg(xcs) & 3) != 0x1) 1631 - goto no_emulate; 1632 - 1633 - /* Decoding x86 instructions is icky. */ 1634 - 1635 - /* 1636 - * Around 2.6.33, the kernel started using an emulation for the 1637 - * cmpxchg8b instruction in early boot on many configurations. This 1638 - * code isn't paravirtualized, and it tries to disable interrupts. 1639 - * Ignore it, which will Mostly Work. 1640 - */ 1641 - if (insn[insnlen] == 0xfa) { 1642 - /* "cli", or Clear Interrupt Enable instruction. Skip it. */ 1643 - insnlen = 1; 1644 - goto skip_insn; 1645 - } 1646 - 1647 - /* 1648 - * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. 1649 - */ 1650 - if (insn[insnlen] == 0x66) { 1651 - small_operand = 1; 1652 - /* The instruction is 1 byte so far, read the next byte. */ 1653 - insnlen = 1; 1654 - } 1655 - 1656 - /* If the lower bit isn't set, it's a single byte access */ 1657 - byte_access = !(insn[insnlen] & 1); 1658 - 1659 - /* 1660 - * Now we can ignore the lower bit and decode the 4 opcodes 1661 - * we need to emulate. 1662 - */ 1663 - switch (insn[insnlen] & 0xFE) { 1664 - case 0xE4: /* in <next byte>,%al */ 1665 - port = insn[insnlen+1]; 1666 - insnlen += 2; 1667 - in = 1; 1668 - break; 1669 - case 0xEC: /* in (%dx),%al */ 1670 - port = getreg(edx) & 0xFFFF; 1671 - insnlen += 1; 1672 - in = 1; 1673 - break; 1674 - case 0xE6: /* out %al,<next byte> */ 1675 - port = insn[insnlen+1]; 1676 - insnlen += 2; 1677 - break; 1678 - case 0xEE: /* out %al,(%dx) */ 1679 - port = getreg(edx) & 0xFFFF; 1680 - insnlen += 1; 1681 - break; 1682 - default: 1683 - /* OK, we don't know what this is, can't emulate. 
*/ 1684 - goto no_emulate; 1685 - } 1686 - 1687 - /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */ 1688 - if (byte_access) 1689 - mask = 0xFF; 1690 - else if (small_operand) 1691 - mask = 0xFFFF; 1692 - else 1693 - mask = 0xFFFFFFFF; 1694 - 1695 - /* 1696 - * If it was an "IN" instruction, they expect the result to be read 1697 - * into %eax, so we change %eax. 1698 - */ 1699 - eax = getreg(eax); 1700 - 1701 - if (in) { 1702 - /* This is the PS/2 keyboard status; 1 means ready for output */ 1703 - if (port == 0x64) 1704 - val = 1; 1705 - else if (is_pci_addr_port(port)) 1706 - pci_addr_ioread(port, mask, &val); 1707 - else if (is_pci_data_port(port)) 1708 - pci_data_ioread(port, mask, &val); 1709 - 1710 - /* Clear the bits we're about to read */ 1711 - eax &= ~mask; 1712 - /* Copy bits in from val. */ 1713 - eax |= val & mask; 1714 - /* Now update the register. */ 1715 - setreg(eax, eax); 1716 - } else { 1717 - if (is_pci_addr_port(port)) { 1718 - if (!pci_addr_iowrite(port, mask, eax)) 1719 - goto bad_io; 1720 - } else if (is_pci_data_port(port)) { 1721 - if (!pci_data_iowrite(port, mask, eax)) 1722 - goto bad_io; 1723 - } 1724 - /* There are many other ports, eg. CMOS clock, serial 1725 - * and parallel ports, so we ignore them all. */ 1726 - } 1727 - 1728 - verbose("IO %s of %x to %u: %#08x\n", 1729 - in ? "IN" : "OUT", mask, port, eax); 1730 - skip_insn: 1731 - /* Finally, we've "done" the instruction, so move past it. */ 1732 - setreg(eip, getreg(eip) + insnlen); 1733 - return; 1734 - 1735 - bad_io: 1736 - warnx("Attempt to %s port %u (%#x mask)", 1737 - in ? "read from" : "write to", port, mask); 1738 - 1739 - no_emulate: 1740 - /* Inject trap into Guest. 
*/ 1741 - if (write(lguest_fd, args, sizeof(args)) < 0) 1742 - err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip)); 1743 - } 1744 - 1745 - static struct device *find_mmio_region(unsigned long paddr, u32 *off) 1746 - { 1747 - unsigned int i; 1748 - 1749 - for (i = 1; i < MAX_PCI_DEVICES; i++) { 1750 - struct device *d = devices.pci[i]; 1751 - 1752 - if (!d) 1753 - continue; 1754 - if (paddr < d->mmio_addr) 1755 - continue; 1756 - if (paddr >= d->mmio_addr + d->mmio_size) 1757 - continue; 1758 - *off = paddr - d->mmio_addr; 1759 - return d; 1760 - } 1761 - return NULL; 1762 - } 1763 - 1764 - /* FIXME: Use vq array. */ 1765 - static struct virtqueue *vq_by_num(struct device *d, u32 num) 1766 - { 1767 - struct virtqueue *vq = d->vq; 1768 - 1769 - while (num-- && vq) 1770 - vq = vq->next; 1771 - 1772 - return vq; 1773 - } 1774 - 1775 - static void save_vq_config(const struct virtio_pci_common_cfg *cfg, 1776 - struct virtqueue *vq) 1777 - { 1778 - vq->pci_config = *cfg; 1779 - } 1780 - 1781 - static void restore_vq_config(struct virtio_pci_common_cfg *cfg, 1782 - struct virtqueue *vq) 1783 - { 1784 - /* Only restore the per-vq part */ 1785 - size_t off = offsetof(struct virtio_pci_common_cfg, queue_size); 1786 - 1787 - memcpy((void *)cfg + off, (void *)&vq->pci_config + off, 1788 - sizeof(*cfg) - off); 1789 - } 1790 - 1791 - /* 1792 - * 4.1.4.3.2: 1793 - * 1794 - * The driver MUST configure the other virtqueue fields before 1795 - * enabling the virtqueue with queue_enable. 1796 - * 1797 - * When they enable the virtqueue, we check that their setup is valid. 
1798 - */ 1799 - static void check_virtqueue(struct device *d, struct virtqueue *vq) 1800 - { 1801 - /* Because lguest is 32 bit, all the descriptor high bits must be 0 */ 1802 - if (vq->pci_config.queue_desc_hi 1803 - || vq->pci_config.queue_avail_hi 1804 - || vq->pci_config.queue_used_hi) 1805 - bad_driver_vq(vq, "invalid 64-bit queue address"); 1806 - 1807 - /* 1808 - * 2.4.1: 1809 - * 1810 - * The driver MUST ensure that the physical address of the first byte 1811 - * of each virtqueue part is a multiple of the specified alignment 1812 - * value in the above table. 1813 - */ 1814 - if (vq->pci_config.queue_desc_lo % 16 1815 - || vq->pci_config.queue_avail_lo % 2 1816 - || vq->pci_config.queue_used_lo % 4) 1817 - bad_driver_vq(vq, "invalid alignment in queue addresses"); 1818 - 1819 - /* Initialize the virtqueue and check they're all in range. */ 1820 - vq->vring.num = vq->pci_config.queue_size; 1821 - vq->vring.desc = check_pointer(vq->dev, 1822 - vq->pci_config.queue_desc_lo, 1823 - sizeof(*vq->vring.desc) * vq->vring.num); 1824 - vq->vring.avail = check_pointer(vq->dev, 1825 - vq->pci_config.queue_avail_lo, 1826 - sizeof(*vq->vring.avail) 1827 - + (sizeof(vq->vring.avail->ring[0]) 1828 - * vq->vring.num)); 1829 - vq->vring.used = check_pointer(vq->dev, 1830 - vq->pci_config.queue_used_lo, 1831 - sizeof(*vq->vring.used) 1832 - + (sizeof(vq->vring.used->ring[0]) 1833 - * vq->vring.num)); 1834 - 1835 - /* 1836 - * 2.4.9.1: 1837 - * 1838 - * The driver MUST initialize flags in the used ring to 0 1839 - * when allocating the used ring. 1840 - */ 1841 - if (vq->vring.used->flags != 0) 1842 - bad_driver_vq(vq, "invalid initial used.flags %#x", 1843 - vq->vring.used->flags); 1844 - } 1845 - 1846 - static void start_virtqueue(struct virtqueue *vq) 1847 - { 1848 - /* 1849 - * Create stack for thread. Since the stack grows upwards, we point 1850 - * the stack pointer to the end of this region. 
1851 - */ 1852 - char *stack = malloc(32768); 1853 - 1854 - /* Create a zero-initialized eventfd. */ 1855 - vq->eventfd = eventfd(0, 0); 1856 - if (vq->eventfd < 0) 1857 - err(1, "Creating eventfd"); 1858 - 1859 - /* 1860 - * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so 1861 - * we get a signal if it dies. 1862 - */ 1863 - vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); 1864 - if (vq->thread == (pid_t)-1) 1865 - err(1, "Creating clone"); 1866 - } 1867 - 1868 - static void start_virtqueues(struct device *d) 1869 - { 1870 - struct virtqueue *vq; 1871 - 1872 - for (vq = d->vq; vq; vq = vq->next) { 1873 - if (vq->pci_config.queue_enable) 1874 - start_virtqueue(vq); 1875 - } 1876 - } 1877 - 1878 - static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask) 1879 - { 1880 - struct virtqueue *vq; 1881 - 1882 - switch (off) { 1883 - case offsetof(struct virtio_pci_mmio, cfg.device_feature_select): 1884 - /* 1885 - * 4.1.4.3.1: 1886 - * 1887 - * The device MUST present the feature bits it is offering in 1888 - * device_feature, starting at bit device_feature_select ∗ 32 1889 - * for any device_feature_select written by the driver 1890 - */ 1891 - if (val == 0) 1892 - d->mmio->cfg.device_feature = d->features; 1893 - else if (val == 1) 1894 - d->mmio->cfg.device_feature = (d->features >> 32); 1895 - else 1896 - d->mmio->cfg.device_feature = 0; 1897 - goto feature_write_through32; 1898 - case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select): 1899 - if (val > 1) 1900 - bad_driver(d, "Unexpected driver select %u", val); 1901 - goto feature_write_through32; 1902 - case offsetof(struct virtio_pci_mmio, cfg.guest_feature): 1903 - if (d->mmio->cfg.guest_feature_select == 0) { 1904 - d->features_accepted &= ~((u64)0xFFFFFFFF); 1905 - d->features_accepted |= val; 1906 - } else { 1907 - assert(d->mmio->cfg.guest_feature_select == 1); 1908 - d->features_accepted &= 0xFFFFFFFF; 1909 - d->features_accepted |= 
((u64)val) << 32; 1910 - } 1911 - /* 1912 - * 2.2.1: 1913 - * 1914 - * The driver MUST NOT accept a feature which the device did 1915 - * not offer 1916 - */ 1917 - if (d->features_accepted & ~d->features) 1918 - bad_driver(d, "over-accepted features %#llx of %#llx", 1919 - d->features_accepted, d->features); 1920 - goto feature_write_through32; 1921 - case offsetof(struct virtio_pci_mmio, cfg.device_status): { 1922 - u8 prev; 1923 - 1924 - verbose("%s: device status -> %#x\n", d->name, val); 1925 - /* 1926 - * 4.1.4.3.1: 1927 - * 1928 - * The device MUST reset when 0 is written to device_status, 1929 - * and present a 0 in device_status once that is done. 1930 - */ 1931 - if (val == 0) { 1932 - reset_device(d); 1933 - goto write_through8; 1934 - } 1935 - 1936 - /* 2.1.1: The driver MUST NOT clear a device status bit. */ 1937 - if (d->mmio->cfg.device_status & ~val) 1938 - bad_driver(d, "unset of device status bit %#x -> %#x", 1939 - d->mmio->cfg.device_status, val); 1940 - 1941 - /* 1942 - * 2.1.2: 1943 - * 1944 - * The device MUST NOT consume buffers or notify the driver 1945 - * before DRIVER_OK. 1946 - */ 1947 - if (val & VIRTIO_CONFIG_S_DRIVER_OK 1948 - && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)) 1949 - start_virtqueues(d); 1950 - 1951 - /* 1952 - * 3.1.1: 1953 - * 1954 - * The driver MUST follow this sequence to initialize a device: 1955 - * - Reset the device. 1956 - * - Set the ACKNOWLEDGE status bit: the guest OS has 1957 - * notice the device. 1958 - * - Set the DRIVER status bit: the guest OS knows how 1959 - * to drive the device. 1960 - * - Read device feature bits, and write the subset 1961 - * of feature bits understood by the OS and driver 1962 - * to the device. During this step the driver MAY 1963 - * read (but MUST NOT write) the device-specific 1964 - * configuration fields to check that it can 1965 - * support the device before accepting it. 1966 - * - Set the FEATURES_OK status bit. 
The driver 1967 - * MUST not accept new feature bits after this 1968 - * step. 1969 - * - Re-read device status to ensure the FEATURES_OK 1970 - * bit is still set: otherwise, the device does 1971 - * not support our subset of features and the 1972 - * device is unusable. 1973 - * - Perform device-specific setup, including 1974 - * discovery of virtqueues for the device, 1975 - * optional per-bus setup, reading and possibly 1976 - * writing the device’s virtio configuration 1977 - * space, and population of virtqueues. 1978 - * - Set the DRIVER_OK status bit. At this point the 1979 - * device is “live”. 1980 - */ 1981 - prev = 0; 1982 - switch (val & ~d->mmio->cfg.device_status) { 1983 - case VIRTIO_CONFIG_S_DRIVER_OK: 1984 - prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */ 1985 - case VIRTIO_CONFIG_S_FEATURES_OK: 1986 - prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */ 1987 - case VIRTIO_CONFIG_S_DRIVER: 1988 - prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */ 1989 - case VIRTIO_CONFIG_S_ACKNOWLEDGE: 1990 - break; 1991 - default: 1992 - bad_driver(d, "unknown device status bit %#x -> %#x", 1993 - d->mmio->cfg.device_status, val); 1994 - } 1995 - if (d->mmio->cfg.device_status != prev) 1996 - bad_driver(d, "unexpected status transition %#x -> %#x", 1997 - d->mmio->cfg.device_status, val); 1998 - 1999 - /* If they just wrote FEATURES_OK, we make sure they read */ 2000 - switch (val & ~d->mmio->cfg.device_status) { 2001 - case VIRTIO_CONFIG_S_FEATURES_OK: 2002 - d->wrote_features_ok = true; 2003 - break; 2004 - case VIRTIO_CONFIG_S_DRIVER_OK: 2005 - if (d->wrote_features_ok) 2006 - bad_driver(d, "did not re-read FEATURES_OK"); 2007 - break; 2008 - } 2009 - goto write_through8; 2010 - } 2011 - case offsetof(struct virtio_pci_mmio, cfg.queue_select): 2012 - vq = vq_by_num(d, val); 2013 - /* 2014 - * 4.1.4.3.1: 2015 - * 2016 - * The device MUST present a 0 in queue_size if the virtqueue 2017 - * corresponding to the current queue_select is unavailable. 
2018 - */ 2019 - if (!vq) { 2020 - d->mmio->cfg.queue_size = 0; 2021 - goto write_through16; 2022 - } 2023 - /* Save registers for old vq, if it was a valid vq */ 2024 - if (d->mmio->cfg.queue_size) 2025 - save_vq_config(&d->mmio->cfg, 2026 - vq_by_num(d, d->mmio->cfg.queue_select)); 2027 - /* Restore the registers for the queue they asked for */ 2028 - restore_vq_config(&d->mmio->cfg, vq); 2029 - goto write_through16; 2030 - case offsetof(struct virtio_pci_mmio, cfg.queue_size): 2031 - /* 2032 - * 4.1.4.3.2: 2033 - * 2034 - * The driver MUST NOT write a value which is not a power of 2 2035 - * to queue_size. 2036 - */ 2037 - if (val & (val-1)) 2038 - bad_driver(d, "invalid queue size %u", val); 2039 - if (d->mmio->cfg.queue_enable) 2040 - bad_driver(d, "changing queue size on live device"); 2041 - goto write_through16; 2042 - case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector): 2043 - bad_driver(d, "attempt to set MSIX vector to %u", val); 2044 - case offsetof(struct virtio_pci_mmio, cfg.queue_enable): { 2045 - struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select); 2046 - 2047 - /* 2048 - * 4.1.4.3.2: 2049 - * 2050 - * The driver MUST NOT write a 0 to queue_enable. 2051 - */ 2052 - if (val != 1) 2053 - bad_driver(d, "setting queue_enable to %u", val); 2054 - 2055 - /* 2056 - * 3.1.1: 2057 - * 2058 - * 7. Perform device-specific setup, including discovery of 2059 - * virtqueues for the device, optional per-bus setup, 2060 - * reading and possibly writing the device’s virtio 2061 - * configuration space, and population of virtqueues. 2062 - * 8. Set the DRIVER_OK status bit. 2063 - * 2064 - * All our devices require all virtqueues to be enabled, so 2065 - * they should have done that before setting DRIVER_OK. 
2066 - */ 2067 - if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK) 2068 - bad_driver(d, "enabling vq after DRIVER_OK"); 2069 - 2070 - d->mmio->cfg.queue_enable = val; 2071 - save_vq_config(&d->mmio->cfg, vq); 2072 - check_virtqueue(d, vq); 2073 - goto write_through16; 2074 - } 2075 - case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off): 2076 - bad_driver(d, "attempt to write to queue_notify_off"); 2077 - case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo): 2078 - case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi): 2079 - case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo): 2080 - case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi): 2081 - case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo): 2082 - case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi): 2083 - /* 2084 - * 4.1.4.3.2: 2085 - * 2086 - * The driver MUST configure the other virtqueue fields before 2087 - * enabling the virtqueue with queue_enable. 2088 - */ 2089 - if (d->mmio->cfg.queue_enable) 2090 - bad_driver(d, "changing queue on live device"); 2091 - 2092 - /* 2093 - * 3.1.1: 2094 - * 2095 - * The driver MUST follow this sequence to initialize a device: 2096 - *... 2097 - * 5. Set the FEATURES_OK status bit. The driver MUST not 2098 - * accept new feature bits after this step. 2099 - */ 2100 - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)) 2101 - bad_driver(d, "setting up vq before FEATURES_OK"); 2102 - 2103 - /* 2104 - * 6. Re-read device status to ensure the FEATURES_OK bit is 2105 - * still set... 
2106 - */ 2107 - if (d->wrote_features_ok) 2108 - bad_driver(d, "didn't re-read FEATURES_OK before setup"); 2109 - 2110 - goto write_through32; 2111 - case offsetof(struct virtio_pci_mmio, notify): 2112 - vq = vq_by_num(d, val); 2113 - if (!vq) 2114 - bad_driver(d, "Invalid vq notification on %u", val); 2115 - /* Notify the process handling this vq by adding 1 to eventfd */ 2116 - write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8); 2117 - goto write_through16; 2118 - case offsetof(struct virtio_pci_mmio, isr): 2119 - bad_driver(d, "Unexpected write to isr"); 2120 - /* Weird corner case: write to emerg_wr of console */ 2121 - case sizeof(struct virtio_pci_mmio) 2122 - + offsetof(struct virtio_console_config, emerg_wr): 2123 - if (strcmp(d->name, "console") == 0) { 2124 - char c = val; 2125 - write(STDOUT_FILENO, &c, 1); 2126 - goto write_through32; 2127 - } 2128 - /* Fall through... */ 2129 - default: 2130 - /* 2131 - * 4.1.4.3.2: 2132 - * 2133 - * The driver MUST NOT write to device_feature, num_queues, 2134 - * config_generation or queue_notify_off. 2135 - */ 2136 - bad_driver(d, "Unexpected write to offset %u", off); 2137 - } 2138 - 2139 - feature_write_through32: 2140 - /* 2141 - * 3.1.1: 2142 - * 2143 - * The driver MUST follow this sequence to initialize a device: 2144 - *... 2145 - * - Set the DRIVER status bit: the guest OS knows how 2146 - * to drive the device. 2147 - * - Read device feature bits, and write the subset 2148 - * of feature bits understood by the OS and driver 2149 - * to the device. 2150 - *... 2151 - * - Set the FEATURES_OK status bit. The driver MUST not 2152 - * accept new feature bits after this step. 
2153 - */ 2154 - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) 2155 - bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER"); 2156 - if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK) 2157 - bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK"); 2158 - 2159 - /* 2160 - * 4.1.3.1: 2161 - * 2162 - * The driver MUST access each field using the “natural” access 2163 - * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for 2164 - * 16-bit fields and 8-bit accesses for 8-bit fields. 2165 - */ 2166 - write_through32: 2167 - if (mask != 0xFFFFFFFF) { 2168 - bad_driver(d, "non-32-bit write to offset %u (%#x)", 2169 - off, getreg(eip)); 2170 - return; 2171 - } 2172 - memcpy((char *)d->mmio + off, &val, 4); 2173 - return; 2174 - 2175 - write_through16: 2176 - if (mask != 0xFFFF) 2177 - bad_driver(d, "non-16-bit write to offset %u (%#x)", 2178 - off, getreg(eip)); 2179 - memcpy((char *)d->mmio + off, &val, 2); 2180 - return; 2181 - 2182 - write_through8: 2183 - if (mask != 0xFF) 2184 - bad_driver(d, "non-8-bit write to offset %u (%#x)", 2185 - off, getreg(eip)); 2186 - memcpy((char *)d->mmio + off, &val, 1); 2187 - return; 2188 - } 2189 - 2190 - static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask) 2191 - { 2192 - u8 isr; 2193 - u32 val = 0; 2194 - 2195 - switch (off) { 2196 - case offsetof(struct virtio_pci_mmio, cfg.device_feature_select): 2197 - case offsetof(struct virtio_pci_mmio, cfg.device_feature): 2198 - case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select): 2199 - case offsetof(struct virtio_pci_mmio, cfg.guest_feature): 2200 - /* 2201 - * 3.1.1: 2202 - * 2203 - * The driver MUST follow this sequence to initialize a device: 2204 - *... 2205 - * - Set the DRIVER status bit: the guest OS knows how 2206 - * to drive the device. 2207 - * - Read device feature bits, and write the subset 2208 - * of feature bits understood by the OS and driver 2209 - * to the device. 
2210 - */ 2211 - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) 2212 - bad_driver(d, 2213 - "feature read before VIRTIO_CONFIG_S_DRIVER"); 2214 - goto read_through32; 2215 - case offsetof(struct virtio_pci_mmio, cfg.msix_config): 2216 - bad_driver(d, "read of msix_config"); 2217 - case offsetof(struct virtio_pci_mmio, cfg.num_queues): 2218 - goto read_through16; 2219 - case offsetof(struct virtio_pci_mmio, cfg.device_status): 2220 - /* As they did read, any write of FEATURES_OK is now fine. */ 2221 - d->wrote_features_ok = false; 2222 - goto read_through8; 2223 - case offsetof(struct virtio_pci_mmio, cfg.config_generation): 2224 - /* 2225 - * 4.1.4.3.1: 2226 - * 2227 - * The device MUST present a changed config_generation after 2228 - * the driver has read a device-specific configuration value 2229 - * which has changed since any part of the device-specific 2230 - * configuration was last read. 2231 - * 2232 - * This is simple: none of our devices change config, so this 2233 - * is always 0. 2234 - */ 2235 - goto read_through8; 2236 - case offsetof(struct virtio_pci_mmio, notify): 2237 - /* 2238 - * 3.1.1: 2239 - * 2240 - * The driver MUST NOT notify the device before setting 2241 - * DRIVER_OK. 2242 - */ 2243 - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)) 2244 - bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK"); 2245 - goto read_through16; 2246 - case offsetof(struct virtio_pci_mmio, isr): 2247 - if (mask != 0xFF) 2248 - bad_driver(d, "non-8-bit read from offset %u (%#x)", 2249 - off, getreg(eip)); 2250 - isr = d->mmio->isr; 2251 - /* 2252 - * 4.1.4.5.1: 2253 - * 2254 - * The device MUST reset ISR status to 0 on driver read. 
2255 - */ 2256 - d->mmio->isr = 0; 2257 - return isr; 2258 - case offsetof(struct virtio_pci_mmio, padding): 2259 - bad_driver(d, "read from padding (%#x)", getreg(eip)); 2260 - default: 2261 - /* Read from device config space, beware unaligned overflow */ 2262 - if (off > d->mmio_size - 4) 2263 - bad_driver(d, "read past end (%#x)", getreg(eip)); 2264 - 2265 - /* 2266 - * 3.1.1: 2267 - * The driver MUST follow this sequence to initialize a device: 2268 - *... 2269 - * 3. Set the DRIVER status bit: the guest OS knows how to 2270 - * drive the device. 2271 - * 4. Read device feature bits, and write the subset of 2272 - * feature bits understood by the OS and driver to the 2273 - * device. During this step the driver MAY read (but MUST NOT 2274 - * write) the device-specific configuration fields to check 2275 - * that it can support the device before accepting it. 2276 - */ 2277 - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) 2278 - bad_driver(d, 2279 - "config read before VIRTIO_CONFIG_S_DRIVER"); 2280 - 2281 - if (mask == 0xFFFFFFFF) 2282 - goto read_through32; 2283 - else if (mask == 0xFFFF) 2284 - goto read_through16; 2285 - else 2286 - goto read_through8; 2287 - } 2288 - 2289 - /* 2290 - * 4.1.3.1: 2291 - * 2292 - * The driver MUST access each field using the “natural” access 2293 - * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for 2294 - * 16-bit fields and 8-bit accesses for 8-bit fields. 
2295 - */ 2296 - read_through32: 2297 - if (mask != 0xFFFFFFFF) 2298 - bad_driver(d, "non-32-bit read to offset %u (%#x)", 2299 - off, getreg(eip)); 2300 - memcpy(&val, (char *)d->mmio + off, 4); 2301 - return val; 2302 - 2303 - read_through16: 2304 - if (mask != 0xFFFF) 2305 - bad_driver(d, "non-16-bit read to offset %u (%#x)", 2306 - off, getreg(eip)); 2307 - memcpy(&val, (char *)d->mmio + off, 2); 2308 - return val; 2309 - 2310 - read_through8: 2311 - if (mask != 0xFF) 2312 - bad_driver(d, "non-8-bit read to offset %u (%#x)", 2313 - off, getreg(eip)); 2314 - memcpy(&val, (char *)d->mmio + off, 1); 2315 - return val; 2316 - } 2317 - 2318 - static void emulate_mmio(unsigned long paddr, const u8 *insn) 2319 - { 2320 - u32 val, off, mask = 0xFFFFFFFF, insnlen = 0; 2321 - struct device *d = find_mmio_region(paddr, &off); 2322 - unsigned long args[] = { LHREQ_TRAP, 14 }; 2323 - 2324 - if (!d) { 2325 - warnx("MMIO touching %#08lx (not a device)", paddr); 2326 - goto reinject; 2327 - } 2328 - 2329 - /* Prefix makes it a 16 bit op */ 2330 - if (insn[0] == 0x66) { 2331 - mask = 0xFFFF; 2332 - insnlen++; 2333 - } 2334 - 2335 - /* iowrite */ 2336 - if (insn[insnlen] == 0x89) { 2337 - /* Next byte is r/m byte: bits 3-5 are register. */ 2338 - val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask); 2339 - emulate_mmio_write(d, off, val, mask); 2340 - insnlen += 2 + insn_displacement_len(insn[insnlen+1]); 2341 - } else if (insn[insnlen] == 0x8b) { /* ioread */ 2342 - /* Next byte is r/m byte: bits 3-5 are register. */ 2343 - val = emulate_mmio_read(d, off, mask); 2344 - setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask); 2345 - insnlen += 2 + insn_displacement_len(insn[insnlen+1]); 2346 - } else if (insn[0] == 0x88) { /* 8-bit iowrite */ 2347 - mask = 0xff; 2348 - /* Next byte is r/m byte: bits 3-5 are register. 
*/ 2349 - val = getreg_num((insn[1] >> 3) & 0x7, mask); 2350 - emulate_mmio_write(d, off, val, mask); 2351 - insnlen = 2 + insn_displacement_len(insn[1]); 2352 - } else if (insn[0] == 0x8a) { /* 8-bit ioread */ 2353 - mask = 0xff; 2354 - val = emulate_mmio_read(d, off, mask); 2355 - setreg_num((insn[1] >> 3) & 0x7, val, mask); 2356 - insnlen = 2 + insn_displacement_len(insn[1]); 2357 - } else { 2358 - warnx("Unknown MMIO instruction touching %#08lx:" 2359 - " %02x %02x %02x %02x at %u", 2360 - paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip)); 2361 - reinject: 2362 - /* Inject trap into Guest. */ 2363 - if (write(lguest_fd, args, sizeof(args)) < 0) 2364 - err(1, "Reinjecting trap 14 for fault at %#x", 2365 - getreg(eip)); 2366 - return; 2367 - } 2368 - 2369 - /* Finally, we've "done" the instruction, so move past it. */ 2370 - setreg(eip, getreg(eip) + insnlen); 2371 - } 2372 - 2373 - /*L:190 2374 - * Device Setup 2375 - * 2376 - * All devices need a descriptor so the Guest knows it exists, and a "struct 2377 - * device" so the Launcher can keep track of it. We have common helper 2378 - * routines to allocate and manage them. 2379 - */ 2380 - static void add_pci_virtqueue(struct device *dev, 2381 - void (*service)(struct virtqueue *), 2382 - const char *name) 2383 - { 2384 - struct virtqueue **i, *vq = malloc(sizeof(*vq)); 2385 - 2386 - /* Initialize the virtqueue */ 2387 - vq->next = NULL; 2388 - vq->last_avail_idx = 0; 2389 - vq->dev = dev; 2390 - vq->name = name; 2391 - 2392 - /* 2393 - * This is the routine the service thread will run, and its Process ID 2394 - * once it's running. 2395 - */ 2396 - vq->service = service; 2397 - vq->thread = (pid_t)-1; 2398 - 2399 - /* Initialize the configuration. 
*/ 2400 - reset_vq_pci_config(vq); 2401 - vq->pci_config.queue_notify_off = 0; 2402 - 2403 - /* Add one to the number of queues */ 2404 - vq->dev->mmio->cfg.num_queues++; 2405 - 2406 - /* 2407 - * Add to tail of list, so dev->vq is first vq, dev->vq->next is 2408 - * second. 2409 - */ 2410 - for (i = &dev->vq; *i; i = &(*i)->next); 2411 - *i = vq; 2412 - } 2413 - 2414 - /* The Guest accesses the feature bits via the PCI common config MMIO region */ 2415 - static void add_pci_feature(struct device *dev, unsigned bit) 2416 - { 2417 - dev->features |= (1ULL << bit); 2418 - } 2419 - 2420 - /* For devices with no config. */ 2421 - static void no_device_config(struct device *dev) 2422 - { 2423 - dev->mmio_addr = get_mmio_region(dev->mmio_size); 2424 - 2425 - dev->config.bar[0] = dev->mmio_addr; 2426 - /* Bottom 4 bits must be zero */ 2427 - assert(~(dev->config.bar[0] & 0xF)); 2428 - } 2429 - 2430 - /* This puts the device config into BAR0 */ 2431 - static void set_device_config(struct device *dev, const void *conf, size_t len) 2432 - { 2433 - /* Set up BAR 0 */ 2434 - dev->mmio_size += len; 2435 - dev->mmio = realloc(dev->mmio, dev->mmio_size); 2436 - memcpy(dev->mmio + 1, conf, len); 2437 - 2438 - /* 2439 - * 4.1.4.6: 2440 - * 2441 - * The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG 2442 - * capability for any device type which has a device-specific 2443 - * configuration. 2444 - */ 2445 - /* Hook up device cfg */ 2446 - dev->config.cfg_access.cap.cap_next 2447 - = offsetof(struct pci_config, device); 2448 - 2449 - /* 2450 - * 4.1.4.6.1: 2451 - * 2452 - * The offset for the device-specific configuration MUST be 4-byte 2453 - * aligned. 2454 - */ 2455 - assert(dev->config.cfg_access.cap.cap_next % 4 == 0); 2456 - 2457 - /* Fix up device cfg field length. 
*/ 2458 - dev->config.device.length = len; 2459 - 2460 - /* The rest is the same as the no-config case */ 2461 - no_device_config(dev); 2462 - } 2463 - 2464 - static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type, 2465 - size_t bar_offset, size_t bar_bytes, u8 next) 2466 - { 2467 - cap->cap_vndr = PCI_CAP_ID_VNDR; 2468 - cap->cap_next = next; 2469 - cap->cap_len = caplen; 2470 - cap->cfg_type = type; 2471 - cap->bar = 0; 2472 - memset(cap->padding, 0, sizeof(cap->padding)); 2473 - cap->offset = bar_offset; 2474 - cap->length = bar_bytes; 2475 - } 2476 - 2477 - /* 2478 - * This sets up the pci_config structure, as defined in the virtio 1.0 2479 - * standard (and PCI standard). 2480 - */ 2481 - static void init_pci_config(struct pci_config *pci, u16 type, 2482 - u8 class, u8 subclass) 2483 - { 2484 - size_t bar_offset, bar_len; 2485 - 2486 - /* 2487 - * 4.1.4.4.1: 2488 - * 2489 - * The device MUST either present notify_off_multiplier as an even 2490 - * power of 2, or present notify_off_multiplier as 0. 2491 - * 2492 - * 2.1.2: 2493 - * 2494 - * The device MUST initialize device status to 0 upon reset. 2495 - */ 2496 - memset(pci, 0, sizeof(*pci)); 2497 - 2498 - /* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */ 2499 - pci->vendor_id = 0x1AF4; 2500 - /* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */ 2501 - pci->device_id = 0x1040 + type; 2502 - 2503 - /* 2504 - * PCI have specific codes for different types of devices. 2505 - * Linux doesn't care, but it's a good clue for people looking 2506 - * at the device. 2507 - */ 2508 - pci->class = class; 2509 - pci->subclass = subclass; 2510 - 2511 - /* 2512 - * 4.1.2.1: 2513 - * 2514 - * Non-transitional devices SHOULD have a PCI Revision ID of 1 or 2515 - * higher 2516 - */ 2517 - pci->revid = 1; 2518 - 2519 - /* 2520 - * 4.1.2.1: 2521 - * 2522 - * Non-transitional devices SHOULD have a PCI Subsystem Device ID of 2523 - * 0x40 or higher. 
2524 - */ 2525 - pci->subsystem_device_id = 0x40; 2526 - 2527 - /* We use our dummy interrupt controller, and irq_line is the irq */ 2528 - pci->irq_line = devices.next_irq++; 2529 - pci->irq_pin = 0; 2530 - 2531 - /* Support for extended capabilities. */ 2532 - pci->status = (1 << 4); 2533 - 2534 - /* Link them in. */ 2535 - /* 2536 - * 4.1.4.3.1: 2537 - * 2538 - * The device MUST present at least one common configuration 2539 - * capability. 2540 - */ 2541 - pci->capabilities = offsetof(struct pci_config, common); 2542 - 2543 - /* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */ 2544 - assert(pci->capabilities % 4 == 0); 2545 - 2546 - bar_offset = offsetof(struct virtio_pci_mmio, cfg); 2547 - bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg); 2548 - init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG, 2549 - bar_offset, bar_len, 2550 - offsetof(struct pci_config, notify)); 2551 - 2552 - /* 2553 - * 4.1.4.4.1: 2554 - * 2555 - * The device MUST present at least one notification capability. 2556 - */ 2557 - bar_offset += bar_len; 2558 - bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify); 2559 - 2560 - /* 2561 - * 4.1.4.4.1: 2562 - * 2563 - * The cap.offset MUST be 2-byte aligned. 2564 - */ 2565 - assert(pci->common.cap_next % 2 == 0); 2566 - 2567 - /* FIXME: Use a non-zero notify_off, for per-queue notification? */ 2568 - /* 2569 - * 4.1.4.4.1: 2570 - * 2571 - * The value cap.length presented by the device MUST be at least 2 and 2572 - * MUST be large enough to support queue notification offsets for all 2573 - * supported queues in all possible configurations. 
2574 - */ 2575 - assert(bar_len >= 2); 2576 - 2577 - init_cap(&pci->notify.cap, sizeof(pci->notify), 2578 - VIRTIO_PCI_CAP_NOTIFY_CFG, 2579 - bar_offset, bar_len, 2580 - offsetof(struct pci_config, isr)); 2581 - 2582 - bar_offset += bar_len; 2583 - bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr); 2584 - /* 2585 - * 4.1.4.5.1: 2586 - * 2587 - * The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG 2588 - * capability. 2589 - */ 2590 - init_cap(&pci->isr, sizeof(pci->isr), 2591 - VIRTIO_PCI_CAP_ISR_CFG, 2592 - bar_offset, bar_len, 2593 - offsetof(struct pci_config, cfg_access)); 2594 - 2595 - /* 2596 - * 4.1.4.7.1: 2597 - * 2598 - * The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG 2599 - * capability. 2600 - */ 2601 - /* This doesn't have any presence in the BAR */ 2602 - init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access), 2603 - VIRTIO_PCI_CAP_PCI_CFG, 2604 - 0, 0, 0); 2605 - 2606 - bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding); 2607 - assert(bar_offset == sizeof(struct virtio_pci_mmio)); 2608 - 2609 - /* 2610 - * This gets sewn in and length set in set_device_config(). 2611 - * Some devices don't have a device configuration interface, so 2612 - * we never expose this if we don't call set_device_config(). 2613 - */ 2614 - init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG, 2615 - bar_offset, 0, 0); 2616 - } 2617 - 2618 - /* 2619 - * This routine does all the creation and setup of a new device, but we don't 2620 - * actually place the MMIO region until we know the size (if any) of the 2621 - * device-specific config. And we don't actually start the service threads 2622 - * until later. 2623 - * 2624 - * See what I mean about userspace being boring? 2625 - */ 2626 - static struct device *new_pci_device(const char *name, u16 type, 2627 - u8 class, u8 subclass) 2628 - { 2629 - struct device *dev = malloc(sizeof(*dev)); 2630 - 2631 - /* Now we populate the fields one at a time. 
*/ 2632 - dev->name = name; 2633 - dev->vq = NULL; 2634 - dev->running = false; 2635 - dev->wrote_features_ok = false; 2636 - dev->mmio_size = sizeof(struct virtio_pci_mmio); 2637 - dev->mmio = calloc(1, dev->mmio_size); 2638 - dev->features = (u64)1 << VIRTIO_F_VERSION_1; 2639 - dev->features_accepted = 0; 2640 - 2641 - if (devices.device_num + 1 >= MAX_PCI_DEVICES) 2642 - errx(1, "Can only handle 31 PCI devices"); 2643 - 2644 - init_pci_config(&dev->config, type, class, subclass); 2645 - assert(!devices.pci[devices.device_num+1]); 2646 - devices.pci[++devices.device_num] = dev; 2647 - 2648 - return dev; 2649 - } 2650 - 2651 - /* 2652 - * Our first setup routine is the console. It's a fairly simple device, but 2653 - * UNIX tty handling makes it uglier than it could be. 2654 - */ 2655 - static void setup_console(void) 2656 - { 2657 - struct device *dev; 2658 - struct virtio_console_config conf; 2659 - 2660 - /* If we can save the initial standard input settings... */ 2661 - if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { 2662 - struct termios term = orig_term; 2663 - /* 2664 - * Then we turn off echo, line buffering and ^C etc: We want a 2665 - * raw input stream to the Guest. 2666 - */ 2667 - term.c_lflag &= ~(ISIG|ICANON|ECHO); 2668 - tcsetattr(STDIN_FILENO, TCSANOW, &term); 2669 - } 2670 - 2671 - dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00); 2672 - 2673 - /* We store the console state in dev->priv, and initialize it. */ 2674 - dev->priv = malloc(sizeof(struct console_abort)); 2675 - ((struct console_abort *)dev->priv)->count = 0; 2676 - 2677 - /* 2678 - * The console needs two virtqueues: the input then the output. When 2679 - * they put something the input queue, we make sure we're listening to 2680 - * stdin. When they put something in the output queue, we write it to 2681 - * stdout. 
2682 - */ 2683 - add_pci_virtqueue(dev, console_input, "input"); 2684 - add_pci_virtqueue(dev, console_output, "output"); 2685 - 2686 - /* We need a configuration area for the emerg_wr early writes. */ 2687 - add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE); 2688 - set_device_config(dev, &conf, sizeof(conf)); 2689 - 2690 - verbose("device %u: console\n", devices.device_num); 2691 - } 2692 - /*:*/ 2693 - 2694 - /*M:010 2695 - * Inter-guest networking is an interesting area. Simplest is to have a 2696 - * --sharenet=<name> option which opens or creates a named pipe. This can be 2697 - * used to send packets to another guest in a 1:1 manner. 2698 - * 2699 - * More sophisticated is to use one of the tools developed for project like UML 2700 - * to do networking. 2701 - * 2702 - * Faster is to do virtio bonding in kernel. Doing this 1:1 would be 2703 - * completely generic ("here's my vring, attach to your vring") and would work 2704 - * for any traffic. Of course, namespace and permissions issues need to be 2705 - * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide 2706 - * multiple inter-guest channels behind one interface, although it would 2707 - * require some manner of hotplugging new virtio channels. 2708 - * 2709 - * Finally, we could use a virtio network switch in the kernel, ie. vhost. 
2710 - :*/ 2711 - 2712 - static u32 str2ip(const char *ipaddr) 2713 - { 2714 - unsigned int b[4]; 2715 - 2716 - if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) 2717 - errx(1, "Failed to parse IP address '%s'", ipaddr); 2718 - return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; 2719 - } 2720 - 2721 - static void str2mac(const char *macaddr, unsigned char mac[6]) 2722 - { 2723 - unsigned int m[6]; 2724 - if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", 2725 - &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) 2726 - errx(1, "Failed to parse mac address '%s'", macaddr); 2727 - mac[0] = m[0]; 2728 - mac[1] = m[1]; 2729 - mac[2] = m[2]; 2730 - mac[3] = m[3]; 2731 - mac[4] = m[4]; 2732 - mac[5] = m[5]; 2733 - } 2734 - 2735 - /* 2736 - * This code is "adapted" from libbridge: it attaches the Host end of the 2737 - * network device to the bridge device specified by the command line. 2738 - * 2739 - * This is yet another James Morris contribution (I'm an IP-level guy, so I 2740 - * dislike bridging), and I just try not to break it. 2741 - */ 2742 - static void add_to_bridge(int fd, const char *if_name, const char *br_name) 2743 - { 2744 - int ifidx; 2745 - struct ifreq ifr; 2746 - 2747 - if (!*br_name) 2748 - errx(1, "must specify bridge name"); 2749 - 2750 - ifidx = if_nametoindex(if_name); 2751 - if (!ifidx) 2752 - errx(1, "interface %s does not exist!", if_name); 2753 - 2754 - strncpy(ifr.ifr_name, br_name, IFNAMSIZ); 2755 - ifr.ifr_name[IFNAMSIZ-1] = '\0'; 2756 - ifr.ifr_ifindex = ifidx; 2757 - if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) 2758 - err(1, "can't add %s to bridge %s", if_name, br_name); 2759 - } 2760 - 2761 - /* 2762 - * This sets up the Host end of the network device with an IP address, brings 2763 - * it up so packets will flow, the copies the MAC address into the hwaddr 2764 - * pointer. 
2765 - */ 2766 - static void configure_device(int fd, const char *tapif, u32 ipaddr) 2767 - { 2768 - struct ifreq ifr; 2769 - struct sockaddr_in sin; 2770 - 2771 - memset(&ifr, 0, sizeof(ifr)); 2772 - strcpy(ifr.ifr_name, tapif); 2773 - 2774 - /* Don't read these incantations. Just cut & paste them like I did! */ 2775 - sin.sin_family = AF_INET; 2776 - sin.sin_addr.s_addr = htonl(ipaddr); 2777 - memcpy(&ifr.ifr_addr, &sin, sizeof(sin)); 2778 - if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) 2779 - err(1, "Setting %s interface address", tapif); 2780 - ifr.ifr_flags = IFF_UP; 2781 - if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) 2782 - err(1, "Bringing interface %s up", tapif); 2783 - } 2784 - 2785 - static int get_tun_device(char tapif[IFNAMSIZ]) 2786 - { 2787 - struct ifreq ifr; 2788 - int vnet_hdr_sz; 2789 - int netfd; 2790 - 2791 - /* Start with this zeroed. Messy but sure. */ 2792 - memset(&ifr, 0, sizeof(ifr)); 2793 - 2794 - /* 2795 - * We open the /dev/net/tun device and tell it we want a tap device. A 2796 - * tap device is like a tun device, only somehow different. To tell 2797 - * the truth, I completely blundered my way through this code, but it 2798 - * works now! 2799 - */ 2800 - netfd = open_or_die("/dev/net/tun", O_RDWR); 2801 - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; 2802 - strcpy(ifr.ifr_name, "tap%d"); 2803 - if (ioctl(netfd, TUNSETIFF, &ifr) != 0) 2804 - err(1, "configuring /dev/net/tun"); 2805 - 2806 - if (ioctl(netfd, TUNSETOFFLOAD, 2807 - TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) 2808 - err(1, "Could not set features for tun device"); 2809 - 2810 - /* 2811 - * We don't need checksums calculated for packets coming in this 2812 - * device: trust us! 2813 - */ 2814 - ioctl(netfd, TUNSETNOCSUM, 1); 2815 - 2816 - /* 2817 - * In virtio before 1.0 (aka legacy virtio), we added a 16-bit 2818 - * field at the end of the network header iff 2819 - * VIRTIO_NET_F_MRG_RXBUF was negotiated. 
For virtio 1.0, 2820 - * that became the norm, but we need to tell the tun device 2821 - * about our expanded header (which is called 2822 - * virtio_net_hdr_mrg_rxbuf in the legacy system). 2823 - */ 2824 - vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1); 2825 - if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0) 2826 - err(1, "Setting tun header size to %u", vnet_hdr_sz); 2827 - 2828 - memcpy(tapif, ifr.ifr_name, IFNAMSIZ); 2829 - return netfd; 2830 - } 2831 - 2832 - /*L:195 2833 - * Our network is a Host<->Guest network. This can either use bridging or 2834 - * routing, but the principle is the same: it uses the "tun" device to inject 2835 - * packets into the Host as if they came in from a normal network card. We 2836 - * just shunt packets between the Guest and the tun device. 2837 - */ 2838 - static void setup_tun_net(char *arg) 2839 - { 2840 - struct device *dev; 2841 - struct net_info *net_info = malloc(sizeof(*net_info)); 2842 - int ipfd; 2843 - u32 ip = INADDR_ANY; 2844 - bool bridging = false; 2845 - char tapif[IFNAMSIZ], *p; 2846 - struct virtio_net_config conf; 2847 - 2848 - net_info->tunfd = get_tun_device(tapif); 2849 - 2850 - /* First we create a new network device. */ 2851 - dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00); 2852 - dev->priv = net_info; 2853 - 2854 - /* Network devices need a recv and a send queue, just like console. */ 2855 - add_pci_virtqueue(dev, net_input, "rx"); 2856 - add_pci_virtqueue(dev, net_output, "tx"); 2857 - 2858 - /* 2859 - * We need a socket to perform the magic network ioctls to bring up the 2860 - * tap interface, connect to the bridge etc. Any socket will do! 2861 - */ 2862 - ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 2863 - if (ipfd < 0) 2864 - err(1, "opening IP socket"); 2865 - 2866 - /* If the command line was --tunnet=bridge:<name> do bridging. 
*/ 2867 - if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { 2868 - arg += strlen(BRIDGE_PFX); 2869 - bridging = true; 2870 - } 2871 - 2872 - /* A mac address may follow the bridge name or IP address */ 2873 - p = strchr(arg, ':'); 2874 - if (p) { 2875 - str2mac(p+1, conf.mac); 2876 - add_pci_feature(dev, VIRTIO_NET_F_MAC); 2877 - *p = '\0'; 2878 - } 2879 - 2880 - /* arg is now either an IP address or a bridge name */ 2881 - if (bridging) 2882 - add_to_bridge(ipfd, tapif, arg); 2883 - else 2884 - ip = str2ip(arg); 2885 - 2886 - /* Set up the tun device. */ 2887 - configure_device(ipfd, tapif, ip); 2888 - 2889 - /* Expect Guest to handle everything except UFO */ 2890 - add_pci_feature(dev, VIRTIO_NET_F_CSUM); 2891 - add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM); 2892 - add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4); 2893 - add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6); 2894 - add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN); 2895 - add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4); 2896 - add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6); 2897 - add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN); 2898 - /* We handle indirect ring entries */ 2899 - add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); 2900 - set_device_config(dev, &conf, sizeof(conf)); 2901 - 2902 - /* We don't need the socket any more; setup is done. */ 2903 - close(ipfd); 2904 - 2905 - if (bridging) 2906 - verbose("device %u: tun %s attached to bridge: %s\n", 2907 - devices.device_num, tapif, arg); 2908 - else 2909 - verbose("device %u: tun %s: %s\n", 2910 - devices.device_num, tapif, arg); 2911 - } 2912 - /*:*/ 2913 - 2914 - /* This hangs off device->priv. */ 2915 - struct vblk_info { 2916 - /* The size of the file. */ 2917 - off64_t len; 2918 - 2919 - /* The file descriptor for the file. */ 2920 - int fd; 2921 - 2922 - }; 2923 - 2924 - /*L:210 2925 - * The Disk 2926 - * 2927 - * The disk only has one virtqueue, so it only has one thread. 
It is really 2928 - * simple: the Guest asks for a block number and we read or write that position 2929 - * in the file. 2930 - * 2931 - * Before we serviced each virtqueue in a separate thread, that was unacceptably 2932 - * slow: the Guest waits until the read is finished before running anything 2933 - * else, even if it could have been doing useful work. 2934 - * 2935 - * We could have used async I/O, except it's reputed to suck so hard that 2936 - * characters actually go missing from your code when you try to use it. 2937 - */ 2938 - static void blk_request(struct virtqueue *vq) 2939 - { 2940 - struct vblk_info *vblk = vq->dev->priv; 2941 - unsigned int head, out_num, in_num, wlen; 2942 - int ret, i; 2943 - u8 *in; 2944 - struct virtio_blk_outhdr out; 2945 - struct iovec iov[vq->vring.num]; 2946 - off64_t off; 2947 - 2948 - /* 2949 - * Get the next request, where we normally wait. It triggers the 2950 - * interrupt to acknowledge previously serviced requests (if any). 2951 - */ 2952 - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 2953 - 2954 - /* Copy the output header from the front of the iov (adjusts iov) */ 2955 - iov_consume(vq->dev, iov, out_num, &out, sizeof(out)); 2956 - 2957 - /* Find and trim end of iov input array, for our status byte. */ 2958 - in = NULL; 2959 - for (i = out_num + in_num - 1; i >= out_num; i--) { 2960 - if (iov[i].iov_len > 0) { 2961 - in = iov[i].iov_base + iov[i].iov_len - 1; 2962 - iov[i].iov_len--; 2963 - break; 2964 - } 2965 - } 2966 - if (!in) 2967 - bad_driver_vq(vq, "Bad virtblk cmd with no room for status"); 2968 - 2969 - /* 2970 - * For historical reasons, block operations are expressed in 512 byte 2971 - * "sectors". 2972 - */ 2973 - off = out.sector * 512; 2974 - 2975 - if (out.type & VIRTIO_BLK_T_OUT) { 2976 - /* 2977 - * Write 2978 - * 2979 - * Move to the right location in the block file. This can fail 2980 - * if they try to write past end. 
2981 - */ 2982 - if (lseek64(vblk->fd, off, SEEK_SET) != off) 2983 - err(1, "Bad seek to sector %llu", out.sector); 2984 - 2985 - ret = writev(vblk->fd, iov, out_num); 2986 - verbose("WRITE to sector %llu: %i\n", out.sector, ret); 2987 - 2988 - /* 2989 - * Grr... Now we know how long the descriptor they sent was, we 2990 - * make sure they didn't try to write over the end of the block 2991 - * file (possibly extending it). 2992 - */ 2993 - if (ret > 0 && off + ret > vblk->len) { 2994 - /* Trim it back to the correct length */ 2995 - ftruncate64(vblk->fd, vblk->len); 2996 - /* Die, bad Guest, die. */ 2997 - bad_driver_vq(vq, "Write past end %llu+%u", off, ret); 2998 - } 2999 - 3000 - wlen = sizeof(*in); 3001 - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 3002 - } else if (out.type & VIRTIO_BLK_T_FLUSH) { 3003 - /* Flush */ 3004 - ret = fdatasync(vblk->fd); 3005 - verbose("FLUSH fdatasync: %i\n", ret); 3006 - wlen = sizeof(*in); 3007 - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 3008 - } else { 3009 - /* 3010 - * Read 3011 - * 3012 - * Move to the right location in the block file. This can fail 3013 - * if they try to read past end. 3014 - */ 3015 - if (lseek64(vblk->fd, off, SEEK_SET) != off) 3016 - err(1, "Bad seek to sector %llu", out.sector); 3017 - 3018 - ret = readv(vblk->fd, iov + out_num, in_num); 3019 - if (ret >= 0) { 3020 - wlen = sizeof(*in) + ret; 3021 - *in = VIRTIO_BLK_S_OK; 3022 - } else { 3023 - wlen = sizeof(*in); 3024 - *in = VIRTIO_BLK_S_IOERR; 3025 - } 3026 - } 3027 - 3028 - /* Finished that request. */ 3029 - add_used(vq, head, wlen); 3030 - } 3031 - 3032 - /*L:198 This actually sets up a virtual block device. */ 3033 - static void setup_block_file(const char *filename) 3034 - { 3035 - struct device *dev; 3036 - struct vblk_info *vblk; 3037 - struct virtio_blk_config conf; 3038 - 3039 - /* Create the device. 
*/ 3040 - dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80); 3041 - 3042 - /* The device has one virtqueue, where the Guest places requests. */ 3043 - add_pci_virtqueue(dev, blk_request, "request"); 3044 - 3045 - /* Allocate the room for our own bookkeeping */ 3046 - vblk = dev->priv = malloc(sizeof(*vblk)); 3047 - 3048 - /* First we open the file and store the length. */ 3049 - vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); 3050 - vblk->len = lseek64(vblk->fd, 0, SEEK_END); 3051 - 3052 - /* Tell Guest how many sectors this device has. */ 3053 - conf.capacity = cpu_to_le64(vblk->len / 512); 3054 - 3055 - /* 3056 - * Tell Guest not to put in too many descriptors at once: two are used 3057 - * for the in and out elements. 3058 - */ 3059 - add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX); 3060 - conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); 3061 - 3062 - set_device_config(dev, &conf, sizeof(struct virtio_blk_config)); 3063 - 3064 - verbose("device %u: virtblock %llu sectors\n", 3065 - devices.device_num, le64_to_cpu(conf.capacity)); 3066 - } 3067 - 3068 - /*L:211 3069 - * Our random number generator device reads from /dev/urandom into the Guest's 3070 - * input buffers. The usual case is that the Guest doesn't want random numbers 3071 - * and so has no buffers although /dev/urandom is still readable, whereas 3072 - * console is the reverse. 3073 - * 3074 - * The same logic applies, however. 3075 - */ 3076 - struct rng_info { 3077 - int rfd; 3078 - }; 3079 - 3080 - static void rng_input(struct virtqueue *vq) 3081 - { 3082 - int len; 3083 - unsigned int head, in_num, out_num, totlen = 0; 3084 - struct rng_info *rng_info = vq->dev->priv; 3085 - struct iovec iov[vq->vring.num]; 3086 - 3087 - /* First we need a buffer from the Guests's virtqueue. 
*/ 3088 - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 3089 - if (out_num) 3090 - bad_driver_vq(vq, "Output buffers in rng?"); 3091 - 3092 - /* 3093 - * Just like the console write, we loop to cover the whole iovec. 3094 - * In this case, short reads actually happen quite a bit. 3095 - */ 3096 - while (!iov_empty(iov, in_num)) { 3097 - len = readv(rng_info->rfd, iov, in_num); 3098 - if (len <= 0) 3099 - err(1, "Read from /dev/urandom gave %i", len); 3100 - iov_consume(vq->dev, iov, in_num, NULL, len); 3101 - totlen += len; 3102 - } 3103 - 3104 - /* Tell the Guest about the new input. */ 3105 - add_used(vq, head, totlen); 3106 - } 3107 - 3108 - /*L:199 3109 - * This creates a "hardware" random number device for the Guest. 3110 - */ 3111 - static void setup_rng(void) 3112 - { 3113 - struct device *dev; 3114 - struct rng_info *rng_info = malloc(sizeof(*rng_info)); 3115 - 3116 - /* Our device's private info simply contains the /dev/urandom fd. */ 3117 - rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY); 3118 - 3119 - /* Create the new device. */ 3120 - dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0); 3121 - dev->priv = rng_info; 3122 - 3123 - /* The device has one virtqueue, where the Guest places inbufs. */ 3124 - add_pci_virtqueue(dev, rng_input, "input"); 3125 - 3126 - /* We don't have any configuration space */ 3127 - no_device_config(dev); 3128 - 3129 - verbose("device %u: rng\n", devices.device_num); 3130 - } 3131 - /* That's the end of device setup. */ 3132 - 3133 - /*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ 3134 - static void __attribute__((noreturn)) restart_guest(void) 3135 - { 3136 - unsigned int i; 3137 - 3138 - /* 3139 - * Since we don't track all open fds, we simply close everything beyond 3140 - * stderr. 3141 - */ 3142 - for (i = 3; i < FD_SETSIZE; i++) 3143 - close(i); 3144 - 3145 - /* Reset all the devices (kills all threads). 
*/ 3146 - cleanup_devices(); 3147 - 3148 - execv(main_args[0], main_args); 3149 - err(1, "Could not exec %s", main_args[0]); 3150 - } 3151 - 3152 - /*L:220 3153 - * Finally we reach the core of the Launcher which runs the Guest, serves 3154 - * its input and output, and finally, lays it to rest. 3155 - */ 3156 - static void __attribute__((noreturn)) run_guest(void) 3157 - { 3158 - for (;;) { 3159 - struct lguest_pending notify; 3160 - int readval; 3161 - 3162 - /* We read from the /dev/lguest device to run the Guest. */ 3163 - readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id); 3164 - if (readval == sizeof(notify)) { 3165 - if (notify.trap == 13) { 3166 - verbose("Emulating instruction at %#x\n", 3167 - getreg(eip)); 3168 - emulate_insn(notify.insn); 3169 - } else if (notify.trap == 14) { 3170 - verbose("Emulating MMIO at %#x\n", 3171 - getreg(eip)); 3172 - emulate_mmio(notify.addr, notify.insn); 3173 - } else 3174 - errx(1, "Unknown trap %i addr %#08x\n", 3175 - notify.trap, notify.addr); 3176 - /* ENOENT means the Guest died. Reading tells us why. */ 3177 - } else if (errno == ENOENT) { 3178 - char reason[1024] = { 0 }; 3179 - pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); 3180 - errx(1, "%s", reason); 3181 - /* ERESTART means that we need to reboot the guest */ 3182 - } else if (errno == ERESTART) { 3183 - restart_guest(); 3184 - /* Anything else means a bug or incompatible change. */ 3185 - } else 3186 - err(1, "Running guest failed"); 3187 - } 3188 - } 3189 - /*L:240 3190 - * This is the end of the Launcher. The good news: we are over halfway 3191 - * through! The bad news: the most fiendish part of the code still lies ahead 3192 - * of us. 3193 - * 3194 - * Are you ready? Take a deep breath and join me in the core of the Host, in 3195 - * "make Host". 
3196 - :*/ 3197 - 3198 - static struct option opts[] = { 3199 - { "verbose", 0, NULL, 'v' }, 3200 - { "tunnet", 1, NULL, 't' }, 3201 - { "block", 1, NULL, 'b' }, 3202 - { "rng", 0, NULL, 'r' }, 3203 - { "initrd", 1, NULL, 'i' }, 3204 - { "username", 1, NULL, 'u' }, 3205 - { "chroot", 1, NULL, 'c' }, 3206 - { NULL }, 3207 - }; 3208 - static void usage(void) 3209 - { 3210 - errx(1, "Usage: lguest [--verbose] " 3211 - "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n" 3212 - "|--block=<filename>|--initrd=<filename>]...\n" 3213 - "<mem-in-mb> vmlinux [args...]"); 3214 - } 3215 - 3216 - /*L:105 The main routine is where the real work begins: */ 3217 - int main(int argc, char *argv[]) 3218 - { 3219 - /* Memory, code startpoint and size of the (optional) initrd. */ 3220 - unsigned long mem = 0, start, initrd_size = 0; 3221 - /* Two temporaries. */ 3222 - int i, c; 3223 - /* The boot information for the Guest. */ 3224 - struct boot_params *boot; 3225 - /* If they specify an initrd file to load. */ 3226 - const char *initrd_name = NULL; 3227 - 3228 - /* Password structure for initgroups/setres[gu]id */ 3229 - struct passwd *user_details = NULL; 3230 - 3231 - /* Directory to chroot to */ 3232 - char *chroot_path = NULL; 3233 - 3234 - /* Save the args: we "reboot" by execing ourselves again. */ 3235 - main_args = argv; 3236 - 3237 - /* 3238 - * First we initialize the device list. We remember next interrupt 3239 - * number to use for devices (1: remember that 0 is used by the timer). 3240 - */ 3241 - devices.next_irq = 1; 3242 - 3243 - /* We're CPU 0. In fact, that's the only CPU possible right now. */ 3244 - cpu_id = 0; 3245 - 3246 - /* 3247 - * We need to know how much memory so we can set up the device 3248 - * descriptor and memory pages for the devices as we parse the command 3249 - * line. So we quickly look through the arguments to find the amount 3250 - * of memory now. 
3251 - */ 3252 - for (i = 1; i < argc; i++) { 3253 - if (argv[i][0] != '-') { 3254 - mem = atoi(argv[i]) * 1024 * 1024; 3255 - /* 3256 - * We start by mapping anonymous pages over all of 3257 - * guest-physical memory range. This fills it with 0, 3258 - * and ensures that the Guest won't be killed when it 3259 - * tries to access it. 3260 - */ 3261 - guest_base = map_zeroed_pages(mem / getpagesize() 3262 - + DEVICE_PAGES); 3263 - guest_limit = mem; 3264 - guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize(); 3265 - break; 3266 - } 3267 - } 3268 - 3269 - /* If we exit via err(), this kills all the threads, restores tty. */ 3270 - atexit(cleanup_devices); 3271 - 3272 - /* We always have a console device, and it's always device 1. */ 3273 - setup_console(); 3274 - 3275 - /* The options are fairly straight-forward */ 3276 - while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { 3277 - switch (c) { 3278 - case 'v': 3279 - verbose = true; 3280 - break; 3281 - case 't': 3282 - setup_tun_net(optarg); 3283 - break; 3284 - case 'b': 3285 - setup_block_file(optarg); 3286 - break; 3287 - case 'r': 3288 - setup_rng(); 3289 - break; 3290 - case 'i': 3291 - initrd_name = optarg; 3292 - break; 3293 - case 'u': 3294 - user_details = getpwnam(optarg); 3295 - if (!user_details) 3296 - err(1, "getpwnam failed, incorrect username?"); 3297 - break; 3298 - case 'c': 3299 - chroot_path = optarg; 3300 - break; 3301 - default: 3302 - warnx("Unknown argument %s", argv[optind]); 3303 - usage(); 3304 - } 3305 - } 3306 - /* 3307 - * After the other arguments we expect memory and kernel image name, 3308 - * followed by command line arguments for the kernel. 3309 - */ 3310 - if (optind + 2 > argc) 3311 - usage(); 3312 - 3313 - verbose("Guest base is at %p\n", guest_base); 3314 - 3315 - /* Initialize the (fake) PCI host bridge device. 
*/ 3316 - init_pci_host_bridge(); 3317 - 3318 - /* Now we load the kernel */ 3319 - start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); 3320 - 3321 - /* Boot information is stashed at physical address 0 */ 3322 - boot = from_guest_phys(0); 3323 - 3324 - /* Map the initrd image if requested (at top of physical memory) */ 3325 - if (initrd_name) { 3326 - initrd_size = load_initrd(initrd_name, mem); 3327 - /* 3328 - * These are the location in the Linux boot header where the 3329 - * start and size of the initrd are expected to be found. 3330 - */ 3331 - boot->hdr.ramdisk_image = mem - initrd_size; 3332 - boot->hdr.ramdisk_size = initrd_size; 3333 - /* The bootloader type 0xFF means "unknown"; that's OK. */ 3334 - boot->hdr.type_of_loader = 0xFF; 3335 - } 3336 - 3337 - /* 3338 - * The Linux boot header contains an "E820" memory map: ours is a 3339 - * simple, single region. 3340 - */ 3341 - boot->e820_entries = 1; 3342 - boot->e820_table[0] = ((struct e820_entry) { 0, mem, E820_TYPE_RAM }); 3343 - /* 3344 - * The boot header contains a command line pointer: we put the command 3345 - * line after the boot header. 3346 - */ 3347 - boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 3348 - /* We use a simple helper to copy the arguments separated by spaces. */ 3349 - concat((char *)(boot + 1), argv+optind+2); 3350 - 3351 - /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */ 3352 - boot->hdr.kernel_alignment = 0x1000000; 3353 - 3354 - /* Boot protocol version: 2.07 supports the fields for lguest. */ 3355 - boot->hdr.version = 0x207; 3356 - 3357 - /* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */ 3358 - boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST; 3359 - 3360 - /* Tell the entry path not to try to reload segment registers. 
*/ 3361 - boot->hdr.loadflags |= KEEP_SEGMENTS; 3362 - 3363 - /* We don't support tboot: */ 3364 - boot->tboot_addr = 0; 3365 - 3366 - /* Ensure this is 0 to prevent APM from loading: */ 3367 - boot->apm_bios_info.version = 0; 3368 - 3369 - /* We tell the kernel to initialize the Guest. */ 3370 - tell_kernel(start); 3371 - 3372 - /* Ensure that we terminate if a device-servicing child dies. */ 3373 - signal(SIGCHLD, kill_launcher); 3374 - 3375 - /* If requested, chroot to a directory */ 3376 - if (chroot_path) { 3377 - if (chroot(chroot_path) != 0) 3378 - err(1, "chroot(\"%s\") failed", chroot_path); 3379 - 3380 - if (chdir("/") != 0) 3381 - err(1, "chdir(\"/\") failed"); 3382 - 3383 - verbose("chroot done\n"); 3384 - } 3385 - 3386 - /* If requested, drop privileges */ 3387 - if (user_details) { 3388 - uid_t u; 3389 - gid_t g; 3390 - 3391 - u = user_details->pw_uid; 3392 - g = user_details->pw_gid; 3393 - 3394 - if (initgroups(user_details->pw_name, g) != 0) 3395 - err(1, "initgroups failed"); 3396 - 3397 - if (setresgid(g, g, g) != 0) 3398 - err(1, "setresgid failed"); 3399 - 3400 - if (setresuid(u, u, u) != 0) 3401 - err(1, "setresuid failed"); 3402 - 3403 - verbose("Dropping privileges completed\n"); 3404 - } 3405 - 3406 - /* Finally, run the Guest. This doesn't return. */ 3407 - run_guest(); 3408 - } 3409 - /*:*/ 3410 - 3411 - /*M:999 3412 - * Mastery is done: you now know everything I do. 3413 - * 3414 - * But surely you have seen code, features and bugs in your wanderings which 3415 - * you now yearn to attack? That is the real game, and I look forward to you 3416 - * patching and forking lguest into the Your-Name-Here-visor. 3417 - * 3418 - * Farewell, and good coding! 3419 - * Rusty Russell. 3420 - */
-125
tools/lguest/lguest.txt
··· 1 - __ 2 - (___()'`; Rusty's Remarkably Unreliable Guide to Lguest 3 - /, /` - or, A Young Coder's Illustrated Hypervisor 4 - \\"--\\ http://lguest.ozlabs.org 5 - 6 - Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel, 7 - for Linux developers and users to experiment with virtualization with the 8 - minimum of complexity. Nonetheless, it should have sufficient features to 9 - make it useful for specific tasks, and, of course, you are encouraged to fork 10 - and enhance it (see drivers/lguest/README). 11 - 12 - Features: 13 - 14 - - Kernel module which runs in a normal kernel. 15 - - Simple I/O model for communication. 16 - - Simple program to create new guests. 17 - - Logo contains cute puppies: http://lguest.ozlabs.org 18 - 19 - Developer features: 20 - 21 - - Fun to hack on. 22 - - No ABI: being tied to a specific kernel anyway, you can change anything. 23 - - Many opportunities for improvement or feature implementation. 24 - 25 - Running Lguest: 26 - 27 - - The easiest way to run lguest is to use same kernel as guest and host. 28 - You can configure them differently, but usually it's easiest not to. 
29 - 30 - You will need to configure your kernel with the following options: 31 - 32 - "Processor type and features": 33 - "Paravirtualized guest support" = Y 34 - "Lguest guest support" = Y 35 - "High Memory Support" = off/4GB 36 - "Alignment value to which kernel should be aligned" = 0x100000 37 - (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and 38 - CONFIG_PHYSICAL_ALIGN=0x100000) 39 - 40 - "Device Drivers": 41 - "Block devices" 42 - "Virtio block driver" = M/Y 43 - "Network device support" 44 - "Universal TUN/TAP device driver support" = M/Y 45 - "Virtio network driver" = M/Y 46 - (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m) 47 - 48 - "Virtualization" 49 - "Linux hypervisor example code" = M/Y 50 - (CONFIG_LGUEST=m) 51 - 52 - - A tool called "lguest" is available in this directory: type "make" 53 - to build it. If you didn't build your kernel in-tree, use "make 54 - O=<builddir>". 55 - 56 - - Create or find a root disk image. There are several useful ones 57 - around, such as the xm-test tiny root image at 58 - http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img 59 - 60 - For more serious work, I usually use a distribution ISO image and 61 - install it under qemu, then make multiple copies: 62 - 63 - dd if=/dev/zero of=rootfile bs=1M count=2048 64 - qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d 65 - 66 - Make sure that you install a getty on /dev/hvc0 if you want to log in on the 67 - console! 68 - 69 - - "modprobe lg" if you built it as a module. 70 - 71 - - Run an lguest as root: 72 - 73 - tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ 74 - --block=rootfile root=/dev/vda 75 - 76 - Explanation: 77 - 64: the amount of memory to use, in MB. 78 - 79 - vmlinux: the kernel image found in the top of your build directory. You 80 - can also use a standard bzImage. 81 - 82 - --tunnet=192.168.19.1: configures a "tap" device for networking with this 83 - IP address. 
84 - 85 - --block=rootfile: a file or block device which becomes /dev/vda 86 - inside the guest. 87 - 88 - root=/dev/vda: this (and anything else on the command line) are 89 - kernel boot parameters. 90 - 91 - - Configuring networking. I usually have the host masquerade, using 92 - "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 > 93 - /proc/sys/net/ipv4/ip_forward". In this example, I would configure 94 - eth0 inside the guest at 192.168.19.2. 95 - 96 - Another method is to bridge the tap device to an external interface 97 - using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest 98 - to obtain an IP address. The bridge needs to be configured first: 99 - this option simply adds the tap interface to it. 100 - 101 - A simple example on my system: 102 - 103 - ifconfig eth0 0.0.0.0 104 - brctl addbr lg0 105 - ifconfig lg0 up 106 - brctl addif lg0 eth0 107 - dhclient lg0 108 - 109 - Then use --tunnet=bridge:lg0 when launching the guest. 110 - 111 - See: 112 - 113 - http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge 114 - 115 - for general information on how to get bridging to work. 116 - 117 - - Random number generation. Using the --rng option will provide a 118 - /dev/hwrng in the guest that will read from the host's /dev/random. 119 - Use this option in conjunction with rng-tools (see ../hw_random.txt) 120 - to provide entropy to the guest kernel's /dev/random. 121 - 122 - There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest 123 - 124 - Good luck! 125 - Rusty Russell rusty@rustcorp.com.au.