x86/lguest: Remove lguest support · tjh.dev/kernel@ecda85e

-11

MAINTAINERS

··· 7640 7640 S: Maintained 7641 7641 F: drivers/media/dvb-frontends/lgdt3305.* 7642 7642 7643 - LGUEST 7644 - M: Rusty Russell <rusty@rustcorp.com.au> 7645 - L: lguest@lists.ozlabs.org 7646 - W: http://lguest.ozlabs.org/ 7647 - S: Odd Fixes 7648 - F: arch/x86/include/asm/lguest*.h 7649 - F: arch/x86/lguest/ 7650 - F: drivers/lguest/ 7651 - F: include/linux/lguest*.h 7652 - F: tools/lguest/ 7653 - 7654 7643 LIBATA PATA ARASAN COMPACT FLASH CONTROLLER 7655 7644 M: Viresh Kumar <vireshk@kernel.org> 7656 7645 L: linux-ide@vger.kernel.org

-3

arch/x86/Kbuild

··· 10 10 # Hyper-V paravirtualization support 11 11 obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ 12 12 13 - # lguest paravirtualization support 14 - obj-$(CONFIG_LGUEST_GUEST) += lguest/ 15 - 16 13 obj-y += realmode/ 17 14 obj-y += kernel/ 18 15 obj-y += mm/

-2

arch/x86/Kconfig

··· 777 777 Statistics are displayed in debugfs filesystem. Enabling this option 778 778 may incur significant overhead. 779 779 780 - source "arch/x86/lguest/Kconfig" 781 - 782 780 config PARAVIRT_TIME_ACCOUNTING 783 781 bool "Paravirtual steal time accounting" 784 782 depends on PARAVIRT

-91

arch/x86/include/asm/lguest.h

··· 1 - #ifndef _ASM_X86_LGUEST_H 2 - #define _ASM_X86_LGUEST_H 3 - 4 - #define GDT_ENTRY_LGUEST_CS 10 5 - #define GDT_ENTRY_LGUEST_DS 11 6 - #define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) 7 - #define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) 8 - 9 - #ifndef __ASSEMBLY__ 10 - #include <asm/desc.h> 11 - 12 - #define GUEST_PL 1 13 - 14 - /* Page for Switcher text itself, then two pages per cpu */ 15 - #define SWITCHER_TEXT_PAGES (1) 16 - #define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) 17 - #define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) 18 - 19 - /* Where we map the Switcher, in both Host and Guest. */ 20 - extern unsigned long switcher_addr; 21 - 22 - /* Found in switcher.S */ 23 - extern unsigned long default_idt_entries[]; 24 - 25 - /* Declarations for definitions in arch/x86/lguest/head_32.S */ 26 - extern char lguest_noirq_iret[]; 27 - extern const char lgstart_cli[], lgend_cli[]; 28 - extern const char lgstart_pushf[], lgend_pushf[]; 29 - 30 - extern void lguest_iret(void); 31 - extern void lguest_init(void); 32 - 33 - struct lguest_regs { 34 - /* Manually saved part. */ 35 - unsigned long eax, ebx, ecx, edx; 36 - unsigned long esi, edi, ebp; 37 - unsigned long gs; 38 - unsigned long fs, ds, es; 39 - unsigned long trapnum, errcode; 40 - /* Trap pushed part */ 41 - unsigned long eip; 42 - unsigned long cs; 43 - unsigned long eflags; 44 - unsigned long esp; 45 - unsigned long ss; 46 - }; 47 - 48 - /* This is a guest-specific page (mapped ro) into the guest. */ 49 - struct lguest_ro_state { 50 - /* Host information we need to restore when we switch back. */ 51 - u32 host_cr3; 52 - struct desc_ptr host_idt_desc; 53 - struct desc_ptr host_gdt_desc; 54 - u32 host_sp; 55 - 56 - /* Fields which are used when guest is running. 
*/ 57 - struct desc_ptr guest_idt_desc; 58 - struct desc_ptr guest_gdt_desc; 59 - struct x86_hw_tss guest_tss; 60 - struct desc_struct guest_idt[IDT_ENTRIES]; 61 - struct desc_struct guest_gdt[GDT_ENTRIES]; 62 - }; 63 - 64 - struct lg_cpu_arch { 65 - /* The GDT entries copied into lguest_ro_state when running. */ 66 - struct desc_struct gdt[GDT_ENTRIES]; 67 - 68 - /* The IDT entries: some copied into lguest_ro_state when running. */ 69 - struct desc_struct idt[IDT_ENTRIES]; 70 - 71 - /* The address of the last guest-visible pagefault (ie. cr2). */ 72 - unsigned long last_pagefault; 73 - }; 74 - 75 - static inline void lguest_set_ts(void) 76 - { 77 - u32 cr0; 78 - 79 - cr0 = read_cr0(); 80 - if (!(cr0 & 8)) 81 - write_cr0(cr0 | 8); 82 - } 83 - 84 - /* Full 4G segment descriptors, suitable for CS and DS. */ 85 - #define FULL_EXEC_SEGMENT \ 86 - ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) 87 - #define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) 88 - 89 - #endif /* __ASSEMBLY__ */ 90 - 91 - #endif /* _ASM_X86_LGUEST_H */

-74

arch/x86/include/asm/lguest_hcall.h

··· 1 - /* Architecture specific portion of the lguest hypercalls */ 2 - #ifndef _ASM_X86_LGUEST_HCALL_H 3 - #define _ASM_X86_LGUEST_HCALL_H 4 - 5 - #define LHCALL_FLUSH_ASYNC 0 6 - #define LHCALL_LGUEST_INIT 1 7 - #define LHCALL_SHUTDOWN 2 8 - #define LHCALL_NEW_PGTABLE 4 9 - #define LHCALL_FLUSH_TLB 5 10 - #define LHCALL_LOAD_IDT_ENTRY 6 11 - #define LHCALL_SET_STACK 7 12 - #define LHCALL_SET_CLOCKEVENT 9 13 - #define LHCALL_HALT 10 14 - #define LHCALL_SET_PMD 13 15 - #define LHCALL_SET_PTE 14 16 - #define LHCALL_SET_PGD 15 17 - #define LHCALL_LOAD_TLS 16 18 - #define LHCALL_LOAD_GDT_ENTRY 18 19 - #define LHCALL_SEND_INTERRUPTS 19 20 - 21 - #define LGUEST_TRAP_ENTRY 0x1F 22 - 23 - /* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ 24 - #define LGUEST_SHUTDOWN_POWEROFF 1 25 - #define LGUEST_SHUTDOWN_RESTART 2 26 - 27 - #ifndef __ASSEMBLY__ 28 - #include <asm/hw_irq.h> 29 - 30 - /*G:030 31 - * But first, how does our Guest contact the Host to ask for privileged 32 - * operations? There are two ways: the direct way is to make a "hypercall", 33 - * to make requests of the Host Itself. 34 - * 35 - * Our hypercall mechanism uses the highest unused trap code (traps 32 and 36 - * above are used by real hardware interrupts). Seventeen hypercalls are 37 - * available: the hypercall number is put in the %eax register, and the 38 - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. 39 - * If a return value makes sense, it's returned in %eax. 40 - * 41 - * Grossly invalid calls result in Sudden Death at the hands of the vengeful 42 - * Host, rather than returning failure. This reflects Winston Churchill's 43 - * definition of a gentleman: "someone who is only rude intentionally". 44 - */ 45 - static inline unsigned long 46 - hcall(unsigned long call, 47 - unsigned long arg1, unsigned long arg2, unsigned long arg3, 48 - unsigned long arg4) 49 - { 50 - /* "int" is the Intel instruction to trigger a trap. 
*/ 51 - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) 52 - /* The call in %eax (aka "a") might be overwritten */ 53 - : "=a"(call) 54 - /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ 55 - : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) 56 - /* "memory" means this might write somewhere in memory. 57 - * This isn't true for all calls, but it's safe to tell 58 - * gcc that it might happen so it doesn't get clever. */ 59 - : "memory"); 60 - return call; 61 - } 62 - /*:*/ 63 - 64 - /* Can't use our min() macro here: needs to be a constant */ 65 - #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 66 - 67 - #define LHCALL_RING_SIZE 64 68 - struct hcall_args { 69 - /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ 70 - unsigned long arg0, arg1, arg2, arg3, arg4; 71 - }; 72 - 73 - #endif /* !__ASSEMBLY__ */ 74 - #endif /* _ASM_X86_LGUEST_HCALL_H */

+1 -1

arch/x86/include/asm/processor.h

··· 662 662 * In case NMI unmasking or performance ever becomes a problem, 663 663 * the next best option appears to be MOV-to-CR2 and an 664 664 * unconditional jump. That sequence also works on all CPUs, 665 - * but it will fault at CPL3 (i.e. Xen PV and lguest). 665 + * but it will fault at CPL3 (i.e. Xen PV). 666 666 * 667 667 * CPUID is the conventional way, but it's nasty: it doesn't 668 668 * exist on some 486-like CPUs, and it usually exits to a

+1 -1

arch/x86/include/uapi/asm/bootparam.h

··· 201 201 * 202 202 * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard 203 203 * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. 204 - * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest 204 + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated 205 205 * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, 206 206 * which start at asm startup_xen() entry point and later jump to the C 207 207 * xen_start_kernel() entry point. Both domU and dom0 type of guests are

-20

arch/x86/kernel/asm-offsets_32.c

··· 4 4 5 5 #include <asm/ucontext.h> 6 6 7 - #include <linux/lguest.h> 8 - #include "../../../drivers/lguest/lg.h" 9 - 10 7 #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, 11 8 static char syscalls[] = { 12 9 #include <asm/syscalls_32.h> ··· 59 62 OFFSET(stack_canary_offset, stack_canary, canary); 60 63 #endif 61 64 62 - #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 63 - BLANK(); 64 - OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 65 - OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); 66 - 67 - BLANK(); 68 - OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 69 - OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); 70 - OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); 71 - OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); 72 - OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); 73 - OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); 74 - OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); 75 - OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); 76 - OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 77 - OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 78 - #endif 79 65 BLANK(); 80 66 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); 81 67 DEFINE(NR_syscalls, sizeof(syscalls));

-2

arch/x86/kernel/head_32.S

··· 155 155 jmp *%eax 156 156 157 157 .Lbad_subarch: 158 - WEAK(lguest_entry) 159 158 WEAK(xen_entry) 160 159 /* Unknown implementation; there's really 161 160 nothing we can do at this point. */ ··· 164 165 165 166 subarch_entries: 166 167 .long .Ldefault_entry /* normal x86/PC */ 167 - .long lguest_entry /* lguest hypervisor */ 168 168 .long xen_entry /* Xen hypervisor */ 169 169 .long .Ldefault_entry /* Moorestown MID */ 170 170 num_subarch_entries = (. - subarch_entries) / 4

-1

arch/x86/kernel/platform-quirks.c

··· 16 16 x86_platform.legacy.reserve_bios_regions = 1; 17 17 break; 18 18 case X86_SUBARCH_XEN: 19 - case X86_SUBARCH_LGUEST: 20 19 x86_platform.legacy.devices.pnpbios = 0; 21 20 x86_platform.legacy.rtc = 0; 22 21 break;

-1

arch/x86/kvm/Kconfig

··· 89 89 # OK, it's a little counter-intuitive to do this, but it puts it neatly under 90 90 # the virtualization menu. 91 91 source drivers/vhost/Kconfig 92 - source drivers/lguest/Kconfig 93 92 94 93 endif # VIRTUALIZATION

-14

arch/x86/lguest/Kconfig

··· 1 - config LGUEST_GUEST 2 - bool "Lguest guest support" 3 - depends on X86_32 && PARAVIRT && PCI 4 - select TTY 5 - select VIRTUALIZATION 6 - select VIRTIO 7 - select VIRTIO_CONSOLE 8 - help 9 - Lguest is a tiny in-kernel hypervisor. Selecting this will 10 - allow your kernel to boot under lguest. This option will increase 11 - your kernel size by about 10k. If in doubt, say N. 12 - 13 - If you say Y here, make sure you say Y (or M) to the virtio block 14 - and net drivers which lguest needs.

-2

arch/x86/lguest/Makefile

··· 1 - obj-y := head_32.o boot.o 2 - CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)

-1558

arch/x86/lguest/boot.c

··· 1 - /*P:010 2 - * A hypervisor allows multiple Operating Systems to run on a single machine. 3 - * To quote David Wheeler: "Any problem in computer science can be solved with 4 - * another layer of indirection." 5 - * 6 - * We keep things simple in two ways. First, we start with a normal Linux 7 - * kernel and insert a module (lg.ko) which allows us to run other Linux 8 - * kernels the same way we'd run processes. We call the first kernel the Host, 9 - * and the others the Guests. The program which sets up and configures Guests 10 - * (such as the example in tools/lguest/lguest.c) is called the Launcher. 11 - * 12 - * Secondly, we only run specially modified Guests, not normal kernels: setting 13 - * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows 14 - * how to be a Guest at boot time. This means that you can use the same kernel 15 - * you boot normally (ie. as a Host) as a Guest. 16 - * 17 - * These Guests know that they cannot do privileged operations, such as disable 18 - * interrupts, and that they have to ask the Host to do such things explicitly. 19 - * This file consists of all the replacements for such low-level native 20 - * hardware operations: these special Guest versions call the Host. 21 - * 22 - * So how does the kernel know it's a Guest? We'll see that later, but let's 23 - * just say that we end up here where we replace the native functions various 24 - * "paravirt" structures with our Guest versions, then boot like normal. 25 - :*/ 26 - 27 - /* 28 - * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 29 - * 30 - * This program is free software; you can redistribute it and/or modify 31 - * it under the terms of the GNU General Public License as published by 32 - * the Free Software Foundation; either version 2 of the License, or 33 - * (at your option) any later version. 
34 - * 35 - * This program is distributed in the hope that it will be useful, but 36 - * WITHOUT ANY WARRANTY; without even the implied warranty of 37 - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 38 - * NON INFRINGEMENT. See the GNU General Public License for more 39 - * details. 40 - * 41 - * You should have received a copy of the GNU General Public License 42 - * along with this program; if not, write to the Free Software 43 - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 44 - */ 45 - #include <linux/kernel.h> 46 - #include <linux/start_kernel.h> 47 - #include <linux/string.h> 48 - #include <linux/console.h> 49 - #include <linux/screen_info.h> 50 - #include <linux/irq.h> 51 - #include <linux/interrupt.h> 52 - #include <linux/clocksource.h> 53 - #include <linux/clockchips.h> 54 - #include <linux/lguest.h> 55 - #include <linux/lguest_launcher.h> 56 - #include <linux/virtio_console.h> 57 - #include <linux/pm.h> 58 - #include <linux/export.h> 59 - #include <linux/pci.h> 60 - #include <linux/virtio_pci.h> 61 - #include <asm/acpi.h> 62 - #include <asm/apic.h> 63 - #include <asm/lguest.h> 64 - #include <asm/paravirt.h> 65 - #include <asm/param.h> 66 - #include <asm/page.h> 67 - #include <asm/pgtable.h> 68 - #include <asm/desc.h> 69 - #include <asm/setup.h> 70 - #include <asm/e820/api.h> 71 - #include <asm/mce.h> 72 - #include <asm/io.h> 73 - #include <asm/fpu/api.h> 74 - #include <asm/stackprotector.h> 75 - #include <asm/reboot.h> /* for struct machine_ops */ 76 - #include <asm/kvm_para.h> 77 - #include <asm/pci_x86.h> 78 - #include <asm/pci-direct.h> 79 - 80 - /*G:010 81 - * Welcome to the Guest! 82 - * 83 - * The Guest in our tale is a simple creature: identical to the Host but 84 - * behaving in simplified but equivalent ways. In particular, the Guest is the 85 - * same kernel as the Host (or at least, built from the same source code). 86 - :*/ 87 - 88 - struct lguest_data lguest_data = { 89 - .hcall_status = { [0 ... 
LHCALL_RING_SIZE-1] = 0xFF }, 90 - .noirq_iret = (u32)lguest_noirq_iret, 91 - .kernel_address = PAGE_OFFSET, 92 - .blocked_interrupts = { 1 }, /* Block timer interrupts */ 93 - .syscall_vec = IA32_SYSCALL_VECTOR, 94 - }; 95 - 96 - /*G:037 97 - * async_hcall() is pretty simple: I'm quite proud of it really. We have a 98 - * ring buffer of stored hypercalls which the Host will run though next time we 99 - * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall 100 - * arguments, and a "hcall_status" word which is 0 if the call is ready to go, 101 - * and 255 once the Host has finished with it. 102 - * 103 - * If we come around to a slot which hasn't been finished, then the table is 104 - * full and we just make the hypercall directly. This has the nice side 105 - * effect of causing the Host to run all the stored calls in the ring buffer 106 - * which empties it for next time! 107 - */ 108 - static void async_hcall(unsigned long call, unsigned long arg1, 109 - unsigned long arg2, unsigned long arg3, 110 - unsigned long arg4) 111 - { 112 - /* Note: This code assumes we're uniprocessor. */ 113 - static unsigned int next_call; 114 - unsigned long flags; 115 - 116 - /* 117 - * Disable interrupts if not already disabled: we don't want an 118 - * interrupt handler making a hypercall while we're already doing 119 - * one! 120 - */ 121 - local_irq_save(flags); 122 - if (lguest_data.hcall_status[next_call] != 0xFF) { 123 - /* Table full, so do normal hcall which will flush table. 
*/ 124 - hcall(call, arg1, arg2, arg3, arg4); 125 - } else { 126 - lguest_data.hcalls[next_call].arg0 = call; 127 - lguest_data.hcalls[next_call].arg1 = arg1; 128 - lguest_data.hcalls[next_call].arg2 = arg2; 129 - lguest_data.hcalls[next_call].arg3 = arg3; 130 - lguest_data.hcalls[next_call].arg4 = arg4; 131 - /* Arguments must all be written before we mark it to go */ 132 - wmb(); 133 - lguest_data.hcall_status[next_call] = 0; 134 - if (++next_call == LHCALL_RING_SIZE) 135 - next_call = 0; 136 - } 137 - local_irq_restore(flags); 138 - } 139 - 140 - /*G:035 141 - * Notice the lazy_hcall() above, rather than hcall(). This is our first real 142 - * optimization trick! 143 - * 144 - * When lazy_mode is set, it means we're allowed to defer all hypercalls and do 145 - * them as a batch when lazy_mode is eventually turned off. Because hypercalls 146 - * are reasonably expensive, batching them up makes sense. For example, a 147 - * large munmap might update dozens of page table entries: that code calls 148 - * paravirt_enter_lazy_mmu(), does the dozen updates, then calls 149 - * lguest_leave_lazy_mode(). 150 - * 151 - * So, when we're in lazy mode, we call async_hcall() to store the call for 152 - * future processing: 153 - */ 154 - static void lazy_hcall1(unsigned long call, unsigned long arg1) 155 - { 156 - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 157 - hcall(call, arg1, 0, 0, 0); 158 - else 159 - async_hcall(call, arg1, 0, 0, 0); 160 - } 161 - 162 - /* You can imagine what lazy_hcall2, 3 and 4 look like. 
:*/ 163 - static void lazy_hcall2(unsigned long call, 164 - unsigned long arg1, 165 - unsigned long arg2) 166 - { 167 - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 168 - hcall(call, arg1, arg2, 0, 0); 169 - else 170 - async_hcall(call, arg1, arg2, 0, 0); 171 - } 172 - 173 - static void lazy_hcall3(unsigned long call, 174 - unsigned long arg1, 175 - unsigned long arg2, 176 - unsigned long arg3) 177 - { 178 - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 179 - hcall(call, arg1, arg2, arg3, 0); 180 - else 181 - async_hcall(call, arg1, arg2, arg3, 0); 182 - } 183 - 184 - #ifdef CONFIG_X86_PAE 185 - static void lazy_hcall4(unsigned long call, 186 - unsigned long arg1, 187 - unsigned long arg2, 188 - unsigned long arg3, 189 - unsigned long arg4) 190 - { 191 - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 192 - hcall(call, arg1, arg2, arg3, arg4); 193 - else 194 - async_hcall(call, arg1, arg2, arg3, arg4); 195 - } 196 - #endif 197 - 198 - /*G:036 199 - * When lazy mode is turned off, we issue the do-nothing hypercall to 200 - * flush any stored calls, and call the generic helper to reset the 201 - * per-cpu lazy mode variable. 202 - */ 203 - static void lguest_leave_lazy_mmu_mode(void) 204 - { 205 - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); 206 - paravirt_leave_lazy_mmu(); 207 - } 208 - 209 - /* 210 - * We also catch the end of context switch; we enter lazy mode for much of 211 - * that too, so again we need to flush here. 212 - * 213 - * (Technically, this is lazy CPU mode, and normally we're in lazy MMU 214 - * mode, but unlike Xen, lguest doesn't care about the difference). 215 - */ 216 - static void lguest_end_context_switch(struct task_struct *next) 217 - { 218 - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); 219 - paravirt_end_context_switch(next); 220 - } 221 - 222 - /*G:032 223 - * After that diversion we return to our first native-instruction 224 - * replacements: four functions for interrupt control. 
225 - * 226 - * The simplest way of implementing these would be to have "turn interrupts 227 - * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: 228 - * these are by far the most commonly called functions of those we override. 229 - * 230 - * So instead we keep an "irq_enabled" field inside our "struct lguest_data", 231 - * which the Guest can update with a single instruction. The Host knows to 232 - * check there before it tries to deliver an interrupt. 233 - */ 234 - 235 - /* 236 - * save_flags() is expected to return the processor state (ie. "flags"). The 237 - * flags word contains all kind of stuff, but in practice Linux only cares 238 - * about the interrupt flag. Our "save_flags()" just returns that. 239 - */ 240 - asmlinkage __visible unsigned long lguest_save_fl(void) 241 - { 242 - return lguest_data.irq_enabled; 243 - } 244 - 245 - /* Interrupts go off... */ 246 - asmlinkage __visible void lguest_irq_disable(void) 247 - { 248 - lguest_data.irq_enabled = 0; 249 - } 250 - 251 - /* 252 - * Let's pause a moment. Remember how I said these are called so often? 253 - * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to 254 - * break some rules. In particular, these functions are assumed to save their 255 - * own registers if they need to: normal C functions assume they can trash the 256 - * eax register. To use normal C functions, we use 257 - * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the 258 - * C function, then restores it. 259 - */ 260 - PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); 261 - PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); 262 - /*:*/ 263 - 264 - /* These are in head_32.S */ 265 - extern void lg_irq_enable(void); 266 - extern void lg_restore_fl(unsigned long flags); 267 - 268 - /*M:003 269 - * We could be more efficient in our checking of outstanding interrupts, rather 270 - * than using a branch. 
One way would be to put the "irq_enabled" field in a 271 - * page by itself, and have the Host write-protect it when an interrupt comes 272 - * in when irqs are disabled. There will then be a page fault as soon as 273 - * interrupts are re-enabled. 274 - * 275 - * A better method is to implement soft interrupt disable generally for x86: 276 - * instead of disabling interrupts, we set a flag. If an interrupt does come 277 - * in, we then disable them for real. This is uncommon, so we could simply use 278 - * a hypercall for interrupt control and not worry about efficiency. 279 - :*/ 280 - 281 - /*G:034 282 - * The Interrupt Descriptor Table (IDT). 283 - * 284 - * The IDT tells the processor what to do when an interrupt comes in. Each 285 - * entry in the table is a 64-bit descriptor: this holds the privilege level, 286 - * address of the handler, and... well, who cares? The Guest just asks the 287 - * Host to make the change anyway, because the Host controls the real IDT. 288 - */ 289 - static void lguest_write_idt_entry(gate_desc *dt, 290 - int entrynum, const gate_desc *g) 291 - { 292 - /* 293 - * The gate_desc structure is 8 bytes long: we hand it to the Host in 294 - * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors 295 - * around like this; typesafety wasn't a big concern in Linux's early 296 - * years. 297 - */ 298 - u32 *desc = (u32 *)g; 299 - /* Keep the local copy up to date. */ 300 - native_write_idt_entry(dt, entrynum, g); 301 - /* Tell Host about this new entry. */ 302 - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); 303 - } 304 - 305 - /* 306 - * Changing to a different IDT is very rare: we keep the IDT up-to-date every 307 - * time it is written, so we can simply loop through all entries and tell the 308 - * Host about them. 
309 - */ 310 - static void lguest_load_idt(const struct desc_ptr *desc) 311 - { 312 - unsigned int i; 313 - struct desc_struct *idt = (void *)desc->address; 314 - 315 - for (i = 0; i < (desc->size+1)/8; i++) 316 - hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); 317 - } 318 - 319 - /* 320 - * The Global Descriptor Table. 321 - * 322 - * The Intel architecture defines another table, called the Global Descriptor 323 - * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" 324 - * instruction, and then several other instructions refer to entries in the 325 - * table. There are three entries which the Switcher needs, so the Host simply 326 - * controls the entire thing and the Guest asks it to make changes using the 327 - * LOAD_GDT hypercall. 328 - * 329 - * This is the exactly like the IDT code. 330 - */ 331 - static void lguest_load_gdt(const struct desc_ptr *desc) 332 - { 333 - unsigned int i; 334 - struct desc_struct *gdt = (void *)desc->address; 335 - 336 - for (i = 0; i < (desc->size+1)/8; i++) 337 - hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); 338 - } 339 - 340 - /* 341 - * For a single GDT entry which changes, we simply change our copy and 342 - * then tell the host about it. 343 - */ 344 - static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, 345 - const void *desc, int type) 346 - { 347 - native_write_gdt_entry(dt, entrynum, desc, type); 348 - /* Tell Host about this new entry. */ 349 - hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, 350 - dt[entrynum].a, dt[entrynum].b, 0); 351 - } 352 - 353 - /* 354 - * There are three "thread local storage" GDT entries which change 355 - * on every context switch (these three entries are how glibc implements 356 - * __thread variables). As an optimization, we have a hypercall 357 - * specifically for this case. 358 - * 359 - * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall 360 - * which took a range of entries? 
361 - */ 362 - static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) 363 - { 364 - /* 365 - * There's one problem which normal hardware doesn't have: the Host 366 - * can't handle us removing entries we're currently using. So we clear 367 - * the GS register here: if it's needed it'll be reloaded anyway. 368 - */ 369 - lazy_load_gs(0); 370 - lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); 371 - } 372 - 373 - /*G:038 374 - * That's enough excitement for now, back to ploughing through each of the 375 - * different pv_ops structures (we're about 1/3 of the way through). 376 - * 377 - * This is the Local Descriptor Table, another weird Intel thingy. Linux only 378 - * uses this for some strange applications like Wine. We don't do anything 379 - * here, so they'll get an informative and friendly Segmentation Fault. 380 - */ 381 - static void lguest_set_ldt(const void *addr, unsigned entries) 382 - { 383 - } 384 - 385 - /* 386 - * This loads a GDT entry into the "Task Register": that entry points to a 387 - * structure called the Task State Segment. Some comments scattered though the 388 - * kernel code indicate that this used for task switching in ages past, along 389 - * with blood sacrifice and astrology. 390 - * 391 - * Now there's nothing interesting in here that we don't get told elsewhere. 392 - * But the native version uses the "ltr" instruction, which makes the Host 393 - * complain to the Guest about a Segmentation Fault and it'll oops. So we 394 - * override the native version with a do-nothing version. 395 - */ 396 - static void lguest_load_tr_desc(void) 397 - { 398 - } 399 - 400 - /* 401 - * The "cpuid" instruction is a way of querying both the CPU identity 402 - * (manufacturer, model, etc) and its features. It was introduced before the 403 - * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. 404 - * As you might imagine, after a decade and a half this treatment, it is now a 405 - * giant ball of hair. 
Its entry in the current Intel manual runs to 28 pages. 406 - * 407 - * This instruction even it has its own Wikipedia entry. The Wikipedia entry 408 - * has been translated into 6 languages. I am not making this up! 409 - * 410 - * We could get funky here and identify ourselves as "GenuineLguest", but 411 - * instead we just use the real "cpuid" instruction. Then I pretty much turned 412 - * off feature bits until the Guest booted. (Don't say that: you'll damage 413 - * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is 414 - * hardly future proof.) No one's listening! They don't like you anyway, 415 - * parenthetic weirdo! 416 - * 417 - * Replacing the cpuid so we can turn features off is great for the kernel, but 418 - * anyone (including userspace) can just use the raw "cpuid" instruction and 419 - * the Host won't even notice since it isn't privileged. So we try not to get 420 - * too worked up about it. 421 - */ 422 - static void lguest_cpuid(unsigned int *ax, unsigned int *bx, 423 - unsigned int *cx, unsigned int *dx) 424 - { 425 - int function = *ax; 426 - 427 - native_cpuid(ax, bx, cx, dx); 428 - switch (function) { 429 - /* 430 - * CPUID 0 gives the highest legal CPUID number (and the ID string). 431 - * We futureproof our code a little by sticking to known CPUID values. 432 - */ 433 - case 0: 434 - if (*ax > 5) 435 - *ax = 5; 436 - break; 437 - 438 - /* 439 - * CPUID 1 is a basic feature request. 440 - * 441 - * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 442 - * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. 443 - */ 444 - case 1: 445 - *cx &= 0x00002201; 446 - *dx &= 0x07808151; 447 - /* 448 - * The Host can do a nice optimization if it knows that the 449 - * kernel mappings (addresses above 0xC0000000 or whatever 450 - * PAGE_OFFSET is set to) haven't changed. But Linux calls 451 - * flush_tlb_user() for both user and kernel mappings unless 452 - * the Page Global Enable (PGE) feature bit is set. 
453 - */ 454 - *dx |= 0x00002000; 455 - /* 456 - * We also lie, and say we're family id 5. 6 or greater 457 - * leads to a rdmsr in early_init_intel which we can't handle. 458 - * Family ID is returned as bits 8-12 in ax. 459 - */ 460 - *ax &= 0xFFFFF0FF; 461 - *ax |= 0x00000500; 462 - break; 463 - 464 - /* 465 - * This is used to detect if we're running under KVM. We might be, 466 - * but that's a Host matter, not us. So say we're not. 467 - */ 468 - case KVM_CPUID_SIGNATURE: 469 - *bx = *cx = *dx = 0; 470 - break; 471 - 472 - /* 473 - * 0x80000000 returns the highest Extended Function, so we futureproof 474 - * like we do above by limiting it to known fields. 475 - */ 476 - case 0x80000000: 477 - if (*ax > 0x80000008) 478 - *ax = 0x80000008; 479 - break; 480 - 481 - /* 482 - * PAE systems can mark pages as non-executable. Linux calls this the 483 - * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced 484 - * Virus Protection). We just switch it off here, since we don't 485 - * support it. 486 - */ 487 - case 0x80000001: 488 - *dx &= ~(1 << 20); 489 - break; 490 - } 491 - } 492 - 493 - /* 494 - * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. 495 - * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother 496 - * it. The Host needs to know when the Guest wants to change them, so we have 497 - * a whole series of functions like read_cr0() and write_cr0(). 498 - * 499 - * We start with cr0. cr0 allows you to turn on and off all kinds of basic 500 - * features, but the only cr0 bit that Linux ever used at runtime was the 501 - * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) 502 - * 503 - * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if 504 - * the floating point unit is used. Which allows us to restore FPU state 505 - * lazily after a task switch if we wanted to, but wouldn't a name like 506 - * "FPUTRAP bit" be a little less cryptic? 
507 - * 508 - * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore 509 - * cr0. 510 - */ 511 - static void lguest_write_cr0(unsigned long val) 512 - { 513 - } 514 - 515 - static unsigned long lguest_read_cr0(void) 516 - { 517 - return 0; 518 - } 519 - 520 - /* 521 - * cr2 is the virtual address of the last page fault, which the Guest only ever 522 - * reads. The Host kindly writes this into our "struct lguest_data", so we 523 - * just read it out of there. 524 - */ 525 - static unsigned long lguest_read_cr2(void) 526 - { 527 - return lguest_data.cr2; 528 - } 529 - 530 - /* See lguest_set_pte() below. */ 531 - static bool cr3_changed = false; 532 - static unsigned long current_cr3; 533 - 534 - /* 535 - * cr3 is the current toplevel pagetable page: the principle is the same as 536 - * cr0. Keep a local copy, and tell the Host when it changes. 537 - */ 538 - static void lguest_write_cr3(unsigned long cr3) 539 - { 540 - lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); 541 - current_cr3 = cr3; 542 - 543 - /* These two page tables are simple, linear, and used during boot */ 544 - if (cr3 != __pa_symbol(swapper_pg_dir) && 545 - cr3 != __pa_symbol(initial_page_table)) 546 - cr3_changed = true; 547 - } 548 - 549 - static unsigned long lguest_read_cr3(void) 550 - { 551 - return current_cr3; 552 - } 553 - 554 - /* cr4 is used to enable and disable PGE, but we don't care. */ 555 - static unsigned long lguest_read_cr4(void) 556 - { 557 - return 0; 558 - } 559 - 560 - static void lguest_write_cr4(unsigned long val) 561 - { 562 - } 563 - 564 - /* 565 - * Page Table Handling. 566 - * 567 - * Now would be a good time to take a rest and grab a coffee or similarly 568 - * relaxing stimulant. The easy parts are behind us, and the trek gradually 569 - * winds uphill from here. 570 - * 571 - * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU 572 - * maps virtual addresses to physical addresses using "page tables". 
We could 573 - * use one huge index of 1 million entries: each address is 4 bytes, so that's 574 - * 1024 pages just to hold the page tables. But since most virtual addresses 575 - * are unused, we use a two level index which saves space. The cr3 register 576 - * contains the physical address of the top level "page directory" page, which 577 - * contains physical addresses of up to 1024 second-level pages. Each of these 578 - * second level pages contains up to 1024 physical addresses of actual pages, 579 - * or Page Table Entries (PTEs). 580 - * 581 - * Here's a diagram, where arrows indicate physical addresses: 582 - * 583 - * cr3 ---> +---------+ 584 - * | --------->+---------+ 585 - * | | | PADDR1 | 586 - * Mid-level | | PADDR2 | 587 - * (PMD) page | | | 588 - * | | Lower-level | 589 - * | | (PTE) page | 590 - * | | | | 591 - * .... .... 592 - * 593 - * So to convert a virtual address to a physical address, we look up the top 594 - * level, which points us to the second level, which gives us the physical 595 - * address of that page. If the top level entry was not present, or the second 596 - * level entry was not present, then the virtual address is invalid (we 597 - * say "the page was not mapped"). 598 - * 599 - * Put another way, a 32-bit virtual address is divided up like so: 600 - * 601 - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 602 - * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| 603 - * Index into top Index into second Offset within page 604 - * page directory page pagetable page 605 - * 606 - * Now, unfortunately, this isn't the whole story: Intel added Physical Address 607 - * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). 608 - * These are held in 64-bit page table entries, so we can now only fit 512 609 - * entries in a page, and the neat three-level tree breaks down. 
610 - * 611 - * The result is a four level page table: 612 - * 613 - * cr3 --> [ 4 Upper ] 614 - * [ Level ] 615 - * [ Entries ] 616 - * [(PUD Page)]---> +---------+ 617 - * | --------->+---------+ 618 - * | | | PADDR1 | 619 - * Mid-level | | PADDR2 | 620 - * (PMD) page | | | 621 - * | | Lower-level | 622 - * | | (PTE) page | 623 - * | | | | 624 - * .... .... 625 - * 626 - * 627 - * And the virtual address is decoded as: 628 - * 629 - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 630 - * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| 631 - * Index into Index into mid Index into lower Offset within page 632 - * top entries directory page pagetable page 633 - * 634 - * It's too hard to switch between these two formats at runtime, so Linux only 635 - * supports one or the other depending on whether CONFIG_X86_PAE is set. Many 636 - * distributions turn it on, and not just for people with silly amounts of 637 - * memory: the larger PTE entries allow room for the NX bit, which lets the 638 - * kernel disable execution of pages and increase security. 639 - * 640 - * This was a problem for lguest, which couldn't run on these distributions; 641 - * then Matias Zabaljauregui figured it all out and implemented it, and only a 642 - * handful of puppies were crushed in the process! 643 - * 644 - * Back to our point: the kernel spends a lot of time changing both the 645 - * top-level page directory and lower-level pagetable pages. The Guest doesn't 646 - * know physical addresses, so while it maintains these page tables exactly 647 - * like normal, it also needs to keep the Host informed whenever it makes a 648 - * change: the Host will create the real page tables based on the Guests'. 649 - */ 650 - 651 - /* 652 - * The Guest calls this after it has set a second-level entry (pte), ie. to map 653 - * a page into a process' address space. We tell the Host the toplevel and 654 - * address this corresponds to. 
The Guest uses one pagetable per process, so 655 - * we need to tell the Host which one we're changing (mm->pgd). 656 - */ 657 - static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 658 - pte_t *ptep) 659 - { 660 - #ifdef CONFIG_X86_PAE 661 - /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ 662 - lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, 663 - ptep->pte_low, ptep->pte_high); 664 - #else 665 - lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); 666 - #endif 667 - } 668 - 669 - /* This is the "set and update" combo-meal-deal version. */ 670 - static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 671 - pte_t *ptep, pte_t pteval) 672 - { 673 - native_set_pte(ptep, pteval); 674 - lguest_pte_update(mm, addr, ptep); 675 - } 676 - 677 - /* 678 - * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd 679 - * to set a middle-level entry when PAE is activated. 680 - * 681 - * Again, we set the entry then tell the Host which page we changed, 682 - * and the index of the entry we changed. 683 - */ 684 - #ifdef CONFIG_X86_PAE 685 - static void lguest_set_pud(pud_t *pudp, pud_t pudval) 686 - { 687 - native_set_pud(pudp, pudval); 688 - 689 - /* 32 bytes aligned pdpt address and the index. */ 690 - lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, 691 - (__pa(pudp) & 0x1F) / sizeof(pud_t)); 692 - } 693 - 694 - static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 695 - { 696 - native_set_pmd(pmdp, pmdval); 697 - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, 698 - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); 699 - } 700 - #else 701 - 702 - /* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. 
*/ 703 - static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 704 - { 705 - native_set_pmd(pmdp, pmdval); 706 - lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, 707 - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); 708 - } 709 - #endif 710 - 711 - /* 712 - * There are a couple of legacy places where the kernel sets a PTE, but we 713 - * don't know the top level any more. This is useless for us, since we don't 714 - * know which pagetable is changing or what address, so we just tell the Host 715 - * to forget all of them. Fortunately, this is very rare. 716 - * 717 - * ... except in early boot when the kernel sets up the initial pagetables, 718 - * which makes booting astonishingly slow: 48 seconds! So we don't even tell 719 - * the Host anything changed until we've done the first real page table switch, 720 - * which brings boot back to 4.3 seconds. 721 - */ 722 - static void lguest_set_pte(pte_t *ptep, pte_t pteval) 723 - { 724 - native_set_pte(ptep, pteval); 725 - if (cr3_changed) 726 - lazy_hcall1(LHCALL_FLUSH_TLB, 1); 727 - } 728 - 729 - #ifdef CONFIG_X86_PAE 730 - /* 731 - * With 64-bit PTE values, we need to be careful setting them: if we set 32 732 - * bits at a time, the hardware could see a weird half-set entry. These 733 - * versions ensure we update all 64 bits at once. 734 - */ 735 - static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) 736 - { 737 - native_set_pte_atomic(ptep, pte); 738 - if (cr3_changed) 739 - lazy_hcall1(LHCALL_FLUSH_TLB, 1); 740 - } 741 - 742 - static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, 743 - pte_t *ptep) 744 - { 745 - native_pte_clear(mm, addr, ptep); 746 - lguest_pte_update(mm, addr, ptep); 747 - } 748 - 749 - static void lguest_pmd_clear(pmd_t *pmdp) 750 - { 751 - lguest_set_pmd(pmdp, __pmd(0)); 752 - } 753 - #endif 754 - 755 - /* 756 - * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 757 - * native page table operations. 
On native hardware you can set a new page 758 - * table entry whenever you want, but if you want to remove one you have to do 759 - * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). 760 - * 761 - * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only 762 - * called when a valid entry is written, not when it's removed (ie. marked not 763 - * present). Instead, this is where we come when the Guest wants to remove a 764 - * page table entry: we tell the Host to set that entry to 0 (ie. the present 765 - * bit is zero). 766 - */ 767 - static void lguest_flush_tlb_single(unsigned long addr) 768 - { 769 - /* Simply set it to zero: if it was not, it will fault back in. */ 770 - lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); 771 - } 772 - 773 - /* 774 - * This is what happens after the Guest has removed a large number of entries. 775 - * This tells the Host that any of the page table entries for userspace might 776 - * have changed, ie. virtual addresses below PAGE_OFFSET. 777 - */ 778 - static void lguest_flush_tlb_user(void) 779 - { 780 - lazy_hcall1(LHCALL_FLUSH_TLB, 0); 781 - } 782 - 783 - /* 784 - * This is called when the kernel page tables have changed. That's not very 785 - * common (unless the Guest is using highmem, which makes the Guest extremely 786 - * slow), so it's worth separating this from the user flushing above. 787 - */ 788 - static void lguest_flush_tlb_kernel(void) 789 - { 790 - lazy_hcall1(LHCALL_FLUSH_TLB, 1); 791 - } 792 - 793 - /* 794 - * The Unadvanced Programmable Interrupt Controller. 795 - * 796 - * This is an attempt to implement the simplest possible interrupt controller. 797 - * I spent some time looking though routines like set_irq_chip_and_handler, 798 - * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and 799 - * I *think* this is as simple as it gets. 
800 - * 801 - * We can tell the Host what interrupts we want blocked ready for using the 802 - * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as 803 - * simple as setting a bit. We don't actually "ack" interrupts as such, we 804 - * just mask and unmask them. I wonder if we should be cleverer? 805 - */ 806 - static void disable_lguest_irq(struct irq_data *data) 807 - { 808 - set_bit(data->irq, lguest_data.blocked_interrupts); 809 - } 810 - 811 - static void enable_lguest_irq(struct irq_data *data) 812 - { 813 - clear_bit(data->irq, lguest_data.blocked_interrupts); 814 - } 815 - 816 - /* This structure describes the lguest IRQ controller. */ 817 - static struct irq_chip lguest_irq_controller = { 818 - .name = "lguest", 819 - .irq_mask = disable_lguest_irq, 820 - .irq_mask_ack = disable_lguest_irq, 821 - .irq_unmask = enable_lguest_irq, 822 - }; 823 - 824 - /* 825 - * Interrupt descriptors are allocated as-needed, but low-numbered ones are 826 - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it 827 - * tells us the irq is already used: other errors (ie. ENOMEM) we take 828 - * seriously. 829 - */ 830 - static int lguest_setup_irq(unsigned int irq) 831 - { 832 - struct irq_desc *desc; 833 - int err; 834 - 835 - /* Returns -ve error or vector number. */ 836 - err = irq_alloc_desc_at(irq, 0); 837 - if (err < 0 && err != -EEXIST) 838 - return err; 839 - 840 - /* 841 - * Tell the Linux infrastructure that the interrupt is 842 - * controlled by our level-based lguest interrupt controller. 843 - */ 844 - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, 845 - handle_level_irq, "level"); 846 - 847 - /* Some systems map "vectors" to interrupts weirdly. Not us! 
*/ 848 - desc = irq_to_desc(irq); 849 - __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); 850 - return 0; 851 - } 852 - 853 - static int lguest_enable_irq(struct pci_dev *dev) 854 - { 855 - int err; 856 - u8 line = 0; 857 - 858 - /* We literally use the PCI interrupt line as the irq number. */ 859 - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); 860 - err = lguest_setup_irq(line); 861 - if (!err) 862 - dev->irq = line; 863 - return err; 864 - } 865 - 866 - /* We don't do hotplug PCI, so this shouldn't be called. */ 867 - static void lguest_disable_irq(struct pci_dev *dev) 868 - { 869 - WARN_ON(1); 870 - } 871 - 872 - /* 873 - * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware 874 - * interrupt (except 128, which is used for system calls). 875 - */ 876 - static void __init lguest_init_IRQ(void) 877 - { 878 - unsigned int i; 879 - 880 - for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { 881 - if (i != IA32_SYSCALL_VECTOR) 882 - set_intr_gate(i, irq_entries_start + 883 - 8 * (i - FIRST_EXTERNAL_VECTOR)); 884 - } 885 - 886 - /* 887 - * This call is required to set up for 4k stacks, where we have 888 - * separate stacks for hard and soft interrupts. 889 - */ 890 - irq_ctx_init(smp_processor_id()); 891 - } 892 - 893 - /* 894 - * Time. 895 - * 896 - * It would be far better for everyone if the Guest had its own clock, but 897 - * until then the Host gives us the time on every interrupt. 898 - */ 899 - static void lguest_get_wallclock(struct timespec *now) 900 - { 901 - *now = lguest_data.time; 902 - } 903 - 904 - /* 905 - * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 906 - * what speed it runs at, or 0 if it's unusable as a reliable clock source. 907 - * This matches what we want here: if we return 0 from this function, the x86 908 - * TSC clock will give up and not register itself. 
909 - */ 910 - static unsigned long lguest_tsc_khz(void) 911 - { 912 - return lguest_data.tsc_khz; 913 - } 914 - 915 - /* 916 - * If we can't use the TSC, the kernel falls back to our lower-priority 917 - * "lguest_clock", where we read the time value given to us by the Host. 918 - */ 919 - static u64 lguest_clock_read(struct clocksource *cs) 920 - { 921 - unsigned long sec, nsec; 922 - 923 - /* 924 - * Since the time is in two parts (seconds and nanoseconds), we risk 925 - * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, 926 - * and getting 99 and 0. As Linux tends to come apart under the stress 927 - * of time travel, we must be careful: 928 - */ 929 - do { 930 - /* First we read the seconds part. */ 931 - sec = lguest_data.time.tv_sec; 932 - /* 933 - * This read memory barrier tells the compiler and the CPU that 934 - * this can't be reordered: we have to complete the above 935 - * before going on. 936 - */ 937 - rmb(); 938 - /* Now we read the nanoseconds part. */ 939 - nsec = lguest_data.time.tv_nsec; 940 - /* Make sure we've done that. */ 941 - rmb(); 942 - /* Now if the seconds part has changed, try again. */ 943 - } while (unlikely(lguest_data.time.tv_sec != sec)); 944 - 945 - /* Our lguest clock is in real nanoseconds. */ 946 - return sec*1000000000ULL + nsec; 947 - } 948 - 949 - /* This is the fallback clocksource: lower priority than the TSC clocksource. */ 950 - static struct clocksource lguest_clock = { 951 - .name = "lguest", 952 - .rating = 200, 953 - .read = lguest_clock_read, 954 - .mask = CLOCKSOURCE_MASK(64), 955 - .flags = CLOCK_SOURCE_IS_CONTINUOUS, 956 - }; 957 - 958 - /* 959 - * We also need a "struct clock_event_device": Linux asks us to set it to go 960 - * off some time in the future. Actually, James Morris figured all this out, I 961 - * just applied the patch. 
962 - */ 963 - static int lguest_clockevent_set_next_event(unsigned long delta, 964 - struct clock_event_device *evt) 965 - { 966 - /* FIXME: I don't think this can ever happen, but James tells me he had 967 - * to put this code in. Maybe we should remove it now. Anyone? */ 968 - if (delta < LG_CLOCK_MIN_DELTA) { 969 - if (printk_ratelimit()) 970 - printk(KERN_DEBUG "%s: small delta %lu ns\n", 971 - __func__, delta); 972 - return -ETIME; 973 - } 974 - 975 - /* Please wake us this far in the future. */ 976 - hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); 977 - return 0; 978 - } 979 - 980 - static int lguest_clockevent_shutdown(struct clock_event_device *evt) 981 - { 982 - /* A 0 argument shuts the clock down. */ 983 - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); 984 - return 0; 985 - } 986 - 987 - /* This describes our primitive timer chip. */ 988 - static struct clock_event_device lguest_clockevent = { 989 - .name = "lguest", 990 - .features = CLOCK_EVT_FEAT_ONESHOT, 991 - .set_next_event = lguest_clockevent_set_next_event, 992 - .set_state_shutdown = lguest_clockevent_shutdown, 993 - .rating = INT_MAX, 994 - .mult = 1, 995 - .shift = 0, 996 - .min_delta_ns = LG_CLOCK_MIN_DELTA, 997 - .min_delta_ticks = LG_CLOCK_MIN_DELTA, 998 - .max_delta_ns = LG_CLOCK_MAX_DELTA, 999 - .max_delta_ticks = LG_CLOCK_MAX_DELTA, 1000 - }; 1001 - 1002 - /* 1003 - * This is the Guest timer interrupt handler (hardware interrupt 0). We just 1004 - * call the clockevent infrastructure and it does whatever needs doing. 1005 - */ 1006 - static void lguest_time_irq(struct irq_desc *desc) 1007 - { 1008 - unsigned long flags; 1009 - 1010 - /* Don't interrupt us while this is running. */ 1011 - local_irq_save(flags); 1012 - lguest_clockevent.event_handler(&lguest_clockevent); 1013 - local_irq_restore(flags); 1014 - } 1015 - 1016 - /* 1017 - * At some point in the boot process, we get asked to set up our timing 1018 - * infrastructure. 
The kernel doesn't expect timer interrupts before this, but 1019 - * we cleverly initialized the "blocked_interrupts" field of "struct 1020 - * lguest_data" so that timer interrupts were blocked until now. 1021 - */ 1022 - static void lguest_time_init(void) 1023 - { 1024 - /* Set up the timer interrupt (0) to go to our simple timer routine */ 1025 - if (lguest_setup_irq(0) != 0) 1026 - panic("Could not set up timer irq"); 1027 - irq_set_handler(0, lguest_time_irq); 1028 - 1029 - clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); 1030 - 1031 - /* We can't set cpumask in the initializer: damn C limitations! Set it 1032 - * here and register our timer device. */ 1033 - lguest_clockevent.cpumask = cpumask_of(0); 1034 - clockevents_register_device(&lguest_clockevent); 1035 - 1036 - /* Finally, we unblock the timer interrupt. */ 1037 - clear_bit(0, lguest_data.blocked_interrupts); 1038 - } 1039 - 1040 - /* 1041 - * Miscellaneous bits and pieces. 1042 - * 1043 - * Here is an oddball collection of functions which the Guest needs for things 1044 - * to work. They're pretty simple. 1045 - */ 1046 - 1047 - /* 1048 - * The Guest needs to tell the Host what stack it expects traps to use. For 1049 - * native hardware, this is part of the Task State Segment mentioned above in 1050 - * lguest_load_tr_desc(), but to help hypervisors there's this special call. 1051 - * 1052 - * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data 1053 - * segment), the privilege level (we're privilege level 1, the Host is 0 and 1054 - * will not tolerate us trying to use that), the stack pointer, and the number 1055 - * of pages in the stack. 1056 - */ 1057 - static void lguest_load_sp0(struct tss_struct *tss, 1058 - struct thread_struct *thread) 1059 - { 1060 - lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, 1061 - THREAD_SIZE / PAGE_SIZE); 1062 - tss->x86_tss.sp0 = thread->sp0; 1063 - } 1064 - 1065 - /* Let's just say, I wouldn't do debugging under a Guest. 
*/ 1066 - static unsigned long lguest_get_debugreg(int regno) 1067 - { 1068 - /* FIXME: Implement */ 1069 - return 0; 1070 - } 1071 - 1072 - static void lguest_set_debugreg(int regno, unsigned long value) 1073 - { 1074 - /* FIXME: Implement */ 1075 - } 1076 - 1077 - /* 1078 - * There are times when the kernel wants to make sure that no memory writes are 1079 - * caught in the cache (that they've all reached real hardware devices). This 1080 - * doesn't matter for the Guest which has virtual hardware. 1081 - * 1082 - * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush 1083 - * (clflush) instruction is available and the kernel uses that. Otherwise, it 1084 - * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. 1085 - * Unlike clflush, wbinvd can only be run at privilege level 0. So we can 1086 - * ignore clflush, but replace wbinvd. 1087 - */ 1088 - static void lguest_wbinvd(void) 1089 - { 1090 - } 1091 - 1092 - /* 1093 - * If the Guest expects to have an Advanced Programmable Interrupt Controller, 1094 - * we play dumb by ignoring writes and returning 0 for reads. So it's no 1095 - * longer Programmable nor Controlling anything, and I don't think 8 lines of 1096 - * code qualifies for Advanced. It will also never interrupt anything. It 1097 - * does, however, allow us to get through the Linux boot code. 
1098 - */ 1099 - #ifdef CONFIG_X86_LOCAL_APIC 1100 - static void lguest_apic_write(u32 reg, u32 v) 1101 - { 1102 - } 1103 - 1104 - static u32 lguest_apic_read(u32 reg) 1105 - { 1106 - return 0; 1107 - } 1108 - 1109 - static u64 lguest_apic_icr_read(void) 1110 - { 1111 - return 0; 1112 - } 1113 - 1114 - static void lguest_apic_icr_write(u32 low, u32 id) 1115 - { 1116 - /* Warn to see if there's any stray references */ 1117 - WARN_ON(1); 1118 - } 1119 - 1120 - static void lguest_apic_wait_icr_idle(void) 1121 - { 1122 - return; 1123 - } 1124 - 1125 - static u32 lguest_apic_safe_wait_icr_idle(void) 1126 - { 1127 - return 0; 1128 - } 1129 - 1130 - static void set_lguest_basic_apic_ops(void) 1131 - { 1132 - apic->read = lguest_apic_read; 1133 - apic->write = lguest_apic_write; 1134 - apic->icr_read = lguest_apic_icr_read; 1135 - apic->icr_write = lguest_apic_icr_write; 1136 - apic->wait_icr_idle = lguest_apic_wait_icr_idle; 1137 - apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle; 1138 - }; 1139 - #endif 1140 - 1141 - /* STOP! Until an interrupt comes in. */ 1142 - static void lguest_safe_halt(void) 1143 - { 1144 - hcall(LHCALL_HALT, 0, 0, 0, 0); 1145 - } 1146 - 1147 - /* 1148 - * The SHUTDOWN hypercall takes a string to describe what's happening, and 1149 - * an argument which says whether this to restart (reboot) the Guest or not. 1150 - * 1151 - * Note that the Host always prefers that the Guest speak in physical addresses 1152 - * rather than virtual addresses, so we use __pa() here. 1153 - */ 1154 - static void lguest_power_off(void) 1155 - { 1156 - hcall(LHCALL_SHUTDOWN, __pa("Power down"), 1157 - LGUEST_SHUTDOWN_POWEROFF, 0, 0); 1158 - } 1159 - 1160 - /* 1161 - * Panicing. 1162 - * 1163 - * Don't. But if you did, this is what happens. 
1164 - */ 1165 - static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) 1166 - { 1167 - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); 1168 - /* The hcall won't return, but to keep gcc happy, we're "done". */ 1169 - return NOTIFY_DONE; 1170 - } 1171 - 1172 - static struct notifier_block paniced = { 1173 - .notifier_call = lguest_panic 1174 - }; 1175 - 1176 - /* Setting up memory is fairly easy. */ 1177 - static __init char *lguest_memory_setup(void) 1178 - { 1179 - /* 1180 - * The Linux bootloader header contains an "e820" memory map: the 1181 - * Launcher populated the first entry with our memory limit. 1182 - */ 1183 - e820__range_add(boot_params.e820_table[0].addr, 1184 - boot_params.e820_table[0].size, 1185 - boot_params.e820_table[0].type); 1186 - 1187 - /* This string is for the boot messages. */ 1188 - return "LGUEST"; 1189 - } 1190 - 1191 - /* Offset within PCI config space of BAR access capability. */ 1192 - static int console_cfg_offset = 0; 1193 - static int console_access_cap; 1194 - 1195 - /* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ 1196 - static void set_cfg_window(u32 cfg_offset, u32 off) 1197 - { 1198 - write_pci_config_byte(0, 1, 0, 1199 - cfg_offset + offsetof(struct virtio_pci_cap, bar), 1200 - 0); 1201 - write_pci_config(0, 1, 0, 1202 - cfg_offset + offsetof(struct virtio_pci_cap, length), 1203 - 4); 1204 - write_pci_config(0, 1, 0, 1205 - cfg_offset + offsetof(struct virtio_pci_cap, offset), 1206 - off); 1207 - } 1208 - 1209 - static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) 1210 - { 1211 - /* 1212 - * We could set this up once, then leave it; nothing else in the * 1213 - * kernel should touch these registers. But if it went wrong, that 1214 - * would be a horrible bug to find. 
1215 - */ 1216 - set_cfg_window(cfg_offset, off); 1217 - write_pci_config(0, 1, 0, 1218 - cfg_offset + sizeof(struct virtio_pci_cap), val); 1219 - } 1220 - 1221 - static void probe_pci_console(void) 1222 - { 1223 - u8 cap, common_cap = 0, device_cap = 0; 1224 - u32 device_len; 1225 - 1226 - /* Avoid recursive printk into here. */ 1227 - console_cfg_offset = -1; 1228 - 1229 - if (!early_pci_allowed()) { 1230 - printk(KERN_ERR "lguest: early PCI access not allowed!\n"); 1231 - return; 1232 - } 1233 - 1234 - /* We expect a console PCI device at BUS0, slot 1. */ 1235 - if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { 1236 - printk(KERN_ERR "lguest: PCI device is %#x!\n", 1237 - read_pci_config(0, 1, 0, 0)); 1238 - return; 1239 - } 1240 - 1241 - /* Find the capabilities we need (must be in bar0) */ 1242 - cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); 1243 - while (cap) { 1244 - u8 vndr = read_pci_config_byte(0, 1, 0, cap); 1245 - if (vndr == PCI_CAP_ID_VNDR) { 1246 - u8 type, bar; 1247 - 1248 - type = read_pci_config_byte(0, 1, 0, 1249 - cap + offsetof(struct virtio_pci_cap, cfg_type)); 1250 - bar = read_pci_config_byte(0, 1, 0, 1251 - cap + offsetof(struct virtio_pci_cap, bar)); 1252 - 1253 - switch (type) { 1254 - case VIRTIO_PCI_CAP_DEVICE_CFG: 1255 - if (bar == 0) 1256 - device_cap = cap; 1257 - break; 1258 - case VIRTIO_PCI_CAP_PCI_CFG: 1259 - console_access_cap = cap; 1260 - break; 1261 - } 1262 - } 1263 - cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); 1264 - } 1265 - if (!device_cap || !console_access_cap) { 1266 - printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", 1267 - common_cap, device_cap, console_access_cap); 1268 - return; 1269 - } 1270 - 1271 - /* 1272 - * Note that we can't check features, until we've set the DRIVER 1273 - * status bit. We don't want to do that until we have a real driver, 1274 - * so we just check that the device-specific config has room for 1275 - * emerg_wr. 
If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE 1276 - * it should ignore the access. 1277 - */ 1278 - device_len = read_pci_config(0, 1, 0, 1279 - device_cap + offsetof(struct virtio_pci_cap, length)); 1280 - if (device_len < (offsetof(struct virtio_console_config, emerg_wr) 1281 - + sizeof(u32))) { 1282 - printk(KERN_ERR "lguest: console missing emerg_wr field\n"); 1283 - return; 1284 - } 1285 - 1286 - console_cfg_offset = read_pci_config(0, 1, 0, 1287 - device_cap + offsetof(struct virtio_pci_cap, offset)); 1288 - printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); 1289 - } 1290 - 1291 - /* 1292 - * We will eventually use the virtio console device to produce console output, 1293 - * but before that is set up we use the virtio PCI console's backdoor mmio 1294 - * access and the "emergency" write facility (which is legal even before the 1295 - * device is configured). 1296 - */ 1297 - static __init int early_put_chars(u32 vtermno, const char *buf, int count) 1298 - { 1299 - /* If we couldn't find PCI console, forget it. */ 1300 - if (console_cfg_offset < 0) 1301 - return count; 1302 - 1303 - if (unlikely(!console_cfg_offset)) { 1304 - probe_pci_console(); 1305 - if (console_cfg_offset < 0) 1306 - return count; 1307 - } 1308 - 1309 - write_bar_via_cfg(console_access_cap, 1310 - console_cfg_offset 1311 - + offsetof(struct virtio_console_config, emerg_wr), 1312 - buf[0]); 1313 - return 1; 1314 - } 1315 - 1316 - /* 1317 - * Rebooting also tells the Host we're finished, but the RESTART flag tells the 1318 - * Launcher to reboot us. 1319 - */ 1320 - static void lguest_restart(char *reason) 1321 - { 1322 - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); 1323 - } 1324 - 1325 - /*G:050 1326 - * Patching (Powerfully Placating Performance Pedants) 1327 - * 1328 - * We have already seen that pv_ops structures let us replace simple native 1329 - * instructions with calls to the appropriate back end all throughout the 1330 - * kernel. 
This allows the same kernel to run as a Guest and as a native 1331 - * kernel, but it's slow because of all the indirect branches. 1332 - * 1333 - * Remember that David Wheeler quote about "Any problem in computer science can 1334 - * be solved with another layer of indirection"? The rest of that quote is 1335 - * "... But that usually will create another problem." This is the first of 1336 - * those problems. 1337 - * 1338 - * Our current solution is to allow the paravirt back end to optionally patch 1339 - * over the indirect calls to replace them with something more efficient. We 1340 - * patch two of the simplest of the most commonly called functions: disable 1341 - * interrupts and save interrupts. We usually have 6 or 10 bytes to patch 1342 - * into: the Guest versions of these operations are small enough that we can 1343 - * fit comfortably. 1344 - * 1345 - * First we need assembly templates of each of the patchable Guest operations, 1346 - * and these are in head_32.S. 1347 - */ 1348 - 1349 - /*G:060 We construct a table from the assembler templates: */ 1350 - static const struct lguest_insns 1351 - { 1352 - const char *start, *end; 1353 - } lguest_insns[] = { 1354 - [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, 1355 - [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1356 - }; 1357 - 1358 - /* 1359 - * Now our patch routine is fairly simple (based on the native one in 1360 - * paravirt.c). If we have a replacement, we copy it in and return how much of 1361 - * the available space we used. 
1362 - */ 1363 - static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, 1364 - unsigned long addr, unsigned len) 1365 - { 1366 - unsigned int insn_len; 1367 - 1368 - /* Don't do anything special if we don't have a replacement */ 1369 - if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) 1370 - return paravirt_patch_default(type, clobber, ibuf, addr, len); 1371 - 1372 - insn_len = lguest_insns[type].end - lguest_insns[type].start; 1373 - 1374 - /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ 1375 - if (len < insn_len) 1376 - return paravirt_patch_default(type, clobber, ibuf, addr, len); 1377 - 1378 - /* Copy in our instructions. */ 1379 - memcpy(ibuf, lguest_insns[type].start, insn_len); 1380 - return insn_len; 1381 - } 1382 - 1383 - /*G:029 1384 - * Once we get to lguest_init(), we know we're a Guest. The various 1385 - * pv_ops structures in the kernel provide points for (almost) every routine we 1386 - * have to override to avoid privileged instructions. 1387 - */ 1388 - __init void lguest_init(void) 1389 - { 1390 - /* We're under lguest. */ 1391 - pv_info.name = "lguest"; 1392 - /* We're running at privilege level 1, not 0 as normal. */ 1393 - pv_info.kernel_rpl = 1; 1394 - /* Everyone except Xen runs with this set. */ 1395 - pv_info.shared_kernel_pmd = 1; 1396 - 1397 - /* 1398 - * We set up all the lguest overrides for sensitive operations. These 1399 - * are detailed with the operations themselves. 
1400 - */ 1401 - 1402 - /* Interrupt-related operations */ 1403 - pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl); 1404 - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); 1405 - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable); 1406 - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); 1407 - pv_irq_ops.safe_halt = lguest_safe_halt; 1408 - 1409 - /* Setup operations */ 1410 - pv_init_ops.patch = lguest_patch; 1411 - 1412 - /* Intercepts of various CPU instructions */ 1413 - pv_cpu_ops.load_gdt = lguest_load_gdt; 1414 - pv_cpu_ops.cpuid = lguest_cpuid; 1415 - pv_cpu_ops.load_idt = lguest_load_idt; 1416 - pv_cpu_ops.iret = lguest_iret; 1417 - pv_cpu_ops.load_sp0 = lguest_load_sp0; 1418 - pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; 1419 - pv_cpu_ops.set_ldt = lguest_set_ldt; 1420 - pv_cpu_ops.load_tls = lguest_load_tls; 1421 - pv_cpu_ops.get_debugreg = lguest_get_debugreg; 1422 - pv_cpu_ops.set_debugreg = lguest_set_debugreg; 1423 - pv_cpu_ops.read_cr0 = lguest_read_cr0; 1424 - pv_cpu_ops.write_cr0 = lguest_write_cr0; 1425 - pv_cpu_ops.read_cr4 = lguest_read_cr4; 1426 - pv_cpu_ops.write_cr4 = lguest_write_cr4; 1427 - pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; 1428 - pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; 1429 - pv_cpu_ops.wbinvd = lguest_wbinvd; 1430 - pv_cpu_ops.start_context_switch = paravirt_start_context_switch; 1431 - pv_cpu_ops.end_context_switch = lguest_end_context_switch; 1432 - 1433 - /* Pagetable management */ 1434 - pv_mmu_ops.write_cr3 = lguest_write_cr3; 1435 - pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; 1436 - pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; 1437 - pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; 1438 - pv_mmu_ops.set_pte = lguest_set_pte; 1439 - pv_mmu_ops.set_pte_at = lguest_set_pte_at; 1440 - pv_mmu_ops.set_pmd = lguest_set_pmd; 1441 - #ifdef CONFIG_X86_PAE 1442 - pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; 1443 - pv_mmu_ops.pte_clear = 
lguest_pte_clear; 1444 - pv_mmu_ops.pmd_clear = lguest_pmd_clear; 1445 - pv_mmu_ops.set_pud = lguest_set_pud; 1446 - #endif 1447 - pv_mmu_ops.read_cr2 = lguest_read_cr2; 1448 - pv_mmu_ops.read_cr3 = lguest_read_cr3; 1449 - pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; 1450 - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; 1451 - pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; 1452 - pv_mmu_ops.pte_update = lguest_pte_update; 1453 - 1454 - #ifdef CONFIG_X86_LOCAL_APIC 1455 - /* APIC read/write intercepts */ 1456 - set_lguest_basic_apic_ops(); 1457 - #endif 1458 - 1459 - x86_init.resources.memory_setup = lguest_memory_setup; 1460 - x86_init.irqs.intr_init = lguest_init_IRQ; 1461 - x86_init.timers.timer_init = lguest_time_init; 1462 - x86_platform.calibrate_tsc = lguest_tsc_khz; 1463 - x86_platform.get_wallclock = lguest_get_wallclock; 1464 - 1465 - /* 1466 - * Now is a good time to look at the implementations of these functions 1467 - * before returning to the rest of lguest_init(). 1468 - */ 1469 - 1470 - /*G:070 1471 - * Now we've seen all the paravirt_ops, we return to 1472 - * lguest_init() where the rest of the fairly chaotic boot setup 1473 - * occurs. 1474 - */ 1475 - 1476 - /* 1477 - * The stack protector is a weird thing where gcc places a canary 1478 - * value on the stack and then checks it on return. This file is 1479 - * compiled with -fno-stack-protector it, so we got this far without 1480 - * problems. The value of the canary is kept at offset 20 from the 1481 - * %gs register, so we need to set that up before calling C functions 1482 - * in other files. 1483 - */ 1484 - setup_stack_canary_segment(0); 1485 - 1486 - /* 1487 - * We could just call load_stack_canary_segment(), but we might as well 1488 - * call switch_to_new_gdt() which loads the whole table and sets up the 1489 - * per-cpu segment descriptor register %fs as well. 
1490 - */ 1491 - switch_to_new_gdt(0); 1492 - 1493 - /* 1494 - * The Host<->Guest Switcher lives at the top of our address space, and 1495 - * the Host told us how big it is when we made LGUEST_INIT hypercall: 1496 - * it put the answer in lguest_data.reserve_mem 1497 - */ 1498 - reserve_top_address(lguest_data.reserve_mem); 1499 - 1500 - /* Hook in our special panic hypercall code. */ 1501 - atomic_notifier_chain_register(&panic_notifier_list, &paniced); 1502 - 1503 - /* 1504 - * This is messy CPU setup stuff which the native boot code does before 1505 - * start_kernel, so we have to do, too: 1506 - */ 1507 - cpu_detect(&new_cpu_data); 1508 - /* head.S usually sets up the first capability word, so do it here. */ 1509 - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); 1510 - 1511 - /* Math is always hard! */ 1512 - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); 1513 - 1514 - /* We don't have features. We have puppies! Puppies! */ 1515 - #ifdef CONFIG_X86_MCE 1516 - mca_cfg.disabled = true; 1517 - #endif 1518 - #ifdef CONFIG_ACPI 1519 - acpi_disabled = 1; 1520 - #endif 1521 - 1522 - /* 1523 - * We set the preferred console to "hvc". This is the "hypervisor 1524 - * virtual console" driver written by the PowerPC people, which we also 1525 - * adapted for lguest's use. 1526 - */ 1527 - add_preferred_console("hvc", 0, NULL); 1528 - 1529 - /* Register our very early console. */ 1530 - virtio_cons_early_init(early_put_chars); 1531 - 1532 - /* Don't let ACPI try to control our PCI interrupts. */ 1533 - disable_acpi(); 1534 - 1535 - /* We control them ourselves, by overriding these two hooks. */ 1536 - pcibios_enable_irq = lguest_enable_irq; 1537 - pcibios_disable_irq = lguest_disable_irq; 1538 - 1539 - /* 1540 - * Last of all, we set the power management poweroff hook to point to 1541 - * the Guest routine to power off, and the reboot hook to our restart 1542 - * routine. 
1543 - */ 1544 - pm_power_off = lguest_power_off; 1545 - machine_ops.restart = lguest_restart; 1546 - 1547 - /* 1548 - * Now we're set up, call i386_start_kernel() in head32.c and we proceed 1549 - * to boot as normal. It never returns. 1550 - */ 1551 - i386_start_kernel(); 1552 - } 1553 - /* 1554 - * This marks the end of stage II of our journey, The Guest. 1555 - * 1556 - * It is now time for us to explore the layer of virtual drivers and complete 1557 - * our understanding of the Guest in "make Drivers". 1558 - */

-192

arch/x86/lguest/head_32.S

··· 1 - #include <linux/linkage.h> 2 - #include <linux/lguest.h> 3 - #include <asm/lguest_hcall.h> 4 - #include <asm/asm-offsets.h> 5 - #include <asm/thread_info.h> 6 - #include <asm/processor-flags.h> 7 - 8 - /*G:020 9 - 10 - * Our story starts with the bzImage: booting starts at startup_32 in 11 - * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real 12 - * kernel in place and then jumps into it: startup_32 in 13 - * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi 14 - * register, which is created by the bootloader (the Launcher in our case). 15 - * 16 - * The startup_32 function does very little: it clears the uninitialized global 17 - * C variables which we expect to be zero (ie. BSS) and then copies the boot 18 - * header and kernel command line somewhere safe, and populates some initial 19 - * page tables. Finally it checks the 'hardware_subarch' field. This was 20 - * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's 21 - * assigned number), then it calls us here. 22 - * 23 - * WARNING: be very careful here! We're running at addresses equal to physical 24 - * addresses (around 0), not above PAGE_OFFSET as most code expects 25 - * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any 26 - * data without remembering to subtract __PAGE_OFFSET! 27 - * 28 - * The .section line puts this code in .init.text so it will be discarded after 29 - * boot. 30 - */ 31 - .section .init.text, "ax", @progbits 32 - ENTRY(lguest_entry) 33 - /* 34 - * We make the "initialization" hypercall now to tell the Host where 35 - * our lguest_data struct is. 36 - */ 37 - movl $LHCALL_LGUEST_INIT, %eax 38 - movl $lguest_data - __PAGE_OFFSET, %ebx 39 - int $LGUEST_TRAP_ENTRY 40 - 41 - /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. 
*/ 42 - movl $LHCALL_NEW_PGTABLE, %eax 43 - movl $(initial_page_table - __PAGE_OFFSET), %ebx 44 - int $LGUEST_TRAP_ENTRY 45 - 46 - /* Set up the initial stack so we can run C code. */ 47 - movl $(init_thread_union+THREAD_SIZE),%esp 48 - 49 - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ 50 - jmp lguest_init+__PAGE_OFFSET 51 - 52 - /*G:055 53 - * We create a macro which puts the assembler code between lgstart_ and lgend_ 54 - * markers. These templates are put in the .text section: they can't be 55 - * discarded after boot as we may need to patch modules, too. 56 - */ 57 - .text 58 - #define LGUEST_PATCH(name, insns...) \ 59 - lgstart_##name: insns; lgend_##name:; \ 60 - .globl lgstart_##name; .globl lgend_##name 61 - 62 - LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 63 - LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 64 - 65 - /*G:033 66 - * But using those wrappers is inefficient (we'll see why that doesn't matter 67 - * for save_fl and irq_disable later). If we write our routines carefully in 68 - * assembler, we can avoid clobbering any registers and avoid jumping through 69 - * the wrapper functions. 70 - * 71 - * I skipped over our first piece of assembler, but this one is worth studying 72 - * in a bit more detail so I'll describe in easy stages. First, the routine to 73 - * enable interrupts: 74 - */ 75 - ENTRY(lg_irq_enable) 76 - /* 77 - * The reverse of irq_disable, this sets lguest_data.irq_enabled to 78 - * X86_EFLAGS_IF (ie. "Interrupts enabled"). 79 - */ 80 - movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled 81 - /* 82 - * But now we need to check if the Host wants to know: there might have 83 - * been interrupts waiting to be delivered, in which case it will have 84 - * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we 85 - * jump to send_interrupts, otherwise we're done. 
86 - */ 87 - cmpl $0, lguest_data+LGUEST_DATA_irq_pending 88 - jnz send_interrupts 89 - /* 90 - * One cool thing about x86 is that you can do many things without using 91 - * a register. In this case, the normal path hasn't needed to save or 92 - * restore any registers at all! 93 - */ 94 - ret 95 - send_interrupts: 96 - /* 97 - * OK, now we need a register: eax is used for the hypercall number, 98 - * which is LHCALL_SEND_INTERRUPTS. 99 - * 100 - * We used not to bother with this pending detection at all, which was 101 - * much simpler. Sooner or later the Host would realize it had to 102 - * send us an interrupt. But that turns out to make performance 7 103 - * times worse on a simple tcp benchmark. So now we do this the hard 104 - * way. 105 - */ 106 - pushl %eax 107 - movl $LHCALL_SEND_INTERRUPTS, %eax 108 - /* This is the actual hypercall trap. */ 109 - int $LGUEST_TRAP_ENTRY 110 - /* Put eax back the way we found it. */ 111 - popl %eax 112 - ret 113 - 114 - /* 115 - * Finally, the "popf" or "restore flags" routine. The %eax register holds the 116 - * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're 117 - * enabling interrupts again, if it's 0 we're leaving them off. 118 - */ 119 - ENTRY(lg_restore_fl) 120 - /* This is just "lguest_data.irq_enabled = flags;" */ 121 - movl %eax, lguest_data+LGUEST_DATA_irq_enabled 122 - /* 123 - * Now, if the %eax value has enabled interrupts and 124 - * lguest_data.irq_pending is set, we want to tell the Host so it can 125 - * deliver any outstanding interrupts. Fortunately, both values will 126 - * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" 127 - * instruction will AND them together for us. If both are set, we 128 - * jump to send_interrupts. 129 - */ 130 - testl lguest_data+LGUEST_DATA_irq_pending, %eax 131 - jnz send_interrupts 132 - /* Again, the normal path has used no extra registers. Clever, huh? 
*/ 133 - ret 134 - /*:*/ 135 - 136 - /* These demark the EIP where host should never deliver interrupts. */ 137 - .global lguest_noirq_iret 138 - 139 - /*M:004 140 - * When the Host reflects a trap or injects an interrupt into the Guest, it 141 - * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, 142 - * so the Guest iret logic does the right thing when restoring it. However, 143 - * when the Host sets the Guest up for direct traps, such as system calls, the 144 - * processor is the one to push eflags onto the stack, and the interrupt bit 145 - * will be 1 (in reality, interrupts are always enabled in the Guest). 146 - * 147 - * This turns out to be harmless: the only trap which should happen under Linux 148 - * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc 149 - * regions), which has to be reflected through the Host anyway. If another 150 - * trap *does* go off when interrupts are disabled, the Guest will panic, and 151 - * we'll never get to this iret! 152 - :*/ 153 - 154 - /*G:045 155 - * There is one final paravirt_op that the Guest implements, and glancing at it 156 - * you can see why I left it to last. It's *cool*! It's in *assembler*! 157 - * 158 - * The "iret" instruction is used to return from an interrupt or trap. The 159 - * stack looks like this: 160 - * old address 161 - * old code segment & privilege level 162 - * old processor flags ("eflags") 163 - * 164 - * The "iret" instruction pops those values off the stack and restores them all 165 - * at once. The only problem is that eflags includes the Interrupt Flag which 166 - * the Guest can't change: the CPU will simply ignore it when we do an "iret". 167 - * So we have to copy eflags from the stack to lguest_data.irq_enabled before 168 - * we do the "iret". 169 - * 170 - * There are two problems with this: firstly, we can't clobber any registers 171 - * and secondly, the whole thing needs to be atomic. 
The first problem 172 - * is solved by using "push memory"/"pop memory" instruction pair for copying. 173 - * 174 - * The second is harder: copying eflags to lguest_data.irq_enabled will turn 175 - * interrupts on before we're finished, so we could be interrupted before we 176 - * return to userspace or wherever. Our solution to this is to tell the 177 - * Host that it is *never* to interrupt us there, even if interrupts seem to be 178 - * enabled. (It's not necessary to protect pop instruction, since 179 - * data gets updated only after it completes, so we only need to protect 180 - * one instruction, iret). 181 - */ 182 - ENTRY(lguest_iret) 183 - pushl 2*4(%esp) 184 - /* 185 - * Note the %ss: segment prefix here. Normal data accesses use the 186 - * "ds" segment, but that will have already been restored for whatever 187 - * we're returning to (such as userspace): we can't trust it. The %ss: 188 - * prefix makes sure we use the stack segment, which is still valid. 189 - */ 190 - popl %ss:lguest_data+LGUEST_DATA_irq_enabled 191 - lguest_noirq_iret: 192 - iret

-1

drivers/Makefile

··· 125 125 obj-$(CONFIG_ISDN) += isdn/ 126 126 obj-$(CONFIG_EDAC) += edac/ 127 127 obj-$(CONFIG_EISA) += eisa/ 128 - obj-y += lguest/ 129 128 obj-$(CONFIG_CPU_FREQ) += cpufreq/ 130 129 obj-$(CONFIG_CPU_IDLE) += cpuidle/ 131 130 obj-y += mmc/

+1 -1

drivers/block/Kconfig

··· 470 470 depends on VIRTIO 471 471 ---help--- 472 472 This is the virtual block driver for virtio. It can be used with 473 - lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. 473 + QEMU based VMMs (like KVM or Xen). Say Y or M. 474 474 475 475 config VIRTIO_BLK_SCSI 476 476 bool "SCSI passthrough request for the Virtio block driver"

+1 -1

drivers/char/Kconfig

··· 161 161 depends on VIRTIO && TTY 162 162 select HVC_DRIVER 163 163 help 164 - Virtio console for use with lguest and other hypervisors. 164 + Virtio console for use with hypervisors. 165 165 166 166 Also serves as a general-purpose serial device for data 167 167 transfer between the guest and host. Character devices at

+1 -1

drivers/char/virtio_console.c

··· 1130 1130 * We turn the characters into a scatter-gather list, add it to the 1131 1131 * output queue and then kick the Host. Then we sit here waiting for 1132 1132 * it to finish: inefficient in theory, but in practice 1133 - * implementations will do it immediately (lguest's Launcher does). 1133 + * implementations will do it immediately. 1134 1134 */ 1135 1135 static int put_chars(u32 vtermno, const char *buf, int count) 1136 1136 {

-13

drivers/lguest/Kconfig

··· 1 - config LGUEST 2 - tristate "Linux hypervisor example code" 3 - depends on X86_32 && EVENTFD && TTY && PCI_DIRECT 4 - select HVC_DRIVER 5 - ---help--- 6 - This is a very simple module which allows you to run 7 - multiple instances of the same Linux kernel, using the 8 - "lguest" command found in the tools/lguest directory. 9 - 10 - Note that "lguest" is pronounced to rhyme with "fell quest", 11 - not "rustyvisor". See tools/lguest/lguest.txt. 12 - 13 - If unsure, say N. If curious, say M. If masochistic, say Y.

-26

drivers/lguest/Makefile

··· 1 - # Host requires the other files, which can be a module. 2 - obj-$(CONFIG_LGUEST) += lg.o 3 - lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \ 4 - segments.o lguest_user.o 5 - 6 - lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o 7 - 8 - Preparation Preparation!: PREFIX=P 9 - Guest: PREFIX=G 10 - Drivers: PREFIX=D 11 - Launcher: PREFIX=L 12 - Host: PREFIX=H 13 - Switcher: PREFIX=S 14 - Mastery: PREFIX=M 15 - Beer: 16 - @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}" 17 - Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery: 18 - @sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` 19 - Puppy: 20 - @clear 21 - @printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n" 22 - @sleep 2; clear; printf "\n\n Sit!\n\n"; sleep 1; clear 23 - @printf " __ \n ()'\`; \n /\\|\` \n / | \n(/_)_|_ \n" 24 - @sleep 2; clear; printf "\n\n Stand!\n\n"; sleep 1; clear 25 - @printf " __ \n ()'\`; \n /\\|\` \n /._.= \n /| / \n(_\_)_ \n" 26 - @sleep 2; clear; printf "\n\n Good puppy!\n\n"; sleep 1; clear

-47

drivers/lguest/README

··· 1 - Welcome, friend reader, to lguest. 2 - 3 - Lguest is an adventure, with you, the reader, as Hero. I can't think of many 4 - 5000-line projects which offer both such capability and glimpses of future 5 - potential; it is an exciting time to be delving into the source! 6 - 7 - But be warned; this is an arduous journey of several hours or more! And as we 8 - know, all true Heroes are driven by a Noble Goal. Thus I offer a Beer (or 9 - equivalent) to anyone I meet who has completed this documentation. 10 - 11 - So get comfortable and keep your wits about you (both quick and humorous). 12 - Along your way to the Noble Goal, you will also gain masterly insight into 13 - lguest, and hypervisors and x86 virtualization in general. 14 - 15 - Our Quest is in seven parts: (best read with C highlighting turned on) 16 - 17 - I) Preparation 18 - - In which our potential hero is flown quickly over the landscape for a 19 - taste of its scope. Suitable for the armchair coders and other such 20 - persons of faint constitution. 21 - 22 - II) Guest 23 - - Where we encounter the first tantalising wisps of code, and come to 24 - understand the details of the life of a Guest kernel. 25 - 26 - III) Drivers 27 - - Whereby the Guest finds its voice and becomes useful, and our 28 - understanding of the Guest is completed. 29 - 30 - IV) Launcher 31 - - Where we trace back to the creation of the Guest, and thus begin our 32 - understanding of the Host. 33 - 34 - V) Host 35 - - Where we master the Host code, through a long and tortuous journey. 36 - Indeed, it is here that our hero is tested in the Bit of Despair. 37 - 38 - VI) Switcher 39 - - Where our understanding of the intertwined nature of Guests and Hosts 40 - is completed. 41 - 42 - VII) Mastery 43 - - Where our fully fledged hero grapples with the Great Question: 44 - "What next?" 45 - 46 - make Preparation! 47 - Rusty Russell.

-398

drivers/lguest/core.c

··· 1 - /*P:400 2 - * This contains run_guest() which actually calls into the Host<->Guest 3 - * Switcher and analyzes the return, such as determining if the Guest wants the 4 - * Host to do something. This file also contains useful helper routines. 5 - :*/ 6 - #include <linux/module.h> 7 - #include <linux/stringify.h> 8 - #include <linux/stddef.h> 9 - #include <linux/io.h> 10 - #include <linux/mm.h> 11 - #include <linux/sched/signal.h> 12 - #include <linux/vmalloc.h> 13 - #include <linux/cpu.h> 14 - #include <linux/freezer.h> 15 - #include <linux/highmem.h> 16 - #include <linux/slab.h> 17 - #include <asm/paravirt.h> 18 - #include <asm/pgtable.h> 19 - #include <linux/uaccess.h> 20 - #include <asm/poll.h> 21 - #include <asm/asm-offsets.h> 22 - #include "lg.h" 23 - 24 - unsigned long switcher_addr; 25 - struct page **lg_switcher_pages; 26 - static struct vm_struct *switcher_text_vma; 27 - static struct vm_struct *switcher_stacks_vma; 28 - 29 - /* This One Big lock protects all inter-guest data structures. */ 30 - DEFINE_MUTEX(lguest_lock); 31 - 32 - /*H:010 33 - * We need to set up the Switcher at a high virtual address. Remember the 34 - * Switcher is a few hundred bytes of assembler code which actually changes the 35 - * CPU to run the Guest, and then changes back to the Host when a trap or 36 - * interrupt happens. 37 - * 38 - * The Switcher code must be at the same virtual address in the Guest as the 39 - * Host since it will be running as the switchover occurs. 40 - * 41 - * Trying to map memory at a particular address is an unusual thing to do, so 42 - * it's not a simple one-liner. 43 - */ 44 - static __init int map_switcher(void) 45 - { 46 - int i, err; 47 - 48 - /* 49 - * Map the Switcher in to high memory. 50 - * 51 - * It turns out that if we choose the address 0xFFC00000 (4MB under the 52 - * top virtual address), it makes setting up the page tables really 53 - * easy. 54 - */ 55 - 56 - /* We assume Switcher text fits into a single page. 
*/ 57 - if (end_switcher_text - start_switcher_text > PAGE_SIZE) { 58 - printk(KERN_ERR "lguest: switcher text too large (%zu)\n", 59 - end_switcher_text - start_switcher_text); 60 - return -EINVAL; 61 - } 62 - 63 - /* 64 - * We allocate an array of struct page pointers. map_vm_area() wants 65 - * this, rather than just an array of pages. 66 - */ 67 - lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0]) 68 - * TOTAL_SWITCHER_PAGES, 69 - GFP_KERNEL); 70 - if (!lg_switcher_pages) { 71 - err = -ENOMEM; 72 - goto out; 73 - } 74 - 75 - /* 76 - * Now we actually allocate the pages. The Guest will see these pages, 77 - * so we make sure they're zeroed. 78 - */ 79 - for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { 80 - lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); 81 - if (!lg_switcher_pages[i]) { 82 - err = -ENOMEM; 83 - goto free_some_pages; 84 - } 85 - } 86 - 87 - /* 88 - * Copy in the compiled-in Switcher code (from x86/switcher_32.S). 89 - * It goes in the first page, which we map in momentarily. 90 - */ 91 - memcpy(kmap(lg_switcher_pages[0]), start_switcher_text, 92 - end_switcher_text - start_switcher_text); 93 - kunmap(lg_switcher_pages[0]); 94 - 95 - /* 96 - * We place the Switcher underneath the fixmap area, which is the 97 - * highest virtual address we can get. This is important, since we 98 - * tell the Guest it can't access this memory, so we want its ceiling 99 - * as high as possible. 100 - */ 101 - switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE; 102 - 103 - /* 104 - * Now we reserve the "virtual memory area"s we want. We might 105 - * not get them in theory, but in practice it's worked so far. 106 - * 107 - * We want the switcher text to be read-only and executable, and 108 - * the stacks to be read-write and non-executable. 
109 - */ 110 - switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD, 111 - switcher_addr, 112 - switcher_addr + PAGE_SIZE); 113 - 114 - if (!switcher_text_vma) { 115 - err = -ENOMEM; 116 - printk("lguest: could not map switcher pages high\n"); 117 - goto free_pages; 118 - } 119 - 120 - switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE, 121 - VM_ALLOC|VM_NO_GUARD, 122 - switcher_addr + PAGE_SIZE, 123 - switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE); 124 - if (!switcher_stacks_vma) { 125 - err = -ENOMEM; 126 - printk("lguest: could not map switcher pages high\n"); 127 - goto free_text_vma; 128 - } 129 - 130 - /* 131 - * This code actually sets up the pages we've allocated to appear at 132 - * switcher_addr. map_vm_area() takes the vma we allocated above, the 133 - * kind of pages we're mapping (kernel text pages and kernel writable 134 - * pages respectively), and a pointer to our array of struct pages. 135 - */ 136 - err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages); 137 - if (err) { 138 - printk("lguest: text map_vm_area failed: %i\n", err); 139 - goto free_vmas; 140 - } 141 - 142 - err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL, 143 - lg_switcher_pages + SWITCHER_TEXT_PAGES); 144 - if (err) { 145 - printk("lguest: stacks map_vm_area failed: %i\n", err); 146 - goto free_vmas; 147 - } 148 - 149 - /* 150 - * Now the Switcher is mapped at the right address, we can't fail! 151 - */ 152 - printk(KERN_INFO "lguest: mapped switcher at %p\n", 153 - switcher_text_vma->addr); 154 - /* And we succeeded... 
*/ 155 - return 0; 156 - 157 - free_vmas: 158 - /* Undoes map_vm_area and __get_vm_area */ 159 - vunmap(switcher_stacks_vma->addr); 160 - free_text_vma: 161 - vunmap(switcher_text_vma->addr); 162 - free_pages: 163 - i = TOTAL_SWITCHER_PAGES; 164 - free_some_pages: 165 - for (--i; i >= 0; i--) 166 - __free_pages(lg_switcher_pages[i], 0); 167 - kfree(lg_switcher_pages); 168 - out: 169 - return err; 170 - } 171 - /*:*/ 172 - 173 - /* Cleaning up the mapping when the module is unloaded is almost... too easy. */ 174 - static void unmap_switcher(void) 175 - { 176 - unsigned int i; 177 - 178 - /* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */ 179 - vunmap(switcher_text_vma->addr); 180 - vunmap(switcher_stacks_vma->addr); 181 - /* Now we just need to free the pages we copied the switcher into */ 182 - for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) 183 - __free_pages(lg_switcher_pages[i], 0); 184 - kfree(lg_switcher_pages); 185 - } 186 - 187 - /*H:032 188 - * Dealing With Guest Memory. 189 - * 190 - * Before we go too much further into the Host, we need to grok the routines 191 - * we use to deal with Guest memory. 192 - * 193 - * When the Guest gives us (what it thinks is) a physical address, we can use 194 - * the normal copy_from_user() & copy_to_user() on the corresponding place in 195 - * the memory region allocated by the Launcher. 196 - * 197 - * But we can't trust the Guest: it might be trying to access the Launcher 198 - * code. We have to check that the range is below the pfn_limit the Launcher 199 - * gave us. We have to make sure that addr + len doesn't give us a false 200 - * positive by overflowing, too. 201 - */ 202 - bool lguest_address_ok(const struct lguest *lg, 203 - unsigned long addr, unsigned long len) 204 - { 205 - return addr+len <= lg->pfn_limit * PAGE_SIZE && (addr+len >= addr); 206 - } 207 - 208 - /* 209 - * This routine copies memory from the Guest. 
Here we can see how useful the 210 - * kill_lguest() routine we met in the Launcher can be: we return a random 211 - * value (all zeroes) instead of needing to return an error. 212 - */ 213 - void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) 214 - { 215 - if (!lguest_address_ok(cpu->lg, addr, bytes) 216 - || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) { 217 - /* copy_from_user should do this, but as we rely on it... */ 218 - memset(b, 0, bytes); 219 - kill_guest(cpu, "bad read address %#lx len %u", addr, bytes); 220 - } 221 - } 222 - 223 - /* This is the write (copy into Guest) version. */ 224 - void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, 225 - unsigned bytes) 226 - { 227 - if (!lguest_address_ok(cpu->lg, addr, bytes) 228 - || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0) 229 - kill_guest(cpu, "bad write address %#lx len %u", addr, bytes); 230 - } 231 - /*:*/ 232 - 233 - /*H:030 234 - * Let's jump straight to the the main loop which runs the Guest. 235 - * Remember, this is called by the Launcher reading /dev/lguest, and we keep 236 - * going around and around until something interesting happens. 237 - */ 238 - int run_guest(struct lg_cpu *cpu, unsigned long __user *user) 239 - { 240 - /* If the launcher asked for a register with LHREQ_GETREG */ 241 - if (cpu->reg_read) { 242 - if (put_user(*cpu->reg_read, user)) 243 - return -EFAULT; 244 - cpu->reg_read = NULL; 245 - return sizeof(*cpu->reg_read); 246 - } 247 - 248 - /* We stop running once the Guest is dead. */ 249 - while (!cpu->lg->dead) { 250 - unsigned int irq; 251 - bool more; 252 - 253 - /* First we run any hypercalls the Guest wants done. */ 254 - if (cpu->hcall) 255 - do_hypercalls(cpu); 256 - 257 - /* Do we have to tell the Launcher about a trap? 
*/ 258 - if (cpu->pending.trap) { 259 - if (copy_to_user(user, &cpu->pending, 260 - sizeof(cpu->pending))) 261 - return -EFAULT; 262 - return sizeof(cpu->pending); 263 - } 264 - 265 - /* 266 - * All long-lived kernel loops need to check with this horrible 267 - * thing called the freezer. If the Host is trying to suspend, 268 - * it stops us. 269 - */ 270 - try_to_freeze(); 271 - 272 - /* Check for signals */ 273 - if (signal_pending(current)) 274 - return -ERESTARTSYS; 275 - 276 - /* 277 - * Check if there are any interrupts which can be delivered now: 278 - * if so, this sets up the hander to be executed when we next 279 - * run the Guest. 280 - */ 281 - irq = interrupt_pending(cpu, &more); 282 - if (irq < LGUEST_IRQS) 283 - try_deliver_interrupt(cpu, irq, more); 284 - 285 - /* 286 - * Just make absolutely sure the Guest is still alive. One of 287 - * those hypercalls could have been fatal, for example. 288 - */ 289 - if (cpu->lg->dead) 290 - break; 291 - 292 - /* 293 - * If the Guest asked to be stopped, we sleep. The Guest's 294 - * clock timer will wake us. 295 - */ 296 - if (cpu->halted) { 297 - set_current_state(TASK_INTERRUPTIBLE); 298 - /* 299 - * Just before we sleep, make sure no interrupt snuck in 300 - * which we should be doing. 301 - */ 302 - if (interrupt_pending(cpu, &more) < LGUEST_IRQS) 303 - set_current_state(TASK_RUNNING); 304 - else 305 - schedule(); 306 - continue; 307 - } 308 - 309 - /* 310 - * OK, now we're ready to jump into the Guest. First we put up 311 - * the "Do Not Disturb" sign: 312 - */ 313 - local_irq_disable(); 314 - 315 - /* Actually run the Guest until something happens. */ 316 - lguest_arch_run_guest(cpu); 317 - 318 - /* Now we're ready to be interrupted or moved to other CPUs */ 319 - local_irq_enable(); 320 - 321 - /* Now we deal with whatever happened to the Guest. */ 322 - lguest_arch_handle_trap(cpu); 323 - } 324 - 325 - /* Special case: Guest is 'dead' but wants a reboot. 
*/ 326 - if (cpu->lg->dead == ERR_PTR(-ERESTART)) 327 - return -ERESTART; 328 - 329 - /* The Guest is dead => "No such file or directory" */ 330 - return -ENOENT; 331 - } 332 - 333 - /*H:000 334 - * Welcome to the Host! 335 - * 336 - * By this point your brain has been tickled by the Guest code and numbed by 337 - * the Launcher code; prepare for it to be stretched by the Host code. This is 338 - * the heart. Let's begin at the initialization routine for the Host's lg 339 - * module. 340 - */ 341 - static int __init init(void) 342 - { 343 - int err; 344 - 345 - /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ 346 - if (get_kernel_rpl() != 0) { 347 - printk("lguest is afraid of being a guest\n"); 348 - return -EPERM; 349 - } 350 - 351 - /* First we put the Switcher up in very high virtual memory. */ 352 - err = map_switcher(); 353 - if (err) 354 - goto out; 355 - 356 - /* We might need to reserve an interrupt vector. */ 357 - err = init_interrupts(); 358 - if (err) 359 - goto unmap; 360 - 361 - /* /dev/lguest needs to be registered. */ 362 - err = lguest_device_init(); 363 - if (err) 364 - goto free_interrupts; 365 - 366 - /* Finally we do some architecture-specific setup. */ 367 - lguest_arch_host_init(); 368 - 369 - /* All good! */ 370 - return 0; 371 - 372 - free_interrupts: 373 - free_interrupts(); 374 - unmap: 375 - unmap_switcher(); 376 - out: 377 - return err; 378 - } 379 - 380 - /* Cleaning up is just the same code, backwards. With a little French. */ 381 - static void __exit fini(void) 382 - { 383 - lguest_device_remove(); 384 - free_interrupts(); 385 - unmap_switcher(); 386 - 387 - lguest_arch_host_fini(); 388 - } 389 - /*:*/ 390 - 391 - /* 392 - * The Host side of lguest can be a module. This is a nice way for people to 393 - * play with it. 394 - */ 395 - module_init(init); 396 - module_exit(fini); 397 - MODULE_LICENSE("GPL"); 398 - MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");

-304

drivers/lguest/hypercalls.c

··· 1 - /*P:500 2 - * Just as userspace programs request kernel operations through a system 3 - * call, the Guest requests Host operations through a "hypercall". You might 4 - * notice this nomenclature doesn't really follow any logic, but the name has 5 - * been around for long enough that we're stuck with it. As you'd expect, this 6 - * code is basically a one big switch statement. 7 - :*/ 8 - 9 - /* Copyright (C) 2006 Rusty Russell IBM Corporation 10 - 11 - This program is free software; you can redistribute it and/or modify 12 - it under the terms of the GNU General Public License as published by 13 - the Free Software Foundation; either version 2 of the License, or 14 - (at your option) any later version. 15 - 16 - This program is distributed in the hope that it will be useful, 17 - but WITHOUT ANY WARRANTY; without even the implied warranty of 18 - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 - GNU General Public License for more details. 20 - 21 - You should have received a copy of the GNU General Public License 22 - along with this program; if not, write to the Free Software 23 - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 24 - */ 25 - #include <linux/uaccess.h> 26 - #include <linux/syscalls.h> 27 - #include <linux/mm.h> 28 - #include <linux/ktime.h> 29 - #include <asm/page.h> 30 - #include <asm/pgtable.h> 31 - #include "lg.h" 32 - 33 - /*H:120 34 - * This is the core hypercall routine: where the Guest gets what it wants. 35 - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. 36 - */ 37 - static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) 38 - { 39 - switch (args->arg0) { 40 - case LHCALL_FLUSH_ASYNC: 41 - /* 42 - * This call does nothing, except by breaking out of the Guest 43 - * it makes us process all the asynchronous hypercalls. 
44 - */ 45 - break; 46 - case LHCALL_SEND_INTERRUPTS: 47 - /* 48 - * This call does nothing too, but by breaking out of the Guest 49 - * it makes us process any pending interrupts. 50 - */ 51 - break; 52 - case LHCALL_LGUEST_INIT: 53 - /* 54 - * You can't get here unless you're already initialized. Don't 55 - * do that. 56 - */ 57 - kill_guest(cpu, "already have lguest_data"); 58 - break; 59 - case LHCALL_SHUTDOWN: { 60 - char msg[128]; 61 - /* 62 - * Shutdown is such a trivial hypercall that we do it in five 63 - * lines right here. 64 - * 65 - * If the lgread fails, it will call kill_guest() itself; the 66 - * kill_guest() with the message will be ignored. 67 - */ 68 - __lgread(cpu, msg, args->arg1, sizeof(msg)); 69 - msg[sizeof(msg)-1] = '\0'; 70 - kill_guest(cpu, "CRASH: %s", msg); 71 - if (args->arg2 == LGUEST_SHUTDOWN_RESTART) 72 - cpu->lg->dead = ERR_PTR(-ERESTART); 73 - break; 74 - } 75 - case LHCALL_FLUSH_TLB: 76 - /* FLUSH_TLB comes in two flavors, depending on the argument: */ 77 - if (args->arg1) 78 - guest_pagetable_clear_all(cpu); 79 - else 80 - guest_pagetable_flush_user(cpu); 81 - break; 82 - 83 - /* 84 - * All these calls simply pass the arguments through to the right 85 - * routines. 
86 - */ 87 - case LHCALL_NEW_PGTABLE: 88 - guest_new_pagetable(cpu, args->arg1); 89 - break; 90 - case LHCALL_SET_STACK: 91 - guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); 92 - break; 93 - case LHCALL_SET_PTE: 94 - #ifdef CONFIG_X86_PAE 95 - guest_set_pte(cpu, args->arg1, args->arg2, 96 - __pte(args->arg3 | (u64)args->arg4 << 32)); 97 - #else 98 - guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); 99 - #endif 100 - break; 101 - case LHCALL_SET_PGD: 102 - guest_set_pgd(cpu->lg, args->arg1, args->arg2); 103 - break; 104 - #ifdef CONFIG_X86_PAE 105 - case LHCALL_SET_PMD: 106 - guest_set_pmd(cpu->lg, args->arg1, args->arg2); 107 - break; 108 - #endif 109 - case LHCALL_SET_CLOCKEVENT: 110 - guest_set_clockevent(cpu, args->arg1); 111 - break; 112 - case LHCALL_HALT: 113 - /* Similarly, this sets the halted flag for run_guest(). */ 114 - cpu->halted = 1; 115 - break; 116 - default: 117 - /* It should be an architecture-specific hypercall. */ 118 - if (lguest_arch_do_hcall(cpu, args)) 119 - kill_guest(cpu, "Bad hypercall %li\n", args->arg0); 120 - } 121 - } 122 - 123 - /*H:124 124 - * Asynchronous hypercalls are easy: we just look in the array in the 125 - * Guest's "struct lguest_data" to see if any new ones are marked "ready". 126 - * 127 - * We are careful to do these in order: obviously we respect the order the 128 - * Guest put them in the ring, but we also promise the Guest that they will 129 - * happen before any normal hypercall (which is why we check this before 130 - * checking for a normal hcall). 131 - */ 132 - static void do_async_hcalls(struct lg_cpu *cpu) 133 - { 134 - unsigned int i; 135 - u8 st[LHCALL_RING_SIZE]; 136 - 137 - /* For simplicity, we copy the entire call status array in at once. */ 138 - if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st))) 139 - return; 140 - 141 - /* We process "struct lguest_data"s hcalls[] ring once. 
*/ 142 - for (i = 0; i < ARRAY_SIZE(st); i++) { 143 - struct hcall_args args; 144 - /* 145 - * We remember where we were up to from last time. This makes 146 - * sure that the hypercalls are done in the order the Guest 147 - * places them in the ring. 148 - */ 149 - unsigned int n = cpu->next_hcall; 150 - 151 - /* 0xFF means there's no call here (yet). */ 152 - if (st[n] == 0xFF) 153 - break; 154 - 155 - /* 156 - * OK, we have hypercall. Increment the "next_hcall" cursor, 157 - * and wrap back to 0 if we reach the end. 158 - */ 159 - if (++cpu->next_hcall == LHCALL_RING_SIZE) 160 - cpu->next_hcall = 0; 161 - 162 - /* 163 - * Copy the hypercall arguments into a local copy of the 164 - * hcall_args struct. 165 - */ 166 - if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], 167 - sizeof(struct hcall_args))) { 168 - kill_guest(cpu, "Fetching async hypercalls"); 169 - break; 170 - } 171 - 172 - /* Do the hypercall, same as a normal one. */ 173 - do_hcall(cpu, &args); 174 - 175 - /* Mark the hypercall done. */ 176 - if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) { 177 - kill_guest(cpu, "Writing result for async hypercall"); 178 - break; 179 - } 180 - 181 - /* 182 - * Stop doing hypercalls if they want to notify the Launcher: 183 - * it needs to service this first. 184 - */ 185 - if (cpu->pending.trap) 186 - break; 187 - } 188 - } 189 - 190 - /* 191 - * Last of all, we look at what happens first of all. The very first time the 192 - * Guest makes a hypercall, we end up here to set things up: 193 - */ 194 - static void initialize(struct lg_cpu *cpu) 195 - { 196 - /* 197 - * You can't do anything until you're initialized. The Guest knows the 198 - * rules, so we're unforgiving here. 
199 - */ 200 - if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { 201 - kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); 202 - return; 203 - } 204 - 205 - if (lguest_arch_init_hypercalls(cpu)) 206 - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 207 - 208 - /* 209 - * The Guest tells us where we're not to deliver interrupts by putting 210 - * the instruction address into "struct lguest_data". 211 - */ 212 - if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret)) 213 - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 214 - 215 - /* 216 - * We write the current time into the Guest's data page once so it can 217 - * set its clock. 218 - */ 219 - write_timestamp(cpu); 220 - 221 - /* page_tables.c will also do some setup. */ 222 - page_table_guest_data_init(cpu); 223 - 224 - /* 225 - * This is the one case where the above accesses might have been the 226 - * first write to a Guest page. This may have caused a copy-on-write 227 - * fault, but the old page might be (read-only) in the Guest 228 - * pagetable. 229 - */ 230 - guest_pagetable_clear_all(cpu); 231 - } 232 - /*:*/ 233 - 234 - /*M:013 235 - * If a Guest reads from a page (so creates a mapping) that it has never 236 - * written to, and then the Launcher writes to it (ie. the output of a virtual 237 - * device), the Guest will still see the old page. In practice, this never 238 - * happens: why would the Guest read a page which it has never written to? But 239 - * a similar scenario might one day bite us, so it's worth mentioning. 240 - * 241 - * Note that if we used a shared anonymous mapping in the Launcher instead of 242 - * mapping /dev/zero private, we wouldn't worry about cop-on-write. And we 243 - * need that to switch the Launcher to processes (away from threads) anyway. 244 - :*/ 245 - 246 - /*H:100 247 - * Hypercalls 248 - * 249 - * Remember from the Guest, hypercalls come in two flavors: normal and 250 - * asynchronous. This file handles both of types. 
251 - */ 252 - void do_hypercalls(struct lg_cpu *cpu) 253 - { 254 - /* Not initialized yet? This hypercall must do it. */ 255 - if (unlikely(!cpu->lg->lguest_data)) { 256 - /* Set up the "struct lguest_data" */ 257 - initialize(cpu); 258 - /* Hcall is done. */ 259 - cpu->hcall = NULL; 260 - return; 261 - } 262 - 263 - /* 264 - * The Guest has initialized. 265 - * 266 - * Look in the hypercall ring for the async hypercalls: 267 - */ 268 - do_async_hcalls(cpu); 269 - 270 - /* 271 - * If we stopped reading the hypercall ring because the Guest did a 272 - * NOTIFY to the Launcher, we want to return now. Otherwise we do 273 - * the hypercall. 274 - */ 275 - if (!cpu->pending.trap) { 276 - do_hcall(cpu, cpu->hcall); 277 - /* 278 - * Tricky point: we reset the hcall pointer to mark the 279 - * hypercall as "done". We use the hcall pointer rather than 280 - * the trap number to indicate a hypercall is pending. 281 - * Normally it doesn't matter: the Guest will run again and 282 - * update the trap number before we come back here. 283 - * 284 - * However, if we are signalled or the Guest sends I/O to the 285 - * Launcher, the run_guest() loop will exit without running the 286 - * Guest. When it comes back it would try to re-run the 287 - * hypercall. Finding that bug sucked. 288 - */ 289 - cpu->hcall = NULL; 290 - } 291 - } 292 - 293 - /* 294 - * This routine supplies the Guest with time: it's used for wallclock time at 295 - * initial boot and as a rough time source if the TSC isn't available. 296 - */ 297 - void write_timestamp(struct lg_cpu *cpu) 298 - { 299 - struct timespec now; 300 - ktime_get_real_ts(&now); 301 - if (copy_to_user(&cpu->lg->lguest_data->time, 302 - &now, sizeof(struct timespec))) 303 - kill_guest(cpu, "Writing timestamp"); 304 - }

-706

drivers/lguest/interrupts_and_traps.c

··· 1 - /*P:800 2 - * Interrupts (traps) are complicated enough to earn their own file. 3 - * There are three classes of interrupts: 4 - * 5 - * 1) Real hardware interrupts which occur while we're running the Guest, 6 - * 2) Interrupts for virtual devices attached to the Guest, and 7 - * 3) Traps and faults from the Guest. 8 - * 9 - * Real hardware interrupts must be delivered to the Host, not the Guest. 10 - * Virtual interrupts must be delivered to the Guest, but we make them look 11 - * just like real hardware would deliver them. Traps from the Guest can be set 12 - * up to go directly back into the Guest, but sometimes the Host wants to see 13 - * them first, so we also have a way of "reflecting" them into the Guest as if 14 - * they had been delivered to it directly. 15 - :*/ 16 - #include <linux/uaccess.h> 17 - #include <linux/interrupt.h> 18 - #include <linux/module.h> 19 - #include <linux/sched.h> 20 - #include "lg.h" 21 - 22 - /* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */ 23 - static unsigned int syscall_vector = IA32_SYSCALL_VECTOR; 24 - module_param(syscall_vector, uint, 0444); 25 - 26 - /* The address of the interrupt handler is split into two bits: */ 27 - static unsigned long idt_address(u32 lo, u32 hi) 28 - { 29 - return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); 30 - } 31 - 32 - /* 33 - * The "type" of the interrupt handler is a 4 bit field: we only support a 34 - * couple of types. 35 - */ 36 - static int idt_type(u32 lo, u32 hi) 37 - { 38 - return (hi >> 8) & 0xF; 39 - } 40 - 41 - /* An IDT entry can't be used unless the "present" bit is set. */ 42 - static bool idt_present(u32 lo, u32 hi) 43 - { 44 - return (hi & 0x8000); 45 - } 46 - 47 - /* 48 - * We need a helper to "push" a value onto the Guest's stack, since that's a 49 - * big part of what delivering an interrupt does. 50 - */ 51 - static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) 52 - { 53 - /* Stack grows upwards: move stack then write value. 
*/ 54 - *gstack -= 4; 55 - lgwrite(cpu, *gstack, u32, val); 56 - } 57 - 58 - /*H:210 59 - * The push_guest_interrupt_stack() routine saves Guest state on the stack for 60 - * an interrupt or trap. The mechanics of delivering traps and interrupts to 61 - * the Guest are the same, except some traps have an "error code" which gets 62 - * pushed onto the stack as well: the caller tells us if this is one. 63 - * 64 - * We set up the stack just like the CPU does for a real interrupt, so it's 65 - * identical for the Guest (and the standard "iret" instruction will undo 66 - * it). 67 - */ 68 - static void push_guest_interrupt_stack(struct lg_cpu *cpu, bool has_err) 69 - { 70 - unsigned long gstack, origstack; 71 - u32 eflags, ss, irq_enable; 72 - unsigned long virtstack; 73 - 74 - /* 75 - * There are two cases for interrupts: one where the Guest is already 76 - * in the kernel, and a more complex one where the Guest is in 77 - * userspace. We check the privilege level to find out. 78 - */ 79 - if ((cpu->regs->ss&0x3) != GUEST_PL) { 80 - /* 81 - * The Guest told us their kernel stack with the SET_STACK 82 - * hypercall: both the virtual address and the segment. 83 - */ 84 - virtstack = cpu->esp1; 85 - ss = cpu->ss1; 86 - 87 - origstack = gstack = guest_pa(cpu, virtstack); 88 - /* 89 - * We push the old stack segment and pointer onto the new 90 - * stack: when the Guest does an "iret" back from the interrupt 91 - * handler the CPU will notice they're dropping privilege 92 - * levels and expect these here. 93 - */ 94 - push_guest_stack(cpu, &gstack, cpu->regs->ss); 95 - push_guest_stack(cpu, &gstack, cpu->regs->esp); 96 - } else { 97 - /* We're staying on the same Guest (kernel) stack. */ 98 - virtstack = cpu->regs->esp; 99 - ss = cpu->regs->ss; 100 - 101 - origstack = gstack = guest_pa(cpu, virtstack); 102 - } 103 - 104 - /* 105 - * Remember that we never let the Guest actually disable interrupts, so 106 - * the "Interrupt Flag" bit is always set. 
We copy that bit from the 107 - * Guest's "irq_enabled" field into the eflags word: we saw the Guest 108 - * copy it back in "lguest_iret". 109 - */ 110 - eflags = cpu->regs->eflags; 111 - if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 112 - && !(irq_enable & X86_EFLAGS_IF)) 113 - eflags &= ~X86_EFLAGS_IF; 114 - 115 - /* 116 - * An interrupt is expected to push three things on the stack: the old 117 - * "eflags" word, the old code segment, and the old instruction 118 - * pointer. 119 - */ 120 - push_guest_stack(cpu, &gstack, eflags); 121 - push_guest_stack(cpu, &gstack, cpu->regs->cs); 122 - push_guest_stack(cpu, &gstack, cpu->regs->eip); 123 - 124 - /* For the six traps which supply an error code, we push that, too. */ 125 - if (has_err) 126 - push_guest_stack(cpu, &gstack, cpu->regs->errcode); 127 - 128 - /* Adjust the stack pointer and stack segment. */ 129 - cpu->regs->ss = ss; 130 - cpu->regs->esp = virtstack + (gstack - origstack); 131 - } 132 - 133 - /* 134 - * This actually makes the Guest start executing the given interrupt/trap 135 - * handler. 136 - * 137 - * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this 138 - * interrupt or trap. It's split into two parts for traditional reasons: gcc 139 - * on i386 used to be frightened by 64 bit numbers. 140 - */ 141 - static void guest_run_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi) 142 - { 143 - /* If we're already in the kernel, we don't change stacks. */ 144 - if ((cpu->regs->ss&0x3) != GUEST_PL) 145 - cpu->regs->ss = cpu->esp1; 146 - 147 - /* 148 - * Set the code segment and the address to execute. 149 - */ 150 - cpu->regs->cs = (__KERNEL_CS|GUEST_PL); 151 - cpu->regs->eip = idt_address(lo, hi); 152 - 153 - /* 154 - * Trapping always clears these flags: 155 - * TF: Trap flag 156 - * VM: Virtual 8086 mode 157 - * RF: Resume 158 - * NT: Nested task. 
159 - */ 160 - cpu->regs->eflags &= 161 - ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT); 162 - 163 - /* 164 - * There are two kinds of interrupt handlers: 0xE is an "interrupt 165 - * gate" which expects interrupts to be disabled on entry. 166 - */ 167 - if (idt_type(lo, hi) == 0xE) 168 - if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) 169 - kill_guest(cpu, "Disabling interrupts"); 170 - } 171 - 172 - /* This restores the eflags word which was pushed on the stack by a trap */ 173 - static void restore_eflags(struct lg_cpu *cpu) 174 - { 175 - /* This is the physical address of the stack. */ 176 - unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp); 177 - 178 - /* 179 - * Stack looks like this: 180 - * Address Contents 181 - * esp EIP 182 - * esp + 4 CS 183 - * esp + 8 EFLAGS 184 - */ 185 - cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32); 186 - cpu->regs->eflags &= 187 - ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT); 188 - } 189 - 190 - /*H:205 191 - * Virtual Interrupts. 192 - * 193 - * interrupt_pending() returns the first pending interrupt which isn't blocked 194 - * by the Guest. It is called before every entry to the Guest, and just before 195 - * we go to sleep when the Guest has halted itself. 196 - */ 197 - unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) 198 - { 199 - unsigned int irq; 200 - DECLARE_BITMAP(blk, LGUEST_IRQS); 201 - 202 - /* If the Guest hasn't even initialized yet, we can do nothing. */ 203 - if (!cpu->lg->lguest_data) 204 - return LGUEST_IRQS; 205 - 206 - /* 207 - * Take our "irqs_pending" array and remove any interrupts the Guest 208 - * wants blocked: the result ends up in "blk". 209 - */ 210 - if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, 211 - sizeof(blk))) 212 - return LGUEST_IRQS; 213 - bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); 214 - 215 - /* Find the first interrupt. 
*/ 216 - irq = find_first_bit(blk, LGUEST_IRQS); 217 - *more = find_next_bit(blk, LGUEST_IRQS, irq+1); 218 - 219 - return irq; 220 - } 221 - 222 - /* 223 - * This actually diverts the Guest to running an interrupt handler, once an 224 - * interrupt has been identified by interrupt_pending(). 225 - */ 226 - void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) 227 - { 228 - struct desc_struct *idt; 229 - 230 - BUG_ON(irq >= LGUEST_IRQS); 231 - 232 - /* If they're halted, interrupts restart them. */ 233 - if (cpu->halted) { 234 - /* Re-enable interrupts. */ 235 - if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled)) 236 - kill_guest(cpu, "Re-enabling interrupts"); 237 - cpu->halted = 0; 238 - } else { 239 - /* Otherwise we check if they have interrupts disabled. */ 240 - u32 irq_enabled; 241 - if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) 242 - irq_enabled = 0; 243 - if (!irq_enabled) { 244 - /* Make sure they know an IRQ is pending. */ 245 - put_user(X86_EFLAGS_IF, 246 - &cpu->lg->lguest_data->irq_pending); 247 - return; 248 - } 249 - } 250 - 251 - /* 252 - * Look at the IDT entry the Guest gave us for this interrupt. The 253 - * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip 254 - * over them. 255 - */ 256 - idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; 257 - /* If they don't have a handler (yet?), we just ignore it */ 258 - if (idt_present(idt->a, idt->b)) { 259 - /* OK, mark it no longer pending and deliver it. */ 260 - clear_bit(irq, cpu->irqs_pending); 261 - 262 - /* 263 - * They may be about to iret, where they asked us never to 264 - * deliver interrupts. In this case, we can emulate that iret 265 - * then immediately deliver the interrupt. This is basically 266 - * a noop: the iret would pop the interrupt frame and restore 267 - * eflags, and then we'd set it up again. So just restore the 268 - * eflags word and jump straight to the handler in this case. 
269 - * 270 - * Denys Vlasenko points out that this isn't quite right: if 271 - * the iret was returning to userspace, then that interrupt 272 - * would reset the stack pointer (which the Guest told us 273 - * about via LHCALL_SET_STACK). But unless the Guest is being 274 - * *really* weird, that will be the same as the current stack 275 - * anyway. 276 - */ 277 - if (cpu->regs->eip == cpu->lg->noirq_iret) { 278 - restore_eflags(cpu); 279 - } else { 280 - /* 281 - * set_guest_interrupt() takes a flag to say whether 282 - * this interrupt pushes an error code onto the stack 283 - * as well: virtual interrupts never do. 284 - */ 285 - push_guest_interrupt_stack(cpu, false); 286 - } 287 - /* Actually make Guest cpu jump to handler. */ 288 - guest_run_interrupt(cpu, idt->a, idt->b); 289 - } 290 - 291 - /* 292 - * Every time we deliver an interrupt, we update the timestamp in the 293 - * Guest's lguest_data struct. It would be better for the Guest if we 294 - * did this more often, but it can actually be quite slow: doing it 295 - * here is a compromise which means at least it gets updated every 296 - * timer interrupt. 297 - */ 298 - write_timestamp(cpu); 299 - 300 - /* 301 - * If there are no other interrupts we want to deliver, clear 302 - * the pending flag. 303 - */ 304 - if (!more) 305 - put_user(0, &cpu->lg->lguest_data->irq_pending); 306 - } 307 - 308 - /* And this is the routine when we want to set an interrupt for the Guest. */ 309 - void set_interrupt(struct lg_cpu *cpu, unsigned int irq) 310 - { 311 - /* 312 - * Next time the Guest runs, the core code will see if it can deliver 313 - * this interrupt. 314 - */ 315 - set_bit(irq, cpu->irqs_pending); 316 - 317 - /* 318 - * Make sure it sees it; it might be asleep (eg. halted), or running 319 - * the Guest right now, in which case kick_process() will knock it out. 
320 - */ 321 - if (!wake_up_process(cpu->tsk)) 322 - kick_process(cpu->tsk); 323 - } 324 - /*:*/ 325 - 326 - /* 327 - * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent 328 - * me a patch, so we support that too. It'd be a big step for lguest if half 329 - * the Plan 9 user base were to start using it. 330 - * 331 - * Actually now I think of it, it's possible that Ron *is* half the Plan 9 332 - * userbase. Oh well. 333 - */ 334 - bool could_be_syscall(unsigned int num) 335 - { 336 - /* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */ 337 - return num == IA32_SYSCALL_VECTOR || num == syscall_vector; 338 - } 339 - 340 - /* The syscall vector it wants must be unused by Host. */ 341 - bool check_syscall_vector(struct lguest *lg) 342 - { 343 - u32 vector; 344 - 345 - if (get_user(vector, &lg->lguest_data->syscall_vec)) 346 - return false; 347 - 348 - return could_be_syscall(vector); 349 - } 350 - 351 - int init_interrupts(void) 352 - { 353 - /* If they want some strange system call vector, reserve it now */ 354 - if (syscall_vector != IA32_SYSCALL_VECTOR) { 355 - if (test_bit(syscall_vector, used_vectors) || 356 - vector_used_by_percpu_irq(syscall_vector)) { 357 - printk(KERN_ERR "lg: couldn't reserve syscall %u\n", 358 - syscall_vector); 359 - return -EBUSY; 360 - } 361 - set_bit(syscall_vector, used_vectors); 362 - } 363 - 364 - return 0; 365 - } 366 - 367 - void free_interrupts(void) 368 - { 369 - if (syscall_vector != IA32_SYSCALL_VECTOR) 370 - clear_bit(syscall_vector, used_vectors); 371 - } 372 - 373 - /*H:220 374 - * Now we've got the routines to deliver interrupts, delivering traps like 375 - * page fault is easy. The only trick is that Intel decided that some traps 376 - * should have error codes: 377 - */ 378 - static bool has_err(unsigned int trap) 379 - { 380 - return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); 381 - } 382 - 383 - /* deliver_trap() returns true if it could deliver the trap. 
*/ 384 - bool deliver_trap(struct lg_cpu *cpu, unsigned int num) 385 - { 386 - /* 387 - * Trap numbers are always 8 bit, but we set an impossible trap number 388 - * for traps inside the Switcher, so check that here. 389 - */ 390 - if (num >= ARRAY_SIZE(cpu->arch.idt)) 391 - return false; 392 - 393 - /* 394 - * Early on the Guest hasn't set the IDT entries (or maybe it put a 395 - * bogus one in): if we fail here, the Guest will be killed. 396 - */ 397 - if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) 398 - return false; 399 - push_guest_interrupt_stack(cpu, has_err(num)); 400 - guest_run_interrupt(cpu, cpu->arch.idt[num].a, 401 - cpu->arch.idt[num].b); 402 - return true; 403 - } 404 - 405 - /*H:250 406 - * Here's the hard part: returning to the Host every time a trap happens 407 - * and then calling deliver_trap() and re-entering the Guest is slow. 408 - * Particularly because Guest userspace system calls are traps (usually trap 409 - * 128). 410 - * 411 - * So we'd like to set up the IDT to tell the CPU to deliver traps directly 412 - * into the Guest. This is possible, but the complexities cause the size of 413 - * this file to double! However, 150 lines of code is worth writing for taking 414 - * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all 415 - * the other hypervisors would beat it up at lunchtime. 416 - * 417 - * This routine indicates if a particular trap number could be delivered 418 - * directly. 419 - * 420 - * Unfortunately, Linux 4.6 started using an interrupt gate instead of a 421 - * trap gate for syscalls, so this trick is ineffective. See Mastery for 422 - * how we could do this anyway... 423 - */ 424 - static bool direct_trap(unsigned int num) 425 - { 426 - /* 427 - * Hardware interrupts don't go to the Guest at all (except system 428 - * call). 
429 - */ 430 - if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) 431 - return false; 432 - 433 - /* 434 - * The Host needs to see page faults (for shadow paging and to save the 435 - * fault address), general protection faults (in/out emulation) and 436 - * device not available (TS handling) and of course, the hypercall trap. 437 - */ 438 - return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY; 439 - } 440 - /*:*/ 441 - 442 - /*M:005 443 - * The Guest has the ability to turn its interrupt gates into trap gates, 444 - * if it is careful. The Host will let trap gates can go directly to the 445 - * Guest, but the Guest needs the interrupts atomically disabled for an 446 - * interrupt gate. The Host could provide a mechanism to register more 447 - * "no-interrupt" regions, and the Guest could point the trap gate at 448 - * instructions within that region, where it can safely disable interrupts. 449 - */ 450 - 451 - /*M:006 452 - * The Guests do not use the sysenter (fast system call) instruction, 453 - * because it's hardcoded to enter privilege level 0 and so can't go direct. 454 - * It's about twice as fast as the older "int 0x80" system call, so it might 455 - * still be worthwhile to handle it in the Switcher and lcall down to the 456 - * Guest. The sysenter semantics are hairy tho: search for that keyword in 457 - * entry.S 458 - :*/ 459 - 460 - /*H:260 461 - * When we make traps go directly into the Guest, we need to make sure 462 - * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the 463 - * CPU trying to deliver the trap will fault while trying to push the interrupt 464 - * words on the stack: this is called a double fault, and it forces us to kill 465 - * the Guest. 466 - * 467 - * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. 
468 - */ 469 - void pin_stack_pages(struct lg_cpu *cpu) 470 - { 471 - unsigned int i; 472 - 473 - /* 474 - * Depending on the CONFIG_4KSTACKS option, the Guest can have one or 475 - * two pages of stack space. 476 - */ 477 - for (i = 0; i < cpu->lg->stack_pages; i++) 478 - /* 479 - * The stack grows *upwards*, so the address we're given is the 480 - * start of the page after the kernel stack. Subtract one to 481 - * get back onto the first stack page, and keep subtracting to 482 - * get to the rest of the stack pages. 483 - */ 484 - pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); 485 - } 486 - 487 - /* 488 - * Direct traps also mean that we need to know whenever the Guest wants to use 489 - * a different kernel stack, so we can change the guest TSS to use that 490 - * stack. The TSS entries expect a virtual address, so unlike most addresses 491 - * the Guest gives us, the "esp" (stack pointer) value here is virtual, not 492 - * physical. 493 - * 494 - * In Linux each process has its own kernel stack, so this happens a lot: we 495 - * change stacks on each context switch. 496 - */ 497 - void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) 498 - { 499 - /* 500 - * You're not allowed a stack segment with privilege level 0: bad Guest! 501 - */ 502 - if ((seg & 0x3) != GUEST_PL) 503 - kill_guest(cpu, "bad stack segment %i", seg); 504 - /* We only expect one or two stack pages. */ 505 - if (pages > 2) 506 - kill_guest(cpu, "bad stack pages %u", pages); 507 - /* Save where the stack is, and how many pages */ 508 - cpu->ss1 = seg; 509 - cpu->esp1 = esp; 510 - cpu->lg->stack_pages = pages; 511 - /* Make sure the new stack pages are mapped */ 512 - pin_stack_pages(cpu); 513 - } 514 - 515 - /* 516 - * All this reference to mapping stacks leads us neatly into the other complex 517 - * part of the Host: page table handling. 
518 - */ 519 - 520 - /*H:235 521 - * This is the routine which actually checks the Guest's IDT entry and 522 - * transfers it into the entry in "struct lguest": 523 - */ 524 - static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, 525 - unsigned int num, u32 lo, u32 hi) 526 - { 527 - u8 type = idt_type(lo, hi); 528 - 529 - /* We zero-out a not-present entry */ 530 - if (!idt_present(lo, hi)) { 531 - trap->a = trap->b = 0; 532 - return; 533 - } 534 - 535 - /* We only support interrupt and trap gates. */ 536 - if (type != 0xE && type != 0xF) 537 - kill_guest(cpu, "bad IDT type %i", type); 538 - 539 - /* 540 - * We only copy the handler address, present bit, privilege level and 541 - * type. The privilege level controls where the trap can be triggered 542 - * manually with an "int" instruction. This is usually GUEST_PL, 543 - * except for system calls which userspace can use. 544 - */ 545 - trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); 546 - trap->b = (hi&0xFFFFEF00); 547 - } 548 - 549 - /*H:230 550 - * While we're here, dealing with delivering traps and interrupts to the 551 - * Guest, we might as well complete the picture: how the Guest tells us where 552 - * it wants them to go. This would be simple, except making traps fast 553 - * requires some tricks. 554 - * 555 - * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the 556 - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. 557 - */ 558 - void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) 559 - { 560 - /* 561 - * Guest never handles: NMI, doublefault, spurious interrupt or 562 - * hypercall. We ignore when it tries to set them. 563 - */ 564 - if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) 565 - return; 566 - 567 - /* 568 - * Mark the IDT as changed: next time the Guest runs we'll know we have 569 - * to copy this again. 
570 - */ 571 - cpu->changed |= CHANGED_IDT; 572 - 573 - /* Check that the Guest doesn't try to step outside the bounds. */ 574 - if (num >= ARRAY_SIZE(cpu->arch.idt)) 575 - kill_guest(cpu, "Setting idt entry %u", num); 576 - else 577 - set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); 578 - } 579 - 580 - /* 581 - * The default entry for each interrupt points into the Switcher routines which 582 - * simply return to the Host. The run_guest() loop will then call 583 - * deliver_trap() to bounce it back into the Guest. 584 - */ 585 - static void default_idt_entry(struct desc_struct *idt, 586 - int trap, 587 - const unsigned long handler, 588 - const struct desc_struct *base) 589 - { 590 - /* A present interrupt gate. */ 591 - u32 flags = 0x8e00; 592 - 593 - /* 594 - * Set the privilege level on the entry for the hypercall: this allows 595 - * the Guest to use the "int" instruction to trigger it. 596 - */ 597 - if (trap == LGUEST_TRAP_ENTRY) 598 - flags |= (GUEST_PL << 13); 599 - else if (base) 600 - /* 601 - * Copy privilege level from what Guest asked for. This allows 602 - * debug (int 3) traps from Guest userspace, for example. 603 - */ 604 - flags |= (base->b & 0x6000); 605 - 606 - /* Now pack it into the IDT entry in its weird format. */ 607 - idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); 608 - idt->b = (handler&0xFFFF0000) | flags; 609 - } 610 - 611 - /* When the Guest first starts, we put default entries into the IDT. */ 612 - void setup_default_idt_entries(struct lguest_ro_state *state, 613 - const unsigned long *def) 614 - { 615 - unsigned int i; 616 - 617 - for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++) 618 - default_idt_entry(&state->guest_idt[i], i, def[i], NULL); 619 - } 620 - 621 - /*H:240 622 - * We don't use the IDT entries in the "struct lguest" directly, instead 623 - * we copy them into the IDT which we've set up for Guests on this CPU, just 624 - * before we run the Guest. This routine does that copy. 
625 - */ 626 - void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, 627 - const unsigned long *def) 628 - { 629 - unsigned int i; 630 - 631 - /* 632 - * We can simply copy the direct traps, otherwise we use the default 633 - * ones in the Switcher: they will return to the Host. 634 - */ 635 - for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { 636 - const struct desc_struct *gidt = &cpu->arch.idt[i]; 637 - 638 - /* If no Guest can ever override this trap, leave it alone. */ 639 - if (!direct_trap(i)) 640 - continue; 641 - 642 - /* 643 - * Only trap gates (type 15) can go direct to the Guest. 644 - * Interrupt gates (type 14) disable interrupts as they are 645 - * entered, which we never let the Guest do. Not present 646 - * entries (type 0x0) also can't go direct, of course. 647 - * 648 - * If it can't go direct, we still need to copy the priv. level: 649 - * they might want to give userspace access to a software 650 - * interrupt. 651 - */ 652 - if (idt_type(gidt->a, gidt->b) == 0xF) 653 - idt[i] = *gidt; 654 - else 655 - default_idt_entry(&idt[i], i, def[i], gidt); 656 - } 657 - } 658 - 659 - /*H:200 660 - * The Guest Clock. 661 - * 662 - * There are two sources of virtual interrupts. We saw one in lguest_user.c: 663 - * the Launcher sending interrupts for virtual devices. The other is the Guest 664 - * timer interrupt. 665 - * 666 - * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to 667 - * the next timer interrupt (in nanoseconds). We use the high-resolution timer 668 - * infrastructure to set a callback at that time. 669 - * 670 - * 0 means "turn off the clock". 671 - */ 672 - void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) 673 - { 674 - ktime_t expires; 675 - 676 - if (unlikely(delta == 0)) { 677 - /* Clock event device is shutting down. 
*/ 678 - hrtimer_cancel(&cpu->hrt); 679 - return; 680 - } 681 - 682 - /* 683 - * We use wallclock time here, so the Guest might not be running for 684 - * all the time between now and the timer interrupt it asked for. This 685 - * is almost always the right thing to do. 686 - */ 687 - expires = ktime_add_ns(ktime_get_real(), delta); 688 - hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); 689 - } 690 - 691 - /* This is the function called when the Guest's timer expires. */ 692 - static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) 693 - { 694 - struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); 695 - 696 - /* Remember the first interrupt is the timer interrupt. */ 697 - set_interrupt(cpu, 0); 698 - return HRTIMER_NORESTART; 699 - } 700 - 701 - /* This sets up the timer for this Guest. */ 702 - void init_clockdev(struct lg_cpu *cpu) 703 - { 704 - hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); 705 - cpu->hrt.function = clockdev_fn; 706 - }

-258

drivers/lguest/lg.h

··· 1 - #ifndef _LGUEST_H 2 - #define _LGUEST_H 3 - 4 - #ifndef __ASSEMBLY__ 5 - #include <linux/types.h> 6 - #include <linux/init.h> 7 - #include <linux/stringify.h> 8 - #include <linux/lguest.h> 9 - #include <linux/lguest_launcher.h> 10 - #include <linux/wait.h> 11 - #include <linux/hrtimer.h> 12 - #include <linux/err.h> 13 - #include <linux/slab.h> 14 - 15 - #include <asm/lguest.h> 16 - 17 - struct pgdir { 18 - unsigned long gpgdir; 19 - bool switcher_mapped; 20 - int last_host_cpu; 21 - pgd_t *pgdir; 22 - }; 23 - 24 - /* We have two pages shared with guests, per cpu. */ 25 - struct lguest_pages { 26 - /* This is the stack page mapped rw in guest */ 27 - char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; 28 - struct lguest_regs regs; 29 - 30 - /* This is the host state & guest descriptor page, ro in guest */ 31 - struct lguest_ro_state state; 32 - } __attribute__((aligned(PAGE_SIZE))); 33 - 34 - #define CHANGED_IDT 1 35 - #define CHANGED_GDT 2 36 - #define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ 37 - #define CHANGED_ALL 3 38 - 39 - struct lg_cpu { 40 - unsigned int id; 41 - struct lguest *lg; 42 - struct task_struct *tsk; 43 - struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ 44 - 45 - u32 cr2; 46 - u32 esp1; 47 - u16 ss1; 48 - 49 - /* Bitmap of what has changed: see CHANGED_* above. */ 50 - int changed; 51 - 52 - /* Pending operation. */ 53 - struct lguest_pending pending; 54 - 55 - unsigned long *reg_read; /* register from LHREQ_GETREG */ 56 - 57 - /* At end of a page shared mapped over lguest_pages in guest. */ 58 - unsigned long regs_page; 59 - struct lguest_regs *regs; 60 - 61 - struct lguest_pages *last_pages; 62 - 63 - /* Initialization mode: linear map everything. */ 64 - bool linear_pages; 65 - int cpu_pgd; /* Which pgd this cpu is currently using */ 66 - 67 - /* If a hypercall was asked for, this points to the arguments. 
*/ 68 - struct hcall_args *hcall; 69 - u32 next_hcall; 70 - 71 - /* Virtual clock device */ 72 - struct hrtimer hrt; 73 - 74 - /* Did the Guest tell us to halt? */ 75 - int halted; 76 - 77 - /* Pending virtual interrupts */ 78 - DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); 79 - 80 - struct lg_cpu_arch arch; 81 - }; 82 - 83 - /* The private info the thread maintains about the guest. */ 84 - struct lguest { 85 - struct lguest_data __user *lguest_data; 86 - struct lg_cpu cpus[NR_CPUS]; 87 - unsigned int nr_cpus; 88 - 89 - /* Valid guest memory pages must be < this. */ 90 - u32 pfn_limit; 91 - 92 - /* Device memory is >= pfn_limit and < device_limit. */ 93 - u32 device_limit; 94 - 95 - /* 96 - * This provides the offset to the base of guest-physical memory in the 97 - * Launcher. 98 - */ 99 - void __user *mem_base; 100 - unsigned long kernel_address; 101 - 102 - struct pgdir pgdirs[4]; 103 - 104 - unsigned long noirq_iret; 105 - 106 - unsigned int stack_pages; 107 - u32 tsc_khz; 108 - 109 - /* Dead? */ 110 - const char *dead; 111 - }; 112 - 113 - extern struct mutex lguest_lock; 114 - 115 - /* core.c: */ 116 - bool lguest_address_ok(const struct lguest *lg, 117 - unsigned long addr, unsigned long len); 118 - void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); 119 - void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); 120 - extern struct page **lg_switcher_pages; 121 - 122 - /*H:035 123 - * Using memory-copy operations like that is usually inconvenient, so we 124 - * have the following helper macros which read and write a specific type (often 125 - * an unsigned long). 126 - * 127 - * This reads into a variable of the given type then returns that. 128 - */ 129 - #define lgread(cpu, addr, type) \ 130 - ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) 131 - 132 - /* This checks that the variable is of the given type, then writes it out. 
*/ 133 - #define lgwrite(cpu, addr, type, val) \ 134 - do { \ 135 - typecheck(type, val); \ 136 - __lgwrite((cpu), (addr), &(val), sizeof(val)); \ 137 - } while(0) 138 - /* (end of memory access helper routines) :*/ 139 - 140 - int run_guest(struct lg_cpu *cpu, unsigned long __user *user); 141 - 142 - /* 143 - * Helper macros to obtain the first 12 or the last 20 bits, this is only the 144 - * first step in the migration to the kernel types. pte_pfn is already defined 145 - * in the kernel. 146 - */ 147 - #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) 148 - #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) 149 - #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) 150 - #define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT) 151 - 152 - /* interrupts_and_traps.c: */ 153 - unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); 154 - void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); 155 - void set_interrupt(struct lg_cpu *cpu, unsigned int irq); 156 - bool deliver_trap(struct lg_cpu *cpu, unsigned int num); 157 - void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, 158 - u32 low, u32 hi); 159 - void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages); 160 - void pin_stack_pages(struct lg_cpu *cpu); 161 - void setup_default_idt_entries(struct lguest_ro_state *state, 162 - const unsigned long *def); 163 - void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, 164 - const unsigned long *def); 165 - void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); 166 - bool send_notify_to_eventfd(struct lg_cpu *cpu); 167 - void init_clockdev(struct lg_cpu *cpu); 168 - bool check_syscall_vector(struct lguest *lg); 169 - bool could_be_syscall(unsigned int num); 170 - int init_interrupts(void); 171 - void free_interrupts(void); 172 - 173 - /* segments.c: */ 174 - void setup_default_gdt_entries(struct lguest_ro_state *state); 175 - void setup_guest_gdt(struct lg_cpu *cpu); 176 - void load_guest_gdt_entry(struct 
lg_cpu *cpu, unsigned int i, 177 - u32 low, u32 hi); 178 - void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array); 179 - void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt); 180 - void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); 181 - 182 - /* page_tables.c: */ 183 - int init_guest_pagetable(struct lguest *lg); 184 - void free_guest_pagetable(struct lguest *lg); 185 - void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); 186 - void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); 187 - #ifdef CONFIG_X86_PAE 188 - void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); 189 - #endif 190 - void guest_pagetable_clear_all(struct lg_cpu *cpu); 191 - void guest_pagetable_flush_user(struct lg_cpu *cpu); 192 - void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, 193 - unsigned long vaddr, pte_t val); 194 - void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); 195 - bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode, 196 - unsigned long *iomem); 197 - void pin_page(struct lg_cpu *cpu, unsigned long vaddr); 198 - bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr); 199 - unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); 200 - void page_table_guest_data_init(struct lg_cpu *cpu); 201 - 202 - /* <arch>/core.c: */ 203 - void lguest_arch_host_init(void); 204 - void lguest_arch_host_fini(void); 205 - void lguest_arch_run_guest(struct lg_cpu *cpu); 206 - void lguest_arch_handle_trap(struct lg_cpu *cpu); 207 - int lguest_arch_init_hypercalls(struct lg_cpu *cpu); 208 - int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); 209 - void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); 210 - unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any); 211 - 212 - /* <arch>/switcher.S: */ 213 - extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; 
214 - 215 - /* lguest_user.c: */ 216 - int lguest_device_init(void); 217 - void lguest_device_remove(void); 218 - 219 - /* hypercalls.c: */ 220 - void do_hypercalls(struct lg_cpu *cpu); 221 - void write_timestamp(struct lg_cpu *cpu); 222 - 223 - /*L:035 224 - * Let's step aside for the moment, to study one important routine that's used 225 - * widely in the Host code. 226 - * 227 - * There are many cases where the Guest can do something invalid, like pass crap 228 - * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite 229 - * acceptable to simply terminate the Guest and give the Launcher a nicely 230 - * formatted reason. It's also simpler for the Guest itself, which doesn't 231 - * need to check most hypercalls for "success"; if you're still running, it 232 - * succeeded. 233 - * 234 - * Once this is called, the Guest will never run again, so most Host code can 235 - * call this then continue as if nothing had happened. This means many 236 - * functions don't have to explicitly return an error code, which keeps the 237 - * code simple. 238 - * 239 - * It also means that this can be called more than once: only the first one is 240 - * remembered. The only trick is that we still need to kill the Guest even if 241 - * we can't allocate memory to store the reason. Linux has a neat way of 242 - * packing error codes into invalid pointers, so we use that here. 243 - * 244 - * Like any macro which uses an "if", it is safely wrapped in a run-once "do { 245 - * } while(0)". 246 - */ 247 - #define kill_guest(cpu, fmt...) \ 248 - do { \ 249 - if (!(cpu)->lg->dead) { \ 250 - (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \ 251 - if (!(cpu)->lg->dead) \ 252 - (cpu)->lg->dead = ERR_PTR(-ENOMEM); \ 253 - } \ 254 - } while(0) 255 - /* (End of aside) :*/ 256 - 257 - #endif /* __ASSEMBLY__ */ 258 - #endif /* _LGUEST_H */

-446

drivers/lguest/lguest_user.c

··· 1 - /*P:200 This contains all the /dev/lguest code, whereby the userspace 2 - * launcher controls and communicates with the Guest. For example, 3 - * the first write will tell us the Guest's memory layout and entry 4 - * point. A read will run the Guest until something happens, such as 5 - * a signal or the Guest accessing a device. 6 - :*/ 7 - #include <linux/uaccess.h> 8 - #include <linux/miscdevice.h> 9 - #include <linux/fs.h> 10 - #include <linux/sched.h> 11 - #include <linux/sched/mm.h> 12 - #include <linux/file.h> 13 - #include <linux/slab.h> 14 - #include <linux/export.h> 15 - #include "lg.h" 16 - 17 - /*L:052 18 - The Launcher can get the registers, and also set some of them. 19 - */ 20 - static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input) 21 - { 22 - unsigned long which; 23 - 24 - /* We re-use the ptrace structure to specify which register to read. */ 25 - if (get_user(which, input) != 0) 26 - return -EFAULT; 27 - 28 - /* 29 - * We set up the cpu register pointer, and their next read will 30 - * actually get the value (instead of running the guest). 31 - * 32 - * The last argument 'true' says we can access any register. 33 - */ 34 - cpu->reg_read = lguest_arch_regptr(cpu, which, true); 35 - if (!cpu->reg_read) 36 - return -ENOENT; 37 - 38 - /* And because this is a write() call, we return the length used. */ 39 - return sizeof(unsigned long) * 2; 40 - } 41 - 42 - static int setreg(struct lg_cpu *cpu, const unsigned long __user *input) 43 - { 44 - unsigned long which, value, *reg; 45 - 46 - /* We re-use the ptrace structure to specify which register to read. */ 47 - if (get_user(which, input) != 0) 48 - return -EFAULT; 49 - input++; 50 - if (get_user(value, input) != 0) 51 - return -EFAULT; 52 - 53 - /* The last argument 'false' means we can't access all registers. 
*/ 54 - reg = lguest_arch_regptr(cpu, which, false); 55 - if (!reg) 56 - return -ENOENT; 57 - 58 - *reg = value; 59 - 60 - /* And because this is a write() call, we return the length used. */ 61 - return sizeof(unsigned long) * 3; 62 - } 63 - 64 - /*L:050 65 - * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 66 - * number to /dev/lguest. 67 - */ 68 - static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) 69 - { 70 - unsigned long irq; 71 - 72 - if (get_user(irq, input) != 0) 73 - return -EFAULT; 74 - if (irq >= LGUEST_IRQS) 75 - return -EINVAL; 76 - 77 - /* 78 - * Next time the Guest runs, the core code will see if it can deliver 79 - * this interrupt. 80 - */ 81 - set_interrupt(cpu, irq); 82 - return 0; 83 - } 84 - 85 - /*L:053 86 - * Deliver a trap: this is used by the Launcher if it can't emulate 87 - * an instruction. 88 - */ 89 - static int trap(struct lg_cpu *cpu, const unsigned long __user *input) 90 - { 91 - unsigned long trapnum; 92 - 93 - if (get_user(trapnum, input) != 0) 94 - return -EFAULT; 95 - 96 - if (!deliver_trap(cpu, trapnum)) 97 - return -EINVAL; 98 - 99 - return 0; 100 - } 101 - 102 - /*L:040 103 - * Once our Guest is initialized, the Launcher makes it run by reading 104 - * from /dev/lguest. 105 - */ 106 - static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) 107 - { 108 - struct lguest *lg = file->private_data; 109 - struct lg_cpu *cpu; 110 - unsigned int cpu_id = *o; 111 - 112 - /* You must write LHREQ_INITIALIZE first! */ 113 - if (!lg) 114 - return -EINVAL; 115 - 116 - /* Watch out for arbitrary vcpu indexes! */ 117 - if (cpu_id >= lg->nr_cpus) 118 - return -EINVAL; 119 - 120 - cpu = &lg->cpus[cpu_id]; 121 - 122 - /* If you're not the task which owns the Guest, go away. 
*/ 123 - if (current != cpu->tsk) 124 - return -EPERM; 125 - 126 - /* If the Guest is already dead, we indicate why */ 127 - if (lg->dead) { 128 - size_t len; 129 - 130 - /* lg->dead either contains an error code, or a string. */ 131 - if (IS_ERR(lg->dead)) 132 - return PTR_ERR(lg->dead); 133 - 134 - /* We can only return as much as the buffer they read with. */ 135 - len = min(size, strlen(lg->dead)+1); 136 - if (copy_to_user(user, lg->dead, len) != 0) 137 - return -EFAULT; 138 - return len; 139 - } 140 - 141 - /* 142 - * If we returned from read() last time because the Guest sent I/O, 143 - * clear the flag. 144 - */ 145 - if (cpu->pending.trap) 146 - cpu->pending.trap = 0; 147 - 148 - /* Run the Guest until something interesting happens. */ 149 - return run_guest(cpu, (unsigned long __user *)user); 150 - } 151 - 152 - /*L:025 153 - * This actually initializes a CPU. For the moment, a Guest is only 154 - * uniprocessor, so "id" is always 0. 155 - */ 156 - static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) 157 - { 158 - /* We have a limited number of CPUs in the lguest struct. */ 159 - if (id >= ARRAY_SIZE(cpu->lg->cpus)) 160 - return -EINVAL; 161 - 162 - /* Set up this CPU's id, and pointer back to the lguest struct. */ 163 - cpu->id = id; 164 - cpu->lg = container_of(cpu, struct lguest, cpus[id]); 165 - cpu->lg->nr_cpus++; 166 - 167 - /* Each CPU has a timer it can set. */ 168 - init_clockdev(cpu); 169 - 170 - /* 171 - * We need a complete page for the Guest registers: they are accessible 172 - * to the Guest and we can only grant it access to whole pages. 173 - */ 174 - cpu->regs_page = get_zeroed_page(GFP_KERNEL); 175 - if (!cpu->regs_page) 176 - return -ENOMEM; 177 - 178 - /* We actually put the registers at the end of the page. */ 179 - cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); 180 - 181 - /* 182 - * Now we initialize the Guest's registers, handing it the start 183 - * address. 
184 - */ 185 - lguest_arch_setup_regs(cpu, start_ip); 186 - 187 - /* 188 - * We keep a pointer to the Launcher task (ie. current task) for when 189 - * other Guests want to wake this one (eg. console input). 190 - */ 191 - cpu->tsk = current; 192 - 193 - /* 194 - * We need to keep a pointer to the Launcher's memory map, because if 195 - * the Launcher dies we need to clean it up. If we don't keep a 196 - * reference, it is destroyed before close() is called. 197 - */ 198 - cpu->mm = get_task_mm(cpu->tsk); 199 - 200 - /* 201 - * We remember which CPU's pages this Guest used last, for optimization 202 - * when the same Guest runs on the same CPU twice. 203 - */ 204 - cpu->last_pages = NULL; 205 - 206 - /* No error == success. */ 207 - return 0; 208 - } 209 - 210 - /*L:020 211 - * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in 212 - * addition to the LHREQ_INITIALIZE value). These are: 213 - * 214 - * base: The start of the Guest-physical memory inside the Launcher memory. 215 - * 216 - * pfnlimit: The highest (Guest-physical) page number the Guest should be 217 - * allowed to access. The Guest memory lives inside the Launcher, so it sets 218 - * this to ensure the Guest can only reach its own memory. 219 - * 220 - * start: The first instruction to execute ("eip" in x86-speak). 221 - */ 222 - static int initialize(struct file *file, const unsigned long __user *input) 223 - { 224 - /* "struct lguest" contains all we (the Host) know about a Guest. */ 225 - struct lguest *lg; 226 - int err; 227 - unsigned long args[4]; 228 - 229 - /* 230 - * We grab the Big Lguest lock, which protects against multiple 231 - * simultaneous initializations. 232 - */ 233 - mutex_lock(&lguest_lock); 234 - /* You can't initialize twice! Close the device and start again... 
*/ 235 - if (file->private_data) { 236 - err = -EBUSY; 237 - goto unlock; 238 - } 239 - 240 - if (copy_from_user(args, input, sizeof(args)) != 0) { 241 - err = -EFAULT; 242 - goto unlock; 243 - } 244 - 245 - lg = kzalloc(sizeof(*lg), GFP_KERNEL); 246 - if (!lg) { 247 - err = -ENOMEM; 248 - goto unlock; 249 - } 250 - 251 - /* Populate the easy fields of our "struct lguest" */ 252 - lg->mem_base = (void __user *)args[0]; 253 - lg->pfn_limit = args[1]; 254 - lg->device_limit = args[3]; 255 - 256 - /* This is the first cpu (cpu 0) and it will start booting at args[2] */ 257 - err = lg_cpu_start(&lg->cpus[0], 0, args[2]); 258 - if (err) 259 - goto free_lg; 260 - 261 - /* 262 - * Initialize the Guest's shadow page tables. This allocates 263 - * memory, so can fail. 264 - */ 265 - err = init_guest_pagetable(lg); 266 - if (err) 267 - goto free_regs; 268 - 269 - /* We keep our "struct lguest" in the file's private_data. */ 270 - file->private_data = lg; 271 - 272 - mutex_unlock(&lguest_lock); 273 - 274 - /* And because this is a write() call, we return the length used. */ 275 - return sizeof(args); 276 - 277 - free_regs: 278 - /* FIXME: This should be in free_vcpu */ 279 - free_page(lg->cpus[0].regs_page); 280 - free_lg: 281 - kfree(lg); 282 - unlock: 283 - mutex_unlock(&lguest_lock); 284 - return err; 285 - } 286 - 287 - /*L:010 288 - * The first operation the Launcher does must be a write. All writes 289 - * start with an unsigned long number: for the first write this must be 290 - * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 291 - * writes of other values to send interrupts or set up receipt of notifications. 292 - * 293 - * Note that we overload the "offset" in the /dev/lguest file to indicate what 294 - * CPU number we're dealing with. Currently this is always 0 since we only 295 - * support uniprocessor Guests, but you can see the beginnings of SMP support 296 - * here. 
297 - */ 298 - static ssize_t write(struct file *file, const char __user *in, 299 - size_t size, loff_t *off) 300 - { 301 - /* 302 - * Once the Guest is initialized, we hold the "struct lguest" in the 303 - * file private data. 304 - */ 305 - struct lguest *lg = file->private_data; 306 - const unsigned long __user *input = (const unsigned long __user *)in; 307 - unsigned long req; 308 - struct lg_cpu *uninitialized_var(cpu); 309 - unsigned int cpu_id = *off; 310 - 311 - /* The first value tells us what this request is. */ 312 - if (get_user(req, input) != 0) 313 - return -EFAULT; 314 - input++; 315 - 316 - /* If you haven't initialized, you must do that first. */ 317 - if (req != LHREQ_INITIALIZE) { 318 - if (!lg || (cpu_id >= lg->nr_cpus)) 319 - return -EINVAL; 320 - cpu = &lg->cpus[cpu_id]; 321 - 322 - /* Once the Guest is dead, you can only read() why it died. */ 323 - if (lg->dead) 324 - return -ENOENT; 325 - } 326 - 327 - switch (req) { 328 - case LHREQ_INITIALIZE: 329 - return initialize(file, input); 330 - case LHREQ_IRQ: 331 - return user_send_irq(cpu, input); 332 - case LHREQ_GETREG: 333 - return getreg_setup(cpu, input); 334 - case LHREQ_SETREG: 335 - return setreg(cpu, input); 336 - case LHREQ_TRAP: 337 - return trap(cpu, input); 338 - default: 339 - return -EINVAL; 340 - } 341 - } 342 - 343 - static int open(struct inode *inode, struct file *file) 344 - { 345 - file->private_data = NULL; 346 - 347 - return 0; 348 - } 349 - 350 - /*L:060 351 - * The final piece of interface code is the close() routine. It reverses 352 - * everything done in initialize(). This is usually called because the 353 - * Launcher exited. 354 - * 355 - * Note that the close routine returns 0 or a negative error number: it can't 356 - * really fail, but it can whine. I blame Sun for this wart, and K&R C for 357 - * letting them do it. 
358 - :*/ 359 - static int close(struct inode *inode, struct file *file) 360 - { 361 - struct lguest *lg = file->private_data; 362 - unsigned int i; 363 - 364 - /* If we never successfully initialized, there's nothing to clean up */ 365 - if (!lg) 366 - return 0; 367 - 368 - /* 369 - * We need the big lock, to protect from inter-guest I/O and other 370 - * Launchers initializing guests. 371 - */ 372 - mutex_lock(&lguest_lock); 373 - 374 - /* Free up the shadow page tables for the Guest. */ 375 - free_guest_pagetable(lg); 376 - 377 - for (i = 0; i < lg->nr_cpus; i++) { 378 - /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ 379 - hrtimer_cancel(&lg->cpus[i].hrt); 380 - /* We can free up the register page we allocated. */ 381 - free_page(lg->cpus[i].regs_page); 382 - /* 383 - * Now all the memory cleanups are done, it's safe to release 384 - * the Launcher's memory management structure. 385 - */ 386 - mmput(lg->cpus[i].mm); 387 - } 388 - 389 - /* 390 - * If lg->dead doesn't contain an error code it will be NULL or a 391 - * kmalloc()ed string, either of which is ok to hand to kfree(). 392 - */ 393 - if (!IS_ERR(lg->dead)) 394 - kfree(lg->dead); 395 - /* Free the memory allocated to the lguest_struct */ 396 - kfree(lg); 397 - /* Release lock and exit. */ 398 - mutex_unlock(&lguest_lock); 399 - 400 - return 0; 401 - } 402 - 403 - /*L:000 404 - * Welcome to our journey through the Launcher! 405 - * 406 - * The Launcher is the Host userspace program which sets up, runs and services 407 - * the Guest. In fact, many comments in the Drivers which refer to "the Host" 408 - * doing things are inaccurate: the Launcher does all the device handling for 409 - * the Guest, but the Guest can't know that. 410 - * 411 - * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we 412 - * shall see more of that later. 
413 - * 414 - * We begin our understanding with the Host kernel interface which the Launcher 415 - * uses: reading and writing a character device called /dev/lguest. All the 416 - * work happens in the read(), write() and close() routines: 417 - */ 418 - static const struct file_operations lguest_fops = { 419 - .owner = THIS_MODULE, 420 - .open = open, 421 - .release = close, 422 - .write = write, 423 - .read = read, 424 - .llseek = default_llseek, 425 - }; 426 - /*:*/ 427 - 428 - /* 429 - * This is a textbook example of a "misc" character device. Populate a "struct 430 - * miscdevice" and register it with misc_register(). 431 - */ 432 - static struct miscdevice lguest_dev = { 433 - .minor = MISC_DYNAMIC_MINOR, 434 - .name = "lguest", 435 - .fops = &lguest_fops, 436 - }; 437 - 438 - int __init lguest_device_init(void) 439 - { 440 - return misc_register(&lguest_dev); 441 - } 442 - 443 - void __exit lguest_device_remove(void) 444 - { 445 - misc_deregister(&lguest_dev); 446 - }

-1239

drivers/lguest/page_tables.c

··· 1 - /*P:700 2 - * The pagetable code, on the other hand, still shows the scars of 3 - * previous encounters. It's functional, and as neat as it can be in the 4 - * circumstances, but be wary, for these things are subtle and break easily. 5 - * The Guest provides a virtual to physical mapping, but we can neither trust 6 - * it nor use it: we verify and convert it here then point the CPU to the 7 - * converted Guest pages when running the Guest. 8 - :*/ 9 - 10 - /* Copyright (C) Rusty Russell IBM Corporation 2013. 11 - * GPL v2 and any later version */ 12 - #include <linux/mm.h> 13 - #include <linux/gfp.h> 14 - #include <linux/types.h> 15 - #include <linux/spinlock.h> 16 - #include <linux/random.h> 17 - #include <linux/percpu.h> 18 - #include <asm/tlbflush.h> 19 - #include <linux/uaccess.h> 20 - #include "lg.h" 21 - 22 - /*M:008 23 - * We hold reference to pages, which prevents them from being swapped. 24 - * It'd be nice to have a callback in the "struct mm_struct" when Linux wants 25 - * to swap out. If we had this, and a shrinker callback to trim PTE pages, we 26 - * could probably consider launching Guests as non-root. 27 - :*/ 28 - 29 - /*H:300 30 - * The Page Table Code 31 - * 32 - * We use two-level page tables for the Guest, or three-level with PAE. If 33 - * you're not entirely comfortable with virtual addresses, physical addresses 34 - * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page 35 - * Table Handling" (with diagrams!). 36 - * 37 - * The Guest keeps page tables, but we maintain the actual ones here: these are 38 - * called "shadow" page tables. Which is a very Guest-centric name: these are 39 - * the real page tables the CPU uses, although we keep them up to date to 40 - * reflect the Guest's. (See what I mean about weird naming? Since when do 41 - * shadows reflect anything?) 42 - * 43 - * Anyway, this is the most complicated part of the Host code. 
There are seven 44 - * parts to this: 45 - * (i) Looking up a page table entry when the Guest faults, 46 - * (ii) Making sure the Guest stack is mapped, 47 - * (iii) Setting up a page table entry when the Guest tells us one has changed, 48 - * (iv) Switching page tables, 49 - * (v) Flushing (throwing away) page tables, 50 - * (vi) Mapping the Switcher when the Guest is about to run, 51 - * (vii) Setting up the page tables initially. 52 - :*/ 53 - 54 - /* 55 - * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) 56 - * or 512 PTE entries with PAE (2MB). 57 - */ 58 - #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 59 - 60 - /* 61 - * For PAE we need the PMD index as well. We use the last 2MB, so we 62 - * will need the last pmd entry of the last pmd page. 63 - */ 64 - #ifdef CONFIG_X86_PAE 65 - #define CHECK_GPGD_MASK _PAGE_PRESENT 66 - #else 67 - #define CHECK_GPGD_MASK _PAGE_TABLE 68 - #endif 69 - 70 - /*H:320 71 - * The page table code is curly enough to need helper functions to keep it 72 - * clear and clean. The kernel itself provides many of them; one advantage 73 - * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting. 74 - * 75 - * There are two functions which return pointers to the shadow (aka "real") 76 - * page tables. 77 - * 78 - * spgd_addr() takes the virtual address and returns a pointer to the top-level 79 - * page directory entry (PGD) for that address. Since we keep track of several 80 - * page tables, the "i" argument tells us which one we're interested in (it's 81 - * usually the current one). 82 - */ 83 - static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) 84 - { 85 - unsigned int index = pgd_index(vaddr); 86 - 87 - /* Return a pointer index'th pgd entry for the i'th page table. */ 88 - return &cpu->lg->pgdirs[i].pgdir[index]; 89 - } 90 - 91 - #ifdef CONFIG_X86_PAE 92 - /* 93 - * This routine then takes the PGD entry given above, which contains the 94 - * address of the PMD page. 
It then returns a pointer to the PMD entry for the 95 - * given address. 96 - */ 97 - static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 98 - { 99 - unsigned int index = pmd_index(vaddr); 100 - pmd_t *page; 101 - 102 - /* You should never call this if the PGD entry wasn't valid */ 103 - BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); 104 - page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 105 - 106 - return &page[index]; 107 - } 108 - #endif 109 - 110 - /* 111 - * This routine then takes the page directory entry returned above, which 112 - * contains the address of the page table entry (PTE) page. It then returns a 113 - * pointer to the PTE entry for the given address. 114 - */ 115 - static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 116 - { 117 - #ifdef CONFIG_X86_PAE 118 - pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); 119 - pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); 120 - 121 - /* You should never call this if the PMD entry wasn't valid */ 122 - BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); 123 - #else 124 - pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 125 - /* You should never call this if the PGD entry wasn't valid */ 126 - BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); 127 - #endif 128 - 129 - return &page[pte_index(vaddr)]; 130 - } 131 - 132 - /* 133 - * These functions are just like the above, except they access the Guest 134 - * page tables. Hence they return a Guest address. 135 - */ 136 - static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) 137 - { 138 - unsigned int index = vaddr >> (PGDIR_SHIFT); 139 - return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); 140 - } 141 - 142 - #ifdef CONFIG_X86_PAE 143 - /* Follow the PGD to the PMD. 
*/ 144 - static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) 145 - { 146 - unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 147 - BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); 148 - return gpage + pmd_index(vaddr) * sizeof(pmd_t); 149 - } 150 - 151 - /* Follow the PMD to the PTE. */ 152 - static unsigned long gpte_addr(struct lg_cpu *cpu, 153 - pmd_t gpmd, unsigned long vaddr) 154 - { 155 - unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT; 156 - 157 - BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); 158 - return gpage + pte_index(vaddr) * sizeof(pte_t); 159 - } 160 - #else 161 - /* Follow the PGD to the PTE (no mid-level for !PAE). */ 162 - static unsigned long gpte_addr(struct lg_cpu *cpu, 163 - pgd_t gpgd, unsigned long vaddr) 164 - { 165 - unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 166 - 167 - BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); 168 - return gpage + pte_index(vaddr) * sizeof(pte_t); 169 - } 170 - #endif 171 - /*:*/ 172 - 173 - /*M:007 174 - * get_pfn is slow: we could probably try to grab batches of pages here as 175 - * an optimization (ie. pre-faulting). 176 - :*/ 177 - 178 - /*H:350 179 - * This routine takes a page number given by the Guest and converts it to 180 - * an actual, physical page number. It can fail for several reasons: the 181 - * virtual address might not be mapped by the Launcher, the write flag is set 182 - * and the page is read-only, or the write flag was set and the page was 183 - * shared so had to be copied, but we ran out of memory. 184 - * 185 - * This holds a reference to the page, so release_pte() is careful to put that 186 - * back. 187 - */ 188 - static unsigned long get_pfn(unsigned long virtpfn, int write) 189 - { 190 - struct page *page; 191 - 192 - /* gup me one page at this address please! */ 193 - if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1) 194 - return page_to_pfn(page); 195 - 196 - /* This value indicates failure. 
*/ 197 - return -1UL; 198 - } 199 - 200 - /*H:340 201 - * Converting a Guest page table entry to a shadow (ie. real) page table 202 - * entry can be a little tricky. The flags are (almost) the same, but the 203 - * Guest PTE contains a virtual page number: the CPU needs the real page 204 - * number. 205 - */ 206 - static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) 207 - { 208 - unsigned long pfn, base, flags; 209 - 210 - /* 211 - * The Guest sets the global flag, because it thinks that it is using 212 - * PGE. We only told it to use PGE so it would tell us whether it was 213 - * flushing a kernel mapping or a userspace mapping. We don't actually 214 - * use the global bit, so throw it away. 215 - */ 216 - flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); 217 - 218 - /* The Guest's pages are offset inside the Launcher. */ 219 - base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; 220 - 221 - /* 222 - * We need a temporary "unsigned long" variable to hold the answer from 223 - * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 224 - * fit in spte.pfn. get_pfn() finds the real physical number of the 225 - * page, given the virtual number. 226 - */ 227 - pfn = get_pfn(base + pte_pfn(gpte), write); 228 - if (pfn == -1UL) { 229 - kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); 230 - /* 231 - * When we destroy the Guest, we'll go through the shadow page 232 - * tables and release_pte() them. Make sure we don't think 233 - * this one is valid! 234 - */ 235 - flags = 0; 236 - } 237 - /* Now we assemble our shadow PTE from the page number and flags. */ 238 - return pfn_pte(pfn, __pgprot(flags)); 239 - } 240 - 241 - /*H:460 And to complete the chain, release_pte() looks like this: */ 242 - static void release_pte(pte_t pte) 243 - { 244 - /* 245 - * Remember that get_user_pages_fast() took a reference to the page, in 246 - * get_pfn()? We have to put it back now. 
247 - */ 248 - if (pte_flags(pte) & _PAGE_PRESENT) 249 - put_page(pte_page(pte)); 250 - } 251 - /*:*/ 252 - 253 - static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte) 254 - { 255 - /* We don't handle large pages. */ 256 - if (pte_flags(gpte) & _PAGE_PSE) 257 - return false; 258 - 259 - return (pte_pfn(gpte) >= cpu->lg->pfn_limit 260 - && pte_pfn(gpte) < cpu->lg->device_limit); 261 - } 262 - 263 - static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) 264 - { 265 - if ((pte_flags(gpte) & _PAGE_PSE) || 266 - pte_pfn(gpte) >= cpu->lg->pfn_limit) { 267 - kill_guest(cpu, "bad page table entry"); 268 - return false; 269 - } 270 - return true; 271 - } 272 - 273 - static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) 274 - { 275 - if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || 276 - (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { 277 - kill_guest(cpu, "bad page directory entry"); 278 - return false; 279 - } 280 - return true; 281 - } 282 - 283 - #ifdef CONFIG_X86_PAE 284 - static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) 285 - { 286 - if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || 287 - (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { 288 - kill_guest(cpu, "bad page middle directory entry"); 289 - return false; 290 - } 291 - return true; 292 - } 293 - #endif 294 - 295 - /*H:331 296 - * This is the core routine to walk the shadow page tables and find the page 297 - * table entry for a specific address. 298 - * 299 - * If allocate is set, then we allocate any missing levels, setting the flags 300 - * on the new page directory and mid-level directories using the arguments 301 - * (which are copied from the Guest's page table entries). 302 - */ 303 - static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, 304 - int pgd_flags, int pmd_flags) 305 - { 306 - pgd_t *spgd; 307 - /* Mid level for PAE. */ 308 - #ifdef CONFIG_X86_PAE 309 - pmd_t *spmd; 310 - #endif 311 - 312 - /* Get top level entry. 
*/ 313 - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); 314 - if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 315 - /* No shadow entry: allocate a new shadow PTE page. */ 316 - unsigned long ptepage; 317 - 318 - /* If they didn't want us to allocate anything, stop. */ 319 - if (!allocate) 320 - return NULL; 321 - 322 - ptepage = get_zeroed_page(GFP_KERNEL); 323 - /* 324 - * This is not really the Guest's fault, but killing it is 325 - * simple for this corner case. 326 - */ 327 - if (!ptepage) { 328 - kill_guest(cpu, "out of memory allocating pte page"); 329 - return NULL; 330 - } 331 - /* 332 - * And we copy the flags to the shadow PGD entry. The page 333 - * number in the shadow PGD is the page we just allocated. 334 - */ 335 - set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); 336 - } 337 - 338 - /* 339 - * Intel's Physical Address Extension actually uses three levels of 340 - * page tables, so we need to look in the mid-level. 341 - */ 342 - #ifdef CONFIG_X86_PAE 343 - /* Now look at the mid-level shadow entry. */ 344 - spmd = spmd_addr(cpu, *spgd, vaddr); 345 - 346 - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { 347 - /* No shadow entry: allocate a new shadow PTE page. */ 348 - unsigned long ptepage; 349 - 350 - /* If they didn't want us to allocate anything, stop. */ 351 - if (!allocate) 352 - return NULL; 353 - 354 - ptepage = get_zeroed_page(GFP_KERNEL); 355 - 356 - /* 357 - * This is not really the Guest's fault, but killing it is 358 - * simple for this corner case. 359 - */ 360 - if (!ptepage) { 361 - kill_guest(cpu, "out of memory allocating pmd page"); 362 - return NULL; 363 - } 364 - 365 - /* 366 - * And we copy the flags to the shadow PMD entry. The page 367 - * number in the shadow PMD is the page we just allocated. 368 - */ 369 - set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); 370 - } 371 - #endif 372 - 373 - /* Get the pointer to the shadow PTE entry we're going to set. 
*/ 374 - return spte_addr(cpu, *spgd, vaddr); 375 - } 376 - 377 - /*H:330 378 - * (i) Looking up a page table entry when the Guest faults. 379 - * 380 - * We saw this call in run_guest(): when we see a page fault in the Guest, we 381 - * come here. That's because we only set up the shadow page tables lazily as 382 - * they're needed, so we get page faults all the time and quietly fix them up 383 - * and return to the Guest without it knowing. 384 - * 385 - * If we fixed up the fault (ie. we mapped the address), this routine returns 386 - * true. Otherwise, it was a real fault and we need to tell the Guest. 387 - * 388 - * There's a corner case: they're trying to access memory between 389 - * pfn_limit and device_limit, which is I/O memory. In this case, we 390 - * return false and set @iomem to the physical address, so the the 391 - * Launcher can handle the instruction manually. 392 - */ 393 - bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode, 394 - unsigned long *iomem) 395 - { 396 - unsigned long gpte_ptr; 397 - pte_t gpte; 398 - pte_t *spte; 399 - pmd_t gpmd; 400 - pgd_t gpgd; 401 - 402 - *iomem = 0; 403 - 404 - /* We never demand page the Switcher, so trying is a mistake. */ 405 - if (vaddr >= switcher_addr) 406 - return false; 407 - 408 - /* First step: get the top-level Guest page table entry. */ 409 - if (unlikely(cpu->linear_pages)) { 410 - /* Faking up a linear mapping. */ 411 - gpgd = __pgd(CHECK_GPGD_MASK); 412 - } else { 413 - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); 414 - /* Toplevel not present? We can't map it in. */ 415 - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) 416 - return false; 417 - 418 - /* 419 - * This kills the Guest if it has weird flags or tries to 420 - * refer to a "physical" address outside the bounds. 421 - */ 422 - if (!check_gpgd(cpu, gpgd)) 423 - return false; 424 - } 425 - 426 - /* This "mid-level" entry is only used for non-linear, PAE mode. 
*/ 427 - gpmd = __pmd(_PAGE_TABLE); 428 - 429 - #ifdef CONFIG_X86_PAE 430 - if (likely(!cpu->linear_pages)) { 431 - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); 432 - /* Middle level not present? We can't map it in. */ 433 - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) 434 - return false; 435 - 436 - /* 437 - * This kills the Guest if it has weird flags or tries to 438 - * refer to a "physical" address outside the bounds. 439 - */ 440 - if (!check_gpmd(cpu, gpmd)) 441 - return false; 442 - } 443 - 444 - /* 445 - * OK, now we look at the lower level in the Guest page table: keep its 446 - * address, because we might update it later. 447 - */ 448 - gpte_ptr = gpte_addr(cpu, gpmd, vaddr); 449 - #else 450 - /* 451 - * OK, now we look at the lower level in the Guest page table: keep its 452 - * address, because we might update it later. 453 - */ 454 - gpte_ptr = gpte_addr(cpu, gpgd, vaddr); 455 - #endif 456 - 457 - if (unlikely(cpu->linear_pages)) { 458 - /* Linear? Make up a PTE which points to same page. */ 459 - gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); 460 - } else { 461 - /* Read the actual PTE value. */ 462 - gpte = lgread(cpu, gpte_ptr, pte_t); 463 - } 464 - 465 - /* If this page isn't in the Guest page tables, we can't page it in. */ 466 - if (!(pte_flags(gpte) & _PAGE_PRESENT)) 467 - return false; 468 - 469 - /* 470 - * Check they're not trying to write to a page the Guest wants 471 - * read-only (bit 2 of errcode == write). 472 - */ 473 - if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 474 - return false; 475 - 476 - /* User access to a kernel-only page? (bit 3 == user access) */ 477 - if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 478 - return false; 479 - 480 - /* If they're accessing io memory, we expect a fault. 
*/ 481 - if (gpte_in_iomem(cpu, gpte)) { 482 - *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); 483 - return false; 484 - } 485 - 486 - /* 487 - * Check that the Guest PTE flags are OK, and the page number is below 488 - * the pfn_limit (ie. not mapping the Launcher binary). 489 - */ 490 - if (!check_gpte(cpu, gpte)) 491 - return false; 492 - 493 - /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 494 - gpte = pte_mkyoung(gpte); 495 - if (errcode & 2) 496 - gpte = pte_mkdirty(gpte); 497 - 498 - /* Get the pointer to the shadow PTE entry we're going to set. */ 499 - spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); 500 - if (!spte) 501 - return false; 502 - 503 - /* 504 - * If there was a valid shadow PTE entry here before, we release it. 505 - * This can happen with a write to a previously read-only entry. 506 - */ 507 - release_pte(*spte); 508 - 509 - /* 510 - * If this is a write, we insist that the Guest page is writable (the 511 - * final arg to gpte_to_spte()). 512 - */ 513 - if (pte_dirty(gpte)) 514 - *spte = gpte_to_spte(cpu, gpte, 1); 515 - else 516 - /* 517 - * If this is a read, don't set the "writable" bit in the page 518 - * table entry, even if the Guest says it's writable. That way 519 - * we will come back here when a write does actually occur, so 520 - * we can update the Guest's _PAGE_DIRTY flag. 521 - */ 522 - set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); 523 - 524 - /* 525 - * Finally, we write the Guest PTE entry back: we've set the 526 - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. 527 - */ 528 - if (likely(!cpu->linear_pages)) 529 - lgwrite(cpu, gpte_ptr, pte_t, gpte); 530 - 531 - /* 532 - * The fault is fixed, the page table is populated, the mapping 533 - * manipulated, the result returned and the code complete. A small 534 - * delay and a trace of alliteration are the only indications the Guest 535 - * has that a page fault occurred at all. 
536 - */ 537 - return true; 538 - } 539 - 540 - /*H:360 541 - * (ii) Making sure the Guest stack is mapped. 542 - * 543 - * Remember that direct traps into the Guest need a mapped Guest kernel stack. 544 - * pin_stack_pages() calls us here: we could simply call demand_page(), but as 545 - * we've seen that logic is quite long, and usually the stack pages are already 546 - * mapped, so it's overkill. 547 - * 548 - * This is a quick version which answers the question: is this virtual address 549 - * mapped by the shadow page tables, and is it writable? 550 - */ 551 - static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) 552 - { 553 - pte_t *spte; 554 - unsigned long flags; 555 - 556 - /* You can't put your stack in the Switcher! */ 557 - if (vaddr >= switcher_addr) 558 - return false; 559 - 560 - /* If there's no shadow PTE, it's not writable. */ 561 - spte = find_spte(cpu, vaddr, false, 0, 0); 562 - if (!spte) 563 - return false; 564 - 565 - /* 566 - * Check the flags on the pte entry itself: it must be present and 567 - * writable. 568 - */ 569 - flags = pte_flags(*spte); 570 - return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 571 - } 572 - 573 - /* 574 - * So, when pin_stack_pages() asks us to pin a page, we check if it's already 575 - * in the page tables, and if not, we call demand_page() with error code 2 576 - * (meaning "write"). 577 - */ 578 - void pin_page(struct lg_cpu *cpu, unsigned long vaddr) 579 - { 580 - unsigned long iomem; 581 - 582 - if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem)) 583 - kill_guest(cpu, "bad stack page %#lx", vaddr); 584 - } 585 - /*:*/ 586 - 587 - #ifdef CONFIG_X86_PAE 588 - static void release_pmd(pmd_t *spmd) 589 - { 590 - /* If the entry's not present, there's nothing to release. */ 591 - if (pmd_flags(*spmd) & _PAGE_PRESENT) { 592 - unsigned int i; 593 - pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); 594 - /* For each entry in the page, we might need to release it. 
*/ 595 - for (i = 0; i < PTRS_PER_PTE; i++) 596 - release_pte(ptepage[i]); 597 - /* Now we can free the page of PTEs */ 598 - free_page((long)ptepage); 599 - /* And zero out the PMD entry so we never release it twice. */ 600 - set_pmd(spmd, __pmd(0)); 601 - } 602 - } 603 - 604 - static void release_pgd(pgd_t *spgd) 605 - { 606 - /* If the entry's not present, there's nothing to release. */ 607 - if (pgd_flags(*spgd) & _PAGE_PRESENT) { 608 - unsigned int i; 609 - pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 610 - 611 - for (i = 0; i < PTRS_PER_PMD; i++) 612 - release_pmd(&pmdpage[i]); 613 - 614 - /* Now we can free the page of PMDs */ 615 - free_page((long)pmdpage); 616 - /* And zero out the PGD entry so we never release it twice. */ 617 - set_pgd(spgd, __pgd(0)); 618 - } 619 - } 620 - 621 - #else /* !CONFIG_X86_PAE */ 622 - /*H:450 623 - * If we chase down the release_pgd() code, the non-PAE version looks like 624 - * this. The PAE version is almost identical, but instead of calling 625 - * release_pte it calls release_pmd(), which looks much like this. 626 - */ 627 - static void release_pgd(pgd_t *spgd) 628 - { 629 - /* If the entry's not present, there's nothing to release. */ 630 - if (pgd_flags(*spgd) & _PAGE_PRESENT) { 631 - unsigned int i; 632 - /* 633 - * Converting the pfn to find the actual PTE page is easy: turn 634 - * the page number into a physical address, then convert to a 635 - * virtual address (easy for kernel pages like this one). 636 - */ 637 - pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 638 - /* For each entry in the page, we might need to release it. */ 639 - for (i = 0; i < PTRS_PER_PTE; i++) 640 - release_pte(ptepage[i]); 641 - /* Now we can free the page of PTEs */ 642 - free_page((long)ptepage); 643 - /* And zero out the PGD entry so we never release it twice. 
*/ 644 - *spgd = __pgd(0); 645 - } 646 - } 647 - #endif 648 - 649 - /*H:445 650 - * We saw flush_user_mappings() twice: once from the flush_user_mappings() 651 - * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. 652 - * It simply releases every PTE page from 0 up to the Guest's kernel address. 653 - */ 654 - static void flush_user_mappings(struct lguest *lg, int idx) 655 - { 656 - unsigned int i; 657 - /* Release every pgd entry up to the kernel's address. */ 658 - for (i = 0; i < pgd_index(lg->kernel_address); i++) 659 - release_pgd(lg->pgdirs[idx].pgdir + i); 660 - } 661 - 662 - /*H:440 663 - * (v) Flushing (throwing away) page tables, 664 - * 665 - * The Guest has a hypercall to throw away the page tables: it's used when a 666 - * large number of mappings have been changed. 667 - */ 668 - void guest_pagetable_flush_user(struct lg_cpu *cpu) 669 - { 670 - /* Drop the userspace part of the current page table. */ 671 - flush_user_mappings(cpu->lg, cpu->cpu_pgd); 672 - } 673 - /*:*/ 674 - 675 - /* We walk down the guest page tables to get a guest-physical address */ 676 - bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr) 677 - { 678 - pgd_t gpgd; 679 - pte_t gpte; 680 - #ifdef CONFIG_X86_PAE 681 - pmd_t gpmd; 682 - #endif 683 - 684 - /* Still not set up? Just map 1:1. */ 685 - if (unlikely(cpu->linear_pages)) { 686 - *paddr = vaddr; 687 - return true; 688 - } 689 - 690 - /* First step: get the top-level Guest page table entry. */ 691 - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); 692 - /* Toplevel not present? We can't map it in. 
*/ 693 - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) 694 - goto fail; 695 - 696 - #ifdef CONFIG_X86_PAE 697 - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); 698 - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) 699 - goto fail; 700 - gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); 701 - #else 702 - gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); 703 - #endif 704 - if (!(pte_flags(gpte) & _PAGE_PRESENT)) 705 - goto fail; 706 - 707 - *paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 708 - return true; 709 - 710 - fail: 711 - *paddr = -1UL; 712 - return false; 713 - } 714 - 715 - /* 716 - * This is the version we normally use: kills the Guest if it uses a 717 - * bad address 718 - */ 719 - unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) 720 - { 721 - unsigned long paddr; 722 - 723 - if (!__guest_pa(cpu, vaddr, &paddr)) 724 - kill_guest(cpu, "Bad address %#lx", vaddr); 725 - return paddr; 726 - } 727 - 728 - /* 729 - * We keep several page tables. This is a simple routine to find the page 730 - * table (if any) corresponding to this top-level address the Guest has given 731 - * us. 732 - */ 733 - static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) 734 - { 735 - unsigned int i; 736 - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 737 - if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable) 738 - break; 739 - return i; 740 - } 741 - 742 - /*H:435 743 - * And this is us, creating the new page directory. If we really do 744 - * allocate a new one (and so the kernel parts are not there), we set 745 - * blank_pgdir. 746 - */ 747 - static unsigned int new_pgdir(struct lg_cpu *cpu, 748 - unsigned long gpgdir, 749 - int *blank_pgdir) 750 - { 751 - unsigned int next; 752 - 753 - /* 754 - * We pick one entry at random to throw out. Choosing the Least 755 - * Recently Used might be better, but this is easy. 
756 - */ 757 - next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs); 758 - /* If it's never been allocated at all before, try now. */ 759 - if (!cpu->lg->pgdirs[next].pgdir) { 760 - cpu->lg->pgdirs[next].pgdir = 761 - (pgd_t *)get_zeroed_page(GFP_KERNEL); 762 - /* If the allocation fails, just keep using the one we have */ 763 - if (!cpu->lg->pgdirs[next].pgdir) 764 - next = cpu->cpu_pgd; 765 - else { 766 - /* 767 - * This is a blank page, so there are no kernel 768 - * mappings: caller must map the stack! 769 - */ 770 - *blank_pgdir = 1; 771 - } 772 - } 773 - /* Record which Guest toplevel this shadows. */ 774 - cpu->lg->pgdirs[next].gpgdir = gpgdir; 775 - /* Release all the non-kernel mappings. */ 776 - flush_user_mappings(cpu->lg, next); 777 - 778 - /* This hasn't run on any CPU at all. */ 779 - cpu->lg->pgdirs[next].last_host_cpu = -1; 780 - 781 - return next; 782 - } 783 - 784 - /*H:501 785 - * We do need the Switcher code mapped at all times, so we allocate that 786 - * part of the Guest page table here. We map the Switcher code immediately, 787 - * but defer mapping of the guest register page and IDT/LDT etc page until 788 - * just before we run the guest in map_switcher_in_guest(). 789 - * 790 - * We *could* do this setup in map_switcher_in_guest(), but at that point 791 - * we've interrupts disabled, and allocating pages like that is fraught: we 792 - * can't sleep if we need to free up some memory. 793 - */ 794 - static bool allocate_switcher_mapping(struct lg_cpu *cpu) 795 - { 796 - int i; 797 - 798 - for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { 799 - pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, 800 - CHECK_GPGD_MASK, _PAGE_TABLE); 801 - if (!pte) 802 - return false; 803 - 804 - /* 805 - * Map the switcher page if not already there. It might 806 - * already be there because we call allocate_switcher_mapping() 807 - * in guest_set_pgd() just in case it did discard our Switcher 808 - * mapping, but it probably didn't. 
809 - */ 810 - if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { 811 - /* Get a reference to the Switcher page. */ 812 - get_page(lg_switcher_pages[0]); 813 - /* Create a read-only, exectuable, kernel-style PTE */ 814 - set_pte(pte, 815 - mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); 816 - } 817 - } 818 - cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; 819 - return true; 820 - } 821 - 822 - /*H:470 823 - * Finally, a routine which throws away everything: all PGD entries in all 824 - * the shadow page tables, including the Guest's kernel mappings. This is used 825 - * when we destroy the Guest. 826 - */ 827 - static void release_all_pagetables(struct lguest *lg) 828 - { 829 - unsigned int i, j; 830 - 831 - /* Every shadow pagetable this Guest has */ 832 - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { 833 - if (!lg->pgdirs[i].pgdir) 834 - continue; 835 - 836 - /* Every PGD entry. */ 837 - for (j = 0; j < PTRS_PER_PGD; j++) 838 - release_pgd(lg->pgdirs[i].pgdir + j); 839 - lg->pgdirs[i].switcher_mapped = false; 840 - lg->pgdirs[i].last_host_cpu = -1; 841 - } 842 - } 843 - 844 - /* 845 - * We also throw away everything when a Guest tells us it's changed a kernel 846 - * mapping. Since kernel mappings are in every page table, it's easiest to 847 - * throw them all away. This traps the Guest in amber for a while as 848 - * everything faults back in, but it's rare. 849 - */ 850 - void guest_pagetable_clear_all(struct lg_cpu *cpu) 851 - { 852 - release_all_pagetables(cpu->lg); 853 - /* We need the Guest kernel stack mapped again. */ 854 - pin_stack_pages(cpu); 855 - /* And we need Switcher allocated. */ 856 - if (!allocate_switcher_mapping(cpu)) 857 - kill_guest(cpu, "Cannot populate switcher mapping"); 858 - } 859 - 860 - /*H:430 861 - * (iv) Switching page tables 862 - * 863 - * Now we've seen all the page table setting and manipulation, let's see 864 - * what happens when the Guest changes page tables (ie. changes the top-level 865 - * pgdir). 
This occurs on almost every context switch. 866 - */ 867 - void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) 868 - { 869 - int newpgdir, repin = 0; 870 - 871 - /* 872 - * The very first time they call this, we're actually running without 873 - * any page tables; we've been making it up. Throw them away now. 874 - */ 875 - if (unlikely(cpu->linear_pages)) { 876 - release_all_pagetables(cpu->lg); 877 - cpu->linear_pages = false; 878 - /* Force allocation of a new pgdir. */ 879 - newpgdir = ARRAY_SIZE(cpu->lg->pgdirs); 880 - } else { 881 - /* Look to see if we have this one already. */ 882 - newpgdir = find_pgdir(cpu->lg, pgtable); 883 - } 884 - 885 - /* 886 - * If not, we allocate or mug an existing one: if it's a fresh one, 887 - * repin gets set to 1. 888 - */ 889 - if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) 890 - newpgdir = new_pgdir(cpu, pgtable, &repin); 891 - /* Change the current pgd index to the new one. */ 892 - cpu->cpu_pgd = newpgdir; 893 - /* 894 - * If it was completely blank, we map in the Guest kernel stack and 895 - * the Switcher. 896 - */ 897 - if (repin) 898 - pin_stack_pages(cpu); 899 - 900 - if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { 901 - if (!allocate_switcher_mapping(cpu)) 902 - kill_guest(cpu, "Cannot populate switcher mapping"); 903 - } 904 - } 905 - /*:*/ 906 - 907 - /*M:009 908 - * Since we throw away all mappings when a kernel mapping changes, our 909 - * performance sucks for guests using highmem. In fact, a guest with 910 - * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is 911 - * usually slower than a Guest with less memory. 912 - * 913 - * This, of course, cannot be fixed. It would take some kind of... well, I 914 - * don't know, but the term "puissant code-fu" comes to mind. 915 - :*/ 916 - 917 - /*H:420 918 - * This is the routine which actually sets the page table entry for then 919 - * "idx"'th shadow page table. 
920 - * 921 - * Normally, we can just throw out the old entry and replace it with 0: if they 922 - * use it demand_page() will put the new entry in. We need to do this anyway: 923 - * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page 924 - * is read from, and _PAGE_DIRTY when it's written to. 925 - * 926 - * But Avi Kivity pointed out that most Operating Systems (Linux included) set 927 - * these bits on PTEs immediately anyway. This is done to save the CPU from 928 - * having to update them, but it helps us the same way: if they set 929 - * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if 930 - * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. 931 - */ 932 - static void __guest_set_pte(struct lg_cpu *cpu, int idx, 933 - unsigned long vaddr, pte_t gpte) 934 - { 935 - /* Look up the matching shadow page directory entry. */ 936 - pgd_t *spgd = spgd_addr(cpu, idx, vaddr); 937 - #ifdef CONFIG_X86_PAE 938 - pmd_t *spmd; 939 - #endif 940 - 941 - /* If the top level isn't present, there's no entry to update. */ 942 - if (pgd_flags(*spgd) & _PAGE_PRESENT) { 943 - #ifdef CONFIG_X86_PAE 944 - spmd = spmd_addr(cpu, *spgd, vaddr); 945 - if (pmd_flags(*spmd) & _PAGE_PRESENT) { 946 - #endif 947 - /* Otherwise, start by releasing the existing entry. */ 948 - pte_t *spte = spte_addr(cpu, *spgd, vaddr); 949 - release_pte(*spte); 950 - 951 - /* 952 - * If they're setting this entry as dirty or accessed, 953 - * we might as well put that entry they've given us in 954 - * now. This shaves 10% off a copy-on-write 955 - * micro-benchmark. 956 - */ 957 - if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) 958 - && !gpte_in_iomem(cpu, gpte)) { 959 - if (!check_gpte(cpu, gpte)) 960 - return; 961 - set_pte(spte, 962 - gpte_to_spte(cpu, gpte, 963 - pte_flags(gpte) & _PAGE_DIRTY)); 964 - } else { 965 - /* 966 - * Otherwise kill it and we can demand_page() 967 - * it in later. 
968 - */ 969 - set_pte(spte, __pte(0)); 970 - } 971 - #ifdef CONFIG_X86_PAE 972 - } 973 - #endif 974 - } 975 - } 976 - 977 - /*H:410 978 - * Updating a PTE entry is a little trickier. 979 - * 980 - * We keep track of several different page tables (the Guest uses one for each 981 - * process, so it makes sense to cache at least a few). Each of these have 982 - * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for 983 - * all processes. So when the page table above that address changes, we update 984 - * all the page tables, not just the current one. This is rare. 985 - * 986 - * The benefit is that when we have to track a new page table, we can keep all 987 - * the kernel mappings. This speeds up context switch immensely. 988 - */ 989 - void guest_set_pte(struct lg_cpu *cpu, 990 - unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 991 - { 992 - /* We don't let you remap the Switcher; we need it to get back! */ 993 - if (vaddr >= switcher_addr) { 994 - kill_guest(cpu, "attempt to set pte into Switcher pages"); 995 - return; 996 - } 997 - 998 - /* 999 - * Kernel mappings must be changed on all top levels. Slow, but doesn't 1000 - * happen often. 1001 - */ 1002 - if (vaddr >= cpu->lg->kernel_address) { 1003 - unsigned int i; 1004 - for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) 1005 - if (cpu->lg->pgdirs[i].pgdir) 1006 - __guest_set_pte(cpu, i, vaddr, gpte); 1007 - } else { 1008 - /* Is this page table one we have a shadow for? */ 1009 - int pgdir = find_pgdir(cpu->lg, gpgdir); 1010 - if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs)) 1011 - /* If so, do the update. */ 1012 - __guest_set_pte(cpu, pgdir, vaddr, gpte); 1013 - } 1014 - } 1015 - 1016 - /*H:400 1017 - * (iii) Setting up a page table entry when the Guest tells us one has changed. 
1018 - * 1019 - * Just like we did in interrupts_and_traps.c, it makes sense for us to deal 1020 - * with the other side of page tables while we're here: what happens when the 1021 - * Guest asks for a page table to be updated? 1022 - * 1023 - * We already saw that demand_page() will fill in the shadow page tables when 1024 - * needed, so we can simply remove shadow page table entries whenever the Guest 1025 - * tells us they've changed. When the Guest tries to use the new entry it will 1026 - * fault and demand_page() will fix it up. 1027 - * 1028 - * So with that in mind here's our code to update a (top-level) PGD entry: 1029 - */ 1030 - void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) 1031 - { 1032 - int pgdir; 1033 - 1034 - if (idx > PTRS_PER_PGD) { 1035 - kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", 1036 - idx, PTRS_PER_PGD); 1037 - return; 1038 - } 1039 - 1040 - /* If they're talking about a page table we have a shadow for... */ 1041 - pgdir = find_pgdir(lg, gpgdir); 1042 - if (pgdir < ARRAY_SIZE(lg->pgdirs)) { 1043 - /* ... throw it away. */ 1044 - release_pgd(lg->pgdirs[pgdir].pgdir + idx); 1045 - /* That might have been the Switcher mapping, remap it. */ 1046 - if (!allocate_switcher_mapping(&lg->cpus[0])) { 1047 - kill_guest(&lg->cpus[0], 1048 - "Cannot populate switcher mapping"); 1049 - } 1050 - lg->pgdirs[pgdir].last_host_cpu = -1; 1051 - } 1052 - } 1053 - 1054 - #ifdef CONFIG_X86_PAE 1055 - /* For setting a mid-level, we just throw everything away. It's easy. */ 1056 - void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) 1057 - { 1058 - guest_pagetable_clear_all(&lg->cpus[0]); 1059 - } 1060 - #endif 1061 - 1062 - /*H:500 1063 - * (vii) Setting up the page tables initially. 1064 - * 1065 - * When a Guest is first created, set initialize a shadow page table which 1066 - * we will populate on future faults. 
The Guest doesn't have any actual 1067 - * pagetables yet, so we set linear_pages to tell demand_page() to fake it 1068 - * for the moment. 1069 - * 1070 - * We do need the Switcher to be mapped at all times, so we allocate that 1071 - * part of the Guest page table here. 1072 - */ 1073 - int init_guest_pagetable(struct lguest *lg) 1074 - { 1075 - struct lg_cpu *cpu = &lg->cpus[0]; 1076 - int allocated = 0; 1077 - 1078 - /* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */ 1079 - cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated); 1080 - if (!allocated) 1081 - return -ENOMEM; 1082 - 1083 - /* We start with a linear mapping until the initialize. */ 1084 - cpu->linear_pages = true; 1085 - 1086 - /* Allocate the page tables for the Switcher. */ 1087 - if (!allocate_switcher_mapping(cpu)) { 1088 - release_all_pagetables(lg); 1089 - return -ENOMEM; 1090 - } 1091 - 1092 - return 0; 1093 - } 1094 - 1095 - /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1096 - void page_table_guest_data_init(struct lg_cpu *cpu) 1097 - { 1098 - /* 1099 - * We tell the Guest that it can't use the virtual addresses 1100 - * used by the Switcher. This trick is equivalent to 4GB - 1101 - * switcher_addr. 1102 - */ 1103 - u32 top = ~switcher_addr + 1; 1104 - 1105 - /* We get the kernel address: above this is all kernel memory. */ 1106 - if (get_user(cpu->lg->kernel_address, 1107 - &cpu->lg->lguest_data->kernel_address) 1108 - /* 1109 - * We tell the Guest that it can't use the top virtual 1110 - * addresses (used by the Switcher). 1111 - */ 1112 - || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { 1113 - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 1114 - return; 1115 - } 1116 - 1117 - /* 1118 - * In flush_user_mappings() we loop from 0 to 1119 - * "pgd_index(lg->kernel_address)". This assumes it won't hit the 1120 - * Switcher mappings, so check that now. 
1121 - */ 1122 - if (cpu->lg->kernel_address >= switcher_addr) 1123 - kill_guest(cpu, "bad kernel address %#lx", 1124 - cpu->lg->kernel_address); 1125 - } 1126 - 1127 - /* When a Guest dies, our cleanup is fairly simple. */ 1128 - void free_guest_pagetable(struct lguest *lg) 1129 - { 1130 - unsigned int i; 1131 - 1132 - /* Throw away all page table pages. */ 1133 - release_all_pagetables(lg); 1134 - /* Now free the top levels: free_page() can handle 0 just fine. */ 1135 - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 1136 - free_page((long)lg->pgdirs[i].pgdir); 1137 - } 1138 - 1139 - /*H:481 1140 - * This clears the Switcher mappings for cpu #i. 1141 - */ 1142 - static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) 1143 - { 1144 - unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; 1145 - pte_t *pte; 1146 - 1147 - /* Clear the mappings for both pages. */ 1148 - pte = find_spte(cpu, base, false, 0, 0); 1149 - release_pte(*pte); 1150 - set_pte(pte, __pte(0)); 1151 - 1152 - pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); 1153 - release_pte(*pte); 1154 - set_pte(pte, __pte(0)); 1155 - } 1156 - 1157 - /*H:480 1158 - * (vi) Mapping the Switcher when the Guest is about to run. 1159 - * 1160 - * The Switcher and the two pages for this CPU need to be visible in the Guest 1161 - * (and not the pages for other CPUs). 1162 - * 1163 - * The pages for the pagetables have all been allocated before: we just need 1164 - * to make sure the actual PTEs are up-to-date for the CPU we're about to run 1165 - * on. 1166 - */ 1167 - void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) 1168 - { 1169 - unsigned long base; 1170 - struct page *percpu_switcher_page, *regs_page; 1171 - pte_t *pte; 1172 - struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; 1173 - 1174 - /* Switcher page should always be mapped by now! 
*/ 1175 - BUG_ON(!pgdir->switcher_mapped); 1176 - 1177 - /* 1178 - * Remember that we have two pages for each Host CPU, so we can run a 1179 - * Guest on each CPU without them interfering. We need to make sure 1180 - * those pages are mapped correctly in the Guest, but since we usually 1181 - * run on the same CPU, we cache that, and only update the mappings 1182 - * when we move. 1183 - */ 1184 - if (pgdir->last_host_cpu == raw_smp_processor_id()) 1185 - return; 1186 - 1187 - /* -1 means unknown so we remove everything. */ 1188 - if (pgdir->last_host_cpu == -1) { 1189 - unsigned int i; 1190 - for_each_possible_cpu(i) 1191 - remove_switcher_percpu_map(cpu, i); 1192 - } else { 1193 - /* We know exactly what CPU mapping to remove. */ 1194 - remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); 1195 - } 1196 - 1197 - /* 1198 - * When we're running the Guest, we want the Guest's "regs" page to 1199 - * appear where the first Switcher page for this CPU is. This is an 1200 - * optimization: when the Switcher saves the Guest registers, it saves 1201 - * them into the first page of this CPU's "struct lguest_pages": if we 1202 - * make sure the Guest's register page is already mapped there, we 1203 - * don't have to copy them out again. 1204 - */ 1205 - /* Find the shadow PTE for this regs page. */ 1206 - base = switcher_addr + PAGE_SIZE 1207 - + raw_smp_processor_id() * sizeof(struct lguest_pages); 1208 - pte = find_spte(cpu, base, false, 0, 0); 1209 - regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); 1210 - get_page(regs_page); 1211 - set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL))); 1212 - 1213 - /* 1214 - * We map the second page of the struct lguest_pages read-only in 1215 - * the Guest: the IDT, GDT and other things it's not supposed to 1216 - * change. 
1217 - */ 1218 - pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); 1219 - percpu_switcher_page 1220 - = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; 1221 - get_page(percpu_switcher_page); 1222 - set_pte(pte, mk_pte(percpu_switcher_page, 1223 - __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); 1224 - 1225 - pgdir->last_host_cpu = raw_smp_processor_id(); 1226 - } 1227 - 1228 - /*H:490 1229 - * We've made it through the page table code. Perhaps our tired brains are 1230 - * still processing the details, or perhaps we're simply glad it's over. 1231 - * 1232 - * If nothing else, note that all this complexity in juggling shadow page tables 1233 - * in sync with the Guest's page tables is for one reason: for most Guests this 1234 - * page table dance determines how bad performance will be. This is why Xen 1235 - * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD 1236 - * have implemented shadow page table support directly into hardware. 1237 - * 1238 - * There is just one file remaining in the Host. 1239 - */

-228

drivers/lguest/segments.c

··· 1 - /*P:600 2 - * The x86 architecture has segments, which involve a table of descriptors 3 - * which can be used to do funky things with virtual address interpretation. 4 - * We originally used to use segments so the Guest couldn't alter the 5 - * Guest<->Host Switcher, and then we had to trim Guest segments, and restore 6 - * for userspace per-thread segments, but trim again for on userspace->kernel 7 - * transitions... This nightmarish creation was contained within this file, 8 - * where we knew not to tread without heavy armament and a change of underwear. 9 - * 10 - * In these modern times, the segment handling code consists of simple sanity 11 - * checks, and the worst you'll experience reading this code is butterfly-rash 12 - * from frolicking through its parklike serenity. 13 - :*/ 14 - #include "lg.h" 15 - 16 - /*H:600 17 - * Segments & The Global Descriptor Table 18 - * 19 - * (That title sounds like a bad Nerdcore group. Not to suggest that there are 20 - * any good Nerdcore groups, but in high school a friend of mine had a band 21 - * called Joe Fish and the Chips, so there are definitely worse band names). 22 - * 23 - * To refresh: the GDT is a table of 8-byte values describing segments. Once 24 - * set up, these segments can be loaded into one of the 6 "segment registers". 25 - * 26 - * GDT entries are passed around as "struct desc_struct"s, which like IDT 27 - * entries are split into two 32-bit members, "a" and "b". One day, someone 28 - * will clean that up, and be declared a Hero. (No pressure, I'm just saying). 29 - * 30 - * Anyway, the GDT entry contains a base (the start address of the segment), a 31 - * limit (the size of the segment - 1), and some flags. Sounds simple, and it 32 - * would be, except those zany Intel engineers decided that it was too boring 33 - * to put the base at one end, the limit at the other, and the flags in 34 - * between. 
They decided to shotgun the bits at random throughout the 8 bytes, 35 - * like so: 36 - * 37 - * 0 16 40 48 52 56 63 38 - * [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ] 39 - * mit ags part 2 40 - * part 2 41 - * 42 - * As a result, this file contains a certain amount of magic numeracy. Let's 43 - * begin. 44 - */ 45 - 46 - /* 47 - * There are several entries we don't let the Guest set. The TSS entry is the 48 - * "Task State Segment" which controls all kinds of delicate things. The 49 - * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and 50 - * the Guest can't be trusted to deal with double faults. 51 - */ 52 - static bool ignored_gdt(unsigned int num) 53 - { 54 - return (num == GDT_ENTRY_TSS 55 - || num == GDT_ENTRY_LGUEST_CS 56 - || num == GDT_ENTRY_LGUEST_DS 57 - || num == GDT_ENTRY_DOUBLEFAULT_TSS); 58 - } 59 - 60 - /*H:630 61 - * Once the Guest gave us new GDT entries, we fix them up a little. We 62 - * don't care if they're invalid: the worst that can happen is a General 63 - * Protection Fault in the Switcher when it restores a Guest segment register 64 - * which tries to use that entry. Then we kill the Guest for causing such a 65 - * mess: the message will be "unhandled trap 256". 66 - */ 67 - static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) 68 - { 69 - unsigned int i; 70 - 71 - for (i = start; i < end; i++) { 72 - /* 73 - * We never copy these ones to real GDT, so we don't care what 74 - * they say 75 - */ 76 - if (ignored_gdt(i)) 77 - continue; 78 - 79 - /* 80 - * Segment descriptors contain a privilege level: the Guest is 81 - * sometimes careless and leaves this as 0, even though it's 82 - * running at privilege level 1. If so, we fix it here. 83 - */ 84 - if (cpu->arch.gdt[i].dpl == 0) 85 - cpu->arch.gdt[i].dpl |= GUEST_PL; 86 - 87 - /* 88 - * Each descriptor has an "accessed" bit. 
If we don't set it 89 - * now, the CPU will try to set it when the Guest first loads 90 - * that entry into a segment register. But the GDT isn't 91 - * writable by the Guest, so bad things can happen. 92 - */ 93 - cpu->arch.gdt[i].type |= 0x1; 94 - } 95 - } 96 - 97 - /*H:610 98 - * Like the IDT, we never simply use the GDT the Guest gives us. We keep 99 - * a GDT for each CPU, and copy across the Guest's entries each time we want to 100 - * run the Guest on that CPU. 101 - * 102 - * This routine is called at boot or modprobe time for each CPU to set up the 103 - * constant GDT entries: the ones which are the same no matter what Guest we're 104 - * running. 105 - */ 106 - void setup_default_gdt_entries(struct lguest_ro_state *state) 107 - { 108 - struct desc_struct *gdt = state->guest_gdt; 109 - unsigned long tss = (unsigned long)&state->guest_tss; 110 - 111 - /* The Switcher segments are full 0-4G segments, privilege level 0 */ 112 - gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 113 - gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 114 - 115 - /* 116 - * The TSS segment refers to the TSS entry for this particular CPU. 117 - */ 118 - gdt[GDT_ENTRY_TSS].a = 0; 119 - gdt[GDT_ENTRY_TSS].b = 0; 120 - 121 - gdt[GDT_ENTRY_TSS].limit0 = 0x67; 122 - gdt[GDT_ENTRY_TSS].base0 = tss & 0xFFFF; 123 - gdt[GDT_ENTRY_TSS].base1 = (tss >> 16) & 0xFF; 124 - gdt[GDT_ENTRY_TSS].base2 = tss >> 24; 125 - gdt[GDT_ENTRY_TSS].type = 0x9; /* 32-bit TSS (available) */ 126 - gdt[GDT_ENTRY_TSS].p = 0x1; /* Entry is present */ 127 - gdt[GDT_ENTRY_TSS].dpl = 0x0; /* Privilege level 0 */ 128 - gdt[GDT_ENTRY_TSS].s = 0x0; /* system segment */ 129 - 130 - } 131 - 132 - /* 133 - * This routine sets up the initial Guest GDT for booting. All entries start 134 - * as 0 (unusable). 135 - */ 136 - void setup_guest_gdt(struct lg_cpu *cpu) 137 - { 138 - /* 139 - * Start with full 0-4G segments...except the Guest is allowed to use 140 - * them, so set the privilege level appropriately in the flags. 
141 - */ 142 - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 143 - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 144 - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL; 145 - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL; 146 - } 147 - 148 - /*H:650 149 - * An optimization of copy_gdt(), for just the three "thread-local storage" 150 - * entries. 151 - */ 152 - void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) 153 - { 154 - unsigned int i; 155 - 156 - for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) 157 - gdt[i] = cpu->arch.gdt[i]; 158 - } 159 - 160 - /*H:640 161 - * When the Guest is run on a different CPU, or the GDT entries have changed, 162 - * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's 163 - * GDT. 164 - */ 165 - void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) 166 - { 167 - unsigned int i; 168 - 169 - /* 170 - * The default entries from setup_default_gdt_entries() are not 171 - * replaced. See ignored_gdt() above. 172 - */ 173 - for (i = 0; i < GDT_ENTRIES; i++) 174 - if (!ignored_gdt(i)) 175 - gdt[i] = cpu->arch.gdt[i]; 176 - } 177 - 178 - /*H:620 179 - * This is where the Guest asks us to load a new GDT entry 180 - * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. 181 - */ 182 - void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) 183 - { 184 - /* 185 - * We assume the Guest has the same number of GDT entries as the 186 - * Host, otherwise we'd have to dynamically allocate the Guest GDT. 187 - */ 188 - if (num >= ARRAY_SIZE(cpu->arch.gdt)) { 189 - kill_guest(cpu, "too many gdt entries %i", num); 190 - return; 191 - } 192 - 193 - /* Set it up, then fix it. */ 194 - cpu->arch.gdt[num].a = lo; 195 - cpu->arch.gdt[num].b = hi; 196 - fixup_gdt_table(cpu, num, num+1); 197 - /* 198 - * Mark that the GDT changed so the core knows it has to copy it again, 199 - * even if the Guest is run on the same CPU. 
200 - */ 201 - cpu->changed |= CHANGED_GDT; 202 - } 203 - 204 - /* 205 - * This is the fast-track version for just changing the three TLS entries. 206 - * Remember that this happens on every context switch, so it's worth 207 - * optimizing. But wouldn't it be neater to have a single hypercall to cover 208 - * both cases? 209 - */ 210 - void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) 211 - { 212 - struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; 213 - 214 - __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 215 - fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 216 - /* Note that just the TLS entries have changed. */ 217 - cpu->changed |= CHANGED_GDT_TLS; 218 - } 219 - 220 - /*H:660 221 - * With this, we have finished the Host. 222 - * 223 - * Five of the seven parts of our task are complete. You have made it through 224 - * the Bit of Despair (I think that's somewhere in the page table code, 225 - * myself). 226 - * 227 - * Next, we examine "make Switcher". It's short, but intense. 228 - */

-724

drivers/lguest/x86/core.c

··· 1 - /* 2 - * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 3 - * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI. 4 - * 5 - * This program is free software; you can redistribute it and/or modify 6 - * it under the terms of the GNU General Public License as published by 7 - * the Free Software Foundation; either version 2 of the License, or 8 - * (at your option) any later version. 9 - * 10 - * This program is distributed in the hope that it will be useful, but 11 - * WITHOUT ANY WARRANTY; without even the implied warranty of 12 - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 13 - * NON INFRINGEMENT. See the GNU General Public License for more 14 - * details. 15 - * 16 - * You should have received a copy of the GNU General Public License 17 - * along with this program; if not, write to the Free Software 18 - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 - */ 20 - /*P:450 21 - * This file contains the x86-specific lguest code. It used to be all 22 - * mixed in with drivers/lguest/core.c but several foolhardy code slashers 23 - * wrestled most of the dependencies out to here in preparation for porting 24 - * lguest to other architectures (see what I mean by foolhardy?). 25 - * 26 - * This also contains a couple of non-obvious setup and teardown pieces which 27 - * were implemented after days of debugging pain. 
28 - :*/ 29 - #include <linux/kernel.h> 30 - #include <linux/start_kernel.h> 31 - #include <linux/string.h> 32 - #include <linux/console.h> 33 - #include <linux/screen_info.h> 34 - #include <linux/irq.h> 35 - #include <linux/interrupt.h> 36 - #include <linux/clocksource.h> 37 - #include <linux/clockchips.h> 38 - #include <linux/cpu.h> 39 - #include <linux/lguest.h> 40 - #include <linux/lguest_launcher.h> 41 - #include <asm/paravirt.h> 42 - #include <asm/param.h> 43 - #include <asm/page.h> 44 - #include <asm/pgtable.h> 45 - #include <asm/desc.h> 46 - #include <asm/setup.h> 47 - #include <asm/lguest.h> 48 - #include <linux/uaccess.h> 49 - #include <asm/fpu/internal.h> 50 - #include <asm/tlbflush.h> 51 - #include "../lg.h" 52 - 53 - static int cpu_had_pge; 54 - 55 - static struct { 56 - unsigned long offset; 57 - unsigned short segment; 58 - } lguest_entry; 59 - 60 - /* Offset from where switcher.S was compiled to where we've copied it */ 61 - static unsigned long switcher_offset(void) 62 - { 63 - return switcher_addr - (unsigned long)start_switcher_text; 64 - } 65 - 66 - /* This cpu's struct lguest_pages (after the Switcher text page) */ 67 - static struct lguest_pages *lguest_pages(unsigned int cpu) 68 - { 69 - return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]); 70 - } 71 - 72 - static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); 73 - 74 - /*S:010 75 - * We approach the Switcher. 76 - * 77 - * Remember that each CPU has two pages which are visible to the Guest when it 78 - * runs on that CPU. This has to contain the state for that Guest: we copy the 79 - * state in just before we run the Guest. 80 - * 81 - * Each Guest has "changed" flags which indicate what has changed in the Guest 82 - * since it last ran. We saw this set in interrupts_and_traps.c and 83 - * segments.c. 84 - */ 85 - static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) 86 - { 87 - /* 88 - * Copying all this data can be quite expensive. 
We usually run the 89 - * same Guest we ran last time (and that Guest hasn't run anywhere else 90 - * meanwhile). If that's not the case, we pretend everything in the 91 - * Guest has changed. 92 - */ 93 - if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) { 94 - __this_cpu_write(lg_last_cpu, cpu); 95 - cpu->last_pages = pages; 96 - cpu->changed = CHANGED_ALL; 97 - } 98 - 99 - /* 100 - * These copies are pretty cheap, so we do them unconditionally: */ 101 - /* Save the current Host top-level page directory. 102 - */ 103 - pages->state.host_cr3 = __pa(current->mm->pgd); 104 - /* 105 - * Set up the Guest's page tables to see this CPU's pages (and no 106 - * other CPU's pages). 107 - */ 108 - map_switcher_in_guest(cpu, pages); 109 - /* 110 - * Set up the two "TSS" members which tell the CPU what stack to use 111 - * for traps which go directly into the Guest (ie. traps at privilege 112 - * level 1). 113 - */ 114 - pages->state.guest_tss.sp1 = cpu->esp1; 115 - pages->state.guest_tss.ss1 = cpu->ss1; 116 - 117 - /* Copy direct-to-Guest trap entries. */ 118 - if (cpu->changed & CHANGED_IDT) 119 - copy_traps(cpu, pages->state.guest_idt, default_idt_entries); 120 - 121 - /* Copy all GDT entries which the Guest can change. */ 122 - if (cpu->changed & CHANGED_GDT) 123 - copy_gdt(cpu, pages->state.guest_gdt); 124 - /* If only the TLS entries have changed, copy them. */ 125 - else if (cpu->changed & CHANGED_GDT_TLS) 126 - copy_gdt_tls(cpu, pages->state.guest_gdt); 127 - 128 - /* Mark the Guest as unchanged for next time. */ 129 - cpu->changed = 0; 130 - } 131 - 132 - /* Finally: the code to actually call into the Switcher to run the Guest. */ 133 - static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) 134 - { 135 - /* This is a dummy value we need for GCC's sake. */ 136 - unsigned int clobber; 137 - 138 - /* 139 - * Copy the guest-specific information into this CPU's "struct 140 - * lguest_pages". 
141 - */ 142 - copy_in_guest_info(cpu, pages); 143 - 144 - /* 145 - * Set the trap number to 256 (impossible value). If we fault while 146 - * switching to the Guest (bad segment registers or bug), this will 147 - * cause us to abort the Guest. 148 - */ 149 - cpu->regs->trapnum = 256; 150 - 151 - /* 152 - * Now: we push the "eflags" register on the stack, then do an "lcall". 153 - * This is how we change from using the kernel code segment to using 154 - * the dedicated lguest code segment, as well as jumping into the 155 - * Switcher. 156 - * 157 - * The lcall also pushes the old code segment (KERNEL_CS) onto the 158 - * stack, then the address of this call. This stack layout happens to 159 - * exactly match the stack layout created by an interrupt... 160 - */ 161 - asm volatile("pushf; lcall *%4" 162 - /* 163 - * This is how we tell GCC that %eax ("a") and %ebx ("b") 164 - * are changed by this routine. The "=" means output. 165 - */ 166 - : "=a"(clobber), "=b"(clobber) 167 - /* 168 - * %eax contains the pages pointer. ("0" refers to the 169 - * 0-th argument above, ie "a"). %ebx contains the 170 - * physical address of the Guest's top-level page 171 - * directory. 172 - */ 173 - : "0"(pages), 174 - "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)), 175 - "m"(lguest_entry) 176 - /* 177 - * We tell gcc that all these registers could change, 178 - * which means we don't have to save and restore them in 179 - * the Switcher. 
180 - */ 181 - : "memory", "%edx", "%ecx", "%edi", "%esi"); 182 - } 183 - /*:*/ 184 - 185 - unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any) 186 - { 187 - switch (reg_off) { 188 - case offsetof(struct pt_regs, bx): 189 - return &cpu->regs->ebx; 190 - case offsetof(struct pt_regs, cx): 191 - return &cpu->regs->ecx; 192 - case offsetof(struct pt_regs, dx): 193 - return &cpu->regs->edx; 194 - case offsetof(struct pt_regs, si): 195 - return &cpu->regs->esi; 196 - case offsetof(struct pt_regs, di): 197 - return &cpu->regs->edi; 198 - case offsetof(struct pt_regs, bp): 199 - return &cpu->regs->ebp; 200 - case offsetof(struct pt_regs, ax): 201 - return &cpu->regs->eax; 202 - case offsetof(struct pt_regs, ip): 203 - return &cpu->regs->eip; 204 - case offsetof(struct pt_regs, sp): 205 - return &cpu->regs->esp; 206 - } 207 - 208 - /* Launcher can read these, but we don't allow any setting. */ 209 - if (any) { 210 - switch (reg_off) { 211 - case offsetof(struct pt_regs, ds): 212 - return &cpu->regs->ds; 213 - case offsetof(struct pt_regs, es): 214 - return &cpu->regs->es; 215 - case offsetof(struct pt_regs, fs): 216 - return &cpu->regs->fs; 217 - case offsetof(struct pt_regs, gs): 218 - return &cpu->regs->gs; 219 - case offsetof(struct pt_regs, cs): 220 - return &cpu->regs->cs; 221 - case offsetof(struct pt_regs, flags): 222 - return &cpu->regs->eflags; 223 - case offsetof(struct pt_regs, ss): 224 - return &cpu->regs->ss; 225 - } 226 - } 227 - 228 - return NULL; 229 - } 230 - 231 - /*M:002 232 - * There are hooks in the scheduler which we can register to tell when we 233 - * get kicked off the CPU (preempt_notifier_register()). This would allow us 234 - * to lazily disable SYSENTER which would regain some performance, and should 235 - * also simplify copy_in_guest_info(). Note that we'd still need to restore 236 - * things when we exit to Launcher userspace, but that's fairly easy. 
237 - * 238 - * We could also try using these hooks for PGE, but that might be too expensive. 239 - * 240 - * The hooks were designed for KVM, but we can also put them to good use. 241 - :*/ 242 - 243 - /*H:040 244 - * This is the i386-specific code to setup and run the Guest. Interrupts 245 - * are disabled: we own the CPU. 246 - */ 247 - void lguest_arch_run_guest(struct lg_cpu *cpu) 248 - { 249 - /* 250 - * SYSENTER is an optimized way of doing system calls. We can't allow 251 - * it because it always jumps to privilege level 0. A normal Guest 252 - * won't try it because we don't advertise it in CPUID, but a malicious 253 - * Guest (or malicious Guest userspace program) could, so we tell the 254 - * CPU to disable it before running the Guest. 255 - */ 256 - if (boot_cpu_has(X86_FEATURE_SEP)) 257 - wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 258 - 259 - /* 260 - * Now we actually run the Guest. It will return when something 261 - * interesting happens, and we can examine its registers to see what it 262 - * was doing. 263 - */ 264 - run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); 265 - 266 - /* 267 - * Note that the "regs" structure contains two extra entries which are 268 - * not really registers: a trap number which says what interrupt or 269 - * trap made the switcher code come back, and an error code which some 270 - * traps set. 271 - */ 272 - 273 - /* Restore SYSENTER if it's supposed to be on. */ 274 - if (boot_cpu_has(X86_FEATURE_SEP)) 275 - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 276 - 277 - /* 278 - * If the Guest page faulted, then the cr2 register will tell us the 279 - * bad virtual address. We have to grab this now, because once we 280 - * re-enable interrupts an interrupt could fault and thus overwrite 281 - * cr2, or we could even move off to a different CPU. 
282 - */ 283 - if (cpu->regs->trapnum == 14) 284 - cpu->arch.last_pagefault = read_cr2(); 285 - /* 286 - * Similarly, if we took a trap because the Guest used the FPU, 287 - * we have to restore the FPU it expects to see. 288 - * fpu__restore() may sleep and we may even move off to 289 - * a different CPU. So all the critical stuff should be done 290 - * before this. 291 - */ 292 - else if (cpu->regs->trapnum == 7 && !fpregs_active()) 293 - fpu__restore(&current->thread.fpu); 294 - } 295 - 296 - /*H:130 297 - * Now we've examined the hypercall code; our Guest can make requests. 298 - * Our Guest is usually so well behaved; it never tries to do things it isn't 299 - * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual 300 - * infrastructure isn't quite complete, because it doesn't contain replacements 301 - * for the Intel I/O instructions. As a result, the Guest sometimes fumbles 302 - * across one during the boot process as it probes for various things which are 303 - * usually attached to a PC. 304 - * 305 - * When the Guest uses one of these instructions, we get a trap (General 306 - * Protection Fault) and come here. We queue this to be sent out to the 307 - * Launcher to handle. 308 - */ 309 - 310 - /* 311 - * The eip contains the *virtual* address of the Guest's instruction: 312 - * we copy the instruction here so the Launcher doesn't have to walk 313 - * the page tables to decode it. We handle the case (eg. in a kernel 314 - * module) where the instruction is over two pages, and the pages are 315 - * virtually but not physically contiguous. 316 - * 317 - * The longest possible x86 instruction is 15 bytes, but we don't handle 318 - * anything that strange. 
319 - */ 320 - static void copy_from_guest(struct lg_cpu *cpu, 321 - void *dst, unsigned long vaddr, size_t len) 322 - { 323 - size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE); 324 - unsigned long paddr; 325 - 326 - BUG_ON(len > PAGE_SIZE); 327 - 328 - /* If it goes over a page, copy in two parts. */ 329 - if (len > to_page_end) { 330 - /* But make sure the next page is mapped! */ 331 - if (__guest_pa(cpu, vaddr + to_page_end, &paddr)) 332 - copy_from_guest(cpu, dst + to_page_end, 333 - vaddr + to_page_end, 334 - len - to_page_end); 335 - else 336 - /* Otherwise fill with zeroes. */ 337 - memset(dst + to_page_end, 0, len - to_page_end); 338 - len = to_page_end; 339 - } 340 - 341 - /* This will kill the guest if it isn't mapped, but that 342 - * shouldn't happen. */ 343 - __lgread(cpu, dst, guest_pa(cpu, vaddr), len); 344 - } 345 - 346 - 347 - static void setup_emulate_insn(struct lg_cpu *cpu) 348 - { 349 - cpu->pending.trap = 13; 350 - copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, 351 - sizeof(cpu->pending.insn)); 352 - } 353 - 354 - static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr) 355 - { 356 - cpu->pending.trap = 14; 357 - cpu->pending.addr = iomem_addr; 358 - copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, 359 - sizeof(cpu->pending.insn)); 360 - } 361 - 362 - /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ 363 - void lguest_arch_handle_trap(struct lg_cpu *cpu) 364 - { 365 - unsigned long iomem_addr; 366 - 367 - switch (cpu->regs->trapnum) { 368 - case 13: /* We've intercepted a General Protection Fault. */ 369 - /* Hand to Launcher to emulate those pesky IN and OUT insns */ 370 - if (cpu->regs->errcode == 0) { 371 - setup_emulate_insn(cpu); 372 - return; 373 - } 374 - break; 375 - case 14: /* We've intercepted a Page Fault. */ 376 - /* 377 - * The Guest accessed a virtual address that wasn't mapped. 
378 - * This happens a lot: we don't actually set up most of the page 379 - * tables for the Guest at all when we start: as it runs it asks 380 - * for more and more, and we set them up as required. In this 381 - * case, we don't even tell the Guest that the fault happened. 382 - * 383 - * The errcode tells whether this was a read or a write, and 384 - * whether kernel or userspace code. 385 - */ 386 - if (demand_page(cpu, cpu->arch.last_pagefault, 387 - cpu->regs->errcode, &iomem_addr)) 388 - return; 389 - 390 - /* Was this an access to memory mapped IO? */ 391 - if (iomem_addr) { 392 - /* Tell Launcher, let it handle it. */ 393 - setup_iomem_insn(cpu, iomem_addr); 394 - return; 395 - } 396 - 397 - /* 398 - * OK, it's really not there (or not OK): the Guest needs to 399 - * know. We write out the cr2 value so it knows where the 400 - * fault occurred. 401 - * 402 - * Note that if the Guest were really messed up, this could 403 - * happen before it's done the LHCALL_LGUEST_INIT hypercall, so 404 - * lg->lguest_data could be NULL 405 - */ 406 - if (cpu->lg->lguest_data && 407 - put_user(cpu->arch.last_pagefault, 408 - &cpu->lg->lguest_data->cr2)) 409 - kill_guest(cpu, "Writing cr2"); 410 - break; 411 - case 7: /* We've intercepted a Device Not Available fault. */ 412 - /* No special handling is needed here. */ 413 - break; 414 - case 32 ... 255: 415 - /* This might be a syscall. */ 416 - if (could_be_syscall(cpu->regs->trapnum)) 417 - break; 418 - 419 - /* 420 - * Other values mean a real interrupt occurred, in which case 421 - * the Host handler has already been run. We just do a 422 - * friendly check if another process should now be run, then 423 - * return to run the Guest again. 424 - */ 425 - cond_resched(); 426 - return; 427 - case LGUEST_TRAP_ENTRY: 428 - /* 429 - * Our 'struct hcall_args' maps directly over our regs: we set 430 - * up the pointer now to indicate a hypercall is pending. 
431 - */ 432 - cpu->hcall = (struct hcall_args *)cpu->regs; 433 - return; 434 - } 435 - 436 - /* We didn't handle the trap, so it needs to go to the Guest. */ 437 - if (!deliver_trap(cpu, cpu->regs->trapnum)) 438 - /* 439 - * If the Guest doesn't have a handler (either it hasn't 440 - * registered any yet, or it's one of the faults we don't let 441 - * it handle), it dies with this cryptic error message. 442 - */ 443 - kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", 444 - cpu->regs->trapnum, cpu->regs->eip, 445 - cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault 446 - : cpu->regs->errcode); 447 - } 448 - 449 - /* 450 - * Now we can look at each of the routines this calls, in increasing order of 451 - * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), 452 - * deliver_trap() and demand_page(). After all those, we'll be ready to 453 - * examine the Switcher, and our philosophical understanding of the Host/Guest 454 - * duality will be complete. 455 - :*/ 456 - static void adjust_pge(void *on) 457 - { 458 - if (on) 459 - cr4_set_bits(X86_CR4_PGE); 460 - else 461 - cr4_clear_bits(X86_CR4_PGE); 462 - } 463 - 464 - /*H:020 465 - * Now the Switcher is mapped and every thing else is ready, we need to do 466 - * some more i386-specific initialization. 467 - */ 468 - void __init lguest_arch_host_init(void) 469 - { 470 - int i; 471 - 472 - /* 473 - * Most of the x86/switcher_32.S doesn't care that it's been moved; on 474 - * Intel, jumps are relative, and it doesn't access any references to 475 - * external code or data. 476 - * 477 - * The only exception is the interrupt handlers in switcher.S: their 478 - * addresses are placed in a table (default_idt_entries), so we need to 479 - * update the table with the new addresses. switcher_offset() is a 480 - * convenience function which returns the distance between the 481 - * compiled-in switcher code and the high-mapped copy we just made. 
482 - */ 483 - for (i = 0; i < IDT_ENTRIES; i++) 484 - default_idt_entries[i] += switcher_offset(); 485 - 486 - /* 487 - * Set up the Switcher's per-cpu areas. 488 - * 489 - * Each CPU gets two pages of its own within the high-mapped region 490 - * (aka. "struct lguest_pages"). Much of this can be initialized now, 491 - * but some depends on what Guest we are running (which is set up in 492 - * copy_in_guest_info()). 493 - */ 494 - for_each_possible_cpu(i) { 495 - /* lguest_pages() returns this CPU's two pages. */ 496 - struct lguest_pages *pages = lguest_pages(i); 497 - /* This is a convenience pointer to make the code neater. */ 498 - struct lguest_ro_state *state = &pages->state; 499 - 500 - /* 501 - * The Global Descriptor Table: the Host has a different one 502 - * for each CPU. We keep a descriptor for the GDT which says 503 - * where it is and how big it is (the size is actually the last 504 - * byte, not the size, hence the "-1"). 505 - */ 506 - state->host_gdt_desc.size = GDT_SIZE-1; 507 - state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i); 508 - 509 - /* 510 - * All CPUs on the Host use the same Interrupt Descriptor 511 - * Table, so we just use store_idt(), which gets this CPU's IDT 512 - * descriptor. 513 - */ 514 - store_idt(&state->host_idt_desc); 515 - 516 - /* 517 - * The descriptors for the Guest's GDT and IDT can be filled 518 - * out now, too. We copy the GDT & IDT into ->guest_gdt and 519 - * ->guest_idt before actually running the Guest. 520 - */ 521 - state->guest_idt_desc.size = sizeof(state->guest_idt)-1; 522 - state->guest_idt_desc.address = (long)&state->guest_idt; 523 - state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; 524 - state->guest_gdt_desc.address = (long)&state->guest_gdt; 525 - 526 - /* 527 - * We know where we want the stack to be when the Guest enters 528 - * the Switcher: in pages->regs. The stack grows downwards, so 529 - * we start it at the end of that structure. 
530 - */ 531 - state->guest_tss.sp0 = (long)(&pages->regs + 1); 532 - /* 533 - * And this is the GDT entry to use for the stack: we keep a 534 - * couple of special LGUEST entries. 535 - */ 536 - state->guest_tss.ss0 = LGUEST_DS; 537 - 538 - /* 539 - * x86 can have a finegrained bitmap which indicates what I/O 540 - * ports the process can use. We set it to the end of our 541 - * structure, meaning "none". 542 - */ 543 - state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); 544 - 545 - /* 546 - * Some GDT entries are the same across all Guests, so we can 547 - * set them up now. 548 - */ 549 - setup_default_gdt_entries(state); 550 - /* Most IDT entries are the same for all Guests, too.*/ 551 - setup_default_idt_entries(state, default_idt_entries); 552 - 553 - /* 554 - * The Host needs to be able to use the LGUEST segments on this 555 - * CPU, too, so put them in the Host GDT. 556 - */ 557 - get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 558 - get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 559 - } 560 - 561 - /* 562 - * In the Switcher, we want the %cs segment register to use the 563 - * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so 564 - * it will be undisturbed when we switch. To change %cs and jump we 565 - * need this structure to feed to Intel's "lcall" instruction. 566 - */ 567 - lguest_entry.offset = (long)switch_to_guest + switcher_offset(); 568 - lguest_entry.segment = LGUEST_CS; 569 - 570 - /* 571 - * Finally, we need to turn off "Page Global Enable". PGE is an 572 - * optimization where page table entries are specially marked to show 573 - * they never change. The Host kernel marks all the kernel pages this 574 - * way because it's always present, even when userspace is running. 575 - * 576 - * Lguest breaks this: unbeknownst to the rest of the Host kernel, we 577 - * switch to the Guest kernel. If you don't disable this on all CPUs, 578 - * you'll get really weird bugs that you'll chase for two days. 
579 - * 580 - * I used to turn PGE off every time we switched to the Guest and back 581 - * on when we return, but that slowed the Switcher down noticibly. 582 - */ 583 - 584 - /* 585 - * We don't need the complexity of CPUs coming and going while we're 586 - * doing this. 587 - */ 588 - get_online_cpus(); 589 - if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */ 590 - /* Remember that this was originally set (for cleanup). */ 591 - cpu_had_pge = 1; 592 - /* 593 - * adjust_pge is a helper function which sets or unsets the PGE 594 - * bit on its CPU, depending on the argument (0 == unset). 595 - */ 596 - on_each_cpu(adjust_pge, (void *)0, 1); 597 - /* Turn off the feature in the global feature set. */ 598 - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); 599 - } 600 - put_online_cpus(); 601 - } 602 - /*:*/ 603 - 604 - void __exit lguest_arch_host_fini(void) 605 - { 606 - /* If we had PGE before we started, turn it back on now. */ 607 - get_online_cpus(); 608 - if (cpu_had_pge) { 609 - set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); 610 - /* adjust_pge's argument "1" means set PGE. */ 611 - on_each_cpu(adjust_pge, (void *)1, 1); 612 - } 613 - put_online_cpus(); 614 - } 615 - 616 - 617 - /*H:122 The i386-specific hypercalls simply farm out to the right functions. */ 618 - int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args) 619 - { 620 - switch (args->arg0) { 621 - case LHCALL_LOAD_GDT_ENTRY: 622 - load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3); 623 - break; 624 - case LHCALL_LOAD_IDT_ENTRY: 625 - load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3); 626 - break; 627 - case LHCALL_LOAD_TLS: 628 - guest_load_tls(cpu, args->arg1); 629 - break; 630 - default: 631 - /* Bad Guest. Bad! 
*/ 632 - return -EIO; 633 - } 634 - return 0; 635 - } 636 - 637 - /*H:126 i386-specific hypercall initialization: */ 638 - int lguest_arch_init_hypercalls(struct lg_cpu *cpu) 639 - { 640 - u32 tsc_speed; 641 - 642 - /* 643 - * The pointer to the Guest's "struct lguest_data" is the only argument. 644 - * We check that address now. 645 - */ 646 - if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, 647 - sizeof(*cpu->lg->lguest_data))) 648 - return -EFAULT; 649 - 650 - /* 651 - * Having checked it, we simply set lg->lguest_data to point straight 652 - * into the Launcher's memory at the right place and then use 653 - * copy_to_user/from_user from now on, instead of lgread/write. I put 654 - * this in to show that I'm not immune to writing stupid 655 - * optimizations. 656 - */ 657 - cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; 658 - 659 - /* 660 - * We insist that the Time Stamp Counter exist and doesn't change with 661 - * cpu frequency. Some devious chip manufacturers decided that TSC 662 - * changes could be handled in software. I decided that time going 663 - * backwards might be good for benchmarks, but it's bad for users. 664 - * 665 - * We also insist that the TSC be stable: the kernel detects unreliable 666 - * TSCs for its own purposes, and we use that here. 667 - */ 668 - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) 669 - tsc_speed = tsc_khz; 670 - else 671 - tsc_speed = 0; 672 - if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz)) 673 - return -EFAULT; 674 - 675 - /* The interrupt code might not like the system call vector. */ 676 - if (!check_syscall_vector(cpu->lg)) 677 - kill_guest(cpu, "bad syscall vector"); 678 - 679 - return 0; 680 - } 681 - /*:*/ 682 - 683 - /*L:030 684 - * Most of the Guest's registers are left alone: we used get_zeroed_page() to 685 - * allocate the structure, so they will be 0. 
686 - */ 687 - void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) 688 - { 689 - struct lguest_regs *regs = cpu->regs; 690 - 691 - /* 692 - * There are four "segment" registers which the Guest needs to boot: 693 - * The "code segment" register (cs) refers to the kernel code segment 694 - * __KERNEL_CS, and the "data", "extra" and "stack" segment registers 695 - * refer to the kernel data segment __KERNEL_DS. 696 - * 697 - * The privilege level is packed into the lower bits. The Guest runs 698 - * at privilege level 1 (GUEST_PL). 699 - */ 700 - regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; 701 - regs->cs = __KERNEL_CS|GUEST_PL; 702 - 703 - /* 704 - * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) 705 - * is supposed to always be "1". Bit 9 (0x200) controls whether 706 - * interrupts are enabled. We always leave interrupts enabled while 707 - * running the Guest. 708 - */ 709 - regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; 710 - 711 - /* 712 - * The "Extended Instruction Pointer" register says where the Guest is 713 - * running. 714 - */ 715 - regs->eip = start; 716 - 717 - /* 718 - * %esi points to our boot information, at physical address 0, so don't 719 - * touch it. 720 - */ 721 - 722 - /* There are a couple of GDT entries the Guest expects at boot. */ 723 - setup_guest_gdt(cpu); 724 - }

-388

drivers/lguest/x86/switcher_32.S

··· 1 - /*P:900 2 - * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride 3 - * both the Host and Guest to do the low-level Guest<->Host switch. It is as 4 - * simple as it can be made, but it's naturally very specific to x86. 5 - * 6 - * You have now completed Preparation. If this has whet your appetite; if you 7 - * are feeling invigorated and refreshed then the next, more challenging stage 8 - * can be found in "make Guest". 9 - :*/ 10 - 11 - /*M:012 12 - * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must 13 - * gain at least 1% more performance. Since neither LOC nor performance can be 14 - * measured beforehand, it generally means implementing a feature then deciding 15 - * if it's worth it. And once it's implemented, who can say no? 16 - * 17 - * This is why I haven't implemented this idea myself. I want to, but I 18 - * haven't. You could, though. 19 - * 20 - * The main place where lguest performance sucks is Guest page faulting. When 21 - * a Guest userspace process hits an unmapped page we switch back to the Host, 22 - * walk the page tables, find it's not mapped, switch back to the Guest page 23 - * fault handler, which calls a hypercall to set the page table entry, then 24 - * finally returns to userspace. That's two round-trips. 25 - * 26 - * If we had a small walker in the Switcher, we could quickly check the Guest 27 - * page table and if the page isn't mapped, immediately reflect the fault back 28 - * into the Guest. This means the Switcher would have to know the top of the 29 - * Guest page table and the page fault handler address. 30 - * 31 - * For simplicity, the Guest should only handle the case where the privilege 32 - * level of the fault is 3 and probably only not present or write faults. It 33 - * should also detect recursive faults, and hand the original fault to the 34 - * Host (which is actually really easy). 35 - * 36 - * Two questions remain. 
Would the performance gain outweigh the complexity? 37 - * And who would write the verse documenting it? 38 - :*/ 39 - 40 - /*M:011 41 - * Lguest64 handles NMI. This gave me NMI envy (until I looked at their 42 - * code). It's worth doing though, since it would let us use oprofile in the 43 - * Host when a Guest is running. 44 - :*/ 45 - 46 - /*S:100 47 - * Welcome to the Switcher itself! 48 - * 49 - * This file contains the low-level code which changes the CPU to run the Guest 50 - * code, and returns to the Host when something happens. Understand this, and 51 - * you understand the heart of our journey. 52 - * 53 - * Because this is in assembler rather than C, our tale switches from prose to 54 - * verse. First I tried limericks: 55 - * 56 - * There once was an eax reg, 57 - * To which our pointer was fed, 58 - * It needed an add, 59 - * Which asm-offsets.h had 60 - * But this limerick is hurting my head. 61 - * 62 - * Next I tried haikus, but fitting the required reference to the seasons in 63 - * every stanza was quickly becoming tiresome: 64 - * 65 - * The %eax reg 66 - * Holds "struct lguest_pages" now: 67 - * Cherry blossoms fall. 68 - * 69 - * Then I started with Heroic Verse, but the rhyming requirement leeched away 70 - * the content density and led to some uniquely awful oblique rhymes: 71 - * 72 - * These constants are coming from struct offsets 73 - * For use within the asm switcher text. 74 - * 75 - * Finally, I settled for something between heroic hexameter, and normal prose 76 - * with inappropriate linebreaks. Anyway, it aint no Shakespeare. 77 - */ 78 - 79 - // Not all kernel headers work from assembler 80 - // But these ones are needed: the ENTRY() define 81 - // And constants extracted from struct offsets 82 - // To avoid magic numbers and breakage: 83 - // Should they change the compiler can't save us 84 - // Down here in the depths of assembler code. 
85 - #include <linux/linkage.h> 86 - #include <asm/asm-offsets.h> 87 - #include <asm/page.h> 88 - #include <asm/segment.h> 89 - #include <asm/lguest.h> 90 - 91 - // We mark the start of the code to copy 92 - // It's placed in .text tho it's never run here 93 - // You'll see the trick macro at the end 94 - // Which interleaves data and text to effect. 95 - .text 96 - ENTRY(start_switcher_text) 97 - 98 - // When we reach switch_to_guest we have just left 99 - // The safe and comforting shores of C code 100 - // %eax has the "struct lguest_pages" to use 101 - // Where we save state and still see it from the Guest 102 - // And %ebx holds the Guest shadow pagetable: 103 - // Once set we have truly left Host behind. 104 - ENTRY(switch_to_guest) 105 - // We told gcc all its regs could fade, 106 - // Clobbered by our journey into the Guest 107 - // We could have saved them, if we tried 108 - // But time is our master and cycles count. 109 - 110 - // Segment registers must be saved for the Host 111 - // We push them on the Host stack for later 112 - pushl %es 113 - pushl %ds 114 - pushl %gs 115 - pushl %fs 116 - // But the compiler is fickle, and heeds 117 - // No warning of %ebp clobbers 118 - // When frame pointers are used. That register 119 - // Must be saved and restored or chaos strikes. 120 - pushl %ebp 121 - // The Host's stack is done, now save it away 122 - // In our "struct lguest_pages" at offset 123 - // Distilled into asm-offsets.h 124 - movl %esp, LGUEST_PAGES_host_sp(%eax) 125 - 126 - // All saved and there's now five steps before us: 127 - // Stack, GDT, IDT, TSS 128 - // Then last of all the page tables are flipped. 129 - 130 - // Yet beware that our stack pointer must be 131 - // Always valid lest an NMI hits 132 - // %edx does the duty here as we juggle 133 - // %eax is lguest_pages: our stack lies within. 
134 - movl %eax, %edx 135 - addl $LGUEST_PAGES_regs, %edx 136 - movl %edx, %esp 137 - 138 - // The Guest's GDT we so carefully 139 - // Placed in the "struct lguest_pages" before 140 - lgdt LGUEST_PAGES_guest_gdt_desc(%eax) 141 - 142 - // The Guest's IDT we did partially 143 - // Copy to "struct lguest_pages" as well. 144 - lidt LGUEST_PAGES_guest_idt_desc(%eax) 145 - 146 - // The TSS entry which controls traps 147 - // Must be loaded up with "ltr" now: 148 - // The GDT entry that TSS uses 149 - // Changes type when we load it: damn Intel! 150 - // For after we switch over our page tables 151 - // That entry will be read-only: we'd crash. 152 - movl $(GDT_ENTRY_TSS*8), %edx 153 - ltr %dx 154 - 155 - // Look back now, before we take this last step! 156 - // The Host's TSS entry was also marked used; 157 - // Let's clear it again for our return. 158 - // The GDT descriptor of the Host 159 - // Points to the table after two "size" bytes 160 - movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx 161 - // Clear "used" from type field (byte 5, bit 2) 162 - andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) 163 - 164 - // Once our page table's switched, the Guest is live! 165 - // The Host fades as we run this final step. 166 - // Our "struct lguest_pages" is now read-only. 167 - movl %ebx, %cr3 168 - 169 - // The page table change did one tricky thing: 170 - // The Guest's register page has been mapped 171 - // Writable under our %esp (stack) -- 172 - // We can simply pop off all Guest regs. 173 - popl %eax 174 - popl %ebx 175 - popl %ecx 176 - popl %edx 177 - popl %esi 178 - popl %edi 179 - popl %ebp 180 - popl %gs 181 - popl %fs 182 - popl %ds 183 - popl %es 184 - 185 - // Near the base of the stack lurk two strange fields 186 - // Which we fill as we exit the Guest 187 - // These are the trap number and its error 188 - // We can simply step past them on our way. 
189 - addl $8, %esp 190 - 191 - // The last five stack slots hold return address 192 - // And everything needed to switch privilege 193 - // From Switcher's level 0 to Guest's 1, 194 - // And the stack where the Guest had last left it. 195 - // Interrupts are turned back on: we are Guest. 196 - iret 197 - 198 - // We tread two paths to switch back to the Host 199 - // Yet both must save Guest state and restore Host 200 - // So we put the routine in a macro. 201 - #define SWITCH_TO_HOST \ 202 - /* We save the Guest state: all registers first \ 203 - * Laid out just as "struct lguest_regs" defines */ \ 204 - pushl %es; \ 205 - pushl %ds; \ 206 - pushl %fs; \ 207 - pushl %gs; \ 208 - pushl %ebp; \ 209 - pushl %edi; \ 210 - pushl %esi; \ 211 - pushl %edx; \ 212 - pushl %ecx; \ 213 - pushl %ebx; \ 214 - pushl %eax; \ 215 - /* Our stack and our code are using segments \ 216 - * Set in the TSS and IDT \ 217 - * Yet if we were to touch data we'd use \ 218 - * Whatever data segment the Guest had. \ 219 - * Load the lguest ds segment for now. */ \ 220 - movl $(LGUEST_DS), %eax; \ 221 - movl %eax, %ds; \ 222 - /* So where are we? Which CPU, which struct? \ 223 - * The stack is our clue: our TSS starts \ 224 - * It at the end of "struct lguest_pages". \ 225 - * Or we may have stumbled while restoring \ 226 - * Our Guest segment regs while in switch_to_guest, \ 227 - * The fault pushed atop that part-unwound stack. \ 228 - * If we round the stack down to the page start \ 229 - * We're at the start of "struct lguest_pages". */ \ 230 - movl %esp, %eax; \ 231 - andl $(~(1 << PAGE_SHIFT - 1)), %eax; \ 232 - /* Save our trap number: the switch will obscure it \ 233 - * (In the Host the Guest regs are not mapped here) \ 234 - * %ebx holds it safe for deliver_to_host */ \ 235 - movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ 236 - /* The Host GDT, IDT and stack! 
\ 237 - * All these lie safely hidden from the Guest: \ 238 - * We must return to the Host page tables \ 239 - * (Hence that was saved in struct lguest_pages) */ \ 240 - movl LGUEST_PAGES_host_cr3(%eax), %edx; \ 241 - movl %edx, %cr3; \ 242 - /* As before, when we looked back at the Host \ 243 - * As we left and marked TSS unused \ 244 - * So must we now for the Guest left behind. */ \ 245 - andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \ 246 - /* Switch to Host's GDT, IDT. */ \ 247 - lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ 248 - lidt LGUEST_PAGES_host_idt_desc(%eax); \ 249 - /* Restore the Host's stack where its saved regs lie */ \ 250 - movl LGUEST_PAGES_host_sp(%eax), %esp; \ 251 - /* Last the TSS: our Host is returned */ \ 252 - movl $(GDT_ENTRY_TSS*8), %edx; \ 253 - ltr %dx; \ 254 - /* Restore now the regs saved right at the first. */ \ 255 - popl %ebp; \ 256 - popl %fs; \ 257 - popl %gs; \ 258 - popl %ds; \ 259 - popl %es 260 - 261 - // The first path is trod when the Guest has trapped: 262 - // (Which trap it was has been pushed on the stack). 263 - // We need only switch back, and the Host will decode 264 - // Why we came home, and what needs to be done. 265 - return_to_host: 266 - SWITCH_TO_HOST 267 - iret 268 - 269 - // We are lead to the second path like so: 270 - // An interrupt, with some cause external 271 - // Has ajerked us rudely from the Guest's code 272 - // Again we must return home to the Host 273 - deliver_to_host: 274 - SWITCH_TO_HOST 275 - // But now we must go home via that place 276 - // Where that interrupt was supposed to go 277 - // Had we not been ensconced, running the Guest. 278 - // Here we see the trickness of run_guest_once(): 279 - // The Host stack is formed like an interrupt 280 - // With EIP, CS and EFLAGS layered. 281 - // Interrupt handlers end with "iret" 282 - // And that will take us home at long long last. 283 - 284 - // But first we must find the handler to call! 
285 - // The IDT descriptor for the Host 286 - // Has two bytes for size, and four for address: 287 - // %edx will hold it for us for now. 288 - movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx 289 - // We now know the table address we need, 290 - // And saved the trap's number inside %ebx. 291 - // Yet the pointer to the handler is smeared 292 - // Across the bits of the table entry. 293 - // What oracle can tell us how to extract 294 - // From such a convoluted encoding? 295 - // I consulted gcc, and it gave 296 - // These instructions, which I gladly credit: 297 - leal (%edx,%ebx,8), %eax 298 - movzwl (%eax),%edx 299 - movl 4(%eax), %eax 300 - xorw %ax, %ax 301 - orl %eax, %edx 302 - // Now the address of the handler's in %edx 303 - // We call it now: its "iret" drops us home. 304 - jmp *%edx 305 - 306 - // Every interrupt can come to us here 307 - // But we must truly tell each apart. 308 - // They number two hundred and fifty six 309 - // And each must land in a different spot, 310 - // Push its number on stack, and join the stream. 311 - 312 - // And worse, a mere six of the traps stand apart 313 - // And push on their stack an addition: 314 - // An error number, thirty two bits long 315 - // So we punish the other two fifty 316 - // And make them push a zero so they match. 317 - 318 - // Yet two fifty six entries is long 319 - // And all will look most the same as the last 320 - // So we create a macro which can make 321 - // As many entries as we need to fill. 322 - 323 - // Note the change to .data then .text: 324 - // We plant the address of each entry 325 - // Into a (data) table for the Host 326 - // To know where each Guest interrupt should go. 327 - .macro IRQ_STUB N TARGET 328 - .data; .long 1f; .text; 1: 329 - // Trap eight, ten through fourteen and seventeen 330 - // Supply an error number. Else zero. 
331 - .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) 332 - pushl $0 333 - .endif 334 - pushl $\N 335 - jmp \TARGET 336 - ALIGN 337 - .endm 338 - 339 - // This macro creates numerous entries 340 - // Using GAS macros which out-power C's. 341 - .macro IRQ_STUBS FIRST LAST TARGET 342 - irq=\FIRST 343 - .rept \LAST-\FIRST+1 344 - IRQ_STUB irq \TARGET 345 - irq=irq+1 346 - .endr 347 - .endm 348 - 349 - // Here's the marker for our pointer table 350 - // Laid in the data section just before 351 - // Each macro places the address of code 352 - // Forming an array: each one points to text 353 - // Which handles interrupt in its turn. 354 - .data 355 - .global default_idt_entries 356 - default_idt_entries: 357 - .text 358 - // The first two traps go straight back to the Host 359 - IRQ_STUBS 0 1 return_to_host 360 - // We'll say nothing, yet, about NMI 361 - IRQ_STUB 2 handle_nmi 362 - // Other traps also return to the Host 363 - IRQ_STUBS 3 31 return_to_host 364 - // All interrupts go via their handlers 365 - IRQ_STUBS 32 127 deliver_to_host 366 - // 'Cept system calls coming from userspace 367 - // Are to go to the Guest, never the Host. 368 - IRQ_STUB 128 return_to_host 369 - IRQ_STUBS 129 255 deliver_to_host 370 - 371 - // The NMI, what a fabulous beast 372 - // Which swoops in and stops us no matter that 373 - // We're suspended between heaven and hell, 374 - // (Or more likely between the Host and Guest) 375 - // When in it comes! We are dazed and confused 376 - // So we do the simplest thing which one can. 377 - // Though we've pushed the trap number and zero 378 - // We discard them, return, and hope we live. 379 - handle_nmi: 380 - addl $8, %esp 381 - iret 382 - 383 - // We are done; all that's left is Mastery 384 - // And "make Mastery" is a journey long 385 - // Designed to make your fingers itch to code. 386 - 387 - // Here ends the text, the file and poem. 388 - ENTRY(end_switcher_text)

+1 -1

drivers/net/Kconfig

··· 333 333 depends on VIRTIO 334 334 ---help--- 335 335 This is the virtual network driver for virtio. It can be used with 336 - lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. 336 + QEMU based VMMs (like KVM or Xen). Say Y or M. 337 337 338 338 config NLMON 339 339 tristate "Virtual netlink monitoring device"

+1 -1

drivers/tty/hvc/Kconfig

··· 4 4 bool 5 5 help 6 6 Generic "hypervisor virtual console" infrastructure for various 7 - hypervisors (pSeries, iSeries, Xen, lguest). 7 + hypervisors (pSeries, iSeries, Xen). 8 8 It will automatically be selected if one of the back-end console drivers 9 9 is selected. 10 10

+2 -2

drivers/virtio/Kconfig

··· 2 2 tristate 3 3 ---help--- 4 4 This option is selected by any driver which implements the virtio 5 - bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST, 6 - CONFIG_RPMSG or CONFIG_S390_GUEST. 5 + bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG 6 + or CONFIG_S390_GUEST. 7 7 8 8 menu "Virtio drivers" 9 9

-73

include/linux/lguest.h

··· 1 - /* 2 - * Things the lguest guest needs to know. Note: like all lguest interfaces, 3 - * this is subject to wild and random change between versions. 4 - */ 5 - #ifndef _LINUX_LGUEST_H 6 - #define _LINUX_LGUEST_H 7 - 8 - #ifndef __ASSEMBLY__ 9 - #include <linux/time.h> 10 - #include <asm/irq.h> 11 - #include <asm/lguest_hcall.h> 12 - 13 - #define LG_CLOCK_MIN_DELTA 100UL 14 - #define LG_CLOCK_MAX_DELTA ULONG_MAX 15 - 16 - /*G:031 17 - * The second method of communicating with the Host is to via "struct 18 - * lguest_data". Once the Guest's initialization hypercall tells the Host where 19 - * this is, the Guest and Host both publish information in it. 20 - :*/ 21 - struct lguest_data { 22 - /* 23 - * 512 == enabled (same as eflags in normal hardware). The Guest 24 - * changes interrupts so often that a hypercall is too slow. 25 - */ 26 - unsigned int irq_enabled; 27 - /* Fine-grained interrupt disabling by the Guest */ 28 - DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); 29 - 30 - /* 31 - * The Host writes the virtual address of the last page fault here, 32 - * which saves the Guest a hypercall. CR2 is the native register where 33 - * this address would normally be found. 34 - */ 35 - unsigned long cr2; 36 - 37 - /* Wallclock time set by the Host. */ 38 - struct timespec time; 39 - 40 - /* 41 - * Interrupt pending set by the Host. The Guest should do a hypercall 42 - * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). 43 - */ 44 - int irq_pending; 45 - 46 - /* 47 - * Async hypercall ring. Instead of directly making hypercalls, we can 48 - * place them in here for processing the next time the Host wants. 49 - * This batching can be quite efficient. 50 - */ 51 - 52 - /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ 53 - u8 hcall_status[LHCALL_RING_SIZE]; 54 - /* The actual registers for the hypercalls. 
*/ 55 - struct hcall_args hcalls[LHCALL_RING_SIZE]; 56 - 57 - /* Fields initialized by the Host at boot: */ 58 - /* Memory not to try to access */ 59 - unsigned long reserve_mem; 60 - /* KHz for the TSC clock. */ 61 - u32 tsc_khz; 62 - 63 - /* Fields initialized by the Guest at boot: */ 64 - /* Instruction to suppress interrupts even if enabled */ 65 - unsigned long noirq_iret; 66 - /* Address above which page tables are all identical. */ 67 - unsigned long kernel_address; 68 - /* The vector to try to use for system calls (0x40 or 0x80). */ 69 - unsigned int syscall_vec; 70 - }; 71 - extern struct lguest_data lguest_data; 72 - #endif /* __ASSEMBLY__ */ 73 - #endif /* _LINUX_LGUEST_H */

-44

include/linux/lguest_launcher.h

··· 1 - #ifndef _LINUX_LGUEST_LAUNCHER 2 - #define _LINUX_LGUEST_LAUNCHER 3 - /* Everything the "lguest" userspace program needs to know. */ 4 - #include <linux/types.h> 5 - 6 - /*D:010 7 - * Drivers 8 - * 9 - * The Guest needs devices to do anything useful. Since we don't let it touch 10 - * real devices (think of the damage it could do!) we provide virtual devices. 11 - * We emulate a PCI bus with virtio devices on it; we used to have our own 12 - * lguest bus which was far simpler, but this tests the virtio 1.0 standard. 13 - * 14 - * Virtio devices are also used by kvm, so we can simply reuse their optimized 15 - * device drivers. And one day when everyone uses virtio, my plan will be 16 - * complete. Bwahahahah! 17 - */ 18 - 19 - /* Write command first word is a request. */ 20 - enum lguest_req 21 - { 22 - LHREQ_INITIALIZE, /* + base, pfnlimit, start */ 23 - LHREQ_GETDMA, /* No longer used */ 24 - LHREQ_IRQ, /* + irq */ 25 - LHREQ_BREAK, /* No longer used */ 26 - LHREQ_EVENTFD, /* No longer used. */ 27 - LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */ 28 - LHREQ_SETREG, /* + offset within struct pt_regs, value. */ 29 - LHREQ_TRAP, /* + trap number to deliver to guest. */ 30 - }; 31 - 32 - /* 33 - * This is what read() of the lguest fd populates. trap == 34 - * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the 35 - * argument), 14 for a page fault in the MMIO region (addr is 36 - * the trap address, insn is the instruction), or 13 for a GPF 37 - * (insn is the instruction). 38 - */ 39 - struct lguest_pending { 40 - __u8 trap; 41 - __u8 insn[7]; 42 - __u32 addr; 43 - }; 44 - #endif /* _LINUX_LGUEST_LAUNCHER */

+2 -2

include/uapi/linux/virtio_ring.h

··· 1 1 #ifndef _UAPI_LINUX_VIRTIO_RING_H 2 2 #define _UAPI_LINUX_VIRTIO_RING_H 3 - /* An interface for efficient virtio implementation, currently for use by KVM 4 - * and lguest, but hopefully others soon. Do NOT change this since it will 3 + /* An interface for efficient virtio implementation, currently for use by KVM, 4 + * but hopefully others soon. Do NOT change this since it will 5 5 * break existing servers and clients. 6 6 * 7 7 * This header is BSD licensed so anyone can use the definitions to implement

+5 -6

tools/Makefile

··· 18 18 @echo ' iio - IIO tools' 19 19 @echo ' kvm_stat - top-like utility for displaying kvm statistics' 20 20 @echo ' leds - LEDs tools' 21 - @echo ' lguest - a minimal 32-bit x86 hypervisor' 22 21 @echo ' liblockdep - user-space wrapper for kernel locking-validator' 23 22 @echo ' net - misc networking tools' 24 23 @echo ' perf - Linux performance measurement and analysis tool' ··· 89 90 kvm_stat: FORCE 90 91 $(call descend,kvm/$@) 91 92 92 - all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \ 93 + all: acpi cgroup cpupower gpio hv firewire liblockdep \ 93 94 perf selftests turbostat usb \ 94 95 virtio vm net x86_energy_perf_policy \ 95 96 tmon freefall objtool kvm_stat ··· 100 101 cpupower_install: 101 102 $(call descend,power/$(@:_install=),install) 102 103 103 - cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install: 104 + cgroup_install firewire_install gpio_install hv_install perf_install usb_install virtio_install vm_install net_install objtool_install: 104 105 $(call descend,$(@:_install=),install) 105 106 106 107 liblockdep_install: ··· 122 123 $(call descend,kvm/$(@:_install=),install) 123 124 124 125 install: acpi_install cgroup_install cpupower_install gpio_install \ 125 - hv_install firewire_install lguest_install liblockdep_install \ 126 + hv_install firewire_install liblockdep_install \ 126 127 perf_install selftests_install turbostat_install usb_install \ 127 128 virtio_install vm_install net_install x86_energy_perf_policy_install \ 128 129 tmon_install freefall_install objtool_install kvm_stat_install ··· 133 134 cpupower_clean: 134 135 $(call descend,power/cpupower,clean) 135 136 136 - cgroup_clean hv_clean firewire_clean lguest_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean: 137 + cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean 
gpio_clean objtool_clean leds_clean: 137 138 $(call descend,$(@:_clean=),clean) 138 139 139 140 liblockdep_clean: ··· 167 168 build_clean: 168 169 $(call descend,build,clean) 169 170 170 - clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean lguest_clean \ 171 + clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \ 171 172 perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \ 172 173 vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ 173 174 freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \

-2

tools/lguest/.gitignore

··· 1 - lguest 2 - include

-14

tools/lguest/Makefile

··· 1 - # This creates the demonstration utility "lguest" which runs a Linux guest. 2 - CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude 3 - 4 - all: lguest 5 - 6 - include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h 7 - mkdir -p include/linux 2>&1 || true 8 - ln -sf ../../../../include/uapi/linux/virtio_types.h $@ 9 - 10 - lguest: include/linux/virtio_types.h 11 - 12 - clean: 13 - rm -f lguest 14 - rm -rf include

-58

tools/lguest/extract

··· 1 - #! /bin/sh 2 - 3 - set -e 4 - 5 - PREFIX=$1 6 - shift 7 - 8 - trap 'rm -r $TMPDIR' 0 9 - TMPDIR=`mktemp -d` 10 - 11 - exec 3>/dev/null 12 - for f; do 13 - while IFS=" 14 - " read -r LINE; do 15 - case "$LINE" in 16 - *$PREFIX:[0-9]*:\**) 17 - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` 18 - if [ -f $TMPDIR/$NUM ]; then 19 - echo "$TMPDIR/$NUM already exits prior to $f" 20 - exit 1 21 - fi 22 - exec 3>>$TMPDIR/$NUM 23 - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM 24 - /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3 25 - ;; 26 - *$PREFIX:[0-9]*) 27 - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` 28 - if [ -f $TMPDIR/$NUM ]; then 29 - echo "$TMPDIR/$NUM already exits prior to $f" 30 - exit 1 31 - fi 32 - exec 3>>$TMPDIR/$NUM 33 - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM 34 - /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3 35 - ;; 36 - *:\**) 37 - /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3 38 - echo >&3 39 - exec 3>/dev/null 40 - ;; 41 - *) 42 - /bin/echo "$LINE" >&3 43 - ;; 44 - esac 45 - done < $f 46 - echo >&3 47 - exec 3>/dev/null 48 - done 49 - 50 - LASTFILE="" 51 - for f in $TMPDIR/*; do 52 - if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then 53 - LASTFILE=$(cat $TMPDIR/.$(basename $f) ) 54 - echo "[ $LASTFILE ]" 55 - fi 56 - cat $f 57 - done 58 -

-3420

tools/lguest/lguest.c

··· 1 - /*P:100 2 - * This is the Launcher code, a simple program which lays out the "physical" 3 - * memory for the new Guest by mapping the kernel image and the virtual 4 - * devices, then opens /dev/lguest to tell the kernel about the Guest and 5 - * control it. 6 - :*/ 7 - #define _LARGEFILE64_SOURCE 8 - #define _GNU_SOURCE 9 - #include <stdio.h> 10 - #include <string.h> 11 - #include <unistd.h> 12 - #include <err.h> 13 - #include <stdint.h> 14 - #include <stdlib.h> 15 - #include <elf.h> 16 - #include <sys/mman.h> 17 - #include <sys/param.h> 18 - #include <sys/types.h> 19 - #include <sys/stat.h> 20 - #include <sys/wait.h> 21 - #include <sys/eventfd.h> 22 - #include <fcntl.h> 23 - #include <stdbool.h> 24 - #include <errno.h> 25 - #include <ctype.h> 26 - #include <sys/socket.h> 27 - #include <sys/ioctl.h> 28 - #include <sys/time.h> 29 - #include <time.h> 30 - #include <netinet/in.h> 31 - #include <net/if.h> 32 - #include <linux/sockios.h> 33 - #include <linux/if_tun.h> 34 - #include <sys/uio.h> 35 - #include <termios.h> 36 - #include <getopt.h> 37 - #include <assert.h> 38 - #include <sched.h> 39 - #include <limits.h> 40 - #include <stddef.h> 41 - #include <signal.h> 42 - #include <pwd.h> 43 - #include <grp.h> 44 - #include <sys/user.h> 45 - #include <linux/pci_regs.h> 46 - 47 - #ifndef VIRTIO_F_ANY_LAYOUT 48 - #define VIRTIO_F_ANY_LAYOUT 27 49 - #endif 50 - 51 - /*L:110 52 - * We can ignore the 43 include files we need for this program, but I do want 53 - * to draw attention to the use of kernel-style types. 54 - * 55 - * As Linus said, "C is a Spartan language, and so should your naming be." I 56 - * like these abbreviations, so we define them here. Note that u64 is always 57 - * unsigned long long, which works on all Linux systems: this means that we can 58 - * use %llu in printf for any u64. 
59 - */ 60 - typedef unsigned long long u64; 61 - typedef uint32_t u32; 62 - typedef uint16_t u16; 63 - typedef uint8_t u8; 64 - /*:*/ 65 - 66 - #define VIRTIO_CONFIG_NO_LEGACY 67 - #define VIRTIO_PCI_NO_LEGACY 68 - #define VIRTIO_BLK_NO_LEGACY 69 - #define VIRTIO_NET_NO_LEGACY 70 - 71 - /* Use in-kernel ones, which defines VIRTIO_F_VERSION_1 */ 72 - #include "../../include/uapi/linux/virtio_config.h" 73 - #include "../../include/uapi/linux/virtio_net.h" 74 - #include "../../include/uapi/linux/virtio_blk.h" 75 - #include "../../include/uapi/linux/virtio_console.h" 76 - #include "../../include/uapi/linux/virtio_rng.h" 77 - #include <linux/virtio_ring.h> 78 - #include "../../include/uapi/linux/virtio_pci.h" 79 - #include <asm/bootparam.h> 80 - #include "../../include/linux/lguest_launcher.h" 81 - 82 - #define BRIDGE_PFX "bridge:" 83 - #ifndef SIOCBRADDIF 84 - #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 85 - #endif 86 - /* We can have up to 256 pages for devices. */ 87 - #define DEVICE_PAGES 256 88 - /* This will occupy 3 pages: it must be a power of 2. */ 89 - #define VIRTQUEUE_NUM 256 90 - 91 - /*L:120 92 - * verbose is both a global flag and a macro. The C preprocessor allows 93 - * this, and although I wouldn't recommend it, it works quite nicely here. 94 - */ 95 - static bool verbose; 96 - #define verbose(args...) \ 97 - do { if (verbose) printf(args); } while(0) 98 - /*:*/ 99 - 100 - /* The pointer to the start of guest memory. */ 101 - static void *guest_base; 102 - /* The maximum guest physical address allowed, and maximum possible. */ 103 - static unsigned long guest_limit, guest_max, guest_mmio; 104 - /* The /dev/lguest file descriptor. */ 105 - static int lguest_fd; 106 - 107 - /* a per-cpu variable indicating whose vcpu is currently running */ 108 - static unsigned int __thread cpu_id; 109 - 110 - /* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */ 111 - #define MAX_PCI_DEVICES 32 112 - 113 - /* This is our list of devices. 
*/ 114 - struct device_list { 115 - /* Counter to assign interrupt numbers. */ 116 - unsigned int next_irq; 117 - 118 - /* Counter to print out convenient device numbers. */ 119 - unsigned int device_num; 120 - 121 - /* PCI devices. */ 122 - struct device *pci[MAX_PCI_DEVICES]; 123 - }; 124 - 125 - /* The list of Guest devices, based on command line arguments. */ 126 - static struct device_list devices; 127 - 128 - /* 129 - * Just like struct virtio_pci_cfg_cap in uapi/linux/virtio_pci.h, 130 - * but uses a u32 explicitly for the data. 131 - */ 132 - struct virtio_pci_cfg_cap_u32 { 133 - struct virtio_pci_cap cap; 134 - u32 pci_cfg_data; /* Data for BAR access. */ 135 - }; 136 - 137 - struct virtio_pci_mmio { 138 - struct virtio_pci_common_cfg cfg; 139 - u16 notify; 140 - u8 isr; 141 - u8 padding; 142 - /* Device-specific configuration follows this. */ 143 - }; 144 - 145 - /* This is the layout (little-endian) of the PCI config space. */ 146 - struct pci_config { 147 - u16 vendor_id, device_id; 148 - u16 command, status; 149 - u8 revid, prog_if, subclass, class; 150 - u8 cacheline_size, lat_timer, header_type, bist; 151 - u32 bar[6]; 152 - u32 cardbus_cis_ptr; 153 - u16 subsystem_vendor_id, subsystem_device_id; 154 - u32 expansion_rom_addr; 155 - u8 capabilities, reserved1[3]; 156 - u32 reserved2; 157 - u8 irq_line, irq_pin, min_grant, max_latency; 158 - 159 - /* Now, this is the linked capability list. */ 160 - struct virtio_pci_cap common; 161 - struct virtio_pci_notify_cap notify; 162 - struct virtio_pci_cap isr; 163 - struct virtio_pci_cap device; 164 - struct virtio_pci_cfg_cap_u32 cfg_access; 165 - }; 166 - 167 - /* The device structure describes a single device. */ 168 - struct device { 169 - /* The name of this device, for --verbose. 
*/ 170 - const char *name; 171 - 172 - /* Any queues attached to this device */ 173 - struct virtqueue *vq; 174 - 175 - /* Is it operational */ 176 - bool running; 177 - 178 - /* Has it written FEATURES_OK but not re-checked it? */ 179 - bool wrote_features_ok; 180 - 181 - /* PCI configuration */ 182 - union { 183 - struct pci_config config; 184 - u32 config_words[sizeof(struct pci_config) / sizeof(u32)]; 185 - }; 186 - 187 - /* Features we offer, and those accepted. */ 188 - u64 features, features_accepted; 189 - 190 - /* Device-specific config hangs off the end of this. */ 191 - struct virtio_pci_mmio *mmio; 192 - 193 - /* PCI MMIO resources (all in BAR0) */ 194 - size_t mmio_size; 195 - u32 mmio_addr; 196 - 197 - /* Device-specific data. */ 198 - void *priv; 199 - }; 200 - 201 - /* The virtqueue structure describes a queue attached to a device. */ 202 - struct virtqueue { 203 - struct virtqueue *next; 204 - 205 - /* Which device owns me. */ 206 - struct device *dev; 207 - 208 - /* Name for printing errors. */ 209 - const char *name; 210 - 211 - /* The actual ring of buffers. */ 212 - struct vring vring; 213 - 214 - /* The information about this virtqueue (we only use queue_size on) */ 215 - struct virtio_pci_common_cfg pci_config; 216 - 217 - /* Last available index we saw. */ 218 - u16 last_avail_idx; 219 - 220 - /* How many are used since we sent last irq? */ 221 - unsigned int pending_used; 222 - 223 - /* Eventfd where Guest notifications arrive. */ 224 - int eventfd; 225 - 226 - /* Function for the thread which is servicing this virtqueue. */ 227 - void (*service)(struct virtqueue *vq); 228 - pid_t thread; 229 - }; 230 - 231 - /* Remember the arguments to the program so we can "reboot" */ 232 - static char **main_args; 233 - 234 - /* The original tty settings to restore on exit. 
*/ 235 - static struct termios orig_term; 236 - 237 - /* 238 - * We have to be careful with barriers: our devices are all run in separate 239 - * threads and so we need to make sure that changes visible to the Guest happen 240 - * in precise order. 241 - */ 242 - #define wmb() __asm__ __volatile__("" : : : "memory") 243 - #define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory") 244 - #define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory") 245 - 246 - /* Wrapper for the last available index. Makes it easier to change. */ 247 - #define lg_last_avail(vq) ((vq)->last_avail_idx) 248 - 249 - /* 250 - * The virtio configuration space is defined to be little-endian. x86 is 251 - * little-endian too, but it's nice to be explicit so we have these helpers. 252 - */ 253 - #define cpu_to_le16(v16) (v16) 254 - #define cpu_to_le32(v32) (v32) 255 - #define cpu_to_le64(v64) (v64) 256 - #define le16_to_cpu(v16) (v16) 257 - #define le32_to_cpu(v32) (v32) 258 - #define le64_to_cpu(v64) (v64) 259 - 260 - /* 261 - * A real device would ignore weird/non-compliant driver behaviour. We 262 - * stop and flag it, to help debugging Linux problems. 263 - */ 264 - #define bad_driver(d, fmt, ...) \ 265 - errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__) 266 - #define bad_driver_vq(vq, fmt, ...) \ 267 - errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \ 268 - vq->name, ## __VA_ARGS__) 269 - 270 - /* Is this iovec empty? */ 271 - static bool iov_empty(const struct iovec iov[], unsigned int num_iov) 272 - { 273 - unsigned int i; 274 - 275 - for (i = 0; i < num_iov; i++) 276 - if (iov[i].iov_len) 277 - return false; 278 - return true; 279 - } 280 - 281 - /* Take len bytes from the front of this iovec. 
*/ 282 - static void iov_consume(struct device *d, 283 - struct iovec iov[], unsigned num_iov, 284 - void *dest, unsigned len) 285 - { 286 - unsigned int i; 287 - 288 - for (i = 0; i < num_iov; i++) { 289 - unsigned int used; 290 - 291 - used = iov[i].iov_len < len ? iov[i].iov_len : len; 292 - if (dest) { 293 - memcpy(dest, iov[i].iov_base, used); 294 - dest += used; 295 - } 296 - iov[i].iov_base += used; 297 - iov[i].iov_len -= used; 298 - len -= used; 299 - } 300 - if (len != 0) 301 - bad_driver(d, "iovec too short!"); 302 - } 303 - 304 - /*L:100 305 - * The Launcher code itself takes us out into userspace, that scary place where 306 - * pointers run wild and free! Unfortunately, like most userspace programs, 307 - * it's quite boring (which is why everyone likes to hack on the kernel!). 308 - * Perhaps if you make up an Lguest Drinking Game at this point, it will get 309 - * you through this section. Or, maybe not. 310 - * 311 - * The Launcher sets up a big chunk of memory to be the Guest's "physical" 312 - * memory and stores it in "guest_base". In other words, Guest physical == 313 - * Launcher virtual with an offset. 314 - * 315 - * This can be tough to get your head around, but usually it just means that we 316 - * use these trivial conversion functions when the Guest gives us its 317 - * "physical" addresses: 318 - */ 319 - static void *from_guest_phys(unsigned long addr) 320 - { 321 - return guest_base + addr; 322 - } 323 - 324 - static unsigned long to_guest_phys(const void *addr) 325 - { 326 - return (addr - guest_base); 327 - } 328 - 329 - /*L:130 330 - * Loading the Kernel. 331 - * 332 - * We start with couple of simple helper routines. 
open_or_die() avoids 333 - * error-checking code cluttering the callers: 334 - */ 335 - static int open_or_die(const char *name, int flags) 336 - { 337 - int fd = open(name, flags); 338 - if (fd < 0) 339 - err(1, "Failed to open %s", name); 340 - return fd; 341 - } 342 - 343 - /* map_zeroed_pages() takes a number of pages. */ 344 - static void *map_zeroed_pages(unsigned int num) 345 - { 346 - int fd = open_or_die("/dev/zero", O_RDONLY); 347 - void *addr; 348 - 349 - /* 350 - * We use a private mapping (ie. if we write to the page, it will be 351 - * copied). We allocate an extra two pages PROT_NONE to act as guard 352 - * pages against read/write attempts that exceed allocated space. 353 - */ 354 - addr = mmap(NULL, getpagesize() * (num+2), 355 - PROT_NONE, MAP_PRIVATE, fd, 0); 356 - 357 - if (addr == MAP_FAILED) 358 - err(1, "Mmapping %u pages of /dev/zero", num); 359 - 360 - if (mprotect(addr + getpagesize(), getpagesize() * num, 361 - PROT_READ|PROT_WRITE) == -1) 362 - err(1, "mprotect rw %u pages failed", num); 363 - 364 - /* 365 - * One neat mmap feature is that you can close the fd, and it 366 - * stays mapped. 367 - */ 368 - close(fd); 369 - 370 - /* Return address after PROT_NONE page */ 371 - return addr + getpagesize(); 372 - } 373 - 374 - /* Get some bytes which won't be mapped into the guest. */ 375 - static unsigned long get_mmio_region(size_t size) 376 - { 377 - unsigned long addr = guest_mmio; 378 - size_t i; 379 - 380 - if (!size) 381 - return addr; 382 - 383 - /* Size has to be a power of 2 (and multiple of 16) */ 384 - for (i = 1; i < size; i <<= 1); 385 - 386 - guest_mmio += i; 387 - 388 - return addr; 389 - } 390 - 391 - /* 392 - * This routine is used to load the kernel or initrd. It tries mmap, but if 393 - * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), 394 - * it falls back to reading the memory in. 
395 - */ 396 - static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) 397 - { 398 - ssize_t r; 399 - 400 - /* 401 - * We map writable even though for some segments are marked read-only. 402 - * The kernel really wants to be writable: it patches its own 403 - * instructions. 404 - * 405 - * MAP_PRIVATE means that the page won't be copied until a write is 406 - * done to it. This allows us to share untouched memory between 407 - * Guests. 408 - */ 409 - if (mmap(addr, len, PROT_READ|PROT_WRITE, 410 - MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) 411 - return; 412 - 413 - /* pread does a seek and a read in one shot: saves a few lines. */ 414 - r = pread(fd, addr, len, offset); 415 - if (r != len) 416 - err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); 417 - } 418 - 419 - /* 420 - * This routine takes an open vmlinux image, which is in ELF, and maps it into 421 - * the Guest memory. ELF = Embedded Linking Format, which is the format used 422 - * by all modern binaries on Linux including the kernel. 423 - * 424 - * The ELF headers give *two* addresses: a physical address, and a virtual 425 - * address. We use the physical address; the Guest will map itself to the 426 - * virtual address. 427 - * 428 - * We return the starting address. 429 - */ 430 - static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) 431 - { 432 - Elf32_Phdr phdr[ehdr->e_phnum]; 433 - unsigned int i; 434 - 435 - /* 436 - * Sanity checks on the main ELF header: an x86 executable with a 437 - * reasonable number of correctly-sized program headers. 
438 - */ 439 - if (ehdr->e_type != ET_EXEC 440 - || ehdr->e_machine != EM_386 441 - || ehdr->e_phentsize != sizeof(Elf32_Phdr) 442 - || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) 443 - errx(1, "Malformed elf header"); 444 - 445 - /* 446 - * An ELF executable contains an ELF header and a number of "program" 447 - * headers which indicate which parts ("segments") of the program to 448 - * load where. 449 - */ 450 - 451 - /* We read in all the program headers at once: */ 452 - if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) 453 - err(1, "Seeking to program headers"); 454 - if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 455 - err(1, "Reading program headers"); 456 - 457 - /* 458 - * Try all the headers: there are usually only three. A read-only one, 459 - * a read-write one, and a "note" section which we don't load. 460 - */ 461 - for (i = 0; i < ehdr->e_phnum; i++) { 462 - /* If this isn't a loadable segment, we ignore it */ 463 - if (phdr[i].p_type != PT_LOAD) 464 - continue; 465 - 466 - verbose("Section %i: size %i addr %p\n", 467 - i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); 468 - 469 - /* We map this section of the file at its physical address. */ 470 - map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), 471 - phdr[i].p_offset, phdr[i].p_filesz); 472 - } 473 - 474 - /* The entry point is given in the ELF header. */ 475 - return ehdr->e_entry; 476 - } 477 - 478 - /*L:150 479 - * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed 480 - * to jump into it and it will unpack itself. We used to have to perform some 481 - * hairy magic because the unpacking code scared me. 482 - * 483 - * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote 484 - * a small patch to jump over the tricky bits in the Guest, so now we just read 485 - * the funky header so we know where in the file to load, and away we go! 
486 - */ 487 - static unsigned long load_bzimage(int fd) 488 - { 489 - struct boot_params boot; 490 - int r; 491 - /* Modern bzImages get loaded at 1M. */ 492 - void *p = from_guest_phys(0x100000); 493 - 494 - /* 495 - * Go back to the start of the file and read the header. It should be 496 - * a Linux boot header (see Documentation/x86/boot.txt) 497 - */ 498 - lseek(fd, 0, SEEK_SET); 499 - read(fd, &boot, sizeof(boot)); 500 - 501 - /* Inside the setup_hdr, we expect the magic "HdrS" */ 502 - if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) 503 - errx(1, "This doesn't look like a bzImage to me"); 504 - 505 - /* Skip over the extra sectors of the header. */ 506 - lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); 507 - 508 - /* Now read everything into memory. in nice big chunks. */ 509 - while ((r = read(fd, p, 65536)) > 0) 510 - p += r; 511 - 512 - /* Finally, code32_start tells us where to enter the kernel. */ 513 - return boot.hdr.code32_start; 514 - } 515 - 516 - /*L:140 517 - * Loading the kernel is easy when it's a "vmlinux", but most kernels 518 - * come wrapped up in the self-decompressing "bzImage" format. With a little 519 - * work, we can load those, too. 520 - */ 521 - static unsigned long load_kernel(int fd) 522 - { 523 - Elf32_Ehdr hdr; 524 - 525 - /* Read in the first few bytes. */ 526 - if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) 527 - err(1, "Reading kernel"); 528 - 529 - /* If it's an ELF file, it starts with "\177ELF" */ 530 - if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) 531 - return map_elf(fd, &hdr); 532 - 533 - /* Otherwise we assume it's a bzImage, and try to load it. */ 534 - return load_bzimage(fd); 535 - } 536 - 537 - /* 538 - * This is a trivial little helper to align pages. Andi Kleen hated it because 539 - * it calls getpagesize() twice: "it's dumb code." 540 - * 541 - * Kernel guys get really het up about optimization, even when it's not 542 - * necessary. I leave this code as a reaction against that. 
543 - */ 544 - static inline unsigned long page_align(unsigned long addr) 545 - { 546 - /* Add upwards and truncate downwards. */ 547 - return ((addr + getpagesize()-1) & ~(getpagesize()-1)); 548 - } 549 - 550 - /*L:180 551 - * An "initial ram disk" is a disk image loaded into memory along with the 552 - * kernel which the kernel can use to boot from without needing any drivers. 553 - * Most distributions now use this as standard: the initrd contains the code to 554 - * load the appropriate driver modules for the current machine. 555 - * 556 - * Importantly, James Morris works for RedHat, and Fedora uses initrds for its 557 - * kernels. He sent me this (and tells me when I break it). 558 - */ 559 - static unsigned long load_initrd(const char *name, unsigned long mem) 560 - { 561 - int ifd; 562 - struct stat st; 563 - unsigned long len; 564 - 565 - ifd = open_or_die(name, O_RDONLY); 566 - /* fstat() is needed to get the file size. */ 567 - if (fstat(ifd, &st) < 0) 568 - err(1, "fstat() on initrd '%s'", name); 569 - 570 - /* 571 - * We map the initrd at the top of memory, but mmap wants it to be 572 - * page-aligned, so we round the size up for that. 573 - */ 574 - len = page_align(st.st_size); 575 - map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); 576 - /* 577 - * Once a file is mapped, you can close the file descriptor. It's a 578 - * little odd, but quite useful. 579 - */ 580 - close(ifd); 581 - verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); 582 - 583 - /* We return the initrd size. */ 584 - return len; 585 - } 586 - /*:*/ 587 - 588 - /* 589 - * Simple routine to roll all the commandline arguments together with spaces 590 - * between them. 
591 - */ 592 - static void concat(char *dst, char *args[]) 593 - { 594 - unsigned int i, len = 0; 595 - 596 - for (i = 0; args[i]; i++) { 597 - if (i) { 598 - strcat(dst+len, " "); 599 - len++; 600 - } 601 - strcpy(dst+len, args[i]); 602 - len += strlen(args[i]); 603 - } 604 - /* In case it's empty. */ 605 - dst[len] = '\0'; 606 - } 607 - 608 - /*L:185 609 - * This is where we actually tell the kernel to initialize the Guest. We 610 - * saw the arguments it expects when we looked at initialize() in lguest_user.c: 611 - * the base of Guest "physical" memory, the top physical page to allow and the 612 - * entry point for the Guest. 613 - */ 614 - static void tell_kernel(unsigned long start) 615 - { 616 - unsigned long args[] = { LHREQ_INITIALIZE, 617 - (unsigned long)guest_base, 618 - guest_limit / getpagesize(), start, 619 - (guest_mmio+getpagesize()-1) / getpagesize() }; 620 - verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n", 621 - guest_base, guest_base + guest_limit, 622 - guest_limit, guest_mmio); 623 - lguest_fd = open_or_die("/dev/lguest", O_RDWR); 624 - if (write(lguest_fd, args, sizeof(args)) < 0) 625 - err(1, "Writing to /dev/lguest"); 626 - } 627 - /*:*/ 628 - 629 - /*L:200 630 - * Device Handling. 631 - * 632 - * When the Guest gives us a buffer, it sends an array of addresses and sizes. 633 - * We need to make sure it's not trying to reach into the Launcher itself, so 634 - * we have a convenient routine which checks it and exits with an error message 635 - * if something funny is going on: 636 - */ 637 - static void *_check_pointer(struct device *d, 638 - unsigned long addr, unsigned int size, 639 - unsigned int line) 640 - { 641 - /* 642 - * Check if the requested address and size exceeds the allocated memory, 643 - * or addr + size wraps around. 
644 - */ 645 - if ((addr + size) > guest_limit || (addr + size) < addr) 646 - bad_driver(d, "%s:%i: Invalid address %#lx", 647 - __FILE__, line, addr); 648 - /* 649 - * We return a pointer for the caller's convenience, now we know it's 650 - * safe to use. 651 - */ 652 - return from_guest_phys(addr); 653 - } 654 - /* A macro which transparently hands the line number to the real function. */ 655 - #define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__) 656 - 657 - /* 658 - * Each buffer in the virtqueues is actually a chain of descriptors. This 659 - * function returns the next descriptor in the chain, or vq->vring.num if we're 660 - * at the end. 661 - */ 662 - static unsigned next_desc(struct device *d, struct vring_desc *desc, 663 - unsigned int i, unsigned int max) 664 - { 665 - unsigned int next; 666 - 667 - /* If this descriptor says it doesn't chain, we're done. */ 668 - if (!(desc[i].flags & VRING_DESC_F_NEXT)) 669 - return max; 670 - 671 - /* Check they're not leading us off end of descriptors. */ 672 - next = desc[i].next; 673 - /* Make sure compiler knows to grab that: we don't want it changing! */ 674 - wmb(); 675 - 676 - if (next >= max) 677 - bad_driver(d, "Desc next is %u", next); 678 - 679 - return next; 680 - } 681 - 682 - /* 683 - * This actually sends the interrupt for this virtqueue, if we've used a 684 - * buffer. 685 - */ 686 - static void trigger_irq(struct virtqueue *vq) 687 - { 688 - unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line }; 689 - 690 - /* Don't inform them if nothing used. */ 691 - if (!vq->pending_used) 692 - return; 693 - vq->pending_used = 0; 694 - 695 - /* 696 - * 2.4.7.1: 697 - * 698 - * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated: 699 - * The driver MUST set flags to 0 or 1. 
700 - */ 701 - if (vq->vring.avail->flags > 1) 702 - bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags); 703 - 704 - /* 705 - * 2.4.7.2: 706 - * 707 - * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated: 708 - * 709 - * - The device MUST ignore the used_event value. 710 - * - After the device writes a descriptor index into the used ring: 711 - * - If flags is 1, the device SHOULD NOT send an interrupt. 712 - * - If flags is 0, the device MUST send an interrupt. 713 - */ 714 - if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { 715 - return; 716 - } 717 - 718 - /* 719 - * 4.1.4.5.1: 720 - * 721 - * If MSI-X capability is disabled, the device MUST set the Queue 722 - * Interrupt bit in ISR status before sending a virtqueue notification 723 - * to the driver. 724 - */ 725 - vq->dev->mmio->isr = 0x1; 726 - 727 - /* Send the Guest an interrupt tell them we used something up. */ 728 - if (write(lguest_fd, buf, sizeof(buf)) != 0) 729 - err(1, "Triggering irq %i", vq->dev->config.irq_line); 730 - } 731 - 732 - /* 733 - * This looks in the virtqueue for the first available buffer, and converts 734 - * it to an iovec for convenient access. Since descriptors consist of some 735 - * number of output then some number of input descriptors, it's actually two 736 - * iovecs, but we pack them into one and note how many of each there were. 737 - * 738 - * This function waits if necessary, and returns the descriptor number found. 739 - */ 740 - static unsigned wait_for_vq_desc(struct virtqueue *vq, 741 - struct iovec iov[], 742 - unsigned int *out_num, unsigned int *in_num) 743 - { 744 - unsigned int i, head, max; 745 - struct vring_desc *desc; 746 - u16 last_avail = lg_last_avail(vq); 747 - 748 - /* 749 - * 2.4.7.1: 750 - * 751 - * The driver MUST handle spurious interrupts from the device. 752 - * 753 - * That's why this is a while loop. 754 - */ 755 - 756 - /* There's nothing available? 
*/ 757 - while (last_avail == vq->vring.avail->idx) { 758 - u64 event; 759 - 760 - /* 761 - * Since we're about to sleep, now is a good time to tell the 762 - * Guest about what we've used up to now. 763 - */ 764 - trigger_irq(vq); 765 - 766 - /* OK, now we need to know about added descriptors. */ 767 - vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; 768 - 769 - /* 770 - * They could have slipped one in as we were doing that: make 771 - * sure it's written, then check again. 772 - */ 773 - mb(); 774 - if (last_avail != vq->vring.avail->idx) { 775 - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 776 - break; 777 - } 778 - 779 - /* Nothing new? Wait for eventfd to tell us they refilled. */ 780 - if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) 781 - errx(1, "Event read failed?"); 782 - 783 - /* We don't need to be notified again. */ 784 - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 785 - } 786 - 787 - /* Check it isn't doing very strange things with descriptor numbers. */ 788 - if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) 789 - bad_driver_vq(vq, "Guest moved used index from %u to %u", 790 - last_avail, vq->vring.avail->idx); 791 - 792 - /* 793 - * Make sure we read the descriptor number *after* we read the ring 794 - * update; don't let the cpu or compiler change the order. 795 - */ 796 - rmb(); 797 - 798 - /* 799 - * Grab the next descriptor number they're advertising, and increment 800 - * the index we've seen. 801 - */ 802 - head = vq->vring.avail->ring[last_avail % vq->vring.num]; 803 - lg_last_avail(vq)++; 804 - 805 - /* If their number is silly, that's a fatal mistake. */ 806 - if (head >= vq->vring.num) 807 - bad_driver_vq(vq, "Guest says index %u is available", head); 808 - 809 - /* When we start there are none of either input nor output. 
*/ 810 - *out_num = *in_num = 0; 811 - 812 - max = vq->vring.num; 813 - desc = vq->vring.desc; 814 - i = head; 815 - 816 - /* 817 - * We have to read the descriptor after we read the descriptor number, 818 - * but there's a data dependency there so the CPU shouldn't reorder 819 - * that: no rmb() required. 820 - */ 821 - 822 - do { 823 - /* 824 - * If this is an indirect entry, then this buffer contains a 825 - * descriptor table which we handle as if it's any normal 826 - * descriptor chain. 827 - */ 828 - if (desc[i].flags & VRING_DESC_F_INDIRECT) { 829 - /* 2.4.5.3.1: 830 - * 831 - * The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT 832 - * flag unless the VIRTIO_F_INDIRECT_DESC feature was 833 - * negotiated. 834 - */ 835 - if (!(vq->dev->features_accepted & 836 - (1<<VIRTIO_RING_F_INDIRECT_DESC))) 837 - bad_driver_vq(vq, "vq indirect not negotiated"); 838 - 839 - /* 840 - * 2.4.5.3.1: 841 - * 842 - * The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT 843 - * flag within an indirect descriptor (ie. only one 844 - * table per descriptor). 845 - */ 846 - if (desc != vq->vring.desc) 847 - bad_driver_vq(vq, "Indirect within indirect"); 848 - 849 - /* 850 - * Proposed update VIRTIO-134 spells this out: 851 - * 852 - * A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT 853 - * and VIRTQ_DESC_F_NEXT in flags. 854 - */ 855 - if (desc[i].flags & VRING_DESC_F_NEXT) 856 - bad_driver_vq(vq, "indirect and next together"); 857 - 858 - if (desc[i].len % sizeof(struct vring_desc)) 859 - bad_driver_vq(vq, 860 - "Invalid size for indirect table"); 861 - /* 862 - * 2.4.5.3.2: 863 - * 864 - * The device MUST ignore the write-only flag 865 - * (flags&VIRTQ_DESC_F_WRITE) in the descriptor that 866 - * refers to an indirect table. 
867 - * 868 - * We ignore it here: :) 869 - */ 870 - 871 - max = desc[i].len / sizeof(struct vring_desc); 872 - desc = check_pointer(vq->dev, desc[i].addr, desc[i].len); 873 - i = 0; 874 - 875 - /* 2.4.5.3.1: 876 - * 877 - * A driver MUST NOT create a descriptor chain longer 878 - * than the Queue Size of the device. 879 - */ 880 - if (max > vq->pci_config.queue_size) 881 - bad_driver_vq(vq, 882 - "indirect has too many entries"); 883 - } 884 - 885 - /* Grab the first descriptor, and check it's OK. */ 886 - iov[*out_num + *in_num].iov_len = desc[i].len; 887 - iov[*out_num + *in_num].iov_base 888 - = check_pointer(vq->dev, desc[i].addr, desc[i].len); 889 - /* If this is an input descriptor, increment that count. */ 890 - if (desc[i].flags & VRING_DESC_F_WRITE) 891 - (*in_num)++; 892 - else { 893 - /* 894 - * If it's an output descriptor, they're all supposed 895 - * to come before any input descriptors. 896 - */ 897 - if (*in_num) 898 - bad_driver_vq(vq, 899 - "Descriptor has out after in"); 900 - (*out_num)++; 901 - } 902 - 903 - /* If we've got too many, that implies a descriptor loop. */ 904 - if (*out_num + *in_num > max) 905 - bad_driver_vq(vq, "Looped descriptor"); 906 - } while ((i = next_desc(vq->dev, desc, i, max)) != max); 907 - 908 - return head; 909 - } 910 - 911 - /* 912 - * After we've used one of their buffers, we tell the Guest about it. Sometime 913 - * later we'll want to send them an interrupt using trigger_irq(); note that 914 - * wait_for_vq_desc() does that for us if it has to wait. 915 - */ 916 - static void add_used(struct virtqueue *vq, unsigned int head, int len) 917 - { 918 - struct vring_used_elem *used; 919 - 920 - /* 921 - * The virtqueue contains a ring of used buffers. Get a pointer to the 922 - * next entry in that used ring. 923 - */ 924 - used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; 925 - used->id = head; 926 - used->len = len; 927 - /* Make sure buffer is written before we update index. 
*/ 928 - wmb(); 929 - vq->vring.used->idx++; 930 - vq->pending_used++; 931 - } 932 - 933 - /* And here's the combo meal deal. Supersize me! */ 934 - static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) 935 - { 936 - add_used(vq, head, len); 937 - trigger_irq(vq); 938 - } 939 - 940 - /* 941 - * The Console 942 - * 943 - * We associate some data with the console for our exit hack. 944 - */ 945 - struct console_abort { 946 - /* How many times have they hit ^C? */ 947 - int count; 948 - /* When did they start? */ 949 - struct timeval start; 950 - }; 951 - 952 - /* This is the routine which handles console input (ie. stdin). */ 953 - static void console_input(struct virtqueue *vq) 954 - { 955 - int len; 956 - unsigned int head, in_num, out_num; 957 - struct console_abort *abort = vq->dev->priv; 958 - struct iovec iov[vq->vring.num]; 959 - 960 - /* Make sure there's a descriptor available. */ 961 - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 962 - if (out_num) 963 - bad_driver_vq(vq, "Output buffers in console in queue?"); 964 - 965 - /* Read into it. This is where we usually wait. */ 966 - len = readv(STDIN_FILENO, iov, in_num); 967 - if (len <= 0) { 968 - /* Ran out of input? */ 969 - warnx("Failed to get console input, ignoring console."); 970 - /* 971 - * For simplicity, dying threads kill the whole Launcher. So 972 - * just nap here. 973 - */ 974 - for (;;) 975 - pause(); 976 - } 977 - 978 - /* Tell the Guest we used a buffer. */ 979 - add_used_and_trigger(vq, head, len); 980 - 981 - /* 982 - * Three ^C within one second? Exit. 983 - * 984 - * This is such a hack, but works surprisingly well. Each ^C has to 985 - * be in a buffer by itself, so they can't be too fast. But we check 986 - * that we get three within about a second, so they can't be too 987 - * slow. 
988 - */ 989 - if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { 990 - abort->count = 0; 991 - return; 992 - } 993 - 994 - abort->count++; 995 - if (abort->count == 1) 996 - gettimeofday(&abort->start, NULL); 997 - else if (abort->count == 3) { 998 - struct timeval now; 999 - gettimeofday(&now, NULL); 1000 - /* Kill all Launcher processes with SIGINT, like normal ^C */ 1001 - if (now.tv_sec <= abort->start.tv_sec+1) 1002 - kill(0, SIGINT); 1003 - abort->count = 0; 1004 - } 1005 - } 1006 - 1007 - /* This is the routine which handles console output (ie. stdout). */ 1008 - static void console_output(struct virtqueue *vq) 1009 - { 1010 - unsigned int head, out, in; 1011 - struct iovec iov[vq->vring.num]; 1012 - 1013 - /* We usually wait in here, for the Guest to give us something. */ 1014 - head = wait_for_vq_desc(vq, iov, &out, &in); 1015 - if (in) 1016 - bad_driver_vq(vq, "Input buffers in console output queue?"); 1017 - 1018 - /* writev can return a partial write, so we loop here. */ 1019 - while (!iov_empty(iov, out)) { 1020 - int len = writev(STDOUT_FILENO, iov, out); 1021 - if (len <= 0) { 1022 - warn("Write to stdout gave %i (%d)", len, errno); 1023 - break; 1024 - } 1025 - iov_consume(vq->dev, iov, out, NULL, len); 1026 - } 1027 - 1028 - /* 1029 - * We're finished with that buffer: if we're going to sleep, 1030 - * wait_for_vq_desc() will prod the Guest with an interrupt. 1031 - */ 1032 - add_used(vq, head, 0); 1033 - } 1034 - 1035 - /* 1036 - * The Network 1037 - * 1038 - * Handling output for network is also simple: we get all the output buffers 1039 - * and write them to /dev/net/tun. 1040 - */ 1041 - struct net_info { 1042 - int tunfd; 1043 - }; 1044 - 1045 - static void net_output(struct virtqueue *vq) 1046 - { 1047 - struct net_info *net_info = vq->dev->priv; 1048 - unsigned int head, out, in; 1049 - struct iovec iov[vq->vring.num]; 1050 - 1051 - /* We usually wait in here for the Guest to give us a packet. 
*/ 1052 - head = wait_for_vq_desc(vq, iov, &out, &in); 1053 - if (in) 1054 - bad_driver_vq(vq, "Input buffers in net output queue?"); 1055 - /* 1056 - * Send the whole thing through to /dev/net/tun. It expects the exact 1057 - * same format: what a coincidence! 1058 - */ 1059 - if (writev(net_info->tunfd, iov, out) < 0) 1060 - warnx("Write to tun failed (%d)?", errno); 1061 - 1062 - /* 1063 - * Done with that one; wait_for_vq_desc() will send the interrupt if 1064 - * all packets are processed. 1065 - */ 1066 - add_used(vq, head, 0); 1067 - } 1068 - 1069 - /* 1070 - * Handling network input is a bit trickier, because I've tried to optimize it. 1071 - * 1072 - * First we have a helper routine which tells is if from this file descriptor 1073 - * (ie. the /dev/net/tun device) will block: 1074 - */ 1075 - static bool will_block(int fd) 1076 - { 1077 - fd_set fdset; 1078 - struct timeval zero = { 0, 0 }; 1079 - FD_ZERO(&fdset); 1080 - FD_SET(fd, &fdset); 1081 - return select(fd+1, &fdset, NULL, NULL, &zero) != 1; 1082 - } 1083 - 1084 - /* 1085 - * This handles packets coming in from the tun device to our Guest. Like all 1086 - * service routines, it gets called again as soon as it returns, so you don't 1087 - * see a while(1) loop here. 1088 - */ 1089 - static void net_input(struct virtqueue *vq) 1090 - { 1091 - int len; 1092 - unsigned int head, out, in; 1093 - struct iovec iov[vq->vring.num]; 1094 - struct net_info *net_info = vq->dev->priv; 1095 - 1096 - /* 1097 - * Get a descriptor to write an incoming packet into. This will also 1098 - * send an interrupt if they're out of descriptors. 1099 - */ 1100 - head = wait_for_vq_desc(vq, iov, &out, &in); 1101 - if (out) 1102 - bad_driver_vq(vq, "Output buffers in net input queue?"); 1103 - 1104 - /* 1105 - * If it looks like we'll block reading from the tun device, send them 1106 - * an interrupt. 
1107 - */ 1108 - if (vq->pending_used && will_block(net_info->tunfd)) 1109 - trigger_irq(vq); 1110 - 1111 - /* 1112 - * Read in the packet. This is where we normally wait (when there's no 1113 - * incoming network traffic). 1114 - */ 1115 - len = readv(net_info->tunfd, iov, in); 1116 - if (len <= 0) 1117 - warn("Failed to read from tun (%d).", errno); 1118 - 1119 - /* 1120 - * Mark that packet buffer as used, but don't interrupt here. We want 1121 - * to wait until we've done as much work as we can. 1122 - */ 1123 - add_used(vq, head, len); 1124 - } 1125 - /*:*/ 1126 - 1127 - /* This is the helper to create threads: run the service routine in a loop. */ 1128 - static int do_thread(void *_vq) 1129 - { 1130 - struct virtqueue *vq = _vq; 1131 - 1132 - for (;;) 1133 - vq->service(vq); 1134 - return 0; 1135 - } 1136 - 1137 - /* 1138 - * When a child dies, we kill our entire process group with SIGTERM. This 1139 - * also has the side effect that the shell restores the console for us! 1140 - */ 1141 - static void kill_launcher(int signal) 1142 - { 1143 - kill(0, SIGTERM); 1144 - } 1145 - 1146 - static void reset_vq_pci_config(struct virtqueue *vq) 1147 - { 1148 - vq->pci_config.queue_size = VIRTQUEUE_NUM; 1149 - vq->pci_config.queue_enable = 0; 1150 - } 1151 - 1152 - static void reset_device(struct device *dev) 1153 - { 1154 - struct virtqueue *vq; 1155 - 1156 - verbose("Resetting device %s\n", dev->name); 1157 - 1158 - /* Clear any features they've acked. */ 1159 - dev->features_accepted = 0; 1160 - 1161 - /* We're going to be explicitly killing threads, so ignore them. */ 1162 - signal(SIGCHLD, SIG_IGN); 1163 - 1164 - /* 1165 - * 4.1.4.3.1: 1166 - * 1167 - * The device MUST present a 0 in queue_enable on reset. 1168 - * 1169 - * This means we set it here, and reset the saved ones in every vq. 
1170 - */ 1171 - dev->mmio->cfg.queue_enable = 0; 1172 - 1173 - /* Get rid of the virtqueue threads */ 1174 - for (vq = dev->vq; vq; vq = vq->next) { 1175 - vq->last_avail_idx = 0; 1176 - reset_vq_pci_config(vq); 1177 - if (vq->thread != (pid_t)-1) { 1178 - kill(vq->thread, SIGTERM); 1179 - waitpid(vq->thread, NULL, 0); 1180 - vq->thread = (pid_t)-1; 1181 - } 1182 - } 1183 - dev->running = false; 1184 - dev->wrote_features_ok = false; 1185 - 1186 - /* Now we care if threads die. */ 1187 - signal(SIGCHLD, (void *)kill_launcher); 1188 - } 1189 - 1190 - static void cleanup_devices(void) 1191 - { 1192 - unsigned int i; 1193 - 1194 - for (i = 1; i < MAX_PCI_DEVICES; i++) { 1195 - struct device *d = devices.pci[i]; 1196 - if (!d) 1197 - continue; 1198 - reset_device(d); 1199 - } 1200 - 1201 - /* If we saved off the original terminal settings, restore them now. */ 1202 - if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) 1203 - tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); 1204 - } 1205 - 1206 - /*L:217 1207 - * We do PCI. This is mainly done to let us test the kernel virtio PCI 1208 - * code. 1209 - */ 1210 - 1211 - /* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */ 1212 - static struct device pci_host_bridge; 1213 - 1214 - static void init_pci_host_bridge(void) 1215 - { 1216 - pci_host_bridge.name = "PCI Host Bridge"; 1217 - pci_host_bridge.config.class = 0x06; /* bridge */ 1218 - pci_host_bridge.config.subclass = 0; /* host bridge */ 1219 - devices.pci[0] = &pci_host_bridge; 1220 - } 1221 - 1222 - /* The IO ports used to read the PCI config space. */ 1223 - #define PCI_CONFIG_ADDR 0xCF8 1224 - #define PCI_CONFIG_DATA 0xCFC 1225 - 1226 - /* 1227 - * Not really portable, but does help readability: this is what the Guest 1228 - * writes to the PCI_CONFIG_ADDR IO port. 
1229 - */ 1230 - union pci_config_addr { 1231 - struct { 1232 - unsigned mbz: 2; 1233 - unsigned offset: 6; 1234 - unsigned funcnum: 3; 1235 - unsigned devnum: 5; 1236 - unsigned busnum: 8; 1237 - unsigned reserved: 7; 1238 - unsigned enabled : 1; 1239 - } bits; 1240 - u32 val; 1241 - }; 1242 - 1243 - /* 1244 - * We cache what they wrote to the address port, so we know what they're 1245 - * talking about when they access the data port. 1246 - */ 1247 - static union pci_config_addr pci_config_addr; 1248 - 1249 - static struct device *find_pci_device(unsigned int index) 1250 - { 1251 - return devices.pci[index]; 1252 - } 1253 - 1254 - /* PCI can do 1, 2 and 4 byte reads; we handle that here. */ 1255 - static void ioread(u16 off, u32 v, u32 mask, u32 *val) 1256 - { 1257 - assert(off < 4); 1258 - assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF); 1259 - *val = (v >> (off * 8)) & mask; 1260 - } 1261 - 1262 - /* PCI can do 1, 2 and 4 byte writes; we handle that here. */ 1263 - static void iowrite(u16 off, u32 v, u32 mask, u32 *dst) 1264 - { 1265 - assert(off < 4); 1266 - assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF); 1267 - *dst &= ~(mask << (off * 8)); 1268 - *dst |= (v & mask) << (off * 8); 1269 - } 1270 - 1271 - /* 1272 - * Where PCI_CONFIG_DATA accesses depends on the previous write to 1273 - * PCI_CONFIG_ADDR. 
1274 - */ 1275 - static struct device *dev_and_reg(u32 *reg) 1276 - { 1277 - if (!pci_config_addr.bits.enabled) 1278 - return NULL; 1279 - 1280 - if (pci_config_addr.bits.funcnum != 0) 1281 - return NULL; 1282 - 1283 - if (pci_config_addr.bits.busnum != 0) 1284 - return NULL; 1285 - 1286 - if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config)) 1287 - return NULL; 1288 - 1289 - *reg = pci_config_addr.bits.offset; 1290 - return find_pci_device(pci_config_addr.bits.devnum); 1291 - } 1292 - 1293 - /* 1294 - * We can get invalid combinations of values while they're writing, so we 1295 - * only fault if they try to write with some invalid bar/offset/length. 1296 - */ 1297 - static bool valid_bar_access(struct device *d, 1298 - struct virtio_pci_cfg_cap_u32 *cfg_access) 1299 - { 1300 - /* We only have 1 bar (BAR0) */ 1301 - if (cfg_access->cap.bar != 0) 1302 - return false; 1303 - 1304 - /* Check it's within BAR0. */ 1305 - if (cfg_access->cap.offset >= d->mmio_size 1306 - || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size) 1307 - return false; 1308 - 1309 - /* Check length is 1, 2 or 4. */ 1310 - if (cfg_access->cap.length != 1 1311 - && cfg_access->cap.length != 2 1312 - && cfg_access->cap.length != 4) 1313 - return false; 1314 - 1315 - /* 1316 - * 4.1.4.7.2: 1317 - * 1318 - * The driver MUST NOT write a cap.offset which is not a multiple of 1319 - * cap.length (ie. all accesses MUST be aligned). 1320 - */ 1321 - if (cfg_access->cap.offset % cfg_access->cap.length != 0) 1322 - return false; 1323 - 1324 - /* Return pointer into word in BAR0. */ 1325 - return true; 1326 - } 1327 - 1328 - /* Is this accessing the PCI config address port?. 
*/ 1329 - static bool is_pci_addr_port(u16 port) 1330 - { 1331 - return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4; 1332 - } 1333 - 1334 - static bool pci_addr_iowrite(u16 port, u32 mask, u32 val) 1335 - { 1336 - iowrite(port - PCI_CONFIG_ADDR, val, mask, 1337 - &pci_config_addr.val); 1338 - verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n", 1339 - pci_config_addr.bits.enabled ? "" : " DISABLED", 1340 - val, mask, 1341 - pci_config_addr.bits.busnum, 1342 - pci_config_addr.bits.devnum, 1343 - pci_config_addr.bits.funcnum, 1344 - pci_config_addr.bits.offset); 1345 - return true; 1346 - } 1347 - 1348 - static void pci_addr_ioread(u16 port, u32 mask, u32 *val) 1349 - { 1350 - ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val); 1351 - } 1352 - 1353 - /* Is this accessing the PCI config data port?. */ 1354 - static bool is_pci_data_port(u16 port) 1355 - { 1356 - return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4; 1357 - } 1358 - 1359 - static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask); 1360 - 1361 - static bool pci_data_iowrite(u16 port, u32 mask, u32 val) 1362 - { 1363 - u32 reg, portoff; 1364 - struct device *d = dev_and_reg(&reg); 1365 - 1366 - /* Complain if they don't belong to a device. */ 1367 - if (!d) 1368 - return false; 1369 - 1370 - /* They can do 1 byte writes, etc. */ 1371 - portoff = port - PCI_CONFIG_DATA; 1372 - 1373 - /* 1374 - * PCI uses a weird way to determine the BAR size: the OS 1375 - * writes all 1's, and sees which ones stick. 
1376 - */ 1377 - if (&d->config_words[reg] == &d->config.bar[0]) { 1378 - int i; 1379 - 1380 - iowrite(portoff, val, mask, &d->config.bar[0]); 1381 - for (i = 0; (1 << i) < d->mmio_size; i++) 1382 - d->config.bar[0] &= ~(1 << i); 1383 - return true; 1384 - } else if ((&d->config_words[reg] > &d->config.bar[0] 1385 - && &d->config_words[reg] <= &d->config.bar[6]) 1386 - || &d->config_words[reg] == &d->config.expansion_rom_addr) { 1387 - /* Allow writing to any other BAR, or expansion ROM */ 1388 - iowrite(portoff, val, mask, &d->config_words[reg]); 1389 - return true; 1390 - /* We let them override latency timer and cacheline size */ 1391 - } else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) { 1392 - /* Only let them change the first two fields. */ 1393 - if (mask == 0xFFFFFFFF) 1394 - mask = 0xFFFF; 1395 - iowrite(portoff, val, mask, &d->config_words[reg]); 1396 - return true; 1397 - } else if (&d->config_words[reg] == (void *)&d->config.command 1398 - && mask == 0xFFFF) { 1399 - /* Ignore command writes. */ 1400 - return true; 1401 - } else if (&d->config_words[reg] 1402 - == (void *)&d->config.cfg_access.cap.bar 1403 - || &d->config_words[reg] 1404 - == &d->config.cfg_access.cap.length 1405 - || &d->config_words[reg] 1406 - == &d->config.cfg_access.cap.offset) { 1407 - 1408 - /* 1409 - * The VIRTIO_PCI_CAP_PCI_CFG capability 1410 - * provides a backdoor to access the MMIO 1411 - * regions without mapping them. Weird, but 1412 - * useful. 1413 - */ 1414 - iowrite(portoff, val, mask, &d->config_words[reg]); 1415 - return true; 1416 - } else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) { 1417 - u32 write_mask; 1418 - 1419 - /* 1420 - * 4.1.4.7.1: 1421 - * 1422 - * Upon detecting driver write access to pci_cfg_data, the 1423 - * device MUST execute a write access at offset cap.offset at 1424 - * BAR selected by cap.bar using the first cap.length bytes 1425 - * from pci_cfg_data. 
1426 - */ 1427 - 1428 - /* Must be bar 0 */ 1429 - if (!valid_bar_access(d, &d->config.cfg_access)) 1430 - return false; 1431 - 1432 - iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data); 1433 - 1434 - /* 1435 - * Now emulate a write. The mask we use is set by 1436 - * len, *not* this write! 1437 - */ 1438 - write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1; 1439 - verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n", 1440 - d->config.cfg_access.pci_cfg_data, write_mask, 1441 - d->config.cfg_access.cap.bar, 1442 - d->config.cfg_access.cap.offset, 1443 - d->config.cfg_access.cap.length); 1444 - 1445 - emulate_mmio_write(d, d->config.cfg_access.cap.offset, 1446 - d->config.cfg_access.pci_cfg_data, 1447 - write_mask); 1448 - return true; 1449 - } 1450 - 1451 - /* 1452 - * 4.1.4.1: 1453 - * 1454 - * The driver MUST NOT write into any field of the capability 1455 - * structure, with the exception of those with cap_type 1456 - * VIRTIO_PCI_CAP_PCI_CFG... 1457 - */ 1458 - return false; 1459 - } 1460 - 1461 - static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask); 1462 - 1463 - static void pci_data_ioread(u16 port, u32 mask, u32 *val) 1464 - { 1465 - u32 reg; 1466 - struct device *d = dev_and_reg(&reg); 1467 - 1468 - if (!d) 1469 - return; 1470 - 1471 - /* Read through the PCI MMIO access window is special */ 1472 - if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) { 1473 - u32 read_mask; 1474 - 1475 - /* 1476 - * 4.1.4.7.1: 1477 - * 1478 - * Upon detecting driver read access to pci_cfg_data, the 1479 - * device MUST execute a read access of length cap.length at 1480 - * offset cap.offset at BAR selected by cap.bar and store the 1481 - * first cap.length bytes in pci_cfg_data. 
1482 - */ 1483 - /* Must be bar 0 */ 1484 - if (!valid_bar_access(d, &d->config.cfg_access)) 1485 - bad_driver(d, 1486 - "Invalid cfg_access to bar%u, offset %u len %u", 1487 - d->config.cfg_access.cap.bar, 1488 - d->config.cfg_access.cap.offset, 1489 - d->config.cfg_access.cap.length); 1490 - 1491 - /* 1492 - * Read into the window. The mask we use is set by 1493 - * len, *not* this read! 1494 - */ 1495 - read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1; 1496 - d->config.cfg_access.pci_cfg_data 1497 - = emulate_mmio_read(d, 1498 - d->config.cfg_access.cap.offset, 1499 - read_mask); 1500 - verbose("Window read %#x/%#x from bar %u, offset %u len %u\n", 1501 - d->config.cfg_access.pci_cfg_data, read_mask, 1502 - d->config.cfg_access.cap.bar, 1503 - d->config.cfg_access.cap.offset, 1504 - d->config.cfg_access.cap.length); 1505 - } 1506 - ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val); 1507 - } 1508 - 1509 - /*L:216 1510 - * This is where we emulate a handful of Guest instructions. It's ugly 1511 - * and we used to do it in the kernel but it grew over time. 1512 - */ 1513 - 1514 - /* 1515 - * We use the ptrace syscall's pt_regs struct to talk about registers 1516 - * to lguest: these macros convert the names to the offsets. 
1517 - */ 1518 - #define getreg(name) getreg_off(offsetof(struct user_regs_struct, name)) 1519 - #define setreg(name, val) \ 1520 - setreg_off(offsetof(struct user_regs_struct, name), (val)) 1521 - 1522 - static u32 getreg_off(size_t offset) 1523 - { 1524 - u32 r; 1525 - unsigned long args[] = { LHREQ_GETREG, offset }; 1526 - 1527 - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) 1528 - err(1, "Getting register %u", offset); 1529 - if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r)) 1530 - err(1, "Reading register %u", offset); 1531 - 1532 - return r; 1533 - } 1534 - 1535 - static void setreg_off(size_t offset, u32 val) 1536 - { 1537 - unsigned long args[] = { LHREQ_SETREG, offset, val }; 1538 - 1539 - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) 1540 - err(1, "Setting register %u", offset); 1541 - } 1542 - 1543 - /* Get register by instruction encoding */ 1544 - static u32 getreg_num(unsigned regnum, u32 mask) 1545 - { 1546 - /* 8 bit ops use regnums 4-7 for high parts of word */ 1547 - if (mask == 0xFF && (regnum & 0x4)) 1548 - return getreg_num(regnum & 0x3, 0xFFFF) >> 8; 1549 - 1550 - switch (regnum) { 1551 - case 0: return getreg(eax) & mask; 1552 - case 1: return getreg(ecx) & mask; 1553 - case 2: return getreg(edx) & mask; 1554 - case 3: return getreg(ebx) & mask; 1555 - case 4: return getreg(esp) & mask; 1556 - case 5: return getreg(ebp) & mask; 1557 - case 6: return getreg(esi) & mask; 1558 - case 7: return getreg(edi) & mask; 1559 - } 1560 - abort(); 1561 - } 1562 - 1563 - /* Set register by instruction encoding */ 1564 - static void setreg_num(unsigned regnum, u32 val, u32 mask) 1565 - { 1566 - /* Don't try to set bits out of range */ 1567 - assert(~(val & ~mask)); 1568 - 1569 - /* 8 bit ops use regnums 4-7 for high parts of word */ 1570 - if (mask == 0xFF && (regnum & 0x4)) { 1571 - /* Construct the 16 bits we want. 
*/ 1572 - val = (val << 8) | getreg_num(regnum & 0x3, 0xFF); 1573 - setreg_num(regnum & 0x3, val, 0xFFFF); 1574 - return; 1575 - } 1576 - 1577 - switch (regnum) { 1578 - case 0: setreg(eax, val | (getreg(eax) & ~mask)); return; 1579 - case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return; 1580 - case 2: setreg(edx, val | (getreg(edx) & ~mask)); return; 1581 - case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return; 1582 - case 4: setreg(esp, val | (getreg(esp) & ~mask)); return; 1583 - case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return; 1584 - case 6: setreg(esi, val | (getreg(esi) & ~mask)); return; 1585 - case 7: setreg(edi, val | (getreg(edi) & ~mask)); return; 1586 - } 1587 - abort(); 1588 - } 1589 - 1590 - /* Get bytes of displacement appended to instruction, from r/m encoding */ 1591 - static u32 insn_displacement_len(u8 mod_reg_rm) 1592 - { 1593 - /* Switch on the mod bits */ 1594 - switch (mod_reg_rm >> 6) { 1595 - case 0: 1596 - /* If mod == 0, and r/m == 101, 16-bit displacement follows */ 1597 - if ((mod_reg_rm & 0x7) == 0x5) 1598 - return 2; 1599 - /* Normally, mod == 0 means no literal displacement */ 1600 - return 0; 1601 - case 1: 1602 - /* One byte displacement */ 1603 - return 1; 1604 - case 2: 1605 - /* Four byte displacement */ 1606 - return 4; 1607 - case 3: 1608 - /* Register mode */ 1609 - return 0; 1610 - } 1611 - abort(); 1612 - } 1613 - 1614 - static void emulate_insn(const u8 insn[]) 1615 - { 1616 - unsigned long args[] = { LHREQ_TRAP, 13 }; 1617 - unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access; 1618 - unsigned int eax, port, mask; 1619 - /* 1620 - * Default is to return all-ones on IO port reads, which traditionally 1621 - * means "there's nothing there". 1622 - */ 1623 - u32 val = 0xFFFFFFFF; 1624 - 1625 - /* 1626 - * This must be the Guest kernel trying to do something, not userspace! 1627 - * The bottom two bits of the CS segment register are the privilege 1628 - * level. 
1629 - */ 1630 - if ((getreg(xcs) & 3) != 0x1) 1631 - goto no_emulate; 1632 - 1633 - /* Decoding x86 instructions is icky. */ 1634 - 1635 - /* 1636 - * Around 2.6.33, the kernel started using an emulation for the 1637 - * cmpxchg8b instruction in early boot on many configurations. This 1638 - * code isn't paravirtualized, and it tries to disable interrupts. 1639 - * Ignore it, which will Mostly Work. 1640 - */ 1641 - if (insn[insnlen] == 0xfa) { 1642 - /* "cli", or Clear Interrupt Enable instruction. Skip it. */ 1643 - insnlen = 1; 1644 - goto skip_insn; 1645 - } 1646 - 1647 - /* 1648 - * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. 1649 - */ 1650 - if (insn[insnlen] == 0x66) { 1651 - small_operand = 1; 1652 - /* The instruction is 1 byte so far, read the next byte. */ 1653 - insnlen = 1; 1654 - } 1655 - 1656 - /* If the lower bit isn't set, it's a single byte access */ 1657 - byte_access = !(insn[insnlen] & 1); 1658 - 1659 - /* 1660 - * Now we can ignore the lower bit and decode the 4 opcodes 1661 - * we need to emulate. 1662 - */ 1663 - switch (insn[insnlen] & 0xFE) { 1664 - case 0xE4: /* in <next byte>,%al */ 1665 - port = insn[insnlen+1]; 1666 - insnlen += 2; 1667 - in = 1; 1668 - break; 1669 - case 0xEC: /* in (%dx),%al */ 1670 - port = getreg(edx) & 0xFFFF; 1671 - insnlen += 1; 1672 - in = 1; 1673 - break; 1674 - case 0xE6: /* out %al,<next byte> */ 1675 - port = insn[insnlen+1]; 1676 - insnlen += 2; 1677 - break; 1678 - case 0xEE: /* out %al,(%dx) */ 1679 - port = getreg(edx) & 0xFFFF; 1680 - insnlen += 1; 1681 - break; 1682 - default: 1683 - /* OK, we don't know what this is, can't emulate. 
*/ 1684 - goto no_emulate; 1685 - } 1686 - 1687 - /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */ 1688 - if (byte_access) 1689 - mask = 0xFF; 1690 - else if (small_operand) 1691 - mask = 0xFFFF; 1692 - else 1693 - mask = 0xFFFFFFFF; 1694 - 1695 - /* 1696 - * If it was an "IN" instruction, they expect the result to be read 1697 - * into %eax, so we change %eax. 1698 - */ 1699 - eax = getreg(eax); 1700 - 1701 - if (in) { 1702 - /* This is the PS/2 keyboard status; 1 means ready for output */ 1703 - if (port == 0x64) 1704 - val = 1; 1705 - else if (is_pci_addr_port(port)) 1706 - pci_addr_ioread(port, mask, &val); 1707 - else if (is_pci_data_port(port)) 1708 - pci_data_ioread(port, mask, &val); 1709 - 1710 - /* Clear the bits we're about to read */ 1711 - eax &= ~mask; 1712 - /* Copy bits in from val. */ 1713 - eax |= val & mask; 1714 - /* Now update the register. */ 1715 - setreg(eax, eax); 1716 - } else { 1717 - if (is_pci_addr_port(port)) { 1718 - if (!pci_addr_iowrite(port, mask, eax)) 1719 - goto bad_io; 1720 - } else if (is_pci_data_port(port)) { 1721 - if (!pci_data_iowrite(port, mask, eax)) 1722 - goto bad_io; 1723 - } 1724 - /* There are many other ports, eg. CMOS clock, serial 1725 - * and parallel ports, so we ignore them all. */ 1726 - } 1727 - 1728 - verbose("IO %s of %x to %u: %#08x\n", 1729 - in ? "IN" : "OUT", mask, port, eax); 1730 - skip_insn: 1731 - /* Finally, we've "done" the instruction, so move past it. */ 1732 - setreg(eip, getreg(eip) + insnlen); 1733 - return; 1734 - 1735 - bad_io: 1736 - warnx("Attempt to %s port %u (%#x mask)", 1737 - in ? "read from" : "write to", port, mask); 1738 - 1739 - no_emulate: 1740 - /* Inject trap into Guest. 
*/ 1741 - if (write(lguest_fd, args, sizeof(args)) < 0) 1742 - err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip)); 1743 - } 1744 - 1745 - static struct device *find_mmio_region(unsigned long paddr, u32 *off) 1746 - { 1747 - unsigned int i; 1748 - 1749 - for (i = 1; i < MAX_PCI_DEVICES; i++) { 1750 - struct device *d = devices.pci[i]; 1751 - 1752 - if (!d) 1753 - continue; 1754 - if (paddr < d->mmio_addr) 1755 - continue; 1756 - if (paddr >= d->mmio_addr + d->mmio_size) 1757 - continue; 1758 - *off = paddr - d->mmio_addr; 1759 - return d; 1760 - } 1761 - return NULL; 1762 - } 1763 - 1764 - /* FIXME: Use vq array. */ 1765 - static struct virtqueue *vq_by_num(struct device *d, u32 num) 1766 - { 1767 - struct virtqueue *vq = d->vq; 1768 - 1769 - while (num-- && vq) 1770 - vq = vq->next; 1771 - 1772 - return vq; 1773 - } 1774 - 1775 - static void save_vq_config(const struct virtio_pci_common_cfg *cfg, 1776 - struct virtqueue *vq) 1777 - { 1778 - vq->pci_config = *cfg; 1779 - } 1780 - 1781 - static void restore_vq_config(struct virtio_pci_common_cfg *cfg, 1782 - struct virtqueue *vq) 1783 - { 1784 - /* Only restore the per-vq part */ 1785 - size_t off = offsetof(struct virtio_pci_common_cfg, queue_size); 1786 - 1787 - memcpy((void *)cfg + off, (void *)&vq->pci_config + off, 1788 - sizeof(*cfg) - off); 1789 - } 1790 - 1791 - /* 1792 - * 4.1.4.3.2: 1793 - * 1794 - * The driver MUST configure the other virtqueue fields before 1795 - * enabling the virtqueue with queue_enable. 1796 - * 1797 - * When they enable the virtqueue, we check that their setup is valid. 
1798 - */ 1799 - static void check_virtqueue(struct device *d, struct virtqueue *vq) 1800 - { 1801 - /* Because lguest is 32 bit, all the descriptor high bits must be 0 */ 1802 - if (vq->pci_config.queue_desc_hi 1803 - || vq->pci_config.queue_avail_hi 1804 - || vq->pci_config.queue_used_hi) 1805 - bad_driver_vq(vq, "invalid 64-bit queue address"); 1806 - 1807 - /* 1808 - * 2.4.1: 1809 - * 1810 - * The driver MUST ensure that the physical address of the first byte 1811 - * of each virtqueue part is a multiple of the specified alignment 1812 - * value in the above table. 1813 - */ 1814 - if (vq->pci_config.queue_desc_lo % 16 1815 - || vq->pci_config.queue_avail_lo % 2 1816 - || vq->pci_config.queue_used_lo % 4) 1817 - bad_driver_vq(vq, "invalid alignment in queue addresses"); 1818 - 1819 - /* Initialize the virtqueue and check they're all in range. */ 1820 - vq->vring.num = vq->pci_config.queue_size; 1821 - vq->vring.desc = check_pointer(vq->dev, 1822 - vq->pci_config.queue_desc_lo, 1823 - sizeof(*vq->vring.desc) * vq->vring.num); 1824 - vq->vring.avail = check_pointer(vq->dev, 1825 - vq->pci_config.queue_avail_lo, 1826 - sizeof(*vq->vring.avail) 1827 - + (sizeof(vq->vring.avail->ring[0]) 1828 - * vq->vring.num)); 1829 - vq->vring.used = check_pointer(vq->dev, 1830 - vq->pci_config.queue_used_lo, 1831 - sizeof(*vq->vring.used) 1832 - + (sizeof(vq->vring.used->ring[0]) 1833 - * vq->vring.num)); 1834 - 1835 - /* 1836 - * 2.4.9.1: 1837 - * 1838 - * The driver MUST initialize flags in the used ring to 0 1839 - * when allocating the used ring. 1840 - */ 1841 - if (vq->vring.used->flags != 0) 1842 - bad_driver_vq(vq, "invalid initial used.flags %#x", 1843 - vq->vring.used->flags); 1844 - } 1845 - 1846 - static void start_virtqueue(struct virtqueue *vq) 1847 - { 1848 - /* 1849 - * Create stack for thread. Since the stack grows upwards, we point 1850 - * the stack pointer to the end of this region. 
1851 - */ 1852 - char *stack = malloc(32768); 1853 - 1854 - /* Create a zero-initialized eventfd. */ 1855 - vq->eventfd = eventfd(0, 0); 1856 - if (vq->eventfd < 0) 1857 - err(1, "Creating eventfd"); 1858 - 1859 - /* 1860 - * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so 1861 - * we get a signal if it dies. 1862 - */ 1863 - vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); 1864 - if (vq->thread == (pid_t)-1) 1865 - err(1, "Creating clone"); 1866 - } 1867 - 1868 - static void start_virtqueues(struct device *d) 1869 - { 1870 - struct virtqueue *vq; 1871 - 1872 - for (vq = d->vq; vq; vq = vq->next) { 1873 - if (vq->pci_config.queue_enable) 1874 - start_virtqueue(vq); 1875 - } 1876 - } 1877 - 1878 - static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask) 1879 - { 1880 - struct virtqueue *vq; 1881 - 1882 - switch (off) { 1883 - case offsetof(struct virtio_pci_mmio, cfg.device_feature_select): 1884 - /* 1885 - * 4.1.4.3.1: 1886 - * 1887 - * The device MUST present the feature bits it is offering in 1888 - * device_feature, starting at bit device_feature_select ∗ 32 1889 - * for any device_feature_select written by the driver 1890 - */ 1891 - if (val == 0) 1892 - d->mmio->cfg.device_feature = d->features; 1893 - else if (val == 1) 1894 - d->mmio->cfg.device_feature = (d->features >> 32); 1895 - else 1896 - d->mmio->cfg.device_feature = 0; 1897 - goto feature_write_through32; 1898 - case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select): 1899 - if (val > 1) 1900 - bad_driver(d, "Unexpected driver select %u", val); 1901 - goto feature_write_through32; 1902 - case offsetof(struct virtio_pci_mmio, cfg.guest_feature): 1903 - if (d->mmio->cfg.guest_feature_select == 0) { 1904 - d->features_accepted &= ~((u64)0xFFFFFFFF); 1905 - d->features_accepted |= val; 1906 - } else { 1907 - assert(d->mmio->cfg.guest_feature_select == 1); 1908 - d->features_accepted &= 0xFFFFFFFF; 1909 - d->features_accepted |= 
((u64)val) << 32; 1910 - } 1911 - /* 1912 - * 2.2.1: 1913 - * 1914 - * The driver MUST NOT accept a feature which the device did 1915 - * not offer 1916 - */ 1917 - if (d->features_accepted & ~d->features) 1918 - bad_driver(d, "over-accepted features %#llx of %#llx", 1919 - d->features_accepted, d->features); 1920 - goto feature_write_through32; 1921 - case offsetof(struct virtio_pci_mmio, cfg.device_status): { 1922 - u8 prev; 1923 - 1924 - verbose("%s: device status -> %#x\n", d->name, val); 1925 - /* 1926 - * 4.1.4.3.1: 1927 - * 1928 - * The device MUST reset when 0 is written to device_status, 1929 - * and present a 0 in device_status once that is done. 1930 - */ 1931 - if (val == 0) { 1932 - reset_device(d); 1933 - goto write_through8; 1934 - } 1935 - 1936 - /* 2.1.1: The driver MUST NOT clear a device status bit. */ 1937 - if (d->mmio->cfg.device_status & ~val) 1938 - bad_driver(d, "unset of device status bit %#x -> %#x", 1939 - d->mmio->cfg.device_status, val); 1940 - 1941 - /* 1942 - * 2.1.2: 1943 - * 1944 - * The device MUST NOT consume buffers or notify the driver 1945 - * before DRIVER_OK. 1946 - */ 1947 - if (val & VIRTIO_CONFIG_S_DRIVER_OK 1948 - && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)) 1949 - start_virtqueues(d); 1950 - 1951 - /* 1952 - * 3.1.1: 1953 - * 1954 - * The driver MUST follow this sequence to initialize a device: 1955 - * - Reset the device. 1956 - * - Set the ACKNOWLEDGE status bit: the guest OS has 1957 - * notice the device. 1958 - * - Set the DRIVER status bit: the guest OS knows how 1959 - * to drive the device. 1960 - * - Read device feature bits, and write the subset 1961 - * of feature bits understood by the OS and driver 1962 - * to the device. During this step the driver MAY 1963 - * read (but MUST NOT write) the device-specific 1964 - * configuration fields to check that it can 1965 - * support the device before accepting it. 1966 - * - Set the FEATURES_OK status bit. 
The driver 1967 - * MUST not accept new feature bits after this 1968 - * step. 1969 - * - Re-read device status to ensure the FEATURES_OK 1970 - * bit is still set: otherwise, the device does 1971 - * not support our subset of features and the 1972 - * device is unusable. 1973 - * - Perform device-specific setup, including 1974 - * discovery of virtqueues for the device, 1975 - * optional per-bus setup, reading and possibly 1976 - * writing the device’s virtio configuration 1977 - * space, and population of virtqueues. 1978 - * - Set the DRIVER_OK status bit. At this point the 1979 - * device is “live”. 1980 - */ 1981 - prev = 0; 1982 - switch (val & ~d->mmio->cfg.device_status) { 1983 - case VIRTIO_CONFIG_S_DRIVER_OK: 1984 - prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */ 1985 - case VIRTIO_CONFIG_S_FEATURES_OK: 1986 - prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */ 1987 - case VIRTIO_CONFIG_S_DRIVER: 1988 - prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */ 1989 - case VIRTIO_CONFIG_S_ACKNOWLEDGE: 1990 - break; 1991 - default: 1992 - bad_driver(d, "unknown device status bit %#x -> %#x", 1993 - d->mmio->cfg.device_status, val); 1994 - } 1995 - if (d->mmio->cfg.device_status != prev) 1996 - bad_driver(d, "unexpected status transition %#x -> %#x", 1997 - d->mmio->cfg.device_status, val); 1998 - 1999 - /* If they just wrote FEATURES_OK, we make sure they read */ 2000 - switch (val & ~d->mmio->cfg.device_status) { 2001 - case VIRTIO_CONFIG_S_FEATURES_OK: 2002 - d->wrote_features_ok = true; 2003 - break; 2004 - case VIRTIO_CONFIG_S_DRIVER_OK: 2005 - if (d->wrote_features_ok) 2006 - bad_driver(d, "did not re-read FEATURES_OK"); 2007 - break; 2008 - } 2009 - goto write_through8; 2010 - } 2011 - case offsetof(struct virtio_pci_mmio, cfg.queue_select): 2012 - vq = vq_by_num(d, val); 2013 - /* 2014 - * 4.1.4.3.1: 2015 - * 2016 - * The device MUST present a 0 in queue_size if the virtqueue 2017 - * corresponding to the current queue_select is unavailable. 
2018 - */ 2019 - if (!vq) { 2020 - d->mmio->cfg.queue_size = 0; 2021 - goto write_through16; 2022 - } 2023 - /* Save registers for old vq, if it was a valid vq */ 2024 - if (d->mmio->cfg.queue_size) 2025 - save_vq_config(&d->mmio->cfg, 2026 - vq_by_num(d, d->mmio->cfg.queue_select)); 2027 - /* Restore the registers for the queue they asked for */ 2028 - restore_vq_config(&d->mmio->cfg, vq); 2029 - goto write_through16; 2030 - case offsetof(struct virtio_pci_mmio, cfg.queue_size): 2031 - /* 2032 - * 4.1.4.3.2: 2033 - * 2034 - * The driver MUST NOT write a value which is not a power of 2 2035 - * to queue_size. 2036 - */ 2037 - if (val & (val-1)) 2038 - bad_driver(d, "invalid queue size %u", val); 2039 - if (d->mmio->cfg.queue_enable) 2040 - bad_driver(d, "changing queue size on live device"); 2041 - goto write_through16; 2042 - case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector): 2043 - bad_driver(d, "attempt to set MSIX vector to %u", val); 2044 - case offsetof(struct virtio_pci_mmio, cfg.queue_enable): { 2045 - struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select); 2046 - 2047 - /* 2048 - * 4.1.4.3.2: 2049 - * 2050 - * The driver MUST NOT write a 0 to queue_enable. 2051 - */ 2052 - if (val != 1) 2053 - bad_driver(d, "setting queue_enable to %u", val); 2054 - 2055 - /* 2056 - * 3.1.1: 2057 - * 2058 - * 7. Perform device-specific setup, including discovery of 2059 - * virtqueues for the device, optional per-bus setup, 2060 - * reading and possibly writing the device’s virtio 2061 - * configuration space, and population of virtqueues. 2062 - * 8. Set the DRIVER_OK status bit. 2063 - * 2064 - * All our devices require all virtqueues to be enabled, so 2065 - * they should have done that before setting DRIVER_OK. 
2066 - */ 2067 - if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK) 2068 - bad_driver(d, "enabling vq after DRIVER_OK"); 2069 - 2070 - d->mmio->cfg.queue_enable = val; 2071 - save_vq_config(&d->mmio->cfg, vq); 2072 - check_virtqueue(d, vq); 2073 - goto write_through16; 2074 - } 2075 - case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off): 2076 - bad_driver(d, "attempt to write to queue_notify_off"); 2077 - case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo): 2078 - case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi): 2079 - case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo): 2080 - case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi): 2081 - case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo): 2082 - case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi): 2083 - /* 2084 - * 4.1.4.3.2: 2085 - * 2086 - * The driver MUST configure the other virtqueue fields before 2087 - * enabling the virtqueue with queue_enable. 2088 - */ 2089 - if (d->mmio->cfg.queue_enable) 2090 - bad_driver(d, "changing queue on live device"); 2091 - 2092 - /* 2093 - * 3.1.1: 2094 - * 2095 - * The driver MUST follow this sequence to initialize a device: 2096 - *... 2097 - * 5. Set the FEATURES_OK status bit. The driver MUST not 2098 - * accept new feature bits after this step. 2099 - */ 2100 - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)) 2101 - bad_driver(d, "setting up vq before FEATURES_OK"); 2102 - 2103 - /* 2104 - * 6. Re-read device status to ensure the FEATURES_OK bit is 2105 - * still set... 
2106 - */ 2107 - if (d->wrote_features_ok) 2108 - bad_driver(d, "didn't re-read FEATURES_OK before setup"); 2109 - 2110 - goto write_through32; 2111 - case offsetof(struct virtio_pci_mmio, notify): 2112 - vq = vq_by_num(d, val); 2113 - if (!vq) 2114 - bad_driver(d, "Invalid vq notification on %u", val); 2115 - /* Notify the process handling this vq by adding 1 to eventfd */ 2116 - write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8); 2117 - goto write_through16; 2118 - case offsetof(struct virtio_pci_mmio, isr): 2119 - bad_driver(d, "Unexpected write to isr"); 2120 - /* Weird corner case: write to emerg_wr of console */ 2121 - case sizeof(struct virtio_pci_mmio) 2122 - + offsetof(struct virtio_console_config, emerg_wr): 2123 - if (strcmp(d->name, "console") == 0) { 2124 - char c = val; 2125 - write(STDOUT_FILENO, &c, 1); 2126 - goto write_through32; 2127 - } 2128 - /* Fall through... */ 2129 - default: 2130 - /* 2131 - * 4.1.4.3.2: 2132 - * 2133 - * The driver MUST NOT write to device_feature, num_queues, 2134 - * config_generation or queue_notify_off. 2135 - */ 2136 - bad_driver(d, "Unexpected write to offset %u", off); 2137 - } 2138 - 2139 - feature_write_through32: 2140 - /* 2141 - * 3.1.1: 2142 - * 2143 - * The driver MUST follow this sequence to initialize a device: 2144 - *... 2145 - * - Set the DRIVER status bit: the guest OS knows how 2146 - * to drive the device. 2147 - * - Read device feature bits, and write the subset 2148 - * of feature bits understood by the OS and driver 2149 - * to the device. 2150 - *... 2151 - * - Set the FEATURES_OK status bit. The driver MUST not 2152 - * accept new feature bits after this step. 
2153 - */ 2154 - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) 2155 - bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER"); 2156 - if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK) 2157 - bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK"); 2158 - 2159 - /* 2160 - * 4.1.3.1: 2161 - * 2162 - * The driver MUST access each field using the “natural” access 2163 - * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for 2164 - * 16-bit fields and 8-bit accesses for 8-bit fields. 2165 - */ 2166 - write_through32: 2167 - if (mask != 0xFFFFFFFF) { 2168 - bad_driver(d, "non-32-bit write to offset %u (%#x)", 2169 - off, getreg(eip)); 2170 - return; 2171 - } 2172 - memcpy((char *)d->mmio + off, &val, 4); 2173 - return; 2174 - 2175 - write_through16: 2176 - if (mask != 0xFFFF) 2177 - bad_driver(d, "non-16-bit write to offset %u (%#x)", 2178 - off, getreg(eip)); 2179 - memcpy((char *)d->mmio + off, &val, 2); 2180 - return; 2181 - 2182 - write_through8: 2183 - if (mask != 0xFF) 2184 - bad_driver(d, "non-8-bit write to offset %u (%#x)", 2185 - off, getreg(eip)); 2186 - memcpy((char *)d->mmio + off, &val, 1); 2187 - return; 2188 - } 2189 - 2190 - static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask) 2191 - { 2192 - u8 isr; 2193 - u32 val = 0; 2194 - 2195 - switch (off) { 2196 - case offsetof(struct virtio_pci_mmio, cfg.device_feature_select): 2197 - case offsetof(struct virtio_pci_mmio, cfg.device_feature): 2198 - case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select): 2199 - case offsetof(struct virtio_pci_mmio, cfg.guest_feature): 2200 - /* 2201 - * 3.1.1: 2202 - * 2203 - * The driver MUST follow this sequence to initialize a device: 2204 - *... 2205 - * - Set the DRIVER status bit: the guest OS knows how 2206 - * to drive the device. 2207 - * - Read device feature bits, and write the subset 2208 - * of feature bits understood by the OS and driver 2209 - * to the device. 
2210 - */ 2211 - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) 2212 - bad_driver(d, 2213 - "feature read before VIRTIO_CONFIG_S_DRIVER"); 2214 - goto read_through32; 2215 - case offsetof(struct virtio_pci_mmio, cfg.msix_config): 2216 - bad_driver(d, "read of msix_config"); 2217 - case offsetof(struct virtio_pci_mmio, cfg.num_queues): 2218 - goto read_through16; 2219 - case offsetof(struct virtio_pci_mmio, cfg.device_status): 2220 - /* As they did read, any write of FEATURES_OK is now fine. */ 2221 - d->wrote_features_ok = false; 2222 - goto read_through8; 2223 - case offsetof(struct virtio_pci_mmio, cfg.config_generation): 2224 - /* 2225 - * 4.1.4.3.1: 2226 - * 2227 - * The device MUST present a changed config_generation after 2228 - * the driver has read a device-specific configuration value 2229 - * which has changed since any part of the device-specific 2230 - * configuration was last read. 2231 - * 2232 - * This is simple: none of our devices change config, so this 2233 - * is always 0. 2234 - */ 2235 - goto read_through8; 2236 - case offsetof(struct virtio_pci_mmio, notify): 2237 - /* 2238 - * 3.1.1: 2239 - * 2240 - * The driver MUST NOT notify the device before setting 2241 - * DRIVER_OK. 2242 - */ 2243 - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)) 2244 - bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK"); 2245 - goto read_through16; 2246 - case offsetof(struct virtio_pci_mmio, isr): 2247 - if (mask != 0xFF) 2248 - bad_driver(d, "non-8-bit read from offset %u (%#x)", 2249 - off, getreg(eip)); 2250 - isr = d->mmio->isr; 2251 - /* 2252 - * 4.1.4.5.1: 2253 - * 2254 - * The device MUST reset ISR status to 0 on driver read. 
2255 - */ 2256 - d->mmio->isr = 0; 2257 - return isr; 2258 - case offsetof(struct virtio_pci_mmio, padding): 2259 - bad_driver(d, "read from padding (%#x)", getreg(eip)); 2260 - default: 2261 - /* Read from device config space, beware unaligned overflow */ 2262 - if (off > d->mmio_size - 4) 2263 - bad_driver(d, "read past end (%#x)", getreg(eip)); 2264 - 2265 - /* 2266 - * 3.1.1: 2267 - * The driver MUST follow this sequence to initialize a device: 2268 - *... 2269 - * 3. Set the DRIVER status bit: the guest OS knows how to 2270 - * drive the device. 2271 - * 4. Read device feature bits, and write the subset of 2272 - * feature bits understood by the OS and driver to the 2273 - * device. During this step the driver MAY read (but MUST NOT 2274 - * write) the device-specific configuration fields to check 2275 - * that it can support the device before accepting it. 2276 - */ 2277 - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) 2278 - bad_driver(d, 2279 - "config read before VIRTIO_CONFIG_S_DRIVER"); 2280 - 2281 - if (mask == 0xFFFFFFFF) 2282 - goto read_through32; 2283 - else if (mask == 0xFFFF) 2284 - goto read_through16; 2285 - else 2286 - goto read_through8; 2287 - } 2288 - 2289 - /* 2290 - * 4.1.3.1: 2291 - * 2292 - * The driver MUST access each field using the “natural” access 2293 - * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for 2294 - * 16-bit fields and 8-bit accesses for 8-bit fields. 
2295 - */ 2296 - read_through32: 2297 - if (mask != 0xFFFFFFFF) 2298 - bad_driver(d, "non-32-bit read to offset %u (%#x)", 2299 - off, getreg(eip)); 2300 - memcpy(&val, (char *)d->mmio + off, 4); 2301 - return val; 2302 - 2303 - read_through16: 2304 - if (mask != 0xFFFF) 2305 - bad_driver(d, "non-16-bit read to offset %u (%#x)", 2306 - off, getreg(eip)); 2307 - memcpy(&val, (char *)d->mmio + off, 2); 2308 - return val; 2309 - 2310 - read_through8: 2311 - if (mask != 0xFF) 2312 - bad_driver(d, "non-8-bit read to offset %u (%#x)", 2313 - off, getreg(eip)); 2314 - memcpy(&val, (char *)d->mmio + off, 1); 2315 - return val; 2316 - } 2317 - 2318 - static void emulate_mmio(unsigned long paddr, const u8 *insn) 2319 - { 2320 - u32 val, off, mask = 0xFFFFFFFF, insnlen = 0; 2321 - struct device *d = find_mmio_region(paddr, &off); 2322 - unsigned long args[] = { LHREQ_TRAP, 14 }; 2323 - 2324 - if (!d) { 2325 - warnx("MMIO touching %#08lx (not a device)", paddr); 2326 - goto reinject; 2327 - } 2328 - 2329 - /* Prefix makes it a 16 bit op */ 2330 - if (insn[0] == 0x66) { 2331 - mask = 0xFFFF; 2332 - insnlen++; 2333 - } 2334 - 2335 - /* iowrite */ 2336 - if (insn[insnlen] == 0x89) { 2337 - /* Next byte is r/m byte: bits 3-5 are register. */ 2338 - val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask); 2339 - emulate_mmio_write(d, off, val, mask); 2340 - insnlen += 2 + insn_displacement_len(insn[insnlen+1]); 2341 - } else if (insn[insnlen] == 0x8b) { /* ioread */ 2342 - /* Next byte is r/m byte: bits 3-5 are register. */ 2343 - val = emulate_mmio_read(d, off, mask); 2344 - setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask); 2345 - insnlen += 2 + insn_displacement_len(insn[insnlen+1]); 2346 - } else if (insn[0] == 0x88) { /* 8-bit iowrite */ 2347 - mask = 0xff; 2348 - /* Next byte is r/m byte: bits 3-5 are register. 
*/ 2349 - val = getreg_num((insn[1] >> 3) & 0x7, mask); 2350 - emulate_mmio_write(d, off, val, mask); 2351 - insnlen = 2 + insn_displacement_len(insn[1]); 2352 - } else if (insn[0] == 0x8a) { /* 8-bit ioread */ 2353 - mask = 0xff; 2354 - val = emulate_mmio_read(d, off, mask); 2355 - setreg_num((insn[1] >> 3) & 0x7, val, mask); 2356 - insnlen = 2 + insn_displacement_len(insn[1]); 2357 - } else { 2358 - warnx("Unknown MMIO instruction touching %#08lx:" 2359 - " %02x %02x %02x %02x at %u", 2360 - paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip)); 2361 - reinject: 2362 - /* Inject trap into Guest. */ 2363 - if (write(lguest_fd, args, sizeof(args)) < 0) 2364 - err(1, "Reinjecting trap 14 for fault at %#x", 2365 - getreg(eip)); 2366 - return; 2367 - } 2368 - 2369 - /* Finally, we've "done" the instruction, so move past it. */ 2370 - setreg(eip, getreg(eip) + insnlen); 2371 - } 2372 - 2373 - /*L:190 2374 - * Device Setup 2375 - * 2376 - * All devices need a descriptor so the Guest knows it exists, and a "struct 2377 - * device" so the Launcher can keep track of it. We have common helper 2378 - * routines to allocate and manage them. 2379 - */ 2380 - static void add_pci_virtqueue(struct device *dev, 2381 - void (*service)(struct virtqueue *), 2382 - const char *name) 2383 - { 2384 - struct virtqueue **i, *vq = malloc(sizeof(*vq)); 2385 - 2386 - /* Initialize the virtqueue */ 2387 - vq->next = NULL; 2388 - vq->last_avail_idx = 0; 2389 - vq->dev = dev; 2390 - vq->name = name; 2391 - 2392 - /* 2393 - * This is the routine the service thread will run, and its Process ID 2394 - * once it's running. 2395 - */ 2396 - vq->service = service; 2397 - vq->thread = (pid_t)-1; 2398 - 2399 - /* Initialize the configuration. 
*/ 2400 - reset_vq_pci_config(vq); 2401 - vq->pci_config.queue_notify_off = 0; 2402 - 2403 - /* Add one to the number of queues */ 2404 - vq->dev->mmio->cfg.num_queues++; 2405 - 2406 - /* 2407 - * Add to tail of list, so dev->vq is first vq, dev->vq->next is 2408 - * second. 2409 - */ 2410 - for (i = &dev->vq; *i; i = &(*i)->next); 2411 - *i = vq; 2412 - } 2413 - 2414 - /* The Guest accesses the feature bits via the PCI common config MMIO region */ 2415 - static void add_pci_feature(struct device *dev, unsigned bit) 2416 - { 2417 - dev->features |= (1ULL << bit); 2418 - } 2419 - 2420 - /* For devices with no config. */ 2421 - static void no_device_config(struct device *dev) 2422 - { 2423 - dev->mmio_addr = get_mmio_region(dev->mmio_size); 2424 - 2425 - dev->config.bar[0] = dev->mmio_addr; 2426 - /* Bottom 4 bits must be zero */ 2427 - assert(~(dev->config.bar[0] & 0xF)); 2428 - } 2429 - 2430 - /* This puts the device config into BAR0 */ 2431 - static void set_device_config(struct device *dev, const void *conf, size_t len) 2432 - { 2433 - /* Set up BAR 0 */ 2434 - dev->mmio_size += len; 2435 - dev->mmio = realloc(dev->mmio, dev->mmio_size); 2436 - memcpy(dev->mmio + 1, conf, len); 2437 - 2438 - /* 2439 - * 4.1.4.6: 2440 - * 2441 - * The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG 2442 - * capability for any device type which has a device-specific 2443 - * configuration. 2444 - */ 2445 - /* Hook up device cfg */ 2446 - dev->config.cfg_access.cap.cap_next 2447 - = offsetof(struct pci_config, device); 2448 - 2449 - /* 2450 - * 4.1.4.6.1: 2451 - * 2452 - * The offset for the device-specific configuration MUST be 4-byte 2453 - * aligned. 2454 - */ 2455 - assert(dev->config.cfg_access.cap.cap_next % 4 == 0); 2456 - 2457 - /* Fix up device cfg field length. 
*/ 2458 - dev->config.device.length = len; 2459 - 2460 - /* The rest is the same as the no-config case */ 2461 - no_device_config(dev); 2462 - } 2463 - 2464 - static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type, 2465 - size_t bar_offset, size_t bar_bytes, u8 next) 2466 - { 2467 - cap->cap_vndr = PCI_CAP_ID_VNDR; 2468 - cap->cap_next = next; 2469 - cap->cap_len = caplen; 2470 - cap->cfg_type = type; 2471 - cap->bar = 0; 2472 - memset(cap->padding, 0, sizeof(cap->padding)); 2473 - cap->offset = bar_offset; 2474 - cap->length = bar_bytes; 2475 - } 2476 - 2477 - /* 2478 - * This sets up the pci_config structure, as defined in the virtio 1.0 2479 - * standard (and PCI standard). 2480 - */ 2481 - static void init_pci_config(struct pci_config *pci, u16 type, 2482 - u8 class, u8 subclass) 2483 - { 2484 - size_t bar_offset, bar_len; 2485 - 2486 - /* 2487 - * 4.1.4.4.1: 2488 - * 2489 - * The device MUST either present notify_off_multiplier as an even 2490 - * power of 2, or present notify_off_multiplier as 0. 2491 - * 2492 - * 2.1.2: 2493 - * 2494 - * The device MUST initialize device status to 0 upon reset. 2495 - */ 2496 - memset(pci, 0, sizeof(*pci)); 2497 - 2498 - /* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */ 2499 - pci->vendor_id = 0x1AF4; 2500 - /* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */ 2501 - pci->device_id = 0x1040 + type; 2502 - 2503 - /* 2504 - * PCI have specific codes for different types of devices. 2505 - * Linux doesn't care, but it's a good clue for people looking 2506 - * at the device. 2507 - */ 2508 - pci->class = class; 2509 - pci->subclass = subclass; 2510 - 2511 - /* 2512 - * 4.1.2.1: 2513 - * 2514 - * Non-transitional devices SHOULD have a PCI Revision ID of 1 or 2515 - * higher 2516 - */ 2517 - pci->revid = 1; 2518 - 2519 - /* 2520 - * 4.1.2.1: 2521 - * 2522 - * Non-transitional devices SHOULD have a PCI Subsystem Device ID of 2523 - * 0x40 or higher. 
2524 - */ 2525 - pci->subsystem_device_id = 0x40; 2526 - 2527 - /* We use our dummy interrupt controller, and irq_line is the irq */ 2528 - pci->irq_line = devices.next_irq++; 2529 - pci->irq_pin = 0; 2530 - 2531 - /* Support for extended capabilities. */ 2532 - pci->status = (1 << 4); 2533 - 2534 - /* Link them in. */ 2535 - /* 2536 - * 4.1.4.3.1: 2537 - * 2538 - * The device MUST present at least one common configuration 2539 - * capability. 2540 - */ 2541 - pci->capabilities = offsetof(struct pci_config, common); 2542 - 2543 - /* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */ 2544 - assert(pci->capabilities % 4 == 0); 2545 - 2546 - bar_offset = offsetof(struct virtio_pci_mmio, cfg); 2547 - bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg); 2548 - init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG, 2549 - bar_offset, bar_len, 2550 - offsetof(struct pci_config, notify)); 2551 - 2552 - /* 2553 - * 4.1.4.4.1: 2554 - * 2555 - * The device MUST present at least one notification capability. 2556 - */ 2557 - bar_offset += bar_len; 2558 - bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify); 2559 - 2560 - /* 2561 - * 4.1.4.4.1: 2562 - * 2563 - * The cap.offset MUST be 2-byte aligned. 2564 - */ 2565 - assert(pci->common.cap_next % 2 == 0); 2566 - 2567 - /* FIXME: Use a non-zero notify_off, for per-queue notification? */ 2568 - /* 2569 - * 4.1.4.4.1: 2570 - * 2571 - * The value cap.length presented by the device MUST be at least 2 and 2572 - * MUST be large enough to support queue notification offsets for all 2573 - * supported queues in all possible configurations. 
2574 - */ 2575 - assert(bar_len >= 2); 2576 - 2577 - init_cap(&pci->notify.cap, sizeof(pci->notify), 2578 - VIRTIO_PCI_CAP_NOTIFY_CFG, 2579 - bar_offset, bar_len, 2580 - offsetof(struct pci_config, isr)); 2581 - 2582 - bar_offset += bar_len; 2583 - bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr); 2584 - /* 2585 - * 4.1.4.5.1: 2586 - * 2587 - * The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG 2588 - * capability. 2589 - */ 2590 - init_cap(&pci->isr, sizeof(pci->isr), 2591 - VIRTIO_PCI_CAP_ISR_CFG, 2592 - bar_offset, bar_len, 2593 - offsetof(struct pci_config, cfg_access)); 2594 - 2595 - /* 2596 - * 4.1.4.7.1: 2597 - * 2598 - * The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG 2599 - * capability. 2600 - */ 2601 - /* This doesn't have any presence in the BAR */ 2602 - init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access), 2603 - VIRTIO_PCI_CAP_PCI_CFG, 2604 - 0, 0, 0); 2605 - 2606 - bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding); 2607 - assert(bar_offset == sizeof(struct virtio_pci_mmio)); 2608 - 2609 - /* 2610 - * This gets sewn in and length set in set_device_config(). 2611 - * Some devices don't have a device configuration interface, so 2612 - * we never expose this if we don't call set_device_config(). 2613 - */ 2614 - init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG, 2615 - bar_offset, 0, 0); 2616 - } 2617 - 2618 - /* 2619 - * This routine does all the creation and setup of a new device, but we don't 2620 - * actually place the MMIO region until we know the size (if any) of the 2621 - * device-specific config. And we don't actually start the service threads 2622 - * until later. 2623 - * 2624 - * See what I mean about userspace being boring? 2625 - */ 2626 - static struct device *new_pci_device(const char *name, u16 type, 2627 - u8 class, u8 subclass) 2628 - { 2629 - struct device *dev = malloc(sizeof(*dev)); 2630 - 2631 - /* Now we populate the fields one at a time. 
*/ 2632 - dev->name = name; 2633 - dev->vq = NULL; 2634 - dev->running = false; 2635 - dev->wrote_features_ok = false; 2636 - dev->mmio_size = sizeof(struct virtio_pci_mmio); 2637 - dev->mmio = calloc(1, dev->mmio_size); 2638 - dev->features = (u64)1 << VIRTIO_F_VERSION_1; 2639 - dev->features_accepted = 0; 2640 - 2641 - if (devices.device_num + 1 >= MAX_PCI_DEVICES) 2642 - errx(1, "Can only handle 31 PCI devices"); 2643 - 2644 - init_pci_config(&dev->config, type, class, subclass); 2645 - assert(!devices.pci[devices.device_num+1]); 2646 - devices.pci[++devices.device_num] = dev; 2647 - 2648 - return dev; 2649 - } 2650 - 2651 - /* 2652 - * Our first setup routine is the console. It's a fairly simple device, but 2653 - * UNIX tty handling makes it uglier than it could be. 2654 - */ 2655 - static void setup_console(void) 2656 - { 2657 - struct device *dev; 2658 - struct virtio_console_config conf; 2659 - 2660 - /* If we can save the initial standard input settings... */ 2661 - if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { 2662 - struct termios term = orig_term; 2663 - /* 2664 - * Then we turn off echo, line buffering and ^C etc: We want a 2665 - * raw input stream to the Guest. 2666 - */ 2667 - term.c_lflag &= ~(ISIG|ICANON|ECHO); 2668 - tcsetattr(STDIN_FILENO, TCSANOW, &term); 2669 - } 2670 - 2671 - dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00); 2672 - 2673 - /* We store the console state in dev->priv, and initialize it. */ 2674 - dev->priv = malloc(sizeof(struct console_abort)); 2675 - ((struct console_abort *)dev->priv)->count = 0; 2676 - 2677 - /* 2678 - * The console needs two virtqueues: the input then the output. When 2679 - * they put something the input queue, we make sure we're listening to 2680 - * stdin. When they put something in the output queue, we write it to 2681 - * stdout. 
2682 - */ 2683 - add_pci_virtqueue(dev, console_input, "input"); 2684 - add_pci_virtqueue(dev, console_output, "output"); 2685 - 2686 - /* We need a configuration area for the emerg_wr early writes. */ 2687 - add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE); 2688 - set_device_config(dev, &conf, sizeof(conf)); 2689 - 2690 - verbose("device %u: console\n", devices.device_num); 2691 - } 2692 - /*:*/ 2693 - 2694 - /*M:010 2695 - * Inter-guest networking is an interesting area. Simplest is to have a 2696 - * --sharenet=<name> option which opens or creates a named pipe. This can be 2697 - * used to send packets to another guest in a 1:1 manner. 2698 - * 2699 - * More sophisticated is to use one of the tools developed for project like UML 2700 - * to do networking. 2701 - * 2702 - * Faster is to do virtio bonding in kernel. Doing this 1:1 would be 2703 - * completely generic ("here's my vring, attach to your vring") and would work 2704 - * for any traffic. Of course, namespace and permissions issues need to be 2705 - * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide 2706 - * multiple inter-guest channels behind one interface, although it would 2707 - * require some manner of hotplugging new virtio channels. 2708 - * 2709 - * Finally, we could use a virtio network switch in the kernel, ie. vhost. 
2710 - :*/ 2711 - 2712 - static u32 str2ip(const char *ipaddr) 2713 - { 2714 - unsigned int b[4]; 2715 - 2716 - if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) 2717 - errx(1, "Failed to parse IP address '%s'", ipaddr); 2718 - return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; 2719 - } 2720 - 2721 - static void str2mac(const char *macaddr, unsigned char mac[6]) 2722 - { 2723 - unsigned int m[6]; 2724 - if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", 2725 - &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) 2726 - errx(1, "Failed to parse mac address '%s'", macaddr); 2727 - mac[0] = m[0]; 2728 - mac[1] = m[1]; 2729 - mac[2] = m[2]; 2730 - mac[3] = m[3]; 2731 - mac[4] = m[4]; 2732 - mac[5] = m[5]; 2733 - } 2734 - 2735 - /* 2736 - * This code is "adapted" from libbridge: it attaches the Host end of the 2737 - * network device to the bridge device specified by the command line. 2738 - * 2739 - * This is yet another James Morris contribution (I'm an IP-level guy, so I 2740 - * dislike bridging), and I just try not to break it. 2741 - */ 2742 - static void add_to_bridge(int fd, const char *if_name, const char *br_name) 2743 - { 2744 - int ifidx; 2745 - struct ifreq ifr; 2746 - 2747 - if (!*br_name) 2748 - errx(1, "must specify bridge name"); 2749 - 2750 - ifidx = if_nametoindex(if_name); 2751 - if (!ifidx) 2752 - errx(1, "interface %s does not exist!", if_name); 2753 - 2754 - strncpy(ifr.ifr_name, br_name, IFNAMSIZ); 2755 - ifr.ifr_name[IFNAMSIZ-1] = '\0'; 2756 - ifr.ifr_ifindex = ifidx; 2757 - if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) 2758 - err(1, "can't add %s to bridge %s", if_name, br_name); 2759 - } 2760 - 2761 - /* 2762 - * This sets up the Host end of the network device with an IP address, brings 2763 - * it up so packets will flow, the copies the MAC address into the hwaddr 2764 - * pointer. 
2765 - */ 2766 - static void configure_device(int fd, const char *tapif, u32 ipaddr) 2767 - { 2768 - struct ifreq ifr; 2769 - struct sockaddr_in sin; 2770 - 2771 - memset(&ifr, 0, sizeof(ifr)); 2772 - strcpy(ifr.ifr_name, tapif); 2773 - 2774 - /* Don't read these incantations. Just cut & paste them like I did! */ 2775 - sin.sin_family = AF_INET; 2776 - sin.sin_addr.s_addr = htonl(ipaddr); 2777 - memcpy(&ifr.ifr_addr, &sin, sizeof(sin)); 2778 - if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) 2779 - err(1, "Setting %s interface address", tapif); 2780 - ifr.ifr_flags = IFF_UP; 2781 - if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) 2782 - err(1, "Bringing interface %s up", tapif); 2783 - } 2784 - 2785 - static int get_tun_device(char tapif[IFNAMSIZ]) 2786 - { 2787 - struct ifreq ifr; 2788 - int vnet_hdr_sz; 2789 - int netfd; 2790 - 2791 - /* Start with this zeroed. Messy but sure. */ 2792 - memset(&ifr, 0, sizeof(ifr)); 2793 - 2794 - /* 2795 - * We open the /dev/net/tun device and tell it we want a tap device. A 2796 - * tap device is like a tun device, only somehow different. To tell 2797 - * the truth, I completely blundered my way through this code, but it 2798 - * works now! 2799 - */ 2800 - netfd = open_or_die("/dev/net/tun", O_RDWR); 2801 - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; 2802 - strcpy(ifr.ifr_name, "tap%d"); 2803 - if (ioctl(netfd, TUNSETIFF, &ifr) != 0) 2804 - err(1, "configuring /dev/net/tun"); 2805 - 2806 - if (ioctl(netfd, TUNSETOFFLOAD, 2807 - TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) 2808 - err(1, "Could not set features for tun device"); 2809 - 2810 - /* 2811 - * We don't need checksums calculated for packets coming in this 2812 - * device: trust us! 2813 - */ 2814 - ioctl(netfd, TUNSETNOCSUM, 1); 2815 - 2816 - /* 2817 - * In virtio before 1.0 (aka legacy virtio), we added a 16-bit 2818 - * field at the end of the network header iff 2819 - * VIRTIO_NET_F_MRG_RXBUF was negotiated. 
For virtio 1.0, 2820 - * that became the norm, but we need to tell the tun device 2821 - * about our expanded header (which is called 2822 - * virtio_net_hdr_mrg_rxbuf in the legacy system). 2823 - */ 2824 - vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1); 2825 - if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0) 2826 - err(1, "Setting tun header size to %u", vnet_hdr_sz); 2827 - 2828 - memcpy(tapif, ifr.ifr_name, IFNAMSIZ); 2829 - return netfd; 2830 - } 2831 - 2832 - /*L:195 2833 - * Our network is a Host<->Guest network. This can either use bridging or 2834 - * routing, but the principle is the same: it uses the "tun" device to inject 2835 - * packets into the Host as if they came in from a normal network card. We 2836 - * just shunt packets between the Guest and the tun device. 2837 - */ 2838 - static void setup_tun_net(char *arg) 2839 - { 2840 - struct device *dev; 2841 - struct net_info *net_info = malloc(sizeof(*net_info)); 2842 - int ipfd; 2843 - u32 ip = INADDR_ANY; 2844 - bool bridging = false; 2845 - char tapif[IFNAMSIZ], *p; 2846 - struct virtio_net_config conf; 2847 - 2848 - net_info->tunfd = get_tun_device(tapif); 2849 - 2850 - /* First we create a new network device. */ 2851 - dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00); 2852 - dev->priv = net_info; 2853 - 2854 - /* Network devices need a recv and a send queue, just like console. */ 2855 - add_pci_virtqueue(dev, net_input, "rx"); 2856 - add_pci_virtqueue(dev, net_output, "tx"); 2857 - 2858 - /* 2859 - * We need a socket to perform the magic network ioctls to bring up the 2860 - * tap interface, connect to the bridge etc. Any socket will do! 2861 - */ 2862 - ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 2863 - if (ipfd < 0) 2864 - err(1, "opening IP socket"); 2865 - 2866 - /* If the command line was --tunnet=bridge:<name> do bridging. 
*/ 2867 - if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { 2868 - arg += strlen(BRIDGE_PFX); 2869 - bridging = true; 2870 - } 2871 - 2872 - /* A mac address may follow the bridge name or IP address */ 2873 - p = strchr(arg, ':'); 2874 - if (p) { 2875 - str2mac(p+1, conf.mac); 2876 - add_pci_feature(dev, VIRTIO_NET_F_MAC); 2877 - *p = '\0'; 2878 - } 2879 - 2880 - /* arg is now either an IP address or a bridge name */ 2881 - if (bridging) 2882 - add_to_bridge(ipfd, tapif, arg); 2883 - else 2884 - ip = str2ip(arg); 2885 - 2886 - /* Set up the tun device. */ 2887 - configure_device(ipfd, tapif, ip); 2888 - 2889 - /* Expect Guest to handle everything except UFO */ 2890 - add_pci_feature(dev, VIRTIO_NET_F_CSUM); 2891 - add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM); 2892 - add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4); 2893 - add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6); 2894 - add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN); 2895 - add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4); 2896 - add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6); 2897 - add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN); 2898 - /* We handle indirect ring entries */ 2899 - add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); 2900 - set_device_config(dev, &conf, sizeof(conf)); 2901 - 2902 - /* We don't need the socket any more; setup is done. */ 2903 - close(ipfd); 2904 - 2905 - if (bridging) 2906 - verbose("device %u: tun %s attached to bridge: %s\n", 2907 - devices.device_num, tapif, arg); 2908 - else 2909 - verbose("device %u: tun %s: %s\n", 2910 - devices.device_num, tapif, arg); 2911 - } 2912 - /*:*/ 2913 - 2914 - /* This hangs off device->priv. */ 2915 - struct vblk_info { 2916 - /* The size of the file. */ 2917 - off64_t len; 2918 - 2919 - /* The file descriptor for the file. */ 2920 - int fd; 2921 - 2922 - }; 2923 - 2924 - /*L:210 2925 - * The Disk 2926 - * 2927 - * The disk only has one virtqueue, so it only has one thread. 
It is really 2928 - * simple: the Guest asks for a block number and we read or write that position 2929 - * in the file. 2930 - * 2931 - * Before we serviced each virtqueue in a separate thread, that was unacceptably 2932 - * slow: the Guest waits until the read is finished before running anything 2933 - * else, even if it could have been doing useful work. 2934 - * 2935 - * We could have used async I/O, except it's reputed to suck so hard that 2936 - * characters actually go missing from your code when you try to use it. 2937 - */ 2938 - static void blk_request(struct virtqueue *vq) 2939 - { 2940 - struct vblk_info *vblk = vq->dev->priv; 2941 - unsigned int head, out_num, in_num, wlen; 2942 - int ret, i; 2943 - u8 *in; 2944 - struct virtio_blk_outhdr out; 2945 - struct iovec iov[vq->vring.num]; 2946 - off64_t off; 2947 - 2948 - /* 2949 - * Get the next request, where we normally wait. It triggers the 2950 - * interrupt to acknowledge previously serviced requests (if any). 2951 - */ 2952 - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 2953 - 2954 - /* Copy the output header from the front of the iov (adjusts iov) */ 2955 - iov_consume(vq->dev, iov, out_num, &out, sizeof(out)); 2956 - 2957 - /* Find and trim end of iov input array, for our status byte. */ 2958 - in = NULL; 2959 - for (i = out_num + in_num - 1; i >= out_num; i--) { 2960 - if (iov[i].iov_len > 0) { 2961 - in = iov[i].iov_base + iov[i].iov_len - 1; 2962 - iov[i].iov_len--; 2963 - break; 2964 - } 2965 - } 2966 - if (!in) 2967 - bad_driver_vq(vq, "Bad virtblk cmd with no room for status"); 2968 - 2969 - /* 2970 - * For historical reasons, block operations are expressed in 512 byte 2971 - * "sectors". 2972 - */ 2973 - off = out.sector * 512; 2974 - 2975 - if (out.type & VIRTIO_BLK_T_OUT) { 2976 - /* 2977 - * Write 2978 - * 2979 - * Move to the right location in the block file. This can fail 2980 - * if they try to write past end. 
2981 - */ 2982 - if (lseek64(vblk->fd, off, SEEK_SET) != off) 2983 - err(1, "Bad seek to sector %llu", out.sector); 2984 - 2985 - ret = writev(vblk->fd, iov, out_num); 2986 - verbose("WRITE to sector %llu: %i\n", out.sector, ret); 2987 - 2988 - /* 2989 - * Grr... Now we know how long the descriptor they sent was, we 2990 - * make sure they didn't try to write over the end of the block 2991 - * file (possibly extending it). 2992 - */ 2993 - if (ret > 0 && off + ret > vblk->len) { 2994 - /* Trim it back to the correct length */ 2995 - ftruncate64(vblk->fd, vblk->len); 2996 - /* Die, bad Guest, die. */ 2997 - bad_driver_vq(vq, "Write past end %llu+%u", off, ret); 2998 - } 2999 - 3000 - wlen = sizeof(*in); 3001 - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 3002 - } else if (out.type & VIRTIO_BLK_T_FLUSH) { 3003 - /* Flush */ 3004 - ret = fdatasync(vblk->fd); 3005 - verbose("FLUSH fdatasync: %i\n", ret); 3006 - wlen = sizeof(*in); 3007 - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 3008 - } else { 3009 - /* 3010 - * Read 3011 - * 3012 - * Move to the right location in the block file. This can fail 3013 - * if they try to read past end. 3014 - */ 3015 - if (lseek64(vblk->fd, off, SEEK_SET) != off) 3016 - err(1, "Bad seek to sector %llu", out.sector); 3017 - 3018 - ret = readv(vblk->fd, iov + out_num, in_num); 3019 - if (ret >= 0) { 3020 - wlen = sizeof(*in) + ret; 3021 - *in = VIRTIO_BLK_S_OK; 3022 - } else { 3023 - wlen = sizeof(*in); 3024 - *in = VIRTIO_BLK_S_IOERR; 3025 - } 3026 - } 3027 - 3028 - /* Finished that request. */ 3029 - add_used(vq, head, wlen); 3030 - } 3031 - 3032 - /*L:198 This actually sets up a virtual block device. */ 3033 - static void setup_block_file(const char *filename) 3034 - { 3035 - struct device *dev; 3036 - struct vblk_info *vblk; 3037 - struct virtio_blk_config conf; 3038 - 3039 - /* Create the device. 
*/ 3040 - dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80); 3041 - 3042 - /* The device has one virtqueue, where the Guest places requests. */ 3043 - add_pci_virtqueue(dev, blk_request, "request"); 3044 - 3045 - /* Allocate the room for our own bookkeeping */ 3046 - vblk = dev->priv = malloc(sizeof(*vblk)); 3047 - 3048 - /* First we open the file and store the length. */ 3049 - vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); 3050 - vblk->len = lseek64(vblk->fd, 0, SEEK_END); 3051 - 3052 - /* Tell Guest how many sectors this device has. */ 3053 - conf.capacity = cpu_to_le64(vblk->len / 512); 3054 - 3055 - /* 3056 - * Tell Guest not to put in too many descriptors at once: two are used 3057 - * for the in and out elements. 3058 - */ 3059 - add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX); 3060 - conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); 3061 - 3062 - set_device_config(dev, &conf, sizeof(struct virtio_blk_config)); 3063 - 3064 - verbose("device %u: virtblock %llu sectors\n", 3065 - devices.device_num, le64_to_cpu(conf.capacity)); 3066 - } 3067 - 3068 - /*L:211 3069 - * Our random number generator device reads from /dev/urandom into the Guest's 3070 - * input buffers. The usual case is that the Guest doesn't want random numbers 3071 - * and so has no buffers although /dev/urandom is still readable, whereas 3072 - * console is the reverse. 3073 - * 3074 - * The same logic applies, however. 3075 - */ 3076 - struct rng_info { 3077 - int rfd; 3078 - }; 3079 - 3080 - static void rng_input(struct virtqueue *vq) 3081 - { 3082 - int len; 3083 - unsigned int head, in_num, out_num, totlen = 0; 3084 - struct rng_info *rng_info = vq->dev->priv; 3085 - struct iovec iov[vq->vring.num]; 3086 - 3087 - /* First we need a buffer from the Guests's virtqueue. 
*/ 3088 - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 3089 - if (out_num) 3090 - bad_driver_vq(vq, "Output buffers in rng?"); 3091 - 3092 - /* 3093 - * Just like the console write, we loop to cover the whole iovec. 3094 - * In this case, short reads actually happen quite a bit. 3095 - */ 3096 - while (!iov_empty(iov, in_num)) { 3097 - len = readv(rng_info->rfd, iov, in_num); 3098 - if (len <= 0) 3099 - err(1, "Read from /dev/urandom gave %i", len); 3100 - iov_consume(vq->dev, iov, in_num, NULL, len); 3101 - totlen += len; 3102 - } 3103 - 3104 - /* Tell the Guest about the new input. */ 3105 - add_used(vq, head, totlen); 3106 - } 3107 - 3108 - /*L:199 3109 - * This creates a "hardware" random number device for the Guest. 3110 - */ 3111 - static void setup_rng(void) 3112 - { 3113 - struct device *dev; 3114 - struct rng_info *rng_info = malloc(sizeof(*rng_info)); 3115 - 3116 - /* Our device's private info simply contains the /dev/urandom fd. */ 3117 - rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY); 3118 - 3119 - /* Create the new device. */ 3120 - dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0); 3121 - dev->priv = rng_info; 3122 - 3123 - /* The device has one virtqueue, where the Guest places inbufs. */ 3124 - add_pci_virtqueue(dev, rng_input, "input"); 3125 - 3126 - /* We don't have any configuration space */ 3127 - no_device_config(dev); 3128 - 3129 - verbose("device %u: rng\n", devices.device_num); 3130 - } 3131 - /* That's the end of device setup. */ 3132 - 3133 - /*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ 3134 - static void __attribute__((noreturn)) restart_guest(void) 3135 - { 3136 - unsigned int i; 3137 - 3138 - /* 3139 - * Since we don't track all open fds, we simply close everything beyond 3140 - * stderr. 3141 - */ 3142 - for (i = 3; i < FD_SETSIZE; i++) 3143 - close(i); 3144 - 3145 - /* Reset all the devices (kills all threads). 
*/ 3146 - cleanup_devices(); 3147 - 3148 - execv(main_args[0], main_args); 3149 - err(1, "Could not exec %s", main_args[0]); 3150 - } 3151 - 3152 - /*L:220 3153 - * Finally we reach the core of the Launcher which runs the Guest, serves 3154 - * its input and output, and finally, lays it to rest. 3155 - */ 3156 - static void __attribute__((noreturn)) run_guest(void) 3157 - { 3158 - for (;;) { 3159 - struct lguest_pending notify; 3160 - int readval; 3161 - 3162 - /* We read from the /dev/lguest device to run the Guest. */ 3163 - readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id); 3164 - if (readval == sizeof(notify)) { 3165 - if (notify.trap == 13) { 3166 - verbose("Emulating instruction at %#x\n", 3167 - getreg(eip)); 3168 - emulate_insn(notify.insn); 3169 - } else if (notify.trap == 14) { 3170 - verbose("Emulating MMIO at %#x\n", 3171 - getreg(eip)); 3172 - emulate_mmio(notify.addr, notify.insn); 3173 - } else 3174 - errx(1, "Unknown trap %i addr %#08x\n", 3175 - notify.trap, notify.addr); 3176 - /* ENOENT means the Guest died. Reading tells us why. */ 3177 - } else if (errno == ENOENT) { 3178 - char reason[1024] = { 0 }; 3179 - pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); 3180 - errx(1, "%s", reason); 3181 - /* ERESTART means that we need to reboot the guest */ 3182 - } else if (errno == ERESTART) { 3183 - restart_guest(); 3184 - /* Anything else means a bug or incompatible change. */ 3185 - } else 3186 - err(1, "Running guest failed"); 3187 - } 3188 - } 3189 - /*L:240 3190 - * This is the end of the Launcher. The good news: we are over halfway 3191 - * through! The bad news: the most fiendish part of the code still lies ahead 3192 - * of us. 3193 - * 3194 - * Are you ready? Take a deep breath and join me in the core of the Host, in 3195 - * "make Host". 
3196 - :*/ 3197 - 3198 - static struct option opts[] = { 3199 - { "verbose", 0, NULL, 'v' }, 3200 - { "tunnet", 1, NULL, 't' }, 3201 - { "block", 1, NULL, 'b' }, 3202 - { "rng", 0, NULL, 'r' }, 3203 - { "initrd", 1, NULL, 'i' }, 3204 - { "username", 1, NULL, 'u' }, 3205 - { "chroot", 1, NULL, 'c' }, 3206 - { NULL }, 3207 - }; 3208 - static void usage(void) 3209 - { 3210 - errx(1, "Usage: lguest [--verbose] " 3211 - "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n" 3212 - "|--block=<filename>|--initrd=<filename>]...\n" 3213 - "<mem-in-mb> vmlinux [args...]"); 3214 - } 3215 - 3216 - /*L:105 The main routine is where the real work begins: */ 3217 - int main(int argc, char *argv[]) 3218 - { 3219 - /* Memory, code startpoint and size of the (optional) initrd. */ 3220 - unsigned long mem = 0, start, initrd_size = 0; 3221 - /* Two temporaries. */ 3222 - int i, c; 3223 - /* The boot information for the Guest. */ 3224 - struct boot_params *boot; 3225 - /* If they specify an initrd file to load. */ 3226 - const char *initrd_name = NULL; 3227 - 3228 - /* Password structure for initgroups/setres[gu]id */ 3229 - struct passwd *user_details = NULL; 3230 - 3231 - /* Directory to chroot to */ 3232 - char *chroot_path = NULL; 3233 - 3234 - /* Save the args: we "reboot" by execing ourselves again. */ 3235 - main_args = argv; 3236 - 3237 - /* 3238 - * First we initialize the device list. We remember next interrupt 3239 - * number to use for devices (1: remember that 0 is used by the timer). 3240 - */ 3241 - devices.next_irq = 1; 3242 - 3243 - /* We're CPU 0. In fact, that's the only CPU possible right now. */ 3244 - cpu_id = 0; 3245 - 3246 - /* 3247 - * We need to know how much memory so we can set up the device 3248 - * descriptor and memory pages for the devices as we parse the command 3249 - * line. So we quickly look through the arguments to find the amount 3250 - * of memory now. 
3251 - */ 3252 - for (i = 1; i < argc; i++) { 3253 - if (argv[i][0] != '-') { 3254 - mem = atoi(argv[i]) * 1024 * 1024; 3255 - /* 3256 - * We start by mapping anonymous pages over all of 3257 - * guest-physical memory range. This fills it with 0, 3258 - * and ensures that the Guest won't be killed when it 3259 - * tries to access it. 3260 - */ 3261 - guest_base = map_zeroed_pages(mem / getpagesize() 3262 - + DEVICE_PAGES); 3263 - guest_limit = mem; 3264 - guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize(); 3265 - break; 3266 - } 3267 - } 3268 - 3269 - /* If we exit via err(), this kills all the threads, restores tty. */ 3270 - atexit(cleanup_devices); 3271 - 3272 - /* We always have a console device, and it's always device 1. */ 3273 - setup_console(); 3274 - 3275 - /* The options are fairly straight-forward */ 3276 - while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { 3277 - switch (c) { 3278 - case 'v': 3279 - verbose = true; 3280 - break; 3281 - case 't': 3282 - setup_tun_net(optarg); 3283 - break; 3284 - case 'b': 3285 - setup_block_file(optarg); 3286 - break; 3287 - case 'r': 3288 - setup_rng(); 3289 - break; 3290 - case 'i': 3291 - initrd_name = optarg; 3292 - break; 3293 - case 'u': 3294 - user_details = getpwnam(optarg); 3295 - if (!user_details) 3296 - err(1, "getpwnam failed, incorrect username?"); 3297 - break; 3298 - case 'c': 3299 - chroot_path = optarg; 3300 - break; 3301 - default: 3302 - warnx("Unknown argument %s", argv[optind]); 3303 - usage(); 3304 - } 3305 - } 3306 - /* 3307 - * After the other arguments we expect memory and kernel image name, 3308 - * followed by command line arguments for the kernel. 3309 - */ 3310 - if (optind + 2 > argc) 3311 - usage(); 3312 - 3313 - verbose("Guest base is at %p\n", guest_base); 3314 - 3315 - /* Initialize the (fake) PCI host bridge device. 
*/ 3316 - init_pci_host_bridge(); 3317 - 3318 - /* Now we load the kernel */ 3319 - start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); 3320 - 3321 - /* Boot information is stashed at physical address 0 */ 3322 - boot = from_guest_phys(0); 3323 - 3324 - /* Map the initrd image if requested (at top of physical memory) */ 3325 - if (initrd_name) { 3326 - initrd_size = load_initrd(initrd_name, mem); 3327 - /* 3328 - * These are the location in the Linux boot header where the 3329 - * start and size of the initrd are expected to be found. 3330 - */ 3331 - boot->hdr.ramdisk_image = mem - initrd_size; 3332 - boot->hdr.ramdisk_size = initrd_size; 3333 - /* The bootloader type 0xFF means "unknown"; that's OK. */ 3334 - boot->hdr.type_of_loader = 0xFF; 3335 - } 3336 - 3337 - /* 3338 - * The Linux boot header contains an "E820" memory map: ours is a 3339 - * simple, single region. 3340 - */ 3341 - boot->e820_entries = 1; 3342 - boot->e820_table[0] = ((struct e820_entry) { 0, mem, E820_TYPE_RAM }); 3343 - /* 3344 - * The boot header contains a command line pointer: we put the command 3345 - * line after the boot header. 3346 - */ 3347 - boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 3348 - /* We use a simple helper to copy the arguments separated by spaces. */ 3349 - concat((char *)(boot + 1), argv+optind+2); 3350 - 3351 - /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */ 3352 - boot->hdr.kernel_alignment = 0x1000000; 3353 - 3354 - /* Boot protocol version: 2.07 supports the fields for lguest. */ 3355 - boot->hdr.version = 0x207; 3356 - 3357 - /* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */ 3358 - boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST; 3359 - 3360 - /* Tell the entry path not to try to reload segment registers. 
*/ 3361 - boot->hdr.loadflags |= KEEP_SEGMENTS; 3362 - 3363 - /* We don't support tboot: */ 3364 - boot->tboot_addr = 0; 3365 - 3366 - /* Ensure this is 0 to prevent APM from loading: */ 3367 - boot->apm_bios_info.version = 0; 3368 - 3369 - /* We tell the kernel to initialize the Guest. */ 3370 - tell_kernel(start); 3371 - 3372 - /* Ensure that we terminate if a device-servicing child dies. */ 3373 - signal(SIGCHLD, kill_launcher); 3374 - 3375 - /* If requested, chroot to a directory */ 3376 - if (chroot_path) { 3377 - if (chroot(chroot_path) != 0) 3378 - err(1, "chroot(\"%s\") failed", chroot_path); 3379 - 3380 - if (chdir("/") != 0) 3381 - err(1, "chdir(\"/\") failed"); 3382 - 3383 - verbose("chroot done\n"); 3384 - } 3385 - 3386 - /* If requested, drop privileges */ 3387 - if (user_details) { 3388 - uid_t u; 3389 - gid_t g; 3390 - 3391 - u = user_details->pw_uid; 3392 - g = user_details->pw_gid; 3393 - 3394 - if (initgroups(user_details->pw_name, g) != 0) 3395 - err(1, "initgroups failed"); 3396 - 3397 - if (setresgid(g, g, g) != 0) 3398 - err(1, "setresgid failed"); 3399 - 3400 - if (setresuid(u, u, u) != 0) 3401 - err(1, "setresuid failed"); 3402 - 3403 - verbose("Dropping privileges completed\n"); 3404 - } 3405 - 3406 - /* Finally, run the Guest. This doesn't return. */ 3407 - run_guest(); 3408 - } 3409 - /*:*/ 3410 - 3411 - /*M:999 3412 - * Mastery is done: you now know everything I do. 3413 - * 3414 - * But surely you have seen code, features and bugs in your wanderings which 3415 - * you now yearn to attack? That is the real game, and I look forward to you 3416 - * patching and forking lguest into the Your-Name-Here-visor. 3417 - * 3418 - * Farewell, and good coding! 3419 - * Rusty Russell. 3420 - */

-125

tools/lguest/lguest.txt

··· 1 - __ 2 - (___()'`; Rusty's Remarkably Unreliable Guide to Lguest 3 - /, /` - or, A Young Coder's Illustrated Hypervisor 4 - \\"--\\ http://lguest.ozlabs.org 5 - 6 - Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel, 7 - for Linux developers and users to experiment with virtualization with the 8 - minimum of complexity. Nonetheless, it should have sufficient features to 9 - make it useful for specific tasks, and, of course, you are encouraged to fork 10 - and enhance it (see drivers/lguest/README). 11 - 12 - Features: 13 - 14 - - Kernel module which runs in a normal kernel. 15 - - Simple I/O model for communication. 16 - - Simple program to create new guests. 17 - - Logo contains cute puppies: http://lguest.ozlabs.org 18 - 19 - Developer features: 20 - 21 - - Fun to hack on. 22 - - No ABI: being tied to a specific kernel anyway, you can change anything. 23 - - Many opportunities for improvement or feature implementation. 24 - 25 - Running Lguest: 26 - 27 - - The easiest way to run lguest is to use the same kernel for both guest and host. 28 - You can configure them differently, but usually it's easiest not to. 
29 - 30 - You will need to configure your kernel with the following options: 31 - 32 - "Processor type and features": 33 - "Paravirtualized guest support" = Y 34 - "Lguest guest support" = Y 35 - "High Memory Support" = off/4GB 36 - "Alignment value to which kernel should be aligned" = 0x100000 37 - (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and 38 - CONFIG_PHYSICAL_ALIGN=0x100000) 39 - 40 - "Device Drivers": 41 - "Block devices" 42 - "Virtio block driver" = M/Y 43 - "Network device support" 44 - "Universal TUN/TAP device driver support" = M/Y 45 - "Virtio network driver" = M/Y 46 - (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m) 47 - 48 - "Virtualization" 49 - "Linux hypervisor example code" = M/Y 50 - (CONFIG_LGUEST=m) 51 - 52 - - A tool called "lguest" is available in this directory: type "make" 53 - to build it. If you didn't build your kernel in-tree, use "make 54 - O=<builddir>". 55 - 56 - - Create or find a root disk image. There are several useful ones 57 - around, such as the xm-test tiny root image at 58 - http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img 59 - 60 - For more serious work, I usually use a distribution ISO image and 61 - install it under qemu, then make multiple copies: 62 - 63 - dd if=/dev/zero of=rootfile bs=1M count=2048 64 - qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d 65 - 66 - Make sure that you install a getty on /dev/hvc0 if you want to log in on the 67 - console! 68 - 69 - - "modprobe lg" if you built it as a module. 70 - 71 - - Run an lguest as root: 72 - 73 - tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ 74 - --block=rootfile root=/dev/vda 75 - 76 - Explanation: 77 - 64: the amount of memory to use, in MB. 78 - 79 - vmlinux: the kernel image found in the top of your build directory. You 80 - can also use a standard bzImage. 81 - 82 - --tunnet=192.168.19.1: configures a "tap" device for networking with this 83 - IP address. 
84 - 85 - --block=rootfile: a file or block device which becomes /dev/vda 86 - inside the guest. 87 - 88 - root=/dev/vda: this (and anything else on the command line) are 89 - kernel boot parameters. 90 - 91 - - Configuring networking. I usually have the host masquerade, using 92 - "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 > 93 - /proc/sys/net/ipv4/ip_forward". In this example, I would configure 94 - eth0 inside the guest at 192.168.19.2. 95 - 96 - Another method is to bridge the tap device to an external interface 97 - using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest 98 - to obtain an IP address. The bridge needs to be configured first: 99 - this option simply adds the tap interface to it. 100 - 101 - A simple example on my system: 102 - 103 - ifconfig eth0 0.0.0.0 104 - brctl addbr lg0 105 - ifconfig lg0 up 106 - brctl addif lg0 eth0 107 - dhclient lg0 108 - 109 - Then use --tunnet=bridge:lg0 when launching the guest. 110 - 111 - See: 112 - 113 - http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge 114 - 115 - for general information on how to get bridging to work. 116 - 117 - - Random number generation. Using the --rng option will provide a 118 - /dev/hwrng in the guest that will read from the host's /dev/urandom. 119 - Use this option in conjunction with rng-tools (see ../hw_random.txt) 120 - to provide entropy to the guest kernel's /dev/random. 121 - 122 - There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest 123 - 124 - Good luck! 125 - Rusty Russell rusty@rustcorp.com.au.