lguest: comment documentation update. · tjh.dev/kernel@a6bd8e1

+38 -31

Documentation/lguest/lguest.c

··· 1 1 /*P:100 This is the Launcher code, a simple program which lays out the 2 - * "physical" memory for the new Guest by mapping the kernel image and the 3 - * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. 4 - :*/ 2 + * "physical" memory for the new Guest by mapping the kernel image and 3 + * the virtual devices, then opens /dev/lguest to tell the kernel 4 + * about the Guest and control it. :*/ 5 5 #define _LARGEFILE64_SOURCE 6 6 #define _GNU_SOURCE 7 7 #include <stdio.h> ··· 43 43 #include "linux/virtio_console.h" 44 44 #include "linux/virtio_ring.h" 45 45 #include "asm-x86/bootparam.h" 46 - /*L:110 We can ignore the 38 include files we need for this program, but I do 46 + /*L:110 We can ignore the 39 include files we need for this program, but I do 47 47 * want to draw attention to the use of kernel-style types. 48 48 * 49 49 * As Linus said, "C is a Spartan language, and so should your naming be." I ··· 320 320 err(1, "Reading program headers"); 321 321 322 322 /* Try all the headers: there are usually only three. A read-only one, 323 - * a read-write one, and a "note" section which isn't loadable. */ 323 + * a read-write one, and a "note" section which we don't load. */ 324 324 for (i = 0; i < ehdr->e_phnum; i++) { 325 325 /* If this isn't a loadable segment, we ignore it */ 326 326 if (phdr[i].p_type != PT_LOAD) ··· 387 387 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) 388 388 return map_elf(fd, &hdr); 389 389 390 - /* Otherwise we assume it's a bzImage, and try to unpack it */ 390 + /* Otherwise we assume it's a bzImage, and try to load it. */ 391 391 return load_bzimage(fd); 392 392 } 393 393 ··· 433 433 return len; 434 434 } 435 435 436 - /* Once we know how much memory we have, we can construct simple linear page 436 + /* Once we know how much memory we have we can construct simple linear page 437 437 * tables which set virtual == physical which will get the Guest far enough 438 438 * into the boot to create its own. 
439 439 * 440 440 * We lay them out of the way, just below the initrd (which is why we need to 441 - * know its size). */ 441 + * know its size here). */ 442 442 static unsigned long setup_pagetables(unsigned long mem, 443 443 unsigned long initrd_size) 444 444 { ··· 850 850 * 851 851 * Handling output for network is also simple: we get all the output buffers 852 852 * and write them (ignoring the first element) to this device's file descriptor 853 - * (stdout). */ 853 + * (/dev/net/tun). 854 + */ 854 855 static void handle_net_output(int fd, struct virtqueue *vq) 855 856 { 856 857 unsigned int head, out, in; ··· 925 924 write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); 926 925 } 927 926 928 - /* Resetting a device is fairly easy. */ 927 + /* When the Guest asks us to reset a device, it's is fairly easy. */ 929 928 static void reset_device(struct device *dev) 930 929 { 931 930 struct virtqueue *vq; ··· 1004 1003 if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) 1005 1004 break; 1006 1005 1007 - /* Otherwise, call the device(s) which have readable 1008 - * file descriptors and a method of handling them. */ 1006 + /* Otherwise, call the device(s) which have readable file 1007 + * descriptors and a method of handling them. */ 1009 1008 for (i = devices.dev; i; i = i->next) { 1010 1009 if (i->handle_input && FD_ISSET(i->fd, &fds)) { 1011 1010 int dev_fd; ··· 1016 1015 * should no longer service it. Networking and 1017 1016 * console do this when there's no input 1018 1017 * buffers to deliver into. Console also uses 1019 - * it when it discovers that stdin is 1020 - * closed. */ 1018 + * it when it discovers that stdin is closed. */ 1021 1019 FD_CLR(i->fd, &devices.infds); 1022 1020 /* Tell waker to ignore it too, by sending a 1023 1021 * negative fd number (-1, since 0 is a valid ··· 1033 1033 * 1034 1034 * All devices need a descriptor so the Guest knows it exists, and a "struct 1035 1035 * device" so the Launcher can keep track of it. 
We have common helper 1036 - * routines to allocate and manage them. */ 1036 + * routines to allocate and manage them. 1037 + */ 1037 1038 1038 1039 /* The layout of the device page is a "struct lguest_device_desc" followed by a 1039 1040 * number of virtqueue descriptors, then two sets of feature bits, then an ··· 1079 1078 struct virtqueue **i, *vq = malloc(sizeof(*vq)); 1080 1079 void *p; 1081 1080 1082 - /* First we need some pages for this virtqueue. */ 1081 + /* First we need some memory for this virtqueue. */ 1083 1082 pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1) 1084 1083 / getpagesize(); 1085 1084 p = get_pages(pages); ··· 1123 1122 } 1124 1123 1125 1124 /* The first half of the feature bitmask is for us to advertise features. The 1126 - * second half if for the Guest to accept features. */ 1125 + * second half is for the Guest to accept features. */ 1127 1126 static void add_feature(struct device *dev, unsigned bit) 1128 1127 { 1129 1128 u8 *features = get_feature_bits(dev); ··· 1152 1151 } 1153 1152 1154 1153 /* This routine does all the creation and setup of a new device, including 1155 - * calling new_dev_desc() to allocate the descriptor and device memory. */ 1154 + * calling new_dev_desc() to allocate the descriptor and device memory. 1155 + * 1156 + * See what I mean about userspace being boring? */ 1156 1157 static struct device *new_device(const char *name, u16 type, int fd, 1157 1158 bool (*handle_input)(int, struct device *)) 1158 1159 { ··· 1495 1492 while (read(vblk->workpipe[0], &c, 1) == 1) { 1496 1493 /* We acknowledge each request immediately to reduce latency, 1497 1494 * rather than waiting until we've done them all. I haven't 1498 - * measured to see if it makes any difference. */ 1495 + * measured to see if it makes any difference. 1496 + * 1497 + * That would be an interesting test, wouldn't it? You could 1498 + * also try having more than one I/O thread. 
*/ 1499 1499 while (service_io(dev)) 1500 1500 write(vblk->done_fd, &c, 1); 1501 1501 } ··· 1506 1500 } 1507 1501 1508 1502 /* Now we've seen the I/O thread, we return to the Launcher to see what happens 1509 - * when the thread tells us it's completed some I/O. */ 1503 + * when that thread tells us it's completed some I/O. */ 1510 1504 static bool handle_io_finish(int fd, struct device *dev) 1511 1505 { 1512 1506 char c; ··· 1578 1572 * more work. */ 1579 1573 pipe(vblk->workpipe); 1580 1574 1581 - /* Create stack for thread and run it */ 1575 + /* Create stack for thread and run it. Since stack grows upwards, we 1576 + * point the stack pointer to the end of this region. */ 1582 1577 stack = malloc(32768); 1583 1578 /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from 1584 1579 * becoming a zombie. */ 1585 - if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) 1580 + if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) 1586 1581 err(1, "Creating clone"); 1587 1582 1588 1583 /* We don't need to keep the I/O thread's end of the pipes open. */ ··· 1593 1586 verbose("device %u: virtblock %llu sectors\n", 1594 1587 devices.device_num, le64_to_cpu(conf.capacity)); 1595 1588 } 1596 - /* That's the end of device setup. :*/ 1589 + /* That's the end of device setup. */ 1597 1590 1598 - /* Reboot */ 1591 + /*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ 1599 1592 static void __attribute__((noreturn)) restart_guest(void) 1600 1593 { 1601 1594 unsigned int i; 1602 1595 1603 - /* Closing pipes causes the waker thread and io_threads to die, and 1596 + /* Closing pipes causes the Waker thread and io_threads to die, and 1604 1597 * closing /dev/lguest cleans up the Guest. Since we don't track all 1605 1598 * open fds, we simply close everything beyond stderr. 
*/ 1606 1599 for (i = 3; i < FD_SETSIZE; i++) ··· 1609 1602 err(1, "Could not exec %s", main_args[0]); 1610 1603 } 1611 1604 1612 - /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves 1605 + /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves 1613 1606 * its input and output, and finally, lays it to rest. */ 1614 1607 static void __attribute__((noreturn)) run_guest(int lguest_fd) 1615 1608 { ··· 1650 1643 err(1, "Resetting break"); 1651 1644 } 1652 1645 } 1653 - /* 1646 + /*L:240 1654 1647 * This is the end of the Launcher. The good news: we are over halfway 1655 1648 * through! The bad news: the most fiendish part of the code still lies ahead 1656 1649 * of us. ··· 1697 1690 * device receive input from a file descriptor, we keep an fdset 1698 1691 * (infds) and the maximum fd number (max_infd) with the head of the 1699 1692 * list. We also keep a pointer to the last device. Finally, we keep 1700 - * the next interrupt number to hand out (1: remember that 0 is used by 1701 - * the timer). */ 1693 + * the next interrupt number to use for devices (1: remember that 0 is 1694 + * used by the timer). */ 1702 1695 FD_ZERO(&devices.infds); 1703 1696 devices.max_infd = -1; 1704 1697 devices.lastdev = NULL; ··· 1799 1792 lguest_fd = tell_kernel(pgdir, start); 1800 1793 1801 1794 /* We fork off a child process, which wakes the Launcher whenever one 1802 - * of the input file descriptors needs attention. Otherwise we would 1803 - * run the Guest until it tries to output something. */ 1795 + * of the input file descriptors needs attention. We call this the 1796 + * Waker, and we'll cover it in a moment. */ 1804 1797 waker_fd = setup_waker(lguest_fd); 1805 1798 1806 1799 /* Finally, run the Guest. This doesn't return. */

+62 -46

arch/x86/lguest/boot.c

··· 10 10 * (such as the example in Documentation/lguest/lguest.c) is called the 11 11 * Launcher. 12 12 * 13 - * Secondly, we only run specially modified Guests, not normal kernels. When 14 - * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets 15 - * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows 16 - * how to be a Guest. This means that you can use the same kernel you boot 17 - * normally (ie. as a Host) as a Guest. 13 + * Secondly, we only run specially modified Guests, not normal kernels: setting 14 + * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows 15 + * how to be a Guest at boot time. This means that you can use the same kernel 16 + * you boot normally (ie. as a Host) as a Guest. 18 17 * 19 18 * These Guests know that they cannot do privileged operations, such as disable 20 19 * interrupts, and that they have to ask the Host to do such things explicitly. 21 20 * This file consists of all the replacements for such low-level native 22 21 * hardware operations: these special Guest versions call the Host. 23 22 * 24 - * So how does the kernel know it's a Guest? The Guest starts at a special 25 - * entry point marked with a magic string, which sets up a few things then 26 - * calls here. We replace the native functions various "paravirt" structures 27 - * with our Guest versions, then boot like normal. :*/ 23 + * So how does the kernel know it's a Guest? We'll see that later, but let's 24 + * just say that we end up here where we replace the native functions various 25 + * "paravirt" structures with our Guest versions, then boot like normal. :*/ 28 26 29 27 /* 30 28 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. ··· 132 134 * lguest_leave_lazy_mode(). 133 135 * 134 136 * So, when we're in lazy mode, we call async_hcall() to store the call for 135 - * future processing. 
*/ 137 + * future processing: */ 136 138 static void lazy_hcall(unsigned long call, 137 139 unsigned long arg1, 138 140 unsigned long arg2, ··· 145 147 } 146 148 147 149 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then 148 - * issue a hypercall to flush any stored calls. */ 150 + * issue the do-nothing hypercall to flush any stored calls. */ 149 151 static void lguest_leave_lazy_mode(void) 150 152 { 151 153 paravirt_leave_lazy(paravirt_get_lazy_mode()); ··· 162 164 * 163 165 * So instead we keep an "irq_enabled" field inside our "struct lguest_data", 164 166 * which the Guest can update with a single instruction. The Host knows to 165 - * check there when it wants to deliver an interrupt. 167 + * check there before it tries to deliver an interrupt. 166 168 */ 167 169 168 170 /* save_flags() is expected to return the processor state (ie. "flags"). The ··· 194 196 /*M:003 Note that we don't check for outstanding interrupts when we re-enable 195 197 * them (or when we unmask an interrupt). This seems to work for the moment, 196 198 * since interrupts are rare and we'll just get the interrupt on the next timer 197 - * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way 199 + * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way 198 200 * would be to put the "irq_enabled" field in a page by itself, and have the 199 201 * Host write-protect it when an interrupt comes in when irqs are disabled. 200 - * There will then be a page fault as soon as interrupts are re-enabled. :*/ 202 + * There will then be a page fault as soon as interrupts are re-enabled. 203 + * 204 + * A better method is to implement soft interrupt disable generally for x86: 205 + * instead of disabling interrupts, we set a flag. If an interrupt does come 206 + * in, we then disable them for real. This is uncommon, so we could simply use 207 + * a hypercall for interrupt control and not worry about efficiency. 
:*/ 201 208 202 209 /*G:034 203 210 * The Interrupt Descriptor Table (IDT). ··· 215 212 static void lguest_write_idt_entry(gate_desc *dt, 216 213 int entrynum, const gate_desc *g) 217 214 { 215 + /* The gate_desc structure is 8 bytes long: we hand it to the Host in 216 + * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors 217 + * around like this; typesafety wasn't a big concern in Linux's early 218 + * years. */ 218 219 u32 *desc = (u32 *)g; 219 220 /* Keep the local copy up to date. */ 220 221 native_write_idt_entry(dt, entrynum, g); ··· 250 243 * 251 244 * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY 252 245 * hypercall and use that repeatedly to load a new IDT. I don't think it 253 - * really matters, but wouldn't it be nice if they were the same? 246 + * really matters, but wouldn't it be nice if they were the same? Wouldn't 247 + * it be even better if you were the one to send the patch to fix it? 254 248 */ 255 249 static void lguest_load_gdt(const struct desc_ptr *desc) 256 250 { ··· 306 298 307 299 /* The "cpuid" instruction is a way of querying both the CPU identity 308 300 * (manufacturer, model, etc) and its features. It was introduced before the 309 - * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you 310 - * might imagine, after a decade and a half this treatment, it is now a giant 311 - * ball of hair. Its entry in the current Intel manual runs to 28 pages. 301 + * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. 302 + * As you might imagine, after a decade and a half this treatment, it is now a 303 + * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. 312 304 * 313 305 * This instruction even it has its own Wikipedia entry. The Wikipedia entry 314 306 * has been translated into 4 languages. I am not making this up! ··· 602 594 return lguest_data.time.tv_sec; 603 595 } 604 596 605 - /* The TSC is a Time Stamp Counter. 
The Host tells us what speed it runs at, 606 - * or 0 if it's unusable as a reliable clock source. This matches what we want 607 - * here: if we return 0 from this function, the x86 TSC clock will not register 608 - * itself. */ 597 + /* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 598 + * what speed it runs at, or 0 if it's unusable as a reliable clock source. 599 + * This matches what we want here: if we return 0 from this function, the x86 600 + * TSC clock will give up and not register itself. */ 609 601 static unsigned long lguest_cpu_khz(void) 610 602 { 611 603 return lguest_data.tsc_khz; 612 604 } 613 605 614 - /* If we can't use the TSC, the kernel falls back to our "lguest_clock", where 615 - * we read the time value given to us by the Host. */ 606 + /* If we can't use the TSC, the kernel falls back to our lower-priority 607 + * "lguest_clock", where we read the time value given to us by the Host. */ 616 608 static cycle_t lguest_clock_read(void) 617 609 { 618 610 unsigned long sec, nsec; ··· 656 648 static int lguest_clockevent_set_next_event(unsigned long delta, 657 649 struct clock_event_device *evt) 658 650 { 651 + /* FIXME: I don't think this can ever happen, but James tells me he had 652 + * to put this code in. Maybe we should remove it now. Anyone? */ 659 653 if (delta < LG_CLOCK_MIN_DELTA) { 660 654 if (printk_ratelimit()) 661 655 printk(KERN_DEBUG "%s: small delta %lu ns\n", 662 656 __FUNCTION__, delta); 663 657 return -ETIME; 664 658 } 659 + 660 + /* Please wake us this far in the future. */ 665 661 hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); 666 662 return 0; 667 663 } ··· 750 738 * will not tolerate us trying to use that), the stack pointer, and the number 751 739 * of pages in the stack. 
*/ 752 740 static void lguest_load_sp0(struct tss_struct *tss, 753 - struct thread_struct *thread) 741 + struct thread_struct *thread) 754 742 { 755 743 lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0, 756 744 THREAD_SIZE/PAGE_SIZE); ··· 798 786 hcall(LHCALL_HALT, 0, 0, 0); 799 787 } 800 788 801 - /* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a 802 - * message out when we're crashing as well as elegant termination like powering 803 - * off. 789 + /* The SHUTDOWN hypercall takes a string to describe what's happening, and 790 + * an argument which says whether this to restart (reboot) the Guest or not. 804 791 * 805 792 * Note that the Host always prefers that the Guest speak in physical addresses 806 793 * rather than virtual addresses, so we use __pa() here. */ ··· 827 816 /* Setting up memory is fairly easy. */ 828 817 static __init char *lguest_memory_setup(void) 829 818 { 830 - /* We do this here and not earlier because lockcheck barfs if we do it 831 - * before start_kernel() */ 819 + /* We do this here and not earlier because lockcheck used to barf if we 820 + * did it before start_kernel(). I think we fixed that, so it'd be 821 + * nice to move it back to lguest_init. Patch welcome... */ 832 822 atomic_notifier_chain_register(&panic_notifier_list, &paniced); 833 823 834 824 /* The Linux bootloader header contains an "e820" memory map: the ··· 862 850 return len; 863 851 } 864 852 853 + /* Rebooting also tells the Host we're finished, but the RESTART flag tells the 854 + * Launcher to reboot us. */ 855 + static void lguest_restart(char *reason) 856 + { 857 + hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); 858 + } 859 + 865 860 /*G:050 866 861 * Patching (Powerfully Placating Performance Pedants) 867 862 * 868 - * We have already seen that pv_ops structures let us replace simple 869 - * native instructions with calls to the appropriate back end all throughout 870 - * the kernel. 
This allows the same kernel to run as a Guest and as a native 863 + * We have already seen that pv_ops structures let us replace simple native 864 + * instructions with calls to the appropriate back end all throughout the 865 + * kernel. This allows the same kernel to run as a Guest and as a native 871 866 * kernel, but it's slow because of all the indirect branches. 872 867 * 873 868 * Remember that David Wheeler quote about "Any problem in computer science can ··· 927 908 return insn_len; 928 909 } 929 910 930 - static void lguest_restart(char *reason) 931 - { 932 - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); 933 - } 934 - 935 - /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops 936 - * structures in the kernel provide points for (almost) every routine we have 937 - * to override to avoid privileged instructions. */ 911 + /*G:030 Once we get to lguest_init(), we know we're a Guest. The various 912 + * pv_ops structures in the kernel provide points for (almost) every routine we 913 + * have to override to avoid privileged instructions. */ 938 914 __init void lguest_init(void) 939 915 { 940 916 /* We're under lguest, paravirt is enabled, and we're running at ··· 1017 1003 * the normal data segment to get through booting. */ 1018 1004 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1019 1005 1020 - /* The Host uses the top of the Guest's virtual address space for the 1021 - * Host<->Guest Switcher, and it tells us how big that is in 1022 - * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ 1006 + /* The Host<->Guest Switcher lives at the top of our address space, and 1007 + * the Host told us how big it is when we made LGUEST_INIT hypercall: 1008 + * it put the answer in lguest_data.reserve_mem */ 1023 1009 reserve_top_address(lguest_data.reserve_mem); 1024 1010 1025 1011 /* If we don't initialize the lock dependency checker now, it crashes ··· 1041 1027 /* Math is always hard! 
*/ 1042 1028 new_cpu_data.hard_math = 1; 1043 1029 1030 + /* We don't have features. We have puppies! Puppies! */ 1044 1031 #ifdef CONFIG_X86_MCE 1045 1032 mce_disabled = 1; 1046 1033 #endif ··· 1059 1044 virtio_cons_early_init(early_put_chars); 1060 1045 1061 1046 /* Last of all, we set the power management poweroff hook to point to 1062 - * the Guest routine to power off. */ 1047 + * the Guest routine to power off, and the reboot hook to our restart 1048 + * routine. */ 1063 1049 pm_power_off = lguest_power_off; 1064 - 1065 1050 machine_ops.restart = lguest_restart; 1051 + 1066 1052 /* Now we're set up, call start_kernel() in init/main.c and we proceed 1067 1053 * to boot as normal. It never returns. */ 1068 1054 start_kernel();

+11 -4

arch/x86/lguest/i386_head.S

··· 5 5 #include <asm/thread_info.h> 6 6 #include <asm/processor-flags.h> 7 7 8 - /*G:020 This is where we begin: head.S notes that the boot header's platform 9 - * type field is "1" (lguest), so calls us here. 8 + /*G:020 Our story starts with the kernel booting into startup_32 in 9 + * arch/x86/kernel/head_32.S. It expects a boot header, which is created by 10 + * the bootloader (the Launcher in our case). 11 + * 12 + * The startup_32 function does very little: it clears the uninitialized global 13 + * C variables which we expect to be zero (ie. BSS) and then copies the boot 14 + * header and kernel command line somewhere safe. Finally it checks the 15 + * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: 16 + * if it's set to '1' (lguest's assigned number), then it calls us here. 10 17 * 11 18 * WARNING: be very careful here! We're running at addresses equal to physical 12 19 * addresses (around 0), not above PAGE_OFFSET as most code expects 13 20 * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any 14 - * data. 21 + * data without remembering to subtract __PAGE_OFFSET! 15 22 * 16 23 * The .section line puts this code in .init.text so it will be discarded after 17 24 * boot. */ ··· 31 24 int $LGUEST_TRAP_ENTRY 32 25 33 26 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl 34 - * instruction uses %esi implicitly as the source for the copy we' 27 + * instruction uses %esi implicitly as the source for the copy we're 35 28 * about to do. */ 36 29 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi 37 30

+9 -9

drivers/lguest/core.c

··· 1 1 /*P:400 This contains run_guest() which actually calls into the Host<->Guest 2 2 * Switcher and analyzes the return, such as determining if the Guest wants the 3 - * Host to do something. This file also contains useful helper routines, and a 4 - * couple of non-obvious setup and teardown pieces which were implemented after 5 - * days of debugging pain. :*/ 3 + * Host to do something. This file also contains useful helper routines. :*/ 6 4 #include <linux/module.h> 7 5 #include <linux/stringify.h> 8 6 #include <linux/stddef.h> ··· 47 49 * easy. 48 50 */ 49 51 50 - /* We allocate an array of "struct page"s. map_vm_area() wants the 51 - * pages in this form, rather than just an array of pointers. */ 52 + /* We allocate an array of struct page pointers. map_vm_area() wants 53 + * this, rather than just an array of pages. */ 52 54 switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, 53 55 GFP_KERNEL); 54 56 if (!switcher_page) { ··· 170 172 } 171 173 } 172 174 173 - /* This is the write (copy into guest) version. */ 175 + /* This is the write (copy into Guest) version. */ 174 176 void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, 175 177 unsigned bytes) 176 178 { ··· 207 209 if (cpu->break_out) 208 210 return -EAGAIN; 209 211 210 - /* Check if there are any interrupts which can be delivered 211 - * now: if so, this sets up the handler to be executed when we 212 - * next run the Guest. */ 212 + /* Check if there are any interrupts which can be delivered now: 213 + * if so, this sets up the handler to be executed when we next 214 + * run the Guest. */ 213 215 maybe_do_interrupt(cpu); 214 216 215 217 /* All long-lived kernel loops need to check with this horrible ··· 244 246 lguest_arch_handle_trap(cpu); 245 247 } 246 248 249 + /* Special case: Guest is 'dead' but wants a reboot. 
*/ 247 250 if (cpu->lg->dead == ERR_PTR(-ERESTART)) 248 251 return -ERESTART; 252 + 249 253 /* The Guest is dead => "No such file or directory" */ 250 254 return -ENOENT; 251 255 }

+9 -2

drivers/lguest/hypercalls.c

··· 29 29 #include "lg.h" 30 30 31 31 /*H:120 This is the core hypercall routine: where the Guest gets what it wants. 32 - * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ 32 + * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ 33 33 static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) 34 34 { 35 35 switch (args->arg0) { ··· 190 190 * pagetable. */ 191 191 guest_pagetable_clear_all(cpu); 192 192 } 193 + /*:*/ 194 + 195 + /*M:013 If a Guest reads from a page (so creates a mapping) that it has never 196 + * written to, and then the Launcher writes to it (ie. the output of a virtual 197 + * device), the Guest will still see the old page. In practice, this never 198 + * happens: why would the Guest read a page which it has never written to? But 199 + * a similar scenario might one day bite us, so it's worth mentioning. :*/ 193 200 194 201 /*H:100 195 202 * Hypercalls ··· 234 227 * However, if we are signalled or the Guest sends I/O to the 235 228 * Launcher, the run_guest() loop will exit without running the 236 229 * Guest. When it comes back it would try to re-run the 237 - * hypercall. */ 230 + * hypercall. Finding that bug sucked. */ 238 231 cpu->hcall = NULL; 239 232 } 240 233 }

+3 -4

drivers/lguest/interrupts_and_traps.c

··· 144 144 if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, 145 145 sizeof(blk))) 146 146 return; 147 - 148 147 bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); 149 148 150 149 /* Find the first interrupt. */ ··· 236 237 clear_bit(syscall_vector, used_vectors); 237 238 } 238 239 239 - /*H:220 Now we've got the routines to deliver interrupts, delivering traps 240 - * like page fault is easy. The only trick is that Intel decided that some 241 - * traps should have error codes: */ 240 + /*H:220 Now we've got the routines to deliver interrupts, delivering traps like 241 + * page fault is easy. The only trick is that Intel decided that some traps 242 + * should have error codes: */ 242 243 static int has_err(unsigned int trap) 243 244 { 244 245 return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);

+6 -5

drivers/lguest/lguest_device.c

··· 1 1 /*P:050 Lguest guests use a very simple method to describe devices. It's a 2 - * series of device descriptors contained just above the top of normal 2 + * series of device descriptors contained just above the top of normal Guest 3 3 * memory. 4 4 * 5 5 * We use the standard "virtio" device infrastructure, which provides us with a 6 6 * console, a network and a block driver. Each one expects some configuration 7 - * information and a "virtqueue" mechanism to send and receive data. :*/ 7 + * information and a "virtqueue" or two to send and receive data. :*/ 8 8 #include <linux/init.h> 9 9 #include <linux/bootmem.h> 10 10 #include <linux/lguest_launcher.h> ··· 53 53 * Device configurations 54 54 * 55 55 * The configuration information for a device consists of one or more 56 - * virtqueues, a feature bitmaks, and some configuration bytes. The 56 + * virtqueues, a feature bitmap, and some configuration bytes. The 57 57 * configuration bytes don't really matter to us: the Launcher sets them up, and 58 58 * the driver will look at them during setup. 59 59 * ··· 179 179 }; 180 180 181 181 /* When the virtio_ring code wants to prod the Host, it calls us here and we 182 - * make a hypercall. We hand the page number of the virtqueue so the Host 182 + * make a hypercall. We hand the physical address of the virtqueue so the Host 183 183 * knows which virtqueue we're talking about. */ 184 184 static void lg_notify(struct virtqueue *vq) 185 185 { ··· 199 199 * allocate its own pages and tell the Host where they are, but for lguest it's 200 200 * simpler for the Host to simply tell us where the pages are. 201 201 * 202 - * So we provide devices with a "find virtqueue and set it up" function. */ 202 + * So we provide drivers with a "find the Nth virtqueue and set it up" 203 + * function. */ 203 204 static struct virtqueue *lg_find_vq(struct virtio_device *vdev, 204 205 unsigned index, 205 206 void (*callback)(struct virtqueue *vq))

+21 -9

drivers/lguest/lguest_user.c

··· 73 73 if (current != cpu->tsk) 74 74 return -EPERM; 75 75 76 - /* If the guest is already dead, we indicate why */ 76 + /* If the Guest is already dead, we indicate why */ 77 77 if (lg->dead) { 78 78 size_t len; 79 79 ··· 88 88 return len; 89 89 } 90 90 91 - /* If we returned from read() last time because the Guest notified, 91 + /* If we returned from read() last time because the Guest sent I/O, 92 92 * clear the flag. */ 93 93 if (cpu->pending_notify) 94 94 cpu->pending_notify = 0; ··· 97 97 return run_guest(cpu, (unsigned long __user *)user); 98 98 } 99 99 100 + /*L:025 This actually initializes a CPU. For the moment, a Guest is only 101 + * uniprocessor, so "id" is always 0. */ 100 102 static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) 101 103 { 104 + /* We have a limited number of CPUs in the lguest struct. */ 102 105 if (id >= NR_CPUS) 103 106 return -EINVAL; 104 107 108 + /* Set up this CPU's id, and pointer back to the lguest struct. */ 105 109 cpu->id = id; 106 110 cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); 107 111 cpu->lg->nr_cpus++; 112 + 113 + /* Each CPU has a timer it can set. */ 108 114 init_clockdev(cpu); 109 115 110 116 /* We need a complete page for the Guest registers: they are accessible ··· 126 120 * address. */ 127 121 lguest_arch_setup_regs(cpu, start_ip); 128 122 129 - /* Initialize the queue for the waker to wait on */ 123 + /* Initialize the queue for the Waker to wait on */ 130 124 init_waitqueue_head(&cpu->break_wq); 131 125 132 126 /* We keep a pointer to the Launcher task (ie. current task) for when 133 - * other Guests want to wake this one (inter-Guest I/O). */ 127 + * other Guests want to wake this one (eg. console input). */ 134 128 cpu->tsk = current; 135 129 136 130 /* We need to keep a pointer to the Launcher's memory map, because if ··· 142 136 * when the same Guest runs on the same CPU twice. */ 143 137 cpu->last_pages = NULL; 144 138 139 + /* No error == success. 
*/ 145 140 return 0; 146 141 } 147 142 ··· 192 185 lg->mem_base = (void __user *)(long)args[0]; 193 186 lg->pfn_limit = args[1]; 194 187 195 - /* This is the first cpu */ 188 + /* This is the first cpu (cpu 0) and it will start booting at args[3] */ 196 189 err = lg_cpu_start(&lg->cpus[0], 0, args[3]); 197 190 if (err) 198 191 goto release_guest; 199 192 200 193 /* Initialize the Guest's shadow page tables, using the toplevel 201 - * address the Launcher gave us. This allocates memory, so can 202 - * fail. */ 194 + * address the Launcher gave us. This allocates memory, so can fail. */ 203 195 err = init_guest_pagetable(lg, args[2]); 204 196 if (err) 205 197 goto free_regs; ··· 224 218 /*L:010 The first operation the Launcher does must be a write. All writes 225 219 * start with an unsigned long number: for the first write this must be 226 220 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 227 - * writes of other values to send interrupts. */ 221 + * writes of other values to send interrupts. 222 + * 223 + * Note that we overload the "offset" in the /dev/lguest file to indicate what 224 + * CPU number we're dealing with. Currently this is always 0, since we only 225 + * support uniprocessor Guests, but you can see the beginnings of SMP support 226 + * here. */ 228 227 static ssize_t write(struct file *file, const char __user *in, 229 228 size_t size, loff_t *off) 230 229 { 231 - /* Once the guest is initialized, we hold the "struct lguest" in the 230 + /* Once the Guest is initialized, we hold the "struct lguest" in the 232 231 * file private data. */ 233 232 struct lguest *lg = file->private_data; 234 233 const unsigned long __user *input = (const unsigned long __user *)in; ··· 241 230 struct lg_cpu *uninitialized_var(cpu); 242 231 unsigned int cpu_id = *off; 243 232 233 + /* The first value tells us what this request is. */ 244 234 if (get_user(req, input) != 0) 245 235 return -EFAULT; 246 236 input++;

+18 -14

drivers/lguest/page_tables.c

··· 2 2 * previous encounters. It's functional, and as neat as it can be in the 3 3 * circumstances, but be wary, for these things are subtle and break easily. 4 4 * The Guest provides a virtual to physical mapping, but we can neither trust 5 - * it nor use it: we verify and convert it here to point the hardware to the 6 - * actual Guest pages when running the Guest. :*/ 5 + * it nor use it: we verify and convert it here then point the CPU to the 6 + * converted Guest pages when running the Guest. :*/ 7 7 8 8 /* Copyright (C) Rusty Russell IBM Corporation 2006. 9 9 * GPL v2 and any later version */ ··· 106 106 BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); 107 107 return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); 108 108 } 109 + /*:*/ 110 + 111 + /*M:014 get_pfn is slow; it takes the mmap sem and calls get_user_pages. We 112 + * could probably try to grab batches of pages here as an optimization 113 + * (ie. pre-faulting). :*/ 109 114 110 115 /*H:350 This routine takes a page number given by the Guest and converts it to 111 116 * an actual, physical page number. It can fail for several reasons: the ··· 118 113 * and the page is read-only, or the write flag was set and the page was 119 114 * shared so had to be copied, but we ran out of memory. 120 115 * 121 - * This holds a reference to the page, so release_pte() is careful to 122 - * put that back. */ 116 + * This holds a reference to the page, so release_pte() is careful to put that 117 + * back. */ 123 118 static unsigned long get_pfn(unsigned long virtpfn, int write) 124 119 { 125 120 struct page *page; ··· 537 532 * all processes. So when the page table above that address changes, we update 538 533 * all the page tables, not just the current one. This is rare. 539 534 * 540 - * The benefit is that when we have to track a new page table, we can copy keep 541 - * all the kernel mappings. This speeds up context switch immensely. 
*/ 535 + * The benefit is that when we have to track a new page table, we can keep all 536 + * the kernel mappings. This speeds up context switch immensely. */ 542 537 void guest_set_pte(struct lg_cpu *cpu, 543 538 unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 544 539 { 545 - /* Kernel mappings must be changed on all top levels. Slow, but 546 - * doesn't happen often. */ 540 + /* Kernel mappings must be changed on all top levels. Slow, but doesn't 541 + * happen often. */ 547 542 if (vaddr >= cpu->lg->kernel_address) { 548 543 unsigned int i; 549 544 for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) ··· 709 704 /* We've made it through the page table code. Perhaps our tired brains are 710 705 * still processing the details, or perhaps we're simply glad it's over. 711 706 * 712 - * If nothing else, note that all this complexity in juggling shadow page 713 - * tables in sync with the Guest's page tables is for one reason: for most 714 - * Guests this page table dance determines how bad performance will be. This 715 - * is why Xen uses exotic direct Guest pagetable manipulation, and why both 716 - * Intel and AMD have implemented shadow page table support directly into 717 - * hardware. 707 + * If nothing else, note that all this complexity in juggling shadow page tables 708 + * in sync with the Guest's page tables is for one reason: for most Guests this 709 + * page table dance determines how bad performance will be. This is why Xen 710 + * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD 711 + * have implemented shadow page table support directly into hardware. 718 712 * 719 713 * There is just one file remaining in the Host. */ 720 714

+21 -12

drivers/lguest/x86/core.c

··· 17 17 * along with this program; if not, write to the Free Software 18 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 19 */ 20 + /*P:450 This file contains the x86-specific lguest code. It used to be all 21 + * mixed in with drivers/lguest/core.c but several foolhardy code slashers 22 + * wrestled most of the dependencies out to here in preparation for porting 23 + * lguest to other architectures (see what I mean by foolhardy?). 24 + * 25 + * This also contains a couple of non-obvious setup and teardown pieces which 26 + * were implemented after days of debugging pain. :*/ 20 27 #include <linux/kernel.h> 21 28 #include <linux/start_kernel.h> 22 29 #include <linux/string.h> ··· 164 157 * also simplify copy_in_guest_info(). Note that we'd still need to restore 165 158 * things when we exit to Launcher userspace, but that's fairly easy. 166 159 * 160 + * We could also try using these hooks for PGE, but that might be too expensive. 161 + * 167 162 * The hooks were designed for KVM, but we can also put them to good use. :*/ 168 163 169 164 /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts ··· 191 182 * was doing. */ 192 183 run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); 193 184 194 - /* Note that the "regs" pointer contains two extra entries which are 185 + /* Note that the "regs" structure contains two extra entries which are 195 186 * not really registers: a trap number which says what interrupt or 196 187 * trap made the switcher code come back, and an error code which some 197 188 * traps set. */ ··· 302 293 break; 303 294 case 14: /* We've intercepted a Page Fault. */ 304 295 /* The Guest accessed a virtual address that wasn't mapped. 305 - * This happens a lot: we don't actually set up most of the 306 - * page tables for the Guest at all when we start: as it runs 307 - * it asks for more and more, and we set them up as 308 - * required. 
In this case, we don't even tell the Guest that 309 - * the fault happened. 296 + * This happens a lot: we don't actually set up most of the page 297 + * tables for the Guest at all when we start: as it runs it asks 298 + * for more and more, and we set them up as required. In this 299 + * case, we don't even tell the Guest that the fault happened. 310 300 * 311 301 * The errcode tells whether this was a read or a write, and 312 302 * whether kernel or userspace code. */ ··· 350 342 if (!deliver_trap(cpu, cpu->regs->trapnum)) 351 343 /* If the Guest doesn't have a handler (either it hasn't 352 344 * registered any yet, or it's one of the faults we don't let 353 - * it handle), it dies with a cryptic error message. */ 345 + * it handle), it dies with this cryptic error message. */ 354 346 kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", 355 347 cpu->regs->trapnum, cpu->regs->eip, 356 348 cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault ··· 383 375 * The only exception is the interrupt handlers in switcher.S: their 384 376 * addresses are placed in a table (default_idt_entries), so we need to 385 377 * update the table with the new addresses. switcher_offset() is a 386 - * convenience function which returns the distance between the builtin 387 - * switcher code and the high-mapped copy we just made. */ 378 + * convenience function which returns the distance between the 379 + * compiled-in switcher code and the high-mapped copy we just made. */ 388 380 for (i = 0; i < IDT_ENTRIES; i++) 389 381 default_idt_entries[i] += switcher_offset(); 390 382 ··· 424 416 state->guest_gdt_desc.address = (long)&state->guest_gdt; 425 417 426 418 /* We know where we want the stack to be when the Guest enters 427 - * the switcher: in pages->regs. The stack grows upwards, so 419 + * the Switcher: in pages->regs. The stack grows upwards, so 428 420 * we start it at the end of that structure. 
*/ 429 421 state->guest_tss.sp0 = (long)(&pages->regs + 1); 430 422 /* And this is the GDT entry to use for the stack: we keep a ··· 521 513 { 522 514 u32 tsc_speed; 523 515 524 - /* The pointer to the Guest's "struct lguest_data" is the only 525 - * argument. We check that address now. */ 516 + /* The pointer to the Guest's "struct lguest_data" is the only argument. 517 + * We check that address now. */ 526 518 if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, 527 519 sizeof(*cpu->lg->lguest_data))) 528 520 return -EFAULT; ··· 554 546 555 547 return 0; 556 548 } 549 + /*:*/ 557 550 558 551 /*L:030 lguest_arch_setup_regs() 559 552 *

+4 -4

drivers/lguest/x86/switcher_32.S

··· 1 - /*P:900 This is the Switcher: code which sits at 0xFFC00000 to do the low-level 2 - * Guest<->Host switch. It is as simple as it can be made, but it's naturally 3 - * very specific to x86. 1 + /*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the 2 + * Host and Guest to do the low-level Guest<->Host switch. It is as simple as 3 + * it can be made, but it's naturally very specific to x86. 4 4 * 5 5 * You have now completed Preparation. If this has whet your appetite; if you 6 6 * are feeling invigorated and refreshed then the next, more challenging stage ··· 189 189 // Interrupts are turned back on: we are Guest. 190 190 iret 191 191 192 - // We treat two paths to switch back to the Host 192 + // We tread two paths to switch back to the Host 193 193 // Yet both must save Guest state and restore Host 194 194 // So we put the routine in a macro. 195 195 #define SWITCH_TO_HOST \

+1 -1

include/asm-x86/lguest_hcall.h

··· 27 27 #ifndef __ASSEMBLY__ 28 28 #include <asm/hw_irq.h> 29 29 30 - /*G:031 First, how does our Guest contact the Host to ask for privileged 30 + /*G:031 But first, how does our Guest contact the Host to ask for privileged 31 31 * operations? There are two ways: the direct way is to make a "hypercall", 32 32 * to make requests of the Host Itself. 33 33 *

+5 -1

include/linux/lguest_launcher.h

··· 16 16 * a new device, we simply need to write a new virtio driver and create support 17 17 * for it in the Launcher: this code won't need to change. 18 18 * 19 + * Virtio devices are also used by kvm, so we can simply reuse their optimized 20 + * device drivers. And one day when everyone uses virtio, my plan will be 21 + * complete. Bwahahahah! 22 + * 19 23 * Devices are described by a simplified ID, a status byte, and some "config" 20 24 * bytes which describe this device's configuration. This is placed by the 21 25 * Launcher just above the top of physical memory: ··· 30 26 /* The number of virtqueues (first in config array) */ 31 27 __u8 num_vq; 32 28 /* The number of bytes of feature bits. Multiply by 2: one for host 33 - * features and one for guest acknowledgements. */ 29 + * features and one for Guest acknowledgements. */ 34 30 __u8 feature_len; 35 31 /* The number of bytes of the config array after virtqueues. */ 36 32 __u8 config_len;