···360360}361361362362/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels363363- * come wrapped up in the self-decompressing "bzImage" format. With some funky364364- * coding, we can load those, too. */363363+ * come wrapped up in the self-decompressing "bzImage" format. With a little364364+ * work, we can load those, too. */365365static unsigned long load_kernel(int fd)366366{367367 Elf32_Ehdr hdr;···464464 * to know where it is. */465465 return to_guest_phys(pgdir);466466}467467+/*:*/467468468469/* Simple routine to roll all the commandline arguments together with spaces469470 * between them. */···481480 dst[len] = '\0';482481}483482484484-/* This is where we actually tell the kernel to initialize the Guest. We saw485485- * the arguments it expects when we looked at initialize() in lguest_user.c:486486- * the base of guest "physical" memory, the top physical page to allow, the483483+/*L:185 This is where we actually tell the kernel to initialize the Guest. We484484+ * saw the arguments it expects when we looked at initialize() in lguest_user.c:485485+ * the base of Guest "physical" memory, the top physical page to allow, the487486 * top level pagetable and the entry point for the Guest. */488487static int tell_kernel(unsigned long pgdir, unsigned long start)489488{···513512/*L:200514513 * The Waker.515514 *516516- * With a console and network devices, we can have lots of input which we need517517- * to process. We could try to tell the kernel what file descriptors to watch,518518- * but handing a file descriptor mask through to the kernel is fairly icky.515515+ * With console, block and network devices, we can have lots of input which we516516+ * need to process. 
We could try to tell the kernel what file descriptors to517517+ * watch, but handing a file descriptor mask through to the kernel is fairly518518+ * icky.519519 *520520 * Instead, we fork off a process which watches the file descriptors and writes521521- * the LHREQ_BREAK command to the /dev/lguest filedescriptor to tell the Host522522- * loop to stop running the Guest. This causes it to return from the521521+ * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host522522+ * stop running the Guest. This causes the Launcher to return from the523523 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset524524 * the LHREQ_BREAK and wake us up again.525525 *···546544 if (read(pipefd, &fd, sizeof(fd)) == 0)547545 exit(0);548546 /* Otherwise it's telling us to change what file549549- * descriptors we're to listen to. */547547+ * descriptors we're to listen to. Positive means548548+ * listen to a new one, negative means stop549549+ * listening. */550550 if (fd >= 0)551551 FD_SET(fd, &devices.infds);552552 else···563559{564560 int pipefd[2], child;565561566566- /* We create a pipe to talk to the waker, and also so it knows when the562562+ /* We create a pipe to talk to the Waker, and also so it knows when the567563 * Launcher dies (and closes pipe). */568564 pipe(pipefd);569565 child = fork();···571567 err(1, "forking");572568573569 if (child == 0) {574574- /* Close the "writing" end of our copy of the pipe */570570+ /* We are the Waker: close the "writing" end of our copy of the571571+ * pipe and start waiting for input. 
*/575572 close(pipefd[1]);576573 wake_parent(pipefd[0], lguest_fd);577574 }···583578 return pipefd[1];584579}585580586586-/*L:210581581+/*587582 * Device Handling.588583 *589589- * When the Guest sends DMA to us, it sends us an array of addresses and sizes.584584+ * When the Guest gives us a buffer, it sends an array of addresses and sizes.590585 * We need to make sure it's not trying to reach into the Launcher itself, so591591- * we have a convenient routine which check it and exits with an error message586586+ * we have a convenient routine which checks it and exits with an error message592587 * if something funny is going on:593588 */594589static void *_check_pointer(unsigned long addr, unsigned int size,···605600/* A macro which transparently hands the line number to the real function. */606601#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)607602608608-/* This function returns the next descriptor in the chain, or vq->vring.num. */603603+/* Each buffer in the virtqueues is actually a chain of descriptors. This604604+ * function returns the next descriptor in the chain, or vq->vring.num if we're605605+ * at the end. */609606static unsigned next_desc(struct virtqueue *vq, unsigned int i)610607{611608 unsigned int next;···686679 return head;687680}688681689689-/* Once we've used one of their buffers, we tell them about it. We'll then682682+/* After we've used one of their buffers, we tell them about it. We'll then690683 * want to send them an interrupt, using trigger_irq(). */691684static void add_used(struct virtqueue *vq, unsigned int head, int len)692685{693686 struct vring_used_elem *used;694687695695- /* Get a pointer to the next entry in the used ring. */688688+ /* The virtqueue contains a ring of used buffers. Get a pointer to the689689+ * next entry in that used ring. 
*/696690 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];697691 used->id = head;698692 used->len = len;···707699{708700 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };709701702702+ /* If they don't want an interrupt, don't send one. */710703 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)711704 return;712705···724715 trigger_irq(fd, vq);725716}726717727727-/* Here is the input terminal setting we save, and the routine to restore them728728- * on exit so the user can see what they type next. */718718+/*719719+ * The Console720720+ *721721+ * Here is the input terminal setting we save, and the routine to restore them722722+ * on exit so the user gets their terminal back. */729723static struct termios orig_term;730724static void restore_term(void)731725{···829817 }830818}831819832832-/* Handling output for network is also simple: we get all the output buffers820820+/*821821+ * The Network822822+ *823823+ * Handling output for network is also simple: we get all the output buffers833824 * and write them (ignoring the first element) to this device's file descriptor834825 * (stdout). */835826static void handle_net_output(int fd, struct virtqueue *vq)···845830 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {846831 if (in)847832 errx(1, "Input buffers in output queue?");848848- /* Check header, but otherwise ignore it (we said we supported849849- * no features). */833833+ /* Check header, but otherwise ignore it (we told the Guest we834834+ * supported no features, so it shouldn't have anything835835+ * interesting). 
*/850836 (void)convert(&iov[0], struct virtio_net_hdr);851837 len = writev(vq->dev->fd, iov+1, out-1);852838 add_used_and_trigger(fd, vq, head, len);···898882 return true;899883}900884901901-/* This callback ensures we try again, in case we stopped console or net885885+/*L:215 This is the callback attached to the network and console input886886+ * virtqueues: it ensures we try again, in case we stopped console or net902887 * delivery because Guest didn't have any buffers. */903888static void enable_fd(int fd, struct virtqueue *vq)904889{···935918 strnlen(from_guest_phys(addr), guest_limit - addr));936919}937920938938-/* This is called when the waker wakes us up: check for incoming file921921+/* This is called when the Waker wakes us up: check for incoming file939922 * descriptors. */940923static void handle_input(int fd)941924{···1002985}10039861004987/* Each device descriptor is followed by some configuration information.10051005- * The first byte is a "status" byte for the Guest to report what's happening.10061006- * After that are fields: u8 type, u8 len, [... len bytes...].988988+ * Each configuration field looks like: u8 type, u8 len, [... len bytes...].1007989 *1008990 * This routine adds a new field to an existing device's descriptor. It only1009991 * works for the last device, but that's OK because that's how we use it. */···10591043 /* Link virtqueue back to device. */10601044 vq->dev = dev;1061104510621062- /* Set up handler. */10461046+ /* Set the routine to call when the Guest does something to this10471047+ * virtqueue. */10631048 vq->handle_output = handle_output;10491049+10501050+ /* Set the "Don't Notify Me" flag if we don't have a handler */10641051 if (!handle_output)10651052 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;10661053}1067105410681055/* This routine does all the creation and setup of a new device, including10691069- * caling new_dev_desc() to allocate the descriptor and device memory. 
*/10561056+ * calling new_dev_desc() to allocate the descriptor and device memory. */10701057static struct device *new_device(const char *name, u16 type, int fd,10711058 bool (*handle_input)(int, struct device *))10721059{···10781059 /* Append to device list. Prepending to a single-linked list is10791060 * easier, but the user expects the devices to be arranged on the bus10801061 * in command-line order. The first network device on the command line10811081- * is eth0, the first block device /dev/lgba, etc. */10621062+ * is eth0, the first block device /dev/vda, etc. */10821063 *devices.lastdev = dev;10831064 dev->next = NULL;10841065 devices.lastdev = &dev->next;···11221103 /* The console needs two virtqueues: the input then the output. When11231104 * they put something the input queue, we make sure we're listening to11241105 * stdin. When they put something in the output queue, we write it to11251125- * stdout. */11061106+ * stdout. */11261107 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);11271108 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);11281109···12701251 verbose("attached to bridge: %s\n", br_name);12711252}1272125312731273-12741274-/*12751275- * Block device.12541254+/* Our block (disk) device should be really simple: the Guest asks for a block12551255+ * number and we read or write that position in the file. Unfortunately, that12561256+ * was amazingly slow: the Guest waits until the read is finished before12571257+ * running anything else, even if it could have been doing useful work.12761258 *12771277- * Serving a block device is really easy: the Guest asks for a block number and12781278- * we read or write that position in the file.12791279- *12801280- * Unfortunately, this is amazingly slow: the Guest waits until the read is12811281- * finished before running anything else, even if it could be doing useful12821282- * work. 
We could use async I/O, except it's reputed to suck so hard that12831283- * characters actually go missing from your code when you try to use it.12591259+ * We could use async I/O, except it's reputed to suck so hard that characters12601260+ * actually go missing from your code when you try to use it.12841261 *12851262 * So we farm the I/O out to thread, and communicate with it via a pipe. */1286126312871287-/* This hangs off device->priv, with the data. */12641264+/* This hangs off device->priv. */12881265struct vblk_info12891266{12901267 /* The size of the file. */···12961281 * Launcher triggers interrupt to Guest. */12971282 int done_fd;12981283};12841284+/*:*/1299128513001300-/* This is the core of the I/O thread. It returns true if it did something. */12861286+/*L:21012871287+ * The Disk12881288+ *12891289+ * Remember that the block device is handled by a separate I/O thread. We head12901290+ * straight into the core of that thread here:12911291+ */13011292static bool service_io(struct device *dev)13021293{13031294 struct vblk_info *vblk = dev->priv;···13141293 struct iovec iov[dev->vq->vring.num];13151294 off64_t off;1316129512961296+ /* See if there's a request waiting. If not, nothing to do. */13171297 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);13181298 if (head == dev->vq->vring.num)13191299 return false;1320130013011301+ /* Every block request should contain at least one output buffer13021302+ * (detailing the location on disk and the type of request) and one13031303+ * input buffer (to hold the result). */13211304 if (out_num == 0 || in_num == 0)13221305 errx(1, "Bad virtblk cmd %u out=%u in=%u",13231306 head, out_num, in_num);···13301305 in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);13311306 off = out->sector * 512;1332130713331333- /* This is how we implement barriers. Pretty poor, no? 
*/13081308+ /* The block device implements "barriers", where the Guest indicates13091309+ * that it wants all previous writes to occur before this write. We13101310+ * don't have a way of asking our kernel to do a barrier, so we just13111311+ * synchronize all the data in the file. Pretty poor, no? */13341312 if (out->type & VIRTIO_BLK_T_BARRIER)13351313 fdatasync(vblk->fd);1336131413151315+ /* In general the virtio block driver is allowed to try SCSI commands.13161316+ * It'd be nice if we supported eject, for example, but we don't. */13371317 if (out->type & VIRTIO_BLK_T_SCSI_CMD) {13381318 fprintf(stderr, "Scsi commands unsupported\n");13391319 in->status = VIRTIO_BLK_S_UNSUPP;···1404137414051375 /* When this read fails, it means Launcher died, so we follow. */14061376 while (read(vblk->workpipe[0], &c, 1) == 1) {14071407- /* We acknowledge each request immediately, to reduce latency,13771377+ /* We acknowledge each request immediately to reduce latency,14081378 * rather than waiting until we've done them all. I haven't14091379 * measured to see if it makes any difference. */14101380 while (service_io(dev))···14131383 return 0;14141384}1415138514161416-/* When the thread says some I/O is done, we interrupt the Guest. */13861386+/* Now we've seen the I/O thread, we return to the Launcher to see what happens13871387+ * when the thread tells us it's completed some I/O. */14171388static bool handle_io_finish(int fd, struct device *dev)14181389{14191390 char c;1420139114211421- /* If child died, presumably it printed message. */13921392+ /* If the I/O thread died, presumably it printed the error, so we13931393+ * simply exit. */14221394 if (read(dev->fd, &c, 1) != 1)14231395 exit(1);14241396···14291397 return true;14301398}1431139914321432-/* When the Guest submits some I/O, we wake the I/O thread. */14001400+/* When the Guest submits some I/O, we just need to wake the I/O thread. 
*/14331401static void handle_virtblk_output(int fd, struct virtqueue *vq)14341402{14351403 struct vblk_info *vblk = vq->dev->priv;···14411409 exit(1);14421410}1443141114441444-/* This creates a virtual block device. */14121412+/*L:198 This actually sets up a virtual block device. */14451413static void setup_block_file(const char *filename)14461414{14471415 int p[2];···14571425 /* The device responds to return from I/O thread. */14581426 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);1459142714601460- /* The device has a virtqueue. */14281428+ /* The device has one virtqueue, where the Guest places requests. */14611429 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);1462143014631431 /* Allocate the room for our own bookkeeping */···14791447 /* The I/O thread writes to this end of the pipe when done. */14801448 vblk->done_fd = p[1];1481144914821482- /* This is how we tell the I/O thread about more work. */14501450+ /* This is the second pipe, which is how we tell the I/O thread about14511451+ * more work. */14831452 pipe(vblk->workpipe);1484145314851454 /* Create stack for thread and run it */···15191486 char reason[1024] = { 0 };15201487 read(lguest_fd, reason, sizeof(reason)-1);15211488 errx(1, "%s", reason);15221522- /* EAGAIN means the waker wanted us to look at some input.14891489+ /* EAGAIN means the Waker wanted us to look at some input.15231490 * Anything else means a bug or incompatible change. */15241491 } else if (errno != EAGAIN)15251492 err(1, "Running guest failed");1526149315271527- /* Service input, then unset the BREAK which releases15281528- * the Waker. */14941494+ /* Service input, then unset the BREAK to release the Waker. */15291495 handle_input(lguest_fd);15301496 if (write(lguest_fd, args, sizeof(args)) < 0)15311497 err(1, "Resetting break");15321498 }15331499}15341500/*15351535- * This is the end of the Launcher.15011501+ * This is the end of the Launcher. The good news: we are over halfway15021502+ * through! 
The bad news: the most fiendish part of the code still lies ahead15031503+ * of us.15361504 *15371537- * But wait! We've seen I/O from the Launcher, and we've seen I/O from the15381538- * Drivers. If we were to see the Host kernel I/O code, our understanding15391539- * would be complete... :*/15051505+ * Are you ready? Take a deep breath and join me in the core of the Host, in15061506+ * "make Host".15071507+ :*/1540150815411509static struct option opts[] = {15421510 { "verbose", 0, NULL, 'v' },···15601526 /* Memory, top-level pagetable, code startpoint and size of the15611527 * (optional) initrd. */15621528 unsigned long mem = 0, pgdir, start, initrd_size = 0;15631563- /* A temporary and the /dev/lguest file descriptor. */15291529+ /* Two temporaries and the /dev/lguest file descriptor. */15641530 int i, c, lguest_fd;15651531 /* The boot information for the Guest. */15661532 struct boot_params *boot;···16551621 /* The boot header contains a command line pointer: we put the command16561622 * line after the boot header. */16571623 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);16241624+ /* We use a simple helper to copy the arguments separated by spaces. */16581625 concat((char *)(boot + 1), argv+optind+2);1659162616601627 /* Boot protocol version: 2.07 supports the fields for lguest. */
+25-23
arch/x86/lguest/boot.c
···9999 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do100100 * them as a batch when lazy_mode is eventually turned off. Because hypercalls101101 * are reasonably expensive, batching them up makes sense. For example, a102102- * large mmap might update dozens of page table entries: that code calls102102+ * large munmap might update dozens of page table entries: that code calls103103 * paravirt_enter_lazy_mmu(), does the dozen updates, then calls104104 * lguest_leave_lazy_mode().105105 *···164164/*:*/165165166166/*G:033167167- * Here are our first native-instruction replacements: four functions for168168- * interrupt control.167167+ * After that diversion we return to our first native-instruction168168+ * replacements: four functions for interrupt control.169169 *170170 * The simplest way of implementing these would be to have "turn interrupts171171 * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow:···184184 return lguest_data.irq_enabled;185185}186186187187-/* "restore_flags" just sets the flags back to the value given. */187187+/* restore_flags() just sets the flags back to the value given. */188188static void restore_fl(unsigned long flags)189189{190190 lguest_data.irq_enabled = flags;···357357 * it. The Host needs to know when the Guest wants to change them, so we have358358 * a whole series of functions like read_cr0() and write_cr0().359359 *360360- * We start with CR0. CR0 allows you to turn on and off all kinds of basic360360+ * We start with cr0. cr0 allows you to turn on and off all kinds of basic361361 * features, but Linux only really cares about one: the horrifically-named Task362362 * Switched (TS) bit at bit 3 (ie. 8)363363 *···390390 current_cr0 &= ~X86_CR0_TS;391391}392392393393-/* CR2 is the virtual address of the last page fault, which the Guest only ever393393+/* cr2 is the virtual address of the last page fault, which the Guest only ever394394 * reads. 
The Host kindly writes this into our "struct lguest_data", so we395395 * just read it out of there. */396396static unsigned long lguest_read_cr2(void)···398398 return lguest_data.cr2;399399}400400401401-/* CR3 is the current toplevel pagetable page: the principle is the same as401401+/* cr3 is the current toplevel pagetable page: the principle is the same as402402 * cr0. Keep a local copy, and tell the Host when it changes. */403403static void lguest_write_cr3(unsigned long cr3)404404{···411411 return current_cr3;412412}413413414414-/* CR4 is used to enable and disable PGE, but we don't care. */414414+/* cr4 is used to enable and disable PGE, but we don't care. */415415static unsigned long lguest_read_cr4(void)416416{417417 return 0;···432432 * maps virtual addresses to physical addresses using "page tables". We could433433 * use one huge index of 1 million entries: each address is 4 bytes, so that's434434 * 1024 pages just to hold the page tables. But since most virtual addresses435435- * are unused, we use a two level index which saves space. The CR3 register435435+ * are unused, we use a two level index which saves space. The cr3 register436436 * contains the physical address of the top level "page directory" page, which437437 * contains physical addresses of up to 1024 second-level pages. Each of these438438 * second level pages contains up to 1024 physical addresses of actual pages,···440440 *441441 * Here's a diagram, where arrows indicate physical addresses:442442 *443443- * CR3 ---> +---------+443443+ * cr3 ---> +---------+444444 * | --------->+---------+445445 * | | | PADDR1 |446446 * Top-level | | PADDR2 |···498498 *499499 * ... except in early boot when the kernel sets up the initial pagetables,500500 * which makes booting astonishingly slow. So we don't even tell the Host501501- * anything changed until we've done the first page table switch.502502- */501501+ * anything changed until we've done the first page table switch. 
*/503502static void lguest_set_pte(pte_t *ptep, pte_t pteval)504503{505504 *ptep = pteval;···719720 /* Set up the timer interrupt (0) to go to our simple timer routine */720721 set_irq_handler(0, lguest_time_irq);721722722722- /* Our clock structure look like arch/i386/kernel/tsc.c if we can use723723- * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either724724- * way, the "rating" is initialized so high that it's always chosen725725- * over any other clocksource. */723723+ /* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can724724+ * use the TSC, otherwise it's a dumb nanosecond-resolution clock.725725+ * Either way, the "rating" is set so high that it's always chosen over726726+ * any other clocksource. */726727 if (lguest_data.tsc_khz)727728 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,728729 lguest_clock.shift);···748749 * to work. They're pretty simple.749750 */750751751751-/* The Guest needs to tell the host what stack it expects traps to use. For752752+/* The Guest needs to tell the Host what stack it expects traps to use. For752753 * native hardware, this is part of the Task State Segment mentioned above in753754 * lguest_load_tr_desc(), but to help hypervisors there's this special call.754755 *···849850 return "LGUEST";850851}851852852852-/* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to853853- * produce console output. */853853+/* We will eventually use the virtio console device to produce console output,854854+ * but before that is set up we use LHCALL_NOTIFY on normal memory to produce855855+ * console output. */854856static __init int early_put_chars(u32 vtermno, const char *buf, int count)855857{856858 char scratch[17];857859 unsigned int len = count;858860861861+ /* We use a nul-terminated string, so we have to make a copy. Icky,862862+ * huh? 
*/859863 if (len > sizeof(scratch) - 1)860864 len = sizeof(scratch) - 1;861865 scratch[len] = '\0';···885883 * Our current solution is to allow the paravirt back end to optionally patch886884 * over the indirect calls to replace them with something more efficient. We887885 * patch the four most commonly called functions: disable interrupts, enable888888- * interrupts, restore interrupts and save interrupts. We usually have 10886886+ * interrupts, restore interrupts and save interrupts. We usually have 6 or 10889887 * bytes to patch into: the Guest versions of these operations are small enough890888 * that we can fit comfortably.891889 *···10171015 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");1018101610191017 /* The Host uses the top of the Guest's virtual address space for the10201020- * Host<->Guest Switcher, and it tells us how much it needs in10181018+ * Host<->Guest Switcher, and it tells us how big that is in10211019 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */10221020 reserve_top_address(lguest_data.reserve_mem);10231021···10671065/*10681066 * This marks the end of stage II of our journey, The Guest.10691067 *10701070- * It is now time for us to explore the nooks and crannies of the three Guest10711071- * devices and complete our understanding of the Guest in "make Drivers".10681068+ * It is now time for us to explore the layer of virtual drivers and complete10691069+ * our understanding of the Guest in "make Drivers".10721070 */
+5-3
arch/x86/lguest/i386_head.S
···66#include <asm/processor-flags.h>7788/*G:020 This is where we begin: head.S notes that the boot header's platform99- * type field is "1" (lguest), so calls us here. The boot header is in %esi.99+ * type field is "1" (lguest), so calls us here.1010 *1111 * WARNING: be very careful here! We're running at addresses equal to physical1212 * addresses (around 0), not above PAGE_OFFSET as most code expects···1717 * boot. */1818.section .init.text, "ax", @progbits1919ENTRY(lguest_entry)2020- /* Make initial hypercall now, so we can set up the pagetables. */2020+ /* We make the "initialization" hypercall now to tell the Host about2121+ * us, and also find out where it put our page tables. */2122 movl $LHCALL_LGUEST_INIT, %eax2223 movl $lguest_data - __PAGE_OFFSET, %edx2324 int $LGUEST_TRAP_ENTRY24252526 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl2626- * instruction uses %esi implicitly. */2727+ * instruction uses %esi implicitly as the source for the copy we're2828+ * about to do. */2729 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi28302931 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
+4-1
drivers/lguest/core.c
···128128 __free_pages(switcher_page[i], 0);129129}130130131131-/*L:305131131+/*H:032132132 * Dealing With Guest Memory.133133+ *134134+ * Before we go too much further into the Host, we need to grok the routines135135+ * we use to deal with Guest memory.133136 *134137 * When the Guest gives us (what it thinks is) a physical address, we can use135138 * the normal copy_from_user() & copy_to_user() on the corresponding place in
+6-5
drivers/lguest/hypercalls.c
···9090 lg->pending_notify = args->arg1;9191 break;9292 default:9393+ /* It should be an architecture-specific hypercall. */9394 if (lguest_arch_do_hcall(lg, args))9495 kill_guest(lg, "Bad hypercall %li\n", args->arg0);9596 }···158157 * Guest makes a hypercall, we end up here to set things up: */159158static void initialize(struct lguest *lg)160159{161161-162160 /* You can't do anything until you're initialized. The Guest knows the163161 * rules, so we're unforgiving here. */164162 if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) {···174174 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))175175 kill_guest(lg, "bad guest page %p", lg->lguest_data);176176177177- /* We write the current time into the Guest's data page once now. */177177+ /* We write the current time into the Guest's data page once so it can178178+ * set its clock. */178179 write_timestamp(lg);179180180181 /* page_tables.c will also do some setup. */···183182184183 /* This is the one case where the above accesses might have been the185184 * first write to a Guest page. This may have caused a copy-on-write186186- * fault, but the Guest might be referring to the old (read-only)187187- * page. */185185+ * fault, but the old page might be (read-only) in the Guest186186+ * pagetable. */188187 guest_pagetable_clear_all(lg);189188}190189···221220 * Normally it doesn't matter: the Guest will run again and222221 * update the trap number before we come back here.223222 *224224- * However, if we are signalled or the Guest sends DMA to the223223+ * However, if we are signalled or the Guest sends I/O to the225224 * Launcher, the run_guest() loop will exit without running the226225 * Guest. When it comes back it would try to re-run the227226 * hypercall. */
+29-8
drivers/lguest/interrupts_and_traps.c
···92929393 /* Remember that we never let the Guest actually disable interrupts, so9494 * the "Interrupt Flag" bit is always set. We copy that bit from the9595- * Guest's "irq_enabled" field into the eflags word: the Guest copies9696- * it back in "lguest_iret". */9595+ * Guest's "irq_enabled" field into the eflags word: we saw the Guest9696+ * copy it back in "lguest_iret". */9797 eflags = lg->regs->eflags;9898 if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 09999 && !(irq_enable & X86_EFLAGS_IF))···124124 kill_guest(lg, "Disabling interrupts");125125}126126127127-/*H:200127127+/*H:205128128 * Virtual Interrupts.129129 *130130 * maybe_do_interrupt() gets called before every entry to the Guest, to see if···256256 * bogus one in): if we fail here, the Guest will be killed. */257257 if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b))258258 return 0;259259- set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, has_err(num));259259+ set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b,260260+ has_err(num));260261 return 1;261262}262263263264/*H:250 Here's the hard part: returning to the Host every time a trap happens264265 * and then calling deliver_trap() and re-entering the Guest is slow.265265- * Particularly because Guest userspace system calls are traps (trap 128).266266+ * Particularly because Guest userspace system calls are traps (usually trap267267+ * 128).266268 *267269 * So we'd like to set up the IDT to tell the CPU to deliver traps directly268270 * into the Guest. This is possible, but the complexities cause the size of269271 * this file to double! However, 150 lines of code is worth writing for taking270272 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all271271- * the other hypervisors would tease it.273273+ * the other hypervisors would beat it up at lunchtime.272274 *273275 * This routine indicates if a particular trap number could be delivered274276 * directly. 
*/···333331 * change stacks on each context switch. */334332void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)335333{336336- /* You are not allowd have a stack segment with privilege level 0: bad334334+ /* You are not allowed have a stack segment with privilege level 0: bad337335 * Guest! */338336 if ((seg & 0x3) != GUEST_PL)339337 kill_guest(lg, "bad stack segment %i", seg);···352350 * part of the Host: page table handling. */353351354352/*H:235 This is the routine which actually checks the Guest's IDT entry and355355- * transfers it into our entry in "struct lguest": */353353+ * transfers it into the entry in "struct lguest": */356354static void set_trap(struct lguest *lg, struct desc_struct *trap,357355 unsigned int num, u32 lo, u32 hi)358356{···458456 }459457}460458459459+/*H:200460460+ * The Guest Clock.461461+ *462462+ * There are two sources of virtual interrupts. We saw one in lguest_user.c:463463+ * the Launcher sending interrupts for virtual devices. The other is the Guest464464+ * timer interrupt.465465+ *466466+ * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to467467+ * the next timer interrupt (in nanoseconds). We use the high-resolution timer468468+ * infrastructure to set a callback at that time.469469+ *470470+ * 0 means "turn off the clock". */461471void guest_set_clockevent(struct lguest *lg, unsigned long delta)462472{463473 ktime_t expires;···480466 return;481467 }482468469469+ /* We use wallclock time here, so the Guest might not be running for470470+ * all the time between now and the timer interrupt it asked for. This471471+ * is almost always the right thing to do. */483472 expires = ktime_add_ns(ktime_get_real(), delta);484473 hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS);485474}486475476476+/* This is the function called when the Guest's timer expires. 
*/487477static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)488478{489479 struct lguest *lg = container_of(timer, struct lguest, hrt);490480481481+ /* Remember the first interrupt is the timer interrupt. */491482 set_bit(0, lg->irqs_pending);483483+ /* If the Guest is actually stopped, we need to wake it up. */492484 if (lg->halted)493485 wake_up_process(lg->tsk);494486 return HRTIMER_NORESTART;495487}496488489489+/* This sets up the timer for this Guest. */497490void init_clockdev(struct lguest *lg)498491{499492 hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+2-2
drivers/lguest/lg.h
···100100void __lgread(struct lguest *, void *, unsigned long, unsigned);101101void __lgwrite(struct lguest *, unsigned long, const void *, unsigned);102102103103-/*L:306 Using memory-copy operations like that is usually inconvient, so we103103+/*H:035 Using memory-copy operations like that is usually inconvient, so we104104 * have the following helper macros which read and write a specific type (often105105 * an unsigned long).106106 *···188188 * Let's step aside for the moment, to study one important routine that's used189189 * widely in the Host code.190190 *191191- * There are many cases where the Guest does something invalid, like pass crap191191+ * There are many cases where the Guest can do something invalid, like pass crap192192 * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite193193 * acceptable to simply terminate the Guest and give the Launcher a nicely194194 * formatted reason. It's also simpler for the Guest itself, which doesn't
+7-4
drivers/lguest/lguest_device.c
···5353 * Device configurations5454 *5555 * The configuration information for a device consists of a series of fields.5656- * The device will look for these fields during setup.5656+ * We don't really care what they are: the Launcher set them up, and the driver5757+ * will look at them during setup.5758 *5859 * For us these fields come immediately after that device's descriptor in the5960 * lguest_devices page.···123122 * The other piece of infrastructure virtio needs is a "virtqueue": a way of124123 * the Guest device registering buffers for the other side to read from or125124 * write into (ie. send and receive buffers). Each device can have multiple126126- * virtqueues: for example the console has one queue for sending and one for127127- * receiving.125125+ * virtqueues: for example the console driver uses one queue for sending and126126+ * another for receiving.128127 *129128 * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue130129 * already exists in virtio_ring.c. We just need to connect it up.···159158 *160159 * This is kind of an ugly duckling. It'd be nicer to have a standard161160 * representation of a virtqueue in the configuration space, but it seems that162162- * everyone wants to do it differently. The KVM guys want the Guest to161161+ * everyone wants to do it differently. The KVM coders want the Guest to163162 * allocate its own pages and tell the Host where they are, but for lguest it's164163 * simpler for the Host to simply tell us where the pages are.165164 *···285284{286285 struct lguest_device *ldev;287286287287+ /* Start with zeroed memory; Linux's device layer seems to count on288288+ * it. */288289 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);289290 if (!ldev) {290291 printk(KERN_EMERG "Cannot allocate lguest dev %u\n",
+12-11
drivers/lguest/lguest_user.c
···88#include <linux/fs.h>99#include "lg.h"10101111-/*L:315 To force the Guest to stop running and return to the Launcher, the1212- * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The1313- * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */1111+/*L:055 When something happens, the Waker process needs a way to stop the1212+ * kernel running the Guest and return to the Launcher. So the Waker writes1313+ * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher1414+ * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release1515+ * the Waker. */1416static int break_guest_out(struct lguest *lg, const unsigned long __user *input)1517{1618 unsigned long on;17191818- /* Fetch whether they're turning break on or off.. */2020+ /* Fetch whether they're turning break on or off. */1921 if (get_user(on, input) != 0)2022 return -EFAULT;21232224 if (on) {2325 lg->break_out = 1;2424- /* Pop it out (may be running on different CPU) */2626+ /* Pop it out of the Guest (may be running on different CPU) */2527 wake_up_process(lg->tsk);2628 /* Wait for them to reset it */2729 return wait_event_interruptible(lg->break_wq, !lg->break_out);···6058 if (!lg)6159 return -EINVAL;62606363- /* If you're not the task which owns the guest, go away. */6161+ /* If you're not the task which owns the Guest, go away. */6462 if (current != lg->tsk)6563 return -EPERM;6664···9492 * base: The start of the Guest-physical memory inside the Launcher memory.9593 *9694 * pfnlimit: The highest (Guest-physical) page number the Guest should be9797- * allowed to access. The Launcher has to live in Guest memory, so it sets9898- * this to ensure the Guest can't reach it.9595+ * allowed to access. 
The Guest memory lives inside the Launcher, so it sets9696+ * this to ensure the Guest can only reach its own memory.9997 *10098 * pgdir: The (Guest-physical) address of the top of the initial Guest10199 * pagetables (which are set up by the Launcher).···191189}192190193191/*L:010 The first operation the Launcher does must be a write. All writes194194- * start with a 32 bit number: for the first write this must be192192+ * start with an unsigned long number: for the first write this must be195193 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use196194 * writes of other values to send interrupts. */197195static ssize_t write(struct file *file, const char __user *in,···277275 * The Launcher is the Host userspace program which sets up, runs and services278276 * the Guest. In fact, many comments in the Drivers which refer to "the Host"279277 * doing things are inaccurate: the Launcher does all the device handling for280280- * the Guest. The Guest can't tell what's done by the the Launcher and what by281281- * the Host.278278+ * the Guest, but the Guest can't know that.282279 *283280 * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we284281 * shall see more of that later.
+72-41
drivers/lguest/page_tables.c
···2626 *2727 * We use two-level page tables for the Guest. If you're not entirely2828 * comfortable with virtual addresses, physical addresses and page tables then2929- * I recommend you review lguest.c's "Page Table Handling" (with diagrams!).2929+ * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with3030+ * diagrams!).3031 *3132 * The Guest keeps page tables, but we maintain the actual ones here: these are3233 * called "shadow" page tables. Which is a very Guest-centric name: these are···3736 *3837 * Anyway, this is the most complicated part of the Host code. There are seven3938 * parts to this:4040- * (i) Setting up a page table entry for the Guest when it faults,4141- * (ii) Setting up the page table entry for the Guest stack,4242- * (iii) Setting up a page table entry when the Guest tells us it has changed,3939+ * (i) Looking up a page table entry when the Guest faults,4040+ * (ii) Making sure the Guest stack is mapped,4141+ * (iii) Setting up a page table entry when the Guest tells us one has changed,4342 * (iv) Switching page tables,4444- * (v) Flushing (thowing away) page tables,4343+ * (v) Flushing (throwing away) page tables,4544 * (vi) Mapping the Switcher when the Guest is about to run,4645 * (vii) Setting up the page tables initially.4746 :*/···5857static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);5958#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)60596161-/*H:320 With our shadow and Guest types established, we need to deal with6262- * them: the page table code is curly enough to need helper functions to keep6363- * it clear and clean.6060+/*H:320 The page table code is curly enough to need helper functions to keep it6161+ * clear and clean.6462 *6563 * There are two functions which return pointers to the shadow (aka "real")6664 * page tables.6765 *6866 * spgd_addr() takes the virtual address and returns a pointer to the top-level6969- * page directory entry for that address. 
Since we keep track of several page7070- * tables, the "i" argument tells us which one we're interested in (it's6767+ * page directory entry (PGD) for that address. Since we keep track of several6868+ * page tables, the "i" argument tells us which one we're interested in (it's7169 * usually the current one). */7270static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)7371{···8181 return &lg->pgdirs[i].pgdir[index];8282}83838484-/* This routine then takes the PGD entry given above, which contains the8585- * address of the PTE page. It then returns a pointer to the PTE entry for the8686- * given address. */8484+/* This routine then takes the page directory entry returned above, which8585+ * contains the address of the page table entry (PTE) page. It then returns a8686+ * pointer to the PTE entry for the given address. */8787static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)8888{8989 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);···191191}192192193193/*H:330194194- * (i) Setting up a page table entry for the Guest when it faults194194+ * (i) Looking up a page table entry when the Guest faults.195195 *196196 * We saw this call in run_guest(): when we see a page fault in the Guest, we197197 * come here. That's because we only set up the shadow page tables lazily as···199199 * and return to the Guest without it knowing.200200 *201201 * If we fixed up the fault (ie. we mapped the address), this routine returns202202- * true. */202202+ * true. Otherwise, it was a real fault and we need to tell the Guest. */203203int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)204204{205205 pgd_t gpgd;···246246 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))247247 return 0;248248249249- /* User access to a kernel page? (bit 3 == user access) */249249+ /* User access to a kernel-only page? 
(bit 3 == user access) */250250 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))251251 return 0;252252253253 /* Check that the Guest PTE flags are OK, and the page number is below254254 * the pfn_limit (ie. not mapping the Launcher binary). */255255 check_gpte(lg, gpte);256256+256257 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */257258 gpte = pte_mkyoung(gpte);258258-259259 if (errcode & 2)260260 gpte = pte_mkdirty(gpte);261261···272272 else273273 /* If this is a read, don't set the "writable" bit in the page274274 * table entry, even if the Guest says it's writable. That way275275- * we come back here when a write does actually ocur, so we can276276- * update the Guest's _PAGE_DIRTY flag. */275275+ * we will come back here when a write does actually occur, so276276+ * we can update the Guest's _PAGE_DIRTY flag. */277277 *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);278278279279 /* Finally, we write the Guest PTE entry back: we've set the280280 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */281281 lgwrite(lg, gpte_ptr, pte_t, gpte);282282283283- /* We succeeded in mapping the page! */283283+ /* The fault is fixed, the page table is populated, the mapping284284+ * manipulated, the result returned and the code complete. A small285285+ * delay and a trace of alliteration are the only indications the Guest286286+ * has that a page fault occurred at all. */284287 return 1;285288}286289287287-/*H:360 (ii) Setting up the page table entry for the Guest stack.290290+/*H:360291291+ * (ii) Making sure the Guest stack is mapped.288292 *289289- * Remember pin_stack_pages() which makes sure the stack is mapped? 
It could290290- * simply call demand_page(), but as we've seen that logic is quite long, and291291- * usually the stack pages are already mapped anyway, so it's not required.293293+ * Remember that direct traps into the Guest need a mapped Guest kernel stack.294294+ * pin_stack_pages() calls us here: we could simply call demand_page(), but as295295+ * we've seen that logic is quite long, and usually the stack pages are already296296+ * mapped, so it's overkill.292297 *293298 * This is a quick version which answers the question: is this virtual address294299 * mapped by the shadow page tables, and is it writable? */···302297 pgd_t *spgd;303298 unsigned long flags;304299305305- /* Look at the top level entry: is it present? */300300+ /* Look at the current top level entry: is it present? */306301 spgd = spgd_addr(lg, lg->pgdidx, vaddr);307302 if (!(pgd_flags(*spgd) & _PAGE_PRESENT))308303 return 0;···338333 release_pte(ptepage[i]);339334 /* Now we can free the page of PTEs */340335 free_page((long)ptepage);341341- /* And zero out the PGD entry we we never release it twice. */336336+ /* And zero out the PGD entry so we never release it twice. */342337 *spgd = __pgd(0);343338 }344339}345340346346-/*H:440 (v) Flushing (thowing away) page tables,347347- *348348- * We saw flush_user_mappings() called when we re-used a top-level pgdir page.349349- * It simply releases every PTE page from 0 up to the kernel address. */341341+/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()342342+ * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.343343+ * It simply releases every PTE page from 0 up to the Guest's kernel address. */350344static void flush_user_mappings(struct lguest *lg, int idx)351345{352346 unsigned int i;···354350 release_pgd(lg, lg->pgdirs[idx].pgdir + i);355351}356352357357-/* The Guest also has a hypercall to do this manually: it's used when a large358358- * number of mappings have been changed. 
*/353353+/*H:440 (v) Flushing (throwing away) page tables,354354+ *355355+ * The Guest has a hypercall to throw away the page tables: it's used when a356356+ * large number of mappings have been changed. */359357void guest_pagetable_flush_user(struct lguest *lg)360358{361359 /* Drop the userspace part of the current page table. */···429423430424/*H:430 (iv) Switching page tables431425 *432432- * This is what happens when the Guest changes page tables (ie. changes the433433- * top-level pgdir). This happens on almost every context switch. */426426+ * Now we've seen all the page table setting and manipulation, let's see what427427+ * what happens when the Guest changes page tables (ie. changes the top-level428428+ * pgdir). This occurs on almost every context switch. */434429void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)435430{436431 int newpgdir, repin = 0;···450443}451444452445/*H:470 Finally, a routine which throws away everything: all PGD entries in all453453- * the shadow page tables. This is used when we destroy the Guest. */446446+ * the shadow page tables, including the Guest's kernel mappings. This is used447447+ * when we destroy the Guest. */454448static void release_all_pagetables(struct lguest *lg)455449{456450 unsigned int i, j;···466458467459/* We also throw away everything when a Guest tells us it's changed a kernel468460 * mapping. Since kernel mappings are in every page table, it's easiest to469469- * throw them all away. This is amazingly slow, but thankfully rare. */461461+ * throw them all away. This traps the Guest in amber for a while as462462+ * everything faults back in, but it's rare. */470463void guest_pagetable_clear_all(struct lguest *lg)471464{472465 release_all_pagetables(lg);473466 /* We need the Guest kernel stack mapped again. */474467 pin_stack_pages(lg);475468}469469+/*:*/470470+/*M:009 Since we throw away all mappings when a kernel mapping changes, our471471+ * performance sucks for guests using highmem. 
In fact, a guest with472472+ * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is473473+ * usually slower than a Guest with less memory.474474+ *475475+ * This, of course, cannot be fixed. It would take some kind of... well, I476476+ * don't know, but the term "puissant code-fu" comes to mind. :*/476477477478/*H:420 This is the routine which actually sets the page table entry for then478479 * "idx"'th shadow page table.···500483static void do_set_pte(struct lguest *lg, int idx,501484 unsigned long vaddr, pte_t gpte)502485{503503- /* Look up the matching shadow page directot entry. */486486+ /* Look up the matching shadow page directory entry. */504487 pgd_t *spgd = spgd_addr(lg, idx, vaddr);505488506489 /* If the top level isn't present, there's no entry to update. */···517500 *spte = gpte_to_spte(lg, gpte,518501 pte_flags(gpte) & _PAGE_DIRTY);519502 } else520520- /* Otherwise we can demand_page() it in later. */503503+ /* Otherwise kill it and we can demand_page() it in504504+ * later. */521505 *spte = __pte(0);522506 }523507}···553535}554536555537/*H:400556556- * (iii) Setting up a page table entry when the Guest tells us it has changed.538538+ * (iii) Setting up a page table entry when the Guest tells us one has changed.557539 *558540 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal559541 * with the other side of page tables while we're here: what happens when the···630612631613/*H:480 (vi) Mapping the Switcher when the Guest is about to run.632614 *633633- * The Switcher and the two pages for this CPU need to be available to the615615+ * The Switcher and the two pages for this CPU need to be visible in the634616 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages635635- * for each CPU already set up, we just need to hook them in. */617617+ * for each CPU already set up, we just need to hook them in now we know which618618+ * Guest is about to run on this CPU. 
*/636619void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)637620{638621 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);···695676 pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]),696677 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));697678}679679+680680+/* We've made it through the page table code. Perhaps our tired brains are681681+ * still processing the details, or perhaps we're simply glad it's over.682682+ *683683+ * If nothing else, note that all this complexity in juggling shadow page684684+ * tables in sync with the Guest's page tables is for one reason: for most685685+ * Guests this page table dance determines how bad performance will be. This686686+ * is why Xen uses exotic direct Guest pagetable manipulation, and why both687687+ * Intel and AMD have implemented shadow page table support directly into688688+ * hardware.689689+ *690690+ * There is just one file remaining in the Host. */698691699692/*H:510 At boot or module load time, init_pagetables() allocates and populates700693 * the Switcher PTE page for each CPU. */
+29-19
drivers/lguest/segments.c
···1212#include "lg.h"13131414/*H:6001515- * We've almost completed the Host; there's just one file to go!1616- *1715 * Segments & The Global Descriptor Table1816 *1917 * (That title sounds like a bad Nerdcore group. Not to suggest that there are···5355 || num == GDT_ENTRY_DOUBLEFAULT_TSS);5456}55575656-/*H:610 Once the GDT has been changed, we fix the new entries up a little. We5858+/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We5759 * don't care if they're invalid: the worst that can happen is a General5860 * Protection Fault in the Switcher when it restores a Guest segment register5961 * which tries to use that entry. Then we kill the Guest for causing such a···8284 }8385}84868585-/* This routine is called at boot or modprobe time for each CPU to set up the8686- * "constant" GDT entries for Guests running on that CPU. */8787+/*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep8888+ * a GDT for each CPU, and copy across the Guest's entries each time we want to8989+ * run the Guest on that CPU.9090+ *9191+ * This routine is called at boot or modprobe time for each CPU to set up the9292+ * constant GDT entries: the ones which are the same no matter what Guest we're9393+ * running. */8794void setup_default_gdt_entries(struct lguest_ro_state *state)8895{8996 struct desc_struct *gdt = state->guest_gdt;9097 unsigned long tss = (unsigned long)&state->guest_tss;91989292- /* The hypervisor segments are full 0-4G segments, privilege level 0 */9999+ /* The Switcher segments are full 0-4G segments, privilege level 0 */93100 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;94101 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;951029696- /* The TSS segment refers to the TSS entry for this CPU, so we cannot9797- * copy it from the Guest. 
Forgive the magic flags */103103+ /* The TSS segment refers to the TSS entry for this particular CPU.104104+ * Forgive the magic flags: the 0x8900 means the entry is Present, it's105105+ * privilege level 0 Available 386 TSS system segment, and the 0x67106106+ * means Saturn is eclipsed by Mercury in the twelfth house. */98107 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);99108 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)100109 | ((tss >> 16) & 0x000000FF);101110}102111103103-/* This routine is called before the Guest is run for the first time. */112112+/* This routine sets up the initial Guest GDT for booting. All entries start113113+ * as 0 (unusable). */104114void setup_guest_gdt(struct lguest *lg)105115{106116 /* Start with full 0-4G segments... */···120114 lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);121115}122116123123-/* Like the IDT, we never simply use the GDT the Guest gives us. We set up the124124- * GDTs for each CPU, then we copy across the entries each time we want to run125125- * a different Guest on that CPU. */126126-127127-/* A partial GDT load, for the three "thead-local storage" entries. Otherwise128128- * it's just like load_guest_gdt(). So much, in fact, it would probably be129129- * neater to have a single hypercall to cover both. */117117+/*H:650 An optimization of copy_gdt(), for just the three "thead-local storage"118118+ * entries. */130119void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)131120{132121 unsigned int i;···130129 gdt[i] = lg->arch.gdt[i];131130}132131133133-/* This is the full version */132132+/*H:640 When the Guest is run on a different CPU, or the GDT entries have133133+ * changed, copy_gdt() is called to copy the Guest's GDT entries across to this134134+ * CPU's GDT. 
*/134135void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)135136{136137 unsigned int i;···144141 gdt[i] = lg->arch.gdt[i];145142}146143147147-/* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */144144+/*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT).145145+ * We copy it from the Guest and tweak the entries. */148146void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)149147{150148 /* We assume the Guest has the same number of GDT entries as the···161157 lg->changed |= CHANGED_GDT;162158}163159160160+/* This is the fast-track version for just changing the three TLS entries.161161+ * Remember that this happens on every context switch, so it's worth162162+ * optimizing. But wouldn't it be neater to have a single hypercall to cover163163+ * both cases? */164164void guest_load_tls(struct lguest *lg, unsigned long gtls)165165{166166 struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN];167167168168 __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);169169 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);170170+ /* Note that just the TLS entries have changed. */170171 lg->changed |= CHANGED_GDT_TLS;171172}173173+/*:*/172174173173-/*175175+/*H:660174176 * With this, we have finished the Host.175177 *176178 * Five of the seven parts of our task are complete. You have made it through
+62-58
drivers/lguest/x86/core.c
···6363static DEFINE_PER_CPU(struct lguest *, last_guest);64646565/*S:0106666- * We are getting close to the Switcher.6666+ * We approach the Switcher.6767 *6868 * Remember that each CPU has two pages which are visible to the Guest when it6969 * runs on that CPU. This has to contain the state for that Guest: we copy the···134134 *135135 * The lcall also pushes the old code segment (KERNEL_CS) onto the136136 * stack, then the address of this call. This stack layout happens to137137- * exactly match the stack of an interrupt... */137137+ * exactly match the stack layout created by an interrupt... */138138 asm volatile("pushf; lcall *lguest_entry"139139 /* This is how we tell GCC that %eax ("a") and %ebx ("b")140140 * are changed by this routine. The "=" means output. */···151151}152152/*:*/153153154154+/*M:002 There are hooks in the scheduler which we can register to tell when we155155+ * get kicked off the CPU (preempt_notifier_register()). This would allow us156156+ * to lazily disable SYSENTER which would regain some performance, and should157157+ * also simplify copy_in_guest_info(). Note that we'd still need to restore158158+ * things when we exit to Launcher userspace, but that's fairly easy.159159+ *160160+ * The hooks were designed for KVM, but we can also put them to good use. :*/161161+154162/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts155163 * are disabled: we own the CPU. */156164void lguest_arch_run_guest(struct lguest *lg)157165{158158- /* Remember the awfully-named TS bit? If the Guest has asked159159- * to set it we set it now, so we can trap and pass that trap160160- * to the Guest if it uses the FPU. */166166+ /* Remember the awfully-named TS bit? If the Guest has asked to set it167167+ * we set it now, so we can trap and pass that trap to the Guest if it168168+ * uses the FPU. */161169 if (lg->ts)162170 lguest_set_ts();163171164164- /* SYSENTER is an optimized way of doing system calls. 
We165165- * can't allow it because it always jumps to privilege level 0.166166- * A normal Guest won't try it because we don't advertise it in167167- * CPUID, but a malicious Guest (or malicious Guest userspace168168- * program) could, so we tell the CPU to disable it before169169- * running the Guest. */172172+ /* SYSENTER is an optimized way of doing system calls. We can't allow173173+ * it because it always jumps to privilege level 0. A normal Guest174174+ * won't try it because we don't advertise it in CPUID, but a malicious175175+ * Guest (or malicious Guest userspace program) could, so we tell the176176+ * CPU to disable it before running the Guest. */170177 if (boot_cpu_has(X86_FEATURE_SEP))171178 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);172179173173- /* Now we actually run the Guest. It will pop back out when174174- * something interesting happens, and we can examine its175175- * registers to see what it was doing. */180180+ /* Now we actually run the Guest. It will return when something181181+ * interesting happens, and we can examine its registers to see what it182182+ * was doing. */176183 run_guest_once(lg, lguest_pages(raw_smp_processor_id()));177184178178- /* The "regs" pointer contains two extra entries which are not179179- * really registers: a trap number which says what interrupt or180180- * trap made the switcher code come back, and an error code181181- * which some traps set. */185185+ /* Note that the "regs" pointer contains two extra entries which are186186+ * not really registers: a trap number which says what interrupt or187187+ * trap made the switcher code come back, and an error code which some188188+ * traps set. */182189183183- /* If the Guest page faulted, then the cr2 register will tell184184- * us the bad virtual address. We have to grab this now,185185- * because once we re-enable interrupts an interrupt could186186- * fault and thus overwrite cr2, or we could even move off to a187187- * different CPU. 
*/190190+ /* If the Guest page faulted, then the cr2 register will tell us the191191+ * bad virtual address. We have to grab this now, because once we192192+ * re-enable interrupts an interrupt could fault and thus overwrite193193+ * cr2, or we could even move off to a different CPU. */188194 if (lg->regs->trapnum == 14)189195 lg->arch.last_pagefault = read_cr2();190196 /* Similarly, if we took a trap because the Guest used the FPU,···203197 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);204198}205199206206-/*H:130 Our Guest is usually so well behaved; it never tries to do things it207207- * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't208208- * quite complete, because it doesn't contain replacements for the Intel I/O209209- * instructions. As a result, the Guest sometimes fumbles across one during210210- * the boot process as it probes for various things which are usually attached211211- * to a PC.200200+/*H:130 Now we've examined the hypercall code; our Guest can make requests.201201+ * Our Guest is usually so well behaved; it never tries to do things it isn't202202+ * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual203203+ * infrastructure isn't quite complete, because it doesn't contain replacements204204+ * for the Intel I/O instructions. As a result, the Guest sometimes fumbles205205+ * across one during the boot process as it probes for various things which are206206+ * usually attached to a PC.212207 *213213- * When the Guest uses one of these instructions, we get trap #13 (General208208+ * When the Guest uses one of these instructions, we get a trap (General214209 * Protection Fault) and come here. We see if it's one of those troublesome215210 * instructions and skip over it. We return true if we did. */216211static int emulate_insn(struct lguest *lg)···282275void lguest_arch_handle_trap(struct lguest *lg)283276{284277 switch (lg->regs->trapnum) {285285- case 13: /* We've intercepted a GPF. 
*/286286- /* Check if this was one of those annoying IN or OUT287287- * instructions which we need to emulate. If so, we288288- * just go back into the Guest after we've done it. */278278+ case 13: /* We've intercepted a General Protection Fault. */279279+ /* Check if this was one of those annoying IN or OUT280280+ * instructions which we need to emulate. If so, we just go281281+ * back into the Guest after we've done it. */289282 if (lg->regs->errcode == 0) {290283 if (emulate_insn(lg))291284 return;292285 }293286 break;294294- case 14: /* We've intercepted a page fault. */295295- /* The Guest accessed a virtual address that wasn't296296- * mapped. This happens a lot: we don't actually set297297- * up most of the page tables for the Guest at all when298298- * we start: as it runs it asks for more and more, and299299- * we set them up as required. In this case, we don't300300- * even tell the Guest that the fault happened.301301- *302302- * The errcode tells whether this was a read or a303303- * write, and whether kernel or userspace code. */287287+ case 14: /* We've intercepted a Page Fault. */288288+ /* The Guest accessed a virtual address that wasn't mapped.289289+ * This happens a lot: we don't actually set up most of the290290+ * page tables for the Guest at all when we start: as it runs291291+ * it asks for more and more, and we set them up as292292+ * required. In this case, we don't even tell the Guest that293293+ * the fault happened.294294+ *295295+ * The errcode tells whether this was a read or a write, and296296+ * whether kernel or userspace code. */304297 if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))305298 return;306299307307- /* OK, it's really not there (or not OK): the Guest308308- * needs to know. 
We write out the cr2 value so it309309- * knows where the fault occurred.310310- *311311- * Note that if the Guest were really messed up, this312312- * could happen before it's done the INITIALIZE313313- * hypercall, so lg->lguest_data will be NULL */300300+ /* OK, it's really not there (or not OK): the Guest needs to301301+ * know. We write out the cr2 value so it knows where the302302+ * fault occurred.303303+ *304304+ * Note that if the Guest were really messed up, this could305305+ * happen before it's done the LHCALL_LGUEST_INIT hypercall, so306306+ * lg->lguest_data could be NULL */314307 if (lg->lguest_data &&315308 put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))316309 kill_guest(lg, "Writing cr2");317310 break;318311 case 7: /* We've intercepted a Device Not Available fault. */319319- /* If the Guest doesn't want to know, we already320320- * restored the Floating Point Unit, so we just321321- * continue without telling it. */312312+ /* If the Guest doesn't want to know, we already restored the313313+ * Floating Point Unit, so we just continue without telling314314+ * it. */322315 if (!lg->ts)323316 return;324317 break;···543536544537 return 0;545538}546546-/* Now we've examined the hypercall code; our Guest can make requests. There547547- * is one other way we can do things for the Guest, as we see in548548- * emulate_insn(). :*/549539550540/*L:030 lguest_arch_setup_regs()551541 *···574570575571 /* %esi points to our boot information, at physical address 0, so don't576572 * touch it. */573573+577574 /* There are a couple of GDT entries the Guest expects when first578575 * booting. */579579-580576 setup_guest_gdt(lg);581577}
+51-20
drivers/lguest/x86/switcher_32.S
···66 * are feeling invigorated and refreshed then the next, more challenging stage77 * can be found in "make Guest". :*/8899+/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must1010+ * gain at least 1% more performance. Since neither LOC nor performance can be1111+ * measured beforehand, it generally means implementing a feature then deciding1212+ * if it's worth it. And once it's implemented, who can say no?1313+ *1414+ * This is why I haven't implemented this idea myself. I want to, but I1515+ * haven't. You could, though.1616+ *1717+ * The main place where lguest performance sucks is Guest page faulting. When1818+ * a Guest userspace process hits an unmapped page we switch back to the Host,1919+ * walk the page tables, find it's not mapped, switch back to the Guest page2020+ * fault handler, which calls a hypercall to set the page table entry, then2121+ * finally returns to userspace. That's two round-trips.2222+ *2323+ * If we had a small walker in the Switcher, we could quickly check the Guest2424+ * page table and if the page isn't mapped, immediately reflect the fault back2525+ * into the Guest. This means the Switcher would have to know the top of the2626+ * Guest page table and the page fault handler address.2727+ *2828+ * For simplicity, the Guest should only handle the case where the privilege2929+ * level of the fault is 3 and probably only not present or write faults. It3030+ * should also detect recursive faults, and hand the original fault to the3131+ * Host (which is actually really easy).3232+ *3333+ * Two questions remain. Would the performance gain outweigh the complexity?3434+ * And who would write the verse documenting it? :*/3535+3636+/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their3737+ * code). It's worth doing though, since it would let us use oprofile in the3838+ * Host when a Guest is running. 
:*/3939+940/*S:1001041 * Welcome to the Switcher itself!1142 *···1198812089 // All saved and there's now five steps before us:12190 // Stack, GDT, IDT, TSS122122- // And last of all the page tables are flipped.9191+ // Then last of all the page tables are flipped.1239212493 // Yet beware that our stack pointer must be12594 // Always valid lest an NMI hits···134103 lgdt LGUEST_PAGES_guest_gdt_desc(%eax)135104136105 // The Guest's IDT we did partially137137- // Move to the "struct lguest_pages" as well.106106+ // Copy to "struct lguest_pages" as well.138107 lidt LGUEST_PAGES_guest_idt_desc(%eax)139108140109 // The TSS entry which controls traps141110 // Must be loaded up with "ltr" now:111111+ // The GDT entry that TSS uses 112112+ // Changes type when we load it: damn Intel!142113 // For after we switch over our page tables143143- // It (as the rest) will be writable no more.144144- // (The GDT entry TSS needs145145- // Changes type when we load it: damn Intel!)114114+ // That entry will be read-only: we'd crash.146115 movl $(GDT_ENTRY_TSS*8), %edx147116 ltr %dx148117149118 // Look back now, before we take this last step!150119 // The Host's TSS entry was also marked used;151151- // Let's clear it again, ere we return.120120+ // Let's clear it again for our return.152121 // The GDT descriptor of the Host153122 // Points to the table after two "size" bytes154123 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx155155- // Clear the type field of "used" (byte 5, bit 2)124124+ // Clear "used" from type field (byte 5, bit 2)156125 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)157126158127 // Once our page table's switched, the Guest is live!···162131163132 // The page table change did one tricky thing:164133 // The Guest's register page has been mapped165165- // Writable onto our %esp (stack) --134134+ // Writable under our %esp (stack) --166135 // We can simply pop off all Guest regs.167136 popl %eax168137 popl %ebx···183152 addl $8, %esp184153185154 // The last five stack slots 
hold return address186186- // And everything needed to change privilege187187- // Into the Guest privilege level of 1,155155+ // And everything needed to switch privilege156156+ // From Switcher's level 0 to Guest's 1,188157 // And the stack where the Guest had last left it.189158 // Interrupts are turned back on: we are Guest.190159 iret191160192192-// There are two paths where we switch to the Host161161+// We treat two paths to switch back to the Host162162+// Yet both must save Guest state and restore Host193163// So we put the routine in a macro.194194-// We are on our way home, back to the Host195195-// Interrupted out of the Guest, we come here.196164#define SWITCH_TO_HOST \197165 /* We save the Guest state: all registers first \198166 * Laid out just as "struct lguest_regs" defines */ \···224194 movl %esp, %eax; \225195 andl $(~(1 << PAGE_SHIFT - 1)), %eax; \226196 /* Save our trap number: the switch will obscure it \227227- * (The Guest regs are not mapped here in the Host) \197197+ * (In the Host the Guest regs are not mapped here) \228198 * %ebx holds it safe for deliver_to_host */ \229199 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \230200 /* The Host GDT, IDT and stack! \···240210 /* Switch to Host's GDT, IDT. */ \241211 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \242212 lidt LGUEST_PAGES_host_idt_desc(%eax); \243243- /* Restore the Host's stack where it's saved regs lie */ \213213+ /* Restore the Host's stack where its saved regs lie */ \244214 movl LGUEST_PAGES_host_sp(%eax), %esp; \245245- /* Last the TSS: our Host is complete */ \215215+ /* Last the TSS: our Host is returned */ \246216 movl $(GDT_ENTRY_TSS*8), %edx; \247217 ltr %dx; \248218 /* Restore now the regs saved right at the first. 
*/ \···252222 popl %ds; \253223 popl %es254224255255-// Here's where we come when the Guest has just trapped:256256-// (Which trap we'll see has been pushed on the stack).225225+// The first path is trod when the Guest has trapped:226226+// (Which trap it was has been pushed on the stack).257227// We need only switch back, and the Host will decode258228// Why we came home, and what needs to be done.259229return_to_host:260230 SWITCH_TO_HOST261231 iret262232233233+// We are led to the second path like so:263234// An interrupt, with some cause external264235// Has ajerked us rudely from the Guest's code265236// Again we must return home to the Host···269238 // But now we must go home via that place270239 // Where that interrupt was supposed to go271240 // Had we not been ensconced, running the Guest.272272- // Here we see the cleverness of our stack:241241+ // Here we see the trickiness of run_guest_once():273242 // The Host stack is formed like an interrupt274243 // With EIP, CS and EFLAGS layered.275244 // Interrupt handlers end with "iret"···294263 xorw %ax, %ax295264 orl %eax, %edx296265 // Now the address of the handler's in %edx297297- // We call it now: its "iret" takes us home.266266+ // We call it now: its "iret" drops us home.298267 jmp *%edx299268300269// Every interrupt can come to us here
+8-8
include/asm-x86/lguest_hcall.h
···1818#define LHCALL_LOAD_TLS 161919#define LHCALL_NOTIFY 1720202121+#define LGUEST_TRAP_ENTRY 0x1F2222+2323+#ifndef __ASSEMBLY__2424+#include <asm/hw_irq.h>2525+2126/*G:031 First, how does our Guest contact the Host to ask for privileged2227 * operations? There are two ways: the direct way is to make a "hypercall",2328 * to make requests of the Host Itself.2429 *2530 * Our hypercall mechanism uses the highest unused trap code (traps 32 and2626- * above are used by real hardware interrupts). Seventeen hypercalls are3131+ * above are used by real hardware interrupts). Fifteen hypercalls are2732 * available: the hypercall number is put in the %eax register, and the2833 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return2934 * value makes sense, it's returned in %eax.···3631 * Grossly invalid calls result in Sudden Death at the hands of the vengeful3732 * Host, rather than returning failure. This reflects Winston Churchill's3833 * definition of a gentleman: "someone who is only rude intentionally". */3939-#define LGUEST_TRAP_ENTRY 0x1F4040-4141-#ifndef __ASSEMBLY__4242-#include <asm/hw_irq.h>4343-4434static inline unsigned long4535hcall(unsigned long call,4636 unsigned long arg1, unsigned long arg2, unsigned long arg3)4737{4838 /* "int" is the Intel instruction to trigger a trap. */4939 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)5050- /* The call is in %eax (aka "a"), and can be replaced */4040+ /* The call in %eax (aka "a") might be overwritten */5141 : "=a"(call)5252- /* The other arguments are in %eax, %edx, %ebx & %ecx */4242+ /* The arguments are in %eax, %edx, %ebx & %ecx */5343 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)5444 /* "memory" means this might write somewhere in memory.5545 * This isn't true for all calls, but it's safe to tell
+2-2
include/linux/lguest.h
···1212#define LG_CLOCK_MAX_DELTA ULONG_MAX13131414/*G:032 The second method of communicating with the Host is via "struct1515- * lguest_data". The Guest's very first hypercall is to tell the Host where1616- * this is, and then the Guest and Host both publish information in it. :*/1515+ * lguest_data". Once the Guest's initialization hypercall tells the Host where1616+ * this is, the Guest and Host both publish information in it. :*/1717struct lguest_data1818{1919 /* 512 == enabled (same as eflags in normal hardware). The Guest
+5-1
include/linux/lguest_launcher.h
···1010 * real devices (think of the damage it could do!) we provide virtual devices.1111 * We could emulate a PCI bus with various devices on it, but that is a fairly1212 * complex burden for the Host and suboptimal for the Guest, so we have our own1313- * "lguest" bus and simple drivers.1313+ * simple lguest bus and we use "virtio" drivers. These drivers need a set of1414+ * routines from us which will actually do the virtual I/O, but they handle all1515+ * the net/block/console stuff themselves. This means that if we want to add1616+ * a new device, we simply need to write a new virtio driver and create support1717+ * for it in the Launcher: this code won't need to change.1418 *1519 * Devices are described by a simplified ID, a status byte, and some "config"1620 * bytes which describe this device's configuration. This is placed by the