Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

lguest: update commentary

Every so often, after code shuffles, I need to go through and unbitrot
the Lguest Journey (see drivers/lguest/README). Since we now use RCU in
a simple form in one place I took the opportunity to expand that explanation.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>

+399 -112
+139 -45
Documentation/lguest/lguest.c
··· 49 49 #include "linux/virtio_ring.h" 50 50 #include "asm/bootparam.h" 51 51 /*L:110 52 - * We can ignore the 39 include files we need for this program, but I do want 52 + * We can ignore the 42 include files we need for this program, but I do want 53 53 * to draw attention to the use of kernel-style types. 54 54 * 55 55 * As Linus said, "C is a Spartan language, and so should your naming be." I ··· 305 305 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); 306 306 if (addr == MAP_FAILED) 307 307 err(1, "Mmaping %u pages of /dev/zero", num); 308 + 309 + /* 310 + * One neat mmap feature is that you can close the fd, and it 311 + * stays mapped. 312 + */ 308 313 close(fd); 309 314 310 315 return addr; ··· 562 557 } 563 558 /*:*/ 564 559 565 - /* 560 + /*L:200 566 561 * Device Handling. 567 562 * 568 563 * When the Guest gives us a buffer, it sends an array of addresses and sizes. ··· 613 608 return next; 614 609 } 615 610 616 - /* This actually sends the interrupt for this virtqueue */ 611 + /* 612 + * This actually sends the interrupt for this virtqueue, if we've used a 613 + * buffer. 614 + */ 617 615 static void trigger_irq(struct virtqueue *vq) 618 616 { 619 617 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; ··· 637 629 } 638 630 639 631 /* 640 - * This looks in the virtqueue and for the first available buffer, and converts 632 + * This looks in the virtqueue for the first available buffer, and converts 641 633 * it to an iovec for convenient access. Since descriptors consist of some 642 634 * number of output then some number of input descriptors, it's actually two 643 635 * iovecs, but we pack them into one and note how many of each there were. 644 636 * 645 - * This function returns the descriptor number found. 637 + * This function waits if necessary, and returns the descriptor number found. 
646 638 */ 647 639 static unsigned wait_for_vq_desc(struct virtqueue *vq, 648 640 struct iovec iov[], ··· 652 644 struct vring_desc *desc; 653 645 u16 last_avail = lg_last_avail(vq); 654 646 647 + /* There's nothing available? */ 655 648 while (last_avail == vq->vring.avail->idx) { 656 649 u64 event; 657 650 658 - /* OK, tell Guest about progress up to now. */ 651 + /* 652 + * Since we're about to sleep, now is a good time to tell the 653 + * Guest about what we've used up to now. 654 + */ 659 655 trigger_irq(vq); 660 656 661 657 /* OK, now we need to know about added descriptors. */ ··· 746 734 } 747 735 748 736 /* 749 - * After we've used one of their buffers, we tell them about it. We'll then 750 - * want to send them an interrupt, using trigger_irq(). 737 + * After we've used one of their buffers, we tell the Guest about it. Sometime 738 + * later we'll want to send them an interrupt using trigger_irq(); note that 739 + * wait_for_vq_desc() does that for us if it has to wait. 751 740 */ 752 741 static void add_used(struct virtqueue *vq, unsigned int head, int len) 753 742 { ··· 795 782 struct console_abort *abort = vq->dev->priv; 796 783 struct iovec iov[vq->vring.num]; 797 784 798 - /* Make sure there's a descriptor waiting. */ 785 + /* Make sure there's a descriptor available. */ 799 786 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 800 787 if (out_num) 801 788 errx(1, "Output buffers in console in queue?"); 802 789 803 - /* Read it in. */ 790 + /* Read into it. This is where we usually wait. */ 804 791 len = readv(STDIN_FILENO, iov, in_num); 805 792 if (len <= 0) { 806 793 /* Ran out of input? */ ··· 813 800 pause(); 814 801 } 815 802 803 + /* Tell the Guest we used a buffer. */ 816 804 add_used_and_trigger(vq, head, len); 817 805 818 806 /* ··· 848 834 unsigned int head, out, in; 849 835 struct iovec iov[vq->vring.num]; 850 836 837 + /* We usually wait in here, for the Guest to give us something. 
*/ 851 838 head = wait_for_vq_desc(vq, iov, &out, &in); 852 839 if (in) 853 840 errx(1, "Input buffers in console output queue?"); 841 + 842 + /* writev can return a partial write, so we loop here. */ 854 843 while (!iov_empty(iov, out)) { 855 844 int len = writev(STDOUT_FILENO, iov, out); 856 845 if (len <= 0) 857 846 err(1, "Write to stdout gave %i", len); 858 847 iov_consume(iov, out, len); 859 848 } 849 + 850 + /* 851 + * We're finished with that buffer: if we're going to sleep, 852 + * wait_for_vq_desc() will prod the Guest with an interrupt. 853 + */ 860 854 add_used(vq, head, 0); 861 855 } 862 856 ··· 884 862 unsigned int head, out, in; 885 863 struct iovec iov[vq->vring.num]; 886 864 865 + /* We usually wait in here for the Guest to give us a packet. */ 887 866 head = wait_for_vq_desc(vq, iov, &out, &in); 888 867 if (in) 889 868 errx(1, "Input buffers in net output queue?"); 869 + /* 870 + * Send the whole thing through to /dev/net/tun. It expects the exact 871 + * same format: what a coincidence! 872 + */ 890 873 if (writev(net_info->tunfd, iov, out) < 0) 891 874 errx(1, "Write to tun failed?"); 875 + 876 + /* 877 + * Done with that one; wait_for_vq_desc() will send the interrupt if 878 + * all packets are processed. 879 + */ 892 880 add_used(vq, head, 0); 893 881 } 894 882 895 - /* Will reading from this file descriptor block? */ 883 + /* 884 + * Handling network input is a bit trickier, because I've tried to optimize it. 885 + * 886 + * First we have a helper routine which tells is if from this file descriptor 887 + * (ie. the /dev/net/tun device) will block: 888 + */ 896 889 static bool will_block(int fd) 897 890 { 898 891 fd_set fdset; ··· 917 880 return select(fd+1, &fdset, NULL, NULL, &zero) != 1; 918 881 } 919 882 920 - /* This handles packets coming in from the tun device to our Guest. */ 883 + /* 884 + * This handles packets coming in from the tun device to our Guest. 
Like all 885 + * service routines, it gets called again as soon as it returns, so you don't 886 + * see a while(1) loop here. 887 + */ 921 888 static void net_input(struct virtqueue *vq) 922 889 { 923 890 int len; ··· 929 888 struct iovec iov[vq->vring.num]; 930 889 struct net_info *net_info = vq->dev->priv; 931 890 891 + /* 892 + * Get a descriptor to write an incoming packet into. This will also 893 + * send an interrupt if they're out of descriptors. 894 + */ 932 895 head = wait_for_vq_desc(vq, iov, &out, &in); 933 896 if (out) 934 897 errx(1, "Output buffers in net input queue?"); 935 898 936 - /* Deliver interrupt now, since we're about to sleep. */ 899 + /* 900 + * If it looks like we'll block reading from the tun device, send them 901 + * an interrupt. 902 + */ 937 903 if (vq->pending_used && will_block(net_info->tunfd)) 938 904 trigger_irq(vq); 939 905 906 + /* 907 + * Read in the packet. This is where we normally wait (when there's no 908 + * incoming network traffic). 909 + */ 940 910 len = readv(net_info->tunfd, iov, in); 941 911 if (len <= 0) 942 912 err(1, "Failed to read from tun."); 913 + 914 + /* 915 + * Mark that packet buffer as used, but don't interrupt here. We want 916 + * to wait until we've done as much work as we can. 917 + */ 943 918 add_used(vq, head, len); 944 919 } 920 + /*:*/ 945 921 946 - /* This is the helper to create threads. */ 922 + /* This is the helper to create threads: run the service routine in a loop. */ 947 923 static int do_thread(void *_vq) 948 924 { 949 925 struct virtqueue *vq = _vq; ··· 1008 950 signal(SIGCHLD, (void *)kill_launcher); 1009 951 } 1010 952 953 + /*L:216 954 + * This actually creates the thread which services the virtqueue for a device. 955 + */ 1011 956 static void create_thread(struct virtqueue *vq) 1012 957 { 1013 958 /* 1014 - * Create stack for thread and run it. Since the stack grows upwards, 1015 - * we point the stack pointer to the end of this region. 959 + * Create stack for thread. 
Since the stack grows upwards, we point 960 + * the stack pointer to the end of this region. 1016 961 */ 1017 962 char *stack = malloc(32768); 1018 963 unsigned long args[] = { LHREQ_EVENTFD, ··· 1027 966 err(1, "Creating eventfd"); 1028 967 args[2] = vq->eventfd; 1029 968 1030 - /* Attach an eventfd to this virtqueue: it will go off 1031 - * when the Guest does an LHCALL_NOTIFY for this vq. */ 969 + /* 970 + * Attach an eventfd to this virtqueue: it will go off when the Guest 971 + * does an LHCALL_NOTIFY for this vq. 972 + */ 1032 973 if (write(lguest_fd, &args, sizeof(args)) != 0) 1033 974 err(1, "Attaching eventfd"); 1034 975 1035 - /* CLONE_VM: because it has to access the Guest memory, and 1036 - * SIGCHLD so we get a signal if it dies. */ 976 + /* 977 + * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so 978 + * we get a signal if it dies. 979 + */ 1037 980 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); 1038 981 if (vq->thread == (pid_t)-1) 1039 982 err(1, "Creating clone"); 1040 - /* We close our local copy, now the child has it. */ 983 + 984 + /* We close our local copy now the child has it. */ 1041 985 close(vq->eventfd); 1042 986 } 1043 987 ··· 1094 1028 } 1095 1029 } 1096 1030 1097 - /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ 1031 + /*L:215 1032 + * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In 1033 + * particular, it's used to notify us of device status changes during boot. 1034 + */ 1098 1035 static void handle_output(unsigned long addr) 1099 1036 { 1100 1037 struct device *i; ··· 1106 1037 for (i = devices.dev; i; i = i->next) { 1107 1038 struct virtqueue *vq; 1108 1039 1109 - /* Notifications to device descriptors update device status. */ 1040 + /* 1041 + * Notifications to device descriptors mean they updated the 1042 + * device status. 
1043 + */ 1110 1044 if (from_guest_phys(addr) == i->desc) { 1111 1045 update_device_status(i); 1112 1046 return; 1113 1047 } 1114 1048 1115 - /* Devices *can* be used before status is set to DRIVER_OK. */ 1049 + /* 1050 + * Devices *can* be used before status is set to DRIVER_OK. 1051 + * The original plan was that they would never do this: they 1052 + * would always finish setting up their status bits before 1053 + * actually touching the virtqueues. In practice, we allowed 1054 + * them to, and they do (eg. the disk probes for partition 1055 + * tables as part of initialization). 1056 + * 1057 + * If we see this, we start the device: once it's running, we 1058 + * expect the device to catch all the notifications. 1059 + */ 1116 1060 for (vq = i->vq; vq; vq = vq->next) { 1117 1061 if (addr != vq->config.pfn*getpagesize()) 1118 1062 continue; 1119 1063 if (i->running) 1120 1064 errx(1, "Notification on running %s", i->name); 1065 + /* This just calls create_thread() for each virtqueue */ 1121 1066 start_device(i); 1122 1067 return; 1123 1068 } ··· 1215 1132 vq->next = NULL; 1216 1133 vq->last_avail_idx = 0; 1217 1134 vq->dev = dev; 1135 + 1136 + /* 1137 + * This is the routine the service thread will run, and its Process ID 1138 + * once it's running. 1139 + */ 1218 1140 vq->service = service; 1219 1141 vq->thread = (pid_t)-1; 1220 1142 ··· 1290 1202 1291 1203 /* 1292 1204 * This routine does all the creation and setup of a new device, including 1293 - * calling new_dev_desc() to allocate the descriptor and device memory. 1205 + * calling new_dev_desc() to allocate the descriptor and device memory. We 1206 + * don't actually start the service threads until later. 1294 1207 * 1295 1208 * See what I mean about userspace being boring? 
1296 1209 */ ··· 1567 1478 verbose("device %u: tun %s: %s\n", 1568 1479 devices.device_num, tapif, arg); 1569 1480 } 1570 - 1571 - /* 1572 - * Our block (disk) device should be really simple: the Guest asks for a block 1573 - * number and we read or write that position in the file. Unfortunately, that 1574 - * was amazingly slow: the Guest waits until the read is finished before 1575 - * running anything else, even if it could have been doing useful work. 1576 - * 1577 - * We could use async I/O, except it's reputed to suck so hard that characters 1578 - * actually go missing from your code when you try to use it. 1579 - * 1580 - * So this was one reason why lguest now does all virtqueue servicing in 1581 - * separate threads: it's more efficient and more like a real device. 1582 - */ 1481 + /*:*/ 1583 1482 1584 1483 /* This hangs off device->priv. */ 1585 1484 struct vblk_info ··· 1589 1512 /*L:210 1590 1513 * The Disk 1591 1514 * 1592 - * Remember that the block device is handled by a separate I/O thread. We head 1593 - * straight into the core of that thread here: 1515 + * The disk only has one virtqueue, so it only has one thread. It is really 1516 + * simple: the Guest asks for a block number and we read or write that position 1517 + * in the file. 1518 + * 1519 + * Before we serviced each virtqueue in a separate thread, that was unacceptably 1520 + * slow: the Guest waits until the read is finished before running anything 1521 + * else, even if it could have been doing useful work. 1522 + * 1523 + * We could have used async I/O, except it's reputed to suck so hard that 1524 + * characters actually go missing from your code when you try to use it. 1594 1525 */ 1595 1526 static void blk_request(struct virtqueue *vq) 1596 1527 { ··· 1610 1525 struct iovec iov[vq->vring.num]; 1611 1526 off64_t off; 1612 1527 1613 - /* Get the next request. */ 1528 + /* 1529 + * Get the next request, where we normally wait. 
It triggers the 1530 + * interrupt to acknowledge previously serviced requests (if any). 1531 + */ 1614 1532 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 1615 1533 1616 1534 /* ··· 1627 1539 1628 1540 out = convert(&iov[0], struct virtio_blk_outhdr); 1629 1541 in = convert(&iov[out_num+in_num-1], u8); 1542 + /* 1543 + * For historical reasons, block operations are expressed in 512 byte 1544 + * "sectors". 1545 + */ 1630 1546 off = out->sector * 512; 1631 1547 1632 1548 /* ··· 1706 1614 if (out->type & VIRTIO_BLK_T_BARRIER) 1707 1615 fdatasync(vblk->fd); 1708 1616 1617 + /* Finished that request. */ 1709 1618 add_used(vq, head, wlen); 1710 1619 } 1711 1620 ··· 1775 1682 errx(1, "Output buffers in rng?"); 1776 1683 1777 1684 /* 1778 - * This is why we convert to iovecs: the readv() call uses them, and so 1779 - * it reads straight into the Guest's buffer. We loop to make sure we 1780 - * fill it. 1685 + * Just like the console write, we loop to cover the whole iovec. 1686 + * In this case, short reads actually happen quite a bit. 1781 1687 */ 1782 1688 while (!iov_empty(iov, in_num)) { 1783 1689 len = readv(rng_info->rfd, iov, in_num); ··· 1910 1818 devices.lastdev = NULL; 1911 1819 devices.next_irq = 1; 1912 1820 1821 + /* We're CPU 0. In fact, that's the only CPU possible right now. */ 1913 1822 cpu_id = 0; 1823 + 1914 1824 /* 1915 1825 * We need to know how much memory so we can set up the device 1916 1826 * descriptor and memory pages for the devices as we parse the command ··· 2020 1926 */ 2021 1927 tell_kernel(start); 2022 1928 2023 - /* Ensure that we terminate if a child dies. */ 1929 + /* Ensure that we terminate if a device-servicing child dies. */ 2024 1930 signal(SIGCHLD, kill_launcher); 2025 1931 2026 1932 /* If we exit via err(), this kills all the threads, restores tty. */
+4 -4
arch/x86/include/asm/lguest_hcall.h
··· 35 35 * operations? There are two ways: the direct way is to make a "hypercall", 36 36 * to make requests of the Host Itself. 37 37 * 38 - * We use the KVM hypercall mechanism. Seventeen hypercalls are 39 - * available: the hypercall number is put in the %eax register, and the 40 - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. 41 - * If a return value makes sense, it's returned in %eax. 38 + * We use the KVM hypercall mechanism, though completely different hypercall 39 + * numbers. Seventeen hypercalls are available: the hypercall number is put in 40 + * the %eax register, and the arguments (when required) are placed in %ebx, 41 + * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. 42 42 * 43 43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 44 44 * Host, rather than returning failure. This reflects Winston Churchill's
+77 -22
arch/x86/lguest/boot.c
··· 154 154 async_hcall(call, arg1, 0, 0, 0); 155 155 } 156 156 157 + /* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ 157 158 static void lazy_hcall2(unsigned long call, 158 159 unsigned long arg1, 159 160 unsigned long arg2) ··· 190 189 } 191 190 #endif 192 191 193 - /* When lazy mode is turned off reset the per-cpu lazy mode variable and then 194 - * issue the do-nothing hypercall to flush any stored calls. */ 192 + /*G:036 193 + * When lazy mode is turned off reset the per-cpu lazy mode variable and then 194 + * issue the do-nothing hypercall to flush any stored calls. 195 + :*/ 195 196 static void lguest_leave_lazy_mmu_mode(void) 196 197 { 197 198 kvm_hypercall0(LHCALL_FLUSH_ASYNC); ··· 253 250 extern void lg_restore_fl(unsigned long flags); 254 251 255 252 /*M:003 256 - * Note that we don't check for outstanding interrupts when we re-enable them 257 - * (or when we unmask an interrupt). This seems to work for the moment, since 258 - * interrupts are rare and we'll just get the interrupt on the next timer tick, 259 - * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would 260 - * be to put the "irq_enabled" field in a page by itself, and have the Host 261 - * write-protect it when an interrupt comes in when irqs are disabled. There 262 - * will then be a page fault as soon as interrupts are re-enabled. 253 + * We could be more efficient in our checking of outstanding interrupts, rather 254 + * than using a branch. One way would be to put the "irq_enabled" field in a 255 + * page by itself, and have the Host write-protect it when an interrupt comes 256 + * in when irqs are disabled. There will then be a page fault as soon as 257 + * interrupts are re-enabled. 263 258 * 264 259 * A better method is to implement soft interrupt disable generally for x86: 265 260 * instead of disabling interrupts, we set a flag. 
If an interrupt does come ··· 569 568 * cr3 ---> +---------+ 570 569 * | --------->+---------+ 571 570 * | | | PADDR1 | 572 - * Top-level | | PADDR2 | 571 + * Mid-level | | PADDR2 | 573 572 * (PMD) page | | | 574 573 * | | Lower-level | 575 574 * | | (PTE) page | ··· 589 588 * Index into top Index into second Offset within page 590 589 * page directory page pagetable page 591 590 * 592 - * The kernel spends a lot of time changing both the top-level page directory 593 - * and lower-level pagetable pages. The Guest doesn't know physical addresses, 594 - * so while it maintains these page tables exactly like normal, it also needs 595 - * to keep the Host informed whenever it makes a change: the Host will create 596 - * the real page tables based on the Guests'. 591 + * Now, unfortunately, this isn't the whole story: Intel added Physical Address 592 + * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). 593 + * These are held in 64-bit page table entries, so we can now only fit 512 594 + * entries in a page, and the neat three-level tree breaks down. 595 + * 596 + * The result is a four level page table: 597 + * 598 + * cr3 --> [ 4 Upper ] 599 + * [ Level ] 600 + * [ Entries ] 601 + * [(PUD Page)]---> +---------+ 602 + * | --------->+---------+ 603 + * | | | PADDR1 | 604 + * Mid-level | | PADDR2 | 605 + * (PMD) page | | | 606 + * | | Lower-level | 607 + * | | (PTE) page | 608 + * | | | | 609 + * .... .... 610 + * 611 + * 612 + * And the virtual address is decoded as: 613 + * 614 + * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 615 + * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| 616 + * Index into Index into mid Index into lower Offset within page 617 + * top entries directory page pagetable page 618 + * 619 + * It's too hard to switch between these two formats at runtime, so Linux only 620 + * supports one or the other depending on whether CONFIG_X86_PAE is set. 
Many 621 + * distributions turn it on, and not just for people with silly amounts of 622 + * memory: the larger PTE entries allow room for the NX bit, which lets the 623 + * kernel disable execution of pages and increase security. 624 + * 625 + * This was a problem for lguest, which couldn't run on these distributions; 626 + * then Matias Zabaljauregui figured it all out and implemented it, and only a 627 + * handful of puppies were crushed in the process! 628 + * 629 + * Back to our point: the kernel spends a lot of time changing both the 630 + * top-level page directory and lower-level pagetable pages. The Guest doesn't 631 + * know physical addresses, so while it maintains these page tables exactly 632 + * like normal, it also needs to keep the Host informed whenever it makes a 633 + * change: the Host will create the real page tables based on the Guests'. 597 634 */ 598 635 599 636 /* 600 - * The Guest calls this to set a second-level entry (pte), ie. to map a page 601 - * into a process' address space. We set the entry then tell the Host the 602 - * toplevel and address this corresponds to. The Guest uses one pagetable per 603 - * process, so we need to tell the Host which one we're changing (mm->pgd). 637 + * The Guest calls this after it has set a second-level entry (pte), ie. to map 638 + * a page into a process' address space. Wetell the Host the toplevel and 639 + * address this corresponds to. The Guest uses one pagetable per process, so 640 + * we need to tell the Host which one we're changing (mm->pgd). 604 641 */ 605 642 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 606 643 pte_t *ptep) 607 644 { 608 645 #ifdef CONFIG_X86_PAE 646 + /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ 609 647 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, 610 648 ptep->pte_low, ptep->pte_high); 611 649 #else ··· 652 612 #endif 653 613 } 654 614 615 + /* This is the "set and update" combo-meal-deal version. 
*/ 655 616 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 656 617 pte_t *ptep, pte_t pteval) 657 618 { ··· 713 672 } 714 673 715 674 #ifdef CONFIG_X86_PAE 675 + /* 676 + * With 64-bit PTE values, we need to be careful setting them: if we set 32 677 + * bits at a time, the hardware could see a weird half-set entry. These 678 + * versions ensure we update all 64 bits at once. 679 + */ 716 680 static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) 717 681 { 718 682 native_set_pte_atomic(ptep, pte); ··· 725 679 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 726 680 } 727 681 728 - void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 682 + static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, 683 + pte_t *ptep) 729 684 { 730 685 native_pte_clear(mm, addr, ptep); 731 686 lguest_pte_update(mm, addr, ptep); 732 687 } 733 688 734 - void lguest_pmd_clear(pmd_t *pmdp) 689 + static void lguest_pmd_clear(pmd_t *pmdp) 735 690 { 736 691 lguest_set_pmd(pmdp, __pmd(0)); 737 692 } ··· 831 784 irq_ctx_init(smp_processor_id()); 832 785 } 833 786 787 + /* 788 + * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so 789 + * rather than set them in lguest_init_IRQ we are called here every time an 790 + * lguest device needs an interrupt. 791 + * 792 + * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should 793 + * pass that up! 794 + */ 834 795 void lguest_setup_irq(unsigned int irq) 835 796 { 836 797 irq_to_desc_alloc_node(irq, 0); ··· 1353 1298 */ 1354 1299 switch_to_new_gdt(0); 1355 1300 1356 - /* As described in head_32.S, we map the first 128M of memory. */ 1301 + /* We actually boot with all memory mapped, but let's say 128MB. */ 1357 1302 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1358 1303 1359 1304 /*
+2
arch/x86/lguest/i386_head.S
··· 102 102 * create one manually here. 103 103 */ 104 104 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 105 + /* Put eax back the way we found it. */ 105 106 popl %eax 106 107 ret 107 108 ··· 126 125 jnz send_interrupts 127 126 /* Again, the normal path has used no extra registers. Clever, huh? */ 128 127 ret 128 + /*:*/ 129 129 130 130 /* These demark the EIP range where host should never deliver interrupts. */ 131 131 .global lguest_noirq_start
+6 -1
drivers/lguest/core.c
··· 217 217 218 218 /* 219 219 * It's possible the Guest did a NOTIFY hypercall to the 220 - * Launcher, in which case we return from the read() now. 220 + * Launcher. 221 221 */ 222 222 if (cpu->pending_notify) { 223 + /* 224 + * Does it just needs to write to a registered 225 + * eventfd (ie. the appropriate virtqueue thread)? 226 + */ 223 227 if (!send_notify_to_eventfd(cpu)) { 228 + /* OK, we tell the main Laucher. */ 224 229 if (put_user(cpu->pending_notify, user)) 225 230 return -EFAULT; 226 231 return sizeof(cpu->pending_notify);
+5 -1
drivers/lguest/hypercalls.c
··· 59 59 case LHCALL_SHUTDOWN: { 60 60 char msg[128]; 61 61 /* 62 - * Shutdown is such a trivial hypercall that we do it in four 62 + * Shutdown is such a trivial hypercall that we do it in five 63 63 * lines right here. 64 64 * 65 65 * If the lgread fails, it will call kill_guest() itself; the ··· 245 245 * device), the Guest will still see the old page. In practice, this never 246 246 * happens: why would the Guest read a page which it has never written to? But 247 247 * a similar scenario might one day bite us, so it's worth mentioning. 248 + * 249 + * Note that if we used a shared anonymous mapping in the Launcher instead of 250 + * mapping /dev/zero private, we wouldn't worry about cop-on-write. And we 251 + * need that to switch the Launcher to processes (away from threads) anyway. 248 252 :*/ 249 253 250 254 /*H:100
+6 -5
drivers/lguest/lguest_device.c
··· 236 236 extern void lguest_setup_irq(unsigned int irq); 237 237 238 238 /* 239 - * This routine finds the first virtqueue described in the configuration of 239 + * This routine finds the Nth virtqueue described in the configuration of 240 240 * this device and sets it up. 241 241 * 242 242 * This is kind of an ugly duckling. It'd be nicer to have a standard ··· 244 244 * everyone wants to do it differently. The KVM coders want the Guest to 245 245 * allocate its own pages and tell the Host where they are, but for lguest it's 246 246 * simpler for the Host to simply tell us where the pages are. 247 - * 248 - * So we provide drivers with a "find the Nth virtqueue and set it up" 249 - * function. 250 247 */ 251 248 static struct virtqueue *lg_find_vq(struct virtio_device *vdev, 252 249 unsigned index, ··· 419 422 420 423 /* This devices' parent is the lguest/ dir. */ 421 424 ldev->vdev.dev.parent = lguest_root; 422 - /* We have a unique device index thanks to the dev_index counter. */ 425 + /* 426 + * The device type comes straight from the descriptor. There's also a 427 + * device vendor field in the virtio_device struct, which we leave as 428 + * 0. 429 + */ 423 430 ldev->vdev.id.device = d->type; 424 431 /* 425 432 * We have a simple set of routines for querying the device's
+90 -10
drivers/lguest/lguest_user.c
··· 1 - /*P:200 2 - * This contains all the /dev/lguest code, whereby the userspace launcher 1 + /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher 3 2 * controls and communicates with the Guest. For example, the first write will 4 - * tell us the Guest's memory layout, pagetable, entry point and kernel address 5 - * offset. A read will run the Guest until something happens, such as a signal 6 - * or the Guest doing a NOTIFY out to the Launcher. 3 + * tell us the Guest's memory layout and entry point. A read will run the 4 + * Guest until something happens, such as a signal or the Guest doing a NOTIFY 5 + * out to the Launcher. 7 6 :*/ 8 7 #include <linux/uaccess.h> 9 8 #include <linux/miscdevice.h> ··· 12 13 #include <linux/file.h> 13 14 #include "lg.h" 14 15 16 + /*L:056 17 + * Before we move on, let's jump ahead and look at what the kernel does when 18 + * it needs to look up the eventfds. That will complete our picture of how we 19 + * use RCU. 20 + * 21 + * The notification value is in cpu->pending_notify: we return true if it went 22 + * to an eventfd. 23 + */ 15 24 bool send_notify_to_eventfd(struct lg_cpu *cpu) 16 25 { 17 26 unsigned int i; 18 27 struct lg_eventfd_map *map; 19 28 20 - /* lg->eventfds is RCU-protected */ 29 + /* 30 + * This "rcu_read_lock()" helps track when someone is still looking at 31 + * the (RCU-using) eventfds array. It's not actually a lock at all; 32 + * indeed it's a noop in many configurations. (You didn't expect me to 33 + * explain all the RCU secrets here, did you?) 34 + */ 21 35 rcu_read_lock(); 36 + /* 37 + * rcu_dereference is the counter-side of rcu_assign_pointer(); it 38 + * makes sure we don't access the memory pointed to by 39 + * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, 40 + * but Alpha allows this! 
Paul McKenney points out that a really 41 + * aggressive compiler could have the same effect: 42 + * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html 43 + * 44 + * So play safe, use rcu_dereference to get the rcu-protected pointer: 45 + */ 22 46 map = rcu_dereference(cpu->lg->eventfds); 47 + /* 48 + * Simple array search: even if they add an eventfd while we do this, 49 + * we'll continue to use the old array and just won't see the new one. 50 + */ 23 51 for (i = 0; i < map->num; i++) { 24 52 if (map->map[i].addr == cpu->pending_notify) { 25 53 eventfd_signal(map->map[i].event, 1); ··· 54 28 break; 55 29 } 56 30 } 31 + /* We're done with the rcu-protected variable cpu->lg->eventfds. */ 57 32 rcu_read_unlock(); 33 + 34 + /* If we cleared the notification, it's because we found a match. */ 58 35 return cpu->pending_notify == 0; 59 36 } 60 37 38 + /*L:055 39 + * One of the more tricksy tricks in the Linux Kernel is a technique called 40 + * Read Copy Update. Since one point of lguest is to teach lguest journeyers 41 + * about kernel coding, I use it here. (In case you're curious, other purposes 42 + * include learning about virtualization and instilling a deep appreciation for 43 + * simplicity and puppies). 44 + * 45 + * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we 46 + * add new eventfds without ever blocking readers from accessing the array. 47 + * The current Launcher only does this during boot, so that never happens. But 48 + * Read Copy Update is cool, and adding a lock risks damaging even more puppies 49 + * than this code does. 50 + * 51 + * We allocate a brand new one-larger array, copy the old one and add our new 52 + * element. Then we make the lg eventfd pointer point to the new array. 53 + * That's the easy part: now we need to free the old one, but we need to make 54 + * sure no slow CPU somewhere is still looking at it. 
That's what 55 + * synchronize_rcu does for us: waits until every CPU has indicated that it has 56 + * moved on to know it's no longer using the old one. 57 + * 58 + * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. 59 + */ 61 60 static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) 62 61 { 63 62 struct lg_eventfd_map *new, *old = lg->eventfds; 64 63 64 + /* 65 + * We don't allow notifications on value 0 anyway (pending_notify of 66 + * 0 means "nothing pending"). 67 + */ 65 68 if (!addr) 66 69 return -EINVAL; 67 70 ··· 117 62 } 118 63 new->num++; 119 64 120 - /* Now put new one in place. */ 65 + /* 66 + * Now put new one in place: rcu_assign_pointer() is a fancy way of 67 + * doing "lg->eventfds = new", but it uses memory barriers to make 68 + * absolutely sure that the contents of "new" written above is nailed 69 + * down before we actually do the assignment. 70 + * 71 + * We have to think about these kinds of things when we're operating on 72 + * live data without locks. 73 + */ 121 74 rcu_assign_pointer(lg->eventfds, new); 122 75 123 76 /* 124 77 * We're not in a big hurry. Wait until noone's looking at old 125 - * version, then delete it. 78 + * version, then free it. 126 79 */ 127 80 synchronize_rcu(); 128 81 kfree(old); ··· 138 75 return 0; 139 76 } 140 77 78 + /*L:052 79 + * Receiving notifications from the Guest is usually done by attaching a 80 + * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will 81 + * become readable when the Guest does an LHCALL_NOTIFY with that value. 82 + * 83 + * This is really convenient for processing each virtqueue in a separate 84 + * thread. 85 + */ 141 86 static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) 142 87 { 143 88 unsigned long addr, fd; ··· 157 86 if (get_user(fd, input) != 0) 158 87 return -EFAULT; 159 88 89 + /* 90 + * Just make sure two callers don't add eventfds at once. 
We really 91 + * only need to lock against callers adding to the same Guest, so using 92 + * the Big Lguest Lock is overkill. But this is setup, not a fast path. 93 + */ 160 94 mutex_lock(&lguest_lock); 161 95 err = add_eventfd(lg, addr, fd); 162 96 mutex_unlock(&lguest_lock); ··· 182 106 if (irq >= LGUEST_IRQS) 183 107 return -EINVAL; 184 108 109 + /* 110 + * Next time the Guest runs, the core code will see if it can deliver 111 + * this interrupt. 112 + */ 185 113 set_interrupt(cpu, irq); 186 114 return 0; 187 115 } ··· 387 307 * The first operation the Launcher does must be a write. All writes 388 308 * start with an unsigned long number: for the first write this must be 389 309 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 390 - * writes of other values to send interrupts. 310 + * writes of other values to send interrupts or set up receipt of notifications. 391 311 * 392 312 * Note that we overload the "offset" in the /dev/lguest file to indicate what 393 - * CPU number we're dealing with. Currently this is always 0, since we only 313 + * CPU number we're dealing with. Currently this is always 0 since we only 394 314 * support uniprocessor Guests, but you can see the beginnings of SMP support 395 315 * here. 396 316 */
+66 -20
drivers/lguest/page_tables.c
··· 29 29 /*H:300 30 30 * The Page Table Code 31 31 * 32 - * We use two-level page tables for the Guest. If you're not entirely 33 - * comfortable with virtual addresses, physical addresses and page tables then 34 - * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with 35 - * diagrams!). 32 + * We use two-level page tables for the Guest, or three-level with PAE. If 33 + * you're not entirely comfortable with virtual addresses, physical addresses 34 + * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page 35 + * Table Handling" (with diagrams!). 36 36 * 37 37 * The Guest keeps page tables, but we maintain the actual ones here: these are 38 38 * called "shadow" page tables. Which is a very Guest-centric name: these are ··· 52 52 :*/ 53 53 54 54 /* 55 - * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 56 - * conveniently placed at the top 4MB, so it uses a separate, complete PTE 57 - * page. 55 + * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) 56 + * or 512 PTE entries with PAE (2MB). 58 57 */ 59 58 #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 60 59 ··· 80 81 81 82 /*H:320 82 83 * The page table code is curly enough to need helper functions to keep it 83 - * clear and clean. 84 + * clear and clean. The kernel itself provides many of them; one advantage 85 + * of insisting that the Guest and Host use the same CONFIG_PAE setting. 84 86 * 85 87 * There are two functions which return pointers to the shadow (aka "real") 86 88 * page tables. ··· 155 155 } 156 156 157 157 /* 158 - * These two functions just like the above two, except they access the Guest 158 + * These functions are just like the above two, except they access the Guest 159 159 * page tables. Hence they return a Guest address. 160 160 */ 161 161 static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) ··· 165 165 } 166 166 167 167 #ifdef CONFIG_X86_PAE 168 + /* Follow the PGD to the PMD. 
*/ 168 169 static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) 169 170 { 170 171 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; ··· 173 172 return gpage + pmd_index(vaddr) * sizeof(pmd_t); 174 173 } 175 174 175 + /* Follow the PMD to the PTE. */ 176 176 static unsigned long gpte_addr(struct lg_cpu *cpu, 177 177 pmd_t gpmd, unsigned long vaddr) 178 178 { ··· 183 181 return gpage + pte_index(vaddr) * sizeof(pte_t); 184 182 } 185 183 #else 184 + /* Follow the PGD to the PTE (no mid-level for !PAE). */ 186 185 static unsigned long gpte_addr(struct lg_cpu *cpu, 187 186 pgd_t gpgd, unsigned long vaddr) 188 187 { ··· 317 314 pte_t gpte; 318 315 pte_t *spte; 319 316 317 + /* Mid level for PAE. */ 320 318 #ifdef CONFIG_X86_PAE 321 319 pmd_t *spmd; 322 320 pmd_t gpmd; ··· 395 391 */ 396 392 gpte_ptr = gpte_addr(cpu, gpgd, vaddr); 397 393 #endif 394 + 395 + /* Read the actual PTE value. */ 398 396 gpte = lgread(cpu, gpte_ptr, pte_t); 399 397 400 398 /* If this page isn't in the Guest page tables, we can't page it in. */ ··· 513 507 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) 514 508 kill_guest(cpu, "bad stack page %#lx", vaddr); 515 509 } 510 + /*:*/ 516 511 517 512 #ifdef CONFIG_X86_PAE 518 513 static void release_pmd(pmd_t *spmd) ··· 550 543 } 551 544 552 545 #else /* !CONFIG_X86_PAE */ 553 - /*H:450 If we chase down the release_pgd() code, it looks like this: */ 546 + /*H:450 547 + * If we chase down the release_pgd() code, the non-PAE version looks like 548 + * this. The PAE version is almost identical, but instead of calling 549 + * release_pte it calls release_pmd(), which looks much like this. 550 + */ 554 551 static void release_pgd(pgd_t *spgd) 555 552 { 556 553 /* If the entry's not present, there's nothing to release. */ ··· 909 898 /* ... throw it away. */ 910 899 release_pgd(lg->pgdirs[pgdir].pgdir + idx); 911 900 } 901 + 912 902 #ifdef CONFIG_X86_PAE 903 + /* For setting a mid-level, we just throw everything away. It's easy. 
*/ 913 904 void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) 914 905 { 915 906 guest_pagetable_clear_all(&lg->cpus[0]); 916 907 } 917 908 #endif 918 909 919 - /* 920 - * Once we know how much memory we have we can construct simple identity (which 910 + /*H:505 911 + * To get through boot, we construct simple identity page mappings (which 921 912 * set virtual == physical) and linear mappings which will get the Guest far 922 - * enough into the boot to create its own. 913 + * enough into the boot to create its own. The linear mapping means we 914 + * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, 915 + * as you'll see. 923 916 * 924 917 * We lay them out of the way, just below the initrd (which is why we need to 925 918 * know its size here). ··· 959 944 linear = (void *)pgdir - linear_pages * PAGE_SIZE; 960 945 961 946 #ifdef CONFIG_X86_PAE 947 + /* 948 + * And the single mid page goes below that. We only use one, but 949 + * that's enough to map 1G, which definitely gets us through boot. 950 + */ 962 951 pmds = (void *)linear - PAGE_SIZE; 963 952 #endif 964 953 /* ··· 976 957 return -EFAULT; 977 958 } 978 959 979 - /* 980 - * The top level points to the linear page table pages above. 981 - * We setup the identity and linear mappings here. 982 - */ 983 960 #ifdef CONFIG_X86_PAE 961 + /* 962 + * Make the Guest PMD entries point to the corresponding place in the 963 + * linear mapping (up to one page worth of PMD). 964 + */ 984 965 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; 985 966 i += PTRS_PER_PTE, j++) { 967 + /* FIXME: native_set_pmd is overkill here. */ 986 968 native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) 987 969 - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 988 970 ··· 991 971 return -EFAULT; 992 972 } 993 973 974 + /* One PGD entry, pointing to that PMD page. */ 994 975 set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); 976 + /* Copy it in as the first PGD entry (ie. 
addresses 0-1G). */ 995 977 if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) 996 978 return -EFAULT; 979 + /* 980 + * And the third PGD entry (ie. addresses 3G-4G). 981 + * 982 + * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. 983 + */ 997 984 if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) 998 985 return -EFAULT; 999 986 #else 987 + /* 988 + * The top level points to the linear page table pages above. 989 + * We setup the identity and linear mappings here. 990 + */ 1000 991 phys_linear = (unsigned long)linear - mem_base; 1001 992 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { 1002 993 pgd_t pgd; 994 + /* 995 + * Create a PGD entry which points to the right part of the 996 + * linear PTE pages. 997 + */ 1003 998 pgd = __pgd((phys_linear + i * sizeof(pte_t)) | 1004 999 (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 1005 1000 1001 + /* 1002 + * Copy it into the PGD page at 0 and PAGE_OFFSET. 1003 + */ 1006 1004 if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) 1007 1005 || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) 1008 1006 + i / PTRS_PER_PTE], ··· 1030 992 #endif 1031 993 1032 994 /* 1033 - * We return the top level (guest-physical) address: remember where 1034 - * this is. 995 + * We return the top level (guest-physical) address: we remember where 996 + * this is to write it into lguest_data when the Guest initializes. 1035 997 */ 1036 998 return (unsigned long)pgdir - mem_base; 1037 999 } ··· 1069 1031 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); 1070 1032 if (!lg->pgdirs[0].pgdir) 1071 1033 return -ENOMEM; 1034 + 1072 1035 #ifdef CONFIG_X86_PAE 1036 + /* For PAE, we also create the initial mid-level. */ 1073 1037 pgd = lg->pgdirs[0].pgdir; 1074 1038 pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); 1075 1039 if (!pmd_table) ··· 1080 1040 set_pgd(pgd + SWITCHER_PGD_INDEX, 1081 1041 __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 1082 1042 #endif 1043 + 1044 + /* This is the current page table. 
*/ 1083 1045 lg->cpus[0].cpu_pgd = 0; 1084 1046 return 0; 1085 1047 } 1086 1048 1087 - /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1049 + /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1088 1050 void page_table_guest_data_init(struct lg_cpu *cpu) 1089 1051 { 1090 1052 /* We get the kernel address: above this is all kernel memory. */ ··· 1147 1105 pmd_t switcher_pmd; 1148 1106 pmd_t *pmd_table; 1149 1107 1108 + /* FIXME: native_set_pmd is overkill here. */ 1150 1109 native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> 1151 1110 PAGE_SHIFT, PAGE_KERNEL_EXEC)); 1152 1111 1112 + /* Figure out where the pmd page is, by reading the PGD, and converting 1113 + * it to a virtual address. */ 1153 1114 pmd_table = __va(pgd_pfn(cpu->lg-> 1154 1115 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) 1155 1116 << PAGE_SHIFT); 1117 + /* Now write it into the shadow page table. */ 1156 1118 native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); 1157 1119 #else 1158 1120 pgd_t switcher_pgd;
+1 -1
drivers/lguest/x86/core.c
··· 187 187 * also simplify copy_in_guest_info(). Note that we'd still need to restore 188 188 * things when we exit to Launcher userspace, but that's fairly easy. 189 189 * 190 - * We could also try using this hooks for PGE, but that might be too expensive. 190 + * We could also try using these hooks for PGE, but that might be too expensive. 191 191 * 192 192 * The hooks were designed for KVM, but we can also put them to good use. 193 193 :*/
+3 -3
drivers/lguest/x86/switcher_32.S
··· 1 1 /*P:900 2 - * This is the Switcher: code which sits at 0xFFC00000 astride both the 3 - * Host and Guest to do the low-level Guest<->Host switch. It is as simple as 4 - * it can be made, but it's naturally very specific to x86. 2 + * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride 3 + * both the Host and Guest to do the low-level Guest<->Host switch. It is as 4 + * simple as it can be made, but it's naturally very specific to x86. 5 5 * 6 6 * You have now completed Preparation. If this has whet your appetite; if you 7 7 * are feeling invigorated and refreshed then the next, more challenging stage