lguest: documentation update · tjh.dev/kernel@e1e7296

+95 -60

Documentation/lguest/lguest.c

··· 360 360 } 361 361 362 362 /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 363 - * come wrapped up in the self-decompressing "bzImage" format. With some funky 364 - * coding, we can load those, too. */ 363 + * come wrapped up in the self-decompressing "bzImage" format. With a little 364 + * work, we can load those, too. */ 365 365 static unsigned long load_kernel(int fd) 366 366 { 367 367 Elf32_Ehdr hdr; ··· 464 464 * to know where it is. */ 465 465 return to_guest_phys(pgdir); 466 466 } 467 + /*:*/ 467 468 468 469 /* Simple routine to roll all the commandline arguments together with spaces 469 470 * between them. */ ··· 481 480 dst[len] = '\0'; 482 481 } 483 482 484 - /* This is where we actually tell the kernel to initialize the Guest. We saw 485 - * the arguments it expects when we looked at initialize() in lguest_user.c: 486 - * the base of guest "physical" memory, the top physical page to allow, the 483 + /*L:185 This is where we actually tell the kernel to initialize the Guest. We 484 + * saw the arguments it expects when we looked at initialize() in lguest_user.c: 485 + * the base of Guest "physical" memory, the top physical page to allow, the 487 486 * top level pagetable and the entry point for the Guest. */ 488 487 static int tell_kernel(unsigned long pgdir, unsigned long start) 489 488 { ··· 513 512 /*L:200 514 513 * The Waker. 515 514 * 516 - * With a console and network devices, we can have lots of input which we need 517 - * to process. We could try to tell the kernel what file descriptors to watch, 518 - * but handing a file descriptor mask through to the kernel is fairly icky. 515 + * With console, block and network devices, we can have lots of input which we 516 + * need to process. We could try to tell the kernel what file descriptors to 517 + * watch, but handing a file descriptor mask through to the kernel is fairly 518 + * icky. 
519 519 * 520 520 * Instead, we fork off a process which watches the file descriptors and writes 521 - * the LHREQ_BREAK command to the /dev/lguest filedescriptor to tell the Host 522 - * loop to stop running the Guest. This causes it to return from the 521 + * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host 522 + * stop running the Guest. This causes the Launcher to return from the 523 523 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset 524 524 * the LHREQ_BREAK and wake us up again. 525 525 * ··· 546 544 if (read(pipefd, &fd, sizeof(fd)) == 0) 547 545 exit(0); 548 546 /* Otherwise it's telling us to change what file 549 - * descriptors we're to listen to. */ 547 + * descriptors we're to listen to. Positive means 548 + * listen to a new one, negative means stop 549 + * listening. */ 550 550 if (fd >= 0) 551 551 FD_SET(fd, &devices.infds); 552 552 else ··· 563 559 { 564 560 int pipefd[2], child; 565 561 566 - /* We create a pipe to talk to the waker, and also so it knows when the 562 + /* We create a pipe to talk to the Waker, and also so it knows when the 567 563 * Launcher dies (and closes pipe). */ 568 564 pipe(pipefd); 569 565 child = fork(); ··· 571 567 err(1, "forking"); 572 568 573 569 if (child == 0) { 574 - /* Close the "writing" end of our copy of the pipe */ 570 + /* We are the Waker: close the "writing" end of our copy of the 571 + * pipe and start waiting for input. */ 575 572 close(pipefd[1]); 576 573 wake_parent(pipefd[0], lguest_fd); 577 574 } ··· 583 578 return pipefd[1]; 584 579 } 585 580 586 - /*L:210 581 + /* 587 582 * Device Handling. 588 583 * 589 - * When the Guest sends DMA to us, it sends us an array of addresses and sizes. 584 + * When the Guest gives us a buffer, it sends an array of addresses and sizes. 
590 585 * We need to make sure it's not trying to reach into the Launcher itself, so 591 - * we have a convenient routine which check it and exits with an error message 586 + * we have a convenient routine which checks it and exits with an error message 592 587 * if something funny is going on: 593 588 */ 594 589 static void *_check_pointer(unsigned long addr, unsigned int size, ··· 605 600 /* A macro which transparently hands the line number to the real function. */ 606 601 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 607 602 608 - /* This function returns the next descriptor in the chain, or vq->vring.num. */ 603 + /* Each buffer in the virtqueues is actually a chain of descriptors. This 604 + * function returns the next descriptor in the chain, or vq->vring.num if we're 605 + * at the end. */ 609 606 static unsigned next_desc(struct virtqueue *vq, unsigned int i) 610 607 { 611 608 unsigned int next; ··· 686 679 return head; 687 680 } 688 681 689 - /* Once we've used one of their buffers, we tell them about it. We'll then 682 + /* After we've used one of their buffers, we tell them about it. We'll then 690 683 * want to send them an interrupt, using trigger_irq(). */ 691 684 static void add_used(struct virtqueue *vq, unsigned int head, int len) 692 685 { 693 686 struct vring_used_elem *used; 694 687 695 - /* Get a pointer to the next entry in the used ring. */ 688 + /* The virtqueue contains a ring of used buffers. Get a pointer to the 689 + * next entry in that used ring. */ 696 690 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; 697 691 used->id = head; 698 692 used->len = len; ··· 707 699 { 708 700 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; 709 701 702 + /* If they don't want an interrupt, don't send one. 
*/ 710 703 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) 711 704 return; 712 705 ··· 724 715 trigger_irq(fd, vq); 725 716 } 726 717 727 - /* Here is the input terminal setting we save, and the routine to restore them 728 - * on exit so the user can see what they type next. */ 718 + /* 719 + * The Console 720 + * 721 + * Here is the input terminal setting we save, and the routine to restore them 722 + * on exit so the user gets their terminal back. */ 729 723 static struct termios orig_term; 730 724 static void restore_term(void) 731 725 { ··· 829 817 } 830 818 } 831 819 832 - /* Handling output for network is also simple: we get all the output buffers 820 + /* 821 + * The Network 822 + * 823 + * Handling output for network is also simple: we get all the output buffers 833 824 * and write them (ignoring the first element) to this device's file descriptor 834 825 * (stdout). */ 835 826 static void handle_net_output(int fd, struct virtqueue *vq) ··· 845 830 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { 846 831 if (in) 847 832 errx(1, "Input buffers in output queue?"); 848 - /* Check header, but otherwise ignore it (we said we supported 849 - * no features). */ 833 + /* Check header, but otherwise ignore it (we told the Guest we 834 + * supported no features, so it shouldn't have anything 835 + * interesting). */ 850 836 (void)convert(&iov[0], struct virtio_net_hdr); 851 837 len = writev(vq->dev->fd, iov+1, out-1); 852 838 add_used_and_trigger(fd, vq, head, len); ··· 898 882 return true; 899 883 } 900 884 901 - /* This callback ensures we try again, in case we stopped console or net 885 + /*L:215 This is the callback attached to the network and console input 886 + * virtqueues: it ensures we try again, in case we stopped console or net 902 887 * delivery because Guest didn't have any buffers. 
*/ 903 888 static void enable_fd(int fd, struct virtqueue *vq) 904 889 { ··· 935 918 strnlen(from_guest_phys(addr), guest_limit - addr)); 936 919 } 937 920 938 - /* This is called when the waker wakes us up: check for incoming file 921 + /* This is called when the Waker wakes us up: check for incoming file 939 922 * descriptors. */ 940 923 static void handle_input(int fd) 941 924 { ··· 1002 985 } 1003 986 1004 987 /* Each device descriptor is followed by some configuration information. 1005 - * The first byte is a "status" byte for the Guest to report what's happening. 1006 - * After that are fields: u8 type, u8 len, [... len bytes...]. 988 + * Each configuration field looks like: u8 type, u8 len, [... len bytes...]. 1007 989 * 1008 990 * This routine adds a new field to an existing device's descriptor. It only 1009 991 * works for the last device, but that's OK because that's how we use it. */ ··· 1059 1043 /* Link virtqueue back to device. */ 1060 1044 vq->dev = dev; 1061 1045 1062 - /* Set up handler. */ 1046 + /* Set the routine to call when the Guest does something to this 1047 + * virtqueue. */ 1063 1048 vq->handle_output = handle_output; 1049 + 1050 + /* Set the "Don't Notify Me" flag if we don't have a handler */ 1064 1051 if (!handle_output) 1065 1052 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; 1066 1053 } 1067 1054 1068 1055 /* This routine does all the creation and setup of a new device, including 1069 - * caling new_dev_desc() to allocate the descriptor and device memory. */ 1056 + * calling new_dev_desc() to allocate the descriptor and device memory. */ 1070 1057 static struct device *new_device(const char *name, u16 type, int fd, 1071 1058 bool (*handle_input)(int, struct device *)) 1072 1059 { ··· 1078 1059 /* Append to device list. Prepending to a single-linked list is 1079 1060 * easier, but the user expects the devices to be arranged on the bus 1080 1061 * in command-line order. 
The first network device on the command line 1081 - * is eth0, the first block device /dev/lgba, etc. */ 1062 + * is eth0, the first block device /dev/vda, etc. */ 1082 1063 *devices.lastdev = dev; 1083 1064 dev->next = NULL; 1084 1065 devices.lastdev = &dev->next; ··· 1122 1103 /* The console needs two virtqueues: the input then the output. When 1123 1104 * they put something the input queue, we make sure we're listening to 1124 1105 * stdin. When they put something in the output queue, we write it to 1125 - * stdout. */ 1106 + * stdout. */ 1126 1107 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1127 1108 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); 1128 1109 ··· 1270 1251 verbose("attached to bridge: %s\n", br_name); 1271 1252 } 1272 1253 1273 - 1274 - /* 1275 - * Block device. 1254 + /* Our block (disk) device should be really simple: the Guest asks for a block 1255 + * number and we read or write that position in the file. Unfortunately, that 1256 + * was amazingly slow: the Guest waits until the read is finished before 1257 + * running anything else, even if it could have been doing useful work. 1276 1258 * 1277 - * Serving a block device is really easy: the Guest asks for a block number and 1278 - * we read or write that position in the file. 1279 - * 1280 - * Unfortunately, this is amazingly slow: the Guest waits until the read is 1281 - * finished before running anything else, even if it could be doing useful 1282 - * work. We could use async I/O, except it's reputed to suck so hard that 1283 - * characters actually go missing from your code when you try to use it. 1259 + * We could use async I/O, except it's reputed to suck so hard that characters 1260 + * actually go missing from your code when you try to use it. 1284 1261 * 1285 1262 * So we farm the I/O out to thread, and communicate with it via a pipe. */ 1286 1263 1287 - /* This hangs off device->priv, with the data. */ 1264 + /* This hangs off device->priv. 
*/ 1288 1265 struct vblk_info 1289 1266 { 1290 1267 /* The size of the file. */ ··· 1296 1281 * Launcher triggers interrupt to Guest. */ 1297 1282 int done_fd; 1298 1283 }; 1284 + /*:*/ 1299 1285 1300 - /* This is the core of the I/O thread. It returns true if it did something. */ 1286 + /*L:210 1287 + * The Disk 1288 + * 1289 + * Remember that the block device is handled by a separate I/O thread. We head 1290 + * straight into the core of that thread here: 1291 + */ 1301 1292 static bool service_io(struct device *dev) 1302 1293 { 1303 1294 struct vblk_info *vblk = dev->priv; ··· 1314 1293 struct iovec iov[dev->vq->vring.num]; 1315 1294 off64_t off; 1316 1295 1296 + /* See if there's a request waiting. If not, nothing to do. */ 1317 1297 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1318 1298 if (head == dev->vq->vring.num) 1319 1299 return false; 1320 1300 1301 + /* Every block request should contain at least one output buffer 1302 + * (detailing the location on disk and the type of request) and one 1303 + * input buffer (to hold the result). */ 1321 1304 if (out_num == 0 || in_num == 0) 1322 1305 errx(1, "Bad virtblk cmd %u out=%u in=%u", 1323 1306 head, out_num, in_num); ··· 1330 1305 in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); 1331 1306 off = out->sector * 512; 1332 1307 1333 - /* This is how we implement barriers. Pretty poor, no? */ 1308 + /* The block device implements "barriers", where the Guest indicates 1309 + * that it wants all previous writes to occur before this write. We 1310 + * don't have a way of asking our kernel to do a barrier, so we just 1311 + * synchronize all the data in the file. Pretty poor, no? */ 1334 1312 if (out->type & VIRTIO_BLK_T_BARRIER) 1335 1313 fdatasync(vblk->fd); 1336 1314 1315 + /* In general the virtio block driver is allowed to try SCSI commands. 1316 + * It'd be nice if we supported eject, for example, but we don't. 
*/ 1337 1317 if (out->type & VIRTIO_BLK_T_SCSI_CMD) { 1338 1318 fprintf(stderr, "Scsi commands unsupported\n"); 1339 1319 in->status = VIRTIO_BLK_S_UNSUPP; ··· 1404 1374 1405 1375 /* When this read fails, it means Launcher died, so we follow. */ 1406 1376 while (read(vblk->workpipe[0], &c, 1) == 1) { 1407 - /* We acknowledge each request immediately, to reduce latency, 1377 + /* We acknowledge each request immediately to reduce latency, 1408 1378 * rather than waiting until we've done them all. I haven't 1409 1379 * measured to see if it makes any difference. */ 1410 1380 while (service_io(dev)) ··· 1413 1383 return 0; 1414 1384 } 1415 1385 1416 - /* When the thread says some I/O is done, we interrupt the Guest. */ 1386 + /* Now we've seen the I/O thread, we return to the Launcher to see what happens 1387 + * when the thread tells us it's completed some I/O. */ 1417 1388 static bool handle_io_finish(int fd, struct device *dev) 1418 1389 { 1419 1390 char c; 1420 1391 1421 - /* If child died, presumably it printed message. */ 1392 + /* If the I/O thread died, presumably it printed the error, so we 1393 + * simply exit. */ 1422 1394 if (read(dev->fd, &c, 1) != 1) 1423 1395 exit(1); 1424 1396 ··· 1429 1397 return true; 1430 1398 } 1431 1399 1432 - /* When the Guest submits some I/O, we wake the I/O thread. */ 1400 + /* When the Guest submits some I/O, we just need to wake the I/O thread. */ 1433 1401 static void handle_virtblk_output(int fd, struct virtqueue *vq) 1434 1402 { 1435 1403 struct vblk_info *vblk = vq->dev->priv; ··· 1441 1409 exit(1); 1442 1410 } 1443 1411 1444 - /* This creates a virtual block device. */ 1412 + /*L:198 This actually sets up a virtual block device. */ 1445 1413 static void setup_block_file(const char *filename) 1446 1414 { 1447 1415 int p[2]; ··· 1457 1425 /* The device responds to return from I/O thread. */ 1458 1426 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); 1459 1427 1460 - /* The device has a virtqueue. 
*/ 1428 + /* The device has one virtqueue, where the Guest places requests. */ 1461 1429 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); 1462 1430 1463 1431 /* Allocate the room for our own bookkeeping */ ··· 1479 1447 /* The I/O thread writes to this end of the pipe when done. */ 1480 1448 vblk->done_fd = p[1]; 1481 1449 1482 - /* This is how we tell the I/O thread about more work. */ 1450 + /* This is the second pipe, which is how we tell the I/O thread about 1451 + * more work. */ 1483 1452 pipe(vblk->workpipe); 1484 1453 1485 1454 /* Create stack for thread and run it */ ··· 1519 1486 char reason[1024] = { 0 }; 1520 1487 read(lguest_fd, reason, sizeof(reason)-1); 1521 1488 errx(1, "%s", reason); 1522 - /* EAGAIN means the waker wanted us to look at some input. 1489 + /* EAGAIN means the Waker wanted us to look at some input. 1523 1490 * Anything else means a bug or incompatible change. */ 1524 1491 } else if (errno != EAGAIN) 1525 1492 err(1, "Running guest failed"); 1526 1493 1527 - /* Service input, then unset the BREAK which releases 1528 - * the Waker. */ 1494 + /* Service input, then unset the BREAK to release the Waker. */ 1529 1495 handle_input(lguest_fd); 1530 1496 if (write(lguest_fd, args, sizeof(args)) < 0) 1531 1497 err(1, "Resetting break"); 1532 1498 } 1533 1499 } 1534 1500 /* 1535 - * This is the end of the Launcher. 1501 + * This is the end of the Launcher. The good news: we are over halfway 1502 + * through! The bad news: the most fiendish part of the code still lies ahead 1503 + * of us. 1536 1504 * 1537 - * But wait! We've seen I/O from the Launcher, and we've seen I/O from the 1538 - * Drivers. If we were to see the Host kernel I/O code, our understanding 1539 - * would be complete... :*/ 1505 + * Are you ready? Take a deep breath and join me in the core of the Host, in 1506 + * "make Host". 
1507 + :*/ 1540 1508 1541 1509 static struct option opts[] = { 1542 1510 { "verbose", 0, NULL, 'v' }, ··· 1560 1526 /* Memory, top-level pagetable, code startpoint and size of the 1561 1527 * (optional) initrd. */ 1562 1528 unsigned long mem = 0, pgdir, start, initrd_size = 0; 1563 - /* A temporary and the /dev/lguest file descriptor. */ 1529 + /* Two temporaries and the /dev/lguest file descriptor. */ 1564 1530 int i, c, lguest_fd; 1565 1531 /* The boot information for the Guest. */ 1566 1532 struct boot_params *boot; ··· 1655 1621 /* The boot header contains a command line pointer: we put the command 1656 1622 * line after the boot header. */ 1657 1623 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 1624 + /* We use a simple helper to copy the arguments separated by spaces. */ 1658 1625 concat((char *)(boot + 1), argv+optind+2); 1659 1626 1660 1627 /* Boot protocol version: 2.07 supports the fields for lguest. */

+25 -23

arch/x86/lguest/boot.c

··· 99 99 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do 100 100 * them as a batch when lazy_mode is eventually turned off. Because hypercalls 101 101 * are reasonably expensive, batching them up makes sense. For example, a 102 - * large mmap might update dozens of page table entries: that code calls 102 + * large munmap might update dozens of page table entries: that code calls 103 103 * paravirt_enter_lazy_mmu(), does the dozen updates, then calls 104 104 * lguest_leave_lazy_mode(). 105 105 * ··· 164 164 /*:*/ 165 165 166 166 /*G:033 167 - * Here are our first native-instruction replacements: four functions for 168 - * interrupt control. 167 + * After that diversion we return to our first native-instruction 168 + * replacements: four functions for interrupt control. 169 169 * 170 170 * The simplest way of implementing these would be to have "turn interrupts 171 171 * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: ··· 184 184 return lguest_data.irq_enabled; 185 185 } 186 186 187 - /* "restore_flags" just sets the flags back to the value given. */ 187 + /* restore_flags() just sets the flags back to the value given. */ 188 188 static void restore_fl(unsigned long flags) 189 189 { 190 190 lguest_data.irq_enabled = flags; ··· 357 357 * it. The Host needs to know when the Guest wants to change them, so we have 358 358 * a whole series of functions like read_cr0() and write_cr0(). 359 359 * 360 - * We start with CR0. CR0 allows you to turn on and off all kinds of basic 360 + * We start with cr0. cr0 allows you to turn on and off all kinds of basic 361 361 * features, but Linux only really cares about one: the horrifically-named Task 362 362 * Switched (TS) bit at bit 3 (ie. 
8) 363 363 * ··· 390 390 current_cr0 &= ~X86_CR0_TS; 391 391 } 392 392 393 - /* CR2 is the virtual address of the last page fault, which the Guest only ever 393 + /* cr2 is the virtual address of the last page fault, which the Guest only ever 394 394 * reads. The Host kindly writes this into our "struct lguest_data", so we 395 395 * just read it out of there. */ 396 396 static unsigned long lguest_read_cr2(void) ··· 398 398 return lguest_data.cr2; 399 399 } 400 400 401 - /* CR3 is the current toplevel pagetable page: the principle is the same as 401 + /* cr3 is the current toplevel pagetable page: the principle is the same as 402 402 * cr0. Keep a local copy, and tell the Host when it changes. */ 403 403 static void lguest_write_cr3(unsigned long cr3) 404 404 { ··· 411 411 return current_cr3; 412 412 } 413 413 414 - /* CR4 is used to enable and disable PGE, but we don't care. */ 414 + /* cr4 is used to enable and disable PGE, but we don't care. */ 415 415 static unsigned long lguest_read_cr4(void) 416 416 { 417 417 return 0; ··· 432 432 * maps virtual addresses to physical addresses using "page tables". We could 433 433 * use one huge index of 1 million entries: each address is 4 bytes, so that's 434 434 * 1024 pages just to hold the page tables. But since most virtual addresses 435 - * are unused, we use a two level index which saves space. The CR3 register 435 + * are unused, we use a two level index which saves space. The cr3 register 436 436 * contains the physical address of the top level "page directory" page, which 437 437 * contains physical addresses of up to 1024 second-level pages. Each of these 438 438 * second level pages contains up to 1024 physical addresses of actual pages, ··· 440 440 * 441 441 * Here's a diagram, where arrows indicate physical addresses: 442 442 * 443 - * CR3 ---> +---------+ 443 + * cr3 ---> +---------+ 444 444 * | --------->+---------+ 445 445 * | | | PADDR1 | 446 446 * Top-level | | PADDR2 | ··· 498 498 * 499 499 * ... 
except in early boot when the kernel sets up the initial pagetables, 500 500 * which makes booting astonishingly slow. So we don't even tell the Host 501 - * anything changed until we've done the first page table switch. 502 - */ 501 + * anything changed until we've done the first page table switch. */ 503 502 static void lguest_set_pte(pte_t *ptep, pte_t pteval) 504 503 { 505 504 *ptep = pteval; ··· 719 720 /* Set up the timer interrupt (0) to go to our simple timer routine */ 720 721 set_irq_handler(0, lguest_time_irq); 721 722 722 - /* Our clock structure look like arch/i386/kernel/tsc.c if we can use 723 - * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either 724 - * way, the "rating" is initialized so high that it's always chosen 725 - * over any other clocksource. */ 723 + /* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can 724 + * use the TSC, otherwise it's a dumb nanosecond-resolution clock. 725 + * Either way, the "rating" is set so high that it's always chosen over 726 + * any other clocksource. */ 726 727 if (lguest_data.tsc_khz) 727 728 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, 728 729 lguest_clock.shift); ··· 748 749 * to work. They're pretty simple. 749 750 */ 750 751 751 - /* The Guest needs to tell the host what stack it expects traps to use. For 752 + /* The Guest needs to tell the Host what stack it expects traps to use. For 752 753 * native hardware, this is part of the Task State Segment mentioned above in 753 754 * lguest_load_tr_desc(), but to help hypervisors there's this special call. 754 755 * ··· 849 850 return "LGUEST"; 850 851 } 851 852 852 - /* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to 853 - * produce console output. */ 853 + /* We will eventually use the virtio console device to produce console output, 854 + * but before that is set up we use LHCALL_NOTIFY on normal memory to produce 855 + * console output. 
*/ 854 856 static __init int early_put_chars(u32 vtermno, const char *buf, int count) 855 857 { 856 858 char scratch[17]; 857 859 unsigned int len = count; 858 860 861 + /* We use a nul-terminated string, so we have to make a copy. Icky, 862 + * huh? */ 859 863 if (len > sizeof(scratch) - 1) 860 864 len = sizeof(scratch) - 1; 861 865 scratch[len] = '\0'; ··· 885 883 * Our current solution is to allow the paravirt back end to optionally patch 886 884 * over the indirect calls to replace them with something more efficient. We 887 885 * patch the four most commonly called functions: disable interrupts, enable 888 - * interrupts, restore interrupts and save interrupts. We usually have 10 886 + * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 889 887 * bytes to patch into: the Guest versions of these operations are small enough 890 888 * that we can fit comfortably. 891 889 * ··· 1017 1015 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1018 1016 1019 1017 /* The Host uses the top of the Guest's virtual address space for the 1020 - * Host<->Guest Switcher, and it tells us how much it needs in 1018 + * Host<->Guest Switcher, and it tells us how big that is in 1021 1019 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ 1022 1020 reserve_top_address(lguest_data.reserve_mem); 1023 1021 ··· 1067 1065 /* 1068 1066 * This marks the end of stage II of our journey, The Guest. 1069 1067 * 1070 - * It is now time for us to explore the nooks and crannies of the three Guest 1071 - * devices and complete our understanding of the Guest in "make Drivers". 1068 + * It is now time for us to explore the layer of virtual drivers and complete 1069 + * our understanding of the Guest in "make Drivers". 1072 1070 */

+5 -3

arch/x86/lguest/i386_head.S

··· 6 6 #include <asm/processor-flags.h> 7 7 8 8 /*G:020 This is where we begin: head.S notes that the boot header's platform 9 - * type field is "1" (lguest), so calls us here. The boot header is in %esi. 9 + * type field is "1" (lguest), so calls us here. 10 10 * 11 11 * WARNING: be very careful here! We're running at addresses equal to physical 12 12 * addresses (around 0), not above PAGE_OFFSET as most code expects ··· 17 17 * boot. */ 18 18 .section .init.text, "ax", @progbits 19 19 ENTRY(lguest_entry) 20 - /* Make initial hypercall now, so we can set up the pagetables. */ 20 + /* We make the "initialization" hypercall now to tell the Host about 21 + * us, and also find out where it put our page tables. */ 21 22 movl $LHCALL_LGUEST_INIT, %eax 22 23 movl $lguest_data - __PAGE_OFFSET, %edx 23 24 int $LGUEST_TRAP_ENTRY 24 25 25 26 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl 26 - * instruction uses %esi implicitly. */ 27 + * instruction uses %esi implicitly as the source for the copy we're 28 + * about to do. */ 27 29 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi 28 30 29 31 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.

+4 -1

drivers/lguest/core.c

··· 128 128 __free_pages(switcher_page[i], 0); 129 129 } 130 130 131 - /*L:305 131 + /*H:032 132 132 * Dealing With Guest Memory. 133 + * 134 + * Before we go too much further into the Host, we need to grok the routines 135 + * we use to deal with Guest memory. 133 136 * 134 137 * When the Guest gives us (what it thinks is) a physical address, we can use 135 138 * the normal copy_from_user() & copy_to_user() on the corresponding place in

+6 -5

drivers/lguest/hypercalls.c

··· 90 90 lg->pending_notify = args->arg1; 91 91 break; 92 92 default: 93 + /* It should be an architecture-specific hypercall. */ 93 94 if (lguest_arch_do_hcall(lg, args)) 94 95 kill_guest(lg, "Bad hypercall %li\n", args->arg0); 95 96 } ··· 158 157 * Guest makes a hypercall, we end up here to set things up: */ 159 158 static void initialize(struct lguest *lg) 160 159 { 161 - 162 160 /* You can't do anything until you're initialized. The Guest knows the 163 161 * rules, so we're unforgiving here. */ 164 162 if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { ··· 174 174 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) 175 175 kill_guest(lg, "bad guest page %p", lg->lguest_data); 176 176 177 - /* We write the current time into the Guest's data page once now. */ 177 + /* We write the current time into the Guest's data page once so it can 178 + * set its clock. */ 178 179 write_timestamp(lg); 179 180 180 181 /* page_tables.c will also do some setup. */ ··· 183 182 184 183 /* This is the one case where the above accesses might have been the 185 184 * first write to a Guest page. This may have caused a copy-on-write 186 - * fault, but the Guest might be referring to the old (read-only) 187 - * page. */ 185 + * fault, but the old page might be (read-only) in the Guest 186 + * pagetable. */ 188 187 guest_pagetable_clear_all(lg); 189 188 } 190 189 ··· 221 220 * Normally it doesn't matter: the Guest will run again and 222 221 * update the trap number before we come back here. 223 222 * 224 - * However, if we are signalled or the Guest sends DMA to the 223 + * However, if we are signalled or the Guest sends I/O to the 225 224 * Launcher, the run_guest() loop will exit without running the 226 225 * Guest. When it comes back it would try to re-run the 227 226 * hypercall. */

+29 -8

drivers/lguest/interrupts_and_traps.c

··· 92 92 93 93 /* Remember that we never let the Guest actually disable interrupts, so 94 94 * the "Interrupt Flag" bit is always set. We copy that bit from the 95 - * Guest's "irq_enabled" field into the eflags word: the Guest copies 96 - * it back in "lguest_iret". */ 95 + * Guest's "irq_enabled" field into the eflags word: we saw the Guest 96 + * copy it back in "lguest_iret". */ 97 97 eflags = lg->regs->eflags; 98 98 if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 99 99 && !(irq_enable & X86_EFLAGS_IF)) ··· 124 124 kill_guest(lg, "Disabling interrupts"); 125 125 } 126 126 127 - /*H:200 127 + /*H:205 128 128 * Virtual Interrupts. 129 129 * 130 130 * maybe_do_interrupt() gets called before every entry to the Guest, to see if ··· 256 256 * bogus one in): if we fail here, the Guest will be killed. */ 257 257 if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) 258 258 return 0; 259 - set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, has_err(num)); 259 + set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, 260 + has_err(num)); 260 261 return 1; 261 262 } 262 263 263 264 /*H:250 Here's the hard part: returning to the Host every time a trap happens 264 265 * and then calling deliver_trap() and re-entering the Guest is slow. 265 - * Particularly because Guest userspace system calls are traps (trap 128). 266 + * Particularly because Guest userspace system calls are traps (usually trap 267 + * 128). 266 268 * 267 269 * So we'd like to set up the IDT to tell the CPU to deliver traps directly 268 270 * into the Guest. This is possible, but the complexities cause the size of 269 271 * this file to double! However, 150 lines of code is worth writing for taking 270 272 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all 271 - * the other hypervisors would tease it. 273 + * the other hypervisors would beat it up at lunchtime. 
272 274 * 273 275 * This routine indicates if a particular trap number could be delivered 274 276 * directly. */ ··· 333 331 * change stacks on each context switch. */ 334 332 void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) 335 333 { 336 - /* You are not allowd have a stack segment with privilege level 0: bad 334 + /* You are not allowed have a stack segment with privilege level 0: bad 337 335 * Guest! */ 338 336 if ((seg & 0x3) != GUEST_PL) 339 337 kill_guest(lg, "bad stack segment %i", seg); ··· 352 350 * part of the Host: page table handling. */ 353 351 354 352 /*H:235 This is the routine which actually checks the Guest's IDT entry and 355 - * transfers it into our entry in "struct lguest": */ 353 + * transfers it into the entry in "struct lguest": */ 356 354 static void set_trap(struct lguest *lg, struct desc_struct *trap, 357 355 unsigned int num, u32 lo, u32 hi) 358 356 { ··· 458 456 } 459 457 } 460 458 459 + /*H:200 460 + * The Guest Clock. 461 + * 462 + * There are two sources of virtual interrupts. We saw one in lguest_user.c: 463 + * the Launcher sending interrupts for virtual devices. The other is the Guest 464 + * timer interrupt. 465 + * 466 + * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to 467 + * the next timer interrupt (in nanoseconds). We use the high-resolution timer 468 + * infrastructure to set a callback at that time. 469 + * 470 + * 0 means "turn off the clock". */ 461 471 void guest_set_clockevent(struct lguest *lg, unsigned long delta) 462 472 { 463 473 ktime_t expires; ··· 480 466 return; 481 467 } 482 468 469 + /* We use wallclock time here, so the Guest might not be running for 470 + * all the time between now and the timer interrupt it asked for. This 471 + * is almost always the right thing to do. 
*/ 483 472 expires = ktime_add_ns(ktime_get_real(), delta); 484 473 hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); 485 474 } 486 475 476 + /* This is the function called when the Guest's timer expires. */ 487 477 static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) 488 478 { 489 479 struct lguest *lg = container_of(timer, struct lguest, hrt); 490 480 481 + /* Remember the first interrupt is the timer interrupt. */ 491 482 set_bit(0, lg->irqs_pending); 483 + /* If the Guest is actually stopped, we need to wake it up. */ 492 484 if (lg->halted) 493 485 wake_up_process(lg->tsk); 494 486 return HRTIMER_NORESTART; 495 487 } 496 488 489 + /* This sets up the timer for this Guest. */ 497 490 void init_clockdev(struct lguest *lg) 498 491 { 499 492 hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);

+2 -2

drivers/lguest/lg.h

··· 100 100 void __lgread(struct lguest *, void *, unsigned long, unsigned); 101 101 void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); 102 102 103 - /*L:306 Using memory-copy operations like that is usually inconvient, so we 103 + /*H:035 Using memory-copy operations like that is usually inconvenient, so we 104 104 * have the following helper macros which read and write a specific type (often 105 105 * an unsigned long). 106 106 * ··· 188 188 * Let's step aside for the moment, to study one important routine that's used 189 189 * widely in the Host code. 190 190 * 191 - * There are many cases where the Guest does something invalid, like pass crap 191 + * There are many cases where the Guest can do something invalid, like pass crap 192 192 * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite 193 193 * acceptable to simply terminate the Guest and give the Launcher a nicely 194 194 * formatted reason. It's also simpler for the Guest itself, which doesn't

+7 -4

drivers/lguest/lguest_device.c

··· 53 53 * Device configurations 54 54 * 55 55 * The configuration information for a device consists of a series of fields. 56 - * The device will look for these fields during setup. 56 + * We don't really care what they are: the Launcher set them up, and the driver 57 + * will look at them during setup. 57 58 * 58 59 * For us these fields come immediately after that device's descriptor in the 59 60 * lguest_devices page. ··· 123 122 * The other piece of infrastructure virtio needs is a "virtqueue": a way of 124 123 * the Guest device registering buffers for the other side to read from or 125 124 * write into (ie. send and receive buffers). Each device can have multiple 126 - * virtqueues: for example the console has one queue for sending and one for 127 - * receiving. 125 + * virtqueues: for example the console driver uses one queue for sending and 126 + * another for receiving. 128 127 * 129 128 * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue 130 129 * already exists in virtio_ring.c. We just need to connect it up. ··· 159 158 * 160 159 * This is kind of an ugly duckling. It'd be nicer to have a standard 161 160 * representation of a virtqueue in the configuration space, but it seems that 162 - * everyone wants to do it differently. The KVM guys want the Guest to 161 + * everyone wants to do it differently. The KVM coders want the Guest to 163 162 * allocate its own pages and tell the Host where they are, but for lguest it's 164 163 * simpler for the Host to simply tell us where the pages are. 165 164 * ··· 285 284 { 286 285 struct lguest_device *ldev; 287 286 287 + /* Start with zeroed memory; Linux's device layer seems to count on 288 + * it. */ 288 289 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); 289 290 if (!ldev) { 290 291 printk(KERN_EMERG "Cannot allocate lguest dev %u\n",

+12 -11

drivers/lguest/lguest_user.c

··· 8 8 #include <linux/fs.h> 9 9 #include "lg.h" 10 10 11 - /*L:315 To force the Guest to stop running and return to the Launcher, the 12 - * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The 13 - * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ 11 + /*L:055 When something happens, the Waker process needs a way to stop the 12 + * kernel running the Guest and return to the Launcher. So the Waker writes 13 + * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher 14 + * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release 15 + * the Waker. */ 14 16 static int break_guest_out(struct lguest *lg, const unsigned long __user *input) 15 17 { 16 18 unsigned long on; 17 19 18 - /* Fetch whether they're turning break on or off.. */ 20 + /* Fetch whether they're turning break on or off. */ 19 21 if (get_user(on, input) != 0) 20 22 return -EFAULT; 21 23 22 24 if (on) { 23 25 lg->break_out = 1; 24 - /* Pop it out (may be running on different CPU) */ 26 + /* Pop it out of the Guest (may be running on different CPU) */ 25 27 wake_up_process(lg->tsk); 26 28 /* Wait for them to reset it */ 27 29 return wait_event_interruptible(lg->break_wq, !lg->break_out); ··· 60 58 if (!lg) 61 59 return -EINVAL; 62 60 63 - /* If you're not the task which owns the guest, go away. */ 61 + /* If you're not the task which owns the Guest, go away. */ 64 62 if (current != lg->tsk) 65 63 return -EPERM; 66 64 ··· 94 92 * base: The start of the Guest-physical memory inside the Launcher memory. 95 93 * 96 94 * pfnlimit: The highest (Guest-physical) page number the Guest should be 97 - * allowed to access. The Launcher has to live in Guest memory, so it sets 98 - * this to ensure the Guest can't reach it. 95 + * allowed to access. The Guest memory lives inside the Launcher, so it sets 96 + * this to ensure the Guest can only reach its own memory. 
99 97 * 100 98 * pgdir: The (Guest-physical) address of the top of the initial Guest 101 99 * pagetables (which are set up by the Launcher). ··· 191 189 } 192 190 193 191 /*L:010 The first operation the Launcher does must be a write. All writes 194 - * start with a 32 bit number: for the first write this must be 192 + * start with an unsigned long number: for the first write this must be 195 193 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 196 194 * writes of other values to send interrupts. */ 197 195 static ssize_t write(struct file *file, const char __user *in, ··· 277 275 * The Launcher is the Host userspace program which sets up, runs and services 278 276 * the Guest. In fact, many comments in the Drivers which refer to "the Host" 279 277 * doing things are inaccurate: the Launcher does all the device handling for 280 - * the Guest. The Guest can't tell what's done by the the Launcher and what by 281 - * the Host. 278 + * the Guest, but the Guest can't know that. 282 279 * 283 280 * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we 284 281 * shall see more of that later.

+72 -41

drivers/lguest/page_tables.c

··· 26 26 * 27 27 * We use two-level page tables for the Guest. If you're not entirely 28 28 * comfortable with virtual addresses, physical addresses and page tables then 29 - * I recommend you review lguest.c's "Page Table Handling" (with diagrams!). 29 + * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with 30 + * diagrams!). 30 31 * 31 32 * The Guest keeps page tables, but we maintain the actual ones here: these are 32 33 * called "shadow" page tables. Which is a very Guest-centric name: these are ··· 37 36 * 38 37 * Anyway, this is the most complicated part of the Host code. There are seven 39 38 * parts to this: 40 - * (i) Setting up a page table entry for the Guest when it faults, 41 - * (ii) Setting up the page table entry for the Guest stack, 42 - * (iii) Setting up a page table entry when the Guest tells us it has changed, 39 + * (i) Looking up a page table entry when the Guest faults, 40 + * (ii) Making sure the Guest stack is mapped, 41 + * (iii) Setting up a page table entry when the Guest tells us one has changed, 43 42 * (iv) Switching page tables, 44 - * (v) Flushing (thowing away) page tables, 43 + * (v) Flushing (throwing away) page tables, 45 44 * (vi) Mapping the Switcher when the Guest is about to run, 46 45 * (vii) Setting up the page tables initially. 47 46 :*/ ··· 58 57 static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); 59 58 #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 60 59 61 - /*H:320 With our shadow and Guest types established, we need to deal with 62 - * them: the page table code is curly enough to need helper functions to keep 63 - * it clear and clean. 60 + /*H:320 The page table code is curly enough to need helper functions to keep it 61 + * clear and clean. 64 62 * 65 63 * There are two functions which return pointers to the shadow (aka "real") 66 64 * page tables. 
67 65 * 68 66 * spgd_addr() takes the virtual address and returns a pointer to the top-level 69 - * page directory entry for that address. Since we keep track of several page 70 - * tables, the "i" argument tells us which one we're interested in (it's 67 + * page directory entry (PGD) for that address. Since we keep track of several 68 + * page tables, the "i" argument tells us which one we're interested in (it's 71 69 * usually the current one). */ 72 70 static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) 73 71 { ··· 81 81 return &lg->pgdirs[i].pgdir[index]; 82 82 } 83 83 84 - /* This routine then takes the PGD entry given above, which contains the 85 - * address of the PTE page. It then returns a pointer to the PTE entry for the 86 - * given address. */ 84 + /* This routine then takes the page directory entry returned above, which 85 + * contains the address of the page table entry (PTE) page. It then returns a 86 + * pointer to the PTE entry for the given address. */ 87 87 static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) 88 88 { 89 89 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); ··· 191 191 } 192 192 193 193 /*H:330 194 - * (i) Setting up a page table entry for the Guest when it faults 194 + * (i) Looking up a page table entry when the Guest faults. 195 195 * 196 196 * We saw this call in run_guest(): when we see a page fault in the Guest, we 197 197 * come here. That's because we only set up the shadow page tables lazily as ··· 199 199 * and return to the Guest without it knowing. 200 200 * 201 201 * If we fixed up the fault (ie. we mapped the address), this routine returns 202 - * true. */ 202 + * true. Otherwise, it was a real fault and we need to tell the Guest. */ 203 203 int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) 204 204 { 205 205 pgd_t gpgd; ··· 246 246 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 247 247 return 0; 248 248 249 - /* User access to a kernel page? 
(bit 3 == user access) */ 249 + /* User access to a kernel-only page? (bit 3 == user access) */ 250 250 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 251 251 return 0; 252 252 253 253 /* Check that the Guest PTE flags are OK, and the page number is below 254 254 * the pfn_limit (ie. not mapping the Launcher binary). */ 255 255 check_gpte(lg, gpte); 256 + 256 257 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 257 258 gpte = pte_mkyoung(gpte); 258 - 259 259 if (errcode & 2) 260 260 gpte = pte_mkdirty(gpte); 261 261 ··· 272 272 else 273 273 /* If this is a read, don't set the "writable" bit in the page 274 274 * table entry, even if the Guest says it's writable. That way 275 - * we come back here when a write does actually ocur, so we can 276 - * update the Guest's _PAGE_DIRTY flag. */ 275 + * we will come back here when a write does actually occur, so 276 + * we can update the Guest's _PAGE_DIRTY flag. */ 277 277 *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); 278 278 279 279 /* Finally, we write the Guest PTE entry back: we've set the 280 280 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 281 281 lgwrite(lg, gpte_ptr, pte_t, gpte); 282 282 283 - /* We succeeded in mapping the page! */ 283 + /* The fault is fixed, the page table is populated, the mapping 284 + * manipulated, the result returned and the code complete. A small 285 + * delay and a trace of alliteration are the only indications the Guest 286 + * has that a page fault occurred at all. */ 284 287 return 1; 285 288 } 286 289 287 - /*H:360 (ii) Setting up the page table entry for the Guest stack. 290 + /*H:360 291 + * (ii) Making sure the Guest stack is mapped. 288 292 * 289 - * Remember pin_stack_pages() which makes sure the stack is mapped? It could 290 - * simply call demand_page(), but as we've seen that logic is quite long, and 291 - * usually the stack pages are already mapped anyway, so it's not required. 
293 + * Remember that direct traps into the Guest need a mapped Guest kernel stack. 294 + * pin_stack_pages() calls us here: we could simply call demand_page(), but as 295 + * we've seen that logic is quite long, and usually the stack pages are already 296 + * mapped, so it's overkill. 292 297 * 293 298 * This is a quick version which answers the question: is this virtual address 294 299 * mapped by the shadow page tables, and is it writable? */ ··· 302 297 pgd_t *spgd; 303 298 unsigned long flags; 304 299 305 - /* Look at the top level entry: is it present? */ 300 + /* Look at the current top level entry: is it present? */ 306 301 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 307 302 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) 308 303 return 0; ··· 338 333 release_pte(ptepage[i]); 339 334 /* Now we can free the page of PTEs */ 340 335 free_page((long)ptepage); 341 - /* And zero out the PGD entry we we never release it twice. */ 336 + /* And zero out the PGD entry so we never release it twice. */ 342 337 *spgd = __pgd(0); 343 338 } 344 339 } 345 340 346 - /*H:440 (v) Flushing (thowing away) page tables, 347 - * 348 - * We saw flush_user_mappings() called when we re-used a top-level pgdir page. 349 - * It simply releases every PTE page from 0 up to the kernel address. */ 341 + /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() 342 + * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. 343 + * It simply releases every PTE page from 0 up to the Guest's kernel address. */ 350 344 static void flush_user_mappings(struct lguest *lg, int idx) 351 345 { 352 346 unsigned int i; ··· 354 350 release_pgd(lg, lg->pgdirs[idx].pgdir + i); 355 351 } 356 352 357 - /* The Guest also has a hypercall to do this manually: it's used when a large 358 - * number of mappings have been changed. 
*/ 353 + /*H:440 (v) Flushing (throwing away) page tables, 354 + * 355 + * The Guest has a hypercall to throw away the page tables: it's used when a 356 + * large number of mappings have been changed. */ 359 357 void guest_pagetable_flush_user(struct lguest *lg) 360 358 { 361 359 /* Drop the userspace part of the current page table. */ ··· 429 423 430 424 /*H:430 (iv) Switching page tables 431 425 * 432 - * This is what happens when the Guest changes page tables (ie. changes the 433 - * top-level pgdir). This happens on almost every context switch. */ 426 + * Now we've seen all the page table setting and manipulation, let's see 427 + * what happens when the Guest changes page tables (ie. changes the top-level 428 + * pgdir). This occurs on almost every context switch. */ 434 429 void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) 435 430 { 436 431 int newpgdir, repin = 0; ··· 450 443 } 451 444 452 445 /*H:470 Finally, a routine which throws away everything: all PGD entries in all 453 - * the shadow page tables. This is used when we destroy the Guest. */ 446 + * the shadow page tables, including the Guest's kernel mappings. This is used 447 + * when we destroy the Guest. */ 454 448 static void release_all_pagetables(struct lguest *lg) 455 449 { 456 450 unsigned int i, j; ··· 466 458 467 459 /* We also throw away everything when a Guest tells us it's changed a kernel 468 460 * mapping. Since kernel mappings are in every page table, it's easiest to 469 - * throw them all away. This is amazingly slow, but thankfully rare. */ 461 + * throw them all away. This traps the Guest in amber for a while as 462 + * everything faults back in, but it's rare. */ 470 463 void guest_pagetable_clear_all(struct lguest *lg) 471 464 { 472 465 release_all_pagetables(lg); 473 466 /* We need the Guest kernel stack mapped again. 
*/ 474 467 pin_stack_pages(lg); 475 468 } 469 + /*:*/ 470 + /*M:009 Since we throw away all mappings when a kernel mapping changes, our 471 + * performance sucks for guests using highmem. In fact, a guest with 472 + * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is 473 + * usually slower than a Guest with less memory. 474 + * 475 + * This, of course, cannot be fixed. It would take some kind of... well, I 476 + * don't know, but the term "puissant code-fu" comes to mind. :*/ 476 477 477 478 /*H:420 This is the routine which actually sets the page table entry for the 478 479 * "idx"'th shadow page table. ··· 500 483 static void do_set_pte(struct lguest *lg, int idx, 501 484 unsigned long vaddr, pte_t gpte) 502 485 { 503 - /* Look up the matching shadow page directot entry. */ 486 + /* Look up the matching shadow page directory entry. */ 504 487 pgd_t *spgd = spgd_addr(lg, idx, vaddr); 505 488 506 489 /* If the top level isn't present, there's no entry to update. */ ··· 517 500 *spte = gpte_to_spte(lg, gpte, 518 501 pte_flags(gpte) & _PAGE_DIRTY); 519 502 } else 520 - /* Otherwise we can demand_page() it in later. */ 503 + /* Otherwise kill it and we can demand_page() it in 504 + * later. */ 521 505 *spte = __pte(0); 522 506 } 523 507 } ··· 553 535 } 554 536 555 537 /*H:400 556 - * (iii) Setting up a page table entry when the Guest tells us it has changed. 538 + * (iii) Setting up a page table entry when the Guest tells us one has changed. 557 539 * 558 540 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal 559 541 * with the other side of page tables while we're here: what happens when the ··· 630 612 631 613 /*H:480 (vi) Mapping the Switcher when the Guest is about to run. 632 614 * 633 - * The Switcher and the two pages for this CPU need to be available to the 615 + * The Switcher and the two pages for this CPU need to be visible in the 634 616 * Guest (and not the pages for other CPUs). 
We have the appropriate PTE pages 635 - * for each CPU already set up, we just need to hook them in. */ 617 + * for each CPU already set up, we just need to hook them in now we know which 618 + * Guest is about to run on this CPU. */ 636 619 void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) 637 620 { 638 621 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); ··· 695 676 pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), 696 677 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); 697 678 } 679 + 680 + /* We've made it through the page table code. Perhaps our tired brains are 681 + * still processing the details, or perhaps we're simply glad it's over. 682 + * 683 + * If nothing else, note that all this complexity in juggling shadow page 684 + * tables in sync with the Guest's page tables is for one reason: for most 685 + * Guests this page table dance determines how bad performance will be. This 686 + * is why Xen uses exotic direct Guest pagetable manipulation, and why both 687 + * Intel and AMD have implemented shadow page table support directly into 688 + * hardware. 689 + * 690 + * There is just one file remaining in the Host. */ 698 691 699 692 /*H:510 At boot or module load time, init_pagetables() allocates and populates 700 693 * the Switcher PTE page for each CPU. */

+29 -19

drivers/lguest/segments.c

··· 12 12 #include "lg.h" 13 13 14 14 /*H:600 15 - * We've almost completed the Host; there's just one file to go! 16 - * 17 15 * Segments & The Global Descriptor Table 18 16 * 19 17 * (That title sounds like a bad Nerdcore group. Not to suggest that there are ··· 53 55 || num == GDT_ENTRY_DOUBLEFAULT_TSS); 54 56 } 55 57 56 - /*H:610 Once the GDT has been changed, we fix the new entries up a little. We 58 + /*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We 57 59 * don't care if they're invalid: the worst that can happen is a General 58 60 * Protection Fault in the Switcher when it restores a Guest segment register 59 61 * which tries to use that entry. Then we kill the Guest for causing such a ··· 82 84 } 83 85 } 84 86 85 - /* This routine is called at boot or modprobe time for each CPU to set up the 86 - * "constant" GDT entries for Guests running on that CPU. */ 87 + /*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep 88 + * a GDT for each CPU, and copy across the Guest's entries each time we want to 89 + * run the Guest on that CPU. 90 + * 91 + * This routine is called at boot or modprobe time for each CPU to set up the 92 + * constant GDT entries: the ones which are the same no matter what Guest we're 93 + * running. */ 87 94 void setup_default_gdt_entries(struct lguest_ro_state *state) 88 95 { 89 96 struct desc_struct *gdt = state->guest_gdt; 90 97 unsigned long tss = (unsigned long)&state->guest_tss; 91 98 92 - /* The hypervisor segments are full 0-4G segments, privilege level 0 */ 99 + /* The Switcher segments are full 0-4G segments, privilege level 0 */ 93 100 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 94 101 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 95 102 96 - /* The TSS segment refers to the TSS entry for this CPU, so we cannot 97 - * copy it from the Guest. Forgive the magic flags */ 103 + /* The TSS segment refers to the TSS entry for this particular CPU. 
104 + * Forgive the magic flags: the 0x8900 means the entry is Present, it's a 105 + * privilege level 0 Available 386 TSS system segment, and the 0x67 106 + * means Saturn is eclipsed by Mercury in the twelfth house. */ 98 107 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); 99 108 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) 100 109 | ((tss >> 16) & 0x000000FF); 101 110 } 102 111 103 - /* This routine is called before the Guest is run for the first time. */ 112 + /* This routine sets up the initial Guest GDT for booting. All entries start 113 + * as 0 (unusable). */ 104 114 void setup_guest_gdt(struct lguest *lg) 105 115 { 106 116 /* Start with full 0-4G segments... */ ··· 120 114 lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 121 115 } 122 116 123 - /* Like the IDT, we never simply use the GDT the Guest gives us. We set up the 124 - * GDTs for each CPU, then we copy across the entries each time we want to run 125 - * a different Guest on that CPU. */ 126 - 127 - /* A partial GDT load, for the three "thead-local storage" entries. Otherwise 128 - * it's just like load_guest_gdt(). So much, in fact, it would probably be 129 - * neater to have a single hypercall to cover both. */ 117 + /*H:650 An optimization of copy_gdt(), for just the three "thread-local storage" 118 + * entries. */ 130 119 void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) 131 120 { 132 121 unsigned int i; ··· 130 129 gdt[i] = lg->arch.gdt[i]; 131 130 } 132 131 133 - /* This is the full version */ 132 + /*H:640 When the Guest is run on a different CPU, or the GDT entries have 133 + * changed, copy_gdt() is called to copy the Guest's GDT entries across to this 134 + * CPU's GDT. */ 134 135 void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) 135 136 { 136 137 unsigned int i; ··· 144 141 gdt[i] = lg->arch.gdt[i]; 145 142 } 146 143 147 - /* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). 
*/ 144 + /*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). 145 + * We copy it from the Guest and tweak the entries. */ 148 146 void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) 149 147 { 150 148 /* We assume the Guest has the same number of GDT entries as the ··· 161 157 lg->changed |= CHANGED_GDT; 162 158 } 163 159 160 + /* This is the fast-track version for just changing the three TLS entries. 161 + * Remember that this happens on every context switch, so it's worth 162 + * optimizing. But wouldn't it be neater to have a single hypercall to cover 163 + * both cases? */ 164 164 void guest_load_tls(struct lguest *lg, unsigned long gtls) 165 165 { 166 166 struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; 167 167 168 168 __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 169 169 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 170 + /* Note that just the TLS entries have changed. */ 170 171 lg->changed |= CHANGED_GDT_TLS; 171 172 } 173 + /*:*/ 172 174 173 - /* 175 + /*H:660 174 176 * With this, we have finished the Host. 175 177 * 176 178 * Five of the seven parts of our task are complete. You have made it through

+62 -58

drivers/lguest/x86/core.c

··· 63 63 static DEFINE_PER_CPU(struct lguest *, last_guest); 64 64 65 65 /*S:010 66 - * We are getting close to the Switcher. 66 + * We approach the Switcher. 67 67 * 68 68 * Remember that each CPU has two pages which are visible to the Guest when it 69 69 * runs on that CPU. This has to contain the state for that Guest: we copy the ··· 134 134 * 135 135 * The lcall also pushes the old code segment (KERNEL_CS) onto the 136 136 * stack, then the address of this call. This stack layout happens to 137 - * exactly match the stack of an interrupt... */ 137 + * exactly match the stack layout created by an interrupt... */ 138 138 asm volatile("pushf; lcall *lguest_entry" 139 139 /* This is how we tell GCC that %eax ("a") and %ebx ("b") 140 140 * are changed by this routine. The "=" means output. */ ··· 151 151 } 152 152 /*:*/ 153 153 154 + /*M:002 There are hooks in the scheduler which we can register to tell when we 155 + * get kicked off the CPU (preempt_notifier_register()). This would allow us 156 + * to lazily disable SYSENTER which would regain some performance, and should 157 + * also simplify copy_in_guest_info(). Note that we'd still need to restore 158 + * things when we exit to Launcher userspace, but that's fairly easy. 159 + * 160 + * The hooks were designed for KVM, but we can also put them to good use. :*/ 161 + 154 162 /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts 155 163 * are disabled: we own the CPU. */ 156 164 void lguest_arch_run_guest(struct lguest *lg) 157 165 { 158 - /* Remember the awfully-named TS bit? If the Guest has asked 159 - * to set it we set it now, so we can trap and pass that trap 160 - * to the Guest if it uses the FPU. */ 166 + /* Remember the awfully-named TS bit? If the Guest has asked to set it 167 + * we set it now, so we can trap and pass that trap to the Guest if it 168 + * uses the FPU. 
*/ 161 169 if (lg->ts) 162 170 lguest_set_ts(); 163 171 164 - /* SYSENTER is an optimized way of doing system calls. We 165 - * can't allow it because it always jumps to privilege level 0. 166 - * A normal Guest won't try it because we don't advertise it in 167 - * CPUID, but a malicious Guest (or malicious Guest userspace 168 - * program) could, so we tell the CPU to disable it before 169 - * running the Guest. */ 172 + /* SYSENTER is an optimized way of doing system calls. We can't allow 173 + * it because it always jumps to privilege level 0. A normal Guest 174 + * won't try it because we don't advertise it in CPUID, but a malicious 175 + * Guest (or malicious Guest userspace program) could, so we tell the 176 + * CPU to disable it before running the Guest. */ 170 177 if (boot_cpu_has(X86_FEATURE_SEP)) 171 178 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 172 179 173 - /* Now we actually run the Guest. It will pop back out when 174 - * something interesting happens, and we can examine its 175 - * registers to see what it was doing. */ 180 + /* Now we actually run the Guest. It will return when something 181 + * interesting happens, and we can examine its registers to see what it 182 + * was doing. */ 176 183 run_guest_once(lg, lguest_pages(raw_smp_processor_id())); 177 184 178 - /* The "regs" pointer contains two extra entries which are not 179 - * really registers: a trap number which says what interrupt or 180 - * trap made the switcher code come back, and an error code 181 - * which some traps set. */ 185 + /* Note that the "regs" pointer contains two extra entries which are 186 + * not really registers: a trap number which says what interrupt or 187 + * trap made the switcher code come back, and an error code which some 188 + * traps set. */ 182 189 183 - /* If the Guest page faulted, then the cr2 register will tell 184 - * us the bad virtual address. 
We have to grab this now, 185 - * because once we re-enable interrupts an interrupt could 186 - * fault and thus overwrite cr2, or we could even move off to a 187 - * different CPU. */ 190 + /* If the Guest page faulted, then the cr2 register will tell us the 191 + * bad virtual address. We have to grab this now, because once we 192 + * re-enable interrupts an interrupt could fault and thus overwrite 193 + * cr2, or we could even move off to a different CPU. */ 188 194 if (lg->regs->trapnum == 14) 189 195 lg->arch.last_pagefault = read_cr2(); 190 196 /* Similarly, if we took a trap because the Guest used the FPU, ··· 203 197 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 204 198 } 205 199 206 - /*H:130 Our Guest is usually so well behaved; it never tries to do things it 207 - * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't 208 - * quite complete, because it doesn't contain replacements for the Intel I/O 209 - * instructions. As a result, the Guest sometimes fumbles across one during 210 - * the boot process as it probes for various things which are usually attached 211 - * to a PC. 200 + /*H:130 Now we've examined the hypercall code; our Guest can make requests. 201 + * Our Guest is usually so well behaved; it never tries to do things it isn't 202 + * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual 203 + * infrastructure isn't quite complete, because it doesn't contain replacements 204 + * for the Intel I/O instructions. As a result, the Guest sometimes fumbles 205 + * across one during the boot process as it probes for various things which are 206 + * usually attached to a PC. 212 207 * 213 - * When the Guest uses one of these instructions, we get trap #13 (General 208 + * When the Guest uses one of these instructions, we get a trap (General 214 209 * Protection Fault) and come here. We see if it's one of those troublesome 215 210 * instructions and skip over it. We return true if we did. 
*/ 216 211 static int emulate_insn(struct lguest *lg) ··· 282 275 void lguest_arch_handle_trap(struct lguest *lg) 283 276 { 284 277 switch (lg->regs->trapnum) { 285 - case 13: /* We've intercepted a GPF. */ 286 - /* Check if this was one of those annoying IN or OUT 287 - * instructions which we need to emulate. If so, we 288 - * just go back into the Guest after we've done it. */ 278 + case 13: /* We've intercepted a General Protection Fault. */ 279 + /* Check if this was one of those annoying IN or OUT 280 + * instructions which we need to emulate. If so, we just go 281 + * back into the Guest after we've done it. */ 289 282 if (lg->regs->errcode == 0) { 290 283 if (emulate_insn(lg)) 291 284 return; 292 285 } 293 286 break; 294 - case 14: /* We've intercepted a page fault. */ 295 - /* The Guest accessed a virtual address that wasn't 296 - * mapped. This happens a lot: we don't actually set 297 - * up most of the page tables for the Guest at all when 298 - * we start: as it runs it asks for more and more, and 299 - * we set them up as required. In this case, we don't 300 - * even tell the Guest that the fault happened. 301 - * 302 - * The errcode tells whether this was a read or a 303 - * write, and whether kernel or userspace code. */ 287 + case 14: /* We've intercepted a Page Fault. */ 288 + /* The Guest accessed a virtual address that wasn't mapped. 289 + * This happens a lot: we don't actually set up most of the 290 + * page tables for the Guest at all when we start: as it runs 291 + * it asks for more and more, and we set them up as 292 + * required. In this case, we don't even tell the Guest that 293 + * the fault happened. 294 + * 295 + * The errcode tells whether this was a read or a write, and 296 + * whether kernel or userspace code. */ 304 297 if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) 305 298 return; 306 299 307 - /* OK, it's really not there (or not OK): the Guest 308 - * needs to know. 
We write out the cr2 value so it 309 - * knows where the fault occurred. 310 - * 311 - * Note that if the Guest were really messed up, this 312 - * could happen before it's done the INITIALIZE 313 - * hypercall, so lg->lguest_data will be NULL */ 300 + /* OK, it's really not there (or not OK): the Guest needs to 301 + * know. We write out the cr2 value so it knows where the 302 + * fault occurred. 303 + * 304 + * Note that if the Guest were really messed up, this could 305 + * happen before it's done the LHCALL_LGUEST_INIT hypercall, so 306 + * lg->lguest_data could be NULL */ 314 307 if (lg->lguest_data && 315 308 put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) 316 309 kill_guest(lg, "Writing cr2"); 317 310 break; 318 311 case 7: /* We've intercepted a Device Not Available fault. */ 319 - /* If the Guest doesn't want to know, we already 320 - * restored the Floating Point Unit, so we just 321 - * continue without telling it. */ 312 + /* If the Guest doesn't want to know, we already restored the 313 + * Floating Point Unit, so we just continue without telling 314 + * it. */ 322 315 if (!lg->ts) 323 316 return; 324 317 break; ··· 543 536 544 537 return 0; 545 538 } 546 - /* Now we've examined the hypercall code; our Guest can make requests. There 547 - * is one other way we can do things for the Guest, as we see in 548 - * emulate_insn(). :*/ 549 539 550 540 /*L:030 lguest_arch_setup_regs() 551 541 * ··· 574 570 575 571 /* %esi points to our boot information, at physical address 0, so don't 576 572 * touch it. */ 573 + 577 574 /* There are a couple of GDT entries the Guest expects when first 578 575 * booting. */ 579 - 580 576 setup_guest_gdt(lg); 581 577 }

+51 -20

drivers/lguest/x86/switcher_32.S

··· 6 6 * are feeling invigorated and refreshed then the next, more challenging stage 7 7 * can be found in "make Guest". :*/ 8 8 9 + /*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must 10 + * gain at least 1% more performance. Since neither LOC nor performance can be 11 + * measured beforehand, it generally means implementing a feature then deciding 12 + * if it's worth it. And once it's implemented, who can say no? 13 + * 14 + * This is why I haven't implemented this idea myself. I want to, but I 15 + * haven't. You could, though. 16 + * 17 + * The main place where lguest performance sucks is Guest page faulting. When 18 + * a Guest userspace process hits an unmapped page we switch back to the Host, 19 + * walk the page tables, find it's not mapped, switch back to the Guest page 20 + * fault handler, which calls a hypercall to set the page table entry, then 21 + * finally returns to userspace. That's two round-trips. 22 + * 23 + * If we had a small walker in the Switcher, we could quickly check the Guest 24 + * page table and if the page isn't mapped, immediately reflect the fault back 25 + * into the Guest. This means the Switcher would have to know the top of the 26 + * Guest page table and the page fault handler address. 27 + * 28 + * For simplicity, the Guest should only handle the case where the privilege 29 + * level of the fault is 3 and probably only not present or write faults. It 30 + * should also detect recursive faults, and hand the original fault to the 31 + * Host (which is actually really easy). 32 + * 33 + * Two questions remain. Would the performance gain outweigh the complexity? 34 + * And who would write the verse documenting it? :*/ 35 + 36 + /*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their 37 + * code). It's worth doing though, since it would let us use oprofile in the 38 + * Host when a Guest is running. :*/ 39 + 9 40 /*S:100 10 41 * Welcome to the Switcher itself! 
11 42 * ··· 119 88 120 89 // All saved and there's now five steps before us: 121 90 // Stack, GDT, IDT, TSS 122 - // And last of all the page tables are flipped. 91 + // Then last of all the page tables are flipped. 123 92 124 93 // Yet beware that our stack pointer must be 125 94 // Always valid lest an NMI hits ··· 134 103 lgdt LGUEST_PAGES_guest_gdt_desc(%eax) 135 104 136 105 // The Guest's IDT we did partially 137 - // Move to the "struct lguest_pages" as well. 106 + // Copy to "struct lguest_pages" as well. 138 107 lidt LGUEST_PAGES_guest_idt_desc(%eax) 139 108 140 109 // The TSS entry which controls traps 141 110 // Must be loaded up with "ltr" now: 111 + // The GDT entry that TSS uses 112 + // Changes type when we load it: damn Intel! 142 113 // For after we switch over our page tables 143 - // It (as the rest) will be writable no more. 144 - // (The GDT entry TSS needs 145 - // Changes type when we load it: damn Intel!) 114 + // That entry will be read-only: we'd crash. 146 115 movl $(GDT_ENTRY_TSS*8), %edx 147 116 ltr %dx 148 117 149 118 // Look back now, before we take this last step! 150 119 // The Host's TSS entry was also marked used; 151 - // Let's clear it again, ere we return. 120 + // Let's clear it again for our return. 152 121 // The GDT descriptor of the Host 153 122 // Points to the table after two "size" bytes 154 123 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx 155 - // Clear the type field of "used" (byte 5, bit 2) 124 + // Clear "used" from type field (byte 5, bit 2) 156 125 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) 157 126 158 127 // Once our page table's switched, the Guest is live! ··· 162 131 163 132 // The page table change did one tricky thing: 164 133 // The Guest's register page has been mapped 165 - // Writable onto our %esp (stack) -- 134 + // Writable under our %esp (stack) -- 166 135 // We can simply pop off all Guest regs. 
167 136 popl %eax 168 137 popl %ebx ··· 183 152 addl $8, %esp 184 153 185 154 // The last five stack slots hold return address 186 - // And everything needed to change privilege 187 - // Into the Guest privilege level of 1, 155 + // And everything needed to switch privilege 156 + // From Switcher's level 0 to Guest's 1, 188 157 // And the stack where the Guest had last left it. 189 158 // Interrupts are turned back on: we are Guest. 190 159 iret 191 160 192 - // There are two paths where we switch to the Host 161 + // We treat two paths to switch back to the Host 162 + // Yet both must save Guest state and restore Host 193 163 // So we put the routine in a macro. 194 - // We are on our way home, back to the Host 195 - // Interrupted out of the Guest, we come here. 196 164 #define SWITCH_TO_HOST \ 197 165 /* We save the Guest state: all registers first \ 198 166 * Laid out just as "struct lguest_regs" defines */ \ ··· 224 194 movl %esp, %eax; \ 225 195 andl $(~(1 << PAGE_SHIFT - 1)), %eax; \ 226 196 /* Save our trap number: the switch will obscure it \ 227 - * (The Guest regs are not mapped here in the Host) \ 197 + * (In the Host the Guest regs are not mapped here) \ 228 198 * %ebx holds it safe for deliver_to_host */ \ 229 199 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ 230 200 /* The Host GDT, IDT and stack! \ ··· 240 210 /* Switch to Host's GDT, IDT. */ \ 241 211 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ 242 212 lidt LGUEST_PAGES_host_idt_desc(%eax); \ 243 - /* Restore the Host's stack where it's saved regs lie */ \ 213 + /* Restore the Host's stack where its saved regs lie */ \ 244 214 movl LGUEST_PAGES_host_sp(%eax), %esp; \ 245 - /* Last the TSS: our Host is complete */ \ 215 + /* Last the TSS: our Host is returned */ \ 246 216 movl $(GDT_ENTRY_TSS*8), %edx; \ 247 217 ltr %dx; \ 248 218 /* Restore now the regs saved right at the first. 
 */ \ ··· 252 222 popl %ds; \ 253 223 popl %es 254 224 255 - // Here's where we come when the Guest has just trapped: 256 - // (Which trap we'll see has been pushed on the stack). 225 + // The first path is trod when the Guest has trapped: 226 + // (Which trap it was has been pushed on the stack). 257 227 // We need only switch back, and the Host will decode 258 228 // Why we came home, and what needs to be done. 259 229 return_to_host: 260 230 SWITCH_TO_HOST 261 231 iret 262 232 233 + // We are led to the second path like so: 263 234 // An interrupt, with some cause external 264 235 // Has ajerked us rudely from the Guest's code 265 236 // Again we must return home to the Host ··· 269 238 // But now we must go home via that place 270 239 // Where that interrupt was supposed to go 271 240 // Had we not been ensconced, running the Guest. 272 - // Here we see the cleverness of our stack: 241 + // Here we see the trickiness of run_guest_once(): 273 242 // The Host stack is formed like an interrupt 274 243 // With EIP, CS and EFLAGS layered. 275 244 // Interrupt handlers end with "iret" ··· 294 263 xorw %ax, %ax 295 264 orl %eax, %edx 296 265 // Now the address of the handler's in %edx 297 - // We call it now: its "iret" takes us home. 266 + // We call it now: its "iret" drops us home. 298 267 jmp *%edx 299 268 300 269 // Every interrupt can come to us here

+8 -8

include/asm-x86/lguest_hcall.h

··· 18 18 #define LHCALL_LOAD_TLS 16 19 19 #define LHCALL_NOTIFY 17 20 20 21 + #define LGUEST_TRAP_ENTRY 0x1F 22 + 23 + #ifndef __ASSEMBLY__ 24 + #include <asm/hw_irq.h> 25 + 21 26 /*G:031 First, how does our Guest contact the Host to ask for privileged 22 27 * operations? There are two ways: the direct way is to make a "hypercall", 23 28 * to make requests of the Host Itself. 24 29 * 25 30 * Our hypercall mechanism uses the highest unused trap code (traps 32 and 26 - * above are used by real hardware interrupts). Seventeen hypercalls are 31 + * above are used by real hardware interrupts). Fifteen hypercalls are 27 32 * available: the hypercall number is put in the %eax register, and the 28 33 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return 29 34 * value makes sense, it's returned in %eax. ··· 36 31 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 37 32 * Host, rather than returning failure. This reflects Winston Churchill's 38 33 * definition of a gentleman: "someone who is only rude intentionally". */ 39 - #define LGUEST_TRAP_ENTRY 0x1F 40 - 41 - #ifndef __ASSEMBLY__ 42 - #include <asm/hw_irq.h> 43 - 44 34 static inline unsigned long 45 35 hcall(unsigned long call, 46 36 unsigned long arg1, unsigned long arg2, unsigned long arg3) 47 37 { 48 38 /* "int" is the Intel instruction to trigger a trap. */ 49 39 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) 50 - /* The call is in %eax (aka "a"), and can be replaced */ 40 + /* The call in %eax (aka "a") might be overwritten */ 51 41 : "=a"(call) 52 - /* The other arguments are in %eax, %edx, %ebx & %ecx */ 42 + /* The arguments are in %eax, %edx, %ebx & %ecx */ 53 43 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) 54 44 /* "memory" means this might write somewhere in memory. 55 45 * This isn't true for all calls, but it's safe to tell

+2 -2

include/linux/lguest.h

··· 12 12 #define LG_CLOCK_MAX_DELTA ULONG_MAX 13 13 14 14 /*G:032 The second method of communicating with the Host is via "struct 15 - * lguest_data". The Guest's very first hypercall is to tell the Host where 16 - * this is, and then the Guest and Host both publish information in it. :*/ 15 + * lguest_data". Once the Guest's initialization hypercall tells the Host where 16 + * this is, the Guest and Host both publish information in it. :*/ 17 17 struct lguest_data 18 18 { 19 19 /* 512 == enabled (same as eflags in normal hardware). The Guest

+5 -1

include/linux/lguest_launcher.h

··· 10 10 * real devices (think of the damage it could do!) we provide virtual devices. 11 11 * We could emulate a PCI bus with various devices on it, but that is a fairly 12 12 * complex burden for the Host and suboptimal for the Guest, so we have our own 13 - * "lguest" bus and simple drivers. 13 + * simple lguest bus and we use "virtio" drivers. These drivers need a set of 14 + * routines from us which will actually do the virtual I/O, but they handle all 15 + * the net/block/console stuff themselves. This means that if we want to add 16 + * a new device, we simply need to write a new virtio driver and create support 17 + * for it in the Launcher: this code won't need to change. 14 18 * 15 19 * Devices are described by a simplified ID, a status byte, and some "config" 16 20 * bytes which describe this device's configuration. This is placed by the