Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

lguest: fix comment style

I don't really notice it (except to begrudge the extra vertical
space), but Ingo does. And he pointed out that one excuse for lguest
is as a teaching tool; it should set a good example.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>

+1906 -1015
+349 -191
Documentation/lguest/lguest.c
··· 1 - /*P:100 This is the Launcher code, a simple program which lays out the 2 - * "physical" memory for the new Guest by mapping the kernel image and 3 - * the virtual devices, then opens /dev/lguest to tell the kernel 4 - * about the Guest and control it. :*/ 1 + /*P:100 2 + * This is the Launcher code, a simple program which lays out the "physical" 3 + * memory for the new Guest by mapping the kernel image and the virtual 4 + * devices, then opens /dev/lguest to tell the kernel about the Guest and 5 + * control it. 6 + :*/ 5 7 #define _LARGEFILE64_SOURCE 6 8 #define _GNU_SOURCE 7 9 #include <stdio.h> ··· 48 46 #include "linux/virtio_rng.h" 49 47 #include "linux/virtio_ring.h" 50 48 #include "asm/bootparam.h" 51 - /*L:110 We can ignore the 39 include files we need for this program, but I do 52 - * want to draw attention to the use of kernel-style types. 49 + /*L:110 50 + * We can ignore the 39 include files we need for this program, but I do want 51 + * to draw attention to the use of kernel-style types. 53 52 * 54 53 * As Linus said, "C is a Spartan language, and so should your naming be." I 55 54 * like these abbreviations, so we define them here. Note that u64 is always 56 55 * unsigned long long, which works on all Linux systems: this means that we can 57 - * use %llu in printf for any u64. */ 56 + * use %llu in printf for any u64. 57 + */ 58 58 typedef unsigned long long u64; 59 59 typedef uint32_t u32; 60 60 typedef uint16_t u16; ··· 73 69 /* This will occupy 3 pages: it must be a power of 2. */ 74 70 #define VIRTQUEUE_NUM 256 75 71 76 - /*L:120 verbose is both a global flag and a macro. The C preprocessor allows 77 - * this, and although I wouldn't recommend it, it works quite nicely here. */ 72 + /*L:120 73 + * verbose is both a global flag and a macro. The C preprocessor allows 74 + * this, and although I wouldn't recommend it, it works quite nicely here. 75 + */ 78 76 static bool verbose; 79 77 #define verbose(args...) 
\ 80 78 do { if (verbose) printf(args); } while(0) ··· 106 100 107 101 /* A single linked list of devices. */ 108 102 struct device *dev; 109 - /* And a pointer to the last device for easy append and also for 110 - * configuration appending. */ 103 + /* And a pointer to the last device for easy append. */ 111 104 struct device *lastdev; 112 105 }; 113 106 ··· 173 168 /* The original tty settings to restore on exit. */ 174 169 static struct termios orig_term; 175 170 176 - /* We have to be careful with barriers: our devices are all run in separate 171 + /* 172 + * We have to be careful with barriers: our devices are all run in separate 177 173 * threads and so we need to make sure that changes visible to the Guest happen 178 - * in precise order. */ 174 + * in precise order. 175 + */ 179 176 #define wmb() __asm__ __volatile__("" : : : "memory") 180 177 #define mb() __asm__ __volatile__("" : : : "memory") 181 178 182 - /* Convert an iovec element to the given type. 179 + /* 180 + * Convert an iovec element to the given type. 183 181 * 184 182 * This is a fairly ugly trick: we need to know the size of the type and 185 183 * alignment requirement to check the pointer is kosher. It's also nice to 186 184 * have the name of the type in case we report failure. 187 185 * 188 186 * Typing those three things all the time is cumbersome and error prone, so we 189 - * have a macro which sets them all up and passes to the real function. */ 187 + * have a macro which sets them all up and passes to the real function. 188 + */ 190 189 #define convert(iov, type) \ 191 190 ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) 192 191 ··· 207 198 /* Wrapper for the last available index. Makes it easier to change. */ 208 199 #define lg_last_avail(vq) ((vq)->last_avail_idx) 209 200 210 - /* The virtio configuration space is defined to be little-endian. x86 is 211 - * little-endian too, but it's nice to be explicit so we have these helpers. 
*/ 201 + /* 202 + * The virtio configuration space is defined to be little-endian. x86 is 203 + * little-endian too, but it's nice to be explicit so we have these helpers. 204 + */ 212 205 #define cpu_to_le16(v16) (v16) 213 206 #define cpu_to_le32(v32) (v32) 214 207 #define cpu_to_le64(v64) (v64) ··· 252 241 + dev->num_vq * sizeof(struct lguest_vqconfig); 253 242 } 254 243 255 - /*L:100 The Launcher code itself takes us out into userspace, that scary place 256 - * where pointers run wild and free! Unfortunately, like most userspace 257 - * programs, it's quite boring (which is why everyone likes to hack on the 258 - * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it 259 - * will get you through this section. Or, maybe not. 244 + /*L:100 245 + * The Launcher code itself takes us out into userspace, that scary place where 246 + * pointers run wild and free! Unfortunately, like most userspace programs, 247 + * it's quite boring (which is why everyone likes to hack on the kernel!). 248 + * Perhaps if you make up an Lguest Drinking Game at this point, it will get 249 + * you through this section. Or, maybe not. 260 250 * 261 251 * The Launcher sets up a big chunk of memory to be the Guest's "physical" 262 252 * memory and stores it in "guest_base". In other words, Guest physical == ··· 265 253 * 266 254 * This can be tough to get your head around, but usually it just means that we 267 255 * use these trivial conversion functions when the Guest gives us it's 268 - * "physical" addresses: */ 256 + * "physical" addresses: 257 + */ 269 258 static void *from_guest_phys(unsigned long addr) 270 259 { 271 260 return guest_base + addr; ··· 281 268 * Loading the Kernel. 282 269 * 283 270 * We start with couple of simple helper routines. 
open_or_die() avoids 284 - * error-checking code cluttering the callers: */ 271 + * error-checking code cluttering the callers: 272 + */ 285 273 static int open_or_die(const char *name, int flags) 286 274 { 287 275 int fd = open(name, flags); ··· 297 283 int fd = open_or_die("/dev/zero", O_RDONLY); 298 284 void *addr; 299 285 300 - /* We use a private mapping (ie. if we write to the page, it will be 301 - * copied). */ 286 + /* 287 + * We use a private mapping (ie. if we write to the page, it will be 288 + * copied). 289 + */ 302 290 addr = mmap(NULL, getpagesize() * num, 303 291 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); 304 292 if (addr == MAP_FAILED) ··· 321 305 return addr; 322 306 } 323 307 324 - /* This routine is used to load the kernel or initrd. It tries mmap, but if 308 + /* 309 + * This routine is used to load the kernel or initrd. It tries mmap, but if 325 310 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), 326 - * it falls back to reading the memory in. */ 311 + * it falls back to reading the memory in. 312 + */ 327 313 static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) 328 314 { 329 315 ssize_t r; 330 316 331 - /* We map writable even though for some segments are marked read-only. 317 + /* 318 + * We map writable even though for some segments are marked read-only. 332 319 * The kernel really wants to be writable: it patches its own 333 320 * instructions. 334 321 * 335 322 * MAP_PRIVATE means that the page won't be copied until a write is 336 323 * done to it. This allows us to share untouched memory between 337 - * Guests. */ 324 + * Guests. 
325 + */ 338 326 if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, 339 327 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) 340 328 return; ··· 349 329 err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); 350 330 } 351 331 352 - /* This routine takes an open vmlinux image, which is in ELF, and maps it into 332 + /* 333 + * This routine takes an open vmlinux image, which is in ELF, and maps it into 353 334 * the Guest memory. ELF = Embedded Linking Format, which is the format used 354 335 * by all modern binaries on Linux including the kernel. 355 336 * ··· 358 337 * address. We use the physical address; the Guest will map itself to the 359 338 * virtual address. 360 339 * 361 - * We return the starting address. */ 340 + * We return the starting address. 341 + */ 362 342 static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) 363 343 { 364 344 Elf32_Phdr phdr[ehdr->e_phnum]; 365 345 unsigned int i; 366 346 367 - /* Sanity checks on the main ELF header: an x86 executable with a 368 - * reasonable number of correctly-sized program headers. */ 347 + /* 348 + * Sanity checks on the main ELF header: an x86 executable with a 349 + * reasonable number of correctly-sized program headers. 350 + */ 369 351 if (ehdr->e_type != ET_EXEC 370 352 || ehdr->e_machine != EM_386 371 353 || ehdr->e_phentsize != sizeof(Elf32_Phdr) 372 354 || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) 373 355 errx(1, "Malformed elf header"); 374 356 375 - /* An ELF executable contains an ELF header and a number of "program" 357 + /* 358 + * An ELF executable contains an ELF header and a number of "program" 376 359 * headers which indicate which parts ("segments") of the program to 377 - * load where. */ 360 + * load where. 
361 + */ 378 362 379 363 /* We read in all the program headers at once: */ 380 364 if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) ··· 387 361 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 388 362 err(1, "Reading program headers"); 389 363 390 - /* Try all the headers: there are usually only three. A read-only one, 391 - * a read-write one, and a "note" section which we don't load. */ 364 + /* 365 + * Try all the headers: there are usually only three. A read-only one, 366 + * a read-write one, and a "note" section which we don't load. 367 + */ 392 368 for (i = 0; i < ehdr->e_phnum; i++) { 393 369 /* If this isn't a loadable segment, we ignore it */ 394 370 if (phdr[i].p_type != PT_LOAD) ··· 408 380 return ehdr->e_entry; 409 381 } 410 382 411 - /*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're 412 - * supposed to jump into it and it will unpack itself. We used to have to 413 - * perform some hairy magic because the unpacking code scared me. 383 + /*L:150 384 + * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed 385 + * to jump into it and it will unpack itself. We used to have to perform some 386 + * hairy magic because the unpacking code scared me. 414 387 * 415 388 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote 416 389 * a small patch to jump over the tricky bits in the Guest, so now we just read 417 - * the funky header so we know where in the file to load, and away we go! */ 390 + * the funky header so we know where in the file to load, and away we go! 391 + */ 418 392 static unsigned long load_bzimage(int fd) 419 393 { 420 394 struct boot_params boot; ··· 424 394 /* Modern bzImages get loaded at 1M. */ 425 395 void *p = from_guest_phys(0x100000); 426 396 427 - /* Go back to the start of the file and read the header. It should be 428 - * a Linux boot header (see Documentation/x86/i386/boot.txt) */ 397 + /* 398 + * Go back to the start of the file and read the header. 
It should be 399 + * a Linux boot header (see Documentation/x86/i386/boot.txt) 400 + */ 429 401 lseek(fd, 0, SEEK_SET); 430 402 read(fd, &boot, sizeof(boot)); 431 403 ··· 446 414 return boot.hdr.code32_start; 447 415 } 448 416 449 - /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 417 + /*L:140 418 + * Loading the kernel is easy when it's a "vmlinux", but most kernels 450 419 * come wrapped up in the self-decompressing "bzImage" format. With a little 451 - * work, we can load those, too. */ 420 + * work, we can load those, too. 421 + */ 452 422 static unsigned long load_kernel(int fd) 453 423 { 454 424 Elf32_Ehdr hdr; ··· 467 433 return load_bzimage(fd); 468 434 } 469 435 470 - /* This is a trivial little helper to align pages. Andi Kleen hated it because 436 + /* 437 + * This is a trivial little helper to align pages. Andi Kleen hated it because 471 438 * it calls getpagesize() twice: "it's dumb code." 472 439 * 473 440 * Kernel guys get really het up about optimization, even when it's not 474 - * necessary. I leave this code as a reaction against that. */ 441 + * necessary. I leave this code as a reaction against that. 442 + */ 475 443 static inline unsigned long page_align(unsigned long addr) 476 444 { 477 445 /* Add upwards and truncate downwards. */ 478 446 return ((addr + getpagesize()-1) & ~(getpagesize()-1)); 479 447 } 480 448 481 - /*L:180 An "initial ram disk" is a disk image loaded into memory along with 482 - * the kernel which the kernel can use to boot from without needing any 483 - * drivers. Most distributions now use this as standard: the initrd contains 484 - * the code to load the appropriate driver modules for the current machine. 449 + /*L:180 450 + * An "initial ram disk" is a disk image loaded into memory along with the 451 + * kernel which the kernel can use to boot from without needing any drivers. 
452 + * Most distributions now use this as standard: the initrd contains the code to 453 + * load the appropriate driver modules for the current machine. 485 454 * 486 455 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its 487 - * kernels. He sent me this (and tells me when I break it). */ 456 + * kernels. He sent me this (and tells me when I break it). 457 + */ 488 458 static unsigned long load_initrd(const char *name, unsigned long mem) 489 459 { 490 460 int ifd; ··· 500 462 if (fstat(ifd, &st) < 0) 501 463 err(1, "fstat() on initrd '%s'", name); 502 464 503 - /* We map the initrd at the top of memory, but mmap wants it to be 504 - * page-aligned, so we round the size up for that. */ 465 + /* 466 + * We map the initrd at the top of memory, but mmap wants it to be 467 + * page-aligned, so we round the size up for that. 468 + */ 505 469 len = page_align(st.st_size); 506 470 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); 507 - /* Once a file is mapped, you can close the file descriptor. It's a 508 - * little odd, but quite useful. */ 471 + /* 472 + * Once a file is mapped, you can close the file descriptor. It's a 473 + * little odd, but quite useful. 474 + */ 509 475 close(ifd); 510 476 verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); 511 477 ··· 518 476 } 519 477 /*:*/ 520 478 521 - /* Simple routine to roll all the commandline arguments together with spaces 522 - * between them. */ 479 + /* 480 + * Simple routine to roll all the commandline arguments together with spaces 481 + * between them. 482 + */ 523 483 static void concat(char *dst, char *args[]) 524 484 { 525 485 unsigned int i, len = 0; ··· 538 494 dst[len] = '\0'; 539 495 } 540 496 541 - /*L:185 This is where we actually tell the kernel to initialize the Guest. We 497 + /*L:185 498 + * This is where we actually tell the kernel to initialize the Guest. 
We 542 499 * saw the arguments it expects when we looked at initialize() in lguest_user.c: 543 500 * the base of Guest "physical" memory, the top physical page to allow and the 544 - * entry point for the Guest. */ 501 + * entry point for the Guest. 502 + */ 545 503 static void tell_kernel(unsigned long start) 546 504 { 547 505 unsigned long args[] = { LHREQ_INITIALIZE, ··· 568 522 static void *_check_pointer(unsigned long addr, unsigned int size, 569 523 unsigned int line) 570 524 { 571 - /* We have to separately check addr and addr+size, because size could 572 - * be huge and addr + size might wrap around. */ 525 + /* 526 + * We have to separately check addr and addr+size, because size could 527 + * be huge and addr + size might wrap around. 528 + */ 573 529 if (addr >= guest_limit || addr + size >= guest_limit) 574 530 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); 575 - /* We return a pointer for the caller's convenience, now we know it's 576 - * safe to use. */ 531 + /* 532 + * We return a pointer for the caller's convenience, now we know it's 533 + * safe to use. 534 + */ 577 535 return from_guest_phys(addr); 578 536 } 579 537 /* A macro which transparently hands the line number to the real function. */ 580 538 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 581 539 582 - /* Each buffer in the virtqueues is actually a chain of descriptors. This 540 + /* 541 + * Each buffer in the virtqueues is actually a chain of descriptors. This 583 542 * function returns the next descriptor in the chain, or vq->vring.num if we're 584 - * at the end. */ 543 + * at the end. 
544 + */ 585 545 static unsigned next_desc(struct vring_desc *desc, 586 546 unsigned int i, unsigned int max) 587 547 { ··· 628 576 err(1, "Triggering irq %i", vq->config.irq); 629 577 } 630 578 631 - /* This looks in the virtqueue and for the first available buffer, and converts 579 + /* 580 + * This looks in the virtqueue and for the first available buffer, and converts 632 581 * it to an iovec for convenient access. Since descriptors consist of some 633 582 * number of output then some number of input descriptors, it's actually two 634 583 * iovecs, but we pack them into one and note how many of each there were. 635 584 * 636 - * This function returns the descriptor number found. */ 585 + * This function returns the descriptor number found. 586 + */ 637 587 static unsigned wait_for_vq_desc(struct virtqueue *vq, 638 588 struct iovec iov[], 639 589 unsigned int *out_num, unsigned int *in_num) ··· 653 599 /* OK, now we need to know about added descriptors. */ 654 600 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; 655 601 656 - /* They could have slipped one in as we were doing that: make 657 - * sure it's written, then check again. */ 602 + /* 603 + * They could have slipped one in as we were doing that: make 604 + * sure it's written, then check again. 605 + */ 658 606 mb(); 659 607 if (last_avail != vq->vring.avail->idx) { 660 608 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; ··· 676 620 errx(1, "Guest moved used index from %u to %u", 677 621 last_avail, vq->vring.avail->idx); 678 622 679 - /* Grab the next descriptor number they're advertising, and increment 680 - * the index we've seen. */ 623 + /* 624 + * Grab the next descriptor number they're advertising, and increment 625 + * the index we've seen. 
626 + */ 681 627 head = vq->vring.avail->ring[last_avail % vq->vring.num]; 682 628 lg_last_avail(vq)++; 683 629 ··· 694 636 desc = vq->vring.desc; 695 637 i = head; 696 638 697 - /* If this is an indirect entry, then this buffer contains a descriptor 698 - * table which we handle as if it's any normal descriptor chain. */ 639 + /* 640 + * If this is an indirect entry, then this buffer contains a descriptor 641 + * table which we handle as if it's any normal descriptor chain. 642 + */ 699 643 if (desc[i].flags & VRING_DESC_F_INDIRECT) { 700 644 if (desc[i].len % sizeof(struct vring_desc)) 701 645 errx(1, "Invalid size for indirect buffer table"); ··· 716 656 if (desc[i].flags & VRING_DESC_F_WRITE) 717 657 (*in_num)++; 718 658 else { 719 - /* If it's an output descriptor, they're all supposed 720 - * to come before any input descriptors. */ 659 + /* 660 + * If it's an output descriptor, they're all supposed 661 + * to come before any input descriptors. 662 + */ 721 663 if (*in_num) 722 664 errx(1, "Descriptor has out after in"); 723 665 (*out_num)++; ··· 733 671 return head; 734 672 } 735 673 736 - /* After we've used one of their buffers, we tell them about it. We'll then 737 - * want to send them an interrupt, using trigger_irq(). */ 674 + /* 675 + * After we've used one of their buffers, we tell them about it. We'll then 676 + * want to send them an interrupt, using trigger_irq(). 677 + */ 738 678 static void add_used(struct virtqueue *vq, unsigned int head, int len) 739 679 { 740 680 struct vring_used_elem *used; 741 681 742 - /* The virtqueue contains a ring of used buffers. Get a pointer to the 743 - * next entry in that used ring. */ 682 + /* 683 + * The virtqueue contains a ring of used buffers. Get a pointer to the 684 + * next entry in that used ring. 
685 + */ 744 686 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; 745 687 used->id = head; 746 688 used->len = len; ··· 764 698 /* 765 699 * The Console 766 700 * 767 - * We associate some data with the console for our exit hack. */ 701 + * We associate some data with the console for our exit hack. 702 + */ 768 703 struct console_abort 769 704 { 770 705 /* How many times have they hit ^C? */ ··· 792 725 if (len <= 0) { 793 726 /* Ran out of input? */ 794 727 warnx("Failed to get console input, ignoring console."); 795 - /* For simplicity, dying threads kill the whole Launcher. So 796 - * just nap here. */ 728 + /* 729 + * For simplicity, dying threads kill the whole Launcher. So 730 + * just nap here. 731 + */ 797 732 for (;;) 798 733 pause(); 799 734 } 800 735 801 736 add_used_and_trigger(vq, head, len); 802 737 803 - /* Three ^C within one second? Exit. 738 + /* 739 + * Three ^C within one second? Exit. 804 740 * 805 741 * This is such a hack, but works surprisingly well. Each ^C has to 806 742 * be in a buffer by itself, so they can't be too fast. But we check 807 743 * that we get three within about a second, so they can't be too 808 - * slow. */ 744 + * slow. 745 + */ 809 746 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { 810 747 abort->count = 0; 811 748 return; ··· 880 809 return select(fd+1, &fdset, NULL, NULL, &zero) != 1; 881 810 } 882 811 883 - /* This is where we handle packets coming in from the tun device to our 884 - * Guest. */ 812 + /* This handles packets coming in from the tun device to our Guest. */ 885 813 static void net_input(struct virtqueue *vq) 886 814 { 887 815 int len; ··· 912 842 return 0; 913 843 } 914 844 915 - /* When a child dies, we kill our entire process group with SIGTERM. This 916 - * also has the side effect that the shell restores the console for us! */ 845 + /* 846 + * When a child dies, we kill our entire process group with SIGTERM. 
This 847 + * also has the side effect that the shell restores the console for us! 848 + */ 917 849 static void kill_launcher(int signal) 918 850 { 919 851 kill(0, SIGTERM); ··· 952 880 953 881 static void create_thread(struct virtqueue *vq) 954 882 { 955 - /* Create stack for thread and run it. Since stack grows 956 - * upwards, we point the stack pointer to the end of this 957 - * region. */ 883 + /* 884 + * Create stack for thread and run it. Since the stack grows upwards, 885 + * we point the stack pointer to the end of this region. 886 + */ 958 887 char *stack = malloc(32768); 959 888 unsigned long args[] = { LHREQ_EVENTFD, 960 889 vq->config.pfn*getpagesize(), 0 }; ··· 1054 981 } 1055 982 } 1056 983 1057 - /* Early console write is done using notify on a nul-terminated string 1058 - * in Guest memory. */ 984 + /* 985 + * Early console write is done using notify on a nul-terminated string 986 + * in Guest memory. It's also great for hacking debugging messages 987 + * into a Guest. 988 + */ 1059 989 if (addr >= guest_limit) 1060 990 errx(1, "Bad NOTIFY %#lx", addr); 1061 991 ··· 1074 998 * routines to allocate and manage them. 1075 999 */ 1076 1000 1077 - /* The layout of the device page is a "struct lguest_device_desc" followed by a 1001 + /* 1002 + * The layout of the device page is a "struct lguest_device_desc" followed by a 1078 1003 * number of virtqueue descriptors, then two sets of feature bits, then an 1079 1004 * array of configuration bytes. This routine returns the configuration 1080 - * pointer. */ 1005 + * pointer. 1006 + */ 1081 1007 static u8 *device_config(const struct device *dev) 1082 1008 { 1083 1009 return (void *)(dev->desc + 1) ··· 1087 1009 + dev->feature_len * 2; 1088 1010 } 1089 1011 1090 - /* This routine allocates a new "struct lguest_device_desc" from descriptor 1012 + /* 1013 + * This routine allocates a new "struct lguest_device_desc" from descriptor 1091 1014 * table page just above the Guest's normal memory. 
It returns a pointer to 1092 - * that descriptor. */ 1015 + * that descriptor. 1016 + */ 1093 1017 static struct lguest_device_desc *new_dev_desc(u16 type) 1094 1018 { 1095 1019 struct lguest_device_desc d = { .type = type }; ··· 1112 1032 return memcpy(p, &d, sizeof(d)); 1113 1033 } 1114 1034 1115 - /* Each device descriptor is followed by the description of its virtqueues. We 1116 - * specify how many descriptors the virtqueue is to have. */ 1035 + /* 1036 + * Each device descriptor is followed by the description of its virtqueues. We 1037 + * specify how many descriptors the virtqueue is to have. 1038 + */ 1117 1039 static void add_virtqueue(struct device *dev, unsigned int num_descs, 1118 1040 void (*service)(struct virtqueue *)) 1119 1041 { ··· 1143 1061 /* Initialize the vring. */ 1144 1062 vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); 1145 1063 1146 - /* Append virtqueue to this device's descriptor. We use 1064 + /* 1065 + * Append virtqueue to this device's descriptor. We use 1147 1066 * device_config() to get the end of the device's current virtqueues; 1148 1067 * we check that we haven't added any config or feature information 1149 - * yet, otherwise we'd be overwriting them. */ 1068 + * yet, otherwise we'd be overwriting them. 1069 + */ 1150 1070 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); 1151 1071 memcpy(device_config(dev), &vq->config, sizeof(vq->config)); 1152 1072 dev->num_vq++; ··· 1156 1072 1157 1073 verbose("Virtqueue page %#lx\n", to_guest_phys(p)); 1158 1074 1159 - /* Add to tail of list, so dev->vq is first vq, dev->vq->next is 1160 - * second. */ 1075 + /* 1076 + * Add to tail of list, so dev->vq is first vq, dev->vq->next is 1077 + * second. 1078 + */ 1161 1079 for (i = &dev->vq; *i; i = &(*i)->next); 1162 1080 *i = vq; 1163 1081 } 1164 1082 1165 - /* The first half of the feature bitmask is for us to advertise features. The 1166 - * second half is for the Guest to accept features. 
*/ 1083 + /* 1084 + * The first half of the feature bitmask is for us to advertise features. The 1085 + * second half is for the Guest to accept features. 1086 + */ 1167 1087 static void add_feature(struct device *dev, unsigned bit) 1168 1088 { 1169 1089 u8 *features = get_feature_bits(dev); ··· 1181 1093 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); 1182 1094 } 1183 1095 1184 - /* This routine sets the configuration fields for an existing device's 1096 + /* 1097 + * This routine sets the configuration fields for an existing device's 1185 1098 * descriptor. It only works for the last device, but that's OK because that's 1186 - * how we use it. */ 1099 + * how we use it. 1100 + */ 1187 1101 static void set_config(struct device *dev, unsigned len, const void *conf) 1188 1102 { 1189 1103 /* Check we haven't overflowed our single page. */ ··· 1200 1110 assert(dev->desc->config_len == len); 1201 1111 } 1202 1112 1203 - /* This routine does all the creation and setup of a new device, including 1113 + /* 1114 + * This routine does all the creation and setup of a new device, including 1204 1115 * calling new_dev_desc() to allocate the descriptor and device memory. 1205 1116 * 1206 - * See what I mean about userspace being boring? */ 1117 + * See what I mean about userspace being boring? 1118 + */ 1207 1119 static struct device *new_device(const char *name, u16 type) 1208 1120 { 1209 1121 struct device *dev = malloc(sizeof(*dev)); ··· 1218 1126 dev->num_vq = 0; 1219 1127 dev->running = false; 1220 1128 1221 - /* Append to device list. Prepending to a single-linked list is 1129 + /* 1130 + * Append to device list. Prepending to a single-linked list is 1222 1131 * easier, but the user expects the devices to be arranged on the bus 1223 1132 * in command-line order. The first network device on the command line 1224 - * is eth0, the first block device /dev/vda, etc. */ 1133 + * is eth0, the first block device /dev/vda, etc. 
1134 + */ 1225 1135 if (devices.lastdev) 1226 1136 devices.lastdev->next = dev; 1227 1137 else ··· 1233 1139 return dev; 1234 1140 } 1235 1141 1236 - /* Our first setup routine is the console. It's a fairly simple device, but 1237 - * UNIX tty handling makes it uglier than it could be. */ 1142 + /* 1143 + * Our first setup routine is the console. It's a fairly simple device, but 1144 + * UNIX tty handling makes it uglier than it could be. 1145 + */ 1238 1146 static void setup_console(void) 1239 1147 { 1240 1148 struct device *dev; ··· 1244 1148 /* If we can save the initial standard input settings... */ 1245 1149 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { 1246 1150 struct termios term = orig_term; 1247 - /* Then we turn off echo, line buffering and ^C etc. We want a 1248 - * raw input stream to the Guest. */ 1151 + /* 1152 + * Then we turn off echo, line buffering and ^C etc: We want a 1153 + * raw input stream to the Guest. 1154 + */ 1249 1155 term.c_lflag &= ~(ISIG|ICANON|ECHO); 1250 1156 tcsetattr(STDIN_FILENO, TCSANOW, &term); 1251 1157 } ··· 1258 1160 dev->priv = malloc(sizeof(struct console_abort)); 1259 1161 ((struct console_abort *)dev->priv)->count = 0; 1260 1162 1261 - /* The console needs two virtqueues: the input then the output. When 1163 + /* 1164 + * The console needs two virtqueues: the input then the output. When 1262 1165 * they put something the input queue, we make sure we're listening to 1263 1166 * stdin. When they put something in the output queue, we write it to 1264 - * stdout. */ 1167 + * stdout. 1168 + */ 1265 1169 add_virtqueue(dev, VIRTQUEUE_NUM, console_input); 1266 1170 add_virtqueue(dev, VIRTQUEUE_NUM, console_output); 1267 1171 ··· 1271 1171 } 1272 1172 /*:*/ 1273 1173 1274 - /*M:010 Inter-guest networking is an interesting area. Simplest is to have a 1174 + /*M:010 1175 + * Inter-guest networking is an interesting area. Simplest is to have a 1275 1176 * --sharenet=<name> option which opens or creates a named pipe. 
This can be 1276 1177 * used to send packets to another guest in a 1:1 manner. 1277 1178 * ··· 1286 1185 * multiple inter-guest channels behind one interface, although it would 1287 1186 * require some manner of hotplugging new virtio channels. 1288 1187 * 1289 - * Finally, we could implement a virtio network switch in the kernel. :*/ 1188 + * Finally, we could implement a virtio network switch in the kernel. 1189 + :*/ 1290 1190 1291 1191 static u32 str2ip(const char *ipaddr) 1292 1192 { ··· 1312 1210 mac[5] = m[5]; 1313 1211 } 1314 1212 1315 - /* This code is "adapted" from libbridge: it attaches the Host end of the 1213 + /* 1214 + * This code is "adapted" from libbridge: it attaches the Host end of the 1316 1215 * network device to the bridge device specified by the command line. 1317 1216 * 1318 1217 * This is yet another James Morris contribution (I'm an IP-level guy, so I 1319 - * dislike bridging), and I just try not to break it. */ 1218 + * dislike bridging), and I just try not to break it. 1219 + */ 1320 1220 static void add_to_bridge(int fd, const char *if_name, const char *br_name) 1321 1221 { 1322 1222 int ifidx; ··· 1338 1234 err(1, "can't add %s to bridge %s", if_name, br_name); 1339 1235 } 1340 1236 1341 - /* This sets up the Host end of the network device with an IP address, brings 1237 + /* 1238 + * This sets up the Host end of the network device with an IP address, brings 1342 1239 * it up so packets will flow, the copies the MAC address into the hwaddr 1343 - * pointer. */ 1240 + * pointer. 1241 + */ 1344 1242 static void configure_device(int fd, const char *tapif, u32 ipaddr) 1345 1243 { 1346 1244 struct ifreq ifr; ··· 1369 1263 /* Start with this zeroed. Messy but sure. */ 1370 1264 memset(&ifr, 0, sizeof(ifr)); 1371 1265 1372 - /* We open the /dev/net/tun device and tell it we want a tap device. A 1266 + /* 1267 + * We open the /dev/net/tun device and tell it we want a tap device. 
A 1373 1268 * tap device is like a tun device, only somehow different. To tell 1374 1269 * the truth, I completely blundered my way through this code, but it 1375 - * works now! */ 1270 + * works now! 1271 + */ 1376 1272 netfd = open_or_die("/dev/net/tun", O_RDWR); 1377 1273 ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; 1378 1274 strcpy(ifr.ifr_name, "tap%d"); ··· 1385 1277 TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) 1386 1278 err(1, "Could not set features for tun device"); 1387 1279 1388 - /* We don't need checksums calculated for packets coming in this 1389 - * device: trust us! */ 1280 + /* 1281 + * We don't need checksums calculated for packets coming in this 1282 + * device: trust us! 1283 + */ 1390 1284 ioctl(netfd, TUNSETNOCSUM, 1); 1391 1285 1392 1286 memcpy(tapif, ifr.ifr_name, IFNAMSIZ); 1393 1287 return netfd; 1394 1288 } 1395 1289 1396 - /*L:195 Our network is a Host<->Guest network. This can either use bridging or 1290 + /*L:195 1291 + * Our network is a Host<->Guest network. This can either use bridging or 1397 1292 * routing, but the principle is the same: it uses the "tun" device to inject 1398 1293 * packets into the Host as if they came in from a normal network card. We 1399 - * just shunt packets between the Guest and the tun device. */ 1294 + * just shunt packets between the Guest and the tun device. 1295 + */ 1400 1296 static void setup_tun_net(char *arg) 1401 1297 { 1402 1298 struct device *dev; ··· 1417 1305 dev = new_device("net", VIRTIO_ID_NET); 1418 1306 dev->priv = net_info; 1419 1307 1420 - /* Network devices need a receive and a send queue, just like 1421 - * console. */ 1308 + /* Network devices need a recv and a send queue, just like console. */ 1422 1309 add_virtqueue(dev, VIRTQUEUE_NUM, net_input); 1423 1310 add_virtqueue(dev, VIRTQUEUE_NUM, net_output); 1424 1311 1425 - /* We need a socket to perform the magic network ioctls to bring up the 1426 - * tap interface, connect to the bridge etc. Any socket will do! 
*/ 1312 + /* 1313 + * We need a socket to perform the magic network ioctls to bring up the 1314 + * tap interface, connect to the bridge etc. Any socket will do! 1315 + */ 1427 1316 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 1428 1317 if (ipfd < 0) 1429 1318 err(1, "opening IP socket"); ··· 1479 1366 devices.device_num, tapif, arg); 1480 1367 } 1481 1368 1482 - /* Our block (disk) device should be really simple: the Guest asks for a block 1369 + /* 1370 + * Our block (disk) device should be really simple: the Guest asks for a block 1483 1371 * number and we read or write that position in the file. Unfortunately, that 1484 1372 * was amazingly slow: the Guest waits until the read is finished before 1485 1373 * running anything else, even if it could have been doing useful work. ··· 1488 1374 * We could use async I/O, except it's reputed to suck so hard that characters 1489 1375 * actually go missing from your code when you try to use it. 1490 1376 * 1491 - * So we farm the I/O out to thread, and communicate with it via a pipe. */ 1377 + * So this was one reason why lguest now does all virtqueue servicing in 1378 + * separate threads: it's more efficient and more like a real device. 1379 + */ 1492 1380 1493 1381 /* This hangs off device->priv. */ 1494 1382 struct vblk_info ··· 1528 1412 /* Get the next request. */ 1529 1413 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 1530 1414 1531 - /* Every block request should contain at least one output buffer 1415 + /* 1416 + * Every block request should contain at least one output buffer 1532 1417 * (detailing the location on disk and the type of request) and one 1533 - * input buffer (to hold the result). */ 1418 + * input buffer (to hold the result). 
1419 + */ 1534 1420 if (out_num == 0 || in_num == 0) 1535 1421 errx(1, "Bad virtblk cmd %u out=%u in=%u", 1536 1422 head, out_num, in_num); ··· 1541 1423 in = convert(&iov[out_num+in_num-1], u8); 1542 1424 off = out->sector * 512; 1543 1425 1544 - /* The block device implements "barriers", where the Guest indicates 1426 + /* 1427 + * The block device implements "barriers", where the Guest indicates 1545 1428 * that it wants all previous writes to occur before this write. We 1546 1429 * don't have a way of asking our kernel to do a barrier, so we just 1547 - * synchronize all the data in the file. Pretty poor, no? */ 1430 + * synchronize all the data in the file. Pretty poor, no? 1431 + */ 1548 1432 if (out->type & VIRTIO_BLK_T_BARRIER) 1549 1433 fdatasync(vblk->fd); 1550 1434 1551 - /* In general the virtio block driver is allowed to try SCSI commands. 1552 - * It'd be nice if we supported eject, for example, but we don't. */ 1435 + /* 1436 + * In general the virtio block driver is allowed to try SCSI commands. 1437 + * It'd be nice if we supported eject, for example, but we don't. 1438 + */ 1553 1439 if (out->type & VIRTIO_BLK_T_SCSI_CMD) { 1554 1440 fprintf(stderr, "Scsi commands unsupported\n"); 1555 1441 *in = VIRTIO_BLK_S_UNSUPP; 1556 1442 wlen = sizeof(*in); 1557 1443 } else if (out->type & VIRTIO_BLK_T_OUT) { 1558 - /* Write */ 1559 - 1560 - /* Move to the right location in the block file. This can fail 1561 - * if they try to write past end. */ 1444 + /* 1445 + * Write 1446 + * 1447 + * Move to the right location in the block file. This can fail 1448 + * if they try to write past end. 1449 + */ 1562 1450 if (lseek64(vblk->fd, off, SEEK_SET) != off) 1563 1451 err(1, "Bad seek to sector %llu", out->sector); 1564 1452 1565 1453 ret = writev(vblk->fd, iov+1, out_num-1); 1566 1454 verbose("WRITE to sector %llu: %i\n", out->sector, ret); 1567 1455 1568 - /* Grr... Now we know how long the descriptor they sent was, we 1456 + /* 1457 + * Grr... 
Now we know how long the descriptor they sent was, we 1569 1458 * make sure they didn't try to write over the end of the block 1570 - * file (possibly extending it). */ 1459 + * file (possibly extending it). 1460 + */ 1571 1461 if (ret > 0 && off + ret > vblk->len) { 1572 1462 /* Trim it back to the correct length */ 1573 1463 ftruncate64(vblk->fd, vblk->len); ··· 1585 1459 wlen = sizeof(*in); 1586 1460 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 1587 1461 } else { 1588 - /* Read */ 1589 - 1590 - /* Move to the right location in the block file. This can fail 1591 - * if they try to read past end. */ 1462 + /* 1463 + * Read 1464 + * 1465 + * Move to the right location in the block file. This can fail 1466 + * if they try to read past end. 1467 + */ 1592 1468 if (lseek64(vblk->fd, off, SEEK_SET) != off) 1593 1469 err(1, "Bad seek to sector %llu", out->sector); 1594 1470 ··· 1605 1477 } 1606 1478 } 1607 1479 1608 - /* OK, so we noted that it was pretty poor to use an fdatasync as a 1480 + /* 1481 + * OK, so we noted that it was pretty poor to use an fdatasync as a 1609 1482 * barrier. But Christoph Hellwig points out that we need a sync 1610 1483 * *afterwards* as well: "Barriers specify no reordering to the front 1611 - * or the back." And Jens Axboe confirmed it, so here we are: */ 1484 + * or the back." And Jens Axboe confirmed it, so here we are: 1485 + */ 1612 1486 if (out->type & VIRTIO_BLK_T_BARRIER) 1613 1487 fdatasync(vblk->fd); 1614 1488 ··· 1624 1494 struct vblk_info *vblk; 1625 1495 struct virtio_blk_config conf; 1626 1496 1627 - /* The device responds to return from I/O thread. */ 1497 + /* Creat the device. */ 1628 1498 dev = new_device("block", VIRTIO_ID_BLOCK); 1629 1499 1630 1500 /* The device has one virtqueue, where the Guest places requests. */ ··· 1643 1513 /* Tell Guest how many sectors this device has. 
*/ 1644 1514 conf.capacity = cpu_to_le64(vblk->len / 512); 1645 1515 1646 - /* Tell Guest not to put in too many descriptors at once: two are used 1647 - * for the in and out elements. */ 1516 + /* 1517 + * Tell Guest not to put in too many descriptors at once: two are used 1518 + * for the in and out elements. 1519 + */ 1648 1520 add_feature(dev, VIRTIO_BLK_F_SEG_MAX); 1649 1521 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); 1650 1522 ··· 1657 1525 ++devices.device_num, le64_to_cpu(conf.capacity)); 1658 1526 } 1659 1527 1660 - struct rng_info { 1661 - int rfd; 1662 - }; 1663 - 1664 - /* Our random number generator device reads from /dev/random into the Guest's 1528 + /*L:211 1529 + * Our random number generator device reads from /dev/random into the Guest's 1665 1530 * input buffers. The usual case is that the Guest doesn't want random numbers 1666 1531 * and so has no buffers although /dev/random is still readable, whereas 1667 1532 * console is the reverse. 1668 1533 * 1669 - * The same logic applies, however. */ 1534 + * The same logic applies, however. 1535 + */ 1536 + struct rng_info { 1537 + int rfd; 1538 + }; 1539 + 1670 1540 static void rng_input(struct virtqueue *vq) 1671 1541 { 1672 1542 int len; ··· 1681 1547 if (out_num) 1682 1548 errx(1, "Output buffers in rng?"); 1683 1549 1684 - /* This is why we convert to iovecs: the readv() call uses them, and so 1550 + /* 1551 + * This is why we convert to iovecs: the readv() call uses them, and so 1685 1552 * it reads straight into the Guest's buffer. We loop to make sure we 1686 - * fill it. */ 1553 + * fill it. 1554 + */ 1687 1555 while (!iov_empty(iov, in_num)) { 1688 1556 len = readv(rng_info->rfd, iov, in_num); 1689 1557 if (len <= 0) ··· 1698 1562 add_used(vq, head, totlen); 1699 1563 } 1700 1564 1701 - /* And this creates a "hardware" random number device for the Guest. */ 1565 + /*L:199 1566 + * This creates a "hardware" random number device for the Guest. 
1567 + */ 1702 1568 static void setup_rng(void) 1703 1569 { 1704 1570 struct device *dev; 1705 1571 struct rng_info *rng_info = malloc(sizeof(*rng_info)); 1706 1572 1573 + /* Our device's privat info simply contains the /dev/random fd. */ 1707 1574 rng_info->rfd = open_or_die("/dev/random", O_RDONLY); 1708 1575 1709 - /* The device responds to return from I/O thread. */ 1576 + /* Create the new device. */ 1710 1577 dev = new_device("rng", VIRTIO_ID_RNG); 1711 1578 dev->priv = rng_info; 1712 1579 ··· 1725 1586 { 1726 1587 unsigned int i; 1727 1588 1728 - /* Since we don't track all open fds, we simply close everything beyond 1729 - * stderr. */ 1589 + /* 1590 + * Since we don't track all open fds, we simply close everything beyond 1591 + * stderr. 1592 + */ 1730 1593 for (i = 3; i < FD_SETSIZE; i++) 1731 1594 close(i); 1732 1595 ··· 1739 1598 err(1, "Could not exec %s", main_args[0]); 1740 1599 } 1741 1600 1742 - /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves 1743 - * its input and output, and finally, lays it to rest. */ 1601 + /*L:220 1602 + * Finally we reach the core of the Launcher which runs the Guest, serves 1603 + * its input and output, and finally, lays it to rest. 1604 + */ 1744 1605 static void __attribute__((noreturn)) run_guest(void) 1745 1606 { 1746 1607 for (;;) { ··· 1777 1634 * 1778 1635 * Are you ready? Take a deep breath and join me in the core of the Host, in 1779 1636 * "make Host". 1780 - :*/ 1637 + :*/ 1781 1638 1782 1639 static struct option opts[] = { 1783 1640 { "verbose", 0, NULL, 'v' }, ··· 1798 1655 /*L:105 The main routine is where the real work begins: */ 1799 1656 int main(int argc, char *argv[]) 1800 1657 { 1801 - /* Memory, top-level pagetable, code startpoint and size of the 1802 - * (optional) initrd. */ 1658 + /* Memory, code startpoint and size of the (optional) initrd. */ 1803 1659 unsigned long mem = 0, start, initrd_size = 0; 1804 1660 /* Two temporaries. 
*/ 1805 1661 int i, c; ··· 1810 1668 /* Save the args: we "reboot" by execing ourselves again. */ 1811 1669 main_args = argv; 1812 1670 1813 - /* First we initialize the device list. We keep a pointer to the last 1671 + /* 1672 + * First we initialize the device list. We keep a pointer to the last 1814 1673 * device, and the next interrupt number to use for devices (1: 1815 - * remember that 0 is used by the timer). */ 1674 + * remember that 0 is used by the timer). 1675 + */ 1816 1676 devices.lastdev = NULL; 1817 1677 devices.next_irq = 1; 1818 1678 1819 1679 cpu_id = 0; 1820 - /* We need to know how much memory so we can set up the device 1680 + /* 1681 + * We need to know how much memory so we can set up the device 1821 1682 * descriptor and memory pages for the devices as we parse the command 1822 1683 * line. So we quickly look through the arguments to find the amount 1823 - * of memory now. */ 1684 + * of memory now. 1685 + */ 1824 1686 for (i = 1; i < argc; i++) { 1825 1687 if (argv[i][0] != '-') { 1826 1688 mem = atoi(argv[i]) * 1024 * 1024; 1827 - /* We start by mapping anonymous pages over all of 1689 + /* 1690 + * We start by mapping anonymous pages over all of 1828 1691 * guest-physical memory range. This fills it with 0, 1829 1692 * and ensures that the Guest won't be killed when it 1830 - * tries to access it. */ 1693 + * tries to access it. 1694 + */ 1831 1695 guest_base = map_zeroed_pages(mem / getpagesize() 1832 1696 + DEVICE_PAGES); 1833 1697 guest_limit = mem; ··· 1866 1718 usage(); 1867 1719 } 1868 1720 } 1869 - /* After the other arguments we expect memory and kernel image name, 1870 - * followed by command line arguments for the kernel. */ 1721 + /* 1722 + * After the other arguments we expect memory and kernel image name, 1723 + * followed by command line arguments for the kernel. 
1724 + */ 1871 1725 if (optind + 2 > argc) 1872 1726 usage(); 1873 1727 ··· 1887 1737 /* Map the initrd image if requested (at top of physical memory) */ 1888 1738 if (initrd_name) { 1889 1739 initrd_size = load_initrd(initrd_name, mem); 1890 - /* These are the location in the Linux boot header where the 1891 - * start and size of the initrd are expected to be found. */ 1740 + /* 1741 + * These are the location in the Linux boot header where the 1742 + * start and size of the initrd are expected to be found. 1743 + */ 1892 1744 boot->hdr.ramdisk_image = mem - initrd_size; 1893 1745 boot->hdr.ramdisk_size = initrd_size; 1894 1746 /* The bootloader type 0xFF means "unknown"; that's OK. */ 1895 1747 boot->hdr.type_of_loader = 0xFF; 1896 1748 } 1897 1749 1898 - /* The Linux boot header contains an "E820" memory map: ours is a 1899 - * simple, single region. */ 1750 + /* 1751 + * The Linux boot header contains an "E820" memory map: ours is a 1752 + * simple, single region. 1753 + */ 1900 1754 boot->e820_entries = 1; 1901 1755 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); 1902 - /* The boot header contains a command line pointer: we put the command 1903 - * line after the boot header. */ 1756 + /* 1757 + * The boot header contains a command line pointer: we put the command 1758 + * line after the boot header. 1759 + */ 1904 1760 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 1905 1761 /* We use a simple helper to copy the arguments separated by spaces. */ 1906 1762 concat((char *)(boot + 1), argv+optind+2); ··· 1920 1764 /* Tell the entry path not to try to reload segment registers. */ 1921 1765 boot->hdr.loadflags |= KEEP_SEGMENTS; 1922 1766 1923 - /* We tell the kernel to initialize the Guest: this returns the open 1924 - * /dev/lguest file descriptor. */ 1767 + /* 1768 + * We tell the kernel to initialize the Guest: this returns the open 1769 + * /dev/lguest file descriptor. 
1770 + */ 1925 1771 tell_kernel(start); 1926 1772 1927 1773 /* Ensure that we terminate if a child dies. */
+1 -2
arch/x86/include/asm/lguest.h
··· 17 17 /* Pages for switcher itself, then two pages per cpu */ 18 18 #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 19 19 20 - /* We map at -4M (-2M when PAE is activated) for ease of mapping 21 - * into the guest (one PTE page). */ 20 + /* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ 22 21 #ifdef CONFIG_X86_PAE 23 22 #define SWITCHER_ADDR 0xFFE00000 24 23 #else
+5 -5
arch/x86/include/asm/lguest_hcall.h
··· 30 30 #include <asm/hw_irq.h> 31 31 #include <asm/kvm_para.h> 32 32 33 - /*G:030 But first, how does our Guest contact the Host to ask for privileged 33 + /*G:030 34 + * But first, how does our Guest contact the Host to ask for privileged 34 35 * operations? There are two ways: the direct way is to make a "hypercall", 35 36 * to make requests of the Host Itself. 36 37 * ··· 42 41 * 43 42 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 44 43 * Host, rather than returning failure. This reflects Winston Churchill's 45 - * definition of a gentleman: "someone who is only rude intentionally". */ 46 - /*:*/ 44 + * definition of a gentleman: "someone who is only rude intentionally". 45 + :*/ 47 46 48 47 /* Can't use our min() macro here: needs to be a constant */ 49 48 #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 50 49 51 50 #define LHCALL_RING_SIZE 64 52 51 struct hcall_args { 53 - /* These map directly onto eax, ebx, ecx, edx and esi 54 - * in struct lguest_regs */ 52 + /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ 55 53 unsigned long arg0, arg1, arg2, arg3, arg4; 56 54 }; 57 55
+279 -149
arch/x86/lguest/boot.c
··· 22 22 * 23 23 * So how does the kernel know it's a Guest? We'll see that later, but let's 24 24 * just say that we end up here where we replace the native functions various 25 - * "paravirt" structures with our Guest versions, then boot like normal. :*/ 25 + * "paravirt" structures with our Guest versions, then boot like normal. 26 + :*/ 26 27 27 28 /* 28 29 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. ··· 75 74 * 76 75 * The Guest in our tale is a simple creature: identical to the Host but 77 76 * behaving in simplified but equivalent ways. In particular, the Guest is the 78 - * same kernel as the Host (or at least, built from the same source code). :*/ 77 + * same kernel as the Host (or at least, built from the same source code). 78 + :*/ 79 79 80 80 struct lguest_data lguest_data = { 81 81 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, ··· 87 85 .syscall_vec = SYSCALL_VECTOR, 88 86 }; 89 87 90 - /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 88 + /*G:037 89 + * async_hcall() is pretty simple: I'm quite proud of it really. We have a 91 90 * ring buffer of stored hypercalls which the Host will run though next time we 92 91 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall 93 92 * arguments, and a "hcall_status" word which is 0 if the call is ready to go, ··· 97 94 * If we come around to a slot which hasn't been finished, then the table is 98 95 * full and we just make the hypercall directly. This has the nice side 99 96 * effect of causing the Host to run all the stored calls in the ring buffer 100 - * which empties it for next time! */ 97 + * which empties it for next time! 
98 + */ 101 99 static void async_hcall(unsigned long call, unsigned long arg1, 102 100 unsigned long arg2, unsigned long arg3, 103 101 unsigned long arg4) ··· 107 103 static unsigned int next_call; 108 104 unsigned long flags; 109 105 110 - /* Disable interrupts if not already disabled: we don't want an 106 + /* 107 + * Disable interrupts if not already disabled: we don't want an 111 108 * interrupt handler making a hypercall while we're already doing 112 - * one! */ 109 + * one! 110 + */ 113 111 local_irq_save(flags); 114 112 if (lguest_data.hcall_status[next_call] != 0xFF) { 115 113 /* Table full, so do normal hcall which will flush table. */ ··· 131 125 local_irq_restore(flags); 132 126 } 133 127 134 - /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first 135 - * real optimization trick! 128 + /*G:035 129 + * Notice the lazy_hcall() above, rather than hcall(). This is our first real 130 + * optimization trick! 136 131 * 137 132 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do 138 133 * them as a batch when lazy_mode is eventually turned off. Because hypercalls ··· 143 136 * lguest_leave_lazy_mode(). 144 137 * 145 138 * So, when we're in lazy mode, we call async_hcall() to store the call for 146 - * future processing: */ 139 + * future processing: 140 + */ 147 141 static void lazy_hcall1(unsigned long call, 148 142 unsigned long arg1) 149 143 { ··· 216 208 * check there before it tries to deliver an interrupt. 217 209 */ 218 210 219 - /* save_flags() is expected to return the processor state (ie. "flags"). The 211 + /* 212 + * save_flags() is expected to return the processor state (ie. "flags"). The 220 213 * flags word contains all kind of stuff, but in practice Linux only cares 221 - * about the interrupt flag. Our "save_flags()" just returns that. */ 214 + * about the interrupt flag. Our "save_flags()" just returns that. 
215 + */ 222 216 static unsigned long save_fl(void) 223 217 { 224 218 return lguest_data.irq_enabled; ··· 232 222 lguest_data.irq_enabled = 0; 233 223 } 234 224 235 - /* Let's pause a moment. Remember how I said these are called so often? 225 + /* 226 + * Let's pause a moment. Remember how I said these are called so often? 236 227 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to 237 228 * break some rules. In particular, these functions are assumed to save their 238 229 * own registers if they need to: normal C functions assume they can trash the 239 230 * eax register. To use normal C functions, we use 240 231 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the 241 - * C function, then restores it. */ 232 + * C function, then restores it. 233 + */ 242 234 PV_CALLEE_SAVE_REGS_THUNK(save_fl); 243 235 PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 244 236 /*:*/ ··· 249 237 extern void lg_irq_enable(void); 250 238 extern void lg_restore_fl(unsigned long flags); 251 239 252 - /*M:003 Note that we don't check for outstanding interrupts when we re-enable 253 - * them (or when we unmask an interrupt). This seems to work for the moment, 254 - * since interrupts are rare and we'll just get the interrupt on the next timer 255 - * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way 256 - * would be to put the "irq_enabled" field in a page by itself, and have the 257 - * Host write-protect it when an interrupt comes in when irqs are disabled. 258 - * There will then be a page fault as soon as interrupts are re-enabled. 240 + /*M:003 241 + * Note that we don't check for outstanding interrupts when we re-enable them 242 + * (or when we unmask an interrupt). This seems to work for the moment, since 243 + * interrupts are rare and we'll just get the interrupt on the next timer tick, 244 + * but now we can run with CONFIG_NO_HZ, we should revisit this. 
One way would 245 + * be to put the "irq_enabled" field in a page by itself, and have the Host 246 + * write-protect it when an interrupt comes in when irqs are disabled. There 247 + * will then be a page fault as soon as interrupts are re-enabled. 259 248 * 260 249 * A better method is to implement soft interrupt disable generally for x86: 261 250 * instead of disabling interrupts, we set a flag. If an interrupt does come 262 251 * in, we then disable them for real. This is uncommon, so we could simply use 263 - * a hypercall for interrupt control and not worry about efficiency. :*/ 252 + * a hypercall for interrupt control and not worry about efficiency. 253 + :*/ 264 254 265 255 /*G:034 266 256 * The Interrupt Descriptor Table (IDT). ··· 275 261 static void lguest_write_idt_entry(gate_desc *dt, 276 262 int entrynum, const gate_desc *g) 277 263 { 278 - /* The gate_desc structure is 8 bytes long: we hand it to the Host in 264 + /* 265 + * The gate_desc structure is 8 bytes long: we hand it to the Host in 279 266 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors 280 267 * around like this; typesafety wasn't a big concern in Linux's early 281 - * years. */ 268 + * years. 269 + */ 282 270 u32 *desc = (u32 *)g; 283 271 /* Keep the local copy up to date. */ 284 272 native_write_idt_entry(dt, entrynum, g); ··· 288 272 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); 289 273 } 290 274 291 - /* Changing to a different IDT is very rare: we keep the IDT up-to-date every 275 + /* 276 + * Changing to a different IDT is very rare: we keep the IDT up-to-date every 292 277 * time it is written, so we can simply loop through all entries and tell the 293 - * Host about them. */ 278 + * Host about them. 
279 + */ 294 280 static void lguest_load_idt(const struct desc_ptr *desc) 295 281 { 296 282 unsigned int i; ··· 323 305 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); 324 306 } 325 307 326 - /* For a single GDT entry which changes, we do the lazy thing: alter our GDT, 308 + /* 309 + * For a single GDT entry which changes, we do the lazy thing: alter our GDT, 327 310 * then tell the Host to reload the entire thing. This operation is so rare 328 - * that this naive implementation is reasonable. */ 311 + * that this naive implementation is reasonable. 312 + */ 329 313 static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, 330 314 const void *desc, int type) 331 315 { ··· 337 317 dt[entrynum].a, dt[entrynum].b); 338 318 } 339 319 340 - /* OK, I lied. There are three "thread local storage" GDT entries which change 320 + /* 321 + * OK, I lied. There are three "thread local storage" GDT entries which change 341 322 * on every context switch (these three entries are how glibc implements 342 - * __thread variables). So we have a hypercall specifically for this case. */ 323 + * __thread variables). So we have a hypercall specifically for this case. 324 + */ 343 325 static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) 344 326 { 345 - /* There's one problem which normal hardware doesn't have: the Host 327 + /* 328 + * There's one problem which normal hardware doesn't have: the Host 346 329 * can't handle us removing entries we're currently using. So we clear 347 - * the GS register here: if it's needed it'll be reloaded anyway. */ 330 + * the GS register here: if it's needed it'll be reloaded anyway. 331 + */ 348 332 lazy_load_gs(0); 349 333 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); 350 334 } 351 335 352 - /*G:038 That's enough excitement for now, back to ploughing through each of 353 - * the different pv_ops structures (we're about 1/3 of the way through). 
336 + /*G:038 337 + * That's enough excitement for now, back to ploughing through each of the 338 + * different pv_ops structures (we're about 1/3 of the way through). 354 339 * 355 340 * This is the Local Descriptor Table, another weird Intel thingy. Linux only 356 341 * uses this for some strange applications like Wine. We don't do anything 357 - * here, so they'll get an informative and friendly Segmentation Fault. */ 342 + * here, so they'll get an informative and friendly Segmentation Fault. 343 + */ 358 344 static void lguest_set_ldt(const void *addr, unsigned entries) 359 345 { 360 346 } 361 347 362 - /* This loads a GDT entry into the "Task Register": that entry points to a 348 + /* 349 + * This loads a GDT entry into the "Task Register": that entry points to a 363 350 * structure called the Task State Segment. Some comments scattered though the 364 351 * kernel code indicate that this used for task switching in ages past, along 365 352 * with blood sacrifice and astrology. ··· 374 347 * Now there's nothing interesting in here that we don't get told elsewhere. 375 348 * But the native version uses the "ltr" instruction, which makes the Host 376 349 * complain to the Guest about a Segmentation Fault and it'll oops. So we 377 - * override the native version with a do-nothing version. */ 350 + * override the native version with a do-nothing version. 351 + */ 378 352 static void lguest_load_tr_desc(void) 379 353 { 380 354 } 381 355 382 - /* The "cpuid" instruction is a way of querying both the CPU identity 356 + /* 357 + * The "cpuid" instruction is a way of querying both the CPU identity 383 358 * (manufacturer, model, etc) and its features. It was introduced before the 384 359 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. 385 360 * As you might imagine, after a decade and a half this treatment, it is now a 386 361 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. 
387 362 * 388 363 * This instruction even it has its own Wikipedia entry. The Wikipedia entry 389 - * has been translated into 4 languages. I am not making this up! 364 + * has been translated into 5 languages. I am not making this up! 390 365 * 391 366 * We could get funky here and identify ourselves as "GenuineLguest", but 392 367 * instead we just use the real "cpuid" instruction. Then I pretty much turned ··· 400 371 * Replacing the cpuid so we can turn features off is great for the kernel, but 401 372 * anyone (including userspace) can just use the raw "cpuid" instruction and 402 373 * the Host won't even notice since it isn't privileged. So we try not to get 403 - * too worked up about it. */ 374 + * too worked up about it. 375 + */ 404 376 static void lguest_cpuid(unsigned int *ax, unsigned int *bx, 405 377 unsigned int *cx, unsigned int *dx) 406 378 { ··· 409 379 410 380 native_cpuid(ax, bx, cx, dx); 411 381 switch (function) { 412 - case 0: /* ID and highest CPUID. Futureproof a little by sticking to 413 - * older ones. */ 382 + /* 383 + * CPUID 0 gives the highest legal CPUID number (and the ID string). 384 + * We futureproof our code a little by sticking to known CPUID values. 385 + */ 386 + case 0: 414 387 if (*ax > 5) 415 388 *ax = 5; 416 389 break; 417 - case 1: /* Basic feature request. */ 418 - /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 390 + 391 + /* 392 + * CPUID 1 is a basic feature request. 393 + * 394 + * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 395 + * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. 396 + */ 397 + case 1: 419 398 *cx &= 0x00002201; 420 - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ 421 399 *dx &= 0x07808151; 422 - /* The Host can do a nice optimization if it knows that the 400 + /* 401 + * The Host can do a nice optimization if it knows that the 423 402 * kernel mappings (addresses above 0xC0000000 or whatever 424 403 * PAGE_OFFSET is set to) haven't changed. 
But Linux calls 425 404 * flush_tlb_user() for both user and kernel mappings unless 426 - * the Page Global Enable (PGE) feature bit is set. */ 405 + * the Page Global Enable (PGE) feature bit is set. 406 + */ 427 407 *dx |= 0x00002000; 428 - /* We also lie, and say we're family id 5. 6 or greater 408 + /* 409 + * We also lie, and say we're family id 5. 6 or greater 429 410 * leads to a rdmsr in early_init_intel which we can't handle. 430 - * Family ID is returned as bits 8-12 in ax. */ 411 + * Family ID is returned as bits 8-12 in ax. 412 + */ 431 413 *ax &= 0xFFFFF0FF; 432 414 *ax |= 0x00000500; 433 415 break; 416 + /* 417 + * 0x80000000 returns the highest Extended Function, so we futureproof 418 + * like we do above by limiting it to known fields. 419 + */ 434 420 case 0x80000000: 435 - /* Futureproof this a little: if they ask how much extended 436 - * processor information there is, limit it to known fields. */ 437 421 if (*ax > 0x80000008) 438 422 *ax = 0x80000008; 439 423 break; 424 + 425 + /* 426 + * PAE systems can mark pages as non-executable. Linux calls this the 427 + * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced 428 + * Virus Protection). We just switch turn if off here, since we don't 429 + * support it. 430 + */ 440 431 case 0x80000001: 441 - /* Here we should fix nx cap depending on host. */ 442 - /* For this version of PAE, we just clear NX bit. */ 443 432 *dx &= ~(1 << 20); 444 433 break; 445 434 } 446 435 } 447 436 448 - /* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. 437 + /* 438 + * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. 449 439 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother 450 440 * it. The Host needs to know when the Guest wants to change them, so we have 451 441 * a whole series of functions like read_cr0() and write_cr0(). ··· 480 430 * name like "FPUTRAP bit" be a little less cryptic? 
481 431 * 482 432 * We store cr0 locally because the Host never changes it. The Guest sometimes 483 - * wants to read it and we'd prefer not to bother the Host unnecessarily. */ 433 + * wants to read it and we'd prefer not to bother the Host unnecessarily. 434 + */ 484 435 static unsigned long current_cr0; 485 436 static void lguest_write_cr0(unsigned long val) 486 437 { ··· 494 443 return current_cr0; 495 444 } 496 445 497 - /* Intel provided a special instruction to clear the TS bit for people too cool 446 + /* 447 + * Intel provided a special instruction to clear the TS bit for people too cool 498 448 * to use write_cr0() to do it. This "clts" instruction is faster, because all 499 - * the vowels have been optimized out. */ 449 + * the vowels have been optimized out. 450 + */ 500 451 static void lguest_clts(void) 501 452 { 502 453 lazy_hcall1(LHCALL_TS, 0); 503 454 current_cr0 &= ~X86_CR0_TS; 504 455 } 505 456 506 - /* cr2 is the virtual address of the last page fault, which the Guest only ever 457 + /* 458 + * cr2 is the virtual address of the last page fault, which the Guest only ever 507 459 * reads. The Host kindly writes this into our "struct lguest_data", so we 508 - * just read it out of there. */ 460 + * just read it out of there. 461 + */ 509 462 static unsigned long lguest_read_cr2(void) 510 463 { 511 464 return lguest_data.cr2; ··· 518 463 /* See lguest_set_pte() below. */ 519 464 static bool cr3_changed = false; 520 465 521 - /* cr3 is the current toplevel pagetable page: the principle is the same as 466 + /* 467 + * cr3 is the current toplevel pagetable page: the principle is the same as 522 468 * cr0. Keep a local copy, and tell the Host when it changes. The only 523 469 * difference is that our local copy is in lguest_data because the Host needs 524 - * to set it upon our initial hypercall. */ 470 + * to set it upon our initial hypercall. 
471 + */ 525 472 static void lguest_write_cr3(unsigned long cr3) 526 473 { 527 474 lguest_data.pgdir = cr3; ··· 595 538 * the real page tables based on the Guests'. 596 539 */ 597 540 598 - /* The Guest calls this to set a second-level entry (pte), ie. to map a page 541 + /* 542 + * The Guest calls this to set a second-level entry (pte), ie. to map a page 599 543 * into a process' address space. We set the entry then tell the Host the 600 544 * toplevel and address this corresponds to. The Guest uses one pagetable per 601 - * process, so we need to tell the Host which one we're changing (mm->pgd). */ 545 + * process, so we need to tell the Host which one we're changing (mm->pgd). 546 + */ 602 547 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 603 548 pte_t *ptep) 604 549 { ··· 619 560 lguest_pte_update(mm, addr, ptep); 620 561 } 621 562 622 - /* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd 563 + /* 564 + * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd 623 565 * to set a middle-level entry when PAE is activated. 566 + * 624 567 * Again, we set the entry then tell the Host which page we changed, 625 - * and the index of the entry we changed. */ 568 + * and the index of the entry we changed. 569 + */ 626 570 #ifdef CONFIG_X86_PAE 627 571 static void lguest_set_pud(pud_t *pudp, pud_t pudval) 628 572 { ··· 644 582 } 645 583 #else 646 584 647 - /* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not 648 - * activated. */ 585 + /* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ 649 586 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 650 587 { 651 588 native_set_pmd(pmdp, pmdval); ··· 653 592 } 654 593 #endif 655 594 656 - /* There are a couple of legacy places where the kernel sets a PTE, but we 595 + /* 596 + * There are a couple of legacy places where the kernel sets a PTE, but we 657 597 * don't know the top level any more. 
This is useless for us, since we don't 658 598 * know which pagetable is changing or what address, so we just tell the Host 659 599 * to forget all of them. Fortunately, this is very rare. ··· 662 600 * ... except in early boot when the kernel sets up the initial pagetables, 663 601 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell 664 602 * the Host anything changed until we've done the first page table switch, 665 - * which brings boot back to 0.25 seconds. */ 603 + * which brings boot back to 0.25 seconds. 604 + */ 666 605 static void lguest_set_pte(pte_t *ptep, pte_t pteval) 667 606 { 668 607 native_set_pte(ptep, pteval); ··· 691 628 } 692 629 #endif 693 630 694 - /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 631 + /* 632 + * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 695 633 * native page table operations. On native hardware you can set a new page 696 634 * table entry whenever you want, but if you want to remove one you have to do 697 635 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). ··· 701 637 * called when a valid entry is written, not when it's removed (ie. marked not 702 638 * present). Instead, this is where we come when the Guest wants to remove a 703 639 * page table entry: we tell the Host to set that entry to 0 (ie. the present 704 - * bit is zero). */ 640 + * bit is zero). 641 + */ 705 642 static void lguest_flush_tlb_single(unsigned long addr) 706 643 { 707 644 /* Simply set it to zero: if it was not, it will fault back in. */ 708 645 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); 709 646 } 710 647 711 - /* This is what happens after the Guest has removed a large number of entries. 648 + /* 649 + * This is what happens after the Guest has removed a large number of entries. 712 650 * This tells the Host that any of the page table entries for userspace might 713 - * have changed, ie. virtual addresses below PAGE_OFFSET. 
*/ 651 + * have changed, ie. virtual addresses below PAGE_OFFSET. 652 + */ 714 653 static void lguest_flush_tlb_user(void) 715 654 { 716 655 lazy_hcall1(LHCALL_FLUSH_TLB, 0); 717 656 } 718 657 719 - /* This is called when the kernel page tables have changed. That's not very 658 + /* 659 + * This is called when the kernel page tables have changed. That's not very 720 660 * common (unless the Guest is using highmem, which makes the Guest extremely 721 - * slow), so it's worth separating this from the user flushing above. */ 661 + * slow), so it's worth separating this from the user flushing above. 662 + */ 722 663 static void lguest_flush_tlb_kernel(void) 723 664 { 724 665 lazy_hcall1(LHCALL_FLUSH_TLB, 1); ··· 760 691 .unmask = enable_lguest_irq, 761 692 }; 762 693 763 - /* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware 694 + /* 695 + * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware 764 696 * interrupt (except 128, which is used for system calls), and then tells the 765 697 * Linux infrastructure that each interrupt is controlled by our level-based 766 - * lguest interrupt controller. */ 698 + * lguest interrupt controller. 699 + */ 767 700 static void __init lguest_init_IRQ(void) 768 701 { 769 702 unsigned int i; 770 703 771 704 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 772 - /* Some systems map "vectors" to interrupts weirdly. Lguest has 773 - * a straightforward 1 to 1 mapping, so force that here. */ 705 + /* Some systems map "vectors" to interrupts weirdly. Not us! */ 774 706 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 775 707 if (i != SYSCALL_VECTOR) 776 708 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 777 709 } 778 - /* This call is required to set up for 4k stacks, where we have 779 - * separate stacks for hard and soft interrupts. */ 710 + 711 + /* 712 + * This call is required to set up for 4k stacks, where we have 713 + * separate stacks for hard and soft interrupts. 
714 + */ 780 715 irq_ctx_init(smp_processor_id()); 781 716 } 782 717 ··· 802 729 return lguest_data.time.tv_sec; 803 730 } 804 731 805 - /* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 732 + /* 733 + * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 806 734 * what speed it runs at, or 0 if it's unusable as a reliable clock source. 807 735 * This matches what we want here: if we return 0 from this function, the x86 808 - * TSC clock will give up and not register itself. */ 736 + * TSC clock will give up and not register itself. 737 + */ 809 738 static unsigned long lguest_tsc_khz(void) 810 739 { 811 740 return lguest_data.tsc_khz; 812 741 } 813 742 814 - /* If we can't use the TSC, the kernel falls back to our lower-priority 815 - * "lguest_clock", where we read the time value given to us by the Host. */ 743 + /* 744 + * If we can't use the TSC, the kernel falls back to our lower-priority 745 + * "lguest_clock", where we read the time value given to us by the Host. 746 + */ 816 747 static cycle_t lguest_clock_read(struct clocksource *cs) 817 748 { 818 749 unsigned long sec, nsec; 819 750 820 - /* Since the time is in two parts (seconds and nanoseconds), we risk 751 + /* 752 + * Since the time is in two parts (seconds and nanoseconds), we risk 821 753 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, 822 754 * and getting 99 and 0. As Linux tends to come apart under the stress 823 - * of time travel, we must be careful: */ 755 + * of time travel, we must be careful: 756 + */ 824 757 do { 825 758 /* First we read the seconds part. */ 826 759 sec = lguest_data.time.tv_sec; 827 - /* This read memory barrier tells the compiler and the CPU that 760 + /* 761 + * This read memory barrier tells the compiler and the CPU that 828 762 * this can't be reordered: we have to complete the above 829 - * before going on. */ 763 + * before going on. 
764 + */ 830 765 rmb(); 831 766 /* Now we read the nanoseconds part. */ 832 767 nsec = lguest_data.time.tv_nsec; ··· 858 777 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 859 778 }; 860 779 861 - /* We also need a "struct clock_event_device": Linux asks us to set it to go 780 + /* 781 + * We also need a "struct clock_event_device": Linux asks us to set it to go 862 782 * off some time in the future. Actually, James Morris figured all this out, I 863 - * just applied the patch. */ 783 + * just applied the patch. 784 + */ 864 785 static int lguest_clockevent_set_next_event(unsigned long delta, 865 786 struct clock_event_device *evt) 866 787 { ··· 912 829 .max_delta_ns = LG_CLOCK_MAX_DELTA, 913 830 }; 914 831 915 - /* This is the Guest timer interrupt handler (hardware interrupt 0). We just 916 - * call the clockevent infrastructure and it does whatever needs doing. */ 832 + /* 833 + * This is the Guest timer interrupt handler (hardware interrupt 0). We just 834 + * call the clockevent infrastructure and it does whatever needs doing. 835 + */ 917 836 static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) 918 837 { 919 838 unsigned long flags; ··· 926 841 local_irq_restore(flags); 927 842 } 928 843 929 - /* At some point in the boot process, we get asked to set up our timing 844 + /* 845 + * At some point in the boot process, we get asked to set up our timing 930 846 * infrastructure. The kernel doesn't expect timer interrupts before this, but 931 847 * we cleverly initialized the "blocked_interrupts" field of "struct 932 - * lguest_data" so that timer interrupts were blocked until now. */ 848 + * lguest_data" so that timer interrupts were blocked until now. 849 + */ 933 850 static void lguest_time_init(void) 934 851 { 935 852 /* Set up the timer interrupt (0) to go to our simple timer routine */ ··· 955 868 * to work. They're pretty simple. 956 869 */ 957 870 958 - /* The Guest needs to tell the Host what stack it expects traps to use. 
For 871 + /* 872 + * The Guest needs to tell the Host what stack it expects traps to use. For 959 873 * native hardware, this is part of the Task State Segment mentioned above in 960 874 * lguest_load_tr_desc(), but to help hypervisors there's this special call. 961 875 * 962 876 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data 963 877 * segment), the privilege level (we're privilege level 1, the Host is 0 and 964 878 * will not tolerate us trying to use that), the stack pointer, and the number 965 - * of pages in the stack. */ 879 + * of pages in the stack. 880 + */ 966 881 static void lguest_load_sp0(struct tss_struct *tss, 967 882 struct thread_struct *thread) 968 883 { ··· 978 889 /* FIXME: Implement */ 979 890 } 980 891 981 - /* There are times when the kernel wants to make sure that no memory writes are 892 + /* 893 + * There are times when the kernel wants to make sure that no memory writes are 982 894 * caught in the cache (that they've all reached real hardware devices). This 983 895 * doesn't matter for the Guest which has virtual hardware. 984 896 * ··· 993 903 { 994 904 } 995 905 996 - /* If the Guest expects to have an Advanced Programmable Interrupt Controller, 906 + /* 907 + * If the Guest expects to have an Advanced Programmable Interrupt Controller, 997 908 * we play dumb by ignoring writes and returning 0 for reads. So it's no 998 909 * longer Programmable nor Controlling anything, and I don't think 8 lines of 999 910 * code qualifies for Advanced. It will also never interrupt anything. It 1000 - * does, however, allow us to get through the Linux boot code. */ 911 + * does, however, allow us to get through the Linux boot code. 
912 + */ 1001 913 #ifdef CONFIG_X86_LOCAL_APIC 1002 914 static void lguest_apic_write(u32 reg, u32 v) 1003 915 { ··· 1048 956 kvm_hypercall0(LHCALL_HALT); 1049 957 } 1050 958 1051 - /* The SHUTDOWN hypercall takes a string to describe what's happening, and 959 + /* 960 + * The SHUTDOWN hypercall takes a string to describe what's happening, and 1052 961 * an argument which says whether this to restart (reboot) the Guest or not. 1053 962 * 1054 963 * Note that the Host always prefers that the Guest speak in physical addresses 1055 - * rather than virtual addresses, so we use __pa() here. */ 964 + * rather than virtual addresses, so we use __pa() here. 965 + */ 1056 966 static void lguest_power_off(void) 1057 967 { 1058 968 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), ··· 1085 991 * nice to move it back to lguest_init. Patch welcome... */ 1086 992 atomic_notifier_chain_register(&panic_notifier_list, &paniced); 1087 993 1088 - /* The Linux bootloader header contains an "e820" memory map: the 1089 - * Launcher populated the first entry with our memory limit. */ 994 + /* 995 + *The Linux bootloader header contains an "e820" memory map: the 996 + * Launcher populated the first entry with our memory limit. 997 + */ 1090 998 e820_add_region(boot_params.e820_map[0].addr, 1091 999 boot_params.e820_map[0].size, 1092 1000 boot_params.e820_map[0].type); ··· 1097 1001 return "LGUEST"; 1098 1002 } 1099 1003 1100 - /* We will eventually use the virtio console device to produce console output, 1004 + /* 1005 + * We will eventually use the virtio console device to produce console output, 1101 1006 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce 1102 - * console output. */ 1007 + * console output. 1008 + */ 1103 1009 static __init int early_put_chars(u32 vtermno, const char *buf, int count) 1104 1010 { 1105 1011 char scratch[17]; 1106 1012 unsigned int len = count; 1107 1013 1108 - /* We use a nul-terminated string, so we have to make a copy. 
Icky, 1109 - * huh? */ 1014 + /* We use a nul-terminated string, so we make a copy. Icky, huh? */ 1110 1015 if (len > sizeof(scratch) - 1) 1111 1016 len = sizeof(scratch) - 1; 1112 1017 scratch[len] = '\0'; ··· 1118 1021 return len; 1119 1022 } 1120 1023 1121 - /* Rebooting also tells the Host we're finished, but the RESTART flag tells the 1122 - * Launcher to reboot us. */ 1024 + /* 1025 + * Rebooting also tells the Host we're finished, but the RESTART flag tells the 1026 + * Launcher to reboot us. 1027 + */ 1123 1028 static void lguest_restart(char *reason) 1124 1029 { 1125 1030 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); ··· 1148 1049 * fit comfortably. 1149 1050 * 1150 1051 * First we need assembly templates of each of the patchable Guest operations, 1151 - * and these are in i386_head.S. */ 1052 + * and these are in i386_head.S. 1053 + */ 1152 1054 1153 1055 /*G:060 We construct a table from the assembler templates: */ 1154 1056 static const struct lguest_insns ··· 1160 1060 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1161 1061 }; 1162 1062 1163 - /* Now our patch routine is fairly simple (based on the native one in 1063 + /* 1064 + * Now our patch routine is fairly simple (based on the native one in 1164 1065 * paravirt.c). If we have a replacement, we copy it in and return how much of 1165 - * the available space we used. */ 1066 + * the available space we used. 1067 + */ 1166 1068 static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, 1167 1069 unsigned long addr, unsigned len) 1168 1070 { ··· 1176 1074 1177 1075 insn_len = lguest_insns[type].end - lguest_insns[type].start; 1178 1076 1179 - /* Similarly if we can't fit replacement (shouldn't happen, but let's 1180 - * be thorough). */ 1077 + /* Similarly if it can't fit (doesn't happen, but let's be thorough). 
*/ 1181 1078 if (len < insn_len) 1182 1079 return paravirt_patch_default(type, clobber, ibuf, addr, len); 1183 1080 ··· 1185 1084 return insn_len; 1186 1085 } 1187 1086 1188 - /*G:029 Once we get to lguest_init(), we know we're a Guest. The various 1087 + /*G:029 1088 + * Once we get to lguest_init(), we know we're a Guest. The various 1189 1089 * pv_ops structures in the kernel provide points for (almost) every routine we 1190 - * have to override to avoid privileged instructions. */ 1090 + * have to override to avoid privileged instructions. 1091 + */ 1191 1092 __init void lguest_init(void) 1192 1093 { 1193 - /* We're under lguest, paravirt is enabled, and we're running at 1194 - * privilege level 1, not 0 as normal. */ 1094 + /* We're under lguest. */ 1195 1095 pv_info.name = "lguest"; 1096 + /* Paravirt is enabled. */ 1196 1097 pv_info.paravirt_enabled = 1; 1098 + /* We're running at privilege level 1, not 0 as normal. */ 1197 1099 pv_info.kernel_rpl = 1; 1100 + /* Everyone except Xen runs with this set. */ 1198 1101 pv_info.shared_kernel_pmd = 1; 1199 1102 1200 - /* We set up all the lguest overrides for sensitive operations. These 1201 - * are detailed with the operations themselves. */ 1103 + /* 1104 + * We set up all the lguest overrides for sensitive operations. These 1105 + * are detailed with the operations themselves. 
1106 + */ 1202 1107 1203 - /* interrupt-related operations */ 1108 + /* Interrupt-related operations */ 1204 1109 pv_irq_ops.init_IRQ = lguest_init_IRQ; 1205 1110 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1206 1111 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); ··· 1214 1107 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); 1215 1108 pv_irq_ops.safe_halt = lguest_safe_halt; 1216 1109 1217 - /* init-time operations */ 1110 + /* Setup operations */ 1218 1111 pv_init_ops.memory_setup = lguest_memory_setup; 1219 1112 pv_init_ops.patch = lguest_patch; 1220 1113 1221 - /* Intercepts of various cpu instructions */ 1114 + /* Intercepts of various CPU instructions */ 1222 1115 pv_cpu_ops.load_gdt = lguest_load_gdt; 1223 1116 pv_cpu_ops.cpuid = lguest_cpuid; 1224 1117 pv_cpu_ops.load_idt = lguest_load_idt; ··· 1239 1132 pv_cpu_ops.start_context_switch = paravirt_start_context_switch; 1240 1133 pv_cpu_ops.end_context_switch = lguest_end_context_switch; 1241 1134 1242 - /* pagetable management */ 1135 + /* Pagetable management */ 1243 1136 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1244 1137 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; 1245 1138 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; ··· 1261 1154 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1262 1155 1263 1156 #ifdef CONFIG_X86_LOCAL_APIC 1264 - /* apic read/write intercepts */ 1157 + /* APIC read/write intercepts */ 1265 1158 set_lguest_basic_apic_ops(); 1266 1159 #endif 1267 1160 1268 - /* time operations */ 1161 + /* Time operations */ 1269 1162 pv_time_ops.get_wallclock = lguest_get_wallclock; 1270 1163 pv_time_ops.time_init = lguest_time_init; 1271 1164 pv_time_ops.get_tsc_khz = lguest_tsc_khz; 1272 1165 1273 - /* Now is a good time to look at the implementations of these functions 1274 - * before returning to the rest of lguest_init(). 
*/ 1166 + /* 1167 + * Now is a good time to look at the implementations of these functions 1168 + * before returning to the rest of lguest_init(). 1169 + */ 1275 1170 1276 - /*G:070 Now we've seen all the paravirt_ops, we return to 1171 + /*G:070 1172 + * Now we've seen all the paravirt_ops, we return to 1277 1173 * lguest_init() where the rest of the fairly chaotic boot setup 1278 - * occurs. */ 1174 + * occurs. 1175 + */ 1279 1176 1280 - /* The stack protector is a weird thing where gcc places a canary 1177 + /* 1178 + * The stack protector is a weird thing where gcc places a canary 1281 1179 * value on the stack and then checks it on return. This file is 1282 1180 * compiled with -fno-stack-protector, so we got this far without 1283 1181 * problems. The value of the canary is kept at offset 20 from the 1284 1182 * %gs register, so we need to set that up before calling C functions 1285 - * in other files. */ 1183 + * in other files. 1184 + */ 1286 1185 setup_stack_canary_segment(0); 1287 - /* We could just call load_stack_canary_segment(), but we might as 1288 - * call switch_to_new_gdt() which loads the whole table and sets up 1289 - * the per-cpu segment descriptor register %fs as well. */ 1186 + 1187 + /* 1188 + * We could just call load_stack_canary_segment(), but we might as well 1189 + * call switch_to_new_gdt() which loads the whole table and sets up the 1190 + * per-cpu segment descriptor register %fs as well. 1191 + */ 1290 1192 switch_to_new_gdt(0); 1291 1193 1292 1194 /* As described in head_32.S, we map the first 128M of memory. 
*/ 1293 1195 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1294 1196 1295 - /* The Host<->Guest Switcher lives at the top of our address space, and 1197 + /* 1198 + * The Host<->Guest Switcher lives at the top of our address space, and 1296 1199 * the Host told us how big it is when we made LGUEST_INIT hypercall: 1297 - * it put the answer in lguest_data.reserve_mem */ 1200 + * it put the answer in lguest_data.reserve_mem 1201 + */ 1298 1202 reserve_top_address(lguest_data.reserve_mem); 1299 1203 1300 - /* If we don't initialize the lock dependency checker now, it crashes 1301 - * paravirt_disable_iospace. */ 1204 + /* 1205 + * If we don't initialize the lock dependency checker now, it crashes 1206 + * paravirt_disable_iospace. 1207 + */ 1302 1208 lockdep_init(); 1303 1209 1304 - /* The IDE code spends about 3 seconds probing for disks: if we reserve 1210 + /* 1211 + * The IDE code spends about 3 seconds probing for disks: if we reserve 1305 1212 * all the I/O ports up front it can't get them and so doesn't probe. 1306 1213 * Other device drivers are similar (but less severe). This cuts the 1307 - * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ 1214 + * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. 1215 + */ 1308 1216 paravirt_disable_iospace(); 1309 1217 1310 - /* This is messy CPU setup stuff which the native boot code does before 1311 - * start_kernel, so we have to do, too: */ 1218 + /* 1219 + * This is messy CPU setup stuff which the native boot code does before 1220 + * start_kernel, so we have to do, too: 1221 + */ 1312 1222 cpu_detect(&new_cpu_data); 1313 1223 /* head.S usually sets up the first capability word, so do it here. */ 1314 1224 new_cpu_data.x86_capability[0] = cpuid_edx(1); ··· 1342 1218 acpi_ht = 0; 1343 1219 #endif 1344 1220 1345 - /* We set the preferred console to "hvc". This is the "hypervisor 1221 + /* 1222 + * We set the preferred console to "hvc". 
This is the "hypervisor 1346 1223 * virtual console" driver written by the PowerPC people, which we also 1347 - * adapted for lguest's use. */ 1224 + * adapted for lguest's use. 1225 + */ 1348 1226 add_preferred_console("hvc", 0, NULL); 1349 1227 1350 1228 /* Register our very early console. */ 1351 1229 virtio_cons_early_init(early_put_chars); 1352 1230 1353 - /* Last of all, we set the power management poweroff hook to point to 1231 + /* 1232 + * Last of all, we set the power management poweroff hook to point to 1354 1233 * the Guest routine to power off, and the reboot hook to our restart 1355 - * routine. */ 1234 + * routine. 1235 + */ 1356 1236 pm_power_off = lguest_power_off; 1357 1237 machine_ops.restart = lguest_restart; 1358 1238 1359 - /* Now we're set up, call i386_start_kernel() in head32.c and we proceed 1360 - * to boot as normal. It never returns. */ 1239 + /* 1240 + * Now we're set up, call i386_start_kernel() in head32.c and we proceed 1241 + * to boot as normal. It never returns. 1242 + */ 1361 1243 i386_start_kernel(); 1362 1244 } 1363 1245 /*
+68 -42
arch/x86/lguest/i386_head.S
··· 5 5 #include <asm/thread_info.h> 6 6 #include <asm/processor-flags.h> 7 7 8 - /*G:020 Our story starts with the kernel booting into startup_32 in 8 + /*G:020 9 + * Our story starts with the kernel booting into startup_32 in 9 10 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by 10 11 * the bootloader (the Launcher in our case). 11 12 * ··· 22 21 * data without remembering to subtract __PAGE_OFFSET! 23 22 * 24 23 * The .section line puts this code in .init.text so it will be discarded after 25 - * boot. */ 24 + * boot. 25 + */ 26 26 .section .init.text, "ax", @progbits 27 27 ENTRY(lguest_entry) 28 - /* We make the "initialization" hypercall now to tell the Host about 29 - * us, and also find out where it put our page tables. */ 28 + /* 29 + * We make the "initialization" hypercall now to tell the Host about 30 + * us, and also find out where it put our page tables. 31 + */ 30 32 movl $LHCALL_LGUEST_INIT, %eax 31 33 movl $lguest_data - __PAGE_OFFSET, %ebx 32 34 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ ··· 37 33 /* Set up the initial stack so we can run C code. */ 38 34 movl $(init_thread_union+THREAD_SIZE),%esp 39 35 40 - /* Jumps are relative, and we're running __PAGE_OFFSET too low at the 41 - * moment. */ 36 + /* Jumps are relative: we're running __PAGE_OFFSET too low. */ 42 37 jmp lguest_init+__PAGE_OFFSET 43 38 44 - /*G:055 We create a macro which puts the assembler code between lgstart_ and 45 - * lgend_ markers. These templates are put in the .text section: they can't be 46 - * discarded after boot as we may need to patch modules, too. */ 39 + /*G:055 40 + * We create a macro which puts the assembler code between lgstart_ and lgend_ 41 + * markers. These templates are put in the .text section: they can't be 42 + * discarded after boot as we may need to patch modules, too. 43 + */ 47 44 .text 48 45 #define LGUEST_PATCH(name, insns...) 
\ 49 46 lgstart_##name: insns; lgend_##name:; \ ··· 53 48 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 54 49 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 55 50 56 - /*G:033 But using those wrappers is inefficient (we'll see why that doesn't 57 - * matter for save_fl and irq_disable later). If we write our routines 58 - * carefully in assembler, we can avoid clobbering any registers and avoid 59 - * jumping through the wrapper functions. 51 + /*G:033 52 + * But using those wrappers is inefficient (we'll see why that doesn't matter 53 + * for save_fl and irq_disable later). If we write our routines carefully in 54 + * assembler, we can avoid clobbering any registers and avoid jumping through 55 + * the wrapper functions. 60 56 * 61 57 * I skipped over our first piece of assembler, but this one is worth studying 62 - * in a bit more detail so I'll describe in easy stages. First, the routine 63 - * to enable interrupts: */ 58 + * in a bit more detail so I'll describe in easy stages. First, the routine to 59 + * enable interrupts: 60 + */ 64 61 ENTRY(lg_irq_enable) 65 - /* The reverse of irq_disable, this sets lguest_data.irq_enabled to 66 - * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ 62 + /* 63 + * The reverse of irq_disable, this sets lguest_data.irq_enabled to 64 + * X86_EFLAGS_IF (ie. "Interrupts enabled"). 65 + */ 67 66 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled 68 - /* But now we need to check if the Host wants to know: there might have 67 + /* 68 + * But now we need to check if the Host wants to know: there might have 69 69 * been interrupts waiting to be delivered, in which case it will have 70 70 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we 71 - * jump to send_interrupts, otherwise we're done. */ 71 + * jump to send_interrupts, otherwise we're done. 
72 + */ 72 73 testl $0, lguest_data+LGUEST_DATA_irq_pending 73 74 jnz send_interrupts 74 - /* One cool thing about x86 is that you can do many things without using 75 + /* 76 + * One cool thing about x86 is that you can do many things without using 75 77 * a register. In this case, the normal path hasn't needed to save or 76 - * restore any registers at all! */ 78 + * restore any registers at all! 79 + */ 77 80 ret 78 81 send_interrupts: 79 - /* OK, now we need a register: eax is used for the hypercall number, 82 + /* 83 + * OK, now we need a register: eax is used for the hypercall number, 80 84 * which is LHCALL_SEND_INTERRUPTS. 81 85 * 82 86 * We used not to bother with this pending detection at all, which was 83 87 * much simpler. Sooner or later the Host would realize it had to 84 88 * send us an interrupt. But that turns out to make performance 7 85 89 * times worse on a simple tcp benchmark. So now we do this the hard 86 - * way. */ 90 + * way. 91 + */ 87 92 pushl %eax 88 93 movl $LHCALL_SEND_INTERRUPTS, %eax 89 - /* This is a vmcall instruction (same thing that KVM uses). Older 94 + /* 95 + * This is a vmcall instruction (same thing that KVM uses). Older 90 96 * assembler versions might not know the "vmcall" instruction, so we 91 - * create one manually here. */ 97 + * create one manually here. 98 + */ 92 99 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 93 100 popl %eax 94 101 ret 95 102 96 - /* Finally, the "popf" or "restore flags" routine. The %eax register holds the 103 + /* 104 + * Finally, the "popf" or "restore flags" routine. The %eax register holds the 97 105 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're 98 - * enabling interrupts again, if it's 0 we're leaving them off. */ 106 + * enabling interrupts again, if it's 0 we're leaving them off. 
107 + */ 99 108 ENTRY(lg_restore_fl) 100 109 /* This is just "lguest_data.irq_enabled = flags;" */ 101 110 movl %eax, lguest_data+LGUEST_DATA_irq_enabled 102 - /* Now, if the %eax value has enabled interrupts and 111 + /* 112 + * Now, if the %eax value has enabled interrupts and 103 113 * lguest_data.irq_pending is set, we want to tell the Host so it can 104 114 * deliver any outstanding interrupts. Fortunately, both values will 105 115 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" 106 116 * instruction will AND them together for us. If both are set, we 107 - * jump to send_interrupts. */ 117 + * jump to send_interrupts. 118 + */ 108 119 testl lguest_data+LGUEST_DATA_irq_pending, %eax 109 120 jnz send_interrupts 110 121 /* Again, the normal path has used no extra registers. Clever, huh? */ ··· 130 109 .global lguest_noirq_start 131 110 .global lguest_noirq_end 132 111 133 - /*M:004 When the Host reflects a trap or injects an interrupt into the Guest, 134 - * it sets the eflags interrupt bit on the stack based on 135 - * lguest_data.irq_enabled, so the Guest iret logic does the right thing when 136 - * restoring it. However, when the Host sets the Guest up for direct traps, 137 - * such as system calls, the processor is the one to push eflags onto the 138 - * stack, and the interrupt bit will be 1 (in reality, interrupts are always 139 - * enabled in the Guest). 112 + /*M:004 113 + * When the Host reflects a trap or injects an interrupt into the Guest, it 114 + * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, 115 + * so the Guest iret logic does the right thing when restoring it. However, 116 + * when the Host sets the Guest up for direct traps, such as system calls, the 117 + * processor is the one to push eflags onto the stack, and the interrupt bit 118 + * will be 1 (in reality, interrupts are always enabled in the Guest). 
140 119 * 141 120 * This turns out to be harmless: the only trap which should happen under Linux 142 121 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc 143 122 * regions), which has to be reflected through the Host anyway. If another 144 123 * trap *does* go off when interrupts are disabled, the Guest will panic, and 145 - * we'll never get to this iret! :*/ 124 + * we'll never get to this iret! 125 + :*/ 146 126 147 - /*G:045 There is one final paravirt_op that the Guest implements, and glancing 148 - * at it you can see why I left it to last. It's *cool*! It's in *assembler*! 127 + /*G:045 128 + * There is one final paravirt_op that the Guest implements, and glancing at it 129 + * you can see why I left it to last. It's *cool*! It's in *assembler*! 149 130 * 150 131 * The "iret" instruction is used to return from an interrupt or trap. The 151 132 * stack looks like this: ··· 171 148 * return to userspace or wherever. Our solution to this is to surround the 172 149 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the 173 150 * Host that it is *never* to interrupt us there, even if interrupts seem to be 174 - * enabled. */ 151 + * enabled. 152 + */ 175 153 ENTRY(lguest_iret) 176 154 pushl %eax 177 155 movl 12(%esp), %eax 178 156 lguest_noirq_start: 179 - /* Note the %ss: segment prefix here. Normal data accesses use the 157 + /* 158 + * Note the %ss: segment prefix here. Normal data accesses use the 180 159 * "ds" segment, but that will have already been restored for whatever 181 160 * we're returning to (such as userspace): we can't trust it. The %ss: 182 - * prefix makes sure we use the stack segment, which is still valid. */ 161 + * prefix makes sure we use the stack segment, which is still valid. 162 + */ 183 163 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled 184 164 popl %eax 185 165 iret
+75 -39
drivers/lguest/core.c
··· 1 - /*P:400 This contains run_guest() which actually calls into the Host<->Guest 1 + /*P:400 2 + * This contains run_guest() which actually calls into the Host<->Guest 2 3 * Switcher and analyzes the return, such as determining if the Guest wants the 3 - * Host to do something. This file also contains useful helper routines. :*/ 4 + * Host to do something. This file also contains useful helper routines. 5 + :*/ 4 6 #include <linux/module.h> 5 7 #include <linux/stringify.h> 6 8 #include <linux/stddef.h> ··· 26 24 /* This One Big lock protects all inter-guest data structures. */ 27 25 DEFINE_MUTEX(lguest_lock); 28 26 29 - /*H:010 We need to set up the Switcher at a high virtual address. Remember the 27 + /*H:010 28 + * We need to set up the Switcher at a high virtual address. Remember the 30 29 * Switcher is a few hundred bytes of assembler code which actually changes the 31 30 * CPU to run the Guest, and then changes back to the Host when a trap or 32 31 * interrupt happens. ··· 36 33 * Host since it will be running as the switchover occurs. 37 34 * 38 35 * Trying to map memory at a particular address is an unusual thing to do, so 39 - * it's not a simple one-liner. */ 36 + * it's not a simple one-liner. 37 + */ 40 38 static __init int map_switcher(void) 41 39 { 42 40 int i, err; ··· 51 47 * easy. 52 48 */ 53 49 54 - /* We allocate an array of struct page pointers. map_vm_area() wants 55 - * this, rather than just an array of pages. */ 50 + /* 51 + * We allocate an array of struct page pointers. map_vm_area() wants 52 + * this, rather than just an array of pages. 53 + */ 56 54 switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, 57 55 GFP_KERNEL); 58 56 if (!switcher_page) { ··· 62 56 goto out; 63 57 } 64 58 65 - /* Now we actually allocate the pages. The Guest will see these pages, 66 - * so we make sure they're zeroed. */ 59 + /* 60 + * Now we actually allocate the pages. The Guest will see these pages, 61 + * so we make sure they're zeroed. 
62 + */ 67 63 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { 68 64 unsigned long addr = get_zeroed_page(GFP_KERNEL); 69 65 if (!addr) { ··· 75 67 switcher_page[i] = virt_to_page(addr); 76 68 } 77 69 78 - /* First we check that the Switcher won't overlap the fixmap area at 70 + /* 71 + * First we check that the Switcher won't overlap the fixmap area at 79 72 * the top of memory. It's currently nowhere near, but it could have 80 - * very strange effects if it ever happened. */ 73 + * very strange effects if it ever happened. 74 + */ 81 75 if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ 82 76 err = -ENOMEM; 83 77 printk("lguest: mapping switcher would thwack fixmap\n"); 84 78 goto free_pages; 85 79 } 86 80 87 - /* Now we reserve the "virtual memory area" we want: 0xFFC00000 81 + /* 82 + * Now we reserve the "virtual memory area" we want: 0xFFC00000 88 83 * (SWITCHER_ADDR). We might not get it in theory, but in practice 89 84 * it's worked so far. The end address needs +1 because __get_vm_area 90 - * allocates an extra guard page, so we need space for that. */ 85 + * allocates an extra guard page, so we need space for that. 86 + */ 91 87 switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, 92 88 VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR 93 89 + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); ··· 101 89 goto free_pages; 102 90 } 103 91 104 - /* This code actually sets up the pages we've allocated to appear at 92 + /* 93 + * This code actually sets up the pages we've allocated to appear at 105 94 * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the 106 95 * kind of pages we're mapping (kernel pages), and a pointer to our 107 96 * array of struct pages. It increments that pointer, but we don't 108 - * care. */ 97 + * care. 
98 + */ 109 99 pagep = switcher_page; 110 100 err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); 111 101 if (err) { ··· 115 101 goto free_vma; 116 102 } 117 103 118 - /* Now the Switcher is mapped at the right address, we can't fail! 119 - * Copy in the compiled-in Switcher code (from <arch>_switcher.S). */ 104 + /* 105 + * Now the Switcher is mapped at the right address, we can't fail! 106 + * Copy in the compiled-in Switcher code (from <arch>_switcher.S). 107 + */ 120 108 memcpy(switcher_vma->addr, start_switcher_text, 121 109 end_switcher_text - start_switcher_text); 122 110 ··· 140 124 } 141 125 /*:*/ 142 126 143 - /* Cleaning up the mapping when the module is unloaded is almost... 144 - * too easy. */ 127 + /* Cleaning up the mapping when the module is unloaded is almost... too easy. */ 145 128 static void unmap_switcher(void) 146 129 { 147 130 unsigned int i; ··· 166 151 * But we can't trust the Guest: it might be trying to access the Launcher 167 152 * code. We have to check that the range is below the pfn_limit the Launcher 168 153 * gave us. We have to make sure that addr + len doesn't give us a false 169 - * positive by overflowing, too. */ 154 + * positive by overflowing, too. 155 + */ 170 156 bool lguest_address_ok(const struct lguest *lg, 171 157 unsigned long addr, unsigned long len) 172 158 { 173 159 return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); 174 160 } 175 161 176 - /* This routine copies memory from the Guest. Here we can see how useful the 162 + /* 163 + * This routine copies memory from the Guest. Here we can see how useful the 177 164 * kill_lguest() routine we met in the Launcher can be: we return a random 178 - * value (all zeroes) instead of needing to return an error. */ 165 + * value (all zeroes) instead of needing to return an error. 
166 + */ 179 167 void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) 180 168 { 181 169 if (!lguest_address_ok(cpu->lg, addr, bytes) ··· 199 181 } 200 182 /*:*/ 201 183 202 - /*H:030 Let's jump straight to the the main loop which runs the Guest. 184 + /*H:030 185 + * Let's jump straight to the the main loop which runs the Guest. 203 186 * Remember, this is called by the Launcher reading /dev/lguest, and we keep 204 - * going around and around until something interesting happens. */ 187 + * going around and around until something interesting happens. 188 + */ 205 189 int run_guest(struct lg_cpu *cpu, unsigned long __user *user) 206 190 { 207 191 /* We stop running once the Guest is dead. */ ··· 215 195 if (cpu->hcall) 216 196 do_hypercalls(cpu); 217 197 218 - /* It's possible the Guest did a NOTIFY hypercall to the 219 - * Launcher, in which case we return from the read() now. */ 198 + /* 199 + * It's possible the Guest did a NOTIFY hypercall to the 200 + * Launcher, in which case we return from the read() now. 201 + */ 220 202 if (cpu->pending_notify) { 221 203 if (!send_notify_to_eventfd(cpu)) { 222 204 if (put_user(cpu->pending_notify, user)) ··· 231 209 if (signal_pending(current)) 232 210 return -ERESTARTSYS; 233 211 234 - /* Check if there are any interrupts which can be delivered now: 212 + /* 213 + * Check if there are any interrupts which can be delivered now: 235 214 * if so, this sets up the hander to be executed when we next 236 - * run the Guest. */ 215 + * run the Guest. 216 + */ 237 217 irq = interrupt_pending(cpu, &more); 238 218 if (irq < LGUEST_IRQS) 239 219 try_deliver_interrupt(cpu, irq, more); 240 220 241 - /* All long-lived kernel loops need to check with this horrible 221 + /* 222 + * All long-lived kernel loops need to check with this horrible 242 223 * thing called the freezer. If the Host is trying to suspend, 243 - * it stops us. */ 224 + * it stops us. 
225 + */ 244 226 try_to_freeze(); 245 227 246 - /* Just make absolutely sure the Guest is still alive. One of 247 - * those hypercalls could have been fatal, for example. */ 228 + /* 229 + * Just make absolutely sure the Guest is still alive. One of 230 + * those hypercalls could have been fatal, for example. 231 + */ 248 232 if (cpu->lg->dead) 249 233 break; 250 234 251 - /* If the Guest asked to be stopped, we sleep. The Guest's 252 - * clock timer will wake us. */ 235 + /* 236 + * If the Guest asked to be stopped, we sleep. The Guest's 237 + * clock timer will wake us. 238 + */ 253 239 if (cpu->halted) { 254 240 set_current_state(TASK_INTERRUPTIBLE); 255 - /* Just before we sleep, make sure no interrupt snuck in 256 - * which we should be doing. */ 241 + /* 242 + * Just before we sleep, make sure no interrupt snuck in 243 + * which we should be doing. 244 + */ 257 245 if (interrupt_pending(cpu, &more) < LGUEST_IRQS) 258 246 set_current_state(TASK_RUNNING); 259 247 else ··· 271 239 continue; 272 240 } 273 241 274 - /* OK, now we're ready to jump into the Guest. First we put up 275 - * the "Do Not Disturb" sign: */ 242 + /* 243 + * OK, now we're ready to jump into the Guest. First we put up 244 + * the "Do Not Disturb" sign: 245 + */ 276 246 local_irq_disable(); 277 247 278 248 /* Actually run the Guest until something happens. */ ··· 361 327 } 362 328 /*:*/ 363 329 364 - /* The Host side of lguest can be a module. This is a nice way for people to 365 - * play with it. */ 330 + /* 331 + * The Host side of lguest can be a module. This is a nice way for people to 332 + * play with it. 333 + */ 366 334 module_init(init); 367 335 module_exit(fini); 368 336 MODULE_LICENSE("GPL");
+92 -49
drivers/lguest/hypercalls.c
··· 1 - /*P:500 Just as userspace programs request kernel operations through a system 1 + /*P:500 2 + * Just as userspace programs request kernel operations through a system 2 3 * call, the Guest requests Host operations through a "hypercall". You might 3 4 * notice this nomenclature doesn't really follow any logic, but the name has 4 5 * been around for long enough that we're stuck with it. As you'd expect, this 5 - * code is basically a one big switch statement. :*/ 6 + * code is basically a one big switch statement. 7 + :*/ 6 8 7 9 /* Copyright (C) 2006 Rusty Russell IBM Corporation 8 10 ··· 30 28 #include <asm/pgtable.h> 31 29 #include "lg.h" 32 30 33 - /*H:120 This is the core hypercall routine: where the Guest gets what it wants. 34 - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ 31 + /*H:120 32 + * This is the core hypercall routine: where the Guest gets what it wants. 33 + * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. 34 + */ 35 35 static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) 36 36 { 37 37 switch (args->arg0) { 38 38 case LHCALL_FLUSH_ASYNC: 39 - /* This call does nothing, except by breaking out of the Guest 40 - * it makes us process all the asynchronous hypercalls. */ 39 + /* 40 + * This call does nothing, except by breaking out of the Guest 41 + * it makes us process all the asynchronous hypercalls. 42 + */ 41 43 break; 42 44 case LHCALL_SEND_INTERRUPTS: 43 - /* This call does nothing too, but by breaking out of the Guest 44 - * it makes us process any pending interrupts. */ 45 + /* 46 + * This call does nothing too, but by breaking out of the Guest 47 + * it makes us process any pending interrupts. 48 + */ 45 49 break; 46 50 case LHCALL_LGUEST_INIT: 47 - /* You can't get here unless you're already initialized. Don't 48 - * do that. */ 51 + /* 52 + * You can't get here unless you're already initialized. Don't 53 + * do that. 
54 + */ 49 55 kill_guest(cpu, "already have lguest_data"); 50 56 break; 51 57 case LHCALL_SHUTDOWN: { 52 - /* Shutdown is such a trivial hypercall that we do it in four 53 - * lines right here. */ 54 58 char msg[128]; 55 - /* If the lgread fails, it will call kill_guest() itself; the 56 - * kill_guest() with the message will be ignored. */ 59 + /* 60 + * Shutdown is such a trivial hypercall that we do it in four 61 + * lines right here. 62 + * 63 + * If the lgread fails, it will call kill_guest() itself; the 64 + * kill_guest() with the message will be ignored. 65 + */ 57 66 __lgread(cpu, msg, args->arg1, sizeof(msg)); 58 67 msg[sizeof(msg)-1] = '\0'; 59 68 kill_guest(cpu, "CRASH: %s", msg); ··· 73 60 break; 74 61 } 75 62 case LHCALL_FLUSH_TLB: 76 - /* FLUSH_TLB comes in two flavors, depending on the 77 - * argument: */ 63 + /* FLUSH_TLB comes in two flavors, depending on the argument: */ 78 64 if (args->arg1) 79 65 guest_pagetable_clear_all(cpu); 80 66 else 81 67 guest_pagetable_flush_user(cpu); 82 68 break; 83 69 84 - /* All these calls simply pass the arguments through to the right 85 - * routines. */ 70 + /* 71 + * All these calls simply pass the arguments through to the right 72 + * routines. 73 + */ 86 74 case LHCALL_NEW_PGTABLE: 87 75 guest_new_pagetable(cpu, args->arg1); 88 76 break; ··· 126 112 kill_guest(cpu, "Bad hypercall %li\n", args->arg0); 127 113 } 128 114 } 129 - /*:*/ 130 115 131 - /*H:124 Asynchronous hypercalls are easy: we just look in the array in the 116 + /*H:124 117 + * Asynchronous hypercalls are easy: we just look in the array in the 132 118 * Guest's "struct lguest_data" to see if any new ones are marked "ready". 133 119 * 134 120 * We are careful to do these in order: obviously we respect the order the 135 121 * Guest put them in the ring, but we also promise the Guest that they will 136 122 * happen before any normal hypercall (which is why we check this before 137 - * checking for a normal hcall). 
*/ 123 + * checking for a normal hcall). 124 + */ 138 125 static void do_async_hcalls(struct lg_cpu *cpu) 139 126 { 140 127 unsigned int i; ··· 148 133 /* We process "struct lguest_data"s hcalls[] ring once. */ 149 134 for (i = 0; i < ARRAY_SIZE(st); i++) { 150 135 struct hcall_args args; 151 - /* We remember where we were up to from last time. This makes 136 + /* 137 + * We remember where we were up to from last time. This makes 152 138 * sure that the hypercalls are done in the order the Guest 153 - * places them in the ring. */ 139 + * places them in the ring. 140 + */ 154 141 unsigned int n = cpu->next_hcall; 155 142 156 143 /* 0xFF means there's no call here (yet). */ 157 144 if (st[n] == 0xFF) 158 145 break; 159 146 160 - /* OK, we have hypercall. Increment the "next_hcall" cursor, 161 - * and wrap back to 0 if we reach the end. */ 147 + /* 148 + * OK, we have hypercall. Increment the "next_hcall" cursor, 149 + * and wrap back to 0 if we reach the end. 150 + */ 162 151 if (++cpu->next_hcall == LHCALL_RING_SIZE) 163 152 cpu->next_hcall = 0; 164 153 165 - /* Copy the hypercall arguments into a local copy of 166 - * the hcall_args struct. */ 154 + /* 155 + * Copy the hypercall arguments into a local copy of the 156 + * hcall_args struct. 157 + */ 167 158 if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], 168 159 sizeof(struct hcall_args))) { 169 160 kill_guest(cpu, "Fetching async hypercalls"); ··· 185 164 break; 186 165 } 187 166 188 - /* Stop doing hypercalls if they want to notify the Launcher: 189 - * it needs to service this first. */ 167 + /* 168 + * Stop doing hypercalls if they want to notify the Launcher: 169 + * it needs to service this first. 170 + */ 190 171 if (cpu->pending_notify) 191 172 break; 192 173 } 193 174 } 194 175 195 - /* Last of all, we look at what happens first of all. The very first time the 196 - * Guest makes a hypercall, we end up here to set things up: */ 176 + /* 177 + * Last of all, we look at what happens first of all. 
The very first time the 178 + * Guest makes a hypercall, we end up here to set things up: 179 + */ 197 180 static void initialize(struct lg_cpu *cpu) 198 181 { 199 - /* You can't do anything until you're initialized. The Guest knows the 200 - * rules, so we're unforgiving here. */ 182 + /* 183 + * You can't do anything until you're initialized. The Guest knows the 184 + * rules, so we're unforgiving here. 185 + */ 201 186 if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { 202 187 kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); 203 188 return; ··· 212 185 if (lguest_arch_init_hypercalls(cpu)) 213 186 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 214 187 215 - /* The Guest tells us where we're not to deliver interrupts by putting 216 - * the range of addresses into "struct lguest_data". */ 188 + /* 189 + * The Guest tells us where we're not to deliver interrupts by putting 190 + * the range of addresses into "struct lguest_data". 191 + */ 217 192 if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) 218 193 || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) 219 194 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 220 195 221 - /* We write the current time into the Guest's data page once so it can 222 - * set its clock. */ 196 + /* 197 + * We write the current time into the Guest's data page once so it can 198 + * set its clock. 199 + */ 223 200 write_timestamp(cpu); 224 201 225 202 /* page_tables.c will also do some setup. */ 226 203 page_table_guest_data_init(cpu); 227 204 228 - /* This is the one case where the above accesses might have been the 205 + /* 206 + * This is the one case where the above accesses might have been the 229 207 * first write to a Guest page. This may have caused a copy-on-write 230 208 * fault, but the old page might be (read-only) in the Guest 231 - * pagetable. */ 209 + * pagetable. 
210 + */ 232 211 guest_pagetable_clear_all(cpu); 233 212 } 234 213 /*:*/ 235 214 236 - /*M:013 If a Guest reads from a page (so creates a mapping) that it has never 215 + /*M:013 216 + * If a Guest reads from a page (so creates a mapping) that it has never 237 217 * written to, and then the Launcher writes to it (ie. the output of a virtual 238 218 * device), the Guest will still see the old page. In practice, this never 239 219 * happens: why would the Guest read a page which it has never written to? But 240 - * a similar scenario might one day bite us, so it's worth mentioning. :*/ 220 + * a similar scenario might one day bite us, so it's worth mentioning. 221 + :*/ 241 222 242 223 /*H:100 243 224 * Hypercalls ··· 264 229 return; 265 230 } 266 231 267 - /* The Guest has initialized. 232 + /* 233 + * The Guest has initialized. 268 234 * 269 - * Look in the hypercall ring for the async hypercalls: */ 235 + * Look in the hypercall ring for the async hypercalls: 236 + */ 270 237 do_async_hcalls(cpu); 271 238 272 - /* If we stopped reading the hypercall ring because the Guest did a 239 + /* 240 + * If we stopped reading the hypercall ring because the Guest did a 273 241 * NOTIFY to the Launcher, we want to return now. Otherwise we do 274 - * the hypercall. */ 242 + * the hypercall. 243 + */ 275 244 if (!cpu->pending_notify) { 276 245 do_hcall(cpu, cpu->hcall); 277 - /* Tricky point: we reset the hcall pointer to mark the 246 + /* 247 + * Tricky point: we reset the hcall pointer to mark the 278 248 * hypercall as "done". We use the hcall pointer rather than 279 249 * the trap number to indicate a hypercall is pending. 280 250 * Normally it doesn't matter: the Guest will run again and ··· 288 248 * However, if we are signalled or the Guest sends I/O to the 289 249 * Launcher, the run_guest() loop will exit without running the 290 250 * Guest. When it comes back it would try to re-run the 291 - * hypercall. Finding that bug sucked. */ 251 + * hypercall. 
Finding that bug sucked. 252 + */ 292 253 cpu->hcall = NULL; 293 254 } 294 255 } 295 256 296 - /* This routine supplies the Guest with time: it's used for wallclock time at 297 - * initial boot and as a rough time source if the TSC isn't available. */ 257 + /* 258 + * This routine supplies the Guest with time: it's used for wallclock time at 259 + * initial boot and as a rough time source if the TSC isn't available. 260 + */ 298 261 void write_timestamp(struct lg_cpu *cpu) 299 262 { 300 263 struct timespec now;
+191 -97
drivers/lguest/interrupts_and_traps.c
··· 1 - /*P:800 Interrupts (traps) are complicated enough to earn their own file. 1 + /*P:800 2 + * Interrupts (traps) are complicated enough to earn their own file. 2 3 * There are three classes of interrupts: 3 4 * 4 5 * 1) Real hardware interrupts which occur while we're running the Guest, ··· 11 10 * just like real hardware would deliver them. Traps from the Guest can be set 12 11 * up to go directly back into the Guest, but sometimes the Host wants to see 13 12 * them first, so we also have a way of "reflecting" them into the Guest as if 14 - * they had been delivered to it directly. :*/ 13 + * they had been delivered to it directly. 14 + :*/ 15 15 #include <linux/uaccess.h> 16 16 #include <linux/interrupt.h> 17 17 #include <linux/module.h> ··· 28 26 return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); 29 27 } 30 28 31 - /* The "type" of the interrupt handler is a 4 bit field: we only support a 32 - * couple of types. */ 29 + /* 30 + * The "type" of the interrupt handler is a 4 bit field: we only support a 31 + * couple of types. 32 + */ 33 33 static int idt_type(u32 lo, u32 hi) 34 34 { 35 35 return (hi >> 8) & 0xF; ··· 43 39 return (hi & 0x8000); 44 40 } 45 41 46 - /* We need a helper to "push" a value onto the Guest's stack, since that's a 47 - * big part of what delivering an interrupt does. */ 42 + /* 43 + * We need a helper to "push" a value onto the Guest's stack, since that's a 44 + * big part of what delivering an interrupt does. 45 + */ 48 46 static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) 49 47 { 50 48 /* Stack grows upwards: move stack then write value. */ ··· 54 48 lgwrite(cpu, *gstack, u32, val); 55 49 } 56 50 57 - /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or 51 + /*H:210 52 + * The set_guest_interrupt() routine actually delivers the interrupt or 58 53 * trap. 
The mechanics of delivering traps and interrupts to the Guest are the 59 54 * same, except some traps have an "error code" which gets pushed onto the 60 55 * stack as well: the caller tells us if this is one. ··· 66 59 * 67 60 * We set up the stack just like the CPU does for a real interrupt, so it's 68 61 * identical for the Guest (and the standard "iret" instruction will undo 69 - * it). */ 62 + * it). 63 + */ 70 64 static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, 71 65 bool has_err) 72 66 { ··· 75 67 u32 eflags, ss, irq_enable; 76 68 unsigned long virtstack; 77 69 78 - /* There are two cases for interrupts: one where the Guest is already 70 + /* 71 + * There are two cases for interrupts: one where the Guest is already 79 72 * in the kernel, and a more complex one where the Guest is in 80 - * userspace. We check the privilege level to find out. */ 73 + * userspace. We check the privilege level to find out. 74 + */ 81 75 if ((cpu->regs->ss&0x3) != GUEST_PL) { 82 - /* The Guest told us their kernel stack with the SET_STACK 83 - * hypercall: both the virtual address and the segment */ 76 + /* 77 + * The Guest told us their kernel stack with the SET_STACK 78 + * hypercall: both the virtual address and the segment. 79 + */ 84 80 virtstack = cpu->esp1; 85 81 ss = cpu->ss1; 86 82 87 83 origstack = gstack = guest_pa(cpu, virtstack); 88 - /* We push the old stack segment and pointer onto the new 84 + /* 85 + * We push the old stack segment and pointer onto the new 89 86 * stack: when the Guest does an "iret" back from the interrupt 90 87 * handler the CPU will notice they're dropping privilege 91 - * levels and expect these here. */ 88 + * levels and expect these here. 
89 + */ 92 90 push_guest_stack(cpu, &gstack, cpu->regs->ss); 93 91 push_guest_stack(cpu, &gstack, cpu->regs->esp); 94 92 } else { ··· 105 91 origstack = gstack = guest_pa(cpu, virtstack); 106 92 } 107 93 108 - /* Remember that we never let the Guest actually disable interrupts, so 94 + /* 95 + * Remember that we never let the Guest actually disable interrupts, so 109 96 * the "Interrupt Flag" bit is always set. We copy that bit from the 110 97 * Guest's "irq_enabled" field into the eflags word: we saw the Guest 111 - * copy it back in "lguest_iret". */ 98 + * copy it back in "lguest_iret". 99 + */ 112 100 eflags = cpu->regs->eflags; 113 101 if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 114 102 && !(irq_enable & X86_EFLAGS_IF)) 115 103 eflags &= ~X86_EFLAGS_IF; 116 104 117 - /* An interrupt is expected to push three things on the stack: the old 105 + /* 106 + * An interrupt is expected to push three things on the stack: the old 118 107 * "eflags" word, the old code segment, and the old instruction 119 - * pointer. */ 108 + * pointer. 109 + */ 120 110 push_guest_stack(cpu, &gstack, eflags); 121 111 push_guest_stack(cpu, &gstack, cpu->regs->cs); 122 112 push_guest_stack(cpu, &gstack, cpu->regs->eip); ··· 129 111 if (has_err) 130 112 push_guest_stack(cpu, &gstack, cpu->regs->errcode); 131 113 132 - /* Now we've pushed all the old state, we change the stack, the code 133 - * segment and the address to execute. */ 114 + /* 115 + * Now we've pushed all the old state, we change the stack, the code 116 + * segment and the address to execute. 117 + */ 134 118 cpu->regs->ss = ss; 135 119 cpu->regs->esp = virtstack + (gstack - origstack); 136 120 cpu->regs->cs = (__KERNEL_CS|GUEST_PL); 137 121 cpu->regs->eip = idt_address(lo, hi); 138 122 139 - /* There are two kinds of interrupt handlers: 0xE is an "interrupt 140 - * gate" which expects interrupts to be disabled on entry. 
*/ 123 + /* 124 + * There are two kinds of interrupt handlers: 0xE is an "interrupt 125 + * gate" which expects interrupts to be disabled on entry. 126 + */ 141 127 if (idt_type(lo, hi) == 0xE) 142 128 if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) 143 129 kill_guest(cpu, "Disabling interrupts"); ··· 152 130 * 153 131 * interrupt_pending() returns the first pending interrupt which isn't blocked 154 132 * by the Guest. It is called before every entry to the Guest, and just before 155 - * we go to sleep when the Guest has halted itself. */ 133 + * we go to sleep when the Guest has halted itself. 134 + */ 156 135 unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) 157 136 { 158 137 unsigned int irq; ··· 163 140 if (!cpu->lg->lguest_data) 164 141 return LGUEST_IRQS; 165 142 166 - /* Take our "irqs_pending" array and remove any interrupts the Guest 167 - * wants blocked: the result ends up in "blk". */ 143 + /* 144 + * Take our "irqs_pending" array and remove any interrupts the Guest 145 + * wants blocked: the result ends up in "blk". 146 + */ 168 147 if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, 169 148 sizeof(blk))) 170 149 return LGUEST_IRQS; ··· 179 154 return irq; 180 155 } 181 156 182 - /* This actually diverts the Guest to running an interrupt handler, once an 183 - * interrupt has been identified by interrupt_pending(). */ 157 + /* 158 + * This actually diverts the Guest to running an interrupt handler, once an 159 + * interrupt has been identified by interrupt_pending(). 160 + */ 184 161 void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) 185 162 { 186 163 struct desc_struct *idt; 187 164 188 165 BUG_ON(irq >= LGUEST_IRQS); 189 166 190 - /* They may be in the middle of an iret, where they asked us never to 191 - * deliver interrupts. */ 167 + /* 168 + * They may be in the middle of an iret, where they asked us never to 169 + * deliver interrupts. 
170 + */ 192 171 if (cpu->regs->eip >= cpu->lg->noirq_start && 193 172 (cpu->regs->eip < cpu->lg->noirq_end)) 194 173 return; ··· 216 187 } 217 188 } 218 189 219 - /* Look at the IDT entry the Guest gave us for this interrupt. The 190 + /* 191 + * Look at the IDT entry the Guest gave us for this interrupt. The 220 192 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip 221 - * over them. */ 193 + * over them. 194 + */ 222 195 idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; 223 196 /* If they don't have a handler (yet?), we just ignore it */ 224 197 if (idt_present(idt->a, idt->b)) { 225 198 /* OK, mark it no longer pending and deliver it. */ 226 199 clear_bit(irq, cpu->irqs_pending); 227 - /* set_guest_interrupt() takes the interrupt descriptor and a 200 + /* 201 + * set_guest_interrupt() takes the interrupt descriptor and a 228 202 * flag to say whether this interrupt pushes an error code onto 229 - * the stack as well: virtual interrupts never do. */ 203 + * the stack as well: virtual interrupts never do. 204 + */ 230 205 set_guest_interrupt(cpu, idt->a, idt->b, false); 231 206 } 232 207 233 - /* Every time we deliver an interrupt, we update the timestamp in the 208 + /* 209 + * Every time we deliver an interrupt, we update the timestamp in the 234 210 * Guest's lguest_data struct. It would be better for the Guest if we 235 211 * did this more often, but it can actually be quite slow: doing it 236 212 * here is a compromise which means at least it gets updated every 237 - * timer interrupt. */ 213 + * timer interrupt. 214 + */ 238 215 write_timestamp(cpu); 239 216 240 - /* If there are no other interrupts we want to deliver, clear 241 - * the pending flag. */ 217 + /* 218 + * If there are no other interrupts we want to deliver, clear 219 + * the pending flag. 220 + */ 242 221 if (!more) 243 222 put_user(0, &cpu->lg->lguest_data->irq_pending); 244 223 } ··· 254 217 /* And this is the routine when we want to set an interrupt for the Guest. 
*/ 255 218 void set_interrupt(struct lg_cpu *cpu, unsigned int irq) 256 219 { 257 - /* Next time the Guest runs, the core code will see if it can deliver 258 - * this interrupt. */ 220 + /* 221 + * Next time the Guest runs, the core code will see if it can deliver 222 + * this interrupt. 223 + */ 259 224 set_bit(irq, cpu->irqs_pending); 260 225 261 - /* Make sure it sees it; it might be asleep (eg. halted), or 262 - * running the Guest right now, in which case kick_process() 263 - * will knock it out. */ 226 + /* 227 + * Make sure it sees it; it might be asleep (eg. halted), or running 228 + * the Guest right now, in which case kick_process() will knock it out. 229 + */ 264 230 if (!wake_up_process(cpu->tsk)) 265 231 kick_process(cpu->tsk); 266 232 } 267 233 /*:*/ 268 234 269 - /* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent 235 + /* 236 + * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent 270 237 * me a patch, so we support that too. It'd be a big step for lguest if half 271 238 * the Plan 9 user base were to start using it. 272 239 * 273 240 * Actually now I think of it, it's possible that Ron *is* half the Plan 9 274 - * userbase. Oh well. */ 241 + * userbase. Oh well. 242 + */ 275 243 static bool could_be_syscall(unsigned int num) 276 244 { 277 245 /* Normal Linux SYSCALL_VECTOR or reserved vector? */ ··· 316 274 clear_bit(syscall_vector, used_vectors); 317 275 } 318 276 319 - /*H:220 Now we've got the routines to deliver interrupts, delivering traps like 277 + /*H:220 278 + * Now we've got the routines to deliver interrupts, delivering traps like 320 279 * page fault is easy. The only trick is that Intel decided that some traps 321 - * should have error codes: */ 280 + * should have error codes: 281 + */ 322 282 static bool has_err(unsigned int trap) 323 283 { 324 284 return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); ··· 329 285 /* deliver_trap() returns true if it could deliver the trap. 
*/ 330 286 bool deliver_trap(struct lg_cpu *cpu, unsigned int num) 331 287 { 332 - /* Trap numbers are always 8 bit, but we set an impossible trap number 333 - * for traps inside the Switcher, so check that here. */ 288 + /* 289 + * Trap numbers are always 8 bit, but we set an impossible trap number 290 + * for traps inside the Switcher, so check that here. 291 + */ 334 292 if (num >= ARRAY_SIZE(cpu->arch.idt)) 335 293 return false; 336 294 337 - /* Early on the Guest hasn't set the IDT entries (or maybe it put a 338 - * bogus one in): if we fail here, the Guest will be killed. */ 295 + /* 296 + * Early on the Guest hasn't set the IDT entries (or maybe it put a 297 + * bogus one in): if we fail here, the Guest will be killed. 298 + */ 339 299 if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) 340 300 return false; 341 301 set_guest_interrupt(cpu, cpu->arch.idt[num].a, ··· 347 299 return true; 348 300 } 349 301 350 - /*H:250 Here's the hard part: returning to the Host every time a trap happens 302 + /*H:250 303 + * Here's the hard part: returning to the Host every time a trap happens 351 304 * and then calling deliver_trap() and re-entering the Guest is slow. 352 305 * Particularly because Guest userspace system calls are traps (usually trap 353 306 * 128). ··· 360 311 * the other hypervisors would beat it up at lunchtime. 361 312 * 362 313 * This routine indicates if a particular trap number could be delivered 363 - * directly. */ 314 + * directly. 315 + */ 364 316 static bool direct_trap(unsigned int num) 365 317 { 366 - /* Hardware interrupts don't go to the Guest at all (except system 367 - * call). */ 318 + /* 319 + * Hardware interrupts don't go to the Guest at all (except system 320 + * call). 
321 + */ 368 322 if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) 369 323 return false; 370 324 371 - /* The Host needs to see page faults (for shadow paging and to save the 325 + /* 326 + * The Host needs to see page faults (for shadow paging and to save the 372 327 * fault address), general protection faults (in/out emulation) and 373 328 * device not available (TS handling), invalid opcode fault (kvm hcall), 374 - * and of course, the hypercall trap. */ 329 + * and of course, the hypercall trap. 330 + */ 375 331 return num != 14 && num != 13 && num != 7 && 376 332 num != 6 && num != LGUEST_TRAP_ENTRY; 377 333 } 378 334 /*:*/ 379 335 380 - /*M:005 The Guest has the ability to turn its interrupt gates into trap gates, 336 + /*M:005 337 + * The Guest has the ability to turn its interrupt gates into trap gates, 381 338 * if it is careful. The Host will let trap gates can go directly to the 382 339 * Guest, but the Guest needs the interrupts atomically disabled for an 383 340 * interrupt gate. It can do this by pointing the trap gate at instructions 384 - * within noirq_start and noirq_end, where it can safely disable interrupts. */ 341 + * within noirq_start and noirq_end, where it can safely disable interrupts. 342 + */ 385 343 386 - /*M:006 The Guests do not use the sysenter (fast system call) instruction, 344 + /*M:006 345 + * The Guests do not use the sysenter (fast system call) instruction, 387 346 * because it's hardcoded to enter privilege level 0 and so can't go direct. 388 347 * It's about twice as fast as the older "int 0x80" system call, so it might 389 348 * still be worthwhile to handle it in the Switcher and lcall down to the 390 349 * Guest. 
The sysenter semantics are hairy tho: search for that keyword in 391 - * entry.S :*/ 350 + * entry.S 351 + :*/ 392 352 393 - /*H:260 When we make traps go directly into the Guest, we need to make sure 353 + /*H:260 354 + * When we make traps go directly into the Guest, we need to make sure 394 355 * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the 395 356 * CPU trying to deliver the trap will fault while trying to push the interrupt 396 357 * words on the stack: this is called a double fault, and it forces us to kill 397 358 * the Guest. 398 359 * 399 - * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ 360 + * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. 361 + */ 400 362 void pin_stack_pages(struct lg_cpu *cpu) 401 363 { 402 364 unsigned int i; 403 365 404 - /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or 405 - * two pages of stack space. */ 366 + /* 367 + * Depending on the CONFIG_4KSTACKS option, the Guest can have one or 368 + * two pages of stack space. 369 + */ 406 370 for (i = 0; i < cpu->lg->stack_pages; i++) 407 - /* The stack grows *upwards*, so the address we're given is the 371 + /* 372 + * The stack grows *upwards*, so the address we're given is the 408 373 * start of the page after the kernel stack. Subtract one to 409 374 * get back onto the first stack page, and keep subtracting to 410 - * get to the rest of the stack pages. */ 375 + * get to the rest of the stack pages. 376 + */ 411 377 pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); 412 378 } 413 379 414 - /* Direct traps also mean that we need to know whenever the Guest wants to use 380 + /* 381 + * Direct traps also mean that we need to know whenever the Guest wants to use 415 382 * a different kernel stack, so we can change the IDT entries to use that 416 383 * stack. 
The IDT entries expect a virtual address, so unlike most addresses 417 384 * the Guest gives us, the "esp" (stack pointer) value here is virtual, not 418 385 * physical. 419 386 * 420 387 * In Linux each process has its own kernel stack, so this happens a lot: we 421 - * change stacks on each context switch. */ 388 + * change stacks on each context switch. 389 + */ 422 390 void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) 423 391 { 424 - /* You are not allowed have a stack segment with privilege level 0: bad 425 - * Guest! */ 392 + /* 393 + * You're not allowed a stack segment with privilege level 0: bad Guest! 394 + */ 426 395 if ((seg & 0x3) != GUEST_PL) 427 396 kill_guest(cpu, "bad stack segment %i", seg); 428 397 /* We only expect one or two stack pages. */ ··· 454 387 pin_stack_pages(cpu); 455 388 } 456 389 457 - /* All this reference to mapping stacks leads us neatly into the other complex 458 - * part of the Host: page table handling. */ 390 + /* 391 + * All this reference to mapping stacks leads us neatly into the other complex 392 + * part of the Host: page table handling. 393 + */ 459 394 460 - /*H:235 This is the routine which actually checks the Guest's IDT entry and 461 - * transfers it into the entry in "struct lguest": */ 395 + /*H:235 396 + * This is the routine which actually checks the Guest's IDT entry and 397 + * transfers it into the entry in "struct lguest": 398 + */ 462 399 static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, 463 400 unsigned int num, u32 lo, u32 hi) 464 401 { ··· 478 407 if (type != 0xE && type != 0xF) 479 408 kill_guest(cpu, "bad IDT type %i", type); 480 409 481 - /* We only copy the handler address, present bit, privilege level and 410 + /* 411 + * We only copy the handler address, present bit, privilege level and 482 412 * type. The privilege level controls where the trap can be triggered 483 413 * manually with an "int" instruction. 
This is usually GUEST_PL, 484 - * except for system calls which userspace can use. */ 414 + * except for system calls which userspace can use. 415 + */ 485 416 trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); 486 417 trap->b = (hi&0xFFFFEF00); 487 418 } 488 419 489 - /*H:230 While we're here, dealing with delivering traps and interrupts to the 420 + /*H:230 421 + * While we're here, dealing with delivering traps and interrupts to the 490 422 * Guest, we might as well complete the picture: how the Guest tells us where 491 423 * it wants them to go. This would be simple, except making traps fast 492 424 * requires some tricks. 493 425 * 494 426 * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the 495 - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ 427 + * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. 428 + */ 496 429 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) 497 430 { 498 - /* Guest never handles: NMI, doublefault, spurious interrupt or 499 - * hypercall. We ignore when it tries to set them. */ 431 + /* 432 + * Guest never handles: NMI, doublefault, spurious interrupt or 433 + * hypercall. We ignore when it tries to set them. 434 + */ 500 435 if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) 501 436 return; 502 437 503 - /* Mark the IDT as changed: next time the Guest runs we'll know we have 504 - * to copy this again. */ 438 + /* 439 + * Mark the IDT as changed: next time the Guest runs we'll know we have 440 + * to copy this again. 441 + */ 505 442 cpu->changed |= CHANGED_IDT; 506 443 507 444 /* Check that the Guest doesn't try to step outside the bounds. */ ··· 519 440 set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); 520 441 } 521 442 522 - /* The default entry for each interrupt points into the Switcher routines which 443 + /* 444 + * The default entry for each interrupt points into the Switcher routines which 523 445 * simply return to the Host. 
The run_guest() loop will then call 524 - * deliver_trap() to bounce it back into the Guest. */ 446 + * deliver_trap() to bounce it back into the Guest. 447 + */ 525 448 static void default_idt_entry(struct desc_struct *idt, 526 449 int trap, 527 450 const unsigned long handler, ··· 532 451 /* A present interrupt gate. */ 533 452 u32 flags = 0x8e00; 534 453 535 - /* Set the privilege level on the entry for the hypercall: this allows 536 - * the Guest to use the "int" instruction to trigger it. */ 454 + /* 455 + * Set the privilege level on the entry for the hypercall: this allows 456 + * the Guest to use the "int" instruction to trigger it. 457 + */ 537 458 if (trap == LGUEST_TRAP_ENTRY) 538 459 flags |= (GUEST_PL << 13); 539 460 else if (base) 540 - /* Copy priv. level from what Guest asked for. This allows 541 - * debug (int 3) traps from Guest userspace, for example. */ 461 + /* 462 + * Copy privilege level from what Guest asked for. This allows 463 + * debug (int 3) traps from Guest userspace, for example. 464 + */ 542 465 flags |= (base->b & 0x6000); 543 466 544 467 /* Now pack it into the IDT entry in its weird format. */ ··· 560 475 default_idt_entry(&state->guest_idt[i], i, def[i], NULL); 561 476 } 562 477 563 - /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead 478 + /*H:240 479 + * We don't use the IDT entries in the "struct lguest" directly, instead 564 480 * we copy them into the IDT which we've set up for Guests on this CPU, just 565 - * before we run the Guest. This routine does that copy. */ 481 + * before we run the Guest. This routine does that copy. 482 + */ 566 483 void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, 567 484 const unsigned long *def) 568 485 { 569 486 unsigned int i; 570 487 571 - /* We can simply copy the direct traps, otherwise we use the default 572 - * ones in the Switcher: they will return to the Host. 
*/ 488 + /* 489 + * We can simply copy the direct traps, otherwise we use the default 490 + * ones in the Switcher: they will return to the Host. 491 + */ 573 492 for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { 574 493 const struct desc_struct *gidt = &cpu->arch.idt[i]; 575 494 ··· 581 492 if (!direct_trap(i)) 582 493 continue; 583 494 584 - /* Only trap gates (type 15) can go direct to the Guest. 495 + /* 496 + * Only trap gates (type 15) can go direct to the Guest. 585 497 * Interrupt gates (type 14) disable interrupts as they are 586 498 * entered, which we never let the Guest do. Not present 587 499 * entries (type 0x0) also can't go direct, of course. 588 500 * 589 501 * If it can't go direct, we still need to copy the priv. level: 590 502 * they might want to give userspace access to a software 591 - * interrupt. */ 503 + * interrupt. 504 + */ 592 505 if (idt_type(gidt->a, gidt->b) == 0xF) 593 506 idt[i] = *gidt; 594 507 else ··· 609 518 * the next timer interrupt (in nanoseconds). We use the high-resolution timer 610 519 * infrastructure to set a callback at that time. 611 520 * 612 - * 0 means "turn off the clock". */ 521 + * 0 means "turn off the clock". 522 + */ 613 523 void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) 614 524 { 615 525 ktime_t expires; ··· 621 529 return; 622 530 } 623 531 624 - /* We use wallclock time here, so the Guest might not be running for 532 + /* 533 + * We use wallclock time here, so the Guest might not be running for 625 534 * all the time between now and the timer interrupt it asked for. This 626 - * is almost always the right thing to do. */ 535 + * is almost always the right thing to do. 536 + */ 627 537 expires = ktime_add_ns(ktime_get_real(), delta); 628 538 hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); 629 539 }
+15 -8
drivers/lguest/lg.h
··· 54 54 55 55 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ 56 56 57 - /* At end of a page shared mapped over lguest_pages in guest. */ 57 + /* At end of a page shared mapped over lguest_pages in guest. */ 58 58 unsigned long regs_page; 59 59 struct lguest_regs *regs; 60 60 61 61 struct lguest_pages *last_pages; 62 62 63 - int cpu_pgd; /* which pgd this cpu is currently using */ 63 + int cpu_pgd; /* Which pgd this cpu is currently using */ 64 64 65 65 /* If a hypercall was asked for, this points to the arguments. */ 66 66 struct hcall_args *hcall; ··· 96 96 unsigned int nr_cpus; 97 97 98 98 u32 pfn_limit; 99 - /* This provides the offset to the base of guest-physical 100 - * memory in the Launcher. */ 99 + 100 + /* 101 + * This provides the offset to the base of guest-physical memory in the 102 + * Launcher. 103 + */ 101 104 void __user *mem_base; 102 105 unsigned long kernel_address; 103 106 ··· 125 122 void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); 126 123 void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); 127 124 128 - /*H:035 Using memory-copy operations like that is usually inconvient, so we 125 + /*H:035 126 + * Using memory-copy operations like that is usually inconvient, so we 129 127 * have the following helper macros which read and write a specific type (often 130 128 * an unsigned long). 131 129 * 132 - * This reads into a variable of the given type then returns that. */ 130 + * This reads into a variable of the given type then returns that. 131 + */ 133 132 #define lgread(cpu, addr, type) \ 134 133 ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) 135 134 ··· 145 140 146 141 int run_guest(struct lg_cpu *cpu, unsigned long __user *user); 147 142 148 - /* Helper macros to obtain the first 12 or the last 20 bits, this is only the 143 + /* 144 + * Helper macros to obtain the first 12 or the last 20 bits, this is only the 149 145 * first step in the migration to the kernel types. 
pte_pfn is already defined 150 - * in the kernel. */ 146 + * in the kernel. 147 + */ 151 148 #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) 152 149 #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) 153 150 #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK)
+97 -53
drivers/lguest/lguest_device.c
··· 1 - /*P:050 Lguest guests use a very simple method to describe devices. It's a 1 + /*P:050 2 + * Lguest guests use a very simple method to describe devices. It's a 2 3 * series of device descriptors contained just above the top of normal Guest 3 4 * memory. 4 5 * 5 6 * We use the standard "virtio" device infrastructure, which provides us with a 6 7 * console, a network and a block driver. Each one expects some configuration 7 - * information and a "virtqueue" or two to send and receive data. :*/ 8 + * information and a "virtqueue" or two to send and receive data. 9 + :*/ 8 10 #include <linux/init.h> 9 11 #include <linux/bootmem.h> 10 12 #include <linux/lguest_launcher.h> ··· 22 20 /* The pointer to our (page) of device descriptions. */ 23 21 static void *lguest_devices; 24 22 25 - /* For Guests, device memory can be used as normal memory, so we cast away the 26 - * __iomem to quieten sparse. */ 23 + /* 24 + * For Guests, device memory can be used as normal memory, so we cast away the 25 + * __iomem to quieten sparse. 26 + */ 27 27 static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) 28 28 { 29 29 return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages); ··· 36 32 iounmap((__force void __iomem *)addr); 37 33 } 38 34 39 - /*D:100 Each lguest device is just a virtio device plus a pointer to its entry 40 - * in the lguest_devices page. */ 35 + /*D:100 36 + * Each lguest device is just a virtio device plus a pointer to its entry 37 + * in the lguest_devices page. 38 + */ 41 39 struct lguest_device { 42 40 struct virtio_device vdev; 43 41 ··· 47 41 struct lguest_device_desc *desc; 48 42 }; 49 43 50 - /* Since the virtio infrastructure hands us a pointer to the virtio_device all 44 + /* 45 + * Since the virtio infrastructure hands us a pointer to the virtio_device all 51 46 * the time, it helps to have a curt macro to get a pointer to the struct 52 - * lguest_device it's enclosed in. */ 47 + * lguest_device it's enclosed in. 
48 + */ 53 49 #define to_lgdev(vd) container_of(vd, struct lguest_device, vdev) 54 50 55 51 /*D:130 ··· 63 55 * the driver will look at them during setup. 64 56 * 65 57 * A convenient routine to return the device's virtqueue config array: 66 - * immediately after the descriptor. */ 58 + * immediately after the descriptor. 59 + */ 67 60 static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) 68 61 { 69 62 return (void *)(desc + 1); ··· 107 98 return features; 108 99 } 109 100 110 - /* The virtio core takes the features the Host offers, and copies the 111 - * ones supported by the driver into the vdev->features array. Once 112 - * that's all sorted out, this routine is called so we can tell the 113 - * Host which features we understand and accept. */ 101 + /* 102 + * The virtio core takes the features the Host offers, and copies the ones 103 + * supported by the driver into the vdev->features array. Once that's all 104 + * sorted out, this routine is called so we can tell the Host which features we 105 + * understand and accept. 106 + */ 114 107 static void lg_finalize_features(struct virtio_device *vdev) 115 108 { 116 109 unsigned int i, bits; ··· 123 112 /* Give virtio_ring a chance to accept features. */ 124 113 vring_transport_features(vdev); 125 114 126 - /* The vdev->feature array is a Linux bitmask: this isn't the 127 - * same as a the simple array of bits used by lguest devices 128 - * for features. So we do this slow, manual conversion which is 129 - * completely general. */ 115 + /* 116 + * The vdev->feature array is a Linux bitmask: this isn't the same as a 117 + * the simple array of bits used by lguest devices for features. So we 118 + * do this slow, manual conversion which is completely general. 
119 + */ 130 120 memset(out_features, 0, desc->feature_len); 131 121 bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; 132 122 for (i = 0; i < bits; i++) { ··· 158 146 memcpy(lg_config(desc) + offset, buf, len); 159 147 } 160 148 161 - /* The operations to get and set the status word just access the status field 162 - * of the device descriptor. */ 149 + /* 150 + * The operations to get and set the status word just access the status field 151 + * of the device descriptor. 152 + */ 163 153 static u8 lg_get_status(struct virtio_device *vdev) 164 154 { 165 155 return to_lgdev(vdev)->desc->status; 166 156 } 167 157 168 - /* To notify on status updates, we (ab)use the NOTIFY hypercall, with the 169 - * descriptor address of the device. A zero status means "reset". */ 158 + /* 159 + * To notify on status updates, we (ab)use the NOTIFY hypercall, with the 160 + * descriptor address of the device. A zero status means "reset". 161 + */ 170 162 static void set_status(struct virtio_device *vdev, u8 status) 171 163 { 172 164 unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; ··· 216 200 void *pages; 217 201 }; 218 202 219 - /* When the virtio_ring code wants to prod the Host, it calls us here and we 203 + /* 204 + * When the virtio_ring code wants to prod the Host, it calls us here and we 220 205 * make a hypercall. We hand the physical address of the virtqueue so the Host 221 - * knows which virtqueue we're talking about. */ 206 + * knows which virtqueue we're talking about. 207 + */ 222 208 static void lg_notify(struct virtqueue *vq) 223 209 { 224 - /* We store our virtqueue information in the "priv" pointer of the 225 - * virtqueue structure. */ 210 + /* 211 + * We store our virtqueue information in the "priv" pointer of the 212 + * virtqueue structure. 
213 + */ 226 214 struct lguest_vq_info *lvq = vq->priv; 227 215 228 216 kvm_hypercall1(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT); ··· 235 215 /* An extern declaration inside a C file is bad form. Don't do it. */ 236 216 extern void lguest_setup_irq(unsigned int irq); 237 217 238 - /* This routine finds the first virtqueue described in the configuration of 218 + /* 219 + * This routine finds the first virtqueue described in the configuration of 239 220 * this device and sets it up. 240 221 * 241 222 * This is kind of an ugly duckling. It'd be nicer to have a standard ··· 246 225 * simpler for the Host to simply tell us where the pages are. 247 226 * 248 227 * So we provide drivers with a "find the Nth virtqueue and set it up" 249 - * function. */ 228 + * function. 229 + */ 250 230 static struct virtqueue *lg_find_vq(struct virtio_device *vdev, 251 231 unsigned index, 252 232 void (*callback)(struct virtqueue *vq), ··· 266 244 if (!lvq) 267 245 return ERR_PTR(-ENOMEM); 268 246 269 - /* Make a copy of the "struct lguest_vqconfig" entry, which sits after 247 + /* 248 + * Make a copy of the "struct lguest_vqconfig" entry, which sits after 270 249 * the descriptor. We need a copy because the config space might not 271 - * be aligned correctly. */ 250 + * be aligned correctly. 251 + */ 272 252 memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); 273 253 274 254 printk("Mapping virtqueue %i addr %lx\n", index, ··· 285 261 goto free_lvq; 286 262 } 287 263 288 - /* OK, tell virtio_ring.c to set up a virtqueue now we know its size 289 - * and we've got a pointer to its pages. */ 264 + /* 265 + * OK, tell virtio_ring.c to set up a virtqueue now we know its size 266 + * and we've got a pointer to its pages. 267 + */ 290 268 vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, 291 269 vdev, lvq->pages, lg_notify, callback, name); 292 270 if (!vq) { ··· 299 273 /* Make sure the interrupt is allocated. 
*/ 300 274 lguest_setup_irq(lvq->config.irq); 301 275 302 - /* Tell the interrupt for this virtqueue to go to the virtio_ring 303 - * interrupt handler. */ 304 - /* FIXME: We used to have a flag for the Host to tell us we could use 276 + /* 277 + * Tell the interrupt for this virtqueue to go to the virtio_ring 278 + * interrupt handler. 279 + * 280 + * FIXME: We used to have a flag for the Host to tell us we could use 305 281 * the interrupt as a source of randomness: it'd be nice to have that 306 - * back.. */ 282 + * back. 283 + */ 307 284 err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, 308 285 dev_name(&vdev->dev), vq); 309 286 if (err) 310 287 goto destroy_vring; 311 288 312 - /* Last of all we hook up our 'struct lguest_vq_info" to the 313 - * virtqueue's priv pointer. */ 289 + /* 290 + * Last of all we hook up our 'struct lguest_vq_info" to the 291 + * virtqueue's priv pointer. 292 + */ 314 293 vq->priv = lvq; 315 294 return vq; 316 295 ··· 389 358 .del_vqs = lg_del_vqs, 390 359 }; 391 360 392 - /* The root device for the lguest virtio devices. This makes them appear as 393 - * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */ 361 + /* 362 + * The root device for the lguest virtio devices. This makes them appear as 363 + * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. 364 + */ 394 365 static struct device *lguest_root; 395 366 396 - /*D:120 This is the core of the lguest bus: actually adding a new device. 367 + /*D:120 368 + * This is the core of the lguest bus: actually adding a new device. 397 369 * It's a separate function because it's neater that way, and because an 398 370 * earlier version of the code supported hotplug and unplug. They were removed 399 371 * early on because they were never used. 
··· 405 371 * 406 372 * It's worth reading this carefully: we start with a pointer to the new device 407 373 * descriptor in the "lguest_devices" page, and the offset into the device 408 - * descriptor page so we can uniquely identify it if things go badly wrong. */ 374 + * descriptor page so we can uniquely identify it if things go badly wrong. 375 + */ 409 376 static void add_lguest_device(struct lguest_device_desc *d, 410 377 unsigned int offset) 411 378 { 412 379 struct lguest_device *ldev; 413 380 414 - /* Start with zeroed memory; Linux's device layer seems to count on 415 - * it. */ 381 + /* Start with zeroed memory; Linux's device layer counts on it. */ 416 382 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); 417 383 if (!ldev) { 418 384 printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n", ··· 424 390 ldev->vdev.dev.parent = lguest_root; 425 391 /* We have a unique device index thanks to the dev_index counter. */ 426 392 ldev->vdev.id.device = d->type; 427 - /* We have a simple set of routines for querying the device's 428 - * configuration information and setting its status. */ 393 + /* 394 + * We have a simple set of routines for querying the device's 395 + * configuration information and setting its status. 396 + */ 429 397 ldev->vdev.config = &lguest_config_ops; 430 398 /* And we remember the device's descriptor for lguest_config_ops. */ 431 399 ldev->desc = d; 432 400 433 - /* register_virtio_device() sets up the generic fields for the struct 401 + /* 402 + * register_virtio_device() sets up the generic fields for the struct 434 403 * virtio_device and calls device_register(). This makes the bus 435 - * infrastructure look for a matching driver. */ 404 + * infrastructure look for a matching driver. 
405 + */ 436 406 if (register_virtio_device(&ldev->vdev) != 0) { 437 407 printk(KERN_ERR "Failed to register lguest dev %u type %u\n", 438 408 offset, d->type); ··· 444 406 } 445 407 } 446 408 447 - /*D:110 scan_devices() simply iterates through the device page. The type 0 is 448 - * reserved to mean "end of devices". */ 409 + /*D:110 410 + * scan_devices() simply iterates through the device page. The type 0 is 411 + * reserved to mean "end of devices". 412 + */ 449 413 static void scan_devices(void) 450 414 { 451 415 unsigned int i; ··· 466 426 } 467 427 } 468 428 469 - /*D:105 Fairly early in boot, lguest_devices_init() is called to set up the 429 + /*D:105 430 + * Fairly early in boot, lguest_devices_init() is called to set up the 470 431 * lguest device infrastructure. We check that we are a Guest by checking 471 432 * pv_info.name: there are other ways of checking, but this seems most 472 433 * obvious to me. ··· 478 437 * correct sysfs incantation). 479 438 * 480 439 * Finally we call scan_devices() which adds all the devices found in the 481 - * lguest_devices page. */ 440 + * lguest_devices page. 441 + */ 482 442 static int __init lguest_devices_init(void) 483 443 { 484 444 if (strcmp(pv_info.name, "lguest") != 0) ··· 498 456 /* We do this after core stuff, but before the drivers. */ 499 457 postcore_initcall(lguest_devices_init); 500 458 501 - /*D:150 At this point in the journey we used to now wade through the lguest 459 + /*D:150 460 + * At this point in the journey we used to now wade through the lguest 502 461 * devices themselves: net, block and console. Since they're all now virtio 503 462 * devices rather than lguest-specific, I've decided to ignore them. Mostly, 504 463 * they're kind of boring. But this does mean you'll never experience the 505 464 * thrill of reading the forbidden love scene buried deep in the block driver. 
506 465 * 507 466 * "make Launcher" beckons, where we answer questions like "Where do Guests 508 - * come from?", and "What do you do when someone asks for optimization?". */ 467 + * come from?", and "What do you do when someone asks for optimization?". 468 + */
+90 -47
drivers/lguest/lguest_user.c
··· 1 - /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher 1 + /*P:200 2 + * This contains all the /dev/lguest code, whereby the userspace launcher 2 3 * controls and communicates with the Guest. For example, the first write will 3 4 * tell us the Guest's memory layout, pagetable, entry point and kernel address 4 5 * offset. A read will run the Guest until something happens, such as a signal 5 - * or the Guest doing a NOTIFY out to the Launcher. :*/ 6 + * or the Guest doing a NOTIFY out to the Launcher. 7 + :*/ 6 8 #include <linux/uaccess.h> 7 9 #include <linux/miscdevice.h> 8 10 #include <linux/fs.h> ··· 39 37 if (!addr) 40 38 return -EINVAL; 41 39 42 - /* Replace the old array with the new one, carefully: others can 43 - * be accessing it at the same time */ 40 + /* 41 + * Replace the old array with the new one, carefully: others can 42 + * be accessing it at the same time. 43 + */ 44 44 new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), 45 45 GFP_KERNEL); 46 46 if (!new) ··· 65 61 /* Now put new one in place. */ 66 62 rcu_assign_pointer(lg->eventfds, new); 67 63 68 - /* We're not in a big hurry. Wait until noone's looking at old 69 - * version, then delete it. */ 64 + /* 65 + * We're not in a big hurry. Wait until noone's looking at old 66 + * version, then delete it. 67 + */ 70 68 synchronize_rcu(); 71 69 kfree(old); 72 70 ··· 93 87 return err; 94 88 } 95 89 96 - /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 97 - * number to /dev/lguest. */ 90 + /*L:050 91 + * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 92 + * number to /dev/lguest. 93 + */ 98 94 static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) 99 95 { 100 96 unsigned long irq; ··· 110 102 return 0; 111 103 } 112 104 113 - /*L:040 Once our Guest is initialized, the Launcher makes it run by reading 114 - * from /dev/lguest. 
*/ 105 + /*L:040 106 + * Once our Guest is initialized, the Launcher makes it run by reading 107 + * from /dev/lguest. 108 + */ 115 109 static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) 116 110 { 117 111 struct lguest *lg = file->private_data; ··· 149 139 return len; 150 140 } 151 141 152 - /* If we returned from read() last time because the Guest sent I/O, 153 - * clear the flag. */ 142 + /* 143 + * If we returned from read() last time because the Guest sent I/O, 144 + * clear the flag. 145 + */ 154 146 if (cpu->pending_notify) 155 147 cpu->pending_notify = 0; 156 148 ··· 160 148 return run_guest(cpu, (unsigned long __user *)user); 161 149 } 162 150 163 - /*L:025 This actually initializes a CPU. For the moment, a Guest is only 164 - * uniprocessor, so "id" is always 0. */ 151 + /*L:025 152 + * This actually initializes a CPU. For the moment, a Guest is only 153 + * uniprocessor, so "id" is always 0. 154 + */ 165 155 static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) 166 156 { 167 157 /* We have a limited number the number of CPUs in the lguest struct. */ ··· 178 164 /* Each CPU has a timer it can set. */ 179 165 init_clockdev(cpu); 180 166 181 - /* We need a complete page for the Guest registers: they are accessible 182 - * to the Guest and we can only grant it access to whole pages. */ 167 + /* 168 + * We need a complete page for the Guest registers: they are accessible 169 + * to the Guest and we can only grant it access to whole pages. 170 + */ 183 171 cpu->regs_page = get_zeroed_page(GFP_KERNEL); 184 172 if (!cpu->regs_page) 185 173 return -ENOMEM; ··· 189 173 /* We actually put the registers at the bottom of the page. */ 190 174 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); 191 175 192 - /* Now we initialize the Guest's registers, handing it the start 193 - * address. */ 176 + /* 177 + * Now we initialize the Guest's registers, handing it the start 178 + * address. 
179 + */ 194 180 lguest_arch_setup_regs(cpu, start_ip); 195 181 196 - /* We keep a pointer to the Launcher task (ie. current task) for when 197 - * other Guests want to wake this one (eg. console input). */ 182 + /* 183 + * We keep a pointer to the Launcher task (ie. current task) for when 184 + * other Guests want to wake this one (eg. console input). 185 + */ 198 186 cpu->tsk = current; 199 187 200 - /* We need to keep a pointer to the Launcher's memory map, because if 188 + /* 189 + * We need to keep a pointer to the Launcher's memory map, because if 201 190 * the Launcher dies we need to clean it up. If we don't keep a 202 - * reference, it is destroyed before close() is called. */ 191 + * reference, it is destroyed before close() is called. 192 + */ 203 193 cpu->mm = get_task_mm(cpu->tsk); 204 194 205 - /* We remember which CPU's pages this Guest used last, for optimization 206 - * when the same Guest runs on the same CPU twice. */ 195 + /* 196 + * We remember which CPU's pages this Guest used last, for optimization 197 + * when the same Guest runs on the same CPU twice. 198 + */ 207 199 cpu->last_pages = NULL; 208 200 209 201 /* No error == success. */ 210 202 return 0; 211 203 } 212 204 213 - /*L:020 The initialization write supplies 3 pointer sized (32 or 64 bit) 214 - * values (in addition to the LHREQ_INITIALIZE value). These are: 205 + /*L:020 206 + * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in 207 + * addition to the LHREQ_INITIALIZE value). These are: 215 208 * 216 209 * base: The start of the Guest-physical memory inside the Launcher memory. 217 210 * ··· 232 207 */ 233 208 static int initialize(struct file *file, const unsigned long __user *input) 234 209 { 235 - /* "struct lguest" contains everything we (the Host) know about a 236 - * Guest. */ 210 + /* "struct lguest" contains all we (the Host) know about a Guest. 
*/ 237 211 struct lguest *lg; 238 212 int err; 239 213 unsigned long args[3]; 240 214 241 - /* We grab the Big Lguest lock, which protects against multiple 242 - * simultaneous initializations. */ 215 + /* 216 + * We grab the Big Lguest lock, which protects against multiple 217 + * simultaneous initializations. 218 + */ 243 219 mutex_lock(&lguest_lock); 244 220 /* You can't initialize twice! Close the device and start again... */ 245 221 if (file->private_data) { ··· 275 249 if (err) 276 250 goto free_eventfds; 277 251 278 - /* Initialize the Guest's shadow page tables, using the toplevel 279 - * address the Launcher gave us. This allocates memory, so can fail. */ 252 + /* 253 + * Initialize the Guest's shadow page tables, using the toplevel 254 + * address the Launcher gave us. This allocates memory, so can fail. 255 + */ 280 256 err = init_guest_pagetable(lg); 281 257 if (err) 282 258 goto free_regs; ··· 303 275 return err; 304 276 } 305 277 306 - /*L:010 The first operation the Launcher does must be a write. All writes 278 + /*L:010 279 + * The first operation the Launcher does must be a write. All writes 307 280 * start with an unsigned long number: for the first write this must be 308 281 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 309 282 * writes of other values to send interrupts. ··· 312 283 * Note that we overload the "offset" in the /dev/lguest file to indicate what 313 284 * CPU number we're dealing with. Currently this is always 0, since we only 314 285 * support uniprocessor Guests, but you can see the beginnings of SMP support 315 - * here. */ 286 + * here. 287 + */ 316 288 static ssize_t write(struct file *file, const char __user *in, 317 289 size_t size, loff_t *off) 318 290 { 319 - /* Once the Guest is initialized, we hold the "struct lguest" in the 320 - * file private data. */ 291 + /* 292 + * Once the Guest is initialized, we hold the "struct lguest" in the 293 + * file private data. 
294 + */ 321 295 struct lguest *lg = file->private_data; 322 296 const unsigned long __user *input = (const unsigned long __user *)in; 323 297 unsigned long req; ··· 355 323 } 356 324 } 357 325 358 - /*L:060 The final piece of interface code is the close() routine. It reverses 326 + /*L:060 327 + * The final piece of interface code is the close() routine. It reverses 359 328 * everything done in initialize(). This is usually called because the 360 329 * Launcher exited. 361 330 * 362 331 * Note that the close routine returns 0 or a negative error number: it can't 363 332 * really fail, but it can whine. I blame Sun for this wart, and K&R C for 364 - * letting them do it. :*/ 333 + * letting them do it. 334 + :*/ 365 335 static int close(struct inode *inode, struct file *file) 366 336 { 367 337 struct lguest *lg = file->private_data; ··· 373 339 if (!lg) 374 340 return 0; 375 341 376 - /* We need the big lock, to protect from inter-guest I/O and other 377 - * Launchers initializing guests. */ 342 + /* 343 + * We need the big lock, to protect from inter-guest I/O and other 344 + * Launchers initializing guests. 345 + */ 378 346 mutex_lock(&lguest_lock); 379 347 380 348 /* Free up the shadow page tables for the Guest. */ ··· 387 351 hrtimer_cancel(&lg->cpus[i].hrt); 388 352 /* We can free up the register page we allocated. */ 389 353 free_page(lg->cpus[i].regs_page); 390 - /* Now all the memory cleanups are done, it's safe to release 391 - * the Launcher's memory management structure. */ 354 + /* 355 + * Now all the memory cleanups are done, it's safe to release 356 + * the Launcher's memory management structure. 357 + */ 392 358 mmput(lg->cpus[i].mm); 393 359 } 394 360 ··· 399 361 eventfd_ctx_put(lg->eventfds->map[i].event); 400 362 kfree(lg->eventfds); 401 363 402 - /* If lg->dead doesn't contain an error code it will be NULL or a 403 - * kmalloc()ed string, either of which is ok to hand to kfree(). 
*/ 364 + /* 365 + * If lg->dead doesn't contain an error code it will be NULL or a 366 + * kmalloc()ed string, either of which is ok to hand to kfree(). 367 + */ 404 368 if (!IS_ERR(lg->dead)) 405 369 kfree(lg->dead); 406 370 /* Free the memory allocated to the lguest_struct */ ··· 426 386 * 427 387 * We begin our understanding with the Host kernel interface which the Launcher 428 388 * uses: reading and writing a character device called /dev/lguest. All the 429 - * work happens in the read(), write() and close() routines: */ 389 + * work happens in the read(), write() and close() routines: 390 + */ 430 391 static struct file_operations lguest_fops = { 431 392 .owner = THIS_MODULE, 432 393 .release = close, ··· 435 394 .read = read, 436 395 }; 437 396 438 - /* This is a textbook example of a "misc" character device. Populate a "struct 439 - * miscdevice" and register it with misc_register(). */ 397 + /* 398 + * This is a textbook example of a "misc" character device. Populate a "struct 399 + * miscdevice" and register it with misc_register(). 400 + */ 440 401 static struct miscdevice lguest_dev = { 441 402 .minor = MISC_DYNAMIC_MINOR, 442 403 .name = "lguest",
+282 -145
drivers/lguest/page_tables.c
··· 1 - /*P:700 The pagetable code, on the other hand, still shows the scars of 1 + /*P:700 2 + * The pagetable code, on the other hand, still shows the scars of 2 3 * previous encounters. It's functional, and as neat as it can be in the 3 4 * circumstances, but be wary, for these things are subtle and break easily. 4 5 * The Guest provides a virtual to physical mapping, but we can neither trust 5 6 * it nor use it: we verify and convert it here then point the CPU to the 6 - * converted Guest pages when running the Guest. :*/ 7 + * converted Guest pages when running the Guest. 8 + :*/ 7 9 8 10 /* Copyright (C) Rusty Russell IBM Corporation 2006. 9 11 * GPL v2 and any later version */ ··· 19 17 #include <asm/bootparam.h> 20 18 #include "lg.h" 21 19 22 - /*M:008 We hold reference to pages, which prevents them from being swapped. 20 + /*M:008 21 + * We hold reference to pages, which prevents them from being swapped. 23 22 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants 24 23 * to swap out. If we had this, and a shrinker callback to trim PTE pages, we 25 - * could probably consider launching Guests as non-root. :*/ 24 + * could probably consider launching Guests as non-root. 25 + :*/ 26 26 27 27 /*H:300 28 28 * The Page Table Code ··· 49 45 * (v) Flushing (throwing away) page tables, 50 46 * (vi) Mapping the Switcher when the Guest is about to run, 51 47 * (vii) Setting up the page tables initially. 52 - :*/ 48 + :*/ 53 49 54 - 55 - /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 50 + /* 51 + * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 56 52 * conveniently placed at the top 4MB, so it uses a separate, complete PTE 57 - * page. */ 53 + * page. 54 + */ 58 55 #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 59 56 60 - /* For PAE we need the PMD index as well. We use the last 2MB, so we 61 - * will need the last pmd entry of the last pmd page. 
*/ 57 + /* 58 + * For PAE we need the PMD index as well. We use the last 2MB, so we 59 + * will need the last pmd entry of the last pmd page. 60 + */ 62 61 #ifdef CONFIG_X86_PAE 63 62 #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) 64 63 #define RESERVE_MEM 2U ··· 71 64 #define CHECK_GPGD_MASK _PAGE_TABLE 72 65 #endif 73 66 74 - /* We actually need a separate PTE page for each CPU. Remember that after the 67 + /* 68 + * We actually need a separate PTE page for each CPU. Remember that after the 75 69 * Switcher code itself comes two pages for each CPU, and we don't want this 76 - * CPU's guest to see the pages of any other CPU. */ 70 + * CPU's guest to see the pages of any other CPU. 71 + */ 77 72 static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); 78 73 #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 79 74 80 - /*H:320 The page table code is curly enough to need helper functions to keep it 75 + /*H:320 76 + * The page table code is curly enough to need helper functions to keep it 81 77 * clear and clean. 82 78 * 83 79 * There are two functions which return pointers to the shadow (aka "real") ··· 89 79 * spgd_addr() takes the virtual address and returns a pointer to the top-level 90 80 * page directory entry (PGD) for that address. Since we keep track of several 91 81 * page tables, the "i" argument tells us which one we're interested in (it's 92 - * usually the current one). */ 82 + * usually the current one). 83 + */ 93 84 static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) 94 85 { 95 86 unsigned int index = pgd_index(vaddr); ··· 107 96 } 108 97 109 98 #ifdef CONFIG_X86_PAE 110 - /* This routine then takes the PGD entry given above, which contains the 99 + /* 100 + * This routine then takes the PGD entry given above, which contains the 111 101 * address of the PMD page. It then returns a pointer to the PMD entry for the 112 - * given address. */ 102 + * given address. 
103 + */ 113 104 static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 114 105 { 115 106 unsigned int index = pmd_index(vaddr); ··· 132 119 } 133 120 #endif 134 121 135 - /* This routine then takes the page directory entry returned above, which 122 + /* 123 + * This routine then takes the page directory entry returned above, which 136 124 * contains the address of the page table entry (PTE) page. It then returns a 137 - * pointer to the PTE entry for the given address. */ 125 + * pointer to the PTE entry for the given address. 126 + */ 138 127 static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 139 128 { 140 129 #ifdef CONFIG_X86_PAE ··· 154 139 return &page[pte_index(vaddr)]; 155 140 } 156 141 157 - /* These two functions just like the above two, except they access the Guest 158 - * page tables. Hence they return a Guest address. */ 142 + /* 143 + * These two functions just like the above two, except they access the Guest 144 + * page tables. Hence they return a Guest address. 145 + */ 159 146 static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) 160 147 { 161 148 unsigned int index = vaddr >> (PGDIR_SHIFT); ··· 192 175 #endif 193 176 /*:*/ 194 177 195 - /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as 196 - * an optimization (ie. pre-faulting). :*/ 178 + /*M:014 179 + * get_pfn is slow: we could probably try to grab batches of pages here as 180 + * an optimization (ie. pre-faulting). 181 + :*/ 197 182 198 - /*H:350 This routine takes a page number given by the Guest and converts it to 183 + /*H:350 184 + * This routine takes a page number given by the Guest and converts it to 199 185 * an actual, physical page number. 
It can fail for several reasons: the 200 186 * virtual address might not be mapped by the Launcher, the write flag is set 201 187 * and the page is read-only, or the write flag was set and the page was 202 188 * shared so had to be copied, but we ran out of memory. 203 189 * 204 190 * This holds a reference to the page, so release_pte() is careful to put that 205 - * back. */ 191 + * back. 192 + */ 206 193 static unsigned long get_pfn(unsigned long virtpfn, int write) 207 194 { 208 195 struct page *page; ··· 219 198 return -1UL; 220 199 } 221 200 222 - /*H:340 Converting a Guest page table entry to a shadow (ie. real) page table 201 + /*H:340 202 + * Converting a Guest page table entry to a shadow (ie. real) page table 223 203 * entry can be a little tricky. The flags are (almost) the same, but the 224 204 * Guest PTE contains a virtual page number: the CPU needs the real page 225 - * number. */ 205 + * number. 206 + */ 226 207 static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) 227 208 { 228 209 unsigned long pfn, base, flags; 229 210 230 - /* The Guest sets the global flag, because it thinks that it is using 211 + /* 212 + * The Guest sets the global flag, because it thinks that it is using 231 213 * PGE. We only told it to use PGE so it would tell us whether it was 232 214 * flushing a kernel mapping or a userspace mapping. We don't actually 233 - * use the global bit, so throw it away. */ 215 + * use the global bit, so throw it away. 216 + */ 234 217 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); 235 218 236 219 /* The Guest's pages are offset inside the Launcher. */ 237 220 base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; 238 221 239 - /* We need a temporary "unsigned long" variable to hold the answer from 222 + /* 223 + * We need a temporary "unsigned long" variable to hold the answer from 240 224 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 241 225 * fit in spte.pfn. 
get_pfn() finds the real physical number of the 242 - * page, given the virtual number. */ 226 + * page, given the virtual number. 227 + */ 243 228 pfn = get_pfn(base + pte_pfn(gpte), write); 244 229 if (pfn == -1UL) { 245 230 kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); 246 - /* When we destroy the Guest, we'll go through the shadow page 231 + /* 232 + * When we destroy the Guest, we'll go through the shadow page 247 233 * tables and release_pte() them. Make sure we don't think 248 - * this one is valid! */ 234 + * this one is valid! 235 + */ 249 236 flags = 0; 250 237 } 251 238 /* Now we assemble our shadow PTE from the page number and flags. */ ··· 263 234 /*H:460 And to complete the chain, release_pte() looks like this: */ 264 235 static void release_pte(pte_t pte) 265 236 { 266 - /* Remember that get_user_pages_fast() took a reference to the page, in 267 - * get_pfn()? We have to put it back now. */ 237 + /* 238 + * Remember that get_user_pages_fast() took a reference to the page, in 239 + * get_pfn()? We have to put it back now. 240 + */ 268 241 if (pte_flags(pte) & _PAGE_PRESENT) 269 242 put_page(pte_page(pte)); 270 243 } ··· 304 273 * and return to the Guest without it knowing. 305 274 * 306 275 * If we fixed up the fault (ie. we mapped the address), this routine returns 307 - * true. Otherwise, it was a real fault and we need to tell the Guest. */ 276 + * true. Otherwise, it was a real fault and we need to tell the Guest. 277 + */ 308 278 bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) 309 279 { 310 280 pgd_t gpgd; ··· 330 298 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 331 299 /* No shadow entry: allocate a new shadow PTE page. */ 332 300 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 333 - /* This is not really the Guest's fault, but killing it is 334 - * simple for this corner case. */ 301 + /* 302 + * This is not really the Guest's fault, but killing it is 303 + * simple for this corner case. 
304 + */ 335 305 if (!ptepage) { 336 306 kill_guest(cpu, "out of memory allocating pte page"); 337 307 return false; 338 308 } 339 309 /* We check that the Guest pgd is OK. */ 340 310 check_gpgd(cpu, gpgd); 341 - /* And we copy the flags to the shadow PGD entry. The page 342 - * number in the shadow PGD is the page we just allocated. */ 311 + /* 312 + * And we copy the flags to the shadow PGD entry. The page 313 + * number in the shadow PGD is the page we just allocated. 314 + */ 343 315 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); 344 316 } 345 317 346 318 #ifdef CONFIG_X86_PAE 347 319 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); 348 - /* middle level not present? We can't map it in. */ 320 + /* Middle level not present? We can't map it in. */ 349 321 if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) 350 322 return false; 351 323 ··· 360 324 /* No shadow entry: allocate a new shadow PTE page. */ 361 325 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 362 326 363 - /* This is not really the Guest's fault, but killing it is 364 - * simple for this corner case. */ 327 + /* 328 + * This is not really the Guest's fault, but killing it is 329 + * simple for this corner case. 330 + */ 365 331 if (!ptepage) { 366 332 kill_guest(cpu, "out of memory allocating pte page"); 367 333 return false; ··· 372 334 /* We check that the Guest pmd is OK. */ 373 335 check_gpmd(cpu, gpmd); 374 336 375 - /* And we copy the flags to the shadow PMD entry. The page 376 - * number in the shadow PMD is the page we just allocated. */ 337 + /* 338 + * And we copy the flags to the shadow PMD entry. The page 339 + * number in the shadow PMD is the page we just allocated. 340 + */ 377 341 native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); 378 342 } 379 343 380 - /* OK, now we look at the lower level in the Guest page table: keep its 381 - * address, because we might update it later. 
*/ 344 + /* 345 + * OK, now we look at the lower level in the Guest page table: keep its 346 + * address, because we might update it later. 347 + */ 382 348 gpte_ptr = gpte_addr(cpu, gpmd, vaddr); 383 349 #else 384 - /* OK, now we look at the lower level in the Guest page table: keep its 385 - * address, because we might update it later. */ 350 + /* 351 + * OK, now we look at the lower level in the Guest page table: keep its 352 + * address, because we might update it later. 353 + */ 386 354 gpte_ptr = gpte_addr(cpu, gpgd, vaddr); 387 355 #endif 388 356 gpte = lgread(cpu, gpte_ptr, pte_t); ··· 397 353 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 398 354 return false; 399 355 400 - /* Check they're not trying to write to a page the Guest wants 401 - * read-only (bit 2 of errcode == write). */ 356 + /* 357 + * Check they're not trying to write to a page the Guest wants 358 + * read-only (bit 2 of errcode == write). 359 + */ 402 360 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 403 361 return false; 404 362 ··· 408 362 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 409 363 return false; 410 364 411 - /* Check that the Guest PTE flags are OK, and the page number is below 412 - * the pfn_limit (ie. not mapping the Launcher binary). */ 365 + /* 366 + * Check that the Guest PTE flags are OK, and the page number is below 367 + * the pfn_limit (ie. not mapping the Launcher binary). 368 + */ 413 369 check_gpte(cpu, gpte); 414 370 415 371 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ ··· 421 373 422 374 /* Get the pointer to the shadow PTE entry we're going to set. */ 423 375 spte = spte_addr(cpu, *spgd, vaddr); 424 - /* If there was a valid shadow PTE entry here before, we release it. 425 - * This can happen with a write to a previously read-only entry. */ 376 + 377 + /* 378 + * If there was a valid shadow PTE entry here before, we release it. 379 + * This can happen with a write to a previously read-only entry. 
380 + */ 426 381 release_pte(*spte); 427 382 428 - /* If this is a write, we insist that the Guest page is writable (the 429 - * final arg to gpte_to_spte()). */ 383 + /* 384 + * If this is a write, we insist that the Guest page is writable (the 385 + * final arg to gpte_to_spte()). 386 + */ 430 387 if (pte_dirty(gpte)) 431 388 *spte = gpte_to_spte(cpu, gpte, 1); 432 389 else 433 - /* If this is a read, don't set the "writable" bit in the page 390 + /* 391 + * If this is a read, don't set the "writable" bit in the page 434 392 * table entry, even if the Guest says it's writable. That way 435 393 * we will come back here when a write does actually occur, so 436 - * we can update the Guest's _PAGE_DIRTY flag. */ 394 + * we can update the Guest's _PAGE_DIRTY flag. 395 + */ 437 396 native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); 438 397 439 - /* Finally, we write the Guest PTE entry back: we've set the 440 - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 398 + /* 399 + * Finally, we write the Guest PTE entry back: we've set the 400 + * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. 401 + */ 441 402 lgwrite(cpu, gpte_ptr, pte_t, gpte); 442 403 443 - /* The fault is fixed, the page table is populated, the mapping 404 + /* 405 + * The fault is fixed, the page table is populated, the mapping 444 406 * manipulated, the result returned and the code complete. A small 445 407 * delay and a trace of alliteration are the only indications the Guest 446 - * has that a page fault occurred at all. */ 408 + * has that a page fault occurred at all. 409 + */ 447 410 return true; 448 411 } 449 412 ··· 467 408 * mapped, so it's overkill. 468 409 * 469 410 * This is a quick version which answers the question: is this virtual address 470 - * mapped by the shadow page tables, and is it writable? */ 411 + * mapped by the shadow page tables, and is it writable? 
412 + */ 471 413 static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) 472 414 { 473 415 pgd_t *spgd; ··· 488 428 return false; 489 429 #endif 490 430 491 - /* Check the flags on the pte entry itself: it must be present and 492 - * writable. */ 431 + /* 432 + * Check the flags on the pte entry itself: it must be present and 433 + * writable. 434 + */ 493 435 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); 494 436 495 437 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 496 438 } 497 439 498 - /* So, when pin_stack_pages() asks us to pin a page, we check if it's already 440 + /* 441 + * So, when pin_stack_pages() asks us to pin a page, we check if it's already 499 442 * in the page tables, and if not, we call demand_page() with error code 2 500 - * (meaning "write"). */ 443 + * (meaning "write"). 444 + */ 501 445 void pin_page(struct lg_cpu *cpu, unsigned long vaddr) 502 446 { 503 447 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) ··· 549 485 /* If the entry's not present, there's nothing to release. */ 550 486 if (pgd_flags(*spgd) & _PAGE_PRESENT) { 551 487 unsigned int i; 552 - /* Converting the pfn to find the actual PTE page is easy: turn 488 + /* 489 + * Converting the pfn to find the actual PTE page is easy: turn 553 490 * the page number into a physical address, then convert to a 554 - * virtual address (easy for kernel pages like this one). */ 491 + * virtual address (easy for kernel pages like this one). 492 + */ 555 493 pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 556 494 /* For each entry in the page, we might need to release it. */ 557 495 for (i = 0; i < PTRS_PER_PTE; i++) ··· 565 499 } 566 500 } 567 501 #endif 568 - /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() 502 + 503 + /*H:445 504 + * We saw flush_user_mappings() twice: once from the flush_user_mappings() 569 505 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. 
570 - * It simply releases every PTE page from 0 up to the Guest's kernel address. */ 506 + * It simply releases every PTE page from 0 up to the Guest's kernel address. 507 + */ 571 508 static void flush_user_mappings(struct lguest *lg, int idx) 572 509 { 573 510 unsigned int i; ··· 579 510 release_pgd(lg->pgdirs[idx].pgdir + i); 580 511 } 581 512 582 - /*H:440 (v) Flushing (throwing away) page tables, 513 + /*H:440 514 + * (v) Flushing (throwing away) page tables, 583 515 * 584 516 * The Guest has a hypercall to throw away the page tables: it's used when a 585 - * large number of mappings have been changed. */ 517 + * large number of mappings have been changed. 518 + */ 586 519 void guest_pagetable_flush_user(struct lg_cpu *cpu) 587 520 { 588 521 /* Drop the userspace part of the current page table. */ ··· 622 551 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 623 552 } 624 553 625 - /* We keep several page tables. This is a simple routine to find the page 554 + /* 555 + * We keep several page tables. This is a simple routine to find the page 626 556 * table (if any) corresponding to this top-level address the Guest has given 627 - * us. */ 557 + * us. 558 + */ 628 559 static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) 629 560 { 630 561 unsigned int i; ··· 636 563 return i; 637 564 } 638 565 639 - /*H:435 And this is us, creating the new page directory. If we really do 566 + /*H:435 567 + * And this is us, creating the new page directory. If we really do 640 568 * allocate a new one (and so the kernel parts are not there), we set 641 - * blank_pgdir. */ 569 + * blank_pgdir. 570 + */ 642 571 static unsigned int new_pgdir(struct lg_cpu *cpu, 643 572 unsigned long gpgdir, 644 573 int *blank_pgdir) ··· 650 575 pmd_t *pmd_table; 651 576 #endif 652 577 653 - /* We pick one entry at random to throw out. Choosing the Least 654 - * Recently Used might be better, but this is easy. */ 578 + /* 579 + * We pick one entry at random to throw out. 
Choosing the Least 580 + * Recently Used might be better, but this is easy. 581 + */ 655 582 next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); 656 583 /* If it's never been allocated at all before, try now. */ 657 584 if (!cpu->lg->pgdirs[next].pgdir) { ··· 664 587 next = cpu->cpu_pgd; 665 588 else { 666 589 #ifdef CONFIG_X86_PAE 667 - /* In PAE mode, allocate a pmd page and populate the 668 - * last pgd entry. */ 590 + /* 591 + * In PAE mode, allocate a pmd page and populate the 592 + * last pgd entry. 593 + */ 669 594 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); 670 595 if (!pmd_table) { 671 596 free_page((long)cpu->lg->pgdirs[next].pgdir); ··· 677 598 set_pgd(cpu->lg->pgdirs[next].pgdir + 678 599 SWITCHER_PGD_INDEX, 679 600 __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 680 - /* This is a blank page, so there are no kernel 681 - * mappings: caller must map the stack! */ 601 + /* 602 + * This is a blank page, so there are no kernel 603 + * mappings: caller must map the stack! 604 + */ 682 605 *blank_pgdir = 1; 683 606 } 684 607 #else ··· 696 615 return next; 697 616 } 698 617 699 - /*H:430 (iv) Switching page tables 618 + /*H:430 619 + * (iv) Switching page tables 700 620 * 701 621 * Now we've seen all the page table setting and manipulation, let's see 702 622 * what happens when the Guest changes page tables (ie. changes the top-level 703 - * pgdir). This occurs on almost every context switch. */ 623 + * pgdir). This occurs on almost every context switch. 624 + */ 704 625 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) 705 626 { 706 627 int newpgdir, repin = 0; 707 628 708 629 /* Look to see if we have this one already. */ 709 630 newpgdir = find_pgdir(cpu->lg, pgtable); 710 - /* If not, we allocate or mug an existing one: if it's a fresh one, 711 - * repin gets set to 1. */ 631 + /* 632 + * If not, we allocate or mug an existing one: if it's a fresh one, 633 + * repin gets set to 1. 
634 + */ 712 635 if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) 713 636 newpgdir = new_pgdir(cpu, pgtable, &repin); 714 637 /* Change the current pgd index to the new one. */ ··· 722 637 pin_stack_pages(cpu); 723 638 } 724 639 725 - /*H:470 Finally, a routine which throws away everything: all PGD entries in all 640 + /*H:470 641 + * Finally, a routine which throws away everything: all PGD entries in all 726 642 * the shadow page tables, including the Guest's kernel mappings. This is used 727 - * when we destroy the Guest. */ 643 + * when we destroy the Guest. 644 + */ 728 645 static void release_all_pagetables(struct lguest *lg) 729 646 { 730 647 unsigned int i, j; ··· 743 656 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; 744 657 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 745 658 746 - /* And release the pmd entries of that pmd page, 747 - * except for the switcher pmd. */ 659 + /* 660 + * And release the pmd entries of that pmd page, 661 + * except for the switcher pmd. 662 + */ 748 663 for (k = 0; k < SWITCHER_PMD_INDEX; k++) 749 664 release_pmd(&pmdpage[k]); 750 665 #endif ··· 756 667 } 757 668 } 758 669 759 - /* We also throw away everything when a Guest tells us it's changed a kernel 670 + /* 671 + * We also throw away everything when a Guest tells us it's changed a kernel 760 672 * mapping. Since kernel mappings are in every page table, it's easiest to 761 673 * throw them all away. This traps the Guest in amber for a while as 762 - * everything faults back in, but it's rare. */ 674 + * everything faults back in, but it's rare. 675 + */ 763 676 void guest_pagetable_clear_all(struct lg_cpu *cpu) 764 677 { 765 678 release_all_pagetables(cpu->lg); ··· 769 678 pin_stack_pages(cpu); 770 679 } 771 680 /*:*/ 772 - /*M:009 Since we throw away all mappings when a kernel mapping changes, our 681 + 682 + /*M:009 683 + * Since we throw away all mappings when a kernel mapping changes, our 773 684 * performance sucks for guests using highmem. 
In fact, a guest with 774 685 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is 775 686 * usually slower than a Guest with less memory. 776 687 * 777 688 * This, of course, cannot be fixed. It would take some kind of... well, I 778 - * don't know, but the term "puissant code-fu" comes to mind. :*/ 689 + * don't know, but the term "puissant code-fu" comes to mind. 690 + :*/ 779 691 780 - /*H:420 This is the routine which actually sets the page table entry for then 692 + /*H:420 693 + * This is the routine which actually sets the page table entry for then 781 694 * "idx"'th shadow page table. 782 695 * 783 696 * Normally, we can just throw out the old entry and replace it with 0: if they ··· 810 715 spmd = spmd_addr(cpu, *spgd, vaddr); 811 716 if (pmd_flags(*spmd) & _PAGE_PRESENT) { 812 717 #endif 813 - /* Otherwise, we start by releasing 814 - * the existing entry. */ 718 + /* Otherwise, start by releasing the existing entry. */ 815 719 pte_t *spte = spte_addr(cpu, *spgd, vaddr); 816 720 release_pte(*spte); 817 721 818 - /* If they're setting this entry as dirty or accessed, 819 - * we might as well put that entry they've given us 820 - * in now. This shaves 10% off a 821 - * copy-on-write micro-benchmark. */ 722 + /* 723 + * If they're setting this entry as dirty or accessed, 724 + * we might as well put that entry they've given us in 725 + * now. This shaves 10% off a copy-on-write 726 + * micro-benchmark. 727 + */ 822 728 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 823 729 check_gpte(cpu, gpte); 824 730 native_set_pte(spte, 825 731 gpte_to_spte(cpu, gpte, 826 732 pte_flags(gpte) & _PAGE_DIRTY)); 827 - } else 828 - /* Otherwise kill it and we can demand_page() 829 - * it in later. */ 733 + } else { 734 + /* 735 + * Otherwise kill it and we can demand_page() 736 + * it in later. 
737 + */ 830 738 native_set_pte(spte, __pte(0)); 739 + } 831 740 #ifdef CONFIG_X86_PAE 832 741 } 833 742 #endif 834 743 } 835 744 } 836 745 837 - /*H:410 Updating a PTE entry is a little trickier. 746 + /*H:410 747 + * Updating a PTE entry is a little trickier. 838 748 * 839 749 * We keep track of several different page tables (the Guest uses one for each 840 750 * process, so it makes sense to cache at least a few). Each of these have ··· 848 748 * all the page tables, not just the current one. This is rare. 849 749 * 850 750 * The benefit is that when we have to track a new page table, we can keep all 851 - * the kernel mappings. This speeds up context switch immensely. */ 751 + * the kernel mappings. This speeds up context switch immensely. 752 + */ 852 753 void guest_set_pte(struct lg_cpu *cpu, 853 754 unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 854 755 { 855 - /* Kernel mappings must be changed on all top levels. Slow, but doesn't 856 - * happen often. */ 756 + /* 757 + * Kernel mappings must be changed on all top levels. Slow, but doesn't 758 + * happen often. 759 + */ 857 760 if (vaddr >= cpu->lg->kernel_address) { 858 761 unsigned int i; 859 762 for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) ··· 905 802 } 906 803 #endif 907 804 908 - /* Once we know how much memory we have we can construct simple identity 909 - * (which set virtual == physical) and linear mappings 910 - * which will get the Guest far enough into the boot to create its own. 805 + /* 806 + * Once we know how much memory we have we can construct simple identity (which 807 + * set virtual == physical) and linear mappings which will get the Guest far 808 + * enough into the boot to create its own. 911 809 * 912 810 * We lay them out of the way, just below the initrd (which is why we need to 913 - * know its size here). */ 811 + * know its size here). 
812 + */ 914 813 static unsigned long setup_pagetables(struct lguest *lg, 915 814 unsigned long mem, 916 815 unsigned long initrd_size) ··· 930 825 unsigned int phys_linear; 931 826 #endif 932 827 933 - /* We have mapped_pages frames to map, so we need 934 - * linear_pages page tables to map them. */ 828 + /* 829 + * We have mapped_pages frames to map, so we need linear_pages page 830 + * tables to map them. 831 + */ 935 832 mapped_pages = mem / PAGE_SIZE; 936 833 linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; 937 834 ··· 946 839 #ifdef CONFIG_X86_PAE 947 840 pmds = (void *)linear - PAGE_SIZE; 948 841 #endif 949 - /* Linear mapping is easy: put every page's address into the 950 - * mapping in order. */ 842 + /* 843 + * Linear mapping is easy: put every page's address into the 844 + * mapping in order. 845 + */ 951 846 for (i = 0; i < mapped_pages; i++) { 952 847 pte_t pte; 953 848 pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); ··· 957 848 return -EFAULT; 958 849 } 959 850 960 - /* The top level points to the linear page table pages above. 961 - * We setup the identity and linear mappings here. */ 851 + /* 852 + * The top level points to the linear page table pages above. 853 + * We setup the identity and linear mappings here. 854 + */ 962 855 #ifdef CONFIG_X86_PAE 963 856 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; 964 857 i += PTRS_PER_PTE, j++) { ··· 991 880 } 992 881 #endif 993 882 994 - /* We return the top level (guest-physical) address: remember where 995 - * this is. */ 883 + /* 884 + * We return the top level (guest-physical) address: remember where 885 + * this is. 886 + */ 996 887 return (unsigned long)pgdir - mem_base; 997 888 } 998 889 999 - /*H:500 (vii) Setting up the page tables initially. 890 + /*H:500 891 + * (vii) Setting up the page tables initially. 1000 892 * 1001 893 * When a Guest is first created, the Launcher tells us where the toplevel of 1002 - * its first page table is. 
We set some things up here: */ 894 + * its first page table is. We set some things up here: 895 + */ 1003 896 int init_guest_pagetable(struct lguest *lg) 1004 897 { 1005 898 u64 mem; ··· 1013 898 pgd_t *pgd; 1014 899 pmd_t *pmd_table; 1015 900 #endif 1016 - /* Get the Guest memory size and the ramdisk size from the boot header 1017 - * located at lg->mem_base (Guest address 0). */ 901 + /* 902 + * Get the Guest memory size and the ramdisk size from the boot header 903 + * located at lg->mem_base (Guest address 0). 904 + */ 1018 905 if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) 1019 906 || get_user(initrd_size, &boot->hdr.ramdisk_size)) 1020 907 return -EFAULT; 1021 908 1022 - /* We start on the first shadow page table, and give it a blank PGD 1023 - * page. */ 909 + /* 910 + * We start on the first shadow page table, and give it a blank PGD 911 + * page. 912 + */ 1024 913 lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); 1025 914 if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) 1026 915 return lg->pgdirs[0].gpgdir; ··· 1050 931 /* We get the kernel address: above this is all kernel memory. */ 1051 932 if (get_user(cpu->lg->kernel_address, 1052 933 &cpu->lg->lguest_data->kernel_address) 1053 - /* We tell the Guest that it can't use the top 2 or 4 MB 1054 - * of virtual addresses used by the Switcher. */ 934 + /* 935 + * We tell the Guest that it can't use the top 2 or 4 MB 936 + * of virtual addresses used by the Switcher. 937 + */ 1055 938 || put_user(RESERVE_MEM * 1024 * 1024, 1056 939 &cpu->lg->lguest_data->reserve_mem) 1057 940 || put_user(cpu->lg->pgdirs[0].gpgdir, 1058 941 &cpu->lg->lguest_data->pgdir)) 1059 942 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 1060 943 1061 - /* In flush_user_mappings() we loop from 0 to 944 + /* 945 + * In flush_user_mappings() we loop from 0 to 1062 946 * "pgd_index(lg->kernel_address)". This assumes it won't hit the 1063 - * Switcher mappings, so check that now. 
*/ 947 + * Switcher mappings, so check that now. 948 + */ 1064 949 #ifdef CONFIG_X86_PAE 1065 950 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && 1066 951 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) ··· 1087 964 free_page((long)lg->pgdirs[i].pgdir); 1088 965 } 1089 966 1090 - /*H:480 (vi) Mapping the Switcher when the Guest is about to run. 967 + /*H:480 968 + * (vi) Mapping the Switcher when the Guest is about to run. 1091 969 * 1092 970 * The Switcher and the two pages for this CPU need to be visible in the 1093 971 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages 1094 972 * for each CPU already set up, we just need to hook them in now we know which 1095 - * Guest is about to run on this CPU. */ 973 + * Guest is about to run on this CPU. 974 + */ 1096 975 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) 1097 976 { 1098 977 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); ··· 1115 990 #else 1116 991 pgd_t switcher_pgd; 1117 992 1118 - /* Make the last PGD entry for this Guest point to the Switcher's PTE 1119 - * page for this CPU (with appropriate flags). */ 993 + /* 994 + * Make the last PGD entry for this Guest point to the Switcher's PTE 995 + * page for this CPU (with appropriate flags). 996 + */ 1120 997 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); 1121 998 1122 999 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 1123 1000 1124 1001 #endif 1125 - /* We also change the Switcher PTE page. When we're running the Guest, 1002 + /* 1003 + * We also change the Switcher PTE page. When we're running the Guest, 1126 1004 * we want the Guest's "regs" page to appear where the first Switcher 1127 1005 * page for this CPU is. 
This is an optimization: when the Switcher 1128 1006 * saves the Guest registers, it saves them into the first page of this 1129 1007 * CPU's "struct lguest_pages": if we make sure the Guest's register 1130 1008 * page is already mapped there, we don't have to copy them out 1131 - * again. */ 1009 + * again. 1010 + */ 1132 1011 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; 1133 1012 native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL)); 1134 1013 native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], ··· 1148 1019 free_page((long)switcher_pte_page(i)); 1149 1020 } 1150 1021 1151 - /*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given 1022 + /*H:520 1023 + * Setting up the Switcher PTE page for given CPU is fairly easy, given 1152 1024 * the CPU number and the "struct page"s for the Switcher code itself. 1153 1025 * 1154 - * Currently the Switcher is less than a page long, so "pages" is always 1. */ 1026 + * Currently the Switcher is less than a page long, so "pages" is always 1. 1027 + */ 1155 1028 static __init void populate_switcher_pte_page(unsigned int cpu, 1156 1029 struct page *switcher_page[], 1157 1030 unsigned int pages) ··· 1174 1043 native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), 1175 1044 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); 1176 1045 1177 - /* The second page contains the "struct lguest_ro_state", and is 1178 - * read-only. */ 1046 + /* 1047 + * The second page contains the "struct lguest_ro_state", and is 1048 + * read-only. 1049 + */ 1179 1050 native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), 1180 1051 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); 1181 1052 } 1182 1053 1183 - /* We've made it through the page table code. Perhaps our tired brains are 1054 + /* 1055 + * We've made it through the page table code. Perhaps our tired brains are 1184 1056 * still processing the details, or perhaps we're simply glad it's over. 
1185 1057 * 1186 1058 * If nothing else, note that all this complexity in juggling shadow page tables ··· 1192 1058 * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD 1193 1059 * have implemented shadow page table support directly into hardware. 1194 1060 * 1195 - * There is just one file remaining in the Host. */ 1061 + * There is just one file remaining in the Host. 1062 + */ 1196 1063 1197 - /*H:510 At boot or module load time, init_pagetables() allocates and populates 1198 - * the Switcher PTE page for each CPU. */ 1064 + /*H:510 1065 + * At boot or module load time, init_pagetables() allocates and populates 1066 + * the Switcher PTE page for each CPU. 1067 + */ 1199 1068 __init int init_pagetables(struct page **switcher_page, unsigned int pages) 1200 1069 { 1201 1070 unsigned int i;
+69 -37
drivers/lguest/segments.c
··· 1 - /*P:600 The x86 architecture has segments, which involve a table of descriptors 1 + /*P:600 2 + * The x86 architecture has segments, which involve a table of descriptors 2 3 * which can be used to do funky things with virtual address interpretation. 3 4 * We originally used to use segments so the Guest couldn't alter the 4 5 * Guest<->Host Switcher, and then we had to trim Guest segments, and restore ··· 9 8 * 10 9 * In these modern times, the segment handling code consists of simple sanity 11 10 * checks, and the worst you'll experience reading this code is butterfly-rash 12 - * from frolicking through its parklike serenity. :*/ 11 + * from frolicking through its parklike serenity. 12 + :*/ 13 13 #include "lg.h" 14 14 15 15 /*H:600 ··· 43 41 * begin. 44 42 */ 45 43 46 - /* There are several entries we don't let the Guest set. The TSS entry is the 44 + /* 45 + * There are several entries we don't let the Guest set. The TSS entry is the 47 46 * "Task State Segment" which controls all kinds of delicate things. The 48 47 * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the 49 - * the Guest can't be trusted to deal with double faults. */ 48 + * the Guest can't be trusted to deal with double faults. 49 + */ 50 50 static bool ignored_gdt(unsigned int num) 51 51 { 52 52 return (num == GDT_ENTRY_TSS ··· 57 53 || num == GDT_ENTRY_DOUBLEFAULT_TSS); 58 54 } 59 55 60 - /*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We 56 + /*H:630 57 + * Once the Guest gave us new GDT entries, we fix them up a little. We 61 58 * don't care if they're invalid: the worst that can happen is a General 62 59 * Protection Fault in the Switcher when it restores a Guest segment register 63 60 * which tries to use that entry. Then we kill the Guest for causing such a 64 - * mess: the message will be "unhandled trap 256". */ 61 + * mess: the message will be "unhandled trap 256". 
62 + */ 65 63 static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) 66 64 { 67 65 unsigned int i; 68 66 69 67 for (i = start; i < end; i++) { 70 - /* We never copy these ones to real GDT, so we don't care what 71 - * they say */ 68 + /* 69 + * We never copy these ones to real GDT, so we don't care what 70 + * they say 71 + */ 72 72 if (ignored_gdt(i)) 73 73 continue; 74 74 75 - /* Segment descriptors contain a privilege level: the Guest is 75 + /* 76 + * Segment descriptors contain a privilege level: the Guest is 76 77 * sometimes careless and leaves this as 0, even though it's 77 - * running at privilege level 1. If so, we fix it here. */ 78 + * running at privilege level 1. If so, we fix it here. 79 + */ 78 80 if ((cpu->arch.gdt[i].b & 0x00006000) == 0) 79 81 cpu->arch.gdt[i].b |= (GUEST_PL << 13); 80 82 81 - /* Each descriptor has an "accessed" bit. If we don't set it 83 + /* 84 + * Each descriptor has an "accessed" bit. If we don't set it 82 85 * now, the CPU will try to set it when the Guest first loads 83 86 * that entry into a segment register. But the GDT isn't 84 - * writable by the Guest, so bad things can happen. */ 87 + * writable by the Guest, so bad things can happen. 88 + */ 85 89 cpu->arch.gdt[i].b |= 0x00000100; 86 90 } 87 91 } 88 92 89 - /*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep 93 + /*H:610 94 + * Like the IDT, we never simply use the GDT the Guest gives us. We keep 90 95 * a GDT for each CPU, and copy across the Guest's entries each time we want to 91 96 * run the Guest on that CPU. 92 97 * 93 98 * This routine is called at boot or modprobe time for each CPU to set up the 94 99 * constant GDT entries: the ones which are the same no matter what Guest we're 95 - * running. */ 100 + * running. 
101 + */ 96 102 void setup_default_gdt_entries(struct lguest_ro_state *state) 97 103 { 98 104 struct desc_struct *gdt = state->guest_gdt; ··· 112 98 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 113 99 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 114 100 115 - /* The TSS segment refers to the TSS entry for this particular CPU. 101 + /* 102 + * The TSS segment refers to the TSS entry for this particular CPU. 116 103 * Forgive the magic flags: the 0x8900 means the entry is Present, it's 117 104 * privilege level 0 Available 386 TSS system segment, and the 0x67 118 - * means Saturn is eclipsed by Mercury in the twelfth house. */ 105 + * means Saturn is eclipsed by Mercury in the twelfth house. 106 + */ 119 107 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); 120 108 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) 121 109 | ((tss >> 16) & 0x000000FF); 122 110 } 123 111 124 - /* This routine sets up the initial Guest GDT for booting. All entries start 125 - * as 0 (unusable). */ 112 + /* 113 + * This routine sets up the initial Guest GDT for booting. All entries start 114 + * as 0 (unusable). 115 + */ 126 116 void setup_guest_gdt(struct lg_cpu *cpu) 127 117 { 128 - /* Start with full 0-4G segments... */ 118 + /* 119 + * Start with full 0-4G segments...except the Guest is allowed to use 120 + * them, so set the privilege level appropriately in the flags. 121 + */ 129 122 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 130 123 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 131 - /* ...except the Guest is allowed to use them, so set the privilege 132 - * level appropriately in the flags. */ 133 124 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); 134 125 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 135 126 } 136 127 137 - /*H:650 An optimization of copy_gdt(), for just the three "thead-local storage" 138 - * entries. */ 128 + /*H:650 129 + * An optimization of copy_gdt(), for just the three "thead-local storage" 130 + * entries. 
131 + */ 139 132 void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) 140 133 { 141 134 unsigned int i; ··· 151 130 gdt[i] = cpu->arch.gdt[i]; 152 131 } 153 132 154 - /*H:640 When the Guest is run on a different CPU, or the GDT entries have 155 - * changed, copy_gdt() is called to copy the Guest's GDT entries across to this 156 - * CPU's GDT. */ 133 + /*H:640 134 + * When the Guest is run on a different CPU, or the GDT entries have changed, 135 + * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's 136 + * GDT. 137 + */ 157 138 void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) 158 139 { 159 140 unsigned int i; 160 141 161 - /* The default entries from setup_default_gdt_entries() are not 162 - * replaced. See ignored_gdt() above. */ 142 + /* 143 + * The default entries from setup_default_gdt_entries() are not 144 + * replaced. See ignored_gdt() above. 145 + */ 163 146 for (i = 0; i < GDT_ENTRIES; i++) 164 147 if (!ignored_gdt(i)) 165 148 gdt[i] = cpu->arch.gdt[i]; 166 149 } 167 150 168 - /*H:620 This is where the Guest asks us to load a new GDT entry 169 - * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. */ 151 + /*H:620 152 + * This is where the Guest asks us to load a new GDT entry 153 + * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. 154 + */ 170 155 void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) 171 156 { 172 - /* We assume the Guest has the same number of GDT entries as the 173 - * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ 157 + /* 158 + * We assume the Guest has the same number of GDT entries as the 159 + * Host, otherwise we'd have to dynamically allocate the Guest GDT. 
160 + */ 174 161 if (num >= ARRAY_SIZE(cpu->arch.gdt)) 175 162 kill_guest(cpu, "too many gdt entries %i", num); 176 163 ··· 186 157 cpu->arch.gdt[num].a = lo; 187 158 cpu->arch.gdt[num].b = hi; 188 159 fixup_gdt_table(cpu, num, num+1); 189 - /* Mark that the GDT changed so the core knows it has to copy it again, 190 - * even if the Guest is run on the same CPU. */ 160 + /* 161 + * Mark that the GDT changed so the core knows it has to copy it again, 162 + * even if the Guest is run on the same CPU. 163 + */ 191 164 cpu->changed |= CHANGED_GDT; 192 165 } 193 166 194 - /* This is the fast-track version for just changing the three TLS entries. 167 + /* 168 + * This is the fast-track version for just changing the three TLS entries. 195 169 * Remember that this happens on every context switch, so it's worth 196 170 * optimizing. But wouldn't it be neater to have a single hypercall to cover 197 - * both cases? */ 171 + * both cases? 172 + */ 198 173 void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) 199 174 { 200 175 struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; ··· 208 175 /* Note that just the TLS entries have changed. */ 209 176 cpu->changed |= CHANGED_GDT_TLS; 210 177 } 211 - /*:*/ 212 178 213 179 /*H:660 214 180 * With this, we have finished the Host.
+245 -127
drivers/lguest/x86/core.c
··· 17 17 * along with this program; if not, write to the Free Software 18 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 19 */ 20 - /*P:450 This file contains the x86-specific lguest code. It used to be all 20 + /*P:450 21 + * This file contains the x86-specific lguest code. It used to be all 21 22 * mixed in with drivers/lguest/core.c but several foolhardy code slashers 22 23 * wrestled most of the dependencies out to here in preparation for porting 23 24 * lguest to other architectures (see what I mean by foolhardy?). 24 25 * 25 26 * This also contains a couple of non-obvious setup and teardown pieces which 26 - * were implemented after days of debugging pain. :*/ 27 + * were implemented after days of debugging pain. 28 + :*/ 27 29 #include <linux/kernel.h> 28 30 #include <linux/start_kernel.h> 29 31 #include <linux/string.h> ··· 84 82 */ 85 83 static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) 86 84 { 87 - /* Copying all this data can be quite expensive. We usually run the 85 + /* 86 + * Copying all this data can be quite expensive. We usually run the 88 87 * same Guest we ran last time (and that Guest hasn't run anywhere else 89 88 * meanwhile). If that's not the case, we pretend everything in the 90 - * Guest has changed. */ 89 + * Guest has changed. 90 + */ 91 91 if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { 92 92 __get_cpu_var(last_cpu) = cpu; 93 93 cpu->last_pages = pages; 94 94 cpu->changed = CHANGED_ALL; 95 95 } 96 96 97 - /* These copies are pretty cheap, so we do them unconditionally: */ 98 - /* Save the current Host top-level page directory. */ 97 + /* 98 + * These copies are pretty cheap, so we do them unconditionally: */ 99 + /* Save the current Host top-level page directory. 100 + */ 99 101 pages->state.host_cr3 = __pa(current->mm->pgd); 100 - /* Set up the Guest's page tables to see this CPU's pages (and no 101 - * other CPU's pages). 
*/ 102 + /* 103 + * Set up the Guest's page tables to see this CPU's pages (and no 104 + * other CPU's pages). 105 + */ 102 106 map_switcher_in_guest(cpu, pages); 103 - /* Set up the two "TSS" members which tell the CPU what stack to use 107 + /* 108 + * Set up the two "TSS" members which tell the CPU what stack to use 104 109 * for traps which do directly into the Guest (ie. traps at privilege 105 - * level 1). */ 110 + * level 1). 111 + */ 106 112 pages->state.guest_tss.sp1 = cpu->esp1; 107 113 pages->state.guest_tss.ss1 = cpu->ss1; 108 114 ··· 135 125 /* This is a dummy value we need for GCC's sake. */ 136 126 unsigned int clobber; 137 127 138 - /* Copy the guest-specific information into this CPU's "struct 139 - * lguest_pages". */ 128 + /* 129 + * Copy the guest-specific information into this CPU's "struct 130 + * lguest_pages". 131 + */ 140 132 copy_in_guest_info(cpu, pages); 141 133 142 - /* Set the trap number to 256 (impossible value). If we fault while 134 + /* 135 + * Set the trap number to 256 (impossible value). If we fault while 143 136 * switching to the Guest (bad segment registers or bug), this will 144 - * cause us to abort the Guest. */ 137 + * cause us to abort the Guest. 138 + */ 145 139 cpu->regs->trapnum = 256; 146 140 147 - /* Now: we push the "eflags" register on the stack, then do an "lcall". 141 + /* 142 + * Now: we push the "eflags" register on the stack, then do an "lcall". 148 143 * This is how we change from using the kernel code segment to using 149 144 * the dedicated lguest code segment, as well as jumping into the 150 145 * Switcher. 151 146 * 152 147 * The lcall also pushes the old code segment (KERNEL_CS) onto the 153 148 * stack, then the address of this call. This stack layout happens to 154 - * exactly match the stack layout created by an interrupt... */ 149 + * exactly match the stack layout created by an interrupt... 
150 + */ 155 151 asm volatile("pushf; lcall *lguest_entry" 156 - /* This is how we tell GCC that %eax ("a") and %ebx ("b") 157 - * are changed by this routine. The "=" means output. */ 152 + /* 153 + * This is how we tell GCC that %eax ("a") and %ebx ("b") 154 + * are changed by this routine. The "=" means output. 155 + */ 158 156 : "=a"(clobber), "=b"(clobber) 159 - /* %eax contains the pages pointer. ("0" refers to the 157 + /* 158 + * %eax contains the pages pointer. ("0" refers to the 160 159 * 0-th argument above, ie "a"). %ebx contains the 161 160 * physical address of the Guest's top-level page 162 - * directory. */ 161 + * directory. 162 + */ 163 163 : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) 164 - /* We tell gcc that all these registers could change, 164 + /* 165 + * We tell gcc that all these registers could change, 165 166 * which means we don't have to save and restore them in 166 - * the Switcher. */ 167 + * the Switcher. 168 + */ 167 169 : "memory", "%edx", "%ecx", "%edi", "%esi"); 168 170 } 169 171 /*:*/ 170 172 171 - /*M:002 There are hooks in the scheduler which we can register to tell when we 173 + /*M:002 174 + * There are hooks in the scheduler which we can register to tell when we 172 175 * get kicked off the CPU (preempt_notifier_register()). This would allow us 173 176 * to lazily disable SYSENTER which would regain some performance, and should 174 177 * also simplify copy_in_guest_info(). Note that we'd still need to restore ··· 189 166 * 190 167 * We could also try using this hooks for PGE, but that might be too expensive. 191 168 * 192 - * The hooks were designed for KVM, but we can also put them to good use. :*/ 169 + * The hooks were designed for KVM, but we can also put them to good use. 170 + :*/ 193 171 194 - /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts 195 - * are disabled: we own the CPU. */ 172 + /*H:040 173 + * This is the i386-specific code to setup and run the Guest. 
Interrupts 174 + * are disabled: we own the CPU. 175 + */ 196 176 void lguest_arch_run_guest(struct lg_cpu *cpu) 197 177 { 198 - /* Remember the awfully-named TS bit? If the Guest has asked to set it 178 + /* 179 + * Remember the awfully-named TS bit? If the Guest has asked to set it 199 180 * we set it now, so we can trap and pass that trap to the Guest if it 200 - * uses the FPU. */ 181 + * uses the FPU. 182 + */ 201 183 if (cpu->ts) 202 184 unlazy_fpu(current); 203 185 204 - /* SYSENTER is an optimized way of doing system calls. We can't allow 186 + /* 187 + * SYSENTER is an optimized way of doing system calls. We can't allow 205 188 * it because it always jumps to privilege level 0. A normal Guest 206 189 * won't try it because we don't advertise it in CPUID, but a malicious 207 190 * Guest (or malicious Guest userspace program) could, so we tell the 208 - * CPU to disable it before running the Guest. */ 191 + * CPU to disable it before running the Guest. 192 + */ 209 193 if (boot_cpu_has(X86_FEATURE_SEP)) 210 194 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 211 195 212 - /* Now we actually run the Guest. It will return when something 196 + /* 197 + * Now we actually run the Guest. It will return when something 213 198 * interesting happens, and we can examine its registers to see what it 214 - * was doing. */ 199 + * was doing. 200 + */ 215 201 run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); 216 202 217 - /* Note that the "regs" structure contains two extra entries which are 203 + /* 204 + * Note that the "regs" structure contains two extra entries which are 218 205 * not really registers: a trap number which says what interrupt or 219 206 * trap made the switcher code come back, and an error code which some 220 - * traps set. */ 207 + * traps set. 208 + */ 221 209 222 210 /* Restore SYSENTER if it's supposed to be on. 
*/ 223 211 if (boot_cpu_has(X86_FEATURE_SEP)) 224 212 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 225 213 226 - /* If the Guest page faulted, then the cr2 register will tell us the 214 + /* 215 + * If the Guest page faulted, then the cr2 register will tell us the 227 216 * bad virtual address. We have to grab this now, because once we 228 217 * re-enable interrupts an interrupt could fault and thus overwrite 229 - * cr2, or we could even move off to a different CPU. */ 218 + * cr2, or we could even move off to a different CPU. 219 + */ 230 220 if (cpu->regs->trapnum == 14) 231 221 cpu->arch.last_pagefault = read_cr2(); 232 - /* Similarly, if we took a trap because the Guest used the FPU, 222 + /* 223 + * Similarly, if we took a trap because the Guest used the FPU, 233 224 * we have to restore the FPU it expects to see. 234 225 * math_state_restore() may sleep and we may even move off to 235 226 * a different CPU. So all the critical stuff should be done 236 - * before this. */ 227 + * before this. 228 + */ 237 229 else if (cpu->regs->trapnum == 7) 238 230 math_state_restore(); 239 231 } 240 232 241 - /*H:130 Now we've examined the hypercall code; our Guest can make requests. 233 + /*H:130 234 + * Now we've examined the hypercall code; our Guest can make requests. 242 235 * Our Guest is usually so well behaved; it never tries to do things it isn't 243 236 * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual 244 237 * infrastructure isn't quite complete, because it doesn't contain replacements ··· 264 225 * 265 226 * When the Guest uses one of these instructions, we get a trap (General 266 227 * Protection Fault) and come here. We see if it's one of those troublesome 267 - * instructions and skip over it. We return true if we did. */ 228 + * instructions and skip over it. We return true if we did. 
229 + */ 268 230 static int emulate_insn(struct lg_cpu *cpu) 269 231 { 270 232 u8 insn; 271 233 unsigned int insnlen = 0, in = 0, shift = 0; 272 - /* The eip contains the *virtual* address of the Guest's instruction: 273 - * guest_pa just subtracts the Guest's page_offset. */ 234 + /* 235 + * The eip contains the *virtual* address of the Guest's instruction: 236 + * guest_pa just subtracts the Guest's page_offset. 237 + */ 274 238 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); 275 239 276 - /* This must be the Guest kernel trying to do something, not userspace! 240 + /* 241 + * This must be the Guest kernel trying to do something, not userspace! 277 242 * The bottom two bits of the CS segment register are the privilege 278 - * level. */ 243 + * level. 244 + */ 279 245 if ((cpu->regs->cs & 3) != GUEST_PL) 280 246 return 0; 281 247 282 248 /* Decoding x86 instructions is icky. */ 283 249 insn = lgread(cpu, physaddr, u8); 284 250 285 - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits 286 - of the eax register. */ 251 + /* 252 + * 0x66 is an "operand prefix". It means it's using the upper 16 bits 253 + * of the eax register. 254 + */ 287 255 if (insn == 0x66) { 288 256 shift = 16; 289 257 /* The instruction is 1 byte so far, read the next byte. */ ··· 298 252 insn = lgread(cpu, physaddr + insnlen, u8); 299 253 } 300 254 301 - /* We can ignore the lower bit for the moment and decode the 4 opcodes 302 - * we need to emulate. */ 255 + /* 256 + * We can ignore the lower bit for the moment and decode the 4 opcodes 257 + * we need to emulate. 258 + */ 303 259 switch (insn & 0xFE) { 304 260 case 0xE4: /* in <next byte>,%al */ 305 261 insnlen += 2; ··· 322 274 return 0; 323 275 } 324 276 325 - /* If it was an "IN" instruction, they expect the result to be read 277 + /* 278 + * If it was an "IN" instruction, they expect the result to be read 326 279 * into %eax, so we change %eax. 
We always return all-ones, which 327 - * traditionally means "there's nothing there". */ 280 + * traditionally means "there's nothing there". 281 + */ 328 282 if (in) { 329 283 /* Lower bit tells is whether it's a 16 or 32 bit access */ 330 284 if (insn & 0x1) ··· 340 290 return 1; 341 291 } 342 292 343 - /* Our hypercalls mechanism used to be based on direct software interrupts. 293 + /* 294 + * Our hypercalls mechanism used to be based on direct software interrupts. 344 295 * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to 345 296 * change over to using kvm hypercalls. 346 297 * ··· 369 318 */ 370 319 static void rewrite_hypercall(struct lg_cpu *cpu) 371 320 { 372 - /* This are the opcodes we use to patch the Guest. The opcode for "int 321 + /* 322 + * This are the opcodes we use to patch the Guest. The opcode for "int 373 323 * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we 374 - * complete the sequence with a NOP (0x90). */ 324 + * complete the sequence with a NOP (0x90). 325 + */ 375 326 u8 insn[3] = {0xcd, 0x1f, 0x90}; 376 327 377 328 __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); 378 - /* The above write might have caused a copy of that page to be made 329 + /* 330 + * The above write might have caused a copy of that page to be made 379 331 * (if it was read-only). We need to make sure the Guest has 380 332 * up-to-date pagetables. As this doesn't happen often, we can just 381 - * drop them all. */ 333 + * drop them all. 334 + */ 382 335 guest_pagetable_clear_all(cpu); 383 336 } 384 337 ··· 390 335 { 391 336 u8 insn[3]; 392 337 393 - /* This must be the Guest kernel trying to do something. 338 + /* 339 + * This must be the Guest kernel trying to do something. 394 340 * The bottom two bits of the CS segment register are the privilege 395 - * level. */ 341 + * level. 
342 + */ 396 343 if ((cpu->regs->cs & 3) != GUEST_PL) 397 344 return false; 398 345 ··· 408 351 { 409 352 switch (cpu->regs->trapnum) { 410 353 case 13: /* We've intercepted a General Protection Fault. */ 411 - /* Check if this was one of those annoying IN or OUT 354 + /* 355 + * Check if this was one of those annoying IN or OUT 412 356 * instructions which we need to emulate. If so, we just go 413 - * back into the Guest after we've done it. */ 357 + * back into the Guest after we've done it. 358 + */ 414 359 if (cpu->regs->errcode == 0) { 415 360 if (emulate_insn(cpu)) 416 361 return; 417 362 } 418 - /* If KVM is active, the vmcall instruction triggers a 419 - * General Protection Fault. Normally it triggers an 420 - * invalid opcode fault (6): */ 363 + /* 364 + * If KVM is active, the vmcall instruction triggers a General 365 + * Protection Fault. Normally it triggers an invalid opcode 366 + * fault (6): 367 + */ 421 368 case 6: 422 - /* We need to check if ring == GUEST_PL and 423 - * faulting instruction == vmcall. */ 369 + /* 370 + * We need to check if ring == GUEST_PL and faulting 371 + * instruction == vmcall. 372 + */ 424 373 if (is_hypercall(cpu)) { 425 374 rewrite_hypercall(cpu); 426 375 return; 427 376 } 428 377 break; 429 378 case 14: /* We've intercepted a Page Fault. */ 430 - /* The Guest accessed a virtual address that wasn't mapped. 379 + /* 380 + * The Guest accessed a virtual address that wasn't mapped. 431 381 * This happens a lot: we don't actually set up most of the page 432 382 * tables for the Guest at all when we start: as it runs it asks 433 383 * for more and more, and we set them up as required. In this 434 384 * case, we don't even tell the Guest that the fault happened. 435 385 * 436 386 * The errcode tells whether this was a read or a write, and 437 - * whether kernel or userspace code. */ 387 + * whether kernel or userspace code. 
388 + */ 438 389 if (demand_page(cpu, cpu->arch.last_pagefault, 439 390 cpu->regs->errcode)) 440 391 return; 441 392 442 - /* OK, it's really not there (or not OK): the Guest needs to 393 + /* 394 + * OK, it's really not there (or not OK): the Guest needs to 443 395 * know. We write out the cr2 value so it knows where the 444 396 * fault occurred. 445 397 * 446 398 * Note that if the Guest were really messed up, this could 447 399 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so 448 - * lg->lguest_data could be NULL */ 400 + * lg->lguest_data could be NULL 401 + */ 449 402 if (cpu->lg->lguest_data && 450 403 put_user(cpu->arch.last_pagefault, 451 404 &cpu->lg->lguest_data->cr2)) 452 405 kill_guest(cpu, "Writing cr2"); 453 406 break; 454 407 case 7: /* We've intercepted a Device Not Available fault. */ 455 - /* If the Guest doesn't want to know, we already restored the 456 - * Floating Point Unit, so we just continue without telling 457 - * it. */ 408 + /* 409 + * If the Guest doesn't want to know, we already restored the 410 + * Floating Point Unit, so we just continue without telling it. 411 + */ 458 412 if (!cpu->ts) 459 413 return; 460 414 break; 461 415 case 32 ... 255: 462 - /* These values mean a real interrupt occurred, in which case 416 + /* 417 + * These values mean a real interrupt occurred, in which case 463 418 * the Host handler has already been run. We just do a 464 419 * friendly check if another process should now be run, then 465 - * return to run the Guest again */ 420 + * return to run the Guest again 421 + */ 466 422 cond_resched(); 467 423 return; 468 424 case LGUEST_TRAP_ENTRY: 469 - /* Our 'struct hcall_args' maps directly over our regs: we set 470 - * up the pointer now to indicate a hypercall is pending. */ 425 + /* 426 + * Our 'struct hcall_args' maps directly over our regs: we set 427 + * up the pointer now to indicate a hypercall is pending. 
428 + */ 471 429 cpu->hcall = (struct hcall_args *)cpu->regs; 472 430 return; 473 431 } 474 432 475 433 /* We didn't handle the trap, so it needs to go to the Guest. */ 476 434 if (!deliver_trap(cpu, cpu->regs->trapnum)) 477 - /* If the Guest doesn't have a handler (either it hasn't 435 + /* 436 + * If the Guest doesn't have a handler (either it hasn't 478 437 * registered any yet, or it's one of the faults we don't let 479 - * it handle), it dies with this cryptic error message. */ 438 + * it handle), it dies with this cryptic error message. 439 + */ 480 440 kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", 481 441 cpu->regs->trapnum, cpu->regs->eip, 482 442 cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault 483 443 : cpu->regs->errcode); 484 444 } 485 445 486 - /* Now we can look at each of the routines this calls, in increasing order of 446 + /* 447 + * Now we can look at each of the routines this calls, in increasing order of 487 448 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), 488 449 * deliver_trap() and demand_page(). After all those, we'll be ready to 489 450 * examine the Switcher, and our philosophical understanding of the Host/Guest 490 - * duality will be complete. :*/ 451 + * duality will be complete. 452 + :*/ 491 453 static void adjust_pge(void *on) 492 454 { 493 455 if (on) ··· 515 439 write_cr4(read_cr4() & ~X86_CR4_PGE); 516 440 } 517 441 518 - /*H:020 Now the Switcher is mapped and every thing else is ready, we need to do 519 - * some more i386-specific initialization. */ 442 + /*H:020 443 + * Now the Switcher is mapped and every thing else is ready, we need to do 444 + * some more i386-specific initialization. 
445 + */ 520 446 void __init lguest_arch_host_init(void) 521 447 { 522 448 int i; 523 449 524 - /* Most of the i386/switcher.S doesn't care that it's been moved; on 450 + /* 451 + * Most of the i386/switcher.S doesn't care that it's been moved; on 525 452 * Intel, jumps are relative, and it doesn't access any references to 526 453 * external code or data. 527 454 * ··· 532 453 * addresses are placed in a table (default_idt_entries), so we need to 533 454 * update the table with the new addresses. switcher_offset() is a 534 455 * convenience function which returns the distance between the 535 - * compiled-in switcher code and the high-mapped copy we just made. */ 456 + * compiled-in switcher code and the high-mapped copy we just made. 457 + */ 536 458 for (i = 0; i < IDT_ENTRIES; i++) 537 459 default_idt_entries[i] += switcher_offset(); 538 460 ··· 548 468 for_each_possible_cpu(i) { 549 469 /* lguest_pages() returns this CPU's two pages. */ 550 470 struct lguest_pages *pages = lguest_pages(i); 551 - /* This is a convenience pointer to make the code fit one 552 - * statement to a line. */ 471 + /* This is a convenience pointer to make the code neater. */ 553 472 struct lguest_ro_state *state = &pages->state; 554 473 555 - /* The Global Descriptor Table: the Host has a different one 474 + /* 475 + * The Global Descriptor Table: the Host has a different one 556 476 * for each CPU. We keep a descriptor for the GDT which says 557 477 * where it is and how big it is (the size is actually the last 558 - * byte, not the size, hence the "-1"). */ 478 + * byte, not the size, hence the "-1"). 479 + */ 559 480 state->host_gdt_desc.size = GDT_SIZE-1; 560 481 state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); 561 482 562 - /* All CPUs on the Host use the same Interrupt Descriptor 483 + /* 484 + * All CPUs on the Host use the same Interrupt Descriptor 563 485 * Table, so we just use store_idt(), which gets this CPU's IDT 564 - * descriptor. */ 486 + * descriptor. 
487 + */ 565 488 store_idt(&state->host_idt_desc); 566 489 567 - /* The descriptors for the Guest's GDT and IDT can be filled 490 + /* 491 + * The descriptors for the Guest's GDT and IDT can be filled 568 492 * out now, too. We copy the GDT & IDT into ->guest_gdt and 569 - * ->guest_idt before actually running the Guest. */ 493 + * ->guest_idt before actually running the Guest. 494 + */ 570 495 state->guest_idt_desc.size = sizeof(state->guest_idt)-1; 571 496 state->guest_idt_desc.address = (long)&state->guest_idt; 572 497 state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; 573 498 state->guest_gdt_desc.address = (long)&state->guest_gdt; 574 499 575 - /* We know where we want the stack to be when the Guest enters 500 + /* 501 + * We know where we want the stack to be when the Guest enters 576 502 * the Switcher: in pages->regs. The stack grows upwards, so 577 - * we start it at the end of that structure. */ 503 + * we start it at the end of that structure. 504 + */ 578 505 state->guest_tss.sp0 = (long)(&pages->regs + 1); 579 - /* And this is the GDT entry to use for the stack: we keep a 580 - * couple of special LGUEST entries. */ 506 + /* 507 + * And this is the GDT entry to use for the stack: we keep a 508 + * couple of special LGUEST entries. 509 + */ 581 510 state->guest_tss.ss0 = LGUEST_DS; 582 511 583 - /* x86 can have a finegrained bitmap which indicates what I/O 512 + /* 513 + * x86 can have a finegrained bitmap which indicates what I/O 584 514 * ports the process can use. We set it to the end of our 585 - * structure, meaning "none". */ 515 + * structure, meaning "none". 516 + */ 586 517 state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); 587 518 588 - /* Some GDT entries are the same across all Guests, so we can 589 - * set them up now. */ 519 + /* 520 + * Some GDT entries are the same across all Guests, so we can 521 + * set them up now. 
522 + */ 590 523 setup_default_gdt_entries(state); 591 524 /* Most IDT entries are the same for all Guests, too.*/ 592 525 setup_default_idt_entries(state, default_idt_entries); 593 526 594 - /* The Host needs to be able to use the LGUEST segments on this 595 - * CPU, too, so put them in the Host GDT. */ 527 + /* 528 + * The Host needs to be able to use the LGUEST segments on this 529 + * CPU, too, so put them in the Host GDT. 530 + */ 596 531 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 597 532 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 598 533 } 599 534 600 - /* In the Switcher, we want the %cs segment register to use the 535 + /* 536 + * In the Switcher, we want the %cs segment register to use the 601 537 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so 602 538 * it will be undisturbed when we switch. To change %cs and jump we 603 - * need this structure to feed to Intel's "lcall" instruction. */ 539 + * need this structure to feed to Intel's "lcall" instruction. 540 + */ 604 541 lguest_entry.offset = (long)switch_to_guest + switcher_offset(); 605 542 lguest_entry.segment = LGUEST_CS; 606 543 607 - /* Finally, we need to turn off "Page Global Enable". PGE is an 544 + /* 545 + * Finally, we need to turn off "Page Global Enable". PGE is an 608 546 * optimization where page table entries are specially marked to show 609 547 * they never change. The Host kernel marks all the kernel pages this 610 548 * way because it's always present, even when userspace is running. ··· 632 534 * you'll get really weird bugs that you'll chase for two days. 633 535 * 634 536 * I used to turn PGE off every time we switched to the Guest and back 635 - * on when we return, but that slowed the Switcher down noticibly. */ 537 + * on when we return, but that slowed the Switcher down noticibly. 538 + */ 636 539 637 - /* We don't need the complexity of CPUs coming and going while we're 638 - * doing this. 
*/ 540 + /* 541 + * We don't need the complexity of CPUs coming and going while we're 542 + * doing this. 543 + */ 639 544 get_online_cpus(); 640 545 if (cpu_has_pge) { /* We have a broader idea of "global". */ 641 546 /* Remember that this was originally set (for cleanup). */ 642 547 cpu_had_pge = 1; 643 - /* adjust_pge is a helper function which sets or unsets the PGE 644 - * bit on its CPU, depending on the argument (0 == unset). */ 548 + /* 549 + * adjust_pge is a helper function which sets or unsets the PGE 550 + * bit on its CPU, depending on the argument (0 == unset). 551 + */ 645 552 on_each_cpu(adjust_pge, (void *)0, 1); 646 553 /* Turn off the feature in the global feature set. */ 647 554 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); ··· 693 590 { 694 591 u32 tsc_speed; 695 592 696 - /* The pointer to the Guest's "struct lguest_data" is the only argument. 697 - * We check that address now. */ 593 + /* 594 + * The pointer to the Guest's "struct lguest_data" is the only argument. 595 + * We check that address now. 596 + */ 698 597 if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, 699 598 sizeof(*cpu->lg->lguest_data))) 700 599 return -EFAULT; 701 600 702 - /* Having checked it, we simply set lg->lguest_data to point straight 601 + /* 602 + * Having checked it, we simply set lg->lguest_data to point straight 703 603 * into the Launcher's memory at the right place and then use 704 604 * copy_to_user/from_user from now on, instead of lgread/write. I put 705 605 * this in to show that I'm not immune to writing stupid 706 - * optimizations. */ 606 + * optimizations. 607 + */ 707 608 cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; 708 609 709 - /* We insist that the Time Stamp Counter exist and doesn't change with 610 + /* 611 + * We insist that the Time Stamp Counter exist and doesn't change with 710 612 * cpu frequency. Some devious chip manufacturers decided that TSC 711 613 * changes could be handled in software. 
I decided that time going 712 614 * backwards might be good for benchmarks, but it's bad for users. 713 615 * 714 616 * We also insist that the TSC be stable: the kernel detects unreliable 715 - * TSCs for its own purposes, and we use that here. */ 617 + * TSCs for its own purposes, and we use that here. 618 + */ 716 619 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) 717 620 tsc_speed = tsc_khz; 718 621 else ··· 734 625 } 735 626 /*:*/ 736 627 737 - /*L:030 lguest_arch_setup_regs() 628 + /*L:030 629 + * lguest_arch_setup_regs() 738 630 * 739 631 * Most of the Guest's registers are left alone: we used get_zeroed_page() to 740 - * allocate the structure, so they will be 0. */ 632 + * allocate the structure, so they will be 0. 633 + */ 741 634 void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) 742 635 { 743 636 struct lguest_regs *regs = cpu->regs; 744 637 745 - /* There are four "segment" registers which the Guest needs to boot: 638 + /* 639 + * There are four "segment" registers which the Guest needs to boot: 746 640 * The "code segment" register (cs) refers to the kernel code segment 747 641 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers 748 642 * refer to the kernel data segment __KERNEL_DS. 749 643 * 750 644 * The privilege level is packed into the lower bits. The Guest runs 751 - * at privilege level 1 (GUEST_PL).*/ 645 + * at privilege level 1 (GUEST_PL). 646 + */ 752 647 regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; 753 648 regs->cs = __KERNEL_CS|GUEST_PL; 754 649 755 - /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) 650 + /* 651 + * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) 756 652 * is supposed to always be "1". Bit 9 (0x200) controls whether 757 653 * interrupts are enabled. We always leave interrupts enabled while 758 - * running the Guest. */ 654 + * running the Guest. 
655 + */ 759 656 regs->eflags = X86_EFLAGS_IF | 0x2; 760 657 761 - /* The "Extended Instruction Pointer" register says where the Guest is 762 - * running. */ 658 + /* 659 + * The "Extended Instruction Pointer" register says where the Guest is 660 + * running. 661 + */ 763 662 regs->eip = start; 764 663 765 - /* %esi points to our boot information, at physical address 0, so don't 766 - * touch it. */ 664 + /* 665 + * %esi points to our boot information, at physical address 0, so don't 666 + * touch it. 667 + */ 767 668 768 - /* There are a couple of GDT entries the Guest expects when first 769 - * booting. */ 669 + /* There are a couple of GDT entries the Guest expects at boot. */ 770 670 setup_guest_gdt(cpu); 771 671 }
+12 -6
drivers/lguest/x86/switcher_32.S
··· 1 - /*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the 1 + /*P:900 2 + * This is the Switcher: code which sits at 0xFFC00000 astride both the 2 3 * Host and Guest to do the low-level Guest<->Host switch. It is as simple as 3 4 * it can be made, but it's naturally very specific to x86. 4 5 * 5 6 * You have now completed Preparation. If this has whet your appetite; if you 6 7 * are feeling invigorated and refreshed then the next, more challenging stage 7 - * can be found in "make Guest". :*/ 8 + * can be found in "make Guest". 9 + :*/ 8 10 9 - /*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must 11 + /*M:012 12 + * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must 10 13 * gain at least 1% more performance. Since neither LOC nor performance can be 11 14 * measured beforehand, it generally means implementing a feature then deciding 12 15 * if it's worth it. And once it's implemented, who can say no? ··· 34 31 * Host (which is actually really easy). 35 32 * 36 33 * Two questions remain. Would the performance gain outweigh the complexity? 37 - * And who would write the verse documenting it? :*/ 34 + * And who would write the verse documenting it? 35 + :*/ 38 36 39 - /*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their 37 + /*M:011 38 + * Lguest64 handles NMI. This gave me NMI envy (until I looked at their 40 39 * code). It's worth doing though, since it would let us use oprofile in the 41 - * Host when a Guest is running. :*/ 40 + * Host when a Guest is running. 41 + :*/ 42 42 43 43 /*S:100 44 44 * Welcome to the Switcher itself!
+24 -12
include/linux/lguest.h
··· 1 - /* Things the lguest guest needs to know. Note: like all lguest interfaces, 2 - * this is subject to wild and random change between versions. */ 1 + /* 2 + * Things the lguest guest needs to know. Note: like all lguest interfaces, 3 + * this is subject to wild and random change between versions. 4 + */ 3 5 #ifndef _LINUX_LGUEST_H 4 6 #define _LINUX_LGUEST_H 5 7 ··· 13 11 #define LG_CLOCK_MIN_DELTA 100UL 14 12 #define LG_CLOCK_MAX_DELTA ULONG_MAX 15 13 16 - /*G:031 The second method of communicating with the Host is to via "struct 14 + /*G:031 15 + * The second method of communicating with the Host is to via "struct 17 16 * lguest_data". Once the Guest's initialization hypercall tells the Host where 18 - * this is, the Guest and Host both publish information in it. :*/ 17 + * this is, the Guest and Host both publish information in it. 18 + :*/ 19 19 struct lguest_data 20 20 { 21 - /* 512 == enabled (same as eflags in normal hardware). The Guest 22 - * changes interrupts so often that a hypercall is too slow. */ 21 + /* 22 + * 512 == enabled (same as eflags in normal hardware). The Guest 23 + * changes interrupts so often that a hypercall is too slow. 24 + */ 23 25 unsigned int irq_enabled; 24 26 /* Fine-grained interrupt disabling by the Guest */ 25 27 DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); 26 28 27 - /* The Host writes the virtual address of the last page fault here, 29 + /* 30 + * The Host writes the virtual address of the last page fault here, 28 31 * which saves the Guest a hypercall. CR2 is the native register where 29 - * this address would normally be found. */ 32 + * this address would normally be found. 33 + */ 30 34 unsigned long cr2; 31 35 32 36 /* Wallclock time set by the Host. */ 33 37 struct timespec time; 34 38 35 - /* Interrupt pending set by the Host. The Guest should do a hypercall 36 - * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ 39 + /* 40 + * Interrupt pending set by the Host. 
The Guest should do a hypercall 41 + * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). 42 + */ 37 43 int irq_pending; 38 44 39 - /* Async hypercall ring. Instead of directly making hypercalls, we can 45 + /* 46 + * Async hypercall ring. Instead of directly making hypercalls, we can 40 47 * place them in here for processing the next time the Host wants. 41 - * This batching can be quite efficient. */ 48 + * This batching can be quite efficient. 49 + */ 42 50 43 51 /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ 44 52 u8 hcall_status[LHCALL_RING_SIZE];
+12 -6
include/linux/lguest_launcher.h
··· 29 29 __u8 type; 30 30 /* The number of virtqueues (first in config array) */ 31 31 __u8 num_vq; 32 - /* The number of bytes of feature bits. Multiply by 2: one for host 33 - * features and one for Guest acknowledgements. */ 32 + /* 33 + * The number of bytes of feature bits. Multiply by 2: one for host 34 + * features and one for Guest acknowledgements. 35 + */ 34 36 __u8 feature_len; 35 37 /* The number of bytes of the config array after virtqueues. */ 36 38 __u8 config_len; ··· 41 39 __u8 config[0]; 42 40 }; 43 41 44 - /*D:135 This is how we expect the device configuration field for a virtqueue 45 - * to be laid out in config space. */ 42 + /*D:135 43 + * This is how we expect the device configuration field for a virtqueue 44 + * to be laid out in config space. 45 + */ 46 46 struct lguest_vqconfig { 47 47 /* The number of entries in the virtio_ring */ 48 48 __u16 num; ··· 65 61 LHREQ_EVENTFD, /* + address, fd. */ 66 62 }; 67 63 68 - /* The alignment to use between consumer and producer parts of vring. 69 - * x86 pagesize for historical reasons. */ 64 + /* 65 + * The alignment to use between consumer and producer parts of vring. 66 + * x86 pagesize for historical reasons. 67 + */ 70 68 #define LGUEST_VRING_ALIGN 4096 71 69 #endif /* _LINUX_LGUEST_LAUNCHER */