Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest

* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest: (45 commits)
Use "struct boot_params" in example launcher
Loading bzImage directly.
Revert lguest magic and use hook in head.S
Update lguest documentation to reflect the new virtual block device name.
generalize lgread_u32/lgwrite_u32.
Example launcher: handle guests not being ready for input
Update example launcher for virtio
Lguest support for Virtio
Remove old lguest I/O infrastructure.
Remove old lguest bus and drivers.
Virtio helper routines for a descriptor ringbuffer implementation
Module autoprobing support for virtio drivers.
Virtio console driver
Block driver using virtio.
Net driver using virtio
Virtio interface
Boot with virtual == physical to get closer to native Linux.
Allow guest to specify syscall vector to use.
Rename "cr3" to "gpgdir" to avoid x86-specific naming.
Pagetables to use normal kernel types
...
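For orientation: several commits above ("Virtio interface", "Virtio helper routines for a descriptor ringbuffer implementation") move all device I/O onto the virtio ring, and the new launcher code below tracks its progress through it with a `last_avail_idx` cursor. Here is a minimal, self-contained sketch of that host-side consumption pattern; the structs are simplified stand-ins for the real layouts in include/linux/virtio_ring.h, and next_avail_desc() is a hypothetical helper, not code from this merge.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the virtio "available" ring (see
 * include/linux/virtio_ring.h); only the fields used below are shown. */
struct vring_avail {
	uint16_t flags;
	uint16_t idx;		/* Guest increments this as it posts buffers. */
	uint16_t ring[];	/* Each entry is a descriptor-table index. */
};

/* Host-side bookkeeping: how far through the available ring we have read. */
struct vq_state {
	unsigned int num;		/* Number of ring entries. */
	struct vring_avail *avail;
	uint16_t last_avail_idx;	/* Our private consumption cursor. */
};

/* Return the next available descriptor head, or -1 if the ring is empty.
 * The Guest only ever advances avail->idx and we only ever advance our own
 * cursor, so no locking is needed between the two sides. */
static int next_avail_desc(struct vq_state *vq)
{
	if (vq->last_avail_idx == vq->avail->idx)
		return -1;	/* Nothing new from the Guest. */
	/* Indices increment forever; the ring slot wraps modulo num. */
	return vq->avail->ring[vq->last_avail_idx++ % vq->num];
}

int main(void)
{
	/* Fake a 4-entry ring where the Guest has posted one buffer. */
	uint8_t backing[sizeof(struct vring_avail) + 4 * sizeof(uint16_t)];
	struct vring_avail *avail = (struct vring_avail *)backing;
	struct vq_state vq = { .num = 4, .avail = avail, .last_avail_idx = 0 };

	avail->idx = 1;		/* Guest advanced the ring by one... */
	avail->ring[0] = 3;	/* ...pointing at descriptor head 3. */

	printf("head %d\n", next_avail_desc(&vq));	/* -> 3 */
	printf("head %d\n", next_avail_desc(&vq));	/* -> -1 (empty) */
	return 0;
}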

45 commits, +4817 -4396 total
Documentation/lguest/Makefile | +3 -23
  # This creates the demonstration utility "lguest" which runs a Linux guest.
- 
- # For those people that have a separate object dir, look there for .config
- KBUILD_OUTPUT := ../..
- ifdef O
- ifeq ("$(origin O)", "command line")
- KBUILD_OUTPUT := $(O)
- endif
- endif
- # We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
- include $(KBUILD_OUTPUT)/.config
- LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
- 
- CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
  LDLIBS:=-lz
- # Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
- # not others (eg. FC7).
- LDFLAGS+=-static
- all: lguest.lds lguest
 
- # The linker script on x86 is so complex the only way of creating one
- # which will link our binary in the right place is to mangle the
- # default one.
- lguest.lds:
- 	$(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
 
  clean:
- 	rm -f lguest.lds lguest
  # This creates the demonstration utility "lguest" which runs a Linux guest.
+ CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include
  LDLIBS:=-lz
 
+ all: lguest
 
  clean:
+ 	rm -f lguest
Documentation/lguest/lguest.c | +879 -744
  /*P:100 This is the Launcher code, a simple program which lays out the
   * "physical" memory for the new Guest by mapping the kernel image and the
   * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
-  *
-  * The only trick: the Makefile links it at a high address so it will be clear
-  * of the guest memory region.  It means that each Guest cannot have more than
-  * about 2.5G of memory on a normally configured Host. :*/
  #define _LARGEFILE64_SOURCE
  #define _GNU_SOURCE
  #include <stdio.h>
···
  #include <stdlib.h>
  #include <elf.h>
  #include <sys/mman.h>
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <sys/wait.h>
···
  #include <termios.h>
  #include <getopt.h>
  #include <zlib.h>
- /*L:110 We can ignore the 28 include files we need for this program, but I do
   * want to draw attention to the use of kernel-style types.
   *
   * As Linus said, "C is a Spartan language, and so should your naming be."  I
···
  typedef uint32_t u32;
  typedef uint16_t u16;
  typedef uint8_t u8;
- #include "../../include/linux/lguest_launcher.h"
- #include "../../include/asm-x86/e820_32.h"
  /*:*/
 
  #define PAGE_PRESENT 0x7	/* Present, RW, Execute */
···
  #ifndef SIOCBRADDIF
  #define SIOCBRADDIF	0x89a2	/* add interface to bridge */
  #endif
 
  /*L:120 verbose is both a global flag and a macro.  The C preprocessor allows
   * this, and although I wouldn't recommend it, it works quite nicely here. */
···
 
  /* The pipe to send commands to the waker process */
  static int waker_fd;
- /* The top of guest physical memory. */
- static u32 top;
 
  /* This is our list of devices. */
  struct device_list
···
  	fd_set infds;
  	int max_infd;
 
  	/* The descriptor page for the devices. */
- 	struct lguest_device_desc *descs;
 
  	/* A single linked list of devices. */
  	struct device *dev;
···
  	struct device **lastdev;
  };
 
  /* The device structure describes a single device. */
  struct device
  {
  	/* The linked-list pointer. */
  	struct device *next;
- 	/* The descriptor for this device, as mapped into the Guest. */
  	struct lguest_device_desc *desc;
- 	/* The memory page(s) of this device, if any.  Also mapped in Guest. */
- 	void *mem;
 
  	/* If handle_input is set, it wants to be called when this file
  	 * descriptor is ready. */
  	int fd;
  	bool (*handle_input)(int fd, struct device *me);
 
- 	/* If handle_output is set, it wants to be called when the Guest sends
- 	 * DMA to this key. */
- 	unsigned long watch_key;
- 	u32 (*handle_output)(int fd, const struct iovec *iov,
- 			     unsigned int num, struct device *me);
 
  	/* Device-specific data. */
  	void *priv;
  };
 
  /*L:130
   * Loading the Kernel.
···
  	return fd;
  }
 
- /* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */
- static void *map_zeroed_pages(unsigned long addr, unsigned int num)
  {
- 	/* We cache the /dev/zero file-descriptor so we only open it once. */
- 	static int fd = -1;
- 
- 	if (fd == -1)
- 		fd = open_or_die("/dev/zero", O_RDONLY);
 
  	/* We use a private mapping (ie. if we write to the page, it will be
- 	 * copied), and obviously we insist that it be mapped where we ask. */
- 	if (mmap((void *)addr, getpagesize() * num,
- 		 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0)
- 	    != (void *)addr)
- 		err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
 
- 	/* Returning the address is just a courtesy: can simplify callers. */
- 	return (void *)addr;
  }
 
- /* To find out where to start we look for the magic Guest string, which marks
-  * the code we see in lguest_asm.S.  This is a hack which we are currently
-  * plotting to replace with the normal Linux entry point. */
- static unsigned long entry_point(void *start, void *end,
- 				 unsigned long page_offset)
  {
- 	void *p;
 
- 	/* The scan gives us the physical starting address.  We want the
- 	 * virtual address in this case, and fortunately, we already figured
- 	 * out the physical-virtual difference and passed it here in
- 	 * "page_offset". */
- 	for (p = start; p < end; p++)
- 		if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
- 			return (long)p + strlen("GenuineLguest") + page_offset;
 
- 	err(1, "Is this image a genuine lguest?");
  }
 
  /* This routine takes an open vmlinux image, which is in ELF, and maps it into
···
   * by all modern binaries on Linux including the kernel.
   *
   * The ELF headers give *two* addresses: a physical address, and a virtual
-  * address.  The Guest kernel expects to be placed in memory at the physical
-  * address, and the page tables set up so it will correspond to that virtual
-  * address.  We return the difference between the virtual and physical
-  * addresses in the "page_offset" pointer.
   *
   * We return the starting address. */
- static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
- 			     unsigned long *page_offset)
  {
- 	void *addr;
  	Elf32_Phdr phdr[ehdr->e_phnum];
  	unsigned int i;
- 	unsigned long start = -1UL, end = 0;
 
  	/* Sanity checks on the main ELF header: an x86 executable with a
  	 * reasonable number of correctly-sized program headers. */
···
  	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
  		err(1, "Reading program headers");
 
- 	/* We don't know page_offset yet. */
- 	*page_offset = 0;
- 
  	/* Try all the headers: there are usually only three.  A read-only one,
  	 * a read-write one, and a "note" section which isn't loadable. */
  	for (i = 0; i < ehdr->e_phnum; i++) {
···
  		verbose("Section %i: size %i addr %p\n",
  			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
 
- 		/* We expect a simple linear address space: every segment must
- 		 * have the same difference between virtual (p_vaddr) and
- 		 * physical (p_paddr) address. */
- 		if (!*page_offset)
- 			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
- 		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
- 			errx(1, "Page offset of section %i different", i);
- 
- 		/* We track the first and last address we mapped, so we can
- 		 * tell entry_point() where to scan. */
- 		if (phdr[i].p_paddr < start)
- 			start = phdr[i].p_paddr;
- 		if (phdr[i].p_paddr + phdr[i].p_filesz > end)
- 			end = phdr[i].p_paddr + phdr[i].p_filesz;
- 
- 		/* We map this section of the file at its physical address.  We
- 		 * map it read & write even if the header says this segment is
- 		 * read-only.  The kernel really wants to be writable: it
- 		 * patches its own instructions which would normally be
- 		 * read-only.
- 		 *
- 		 * MAP_PRIVATE means that the page won't be copied until a
- 		 * write is done to it.  This allows us to share much of the
- 		 * kernel memory between Guests. */
- 		addr = mmap((void *)phdr[i].p_paddr,
- 			    phdr[i].p_filesz,
- 			    PROT_READ|PROT_WRITE|PROT_EXEC,
- 			    MAP_FIXED|MAP_PRIVATE,
- 			    elf_fd, phdr[i].p_offset);
- 		if (addr != (void *)phdr[i].p_paddr)
- 			err(1, "Mmaping vmlinux seg %i gave %p not %p",
- 			    i, addr, (void *)phdr[i].p_paddr);
  	}
 
- 	return entry_point((void *)start, (void *)end, *page_offset);
- }
- 
- /*L:170 Prepare to be SHOCKED and AMAZED.  And possibly a trifle nauseated.
-  *
-  * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects
-  * to be.  We don't know what that option was, but we can figure it out
-  * approximately by looking at the addresses in the code.  I chose the common
-  * case of reading a memory location into the %eax register:
-  *
-  *  movl <some-address>, %eax
-  *
-  * This gets encoded as five bytes: "0xA1 <4-byte-address>".  For example,
-  * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
-  *
-  * In this example we can guess that the kernel was compiled with
-  * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number).  If the
-  * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
-  * kernel isn't that bloated yet.
-  *
-  * Unfortunately, x86 has variable-length instructions, so finding this
-  * particular instruction properly involves writing a disassembler.  Instead,
-  * we rely on statistics.  We look for "0xA1" and tally the different bytes
-  * which occur 4 bytes later (the "0xC0" in our example above).  When one of
-  * those bytes appears three times, we can be reasonably confident that it
-  * forms the start of CONFIG_PAGE_OFFSET.
-  *
-  * This is amazingly reliable. */
- static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
- {
- 	unsigned int i, possibilities[256] = { 0 };
- 
- 	for (i = 0; i + 4 < len; i++) {
- 		/* mov 0xXXXXXXXX,%eax */
- 		if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
- 			return (unsigned long)img[i+4] << 24;
- 	}
- 	errx(1, "could not determine page offset");
- }
- 
- /*L:160 Unfortunately the entire ELF image isn't compressed: the segments
-  * which need loading are extracted and compressed raw.  This denies us the
-  * information we need to make a fully-general loader. */
- static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
- {
- 	gzFile f;
- 	int ret, len = 0;
- 	/* A bzImage always gets loaded at physical address 1M.  This is
- 	 * actually configurable as CONFIG_PHYSICAL_START, but as the comment
- 	 * there says, "Don't change this unless you know what you are doing".
- 	 * Indeed. */
- 	void *img = (void *)0x100000;
- 
- 	/* gzdopen takes our file descriptor (carefully placed at the start of
- 	 * the GZIP header we found) and returns a gzFile. */
- 	f = gzdopen(fd, "rb");
- 	/* We read it into memory in 64k chunks until we hit the end. */
- 	while ((ret = gzread(f, img + len, 65536)) > 0)
- 		len += ret;
- 	if (ret < 0)
- 		err(1, "reading image from bzImage");
- 
- 	verbose("Unpacked size %i addr %p\n", len, img);
- 
- 	/* Without the ELF header, we can't tell the virtual-physical gap.  This
- 	 * is CONFIG_PAGE_OFFSET, and people do actually change it.  Fortunately,
- 	 * I have a clever way of figuring it out from the code itself. */
- 	*page_offset = intuit_page_offset(img, len);
- 
- 	return entry_point(img, img + len, *page_offset);
  }
 
  /*L:150 A bzImage, unlike an ELF file, is not meant to be loaded.  You're
- * supposed to jump into it and it will unpack itself.  We can't do that
- * because the Guest can't run the unpacking code, and adding features to
- * lguest kills puppies, so we don't want to.
  *
- * The bzImage is formed by putting the decompressing code in front of the
- * compressed kernel code.  So we can simply scan through it looking for the
- * first "gzip" header, and start decompressing from there. */
- static unsigned long load_bzimage(int fd, unsigned long *page_offset)
  {
- 	unsigned char c;
- 	int state = 0;
 
- 	/* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
- 	while (read(fd, &c, 1) == 1) {
- 		switch (state) {
- 		case 0:
- 			if (c == 0x1F)
- 				state++;
- 			break;
- 		case 1:
- 			if (c == 0x8B)
- 				state++;
- 			else
- 				state = 0;
- 			break;
- 		case 2 ... 8:
- 			state++;
- 			break;
- 		case 9:
- 			/* Seek back to the start of the gzip header. */
- 			lseek(fd, -10, SEEK_CUR);
- 			/* One final check: "compressed under UNIX". */
- 			if (c != 0x03)
- 				state = -1;
- 			else
- 				return unpack_bzimage(fd, page_offset);
- 		}
- 	}
- 	errx(1, "Could not find kernel in bzImage");
  }
 
  /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
   * come wrapped up in the self-decompressing "bzImage" format.  With some funky
   * coding, we can load those, too. */
- static unsigned long load_kernel(int fd, unsigned long *page_offset)
  {
  	Elf32_Ehdr hdr;
 
···
 
  	/* If it's an ELF file, it starts with "\177ELF" */
  	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
- 		return map_elf(fd, &hdr, page_offset);
 
  	/* Otherwise we assume it's a bzImage, and try to unpack it */
- 	return load_bzimage(fd, page_offset);
  }
 
  /* This is a trivial little helper to align pages.  Andi Kleen hated it because
···
  	int ifd;
  	struct stat st;
  	unsigned long len;
- 	void *iaddr;
 
  	ifd = open_or_die(name, O_RDONLY);
  	/* fstat() is needed to get the file size. */
  	if (fstat(ifd, &st) < 0)
  		err(1, "fstat() on initrd '%s'", name);
 
- 	/* The length needs to be rounded up to a page size: mmap needs the
- 	 * address to be page aligned. */
  	len = page_align(st.st_size);
- 	/* We map the initrd at the top of memory. */
- 	iaddr = mmap((void *)mem - len, st.st_size,
- 		     PROT_READ|PROT_EXEC|PROT_WRITE,
- 		     MAP_FIXED|MAP_PRIVATE, ifd, 0);
- 	if (iaddr != (void *)mem - len)
- 		err(1, "Mmaping initrd '%s' returned %p not %p",
- 		    name, iaddr, (void *)mem - len);
  	/* Once a file is mapped, you can close the file descriptor.  It's a
  	 * little odd, but quite useful. */
  	close(ifd);
- 	verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
 
  	/* We return the initrd size. */
  	return len;
  }
 
- /* Once we know how much memory we have, and the address the Guest kernel
-  * expects, we can construct simple linear page tables which will get the Guest
-  * far enough into the boot to create its own.
   *
   * We lay them out of the way, just below the initrd (which is why we need to
   * know its size). */
  static unsigned long setup_pagetables(unsigned long mem,
- 				      unsigned long initrd_size,
- 				      unsigned long page_offset)
  {
- 	u32 *pgdir, *linear;
  	unsigned int mapped_pages, i, linear_pages;
- 	unsigned int ptes_per_page = getpagesize()/sizeof(u32);
 
- 	/* Ideally we map all physical memory starting at page_offset.
- 	 * However, if page_offset is 0xC0000000 we can only map 1G of physical
- 	 * (0xC0000000 + 1G overflows). */
- 	if (mem <= -page_offset)
- 		mapped_pages = mem/getpagesize();
- 	else
- 		mapped_pages = -page_offset/getpagesize();
 
  	/* Each PTE page can map ptes_per_page pages: how many do we need? */
  	linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
 
  	/* We put the toplevel page directory page at the top of memory. */
- 	pgdir = (void *)mem - initrd_size - getpagesize();
 
  	/* Now we use the next linear_pages pages as pte pages */
  	linear = (void *)pgdir - linear_pages*getpagesize();
···
  	for (i = 0; i < mapped_pages; i++)
  		linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
 
- 	/* The top level points to the linear page table pages above.  The
- 	 * entry representing page_offset points to the first one, and they
- 	 * continue from there. */
  	for (i = 0; i < mapped_pages; i += ptes_per_page) {
- 		pgdir[(i + page_offset/getpagesize())/ptes_per_page]
- 			= (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
  	}
 
- 	verbose("Linear mapping of %u pages in %u pte pages at %p\n",
- 		mapped_pages, linear_pages, linear);
 
  	/* We return the top level (guest-physical) address: the kernel needs
  	 * to know where it is. */
- 	return (unsigned long)pgdir;
  }
 
  /* Simple routine to roll all the commandline arguments together with spaces
···
 
  /* This is where we actually tell the kernel to initialize the Guest.  We saw
   * the arguments it expects when we looked at initialize() in lguest_user.c:
-  * the top physical page to allow, the top level pagetable, the entry point and
-  * the page_offset constant for the Guest. */
- static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
  {
- 	u32 args[] = { LHREQ_INITIALIZE,
- 		       top/getpagesize(), pgdir, start, page_offset };
  	int fd;
 
  	fd = open_or_die("/dev/lguest", O_RDWR);
  	if (write(fd, args, sizeof(args)) < 0)
  		err(1, "Writing to /dev/lguest");
···
  }
  /*:*/
 
- static void set_fd(int fd, struct device_list *devices)
  {
- 	FD_SET(fd, &devices->infds);
- 	if (fd > devices->max_infd)
- 		devices->max_infd = fd;
  }
 
  /*L:200
···
   *
   * This, of course, is merely a different *kind* of icky.
   */
- static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices)
  {
  	/* Add the pipe from the Launcher to the fdset in the device_list, so
  	 * we watch it, too. */
- 	set_fd(pipefd, devices);
 
  	for (;;) {
- 		fd_set rfds = devices->infds;
- 		u32 args[] = { LHREQ_BREAK, 1 };
 
  		/* Wait until input is ready from one of the devices. */
- 		select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
  		/* Is it a message from the Launcher? */
  		if (FD_ISSET(pipefd, &rfds)) {
- 			int ignorefd;
  			/* If read() returns 0, it means the Launcher has
  			 * exited.  We silently follow. */
- 			if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
  				exit(0);
- 			/* Otherwise it's telling us there's a problem with one
- 			 * of the devices, and we should ignore that file
- 			 * descriptor from now on. */
- 			FD_CLR(ignorefd, &devices->infds);
  		} else /* Send LHREQ_BREAK command. */
  			write(lguest_fd, args, sizeof(args));
  	}
  }
 
  /* This routine just sets up a pipe to the Waker process. */
- static int setup_waker(int lguest_fd, struct device_list *device_list)
  {
  	int pipefd[2], child;
···
  	if (child == 0) {
  		/* Close the "writing" end of our copy of the pipe */
  		close(pipefd[1]);
- 		wake_parent(pipefd[0], lguest_fd, device_list);
  	}
  	/* Close the reading end of our copy of the pipe. */
  	close(pipefd[0]);
···
  {
  	/* We have to separately check addr and addr+size, because size could
  	 * be huge and addr + size might wrap around. */
- 	if (addr >= top || addr + size >= top)
- 		errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
  	/* We return a pointer for the caller's convenience, now we know it's
  	 * safe to use. */
- 	return (void *)addr;
  }
  /* A macro which transparently hands the line number to the real function. */
  #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
 
- /* The Guest has given us the address of a "struct lguest_dma".  We check it's
-  * OK and convert it to an iovec (which is a simple array of ptr/size
-  * pairs). */
- static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
  {
- 	unsigned int i;
- 	struct lguest_dma *udma;
 
- 	/* First we make sure that the array memory itself is valid. */
- 	udma = check_pointer(dma, sizeof(*udma));
- 	/* Now we check each element */
- 	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
- 		/* A zero length ends the array. */
- 		if (!udma->len[i])
- 			break;
 
- 		iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]);
- 		iov[i].iov_len = udma->len[i];
- 	}
- 	*num = i;
 
- 	/* We return the pointer to where the caller should write the amount of
- 	 * the buffer used. */
- 	return &udma->used_len;
  }
 
- /* This routine gets a DMA buffer from the Guest for a given key, and converts
-  * it to an iovec array.  It returns the interrupt the Guest wants when we're
-  * finished, and a pointer to the "used_len" field to fill in. */
- static u32 *get_dma_buffer(int fd, void *key,
- 			   struct iovec iov[], unsigned int *num, u32 *irq)
  {
- 	u32 buf[] = { LHREQ_GETDMA, (u32)key };
- 	unsigned long udma;
- 	u32 *res;
 
- 	/* Ask the kernel for a DMA buffer corresponding to this key. */
- 	udma = write(fd, buf, sizeof(buf));
- 	/* They haven't registered any, or they're all used? */
- 	if (udma == (unsigned long)-1)
- 		return NULL;
 
- 	/* Convert it into our iovec array */
- 	res = dma2iov(udma, iov, num);
- 	/* The kernel stashes irq in ->used_len to get it out to us. */
- 	*irq = *res;
- 	/* Return a pointer to ((struct lguest_dma *)udma)->used_len. */
- 	return res;
  }
 
- /* This is a convenient routine to send the Guest an interrupt. */
- static void trigger_irq(int fd, u32 irq)
  {
- 	u32 buf[] = { LHREQ_IRQ, irq };
  	if (write(fd, buf, sizeof(buf)) != 0)
- 		err(1, "Triggering irq %i", irq);
  }
 
- /* This simply sets up an iovec array where we can put data to be discarded.
-  * This happens when the Guest doesn't want or can't handle the input: we have
-  * to get rid of it somewhere, and if we bury it in the ceiling space it will
-  * start to smell after a week. */
- static void discard_iovec(struct iovec *iov, unsigned int *num)
  {
- 	static char discard_buf[1024];
- 	*num = 1;
- 	iov->iov_base = discard_buf;
- 	iov->iov_len = sizeof(discard_buf);
  }
 
  /* Here are the input terminal settings we save, and the routine to restore them
···
  /* This is the routine which handles console input (ie. stdin). */
  static bool handle_console_input(int fd, struct device *dev)
  {
- 	u32 irq = 0, *lenp;
  	int len;
- 	unsigned int num;
- 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
  	struct console_abort *abort = dev->priv;
 
- 	/* First we get the console buffer from the Guest.  The key is dev->mem
- 	 * which was set to 0 in setup_console(). */
- 	lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
- 	if (!lenp) {
- 		/* If it's not ready for input, warn and set up to discard. */
- 		warn("console: no dma buffer!");
- 		discard_iovec(iov, &num);
- 	}
 
  	/* This is why we convert to iovecs: the readv() call uses them, and so
  	 * it reads straight into the Guest's buffer. */
- 	len = readv(dev->fd, iov, num);
  	if (len <= 0) {
  		/* This implies that the console is closed, is /dev/null, or
- 		 * something went terribly wrong.  We still go through the rest
- 		 * of the logic, though, especially the exit handling below. */
  		warnx("Failed to get console input, ignoring console.");
- 		len = 0;
  	}
 
- 	/* If we read the data into the Guest, fill in the length and send the
- 	 * interrupt. */
- 	if (lenp) {
- 		*lenp = len;
- 		trigger_irq(fd, irq);
- 	}
 
  	/* Three ^C within one second?  Exit.
  	 *
···
  		struct timeval now;
  		gettimeofday(&now, NULL);
  		if (now.tv_sec <= abort->start.tv_sec+1) {
- 			u32 args[] = { LHREQ_BREAK, 0 };
  			/* Close the fd so Waker will know it has to
  			 * exit. */
  			close(waker_fd);
···
  	/* Any other key resets the abort counter. */
  	abort->count = 0;
 
- 	/* Now, if we didn't read anything, put the input terminal back and
- 	 * return failure (meaning, don't call us again). */
- 	if (!len) {
- 		restore_term();
- 		return false;
- 	}
  	/* Everything went OK! */
  	return true;
  }
 
- /* Handling console output is much simpler than input. */
- static u32 handle_console_output(int fd, const struct iovec *iov,
- 				 unsigned num, struct device *dev)
  {
- 	/* Whatever the Guest sends, write it to standard output.  Return the
- 	 * number of bytes written. */
- 	return writev(STDOUT_FILENO, iov, num);
  }
 
- /* Guest->Host network output is also pretty easy. */
- static u32 handle_tun_output(int fd, const struct iovec *iov,
- 			     unsigned num, struct device *dev)
  {
- 	/* We put a flag in the "priv" pointer of the network device, and set
- 	 * it as soon as we see output.  We'll see why in handle_tun_input() */
- 	*(bool *)dev->priv = true;
- 	/* Whatever packet the Guest sent us, write it out to the tun
- 	 * device. */
- 	return writev(dev->fd, iov, num);
  }
 
- /* This matches the peer_key() in lguest_net.c.  The key for any given slot
-  * is the address of the network device's page plus 4 * the slot number. */
- static unsigned long peer_offset(unsigned int peernum)
- {
- 	return 4 * peernum;
- }
- 
- /* This is where we handle a packet coming in from the tun device */
  static bool handle_tun_input(int fd, struct device *dev)
  {
- 	u32 irq = 0, *lenp;
  	int len;
- 	unsigned num;
- 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
 
- 	/* First we get a buffer the Guest has bound to its key. */
- 	lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
- 			      &irq);
- 	if (!lenp) {
  		/* Now, it's expected that if we try to send a packet too
- 		 * early, the Guest won't be ready yet.  This is why we set a
- 		 * flag when the Guest sends its first packet.  If it's sent a
- 		 * packet we assume it should be ready to receive them.
- 		 *
- 		 * Actually, this is what the status bits in the descriptor are
- 		 * for: we should *use* them.  FIXME! */
- 		if (*(bool *)dev->priv)
  			warn("network: no dma buffer!");
- 		discard_iovec(iov, &num);
- 	}
 
  	/* Read the packet from the device directly into the Guest's buffer. */
- 	len = readv(dev->fd, iov, num);
  	if (len <= 0)
  		err(1, "reading network");
 
- 	/* Write the used_len, and trigger the interrupt for the Guest */
- 	if (lenp) {
- 		*lenp = len;
- 		trigger_irq(fd, irq);
- 	}
  	verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
- 		((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
- 		lenp ? "sent" : "discarded");
  	/* All good. */
  	return true;
  }
 
- /* The last device handling routine is block output: the Guest has sent a DMA
-  * to the block device.  It will have placed the command it wants in the
-  * "struct lguest_block_page". */
- static u32 handle_block_output(int fd, const struct iovec *iov,
- 			       unsigned num, struct device *dev)
  {
- 	struct lguest_block_page *p = dev->mem;
- 	u32 irq, *lenp;
- 	unsigned int len, reply_num;
- 	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
- 	off64_t device_len, off = (off64_t)p->sector * 512;
- 
- 	/* First we extract the device length from the dev->priv pointer. */
- 	device_len = *(off64_t *)dev->priv;
- 
- 	/* We first check that the read or write is within the length of the
- 	 * block file. */
- 	if (off >= device_len)
- 		err(1, "Bad offset %llu vs %llu", off, device_len);
- 	/* Move to the right location in the block file.  This shouldn't fail,
- 	 * but best to check. */
- 	if (lseek64(dev->fd, off, SEEK_SET) != off)
- 		err(1, "Bad seek to sector %i", p->sector);
- 
- 	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
- 
- 	/* They were supposed to bind a reply buffer at key equal to the start
- 	 * of the block device memory.  We need this to tell them when the
- 	 * request is finished. */
- 	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
- 	if (!lenp)
- 		err(1, "Block request didn't give us a dma buffer");
- 
- 	if (p->type) {
- 		/* A write request.  The DMA they sent contained the data, so
- 		 * write it out. */
- 		len = writev(dev->fd, iov, num);
- 		/* Grr... Now we know how long the "struct lguest_dma" they
- 		 * sent was, we make sure they didn't try to write over the end
- 		 * of the block file (possibly extending it). */
- 		if (off + len > device_len) {
- 			/* Trim it back to the correct length */
- 			ftruncate64(dev->fd, device_len);
- 			/* Die, bad Guest, die. */
- 			errx(1, "Write past end %llu+%u", off, len);
- 		}
- 		/* The reply length is 0: we just send back an empty DMA to
- 		 * interrupt them and tell them the write is finished. */
- 		*lenp = 0;
- 	} else {
- 		/* A read request.  They sent an empty DMA to start the
- 		 * request, and we put the read contents into the reply
- 		 * buffer. */
- 		len = readv(dev->fd, reply, reply_num);
- 		*lenp = len;
- 	}
- 
- 	/* The result is 1 (done), 2 if there was an error (short read or
- 	 * write). */
- 	p->result = 1 + (p->bytes != len);
- 	/* Now tell them we've used their reply buffer. */
- 	trigger_irq(fd, irq);
- 
- 	/* We're supposed to return the number of bytes of the output buffer we
- 	 * used.  But the block device uses the "result" field instead, so we
- 	 * don't bother. */
- 	return 0;
  }
 
- /* This is the generic routine we call when the Guest sends some DMA out. */
- static void handle_output(int fd, unsigned long dma, unsigned long key,
- 			  struct device_list *devices)
  {
  	struct device *i;
- 	u32 *lenp;
- 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
- 	unsigned num = 0;
 
- 	/* Convert the "struct lguest_dma" they're sending to a "struct
- 	 * iovec". */
- 	lenp = dma2iov(dma, iov, &num);
- 
- 	/* Check each device: if they expect output to this key, tell them to
- 	 * handle it. */
- 	for (i = devices->dev; i; i = i->next) {
- 		if (i->handle_output && key == i->watch_key) {
- 			/* We write the result straight into the used_len field
- 			 * for them. */
- 			*lenp = i->handle_output(fd, iov, num, i);
- 			return;
  		}
  	}
 
- 	/* This can happen: the kernel sends any SEND_DMA which doesn't match
- 	 * another Guest to us.  It could be that another Guest just left a
- 	 * network, for example.  But it's unusual. */
- 	warnx("Pending dma %p, key %p", (void *)dma, (void *)key);
  }
 
  /* This is called when the waker wakes us up: check for incoming file
   * descriptors. */
- static void handle_input(int fd, struct device_list *devices)
  {
  	/* select() wants a zeroed timeval to mean "don't wait". */
  	struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
 
  	for (;;) {
  		struct device *i;
- 		fd_set fds = devices->infds;
 
  		/* If nothing is ready, we're done. */
- 		if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
  			break;
 
  		/* Otherwise, call the device(s) which have readable
  		 * file descriptors and a method of handling them. */
- 		for (i = devices->dev; i; i = i->next) {
  			if (i->handle_input && FD_ISSET(i->fd, &fds)) {
  				/* If handle_input() returns false, it means we
- 				 * should no longer service it.
- 				 * handle_console_input() does this. */
- 				if (!i->handle_input(fd, i)) {
- 					/* Clear it from the set of input file
- 					 * descriptors kept at the head of the
- 					 * device list. */
- 					FD_CLR(i->fd, &devices->infds);
- 					/* Tell waker to ignore it too... */
- 					write(waker_fd, &i->fd, sizeof(i->fd));
- 				}
  			}
  		}
  	}
···
   * routines to allocate them.
   *
   * This routine allocates a new "struct lguest_device_desc" from the descriptor
-  * table in the devices array just above the Guest's normal memory. */
- static struct lguest_device_desc *
- new_dev_desc(struct lguest_device_desc *descs,
- 	     u16 type, u16 features, u16 num_pages)
  {
- 	unsigned int i;
 
- 	for (i = 0; i < LGUEST_MAX_DEVICES; i++) {
- 		if (!descs[i].type) {
- 			descs[i].type = type;
- 			descs[i].features = features;
- 			descs[i].num_pages = num_pages;
- 			/* If they said the device needs memory, we allocate
- 			 * that now, bumping up the top of Guest memory. */
- 			if (num_pages) {
- 				map_zeroed_pages(top, num_pages);
- 				descs[i].pfn = top/getpagesize();
- 				top += num_pages*getpagesize();
- 			}
- 			return &descs[i];
- 		}
- 	}
- 	errx(1, "too many devices");
  }
 
- /* This monster routine does all the creation and setup of a new device,
-  * including calling new_dev_desc() to allocate the descriptor and device
-  * memory. */
- static struct device *new_device(struct device_list *devices,
- 				 u16 type, u16 num_pages, u16 features,
- 				 int fd,
- 				 bool (*handle_input)(int, struct device *),
- 				 unsigned long watch_off,
- 				 u32 (*handle_output)(int,
- 						      const struct iovec *,
- 						      unsigned,
- 						      struct device *))
  {
  	struct device *dev = malloc(sizeof(*dev));
···
  	 * easier, but the user expects the devices to be arranged on the bus
  	 * in command-line order.  The first network device on the command line
  	 * is eth0, the first block device /dev/lgba, etc. */
- 	*devices->lastdev = dev;
  	dev->next = NULL;
- 	devices->lastdev = &dev->next;
 
  	/* Now we populate the fields one at a time. */
  	dev->fd = fd;
  	/* If we have an input handler for this file descriptor, then we add it
  	 * to the device_list's fdset and maxfd. */
  	if (handle_input)
- 		set_fd(dev->fd, devices);
- 	dev->desc = new_dev_desc(devices->descs, type, features, num_pages);
- 	dev->mem = (void *)(dev->desc->pfn * getpagesize());
  	dev->handle_input = handle_input;
- 	dev->watch_key = (unsigned long)dev->mem + watch_off;
- 	dev->handle_output = handle_output;
  	return dev;
  }
 
  /* Our first setup routine is the console.  It's a fairly simple device, but
   * UNIX tty handling makes it uglier than it could be. */
- static void setup_console(struct device_list *devices)
  {
  	struct device *dev;
···
  		atexit(restore_term);
  	}
 
- 	/* We don't currently require any memory for the console, so we ask for
- 	 * 0 pages. */
- 	dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
- 			 STDIN_FILENO, handle_console_input,
- 			 LGUEST_CONSOLE_DMA_KEY, handle_console_output);
  	/* We store the console state in dev->priv, and initialize it. */
  	dev->priv = malloc(sizeof(struct console_abort));
  	((struct console_abort *)dev->priv)->count = 0;
- 	verbose("device %p: console\n",
- 		(void *)(dev->desc->pfn * getpagesize()));
- }
 
- /* Setting up a block file is also fairly straightforward. */
- static void setup_block_file(const char *filename, struct device_list *devices)
- {
- 	int fd;
- 	struct device *dev;
- 	off64_t *device_len;
- 	struct lguest_block_page *p;
 
- 	/* We open with O_LARGEFILE because otherwise we get stuck at 2G.  We
- 	 * open with O_DIRECT because otherwise our benchmarks go much too
- 	 * fast. */
- 	fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
- 
- 	/* We want one page, and have no input handler (the block file never
- 	 * has anything interesting to say to us).  Our timing will be quite
- 	 * random, so it should be a reasonable randomness source. */
- 	dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
- 			 LGUEST_DEVICE_F_RANDOMNESS,
- 			 fd, NULL, 0, handle_block_output);
- 
- 	/* We store the device size in the private area */
- 	device_len = dev->priv = malloc(sizeof(*device_len));
- 	/* This is the safe way of establishing the size of our device: it
- 	 * might be a normal file or an actual block device like /dev/hdb. */
- 	*device_len = lseek64(fd, 0, SEEK_END);
- 
- 	/* The device memory is a "struct lguest_block_page".  It's zeroed
- 	 * already, we just need to put in the device size.  Block devices
- 	 * think in sectors (ie. 512 byte chunks), so we translate here. */
- 	p = dev->mem;
- 	p->num_sectors = *device_len/512;
- 	verbose("device %p: block %i sectors\n",
- 		(void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
- }
- 
- /*
-  * Network Devices.
-  *
-  * Setting up network devices is quite a pain, because we have three types.
-  * First, we have the inter-Guest network.  This is a file which is mapped into
-  * the address space of the Guests who are on the network.  Because it is a
-  * shared mapping, the same page underlies all the devices, and they can send
-  * DMA to each other.
-  *
-  * Remember from our network driver, the Guest is told what slot in the page it
-  * is to use.  We use exclusive fcntl locks to reserve a slot.  If another
-  * Guest is using a slot, the lock will fail and we try another.  Because fcntl
-  * locks are cleaned up automatically when we die, this cleverly means that our
-  * reservation on the slot will vanish if we crash. */
- static unsigned int find_slot(int netfd, const char *filename)
- {
- 	struct flock fl;
- 
- 	fl.l_type = F_WRLCK;
- 	fl.l_whence = SEEK_SET;
- 	fl.l_len = 1;
- 	/* Try a 1 byte lock in each possible position number */
- 	for (fl.l_start = 0;
- 	     fl.l_start < getpagesize()/sizeof(struct lguest_net);
- 	     fl.l_start++) {
- 		/* If we succeed, return the slot number. */
- 		if (fcntl(netfd, F_SETLK, &fl) == 0)
- 			return fl.l_start;
- 	}
- 	errx(1, "No free slots in network file %s", filename);
- }
- 
- /* This function sets up the network file */
- static void setup_net_file(const char *filename,
- 			   struct device_list *devices)
- {
- 	int netfd;
- 	struct device *dev;
- 
- 	/* We don't use open_or_die() here: for friendliness we create the file
- 	 * if it doesn't already exist. */
- 	netfd = open(filename, O_RDWR, 0);
- 	if (netfd < 0) {
- 		if (errno == ENOENT) {
- 			netfd = open(filename, O_RDWR|O_CREAT, 0600);
- 			if (netfd >= 0) {
- 				/* If we succeeded, initialize the file with a
- 				 * blank page. */
- 				char page[getpagesize()];
- 				memset(page, 0, sizeof(page));
- 				write(netfd, page, sizeof(page));
- 			}
- 		}
- 		if (netfd < 0)
- 			err(1, "cannot open net file '%s'", filename);
- 	}
- 
- 	/* We need 1 page, and the features indicate the slot to use and that
- 	 * no checksum is needed.  We never touch this device again; it's
- 	 * between the Guests on the network, so we don't register input or
- 	 * output handlers. */
- 	dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
- 			 find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
- 			 -1, NULL, 0, NULL);
- 
- 	/* Map the shared file. */
- 	if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
- 		 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
- 		err(1, "could not mmap '%s'", filename);
- 	verbose("device %p: shared net %s, peer %i\n",
- 		(void *)(dev->desc->pfn * getpagesize()), filename,
- 		dev->desc->features & ~LGUEST_NET_F_NOCSUM);
  }
  /*:*/
 
  static u32 str2ip(const char *ipaddr)
  {
···
 
  /* This sets up the Host end of the network device with an IP address, brings
   * it up so packets will flow, then copies the MAC address into the hwaddr
-  * pointer (in practice, the Host's slot in the network device's memory). */
  static void configure_device(int fd, const char *devname, u32 ipaddr,
  			     unsigned char hwaddr[6])
  {
···
  	memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
  }
 
- /*L:195 The other kind of network is a Host<->Guest network.  This can either
-  * use bridging or routing, but the principle is the same: it uses the "tun"
-  * device to inject packets into the Host as if they came in from a normal
-  * network card.  We just shunt packets between the Guest and the tun
-  * device. */
- static void setup_tun_net(const char *arg, struct device_list *devices)
  {
  	struct device *dev;
  	struct ifreq ifr;
  	int netfd, ipfd;
  	u32 ip;
  	const char *br_name = NULL;
 
  	/* We open the /dev/net/tun device and tell it we want a tap device.  A
  	 * tap device is like a tun device, only somehow different.  To tell
···
  	 * device: trust us! */
  	ioctl(netfd, TUNSETNOCSUM, 1);
 
- 	/* We create the net device with 1 page, using the features field of
- 	 * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and
- 	 * that the device has fairly random timing.  We do *not* specify
- 	 * LGUEST_NET_F_NOCSUM: these packets can reach the real world.
- 	 *
- 	 * We will put our MAC address in slot 0 for the Guest to see, so
- 	 * it will send packets to us using the key "peer_offset(0)": */
- 	dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
- 			 NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
- 			 handle_tun_input, peer_offset(0), handle_tun_output);
 
- 	/* We keep a flag which says whether we've seen packets come out from
- 	 * this network device. */
- 	dev->priv = malloc(sizeof(bool));
- 	*(bool *)dev->priv = false;
 
  	/* We need a socket to perform the magic network ioctls to bring up the
  	 * tap interface, connect to the bridge etc.  Any socket will do! */
···
  	} else /* It is an IP address to set up the device with */
  		ip = str2ip(arg);
 
- 	/* We are peer 0, ie. first slot, so we hand dev->mem to this routine
- 	 * to write the MAC address at the start of the device memory. */
- 	configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
 
- 	/* Set "promisc" bit: we want every single packet if we're going to
- 	 * bridge to other machines (and otherwise it doesn't matter). */
- 	*((u8 *)dev->mem) |= 0x1;
 
  	close(ipfd);
 
- 	verbose("device %p: tun net %u.%u.%u.%u\n",
- 		(void *)(dev->desc->pfn * getpagesize()),
- 		(u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip);
  	if (br_name)
  		verbose("attached to bridge: %s\n", br_name);
  }
  /* That's the end of device setup. */
 
  /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
   * its input and output, and finally, lays it to rest. */
- static void __attribute__((noreturn))
- run_guest(int lguest_fd, struct device_list *device_list)
  {
  	for (;;) {
- 		u32 args[] = { LHREQ_BREAK, 0 };
- 		unsigned long arr[2];
  		int readval;
 
  		/* We read from the /dev/lguest device to run the Guest. */
- 		readval = read(lguest_fd, arr, sizeof(arr));
 
- 		/* The read can only really return sizeof(arr) (the Guest did a
- 		 * SEND_DMA to us), or an error. */
 
- 		/* For a successful read, arr[0] is the address of the "struct
- 		 * lguest_dma", and arr[1] is the key the Guest sent to. */
- 		if (readval == sizeof(arr)) {
- 			handle_output(lguest_fd, arr[0], arr[1], device_list);
  			continue;
  		/* ENOENT means the Guest died.  Reading tells us why. */
  		} else if (errno == ENOENT) {
···
 
  		/* Service input, then unset the BREAK which releases
  		 * the Waker. */
- 		handle_input(lguest_fd, device_list);
  		if (write(lguest_fd, args, sizeof(args)) < 0)
  			err(1, "Resetting break");
  	}
···
 
  static struct option opts[] = {
  	{ "verbose", 0, NULL, 'v' },
- 	{ "sharenet", 1, NULL, 's' },
  	{ "tunnet", 1, NULL, 't' },
  	{ "block", 1, NULL, 'b' },
  	{ "initrd", 1, NULL, 'i' },
···
  static void usage(void)
  {
  	errx(1, "Usage: lguest [--verbose] "
- 	     "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
  	     "|--block=<filename>|--initrd=<filename>]...\n"
  	     "<mem-in-mb> vmlinux [args...]");
  }
 
- /*L:100 The Launcher code itself takes us out into userspace, that scary place
-  * where pointers run wild and free!  Unfortunately, like most userspace
-  * programs, it's quite boring (which is why everyone likes to hack on the
-  * kernel!).  Perhaps if you make up an Lguest Drinking Game at this point, it
-  * will get you through this section.  Or, maybe not.
-  *
-  * The Launcher binary sits up high, usually starting at address 0xB8000000.
-  * Everything below this is the "physical" memory for the Guest.  For example,
-  * if the Guest were to write a "1" at physical address 0, we would see a "1"
-  * in the Launcher at "(int *)0".  Guest physical == Launcher virtual.
-  *
-  * This can be tough to get your head around, but usually it just means that we
-  * don't need to do any conversion when the Guest gives us its "physical"
-  * addresses.
-  */
  int main(int argc, char *argv[])
  {
- 	/* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size
- 	 * of the (optional) initrd. */
- 	unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0;
  	/* A temporary and the /dev/lguest file descriptor. */
  	int i, c, lguest_fd;
- 	/* The list of Guest devices, based on command line arguments. */
- 	struct device_list device_list;
- 	/* The boot information for the Guest: at guest-physical address 0. */
- 	void *boot = (void *)0;
  	/* If they specify an initrd file to load. */
  	const char *initrd_name = NULL;
···
  	 * device receive input from a file descriptor, we keep an fdset
  	 * (infds) and the maximum fd number (max_infd) with the head of the
  	 * list.  We also keep a pointer to the last device, for easy appending
- 	 * to the list. */
- 	device_list.max_infd = -1;
- 	device_list.dev = NULL;
- 	device_list.lastdev = &device_list.dev;
- 	FD_ZERO(&device_list.infds);
 
  	/* We need to know how much memory so we can set up the device
  	 * descriptor and memory pages for the devices as we parse the command
···
  	 * of memory now. */
  	for (i = 1; i < argc; i++) {
  		if (argv[i][0] != '-') {
- 			mem = top = atoi(argv[i]) * 1024 * 1024;
- 			device_list.descs = map_zeroed_pages(top, 1);
- 			top += getpagesize();
  			break;
  		}
  	}
···
  		case 'v':
  			verbose = true;
  			break;
- 		case 's':
- 			setup_net_file(optarg, &device_list);
- 			break;
  		case 't':
- 			setup_tun_net(optarg, &device_list);
  			break;
  		case 'b':
- 			setup_block_file(optarg, &device_list);
  			break;
  		case 'i':
  			initrd_name = optarg;
···
  	if (optind + 2 > argc)
  		usage();
 
- 	/* We always have a console device */
- 	setup_console(&device_list);
 
- 	/* We start by mapping anonymous pages over all of guest-physical
- 	 * memory range.  This fills it with 0, and ensures that the Guest
- 	 * won't be killed when it tries to access it. */
- 	map_zeroed_pages(0, mem / getpagesize());
 
  	/* Now we load the kernel */
- 	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
- 			    &page_offset);
 
  	/* Map the initrd image if requested (at top of physical memory) */
  	if (initrd_name) {
  		initrd_size = load_initrd(initrd_name, mem);
  		/* These are the location in the Linux boot header where the
  		 * start and size of the initrd are expected to be found. */
- 		*(unsigned long *)(boot+0x218) = mem - initrd_size;
- 		*(unsigned long *)(boot+0x21c) = initrd_size;
  		/* The bootloader type 0xFF means "unknown"; that's OK. */
- 		*(unsigned char *)(boot+0x210) = 0xFF;
  	}
 
  	/* Set up the initial linear pagetables, starting below the initrd. */
- 	pgdir = setup_pagetables(mem, initrd_size, page_offset);
 
  	/* The Linux boot header contains an "E820" memory map: ours is a
  	 * simple, single region. */
- 	*(char *)(boot+E820NR) = 1;
- 	*((struct e820entry *)(boot+E820MAP))
- 		= ((struct e820entry) { 0, mem, E820_RAM });
  	/* The boot header contains a command line pointer: we put the command
- 	 * line after the boot header (at address 4096) */
- 	*(void **)(boot + 0x228) = boot + 4096;
- 	concat(boot + 4096, argv+optind+2);
 
- 	/* The guest type value of "1" tells the Guest it's under lguest. */
- 	*(int *)(boot + 0x23c) = 1;
 
  	/* We tell the kernel to initialize the Guest: this returns the open
  	 * /dev/lguest file descriptor. */
- 	lguest_fd = tell_kernel(pgdir, start, page_offset);
 
  	/* We fork off a child process, which wakes the Launcher whenever one
  	 * of the input file descriptors needs attention.  Otherwise we would
  	 * run the Guest until it tries to output something. */
- 	waker_fd = setup_waker(lguest_fd, &device_list);
 
  	/* Finally, run the Guest.  This doesn't return. */
- 	run_guest(lguest_fd, &device_list);
  }
  /*:*/
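The first commit in the shortlog, "Use 'struct boot_params' in example launcher", replaces the magic offsets poked in the old main() above (0x210, 0x218, 0x21c, 0x228, 0x23c) with named fields from asm-x86/bootparam.h. As a sanity check, here is a stand-alone sketch; struct boot_params_sketch is a hypothetical cut-down stand-in, not the real header, with only the fields the launcher touches padded out to their documented offsets (see Documentation/i386/boot.txt).

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the boot-parameter page at guest-physical 0. */
struct boot_params_sketch {
	uint8_t  _gap0[0x210];
	uint8_t  type_of_loader;	/* 0x210: 0xFF = "unknown loader" */
	uint8_t  _gap1[0x218 - 0x211];
	uint32_t ramdisk_image;		/* 0x218: initrd load address */
	uint32_t ramdisk_size;		/* 0x21c: initrd size in bytes */
	uint8_t  _gap2[0x228 - 0x220];
	uint32_t cmd_line_ptr;		/* 0x228: guest-physical addr of cmdline */
	uint8_t  _gap3[0x23c - 0x22c];
	uint32_t hardware_subarch;	/* 0x23c: 1 = lguest */
};

int main(void)
{
	/* Verify the named fields sit where the old magic numbers pointed. */
	assert(offsetof(struct boot_params_sketch, type_of_loader) == 0x210);
	assert(offsetof(struct boot_params_sketch, ramdisk_image) == 0x218);
	assert(offsetof(struct boot_params_sketch, ramdisk_size) == 0x21c);
	assert(offsetof(struct boot_params_sketch, cmd_line_ptr) == 0x228);
	assert(offsetof(struct boot_params_sketch, hardware_subarch) == 0x23c);
	puts("offsets match the old magic numbers");
	return 0;
}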
  /*P:100 This is the Launcher code, a simple program which lays out the
   * "physical" memory for the new Guest by mapping the kernel image and the
   * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
+ :*/
  #define _LARGEFILE64_SOURCE
  #define _GNU_SOURCE
  #include <stdio.h>
···
  #include <stdlib.h>
  #include <elf.h>
  #include <sys/mman.h>
+ #include <sys/param.h>
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <sys/wait.h>
···
  #include <termios.h>
  #include <getopt.h>
  #include <zlib.h>
+ #include <assert.h>
+ #include <sched.h>
+ /*L:110 We can ignore the 30 include files we need for this program, but I do
   * want to draw attention to the use of kernel-style types.
   *
   * As Linus said, "C is a Spartan language, and so should your naming be."  I
···
  typedef uint32_t u32;
  typedef uint16_t u16;
  typedef uint8_t u8;
+ #include "linux/lguest_launcher.h"
+ #include "linux/pci_ids.h"
+ #include "linux/virtio_config.h"
+ #include "linux/virtio_net.h"
+ #include "linux/virtio_blk.h"
+ #include "linux/virtio_console.h"
+ #include "linux/virtio_ring.h"
+ #include "asm-x86/bootparam.h"
  /*:*/
 
  #define PAGE_PRESENT 0x7	/* Present, RW, Execute */
···
  #ifndef SIOCBRADDIF
  #define SIOCBRADDIF	0x89a2	/* add interface to bridge */
  #endif
+ /* We can have up to 256 pages for devices. */
+ #define DEVICE_PAGES 256
+ /* This fits nicely in a single 4096-byte page. */
+ #define VIRTQUEUE_NUM 127
 
  /*L:120 verbose is both a global flag and a macro.  The C preprocessor allows
   * this, and although I wouldn't recommend it, it works quite nicely here. */
···
 
  /* The pipe to send commands to the waker process */
  static int waker_fd;
+ /* The pointer to the start of guest memory. */
+ static void *guest_base;
+ /* The maximum guest physical address allowed, and maximum possible. */
+ static unsigned long guest_limit, guest_max;
 
  /* This is our list of devices. */
  struct device_list
···
  	fd_set infds;
  	int max_infd;
 
+ 	/* Counter to assign interrupt numbers. */
+ 	unsigned int next_irq;
+ 
+ 	/* Counter to print out convenient device numbers. */
+ 	unsigned int device_num;
+ 
  	/* The descriptor page for the devices. */
+ 	u8 *descpage;
+ 
+ 	/* The tail of the last descriptor. */
+ 	unsigned int desc_used;
 
  	/* A single linked list of devices. */
  	struct device *dev;
···
  	struct device **lastdev;
  };
 
+ /* The list of Guest devices, based on command line arguments. */
+ static struct device_list devices;
+ 
  /* The device structure describes a single device. */
  struct device
  {
  	/* The linked-list pointer. */
  	struct device *next;
+ 
+ 	/* This device's descriptor, as mapped into the Guest. */
  	struct lguest_device_desc *desc;
+ 
+ 	/* The name of this device, for --verbose. */
+ 	const char *name;
 
  	/* If handle_input is set, it wants to be called when this file
  	 * descriptor is ready. */
  	int fd;
  	bool (*handle_input)(int fd, struct device *me);
 
+ 	/* Any queues attached to this device */
+ 	struct virtqueue *vq;
 
  	/* Device-specific data. */
  	void *priv;
  };
+ 
+ /* The virtqueue structure describes a queue attached to a device. */
+ struct virtqueue
+ {
+ 	struct virtqueue *next;
+ 
+ 	/* Which device owns me. */
+ 	struct device *dev;
+ 
+ 	/* The configuration for this queue. */
+ 	struct lguest_vqconfig config;
+ 
+ 	/* The actual ring of buffers. */
+ 	struct vring vring;
+ 
+ 	/* Last available index we saw. */
+ 	u16 last_avail_idx;
+ 
+ 	/* The routine to call when the Guest pings us. */
+ 	void (*handle_output)(int fd, struct virtqueue *me);
+ };
+ 
+ /* Since guest is UP and we don't run at the same time, we don't need barriers.
+  * But I include them in the code in case others copy it. */
+ #define wmb()
+ 
+ /* Convert an iovec element to the given type.
+  *
+  * This is a fairly ugly trick: we need to know the size of the type and
+  * alignment requirement to check the pointer is kosher.  It's also nice to
+  * have the name of the type in case we report failure.
+  *
+  * Typing those three things all the time is cumbersome and error prone, so we
+  * have a macro which sets them all up and passes to the real function. */
+ #define convert(iov, type) \
+ 	((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
+ 
+ static void *_convert(struct iovec *iov, size_t size, size_t align,
+ 		      const char *name)
+ {
+ 	if (iov->iov_len != size)
+ 		errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
+ 	if ((unsigned long)iov->iov_base % align != 0)
+ 		errx(1, "Bad alignment %p for %s", iov->iov_base, name);
+ 	return iov->iov_base;
+ }
+ 
+ /* The virtio configuration space is defined to be little-endian.  x86 is
+  * little-endian too, but it's nice to be explicit so we have these helpers. */
+ #define cpu_to_le16(v16) (v16)
+ #define cpu_to_le32(v32) (v32)
+ #define cpu_to_le64(v64) (v64)
+ #define le16_to_cpu(v16) (v16)
+ #define le32_to_cpu(v32) (v32)
+ #define le64_to_cpu(v64) (v64)
+ 
+ /*L:100 The Launcher code itself takes us out into userspace, that scary place
+  * where pointers run wild and free!  Unfortunately, like most userspace
+  * programs, it's quite boring (which is why everyone likes to hack on the
+  * kernel!).  Perhaps if you make up an Lguest Drinking Game at this point, it
+  * will get you through this section.  Or, maybe not.
+  *
+  * The Launcher sets up a big chunk of memory to be the Guest's "physical"
+  * memory and stores it in "guest_base".  In other words, Guest physical ==
+  * Launcher virtual with an offset.
+  *
+  * This can be tough to get your head around, but usually it just means that we
+  * use these trivial conversion functions when the Guest gives us its
+  * "physical" addresses: */
+ static void *from_guest_phys(unsigned long addr)
+ {
+ 	return guest_base + addr;
+ }
+ 
+ static unsigned long to_guest_phys(const void *addr)
+ {
+ 	return (addr - guest_base);
+ }
 
  /*L:130
   * Loading the Kernel.
···
  	return fd;
  }
 
+ /* map_zeroed_pages() takes a number of pages. */
+ static void *map_zeroed_pages(unsigned int num)
  {
+ 	int fd = open_or_die("/dev/zero", O_RDONLY);
+ 	void *addr;
 
  	/* We use a private mapping (ie. if we write to the page, it will be
+ 	 * copied). */
+ 	addr = mmap(NULL, getpagesize() * num,
+ 		    PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
+ 	if (addr == MAP_FAILED)
+ 		err(1, "Mmaping %u pages of /dev/zero", num);
 
+ 	return addr;
  }
 
+ /* Get some more pages for a device. */
+ static void *get_pages(unsigned int num)
  {
+ 	void *addr = from_guest_phys(guest_limit);
 
+ 	guest_limit += num * getpagesize();
+ 	if (guest_limit > guest_max)
+ 		errx(1, "Not enough memory for devices");
+ 	return addr;
+ }
 
+ /* This routine is used to load the kernel or initrd.  It tries mmap, but if
+  * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
+  * it falls back to reading the memory in. */
+ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
+ {
+ 	ssize_t r;
+ 
+ 	/* We map writable even though some segments are marked read-only.
+ 	 * The kernel really wants to be writable: it patches its own
+ 	 * instructions.
+ 	 *
+ 	 * MAP_PRIVATE means that the page won't be copied until a write is
+ 	 * done to it.  This allows us to share untouched memory between
+ 	 * Guests. */
+ 	if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
+ 		 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
+ 		return;
+ 
+ 	/* pread does a seek and a read in one shot: saves a few lines. */
+ 	r = pread(fd, addr, len, offset);
+ 	if (r != len)
+ 		err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
  }
 
  /* This routine takes an open vmlinux image, which is in ELF, and maps it into
···
   * by all modern binaries on Linux including the kernel.
   *
   * The ELF headers give *two* addresses: a physical address, and a virtual
+  * address.  We use the physical address; the Guest will map itself to the
+  * virtual address.
   *
   * We return the starting address. */
+ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
  {
  	Elf32_Phdr phdr[ehdr->e_phnum];
  	unsigned int i;
 
  	/* Sanity checks on the main ELF header: an x86 executable with a
  	 * reasonable number of correctly-sized program headers. */
···
  	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
  		err(1, "Reading program headers");
 
  	/* Try all the headers: there are usually only three.  A read-only one,
  	 * a read-write one, and a "note" section which isn't loadable. */
  	for (i = 0; i < ehdr->e_phnum; i++) {
···
  		verbose("Section %i: size %i addr %p\n",
  			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
 
+ 		/* We map this section of the file at its physical address. */
+ 		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
+ 		       phdr[i].p_offset, phdr[i].p_filesz);
  	}
 
+ 	/* The entry point is given in the ELF header. */
+ 	return ehdr->e_entry;
  }
 
  /*L:150 A bzImage, unlike an ELF file, is not meant to be loaded.  You're
+  * supposed to jump into it and it will unpack itself.  We used to have to
+  * perform some hairy magic because the unpacking code scared me.
   *
+  * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
+  * a small patch to jump over the tricky bits in the Guest, so now we just read
+  * the funky header so we know where in the file to load, and away we go! */
+ static unsigned long load_bzimage(int fd)
  {
+ 	struct boot_params boot;
+ 	int r;
+ 	/* Modern bzImages get loaded at 1M. */
+ 	void *p = from_guest_phys(0x100000);
 
+ 	/* Go back to the start of the file and read the header.  It should be
+ 	 * a Linux boot header (see Documentation/i386/boot.txt) */
+ 	lseek(fd, 0, SEEK_SET);
+ 	read(fd, &boot, sizeof(boot));
+ 
+ 	/* Inside the setup_hdr, we expect the magic "HdrS" */
+ 	if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
+ 		errx(1, "This doesn't look like a bzImage to me");
+ 
+ 	/* Skip over the extra sectors of the header. */
+ 	lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
+ 
+ 	/* Now read everything into memory, in nice big chunks. */
+ 	while ((r = read(fd, p, 65536)) > 0)
+ 		p += r;
+ 
+ 	/* Finally, code32_start tells us where to enter the kernel. */
+ 	return boot.hdr.code32_start;
  }
 
  /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
   * come wrapped up in the self-decompressing "bzImage" format.  With some funky
   * coding, we can load those, too. */
+ static unsigned long load_kernel(int fd)
  {
  	Elf32_Ehdr hdr;
 
···
 
  	/* If it's an ELF file, it starts with "\177ELF" */
  	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
+ 		return map_elf(fd, &hdr);
 
  	/* Otherwise we assume it's a bzImage, and try to unpack it */
+ 	return load_bzimage(fd);
  }
 
  /* This is a trivial little helper to align pages.  Andi Kleen hated it because
···
  	int ifd;
  	struct stat st;
  	unsigned long len;
 
  	ifd = open_or_die(name, O_RDONLY);
  	/* fstat() is needed to get the file size. */
  	if (fstat(ifd, &st) < 0)
  		err(1, "fstat() on initrd '%s'", name);
 
+ 	/* We map the initrd at the top of memory, but mmap wants it to be
+ 	 * page-aligned, so we round the size up for that. */
  	len = page_align(st.st_size);
+ 	map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
  	/* Once a file is mapped, you can close the file descriptor.  It's a
  	 * little odd, but quite useful. */
  	close(ifd);
+ 	verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void *)mem-len);
 
  	/* We return the initrd size. */
  	return len;
  }
 
+ /* Once we know how much memory we have, we can construct simple linear page
+  * tables which set virtual == physical which will get the Guest far enough
+  * into the boot to create its own.
   *
   * We lay them out of the way, just below the initrd (which is why we need to
   * know its size). */
  static unsigned long setup_pagetables(unsigned long mem,
+ 				      unsigned long initrd_size)
  {
+ 	unsigned long *pgdir, *linear;
  	unsigned int mapped_pages, i, linear_pages;
+ 	unsigned int ptes_per_page = getpagesize()/sizeof(void *);
 
+ 	mapped_pages = mem/getpagesize();
 
  	/* Each PTE page can map ptes_per_page pages: how many do we need? */
  	linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
 
  	/* We put the toplevel page directory page at the top of memory. */
+ 	pgdir = from_guest_phys(mem) - initrd_size - getpagesize();
 
  	/* Now we use the next linear_pages pages as pte pages */
  	linear = (void *)pgdir - linear_pages*getpagesize();
···
  	for (i = 0; i < mapped_pages; i++)
  		linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
 
+ 	/* The top level points to the linear page table pages above.
*/ 469 for (i = 0; i < mapped_pages; i += ptes_per_page) { 470 + pgdir[i/ptes_per_page] 471 + = ((to_guest_phys(linear) + i*sizeof(void *)) 472 + | PAGE_PRESENT); 473 } 474 475 + verbose("Linear mapping of %u pages in %u pte pages at %#lx\n", 476 + mapped_pages, linear_pages, to_guest_phys(linear)); 477 478 /* We return the top level (guest-physical) address: the kernel needs 479 * to know where it is. */ 480 + return to_guest_phys(pgdir); 481 } 482 483 /* Simple routine to roll all the commandline arguments together with spaces ··· 498 499 /* This is where we actually tell the kernel to initialize the Guest. We saw 500 * the arguments it expects when we looked at initialize() in lguest_user.c: 501 + * the base of guest "physical" memory, the top physical page to allow, the 502 + * top level pagetable and the entry point for the Guest. */ 503 + static int tell_kernel(unsigned long pgdir, unsigned long start) 504 { 505 + unsigned long args[] = { LHREQ_INITIALIZE, 506 + (unsigned long)guest_base, 507 + guest_limit / getpagesize(), pgdir, start }; 508 int fd; 509 510 + verbose("Guest: %p - %p (%#lx)\n", 511 + guest_base, guest_base + guest_limit, guest_limit); 512 fd = open_or_die("/dev/lguest", O_RDWR); 513 if (write(fd, args, sizeof(args)) < 0) 514 err(1, "Writing to /dev/lguest"); ··· 515 } 516 /*:*/ 517 518 + static void add_device_fd(int fd) 519 { 520 + FD_SET(fd, &devices.infds); 521 + if (fd > devices.max_infd) 522 + devices.max_infd = fd; 523 } 524 525 /*L:200 ··· 537 * 538 * This, of course, is merely a different *kind* of icky. 539 */ 540 + static void wake_parent(int pipefd, int lguest_fd) 541 { 542 /* Add the pipe from the Launcher to the fdset in the device_list, so 543 * we watch it, too. */ 544 + add_device_fd(pipefd); 545 546 for (;;) { 547 + fd_set rfds = devices.infds; 548 + unsigned long args[] = { LHREQ_BREAK, 1 }; 549 550 /* Wait until input is ready from one of the devices. */ 551 + select(devices.max_infd+1, &rfds, NULL, NULL, NULL); 552 /* Is it a message from the Launcher? */ 553 if (FD_ISSET(pipefd, &rfds)) { 554 + int fd; 555 /* If read() returns 0, it means the Launcher has 556 * exited. We silently follow. */ 557 + if (read(pipefd, &fd, sizeof(fd)) == 0) 558 exit(0); 559 + /* Otherwise it's telling us to change what file 560 + * descriptors we're to listen to. */ 561 + if (fd >= 0) 562 + FD_SET(fd, &devices.infds); 563 + else 564 + FD_CLR(-fd - 1, &devices.infds); 565 } else /* Send LHREQ_BREAK command. */ 566 write(lguest_fd, args, sizeof(args)); 567 } 568 } 569 570 /* This routine just sets up a pipe to the Waker process. */ 571 + static int setup_waker(int lguest_fd) 572 { 573 int pipefd[2], child; 574 ··· 580 if (child == 0) { 581 /* Close the "writing" end of our copy of the pipe */ 582 close(pipefd[1]); 583 + wake_parent(pipefd[0], lguest_fd); 584 } 585 /* Close the reading end of our copy of the pipe. */ 586 close(pipefd[0]); ··· 602 { 603 /* We have to separately check addr and addr+size, because size could 604 * be huge and addr + size might wrap around. */ 605 + if (addr >= guest_limit || addr + size >= guest_limit) 606 + errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); 607 /* We return a pointer for the caller's convenience, now we know it's 608 * safe to use. */ 609 + return from_guest_phys(addr); 610 } 611 /* A macro which transparently hands the line number to the real function. 
*/ 612 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 613 614 + /* This function returns the next descriptor in the chain, or vq->vring.num. */ 615 + static unsigned next_desc(struct virtqueue *vq, unsigned int i) 616 { 617 + unsigned int next; 618 619 + /* If this descriptor says it doesn't chain, we're done. */ 620 + if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) 621 + return vq->vring.num; 622 623 + /* Check they're not leading us off end of descriptors. */ 624 + next = vq->vring.desc[i].next; 625 + /* Make sure compiler knows to grab that: we don't want it changing! */ 626 + wmb(); 627 628 + if (next >= vq->vring.num) 629 + errx(1, "Desc next is %u", next); 630 + 631 + return next; 632 } 633 634 + /* This looks in the virtqueue and for the first available buffer, and converts 635 + * it to an iovec for convenient access. Since descriptors consist of some 636 + * number of output then some number of input descriptors, it's actually two 637 + * iovecs, but we pack them into one and note how many of each there were. 638 + * 639 + * This function returns the descriptor number found, or vq->vring.num (which 640 + * is never a valid descriptor number) if none was found. */ 641 + static unsigned get_vq_desc(struct virtqueue *vq, 642 + struct iovec iov[], 643 + unsigned int *out_num, unsigned int *in_num) 644 { 645 + unsigned int i, head; 646 647 + /* Check it isn't doing very strange things with descriptor numbers. */ 648 + if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num) 649 + errx(1, "Guest moved used index from %u to %u", 650 + vq->last_avail_idx, vq->vring.avail->idx); 651 652 + /* If there's nothing new since last we looked, return invalid. */ 653 + if (vq->vring.avail->idx == vq->last_avail_idx) 654 + return vq->vring.num; 655 + 656 + /* Grab the next descriptor number they're advertising, and increment 657 + * the index we've seen. */ 658 + head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num]; 659 + 660 + /* If their number is silly, that's a fatal mistake. */ 661 + if (head >= vq->vring.num) 662 + errx(1, "Guest says index %u is available", head); 663 + 664 + /* When we start there are none of either input nor output. */ 665 + *out_num = *in_num = 0; 666 + 667 + i = head; 668 + do { 669 + /* Grab the first descriptor, and check it's OK. */ 670 + iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; 671 + iov[*out_num + *in_num].iov_base 672 + = check_pointer(vq->vring.desc[i].addr, 673 + vq->vring.desc[i].len); 674 + /* If this is an input descriptor, increment that count. */ 675 + if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) 676 + (*in_num)++; 677 + else { 678 + /* If it's an output descriptor, they're all supposed 679 + * to come before any input descriptors. */ 680 + if (*in_num) 681 + errx(1, "Descriptor has out after in"); 682 + (*out_num)++; 683 + } 684 + 685 + /* If we've got too many, that implies a descriptor loop. */ 686 + if (*out_num + *in_num > vq->vring.num) 687 + errx(1, "Looped descriptor"); 688 + } while ((i = next_desc(vq, i)) != vq->vring.num); 689 + 690 + return head; 691 } 692 693 + /* Once we've used one of their buffers, we tell them about it. We'll then 694 + * want to send them an interrupt, using trigger_irq(). */ 695 + static void add_used(struct virtqueue *vq, unsigned int head, int len) 696 { 697 + struct vring_used_elem *used; 698 + 699 + /* Get a pointer to the next entry in the used ring. 
*/ 700 + used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; 701 + used->id = head; 702 + used->len = len; 703 + /* Make sure buffer is written before we update index. */ 704 + wmb(); 705 + vq->vring.used->idx++; 706 + } 707 + 708 + /* This actually sends the interrupt for this virtqueue */ 709 + static void trigger_irq(int fd, struct virtqueue *vq) 710 + { 711 + unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; 712 + 713 + if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) 714 + return; 715 + 716 + /* Send the Guest an interrupt tell them we used something up. */ 717 if (write(fd, buf, sizeof(buf)) != 0) 718 + err(1, "Triggering irq %i", vq->config.irq); 719 } 720 721 + /* And here's the combo meal deal. Supersize me! */ 722 + static void add_used_and_trigger(int fd, struct virtqueue *vq, 723 + unsigned int head, int len) 724 { 725 + add_used(vq, head, len); 726 + trigger_irq(fd, vq); 727 } 728 729 /* Here is the input terminal setting we save, and the routine to restore them ··· 701 /* This is the routine which handles console input (ie. stdin). */ 702 static bool handle_console_input(int fd, struct device *dev) 703 { 704 int len; 705 + unsigned int head, in_num, out_num; 706 + struct iovec iov[dev->vq->vring.num]; 707 struct console_abort *abort = dev->priv; 708 709 + /* First we need a console buffer from the Guests's input virtqueue. */ 710 + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 711 + 712 + /* If they're not ready for input, stop listening to this file 713 + * descriptor. We'll start again once they add an input buffer. */ 714 + if (head == dev->vq->vring.num) 715 + return false; 716 + 717 + if (out_num) 718 + errx(1, "Output buffers in console in queue?"); 719 720 /* This is why we convert to iovecs: the readv() call uses them, and so 721 * it reads straight into the Guest's buffer. */ 722 + len = readv(dev->fd, iov, in_num); 723 if (len <= 0) { 724 /* This implies that the console is closed, is /dev/null, or 725 + * something went terribly wrong. */ 726 warnx("Failed to get console input, ignoring console."); 727 + /* Put the input terminal back. */ 728 + restore_term(); 729 + /* Remove callback from input vq, so it doesn't restart us. */ 730 + dev->vq->handle_output = NULL; 731 + /* Stop listening to this fd: don't call us again. */ 732 + return false; 733 } 734 735 + /* Tell the Guest about the new input. */ 736 + add_used_and_trigger(fd, dev->vq, head, len); 737 738 /* Three ^C within one second? Exit. 739 * ··· 746 struct timeval now; 747 gettimeofday(&now, NULL); 748 if (now.tv_sec <= abort->start.tv_sec+1) { 749 + unsigned long args[] = { LHREQ_BREAK, 0 }; 750 /* Close the fd so Waker will know it has to 751 * exit. */ 752 close(waker_fd); ··· 761 /* Any other key resets the abort counter. */ 762 abort->count = 0; 763 764 /* Everything went OK! */ 765 return true; 766 } 767 768 + /* Handling output for console is simple: we just get all the output buffers 769 + * and write them to stdout. */ 770 + static void handle_console_output(int fd, struct virtqueue *vq) 771 { 772 + unsigned int head, out, in; 773 + int len; 774 + struct iovec iov[vq->vring.num]; 775 + 776 + /* Keep getting output buffers from the Guest until we run out. 
*/ 777 + while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { 778 + if (in) 779 + errx(1, "Input buffers in output queue?"); 780 + len = writev(STDOUT_FILENO, iov, out); 781 + add_used_and_trigger(fd, vq, head, len); 782 + } 783 } 784 785 + /* Handling output for network is also simple: we get all the output buffers 786 + * and write them (ignoring the first element) to this device's file descriptor 787 + * (stdout). */ 788 + static void handle_net_output(int fd, struct virtqueue *vq) 789 { 790 + unsigned int head, out, in; 791 + int len; 792 + struct iovec iov[vq->vring.num]; 793 + 794 + /* Keep getting output buffers from the Guest until we run out. */ 795 + while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { 796 + if (in) 797 + errx(1, "Input buffers in output queue?"); 798 + /* Check header, but otherwise ignore it (we said we supported 799 + * no features). */ 800 + (void)convert(&iov[0], struct virtio_net_hdr); 801 + len = writev(vq->dev->fd, iov+1, out-1); 802 + add_used_and_trigger(fd, vq, head, len); 803 + } 804 } 805 806 + /* This is where we handle a packet coming in from the tun device to our 807 + * Guest. */ 808 static bool handle_tun_input(int fd, struct device *dev) 809 { 810 + unsigned int head, in_num, out_num; 811 int len; 812 + struct iovec iov[dev->vq->vring.num]; 813 + struct virtio_net_hdr *hdr; 814 815 + /* First we need a network buffer from the Guests's recv virtqueue. */ 816 + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 817 + if (head == dev->vq->vring.num) { 818 /* Now, it's expected that if we try to send a packet too 819 + * early, the Guest won't be ready yet. Wait until the device 820 + * status says it's ready. */ 821 + /* FIXME: Actually want DRIVER_ACTIVE here. */ 822 + if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) 823 warn("network: no dma buffer!"); 824 + /* We'll turn this back on if input buffers are registered. */ 825 + return false; 826 + } else if (out_num) 827 + errx(1, "Output buffers in network recv queue?"); 828 + 829 + /* First element is the header: we set it to 0 (no features). */ 830 + hdr = convert(&iov[0], struct virtio_net_hdr); 831 + hdr->flags = 0; 832 + hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; 833 834 /* Read the packet from the device directly into the Guest's buffer. */ 835 + len = readv(dev->fd, iov+1, in_num-1); 836 if (len <= 0) 837 err(1, "reading network"); 838 839 + /* Tell the Guest about the new packet. */ 840 + add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len); 841 + 842 verbose("tun input packet len %i [%02x %02x] (%s)\n", len, 843 + ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], 844 + head != dev->vq->vring.num ? "sent" : "discarded"); 845 + 846 /* All good. */ 847 return true; 848 } 849 850 + /* This callback ensures we try again, in case we stopped console or net 851 + * delivery because Guest didn't have any buffers. */ 852 + static void enable_fd(int fd, struct virtqueue *vq) 853 { 854 + add_device_fd(vq->dev->fd); 855 + /* Tell waker to listen to it again */ 856 + write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); 857 } 858 859 + /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ 860 + static void handle_output(int fd, unsigned long addr) 861 { 862 struct device *i; 863 + struct virtqueue *vq; 864 865 + /* Check each virtqueue. 
*/ 866 + for (i = devices.dev; i; i = i->next) { 867 + for (vq = i->vq; vq; vq = vq->next) { 868 + if (vq->config.pfn == addr/getpagesize() 869 + && vq->handle_output) { 870 + verbose("Output to %s\n", vq->dev->name); 871 + vq->handle_output(fd, vq); 872 + return; 873 + } 874 } 875 } 876 877 + /* Early console write is done using notify on a nul-terminated string 878 + * in Guest memory. */ 879 + if (addr >= guest_limit) 880 + errx(1, "Bad NOTIFY %#lx", addr); 881 + 882 + write(STDOUT_FILENO, from_guest_phys(addr), 883 + strnlen(from_guest_phys(addr), guest_limit - addr)); 884 } 885 886 /* This is called when the waker wakes us up: check for incoming file 887 * descriptors. */ 888 + static void handle_input(int fd) 889 { 890 /* select() wants a zeroed timeval to mean "don't wait". */ 891 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; 892 893 for (;;) { 894 struct device *i; 895 + fd_set fds = devices.infds; 896 897 /* If nothing is ready, we're done. */ 898 + if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) 899 break; 900 901 /* Otherwise, call the device(s) which have readable 902 * file descriptors and a method of handling them. */ 903 + for (i = devices.dev; i; i = i->next) { 904 if (i->handle_input && FD_ISSET(i->fd, &fds)) { 905 + int dev_fd; 906 + if (i->handle_input(fd, i)) 907 + continue; 908 + 909 /* If handle_input() returns false, it means we 910 + * should no longer service it. Networking and 911 + * console do this when there's no input 912 + * buffers to deliver into. Console also uses 913 + * it when it discovers that stdin is 914 + * closed. */ 915 + FD_CLR(i->fd, &devices.infds); 916 + /* Tell waker to ignore it too, by sending a 917 + * negative fd number (-1, since 0 is a valid 918 + * FD number). */ 919 + dev_fd = -i->fd - 1; 920 + write(waker_fd, &dev_fd, sizeof(dev_fd)); 921 } 922 } 923 } ··· 982 * routines to allocate them. 983 * 984 * This routine allocates a new "struct lguest_device_desc" from descriptor 985 + * table just above the Guest's normal memory. It returns a pointer to that 986 + * descriptor. */ 987 + static struct lguest_device_desc *new_dev_desc(u16 type) 988 { 989 + struct lguest_device_desc *d; 990 991 + /* We only have one page for all the descriptors. */ 992 + if (devices.desc_used + sizeof(*d) > getpagesize()) 993 + errx(1, "Too many devices"); 994 + 995 + /* We don't need to set config_len or status: page is 0 already. */ 996 + d = (void *)devices.descpage + devices.desc_used; 997 + d->type = type; 998 + devices.desc_used += sizeof(*d); 999 + 1000 + return d; 1001 } 1002 1003 + /* Each device descriptor is followed by some configuration information. 1004 + * The first byte is a "status" byte for the Guest to report what's happening. 1005 + * After that are fields: u8 type, u8 len, [... len bytes...]. 1006 + * 1007 + * This routine adds a new field to an existing device's descriptor. It only 1008 + * works for the last device, but that's OK because that's how we use it. */ 1009 + static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c) 1010 + { 1011 + /* This is the last descriptor, right? */ 1012 + assert(devices.descpage + devices.desc_used 1013 + == (u8 *)(dev->desc + 1) + dev->desc->config_len); 1014 + 1015 + /* We only have one page of device descriptions. */ 1016 + if (devices.desc_used + 2 + len > getpagesize()) 1017 + errx(1, "Too many devices"); 1018 + 1019 + /* Copy in the new config header: type then length. 
*/ 1020 + devices.descpage[devices.desc_used++] = type; 1021 + devices.descpage[devices.desc_used++] = len; 1022 + memcpy(devices.descpage + devices.desc_used, c, len); 1023 + devices.desc_used += len; 1024 + 1025 + /* Update the device descriptor length: two byte head then data. */ 1026 + dev->desc->config_len += 2 + len; 1027 + } 1028 + 1029 + /* This routine adds a virtqueue to a device. We specify how many descriptors 1030 + * the virtqueue is to have. */ 1031 + static void add_virtqueue(struct device *dev, unsigned int num_descs, 1032 + void (*handle_output)(int fd, struct virtqueue *me)) 1033 + { 1034 + unsigned int pages; 1035 + struct virtqueue **i, *vq = malloc(sizeof(*vq)); 1036 + void *p; 1037 + 1038 + /* First we need some pages for this virtqueue. */ 1039 + pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize(); 1040 + p = get_pages(pages); 1041 + 1042 + /* Initialize the configuration. */ 1043 + vq->config.num = num_descs; 1044 + vq->config.irq = devices.next_irq++; 1045 + vq->config.pfn = to_guest_phys(p) / getpagesize(); 1046 + 1047 + /* Initialize the vring. */ 1048 + vring_init(&vq->vring, num_descs, p); 1049 + 1050 + /* Add the configuration information to this device's descriptor. */ 1051 + add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE, 1052 + sizeof(vq->config), &vq->config); 1053 + 1054 + /* Add to tail of list, so dev->vq is first vq, dev->vq->next is 1055 + * second. */ 1056 + for (i = &dev->vq; *i; i = &(*i)->next); 1057 + *i = vq; 1058 + 1059 + /* Link virtqueue back to device. */ 1060 + vq->dev = dev; 1061 + 1062 + /* Set up handler. */ 1063 + vq->handle_output = handle_output; 1064 + if (!handle_output) 1065 + vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; 1066 + } 1067 + 1068 + /* This routine does all the creation and setup of a new device, including 1069 + * caling new_dev_desc() to allocate the descriptor and device memory. */ 1070 + static struct device *new_device(const char *name, u16 type, int fd, 1071 + bool (*handle_input)(int, struct device *)) 1072 { 1073 struct device *dev = malloc(sizeof(*dev)); 1074 ··· 1026 * easier, but the user expects the devices to be arranged on the bus 1027 * in command-line order. The first network device on the command line 1028 * is eth0, the first block device /dev/lgba, etc. */ 1029 + *devices.lastdev = dev; 1030 dev->next = NULL; 1031 + devices.lastdev = &dev->next; 1032 1033 /* Now we populate the fields one at a time. */ 1034 dev->fd = fd; 1035 /* If we have an input handler for this file descriptor, then we add it 1036 * to the device_list's fdset and maxfd. */ 1037 if (handle_input) 1038 + add_device_fd(dev->fd); 1039 + dev->desc = new_dev_desc(type); 1040 dev->handle_input = handle_input; 1041 + dev->name = name; 1042 return dev; 1043 } 1044 1045 /* Our first setup routine is the console. It's a fairly simple device, but 1046 * UNIX tty handling makes it uglier than it could be. */ 1047 + static void setup_console(void) 1048 { 1049 struct device *dev; 1050 ··· 1062 atexit(restore_term); 1063 } 1064 1065 + dev = new_device("console", VIRTIO_ID_CONSOLE, 1066 + STDIN_FILENO, handle_console_input); 1067 /* We store the console state in dev->priv, and initialize it. */ 1068 dev->priv = malloc(sizeof(struct console_abort)); 1069 ((struct console_abort *)dev->priv)->count = 0; 1070 1071 + /* The console needs two virtqueues: the input then the output. When 1072 + * they put something the input queue, we make sure we're listening to 1073 + * stdin. 
When they put something in the output queue, we write it to
1074 + * stdout. */
1075 + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1076 + add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
1077
1078 + verbose("device %u: console\n", devices.device_num++);
1079 }
1080 /*:*/
1081 +
1082 + /*M:010 Inter-guest networking is an interesting area. Simplest is to have a
1083 + * --sharenet=<name> option which opens or creates a named pipe. This can be
1084 + * used to send packets to another guest in a 1:1 manner.
1085 + *
1086 + * More sophisticated is to use one of the tools developed for projects like UML
1087 + * to do networking.
1088 + *
1089 + * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
1090 + * completely generic ("here's my vring, attach to your vring") and would work
1091 + * for any traffic. Of course, namespace and permissions issues need to be
1092 + * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
1093 + * multiple inter-guest channels behind one interface, although it would
1094 + * require some manner of hotplugging new virtio channels.
1095 + *
1096 + * Finally, we could implement a virtio network switch in the kernel. :*/
1097
1098 static u32 str2ip(const char *ipaddr)
1099 {
··· 1217
1218 /* This sets up the Host end of the network device with an IP address, brings
1219 * it up so packets will flow, then copies the MAC address into the hwaddr
1220 + * pointer. */
1221 static void configure_device(int fd, const char *devname, u32 ipaddr,
1222 unsigned char hwaddr[6])
1223 {
··· 1243 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
1244 }
1245
1246 + /*L:195 Our network is a Host<->Guest network. This can either use bridging or
1247 + * routing, but the principle is the same: it uses the "tun" device to inject
1248 + * packets into the Host as if they came in from a normal network card. We
1249 + * just shunt packets between the Guest and the tun device. */
1250 + static void setup_tun_net(const char *arg)
1251 {
1252 struct device *dev;
1253 struct ifreq ifr;
1254 int netfd, ipfd;
1255 u32 ip;
1256 const char *br_name = NULL;
1257 + u8 hwaddr[6];
1258
1259 /* We open the /dev/net/tun device and tell it we want a tap device. A
1260 * tap device is like a tun device, only somehow different. To tell
··· 1270 * device: trust us! */
1271 ioctl(netfd, TUNSETNOCSUM, 1);
1272
1273 + /* First we create a new network device. */
1274 + dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
1275
1276 + /* Network devices need a receive and a send queue, just like
1277 + * console. */
1278 + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1279 + add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
1280
1281 /* We need a socket to perform the magic network ioctls to bring up the
1282 * tap interface, connect to the bridge etc. Any socket will do! */
··· 1300 } else /* It is an IP address to set up the device with */
1301 ip = str2ip(arg);
1302
1303 + /* Set up the tun device, and get the mac address for the interface. */
1304 + configure_device(ipfd, ifr.ifr_name, ip, hwaddr);
1305
1306 + /* Tell Guest what MAC address to use. */
1307 + add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr);
1308
1309 + /* We don't need the socket any more; setup is done. 
*/ 1310 close(ipfd); 1311 1312 + verbose("device %u: tun net %u.%u.%u.%u\n", 1313 + devices.device_num++, 1314 + (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip); 1315 if (br_name) 1316 verbose("attached to bridge: %s\n", br_name); 1317 + } 1318 + 1319 + 1320 + /* 1321 + * Block device. 1322 + * 1323 + * Serving a block device is really easy: the Guest asks for a block number and 1324 + * we read or write that position in the file. 1325 + * 1326 + * Unfortunately, this is amazingly slow: the Guest waits until the read is 1327 + * finished before running anything else, even if it could be doing useful 1328 + * work. We could use async I/O, except it's reputed to suck so hard that 1329 + * characters actually go missing from your code when you try to use it. 1330 + * 1331 + * So we farm the I/O out to thread, and communicate with it via a pipe. */ 1332 + 1333 + /* This hangs off device->priv, with the data. */ 1334 + struct vblk_info 1335 + { 1336 + /* The size of the file. */ 1337 + off64_t len; 1338 + 1339 + /* The file descriptor for the file. */ 1340 + int fd; 1341 + 1342 + /* IO thread listens on this file descriptor [0]. */ 1343 + int workpipe[2]; 1344 + 1345 + /* IO thread writes to this file descriptor to mark it done, then 1346 + * Launcher triggers interrupt to Guest. */ 1347 + int done_fd; 1348 + }; 1349 + 1350 + /* This is the core of the I/O thread. It returns true if it did something. */ 1351 + static bool service_io(struct device *dev) 1352 + { 1353 + struct vblk_info *vblk = dev->priv; 1354 + unsigned int head, out_num, in_num, wlen; 1355 + int ret; 1356 + struct virtio_blk_inhdr *in; 1357 + struct virtio_blk_outhdr *out; 1358 + struct iovec iov[dev->vq->vring.num]; 1359 + off64_t off; 1360 + 1361 + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1362 + if (head == dev->vq->vring.num) 1363 + return false; 1364 + 1365 + if (out_num == 0 || in_num == 0) 1366 + errx(1, "Bad virtblk cmd %u out=%u in=%u", 1367 + head, out_num, in_num); 1368 + 1369 + out = convert(&iov[0], struct virtio_blk_outhdr); 1370 + in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); 1371 + off = out->sector * 512; 1372 + 1373 + /* This is how we implement barriers. Pretty poor, no? */ 1374 + if (out->type & VIRTIO_BLK_T_BARRIER) 1375 + fdatasync(vblk->fd); 1376 + 1377 + if (out->type & VIRTIO_BLK_T_SCSI_CMD) { 1378 + fprintf(stderr, "Scsi commands unsupported\n"); 1379 + in->status = VIRTIO_BLK_S_UNSUPP; 1380 + wlen = sizeof(in); 1381 + } else if (out->type & VIRTIO_BLK_T_OUT) { 1382 + /* Write */ 1383 + 1384 + /* Move to the right location in the block file. This can fail 1385 + * if they try to write past end. */ 1386 + if (lseek64(vblk->fd, off, SEEK_SET) != off) 1387 + err(1, "Bad seek to sector %llu", out->sector); 1388 + 1389 + ret = writev(vblk->fd, iov+1, out_num-1); 1390 + verbose("WRITE to sector %llu: %i\n", out->sector, ret); 1391 + 1392 + /* Grr... Now we know how long the descriptor they sent was, we 1393 + * make sure they didn't try to write over the end of the block 1394 + * file (possibly extending it). */ 1395 + if (ret > 0 && off + ret > vblk->len) { 1396 + /* Trim it back to the correct length */ 1397 + ftruncate64(vblk->fd, vblk->len); 1398 + /* Die, bad Guest, die. */ 1399 + errx(1, "Write past end %llu+%u", off, ret); 1400 + } 1401 + wlen = sizeof(in); 1402 + in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 1403 + } else { 1404 + /* Read */ 1405 + 1406 + /* Move to the right location in the block file. This can fail 1407 + * if they try to read past end. 
*/ 1408 + if (lseek64(vblk->fd, off, SEEK_SET) != off) 1409 + err(1, "Bad seek to sector %llu", out->sector); 1410 + 1411 + ret = readv(vblk->fd, iov+1, in_num-1); 1412 + verbose("READ from sector %llu: %i\n", out->sector, ret); 1413 + if (ret >= 0) { 1414 + wlen = sizeof(in) + ret; 1415 + in->status = VIRTIO_BLK_S_OK; 1416 + } else { 1417 + wlen = sizeof(in); 1418 + in->status = VIRTIO_BLK_S_IOERR; 1419 + } 1420 + } 1421 + 1422 + /* We can't trigger an IRQ, because we're not the Launcher. It does 1423 + * that when we tell it we're done. */ 1424 + add_used(dev->vq, head, wlen); 1425 + return true; 1426 + } 1427 + 1428 + /* This is the thread which actually services the I/O. */ 1429 + static int io_thread(void *_dev) 1430 + { 1431 + struct device *dev = _dev; 1432 + struct vblk_info *vblk = dev->priv; 1433 + char c; 1434 + 1435 + /* Close other side of workpipe so we get 0 read when main dies. */ 1436 + close(vblk->workpipe[1]); 1437 + /* Close the other side of the done_fd pipe. */ 1438 + close(dev->fd); 1439 + 1440 + /* When this read fails, it means Launcher died, so we follow. */ 1441 + while (read(vblk->workpipe[0], &c, 1) == 1) { 1442 + /* We acknowledge each request immediately, to reduce latency, 1443 + * rather than waiting until we've done them all. I haven't 1444 + * measured to see if it makes any difference. */ 1445 + while (service_io(dev)) 1446 + write(vblk->done_fd, &c, 1); 1447 + } 1448 + return 0; 1449 + } 1450 + 1451 + /* When the thread says some I/O is done, we interrupt the Guest. */ 1452 + static bool handle_io_finish(int fd, struct device *dev) 1453 + { 1454 + char c; 1455 + 1456 + /* If child died, presumably it printed message. */ 1457 + if (read(dev->fd, &c, 1) != 1) 1458 + exit(1); 1459 + 1460 + /* It did some work, so trigger the irq. */ 1461 + trigger_irq(fd, dev->vq); 1462 + return true; 1463 + } 1464 + 1465 + /* When the Guest submits some I/O, we wake the I/O thread. */ 1466 + static void handle_virtblk_output(int fd, struct virtqueue *vq) 1467 + { 1468 + struct vblk_info *vblk = vq->dev->priv; 1469 + char c = 0; 1470 + 1471 + /* Wake up I/O thread and tell it to go to work! */ 1472 + if (write(vblk->workpipe[1], &c, 1) != 1) 1473 + /* Presumably it indicated why it died. */ 1474 + exit(1); 1475 + } 1476 + 1477 + /* This creates a virtual block device. */ 1478 + static void setup_block_file(const char *filename) 1479 + { 1480 + int p[2]; 1481 + struct device *dev; 1482 + struct vblk_info *vblk; 1483 + void *stack; 1484 + u64 cap; 1485 + unsigned int val; 1486 + 1487 + /* This is the pipe the I/O thread will use to tell us I/O is done. */ 1488 + pipe(p); 1489 + 1490 + /* The device responds to return from I/O thread. */ 1491 + dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); 1492 + 1493 + /* The device has a virtqueue. */ 1494 + add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); 1495 + 1496 + /* Allocate the room for our own bookkeeping */ 1497 + vblk = dev->priv = malloc(sizeof(*vblk)); 1498 + 1499 + /* First we open the file and store the length. */ 1500 + vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); 1501 + vblk->len = lseek64(vblk->fd, 0, SEEK_END); 1502 + 1503 + /* Tell Guest how many sectors this device has. */ 1504 + cap = cpu_to_le64(vblk->len / 512); 1505 + add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap); 1506 + 1507 + /* Tell Guest not to put in too many descriptors at once: two are used 1508 + * for the in and out elements. 
*/ 1509 + val = cpu_to_le32(VIRTQUEUE_NUM - 2); 1510 + add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val); 1511 + 1512 + /* The I/O thread writes to this end of the pipe when done. */ 1513 + vblk->done_fd = p[1]; 1514 + 1515 + /* This is how we tell the I/O thread about more work. */ 1516 + pipe(vblk->workpipe); 1517 + 1518 + /* Create stack for thread and run it */ 1519 + stack = malloc(32768); 1520 + if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) 1521 + err(1, "Creating clone"); 1522 + 1523 + /* We don't need to keep the I/O thread's end of the pipes open. */ 1524 + close(vblk->done_fd); 1525 + close(vblk->workpipe[0]); 1526 + 1527 + verbose("device %u: virtblock %llu sectors\n", 1528 + devices.device_num, cap); 1529 } 1530 /* That's the end of device setup. */ 1531 1532 /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves 1533 * its input and output, and finally, lays it to rest. */ 1534 + static void __attribute__((noreturn)) run_guest(int lguest_fd) 1535 { 1536 for (;;) { 1537 + unsigned long args[] = { LHREQ_BREAK, 0 }; 1538 + unsigned long notify_addr; 1539 int readval; 1540 1541 /* We read from the /dev/lguest device to run the Guest. */ 1542 + readval = read(lguest_fd, &notify_addr, sizeof(notify_addr)); 1543 1544 + /* One unsigned long means the Guest did HCALL_NOTIFY */ 1545 + if (readval == sizeof(notify_addr)) { 1546 + verbose("Notify on address %#lx\n", notify_addr); 1547 + handle_output(lguest_fd, notify_addr); 1548 continue; 1549 /* ENOENT means the Guest died. Reading tells us why. */ 1550 } else if (errno == ENOENT) { ··· 1351 1352 /* Service input, then unset the BREAK which releases 1353 * the Waker. */ 1354 + handle_input(lguest_fd); 1355 if (write(lguest_fd, args, sizeof(args)) < 0) 1356 err(1, "Resetting break"); 1357 } ··· 1365 1366 static struct option opts[] = { 1367 { "verbose", 0, NULL, 'v' }, 1368 { "tunnet", 1, NULL, 't' }, 1369 { "block", 1, NULL, 'b' }, 1370 { "initrd", 1, NULL, 'i' }, ··· 1374 static void usage(void) 1375 { 1376 errx(1, "Usage: lguest [--verbose] " 1377 + "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n" 1378 "|--block=<filename>|--initrd=<filename>]...\n" 1379 "<mem-in-mb> vmlinux [args...]"); 1380 } 1381 1382 + /*L:105 The main routine is where the real work begins: */ 1383 int main(int argc, char *argv[]) 1384 { 1385 + /* Memory, top-level pagetable, code startpoint and size of the 1386 + * (optional) initrd. */ 1387 + unsigned long mem = 0, pgdir, start, initrd_size = 0; 1388 /* A temporary and the /dev/lguest file descriptor. */ 1389 int i, c, lguest_fd; 1390 + /* The boot information for the Guest. */ 1391 + struct boot_params *boot; 1392 /* If they specify an initrd file to load. */ 1393 const char *initrd_name = NULL; 1394 ··· 1412 * device receive input from a file descriptor, we keep an fdset 1413 * (infds) and the maximum fd number (max_infd) with the head of the 1414 * list. We also keep a pointer to the last device, for easy appending 1415 + * to the list. Finally, we keep the next interrupt number to hand out 1416 + * (1: remember that 0 is used by the timer). */ 1417 + FD_ZERO(&devices.infds); 1418 + devices.max_infd = -1; 1419 + devices.lastdev = &devices.dev; 1420 + devices.next_irq = 1; 1421 1422 /* We need to know how much memory so we can set up the device 1423 * descriptor and memory pages for the devices as we parse the command ··· 1424 * of memory now. 
*/ 1425 for (i = 1; i < argc; i++) { 1426 if (argv[i][0] != '-') { 1427 + mem = atoi(argv[i]) * 1024 * 1024; 1428 + /* We start by mapping anonymous pages over all of 1429 + * guest-physical memory range. This fills it with 0, 1430 + * and ensures that the Guest won't be killed when it 1431 + * tries to access it. */ 1432 + guest_base = map_zeroed_pages(mem / getpagesize() 1433 + + DEVICE_PAGES); 1434 + guest_limit = mem; 1435 + guest_max = mem + DEVICE_PAGES*getpagesize(); 1436 + devices.descpage = get_pages(1); 1437 break; 1438 } 1439 } ··· 1437 case 'v': 1438 verbose = true; 1439 break; 1440 case 't': 1441 + setup_tun_net(optarg); 1442 break; 1443 case 'b': 1444 + setup_block_file(optarg); 1445 break; 1446 case 'i': 1447 initrd_name = optarg; ··· 1459 if (optind + 2 > argc) 1460 usage(); 1461 1462 + verbose("Guest base is at %p\n", guest_base); 1463 1464 + /* We always have a console device */ 1465 + setup_console(); 1466 1467 /* Now we load the kernel */ 1468 + start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); 1469 + 1470 + /* Boot information is stashed at physical address 0 */ 1471 + boot = from_guest_phys(0); 1472 1473 /* Map the initrd image if requested (at top of physical memory) */ 1474 if (initrd_name) { 1475 initrd_size = load_initrd(initrd_name, mem); 1476 /* These are the location in the Linux boot header where the 1477 * start and size of the initrd are expected to be found. */ 1478 + boot->hdr.ramdisk_image = mem - initrd_size; 1479 + boot->hdr.ramdisk_size = initrd_size; 1480 /* The bootloader type 0xFF means "unknown"; that's OK. */ 1481 + boot->hdr.type_of_loader = 0xFF; 1482 } 1483 1484 /* Set up the initial linear pagetables, starting below the initrd. */ 1485 + pgdir = setup_pagetables(mem, initrd_size); 1486 1487 /* The Linux boot header contains an "E820" memory map: ours is a 1488 * simple, single region. */ 1489 + boot->e820_entries = 1; 1490 + boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); 1491 /* The boot header contains a command line pointer: we put the command 1492 + * line after the boot header. */ 1493 + boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 1494 + concat((char *)(boot + 1), argv+optind+2); 1495 1496 + /* Boot protocol version: 2.07 supports the fields for lguest. */ 1497 + boot->hdr.version = 0x207; 1498 + 1499 + /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ 1500 + boot->hdr.hardware_subarch = 1; 1501 + 1502 + /* Tell the entry path not to try to reload segment registers. */ 1503 + boot->hdr.loadflags |= KEEP_SEGMENTS; 1504 1505 /* We tell the kernel to initialize the Guest: this returns the open 1506 * /dev/lguest file descriptor. */ 1507 + lguest_fd = tell_kernel(pgdir, start); 1508 1509 /* We fork off a child process, which wakes the Launcher whenever one 1510 * of the input file descriptors needs attention. Otherwise we would 1511 * run the Guest until it tries to output something. */ 1512 + waker_fd = setup_waker(lguest_fd); 1513 1514 /* Finally, run the Guest. This doesn't return. */ 1515 + run_guest(lguest_fd); 1516 } 1517 /*:*/ 1518
+30 -40
Documentation/lguest/lguest.txt
··· 6 Linux developers and users to experiment with virtualization with the 7 minimum of complexity. Nonetheless, it should have sufficient 8 features to make it useful for specific tasks, and, of course, you are 9 - encouraged to fork and enhance it. 10 11 Features: 12 ··· 23 24 Running Lguest: 25 26 - - Lguest runs the same kernel as guest and host. You can configure 27 - them differently, but usually it's easiest not to. 28 29 You will need to configure your kernel with the following options: 30 31 - CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1] 32 - CONFIG_TUN=y/m ("Universal TUN/TAP device driver support") 33 - CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers") 34 - CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)") 35 - CONFIG_LGUEST=y/m ("Linux hypervisor example code") 36 37 - and I recommend: 38 - CONFIG_HZ=100 ("Timer frequency")[2] 39 40 - A tool called "lguest" is available in this directory: type "make" 41 to build it. If you didn't build your kernel in-tree, use "make ··· 62 dd if=/dev/zero of=rootfile bs=1M count=2048 63 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d 64 65 - "modprobe lg" if you built it as a module. 66 67 - Run an lguest as root: 68 69 - Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba 70 71 Explanation: 72 - 64m: the amount of memory to use. 73 74 vmlinux: the kernel image found in the top of your build directory. You 75 can also use a standard bzImage. ··· 80 --tunnet=192.168.19.1: configures a "tap" device for networking with this 81 IP address. 82 83 - --block=rootfile: a file or block device which becomes /dev/lgba 84 inside the guest. 85 86 - root=/dev/lgba: this (and anything else on the command line) are 87 kernel boot parameters. 88 89 - Configuring networking. I usually have the host masquerade, using ··· 113 "--sharenet=<filename>": any two guests using the same file are on 114 the same network. This file is created if it does not exist. 115 116 - Lguest I/O model: 117 118 - Lguest uses a simplified DMA model plus shared memory for I/O. Guests 119 - can communicate with each other if they share underlying memory 120 - (usually by the lguest program mmaping the same file), but they can 121 - use any non-shared memory to communicate with the lguest process. 122 - 123 - Guests can register DMA buffers at any key (must be a valid physical 124 - address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq) 125 - hypercall. "dmabufs" is the physical address of an array of "num" 126 - "struct lguest_dma": each contains a used_len, and an array of 127 - physical addresses and lengths. When a transfer occurs, the 128 - "used_len" field of one of the buffers which has used_len 0 will be 129 - set to the length transferred and the irq will fire. 130 - 131 - Using an irq value of 0 unbinds the dma buffers. 132 - 133 - To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used, 134 - and the bytes used is written to the used_len field. This can be 0 if 135 - noone else has bound a DMA buffer to that key or some other error. 136 - DMA buffers bound by the same guest are ignored. 137 - 138 - Cheers! 139 Rusty Russell rusty@rustcorp.com.au. 140 - 141 - [1] These are on various places on the TODO list, waiting for you to 142 - get annoyed enough at the limitation to fix it. 143 - [2] Lguest is not yet tickless when idle. See [1].
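To make the old calling convention just described concrete before it disappears, here is a guest-side sketch of binding a single receive buffer. The hcall() stub, __pa(), the hypercall numbers and the section bound are stand-ins rather than the old headers' exact definitions; virtio replaces all of this.

typedef unsigned int u32;
typedef unsigned short u16;
#define LGUEST_MAX_DMA_SECTIONS 8	/* illustrative bound */

struct lguest_dma {
	u32 used_len;			  /* Host sets this on transfer */
	unsigned long addr[LGUEST_MAX_DMA_SECTIONS];
	u16 len[LGUEST_MAX_DMA_SECTIONS]; /* a 0 length marks the end */
};

/* Stubs standing in for the real guest-kernel primitives. */
extern void hcall(unsigned long call, unsigned long a1,
		  unsigned long a2, unsigned long a3);
extern unsigned long __pa(const void *v);
enum { LHCALL_BIND_DMA = 1, LHCALL_SEND_DMA = 2 };	/* illustrative */

static struct lguest_dma rx;
static char rx_buf[1536];

/* Bind one receive buffer at "key"; irq 9 fires when it fills. */
static void bind_rx(unsigned long key)
{
	rx.used_len = 0;
	rx.addr[0] = __pa(rx_buf);
	rx.len[0] = sizeof(rx_buf);
	rx.len[1] = 0;
	/* One buffer (num = 1), interrupt 9, as described above. */
	hcall(LHCALL_BIND_DMA, key, __pa(&rx), (1 << 8) | 9);
}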
··· 6 Linux developers and users to experiment with virtualization with the
7 minimum of complexity. Nonetheless, it should have sufficient
8 features to make it useful for specific tasks, and, of course, you are
9 + encouraged to fork and enhance it (see drivers/lguest/README).
10
11 Features:
12
···
23
24 Running Lguest:
25
26 + - The easiest way to run lguest is to use the same kernel as guest and host.
27 + You can configure them differently, but usually it's easiest not to.
28
29 You will need to configure your kernel with the following options:
30
31 + "General setup":
32 + "Prompt for development and/or incomplete code/drivers" = Y
33 + (CONFIG_EXPERIMENTAL=y)
34
35 + "Processor type and features":
36 + "Paravirtualized guest support" = Y
37 + "Lguest guest support" = Y
38 + "High Memory Support" = off/4GB
39 + "Alignment value to which kernel should be aligned" = 0x100000
40 + (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
41 + CONFIG_PHYSICAL_ALIGN=0x100000)
42 +
43 + "Device Drivers":
44 + "Network device support"
45 + "Universal TUN/TAP device driver support" = M/Y
46 + (CONFIG_TUN=m)
47 + "Virtualization"
48 + "Linux hypervisor example code" = M/Y
49 + (CONFIG_LGUEST=m)
50
51 - A tool called "lguest" is available in this directory: type "make"
52 to build it. If you didn't build your kernel in-tree, use "make
··· 51 dd if=/dev/zero of=rootfile bs=1M count=2048
52 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
53
54 + Make sure that you install a getty on /dev/hvc0 if you want to log in on the
55 + console!
56 +
57 - "modprobe lg" if you built it as a module.
58
59 - Run an lguest as root:
60
61 + Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda
62
63 Explanation:
64 + 64: the amount of memory to use, in MB.
65
66 vmlinux: the kernel image found in the top of your build directory. You
67 can also use a standard bzImage.
··· 66 --tunnet=192.168.19.1: configures a "tap" device for networking with this
67 IP address.
68
69 + --block=rootfile: a file or block device which becomes /dev/vda
70 inside the guest.
71
72 + root=/dev/vda: this (and anything else on the command line) are
73 kernel boot parameters.
74
75 - Configuring networking. I usually have the host masquerade, using
··· 99 "--sharenet=<filename>": any two guests using the same file are on
100 the same network. This file is created if it does not exist.
101
102 + There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
103
104 + Good luck!
105 Rusty Russell rusty@rustcorp.com.au.
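For the curious, the "tap" device which --tunnet configures can be created in a few lines. This is a minimal standalone sketch of the TUNSETIFF sequence the lguest launcher performs, with error handling trimmed; the "tap%d" name template asks the kernel to pick the next free tap number.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/if_tun.h>

int main(void)
{
	struct ifreq ifr;
	int fd = open("/dev/net/tun", O_RDWR);

	if (fd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	/* A tap device carries full ethernet frames; NO_PI drops the
	 * extra packet-information header on each read and write. */
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
	strncpy(ifr.ifr_name, "tap%d", IFNAMSIZ);
	if (ioctl(fd, TUNSETIFF, &ifr) != 0)
		return 1;
	printf("created %s\n", ifr.ifr_name);
	return 0;
}

Packets the guest sends appear on reads of this fd, and writes inject packets as if they arrived on the wire: exactly the shunting handle_tun_input() and handle_net_output() do in lguest.c.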
+22 -10
arch/i386/Kconfig
··· 227 If in doubt, say "Y". 228 229 config PARAVIRT 230 - bool "Paravirtualization support (EXPERIMENTAL)" 231 - depends on EXPERIMENTAL 232 depends on !(X86_VISWS || X86_VOYAGER) 233 help 234 - Paravirtualization is a way of running multiple instances of 235 - Linux on the same machine, under a hypervisor. This option 236 - changes the kernel so it can modify itself when it is run 237 - under a hypervisor, improving performance significantly. 238 - However, when run without a hypervisor the kernel is 239 - theoretically slower. If in doubt, say N. 240 241 source "arch/x86/xen/Kconfig" 242 243 config VMI 244 - bool "VMI Paravirt-ops support" 245 - depends on PARAVIRT 246 help 247 VMI provides a paravirtualized interface to the VMware ESX server 248 (it could be used by other hypervisors in theory too, but is not 249 at the moment), by linking the kernel to a GPL-ed ROM module 250 provided by the hypervisor. 251 252 config ACPI_SRAT 253 bool
··· 227 If in doubt, say "Y". 228 229 config PARAVIRT 230 + bool 231 depends on !(X86_VISWS || X86_VOYAGER) 232 help 233 + This changes the kernel so it can modify itself when it is run 234 + under a hypervisor, potentially improving performance significantly 235 + over full virtualization. However, when run without a hypervisor 236 + the kernel is theoretically slower and slightly larger. 237 + 238 + menuconfig PARAVIRT_GUEST 239 + bool "Paravirtualized guest support" 240 + help 241 + Say Y here to get to see options related to running Linux under 242 + various hypervisors. This option alone does not add any kernel code. 243 + 244 + If you say N, all options in this submenu will be skipped and disabled. 245 + 246 + if PARAVIRT_GUEST 247 248 source "arch/x86/xen/Kconfig" 249 250 config VMI 251 + bool "VMI Guest support" 252 + select PARAVIRT 253 + depends on !(X86_VISWS || X86_VOYAGER) 254 help 255 VMI provides a paravirtualized interface to the VMware ESX server 256 (it could be used by other hypervisors in theory too, but is not 257 at the moment), by linking the kernel to a GPL-ed ROM module 258 provided by the hypervisor. 259 + 260 + source "arch/x86/lguest/Kconfig" 261 + 262 + endif 263 264 config ACPI_SRAT 265 bool
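The help text above is terse about what "modify itself" means. The core idea is a table of function pointers for privileged operations, which each backend (native, lguest, Xen, VMI) fills in at boot. The real paravirt_ops is far richer and is additionally patched inline for speed, so the two-entry table below is only a conceptual sketch with invented names.

#include <stdio.h>

struct pv_ops {
	void (*cli)(void);		  /* disable interrupts */
	void (*write_cr3)(unsigned long); /* load a new pagetable */
};

static void native_cli(void)		{ puts("cli instruction"); }
static void native_cr3(unsigned long p)	{ printf("mov %%cr3, %#lx\n", p); }
static void lguest_cli(void)		{ puts("clear lguest_data.irq_enabled"); }
static void lguest_cr3(unsigned long p)	{ printf("hypercall NEW_PGTABLE %#lx\n", p); }

static struct pv_ops pv = { native_cli, native_cr3 };

int main(void)
{
	/* At boot, a guest which detects its hypervisor swaps the table;
	 * every later caller goes through the pointers unchanged. */
	pv = (struct pv_ops){ lguest_cli, lguest_cr3 };
	pv.cli();
	pv.write_cr3(0x1000);
	return 0;
}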
+3
arch/i386/Makefile
··· 99 # Xen paravirtualization support 100 core-$(CONFIG_XEN) += arch/x86/xen/ 101 102 # default subarch .h files 103 mflags-y += -Iinclude/asm-x86/mach-default 104
··· 99 # Xen paravirtualization support 100 core-$(CONFIG_XEN) += arch/x86/xen/ 101 102 + # lguest paravirtualization support 103 + core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ 104 + 105 # default subarch .h files 106 mflags-y += -Iinclude/asm-x86/mach-default 107
+1
arch/x86/kernel/asm-offsets_32.c
··· 136 #ifdef CONFIG_LGUEST_GUEST 137 BLANK(); 138 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 139 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 140 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); 141 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
··· 136 #ifdef CONFIG_LGUEST_GUEST 137 BLANK(); 138 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 139 + OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); 140 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 141 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); 142 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
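The reason this file exists at all: assembler sources (the lguest switcher, i386_head.S) cannot evaluate offsetof(), so kbuild compiles asm-offsets_32.c, fishes "->NAME value" markers out of the generated assembly, and writes them into a header of plain constants. This printf-based stand-in shows the effect without the kbuild plumbing; the struct is a cut-down invention, not the real struct lguest_data.

#include <stdio.h>
#include <stddef.h>

struct lguest_data {		/* illustrative subset only */
	int irq_enabled;
	unsigned long pgdir;
};

#define OFFSET(sym, str, mem) \
	printf("#define " #sym " %zu\n", offsetof(struct str, mem))

int main(void)
{
	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
	return 0;
}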
+14
arch/x86/lguest/Kconfig
···
··· 1 + config LGUEST_GUEST 2 + bool "Lguest guest support" 3 + select PARAVIRT 4 + depends on !X86_PAE 5 + select VIRTIO 6 + select VIRTIO_RING 7 + select VIRTIO_CONSOLE 8 + help 9 + Lguest is a tiny in-kernel hypervisor. Selecting this will 10 + allow your kernel to boot under lguest. This option will increase 11 + your kernel size by about 6k. If in doubt, say N. 12 + 13 + If you say Y here, make sure you say Y (or M) to the virtio block 14 + and net drivers which lguest needs.
+1
arch/x86/lguest/Makefile
···
··· 1 + obj-y := i386_head.o boot.o
+3 -2
arch/x86/xen/Kconfig
··· 3 # 4 5 config XEN 6 - bool "Enable support for Xen hypervisor" 7 - depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES 8 help 9 This is the Linux Xen port. Enabling this will allow the 10 kernel to boot in a paravirtualized environment under the
··· 3 # 4 5 config XEN 6 + bool "Xen guest support" 7 + select PARAVIRT 8 + depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) 9 help 10 This is the Linux Xen port. Enabling this will allow the 11 kernel to boot in a paravirtualized environment under the
+1 -1
drivers/Kconfig
··· 94 95 source "drivers/uio/Kconfig" 96 97 - source "drivers/lguest/Kconfig" 98 endmenu
··· 94 95 source "drivers/uio/Kconfig" 96 97 + source "drivers/virtio/Kconfig" 98 endmenu
+1
drivers/Makefile
··· 91 obj-$(CONFIG_PPC_PS3) += ps3/ 92 obj-$(CONFIG_OF) += of/ 93 obj-$(CONFIG_SSB) += ssb/
··· 91 obj-$(CONFIG_PPC_PS3) += ps3/ 92 obj-$(CONFIG_OF) += of/ 93 obj-$(CONFIG_SSB) += ssb/ 94 + obj-$(CONFIG_VIRTIO) += virtio/
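With the bus code now living in drivers/virtio/, a driver binds by device id, and the module-autoprobing commit in this merge lets MODULE_DEVICE_TABLE load it on demand. Below is a skeleton following the pattern of the new virtio drivers: the "demo" names are invented, probe details are elided, and VIRTIO_ID_NET is assumed to come from this merge's linux/virtio_net.h.

#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};
MODULE_DEVICE_TABLE(virtio, id_table);

static int demo_probe(struct virtio_device *vdev)
{
	dev_info(&vdev->dev, "demo driver bound\n");
	return 0;
}

static void demo_remove(struct virtio_device *vdev)
{
	/* Tear down virtqueues here. */
}

static struct virtio_driver demo_driver = {
	.driver.name	= KBUILD_MODNAME,
	.driver.owner	= THIS_MODULE,
	.id_table	= id_table,
	.probe		= demo_probe,
	.remove		= demo_remove,
};

static int __init init(void) { return register_virtio_driver(&demo_driver); }
static void __exit fini(void) { unregister_virtio_driver(&demo_driver); }
module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");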
+6
drivers/block/Kconfig
··· 425 block device driver. It communicates with a back-end driver 426 in another domain which drives the actual block device. 427 428 endif # BLK_DEV
··· 425 block device driver. It communicates with a back-end driver 426 in another domain which drives the actual block device. 427 428 + config VIRTIO_BLK 429 + tristate "Virtio block driver (EXPERIMENTAL)" 430 + depends on EXPERIMENTAL && VIRTIO 431 + ---help--- 432 + This is the virtual block driver for lguest. Say Y or M. 433 + 434 endif # BLK_DEV
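What this driver and the launcher's service_io() agree on is the shape of a request: a guest-readable virtio_blk_outhdr, then the data buffers, then one guest-writable status byte. A sketch of that layout, with the iovec array standing in for the descriptor chain; the struct layouts follow this merge's linux/virtio_blk.h, while the sector number and sizes are invented.

#include <stdio.h>
#include <stdint.h>
#include <sys/uio.h>

#define VIRTIO_BLK_T_IN	 0	/* read:  device fills the data */
#define VIRTIO_BLK_T_OUT 1	/* write: device stores the data */

struct virtio_blk_outhdr { uint32_t type, ioprio; uint64_t sector; };
struct virtio_blk_inhdr  { uint8_t status; };

int main(void)
{
	static char data[512];
	struct virtio_blk_outhdr out = { VIRTIO_BLK_T_IN, 0, 1234 };
	struct virtio_blk_inhdr in = { 0 };

	/* As a descriptor chain: out_num = 1, in_num = 2 for a read. */
	struct iovec iov[3] = {
		{ &out, sizeof(out) },	/* guest-readable: the command */
		{ data, sizeof(data) },	/* guest-writable: sector data */
		{ &in, sizeof(in) },	/* guest-writable: status reply */
	};

	printf("read of sector %llu: %zu command bytes, %zu data, %zu status\n",
	       (unsigned long long)out.sector,
	       iov[0].iov_len, iov[1].iov_len, iov[2].iov_len);
	return 0;
}

The launcher-side checks fall out of this shape: there must be at least one output descriptor (the command) and one input descriptor (the status byte), and for a read the data buffers are writable, so they count toward in_num.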
+1 -1
drivers/block/Makefile
··· 25 obj-$(CONFIG_BLK_DEV_UMEM) += umem.o 26 obj-$(CONFIG_BLK_DEV_NBD) += nbd.o 27 obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o 28 29 obj-$(CONFIG_VIODASD) += viodasd.o 30 obj-$(CONFIG_BLK_DEV_SX8) += sx8.o 31 obj-$(CONFIG_BLK_DEV_UB) += ub.o 32 33 obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 34 - obj-$(CONFIG_LGUEST_BLOCK) += lguest_blk.o
··· 25 obj-$(CONFIG_BLK_DEV_UMEM) += umem.o 26 obj-$(CONFIG_BLK_DEV_NBD) += nbd.o 27 obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o 28 + obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o 29 30 obj-$(CONFIG_VIODASD) += viodasd.o 31 obj-$(CONFIG_BLK_DEV_SX8) += sx8.o 32 obj-$(CONFIG_BLK_DEV_UB) += ub.o 33 34 obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
-421
drivers/block/lguest_blk.c
··· 1 - /*D:400 2 - * The Guest block driver 3 - * 4 - * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc. 5 - * The mechanism is simple: we place the information about the request in the 6 - * device page, then use SEND_DMA (containing the data for a write, or an empty 7 - * "ping" DMA for a read). 8 - :*/ 9 - /* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation 10 - * 11 - * This program is free software; you can redistribute it and/or modify 12 - * it under the terms of the GNU General Public License as published by 13 - * the Free Software Foundation; either version 2 of the License, or 14 - * (at your option) any later version. 15 - * 16 - * This program is distributed in the hope that it will be useful, 17 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 - * GNU General Public License for more details. 20 - * 21 - * You should have received a copy of the GNU General Public License 22 - * along with this program; if not, write to the Free Software 23 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 24 - */ 25 - //#define DEBUG 26 - #include <linux/init.h> 27 - #include <linux/types.h> 28 - #include <linux/blkdev.h> 29 - #include <linux/interrupt.h> 30 - #include <linux/lguest_bus.h> 31 - 32 - static char next_block_index = 'a'; 33 - 34 - /*D:420 Here is the structure which holds all the information we need about 35 - * each Guest block device. 36 - * 37 - * I'm sure at this stage, you're wondering "hey, where was the adventure I was 38 - * promised?" and thinking "Rusty sucks, I shall say nasty things about him on 39 - * my blog". I think Real adventures have boring bits, too, and you're in the 40 - * middle of one. But it gets better. Just not quite yet. */ 41 - struct blockdev 42 - { 43 - /* The block queue infrastructure wants a spinlock: it is held while it 44 - * calls our block request function. We grab it in our interrupt 45 - * handler so the responses don't mess with new requests. */ 46 - spinlock_t lock; 47 - 48 - /* The disk structure registered with kernel. */ 49 - struct gendisk *disk; 50 - 51 - /* The major device number for this disk, and the interrupt. We only 52 - * really keep them here for completeness; we'd need them if we 53 - * supported device unplugging. */ 54 - int major; 55 - int irq; 56 - 57 - /* The physical address of this device's memory page */ 58 - unsigned long phys_addr; 59 - /* The mapped memory page for convenient acces. */ 60 - struct lguest_block_page *lb_page; 61 - 62 - /* We only have a single request outstanding at a time: this is it. */ 63 - struct lguest_dma dma; 64 - struct request *req; 65 - }; 66 - 67 - /*D:495 We originally used end_request() throughout the driver, but it turns 68 - * out that end_request() is deprecated, and doesn't actually end the request 69 - * (which seems like a good reason to deprecate it!). It simply ends the first 70 - * bio. So if we had 3 bios in a "struct request" we would do all 3, 71 - * end_request(), do 2, end_request(), do 1 and end_request(): twice as much 72 - * work as we needed to do. 73 - * 74 - * This reinforced to me that I do not understand the block layer. 75 - * 76 - * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a 77 - * request. This improved disk speed by 130%. 
*/ 78 - static void end_entire_request(struct request *req, int uptodate) 79 - { 80 - if (end_that_request_first(req, uptodate, req->hard_nr_sectors)) 81 - BUG(); 82 - add_disk_randomness(req->rq_disk); 83 - blkdev_dequeue_request(req); 84 - end_that_request_last(req, uptodate); 85 - } 86 - 87 - /* I'm told there are only two stories in the world worth telling: love and 88 - * hate. So there used to be a love scene here like this: 89 - * 90 - * Launcher: We could make beautiful I/O together, you and I. 91 - * Guest: My, that's a big disk! 92 - * 93 - * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */ 94 - 95 - /*D:490 This is the interrupt handler, called when a block read or write has 96 - * been completed for us. */ 97 - static irqreturn_t lgb_irq(int irq, void *_bd) 98 - { 99 - /* We handed our "struct blockdev" as the argument to request_irq(), so 100 - * it is passed through to us here. This tells us which device we're 101 - * dealing with in case we have more than one. */ 102 - struct blockdev *bd = _bd; 103 - unsigned long flags; 104 - 105 - /* We weren't doing anything? Strange, but could happen if we shared 106 - * interrupts (we don't!). */ 107 - if (!bd->req) { 108 - pr_debug("No work!\n"); 109 - return IRQ_NONE; 110 - } 111 - 112 - /* Not done yet? That's equally strange. */ 113 - if (!bd->lb_page->result) { 114 - pr_debug("No result!\n"); 115 - return IRQ_NONE; 116 - } 117 - 118 - /* We have to grab the lock before ending the request. */ 119 - spin_lock_irqsave(&bd->lock, flags); 120 - /* "result" is 1 for success, 2 for failure: end_entire_request() wants 121 - * to know whether this succeeded or not. */ 122 - end_entire_request(bd->req, bd->lb_page->result == 1); 123 - /* Clear out request, it's done. */ 124 - bd->req = NULL; 125 - /* Reset incoming DMA for next time. */ 126 - bd->dma.used_len = 0; 127 - /* Ready for more reads or writes */ 128 - blk_start_queue(bd->disk->queue); 129 - spin_unlock_irqrestore(&bd->lock, flags); 130 - 131 - /* The interrupt was for us, we dealt with it. */ 132 - return IRQ_HANDLED; 133 - } 134 - 135 - /*D:480 The block layer's "struct request" contains a number of "struct bio"s, 136 - * each of which contains "struct bio_vec"s, each of which contains a page, an 137 - * offset and a length. 138 - * 139 - * Fortunately there are iterators to help us walk through the "struct 140 - * request". Even more fortunately, there were plenty of places to steal the 141 - * code from. We pack the "struct request" into our "struct lguest_dma" and 142 - * return the total length. */ 143 - static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma) 144 - { 145 - unsigned int i = 0, len = 0; 146 - struct req_iterator iter; 147 - struct bio_vec *bvec; 148 - 149 - rq_for_each_segment(bvec, req, iter) { 150 - /* We told the block layer not to give us too many. */ 151 - BUG_ON(i == LGUEST_MAX_DMA_SECTIONS); 152 - /* If we had a zero-length segment, it would look like 153 - * the end of the data referred to by the "struct 154 - * lguest_dma", so make sure that doesn't happen. 
*/ 155 - BUG_ON(!bvec->bv_len); 156 - /* Convert page & offset to a physical address */ 157 - dma->addr[i] = page_to_phys(bvec->bv_page) 158 - + bvec->bv_offset; 159 - dma->len[i] = bvec->bv_len; 160 - len += bvec->bv_len; 161 - i++; 162 - } 163 - /* If the array isn't full, we mark the end with a 0 length */ 164 - if (i < LGUEST_MAX_DMA_SECTIONS) 165 - dma->len[i] = 0; 166 - return len; 167 - } 168 - 169 - /* This creates an empty DMA, useful for prodding the Host without sending data 170 - * (ie. when we want to do a read) */ 171 - static void empty_dma(struct lguest_dma *dma) 172 - { 173 - dma->len[0] = 0; 174 - } 175 - 176 - /*D:470 Setting up a request is fairly easy: */ 177 - static void setup_req(struct blockdev *bd, 178 - int type, struct request *req, struct lguest_dma *dma) 179 - { 180 - /* The type is 1 (write) or 0 (read). */ 181 - bd->lb_page->type = type; 182 - /* The sector on disk where the read or write starts. */ 183 - bd->lb_page->sector = req->sector; 184 - /* The result is initialized to 0 (unfinished). */ 185 - bd->lb_page->result = 0; 186 - /* The current request (so we can end it in the interrupt handler). */ 187 - bd->req = req; 188 - /* The number of bytes: returned as a side-effect of req_to_dma(), 189 - * which packs the block layer's "struct request" into our "struct 190 - * lguest_dma" */ 191 - bd->lb_page->bytes = req_to_dma(req, dma); 192 - } 193 - 194 - /*D:450 Write is pretty straightforward: we pack the request into a "struct 195 - * lguest_dma", then use SEND_DMA to send the request. */ 196 - static void do_write(struct blockdev *bd, struct request *req) 197 - { 198 - struct lguest_dma send; 199 - 200 - pr_debug("lgb: WRITE sector %li\n", (long)req->sector); 201 - setup_req(bd, 1, req, &send); 202 - 203 - lguest_send_dma(bd->phys_addr, &send); 204 - } 205 - 206 - /* Read is similar to write, except we pack the request into our receive 207 - * "struct lguest_dma" and send through an empty DMA just to tell the Host that 208 - * there's a request pending. */ 209 - static void do_read(struct blockdev *bd, struct request *req) 210 - { 211 - struct lguest_dma ping; 212 - 213 - pr_debug("lgb: READ sector %li\n", (long)req->sector); 214 - setup_req(bd, 0, req, &bd->dma); 215 - 216 - empty_dma(&ping); 217 - lguest_send_dma(bd->phys_addr, &ping); 218 - } 219 - 220 - /*D:440 This is where requests come in: we get handed the request queue and are 221 - * expected to pull a "struct request" off it until we've finished them or 222 - * we're waiting for a reply: */ 223 - static void do_lgb_request(struct request_queue *q) 224 - { 225 - struct blockdev *bd; 226 - struct request *req; 227 - 228 - again: 229 - /* This sometimes returns NULL even on the very first time around. I 230 - * wonder if it's something to do with letting elves handle the request 231 - * queue... */ 232 - req = elv_next_request(q); 233 - if (!req) 234 - return; 235 - 236 - /* We attached the struct blockdev to the disk: get it back */ 237 - bd = req->rq_disk->private_data; 238 - /* Sometimes we get repeated requests after blk_stop_queue(), but we 239 - * can only handle one at a time. */ 240 - if (bd->req) 241 - return; 242 - 243 - /* We only do reads and writes: no tricky business!
*/ 244 - if (!blk_fs_request(req)) { 245 - pr_debug("Got non-command 0x%08x\n", req->cmd_type); 246 - req->errors++; 247 - end_entire_request(req, 0); 248 - goto again; 249 - } 250 - 251 - if (rq_data_dir(req) == WRITE) 252 - do_write(bd, req); 253 - else 254 - do_read(bd, req); 255 - 256 - /* We've put out the request, so stop any more coming in until we get 257 - * an interrupt, which takes us to lgb_irq() to re-enable the queue. */ 258 - blk_stop_queue(q); 259 - } 260 - 261 - /*D:430 This is the "struct block_device_operations" we attach to the disk at 262 - * the end of lguestblk_probe(). It doesn't seem to want much. */ 263 - static struct block_device_operations lguestblk_fops = { 264 - .owner = THIS_MODULE, 265 - }; 266 - 267 - /*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure 268 - * quite why. I do know that the IDE code sent two or three of the maintainers 269 - * insane, perhaps this is the fringe of the same disease? 270 - * 271 - * As in the console code, the probe function gets handed the generic 272 - * lguest_device from lguest_bus.c: */ 273 - static int lguestblk_probe(struct lguest_device *lgdev) 274 - { 275 - struct blockdev *bd; 276 - int err; 277 - int irqflags = IRQF_SHARED; 278 - 279 - /* First we allocate our own "struct blockdev" and initialize the easy 280 - * fields. */ 281 - bd = kmalloc(sizeof(*bd), GFP_KERNEL); 282 - if (!bd) 283 - return -ENOMEM; 284 - 285 - spin_lock_init(&bd->lock); 286 - bd->irq = lgdev_irq(lgdev); 287 - bd->req = NULL; 288 - bd->dma.used_len = 0; 289 - bd->dma.len[0] = 0; 290 - /* The descriptor in the lguest_devices array provided by the Host 291 - * gives the Guest the physical page number of the device's page. */ 292 - bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT); 293 - 294 - /* We use lguest_map() to get a pointer to the device page */ 295 - bd->lb_page = lguest_map(bd->phys_addr, 1); 296 - if (!bd->lb_page) { 297 - err = -ENOMEM; 298 - goto out_free_bd; 299 - } 300 - 301 - /* We need a major device number: 0 means "assign one dynamically". */ 302 - bd->major = register_blkdev(0, "lguestblk"); 303 - if (bd->major < 0) { 304 - err = bd->major; 305 - goto out_unmap; 306 - } 307 - 308 - /* This allocates a "struct gendisk" where we pack all the information 309 - * about the disk which the rest of Linux sees. The argument is the 310 - * number of minor devices desired: we need one minor for the main 311 - * disk, and one for each partition. Of course, we can't possibly know 312 - * how many partitions are on the disk (add_disk does that). 313 - */ 314 - bd->disk = alloc_disk(16); 315 - if (!bd->disk) { 316 - err = -ENOMEM; 317 - goto out_unregister_blkdev; 318 - } 319 - 320 - /* Every disk needs a queue for requests to come in: we set up the 321 - * queue with a callback function (the core of our driver) and the lock 322 - * to use. */ 323 - bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock); 324 - if (!bd->disk->queue) { 325 - err = -ENOMEM; 326 - goto out_put_disk; 327 - } 328 - 329 - /* We can only handle a certain number of pointers in our SEND_DMA 330 - * call, so we set that with blk_queue_max_hw_segments(). This is not 331 - * to be confused with blk_queue_max_phys_segments() of course! I 332 - * know, who could possibly confuse the two? 333 - * 334 - * Well, it's simple to tell them apart: this one seems to work and the 335 - * other one didn't. 
*/ 336 - blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS); 337 - 338 - /* Due to technical limitations of our Host (and simple coding) we 339 - * can't have a single buffer which crosses a page boundary. Tell it 340 - * here. This means that our maximum request size is 16 341 - * (LGUEST_MAX_DMA_SECTIONS) pages. */ 342 - blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1); 343 - 344 - /* We name our disk: this becomes the device name when udev does its 345 - * magic thing and creates the device node, such as /dev/lgba. 346 - * next_block_index is a global which starts at 'a'. Unfortunately 347 - * this simple increment logic means that the 27th disk will be called 348 - * "/dev/lgb{". In that case, I recommend having at least 29 disks, so 349 - * your /dev directory will be balanced. */ 350 - sprintf(bd->disk->disk_name, "lgb%c", next_block_index++); 351 - 352 - /* We look to the device descriptor again to see if this device's 353 - * interrupts are expected to be random. If they are, we tell the irq 354 - * subsystem. At the moment this bit is always set. */ 355 - if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS) 356 - irqflags |= IRQF_SAMPLE_RANDOM; 357 - 358 - /* Now we have the name and irqflags, we can request the interrupt; we 359 - * give it the "struct blockdev" we have set up to pass to lgb_irq() 360 - * when there is an interrupt. */ 361 - err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd); 362 - if (err) 363 - goto out_cleanup_queue; 364 - 365 - /* We bind our one-entry DMA pool to the key for this block device so 366 - * the Host can reply to our requests. The key is equal to the 367 - * physical address of the device's page, which is conveniently 368 - * unique. */ 369 - err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq); 370 - if (err) 371 - goto out_free_irq; 372 - 373 - /* We finish our disk initialization and add the disk to the system. */ 374 - bd->disk->major = bd->major; 375 - bd->disk->first_minor = 0; 376 - bd->disk->private_data = bd; 377 - bd->disk->fops = &lguestblk_fops; 378 - /* This is initialized to the disk size by the Launcher. */ 379 - set_capacity(bd->disk, bd->lb_page->num_sectors); 380 - add_disk(bd->disk); 381 - 382 - printk(KERN_INFO "%s: device %i at major %d\n", 383 - bd->disk->disk_name, lgdev->index, bd->major); 384 - 385 - /* We don't need to keep the "struct blockdev" around, but if we ever 386 - * implemented device removal, we'd need this. */ 387 - lgdev->private = bd; 388 - return 0; 389 - 390 - out_free_irq: 391 - free_irq(bd->irq, bd); 392 - out_cleanup_queue: 393 - blk_cleanup_queue(bd->disk->queue); 394 - out_put_disk: 395 - put_disk(bd->disk); 396 - out_unregister_blkdev: 397 - unregister_blkdev(bd->major, "lguestblk"); 398 - out_unmap: 399 - lguest_unmap(bd->lb_page); 400 - out_free_bd: 401 - kfree(bd); 402 - return err; 403 - } 404 - 405 - /*D:410 The boilerplate code for registering the lguest block driver is just 406 - * like the console: */ 407 - static struct lguest_driver lguestblk_drv = { 408 - .name = "lguestblk", 409 - .owner = THIS_MODULE, 410 - .device_type = LGUEST_DEVICE_T_BLOCK, 411 - .probe = lguestblk_probe, 412 - }; 413 - 414 - static __init int lguestblk_init(void) 415 - { 416 - return register_lguest_driver(&lguestblk_drv); 417 - } 418 - module_init(lguestblk_init); 419 - 420 - MODULE_DESCRIPTION("Lguest block driver"); 421 - MODULE_LICENSE("GPL");
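The device page the removed driver read and wrote is the whole Guest-visible protocol here. Its shape can be inferred from the field accesses in setup_req(), lgb_irq() and lguestblk_probe() above; the sketch below is reconstructed from that usage rather than copied from the old lguest headers, so treat the exact types as illustrative.

	/* A sketch of the old per-device block page, reconstructed from the
	 * field accesses above -- not the authoritative lguest header. */
	struct lguest_block_page {
		int type;		/* 0 = read, 1 = write (set in setup_req()) */
		u32 sector;		/* first sector of the request */
		u32 bytes;		/* total length, computed by req_to_dma() */
		int result;		/* 0 = pending, 1 = success, 2 = failure */
		u32 num_sectors;	/* disk size, filled in by the Launcher */
	};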
···
+308
drivers/block/virtio_blk.c
···
··· 1 + //#define DEBUG 2 + #include <linux/spinlock.h> 3 + #include <linux/blkdev.h> 4 + #include <linux/hdreg.h> 5 + #include <linux/virtio.h> 6 + #include <linux/virtio_blk.h> 8 + 9 + static unsigned char virtblk_index = 'a'; 10 + struct virtio_blk 11 + { 12 + spinlock_t lock; 13 + 14 + struct virtio_device *vdev; 15 + struct virtqueue *vq; 16 + 17 + /* The disk structure for the kernel. */ 18 + struct gendisk *disk; 19 + 20 + /* Request tracking. */ 21 + struct list_head reqs; 22 + 23 + mempool_t *pool; 24 + 25 + /* Scatterlist: can be too big for stack. */ 26 + struct scatterlist sg[3+MAX_PHYS_SEGMENTS]; 27 + }; 28 + 29 + struct virtblk_req 30 + { 31 + struct list_head list; 32 + struct request *req; 33 + struct virtio_blk_outhdr out_hdr; 34 + struct virtio_blk_inhdr in_hdr; 35 + }; 36 + 37 + static bool blk_done(struct virtqueue *vq) 38 + { 39 + struct virtio_blk *vblk = vq->vdev->priv; 40 + struct virtblk_req *vbr; 41 + unsigned int len; 42 + unsigned long flags; 43 + 44 + spin_lock_irqsave(&vblk->lock, flags); 45 + while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) { 46 + int uptodate; 47 + switch (vbr->in_hdr.status) { 48 + case VIRTIO_BLK_S_OK: 49 + uptodate = 1; 50 + break; 51 + case VIRTIO_BLK_S_UNSUPP: 52 + uptodate = -ENOTTY; 53 + break; 54 + default: 55 + uptodate = 0; 56 + break; 57 + } 58 + 59 + end_dequeued_request(vbr->req, uptodate); 60 + list_del(&vbr->list); 61 + mempool_free(vbr, vblk->pool); 62 + } 63 + /* In case queue is stopped waiting for more buffers. */ 64 + blk_start_queue(vblk->disk->queue); 65 + spin_unlock_irqrestore(&vblk->lock, flags); 66 + return true; 67 + } 68 + 69 + static bool do_req(struct request_queue *q, struct virtio_blk *vblk, 70 + struct request *req) 71 + { 72 + unsigned long num, out, in; 73 + struct virtblk_req *vbr; 74 + 75 + vbr = mempool_alloc(vblk->pool, GFP_ATOMIC); 76 + if (!vbr) 77 + /* When another request finishes we'll try again. */ 78 + return false; 79 + 80 + vbr->req = req; 81 + if (blk_fs_request(vbr->req)) { 82 + vbr->out_hdr.type = 0; 83 + vbr->out_hdr.sector = vbr->req->sector; 84 + vbr->out_hdr.ioprio = vbr->req->ioprio; 85 + } else if (blk_pc_request(vbr->req)) { 86 + vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD; 87 + vbr->out_hdr.sector = 0; 88 + vbr->out_hdr.ioprio = vbr->req->ioprio; 89 + } else { 90 + /* We don't put anything else in the queue. */ 91 + BUG(); 92 + } 93 + 94 + if (blk_barrier_rq(vbr->req)) 95 + vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER; 96 + 97 + /* We have to zero this, otherwise blk_rq_map_sg gets upset.
*/ 98 + memset(vblk->sg, 0, sizeof(vblk->sg)); 99 + sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr)); 100 + num = blk_rq_map_sg(q, vbr->req, vblk->sg+1); 101 + sg_set_buf(&vblk->sg[num+1], &vbr->in_hdr, sizeof(vbr->in_hdr)); 102 + 103 + if (rq_data_dir(vbr->req) == WRITE) { 104 + vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; 105 + out = 1 + num; 106 + in = 1; 107 + } else { 108 + vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 109 + out = 1; 110 + in = 1 + num; 111 + } 112 + 113 + if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) { 114 + mempool_free(vbr, vblk->pool); 115 + return false; 116 + } 117 + 118 + list_add_tail(&vbr->list, &vblk->reqs); 119 + return true; 120 + } 121 + 122 + static void do_virtblk_request(struct request_queue *q) 123 + { 124 + struct virtio_blk *vblk = NULL; 125 + struct request *req; 126 + unsigned int issued = 0; 127 + 128 + while ((req = elv_next_request(q)) != NULL) { 129 + vblk = req->rq_disk->private_data; 130 + BUG_ON(req->nr_phys_segments > ARRAY_SIZE(vblk->sg)); 131 + 132 + /* If this request fails, stop queue and wait for something to 133 + finish to restart it. */ 134 + if (!do_req(q, vblk, req)) { 135 + blk_stop_queue(q); 136 + break; 137 + } 138 + blkdev_dequeue_request(req); 139 + issued++; 140 + } 141 + 142 + if (issued) 143 + vblk->vq->vq_ops->kick(vblk->vq); 144 + } 145 + 146 + static int virtblk_ioctl(struct inode *inode, struct file *filp, 147 + unsigned cmd, unsigned long data) 148 + { 149 + return scsi_cmd_ioctl(filp, inode->i_bdev->bd_disk->queue, 150 + inode->i_bdev->bd_disk, cmd, 151 + (void __user *)data); 152 + } 153 + 154 + static struct block_device_operations virtblk_fops = { 155 + .ioctl = virtblk_ioctl, 156 + .owner = THIS_MODULE, 157 + }; 158 + 159 + static int virtblk_probe(struct virtio_device *vdev) 160 + { 161 + struct virtio_blk *vblk; 162 + int err, major; 163 + void *token; 164 + unsigned int len; 165 + u64 cap; 166 + u32 v; 167 + 168 + vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); 169 + if (!vblk) { 170 + err = -ENOMEM; 171 + goto out; 172 + } 173 + 174 + INIT_LIST_HEAD(&vblk->reqs); 175 + spin_lock_init(&vblk->lock); 176 + vblk->vdev = vdev; 177 + 178 + /* We expect one virtqueue, for output. */ 179 + vblk->vq = vdev->config->find_vq(vdev, blk_done); 180 + if (IS_ERR(vblk->vq)) { 181 + err = PTR_ERR(vblk->vq); 182 + goto out_free_vblk; 183 + } 184 + 185 + vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req)); 186 + if (!vblk->pool) { 187 + err = -ENOMEM; 188 + goto out_free_vq; 189 + } 190 + 191 + major = register_blkdev(0, "virtblk"); 192 + if (major < 0) { 193 + err = major; 194 + goto out_mempool; 195 + } 196 + 197 + /* FIXME: How many partitions? How long is a piece of string? 
*/ 198 + vblk->disk = alloc_disk(1 << 4); 199 + if (!vblk->disk) { 200 + err = -ENOMEM; 201 + goto out_unregister_blkdev; 202 + } 203 + 204 + vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); 205 + if (!vblk->disk->queue) { 206 + err = -ENOMEM; 207 + goto out_put_disk; 208 + } 209 + 210 + sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++); 211 + vblk->disk->major = major; 212 + vblk->disk->first_minor = 0; 213 + vblk->disk->private_data = vblk; 214 + vblk->disk->fops = &virtblk_fops; 215 + 216 + /* If barriers are supported, tell block layer that queue is ordered */ 217 + token = vdev->config->find(vdev, VIRTIO_CONFIG_BLK_F, &len); 218 + if (virtio_use_bit(vdev, token, len, VIRTIO_BLK_F_BARRIER)) 219 + blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); 220 + 221 + err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &cap); 222 + if (err) { 223 + dev_err(&vdev->dev, "Bad/missing capacity in config\n"); 224 + goto out_put_disk; 225 + } 226 + 227 + /* If capacity is too big, truncate with warning. */ 228 + if ((sector_t)cap != cap) { 229 + dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n", 230 + (unsigned long long)cap); 231 + cap = (sector_t)-1; 232 + } 233 + set_capacity(vblk->disk, cap); 234 + 235 + err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SIZE_MAX, &v); 236 + if (!err) 237 + blk_queue_max_segment_size(vblk->disk->queue, v); 238 + else if (err != -ENOENT) { 239 + dev_err(&vdev->dev, "Bad SIZE_MAX in config\n"); 240 + goto out_put_disk; 241 + } 242 + 243 + err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SEG_MAX, &v); 244 + if (!err) 245 + blk_queue_max_hw_segments(vblk->disk->queue, v); 246 + else if (err != -ENOENT) { 247 + dev_err(&vdev->dev, "Bad SEG_MAX in config\n"); 248 + goto out_put_disk; 249 + } 250 + 251 + add_disk(vblk->disk); 252 + return 0; 253 + 254 + out_put_disk: 255 + put_disk(vblk->disk); 256 + out_unregister_blkdev: 257 + unregister_blkdev(major, "virtblk"); 258 + out_mempool: 259 + mempool_destroy(vblk->pool); 260 + out_free_vq: 261 + vdev->config->del_vq(vblk->vq); 262 + out_free_vblk: 263 + kfree(vblk); 264 + out: 265 + return err; 266 + } 267 + 268 + static void virtblk_remove(struct virtio_device *vdev) 269 + { 270 + struct virtio_blk *vblk = vdev->priv; 271 + int major = vblk->disk->major; 272 + 273 + BUG_ON(!list_empty(&vblk->reqs)); 274 + blk_cleanup_queue(vblk->disk->queue); 275 + put_disk(vblk->disk); 276 + unregister_blkdev(major, "virtblk"); 277 + mempool_destroy(vblk->pool); 278 + kfree(vblk); 279 + } 280 + 281 + static struct virtio_device_id id_table[] = { 282 + { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, 283 + { 0 }, 284 + }; 285 + 286 + static struct virtio_driver virtio_blk = { 287 + .driver.name = KBUILD_MODNAME, 288 + .driver.owner = THIS_MODULE, 289 + .id_table = id_table, 290 + .probe = virtblk_probe, 291 + .remove = __devexit_p(virtblk_remove), 292 + }; 293 + 294 + static int __init init(void) 295 + { 296 + return register_virtio_driver(&virtio_blk); 297 + } 298 + 299 + static void __exit fini(void) 300 + { 301 + unregister_virtio_driver(&virtio_blk); 302 + } 303 + module_init(init); 304 + module_exit(fini); 305 + 306 + MODULE_DEVICE_TABLE(virtio, id_table); 307 + MODULE_DESCRIPTION("Virtio block driver"); 308 + MODULE_LICENSE("GPL");
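The descriptor layout do_req() builds is the heart of the protocol: the out_hdr always travels first, the data segments sit in the middle, and the single status byte of the in_hdr comes last, so only the out/in counts change between reads and writes. A minimal sketch of just that branch, restating the logic in do_req() above rather than adding anything new:

	/* For a request whose data blk_rq_map_sg() mapped into 'num' segments,
	 * sg[0] is always the out_hdr and sg[num+1] is always the in_hdr. */
	static void virtblk_sg_counts(int is_write, unsigned long num,
				      unsigned long *out, unsigned long *in)
	{
		if (is_write) {
			*out = 1 + num;	/* header + data travel to the Host */
			*in = 1;	/* only the status byte comes back */
		} else {
			*out = 1;	/* just the header goes out */
			*in = 1 + num;	/* data + status come back */
		}
	}

So a two-segment read becomes add_buf(vq, sg, 1, 3, vbr): one descriptor the Host reads, three it writes.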
+4
drivers/char/Kconfig
··· 613 help 614 Xen virtual console device driver 615 616 config HVCS 617 tristate "IBM Hypervisor Virtual Console Server support" 618 depends on PPC_PSERIES
··· 613 help 614 Xen virtual console device driver 615 616 + config VIRTIO_CONSOLE 617 + bool 618 + select HVC_DRIVER 619 + 620 config HVCS 621 tristate "IBM Hypervisor Virtual Console Server support" 622 depends on PPC_PSERIES
+1 -1
drivers/char/Makefile
··· 42 obj-$(CONFIG_N_HDLC) += n_hdlc.o 43 obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o 44 obj-$(CONFIG_SX) += sx.o generic_serial.o 45 - obj-$(CONFIG_LGUEST_GUEST) += hvc_lguest.o 46 obj-$(CONFIG_RIO) += rio/ generic_serial.o 47 obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o 48 obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o ··· 49 obj-$(CONFIG_HVC_BEAT) += hvc_beat.o 50 obj-$(CONFIG_HVC_DRIVER) += hvc_console.o 51 obj-$(CONFIG_HVC_XEN) += hvc_xen.o 52 obj-$(CONFIG_RAW_DRIVER) += raw.o 53 obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o 54 obj-$(CONFIG_MSPEC) += mspec.o
··· 42 obj-$(CONFIG_N_HDLC) += n_hdlc.o 43 obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o 44 obj-$(CONFIG_SX) += sx.o generic_serial.o 45 obj-$(CONFIG_RIO) += rio/ generic_serial.o 46 obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o 47 obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o ··· 50 obj-$(CONFIG_HVC_BEAT) += hvc_beat.o 51 obj-$(CONFIG_HVC_DRIVER) += hvc_console.o 52 obj-$(CONFIG_HVC_XEN) += hvc_xen.o 53 + obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o 54 obj-$(CONFIG_RAW_DRIVER) += raw.o 55 obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o 56 obj-$(CONFIG_MSPEC) += mspec.o
-177
drivers/char/hvc_lguest.c
··· 1 - /*D:300 2 - * The Guest console driver 3 - * 4 - * This is a trivial console driver: we use lguest's DMA mechanism to send 5 - * bytes out, and register a DMA buffer to receive bytes in. It is assumed to 6 - * be present and available from the very beginning of boot. 7 - * 8 - * Writing console drivers is one of the few remaining Dark Arts in Linux. 9 - * Fortunately for us, the path of virtual consoles has been well-trodden by 10 - * the PowerPC folks, who wrote "hvc_console.c" to generically support any 11 - * virtual console. We use that infrastructure which only requires us to write 12 - * the basic put_chars and get_chars functions and call the right register 13 - * functions. 14 - :*/ 15 - 16 - /*M:002 The console can be flooded: while the Guest is processing input the 17 - * Host can send more. Buffering in the Host could alleviate this, but it is a 18 - * difficult problem in general. :*/ 19 - /* Copyright (C) 2006 Rusty Russell, IBM Corporation 20 - * 21 - * This program is free software; you can redistribute it and/or modify 22 - * it under the terms of the GNU General Public License as published by 23 - * the Free Software Foundation; either version 2 of the License, or 24 - * (at your option) any later version. 25 - * 26 - * This program is distributed in the hope that it will be useful, 27 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 28 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 - * GNU General Public License for more details. 30 - * 31 - * You should have received a copy of the GNU General Public License 32 - * along with this program; if not, write to the Free Software 33 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 34 - */ 35 - #include <linux/err.h> 36 - #include <linux/init.h> 37 - #include <linux/lguest_bus.h> 38 - #include <asm/paravirt.h> 39 - #include "hvc_console.h" 40 - 41 - /*D:340 This is our single console input buffer, with associated "struct 42 - * lguest_dma" referring to it. Note the 0-terminated length array, and the 43 - * use of physical address for the buffer itself. */ 44 - static char inbuf[256]; 45 - static struct lguest_dma cons_input = { .used_len = 0, 46 - .addr[0] = __pa(inbuf), 47 - .len[0] = sizeof(inbuf), 48 - .len[1] = 0 }; 49 - 50 - /*D:310 The put_chars() callback is pretty straightforward. 51 - * 52 - * First we put the pointer and length in a "struct lguest_dma": we only have 53 - * one pointer, so we set the second length to 0. Then we use SEND_DMA to send 54 - * the data to (Host) buffers attached to the console key. Usually a device's 55 - * key is a physical address within the device's memory, but because the 56 - * console device doesn't have any associated physical memory, we use the 57 - * LGUEST_CONSOLE_DMA_KEY constant (aka 0). */ 58 - static int put_chars(u32 vtermno, const char *buf, int count) 59 - { 60 - struct lguest_dma dma; 61 - 62 - /* FIXME: DMA buffers in a "struct lguest_dma" are not allowed 63 - * to go over page boundaries. This never seems to happen, 64 - * but if it did we'd need to fix this code. */ 65 - dma.len[0] = count; 66 - dma.len[1] = 0; 67 - dma.addr[0] = __pa(buf); 68 - 69 - lguest_send_dma(LGUEST_CONSOLE_DMA_KEY, &dma); 70 - /* We're expected to return the amount of data we wrote: all of it. */ 71 - return count; 72 - } 73 - 74 - /*D:350 get_chars() is the callback from the hvc_console infrastructure when 75 - * an interrupt is received. 76 - * 77 - * Firstly we see if our buffer has been filled: if not, we return. 
The rest 78 - * of the code deals with the fact that the hvc_console() infrastructure only 79 - * asks us for 16 bytes at a time. We keep a "cons_offset" variable for 80 - * partially-read buffers. */ 81 - static int get_chars(u32 vtermno, char *buf, int count) 82 - { 83 - static int cons_offset; 84 - 85 - /* Nothing left to see here... */ 86 - if (!cons_input.used_len) 87 - return 0; 88 - 89 - /* You want more than we have to give? Well, try wanting less! */ 90 - if (cons_input.used_len - cons_offset < count) 91 - count = cons_input.used_len - cons_offset; 92 - 93 - /* Copy across to their buffer and increment offset. */ 94 - memcpy(buf, inbuf + cons_offset, count); 95 - cons_offset += count; 96 - 97 - /* Finished? Zero offset, and reset cons_input so Host will use it 98 - * again. */ 99 - if (cons_offset == cons_input.used_len) { 100 - cons_offset = 0; 101 - cons_input.used_len = 0; 102 - } 103 - return count; 104 - } 105 - /*:*/ 106 - 107 - static struct hv_ops lguest_cons = { 108 - .get_chars = get_chars, 109 - .put_chars = put_chars, 110 - }; 111 - 112 - /*D:320 Console drivers are initialized very early so boot messages can go 113 - * out. At this stage, the console is output-only. Our driver checks we're a 114 - * Guest, and if so hands hvc_instantiate() the console number (0), priority 115 - * (0), and the struct hv_ops containing the put_chars() function. */ 116 - static int __init cons_init(void) 117 - { 118 - if (strcmp(pv_info.name, "lguest") != 0) 119 - return 0; 120 - 121 - return hvc_instantiate(0, 0, &lguest_cons); 122 - } 123 - console_initcall(cons_init); 124 - 125 - /*D:370 To set up and manage our virtual console, we call hvc_alloc() and 126 - * stash the result in the private pointer of the "struct lguest_device". 127 - * Since we never remove the console device we never need this pointer again, 128 - * but using ->private is considered good form, and you never know who's going 129 - * to copy your driver. 130 - * 131 - * Once the console is set up, we bind our input buffer ready for input. */ 132 - static int lguestcons_probe(struct lguest_device *lgdev) 133 - { 134 - int err; 135 - 136 - /* The first argument of hvc_alloc() is the virtual console number, so 137 - * we use zero. The second argument is the interrupt number. 138 - * 139 - * The third argument is a "struct hv_ops" containing the put_chars() 140 - * and get_chars() pointers. The final argument is the output buffer 141 - * size: we use 256 and expect the Host to have room for us to send 142 - * that much. */ 143 - lgdev->private = hvc_alloc(0, lgdev_irq(lgdev), &lguest_cons, 256); 144 - if (IS_ERR(lgdev->private)) 145 - return PTR_ERR(lgdev->private); 146 - 147 - /* We bind a single DMA buffer at key LGUEST_CONSOLE_DMA_KEY. 148 - * "cons_input" is that statically-initialized global DMA buffer we saw 149 - * above, and we also give the interrupt we want. */ 150 - err = lguest_bind_dma(LGUEST_CONSOLE_DMA_KEY, &cons_input, 1, 151 - lgdev_irq(lgdev)); 152 - if (err) 153 - printk("lguest console: failed to bind buffer.\n"); 154 - return err; 155 - } 156 - /* Note the use of lgdev_irq() for the interrupt number. We tell hvc_alloc() 157 - * to expect input when this interrupt is triggered, and then tell 158 - * lguest_bind_dma() that is the interrupt to send us when input comes in. */ 159 - 160 - /*D:360 From now on the console driver follows standard Guest driver form: 161 - * register_lguest_driver() registers the device type and probe function, and 162 - * the probe function sets up the device. 
163 - * 164 - * The standard "struct lguest_driver": */ 165 - static struct lguest_driver lguestcons_drv = { 166 - .name = "lguestcons", 167 - .owner = THIS_MODULE, 168 - .device_type = LGUEST_DEVICE_T_CONSOLE, 169 - .probe = lguestcons_probe, 170 - }; 171 - 172 - /* The standard init function */ 173 - static int __init hvc_lguest_init(void) 174 - { 175 - return register_lguest_driver(&lguestcons_drv); 176 - } 177 - module_init(hvc_lguest_init);
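Both removed drivers traffic in "struct lguest_dma": the console binds one statically above, and the block driver packed requests into one. Its essential shape is visible from that usage: parallel addr[]/len[] arrays terminated early by a zero length, plus a used_len the Host writes back. A sketch inferred from those accesses (the authoritative definition lived in the old lguest headers):

	/* Inferred from usage in the removed drivers; a sketch, not the header. */
	struct lguest_dma {
		u32 used_len;			/* bytes the Host filled in; 0 = free */
		unsigned long addr[LGUEST_MAX_DMA_SECTIONS];	/* physical addresses */
		u16 len[LGUEST_MAX_DMA_SECTIONS];		/* a 0 ends the array */
	};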
···
+225
drivers/char/virtio_console.c
···
··· 1 + /*D:300 2 + * The Guest console driver 3 + * 4 + * Writing console drivers is one of the few remaining Dark Arts in Linux. 5 + * Fortunately for us, the path of virtual consoles has been well-trodden by 6 + * the PowerPC folks, who wrote "hvc_console.c" to generically support any 7 + * virtual console. We use that infrastructure which only requires us to write 8 + * the basic put_chars and get_chars functions and call the right register 9 + * functions. 10 + :*/ 11 + 12 + /*M:002 The console can be flooded: while the Guest is processing input the 13 + * Host can send more. Buffering in the Host could alleviate this, but it is a 14 + * difficult problem in general. :*/ 15 + /* Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation 16 + * 17 + * This program is free software; you can redistribute it and/or modify 18 + * it under the terms of the GNU General Public License as published by 19 + * the Free Software Foundation; either version 2 of the License, or 20 + * (at your option) any later version. 21 + * 22 + * This program is distributed in the hope that it will be useful, 23 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 + * GNU General Public License for more details. 26 + * 27 + * You should have received a copy of the GNU General Public License 28 + * along with this program; if not, write to the Free Software 29 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 30 + */ 31 + #include <linux/err.h> 32 + #include <linux/init.h> 33 + #include <linux/virtio.h> 34 + #include <linux/virtio_console.h> 35 + #include "hvc_console.h" 36 + 37 + /*D:340 These represent our input and output console queues, and the virtio 38 + * operations for them. */ 39 + static struct virtqueue *in_vq, *out_vq; 40 + static struct virtio_device *vdev; 41 + 42 + /* This is our input buffer, and how much data is left in it. */ 43 + static unsigned int in_len; 44 + static char *in, *inbuf; 45 + 46 + /* The operations for our console. */ 47 + static struct hv_ops virtio_cons; 48 + 49 + /*D:310 The put_chars() callback is pretty straightforward. 50 + * 51 + * We turn the characters into a scatter-gather list, add it to the output 52 + * queue and then kick the Host. Then we sit here waiting for it to finish: 53 + * inefficient in theory, but in practice implementations will do it 54 + * immediately (lguest's Launcher does). */ 55 + static int put_chars(u32 vtermno, const char *buf, int count) 56 + { 57 + struct scatterlist sg[1]; 58 + unsigned int len; 59 + 60 + /* This is a convenient routine to initialize a single-elem sg list */ 61 + sg_init_one(sg, buf, count); 62 + 63 + /* add_buf wants a token to identify this buffer: we hand it any 64 + * non-NULL pointer, since there's only ever one buffer. */ 65 + if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) == 0) { 66 + /* Tell Host to go! */ 67 + out_vq->vq_ops->kick(out_vq); 68 + /* Chill out until it's done with the buffer. */ 69 + while (!out_vq->vq_ops->get_buf(out_vq, &len)) 70 + cpu_relax(); 71 + } 72 + 73 + /* We're expected to return the amount of data we wrote: all of it. */ 74 + return count; 75 + } 76 + 77 + /* Create a scatter-gather list representing our input buffer and put it in the 78 + * queue. */ 79 + static void add_inbuf(void) 80 + { 81 + struct scatterlist sg[1]; 82 + sg_init_one(sg, inbuf, PAGE_SIZE); 83 + 84 + /* We should always be able to add one buffer to an empty queue. 
*/ 85 + if (in_vq->vq_ops->add_buf(in_vq, sg, 0, 1, inbuf) != 0) 86 + BUG(); 87 + in_vq->vq_ops->kick(in_vq); 88 + } 89 + 90 + /*D:350 get_chars() is the callback from the hvc_console infrastructure when 91 + * an interrupt is received. 92 + * 93 + * Most of the code deals with the fact that the hvc_console() infrastructure 94 + * only asks us for 16 bytes at a time. We keep the "in" pointer and "in_len" 95 + * count for partially-read buffers. */ 96 + static int get_chars(u32 vtermno, char *buf, int count) 97 + { 98 + /* If we don't have an input queue yet, we can't get input. */ 99 + BUG_ON(!in_vq); 100 + 101 + /* No buffer? Try to get one. */ 102 + if (!in_len) { 103 + in = in_vq->vq_ops->get_buf(in_vq, &in_len); 104 + if (!in) 105 + return 0; 106 + } 107 + 108 + /* You want more than we have to give? Well, try wanting less! */ 109 + if (in_len < count) 110 + count = in_len; 111 + 112 + /* Copy across to their buffer and increment offset. */ 113 + memcpy(buf, in, count); 114 + in += count; 115 + in_len -= count; 116 + 117 + /* Finished? Re-register buffer so Host will use it again. */ 118 + if (in_len == 0) 119 + add_inbuf(); 120 + 121 + return count; 122 + } 123 + /*:*/ 124 + 125 + /*D:320 Console drivers are initialized very early so boot messages can go out, 126 + * so we do things slightly differently from the generic virtio initialization 127 + * of the net and block drivers. 128 + * 129 + * At this stage, the console is output-only. It's too early to set up a 130 + * virtqueue, so we let the drivers do some boutique early-output thing. */ 131 + int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)) 132 + { 133 + virtio_cons.put_chars = put_chars; 134 + return hvc_instantiate(0, 0, &virtio_cons); 135 + } 136 + 137 + /*D:370 Once we're further in boot, we get probed like any other virtio device. 138 + * At this stage we set up the output virtqueue. 139 + * 140 + * To set up and manage our virtual console, we call hvc_alloc(). Since we 141 + * never remove the console device we never need this pointer again. 142 + * 143 + * Finally we put our input buffer in the input queue, ready to receive. */ 144 + static int virtcons_probe(struct virtio_device *dev) 145 + { 146 + int err; 147 + struct hvc_struct *hvc; 148 + 149 + vdev = dev; 150 + 151 + /* This is the scratch page we use to receive console input */ 152 + inbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 153 + if (!inbuf) { 154 + err = -ENOMEM; 155 + goto fail; 156 + } 157 + 158 + /* Find the input queue. */ 159 + /* FIXME: This is why we want to wean off hvc: we do nothing 160 + * when input comes in. */ 161 + in_vq = vdev->config->find_vq(vdev, NULL); 162 + if (IS_ERR(in_vq)) { 163 + err = PTR_ERR(in_vq); 164 + goto free; 165 + } 166 + 167 + out_vq = vdev->config->find_vq(vdev, NULL); 168 + if (IS_ERR(out_vq)) { 169 + err = PTR_ERR(out_vq); 170 + goto free_in_vq; 171 + } 172 + 173 + /* Start using the new console output. */ 174 + virtio_cons.get_chars = get_chars; 175 + virtio_cons.put_chars = put_chars; 176 + 177 + /* The first argument of hvc_alloc() is the virtual console number, so 178 + * we use zero. The second argument is the interrupt number; we 179 + * currently leave this as zero: it would be better not to use the 180 + * hvc mechanism and fix this (FIXME!). 181 + * 182 + * The third argument is a "struct hv_ops" containing the put_chars() 183 + * and get_chars() pointers. The final argument is the output buffer 184 + * size: we can do any size, so we put PAGE_SIZE here.
*/ 185 + hvc = hvc_alloc(0, 0, &virtio_cons, PAGE_SIZE); 186 + if (IS_ERR(hvc)) { 187 + err = PTR_ERR(hvc); 188 + goto free_out_vq; 189 + } 190 + 191 + /* Register the input buffer the first time. */ 192 + add_inbuf(); 193 + return 0; 194 + 195 + free_out_vq: 196 + vdev->config->del_vq(out_vq); 197 + free_in_vq: 198 + vdev->config->del_vq(in_vq); 199 + free: 200 + kfree(inbuf); 201 + fail: 202 + return err; 203 + } 204 + 205 + static struct virtio_device_id id_table[] = { 206 + { VIRTIO_ID_CONSOLE, VIRTIO_DEV_ANY_ID }, 207 + { 0 }, 208 + }; 209 + 210 + static struct virtio_driver virtio_console = { 211 + .driver.name = KBUILD_MODNAME, 212 + .driver.owner = THIS_MODULE, 213 + .id_table = id_table, 214 + .probe = virtcons_probe, 215 + }; 216 + 217 + static int __init init(void) 218 + { 219 + return register_virtio_driver(&virtio_console); 220 + } 221 + module_init(init); 222 + 223 + MODULE_DEVICE_TABLE(virtio, id_table); 224 + MODULE_DESCRIPTION("Virtio console driver"); 225 + MODULE_LICENSE("GPL");
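virtio_cons_early_init() above exists so a port can get boot messages out before any virtqueue does: the port supplies its own low-level output routine, and the driver swaps in the virtqueue-backed put_chars() later, at probe time. A minimal sketch of such a caller, where early_put_chars() stands in for whatever boot-time output channel the port actually has (lguest, for instance, can use a hypercall):

	/* Sketch of a port-side hookup; early_put_chars() is the port's own
	 * boot-time output routine, not something this driver provides. */
	static int early_put_chars(u32 vtermno, const char *buf, int count)
	{
		/* ...emit 'count' bytes from 'buf' via a boot-time channel... */
		return count;	/* hvc expects the number of bytes written */
	}

	static void __init setup_early_virtio_console(void)
	{
		virtio_cons_early_init(early_put_chars);
	}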
+4
drivers/kvm/Kconfig
··· 47 Provides support for KVM on AMD processors equipped with the AMD-V 48 (SVM) extensions. 49 50 endif # VIRTUALIZATION
··· 47 Provides support for KVM on AMD processors equipped with the AMD-V 48 (SVM) extensions. 49 50 + # OK, it's a little counter-intuitive to do this, but it puts it neatly under 51 + # the virtualization menu. 52 + source drivers/lguest/Kconfig 53 + 54 endif # VIRTUALIZATION
+1 -12
drivers/lguest/Kconfig
··· 1 config LGUEST 2 tristate "Linux hypervisor example code" 3 - depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE && FUTEX 4 - select LGUEST_GUEST 5 select HVC_DRIVER 6 ---help--- 7 This is a very simple module which allows you to run ··· 17 The guest needs code built-in, even if the host has lguest 18 support as a module. The drivers are tiny, so we build them 19 in too. 20 - 21 - config LGUEST_NET 22 - tristate 23 - default y 24 - depends on LGUEST_GUEST && NET 25 - 26 - config LGUEST_BLOCK 27 - tristate 28 - default y 29 - depends on LGUEST_GUEST && BLOCK
··· 1 config LGUEST 2 tristate "Linux hypervisor example code" 3 + depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX && !(X86_VISWS || X86_VOYAGER) 4 select HVC_DRIVER 5 ---help--- 6 This is a very simple module which allows you to run ··· 18 The guest needs code built-in, even if the host has lguest 19 support as a module. The drivers are tiny, so we build them 20 in too.
+6 -4
drivers/lguest/Makefile
··· 1 - # Guest requires the paravirt_ops replacement and the bus driver. 2 - obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o 3 4 # Host requires the other files, which can be a module. 5 obj-$(CONFIG_LGUEST) += lg.o 6 - lg-y := core.o hypercalls.o page_tables.o interrupts_and_traps.o \ 7 - segments.o io.o lguest_user.o switcher.o 8 9 Preparation Preparation!: PREFIX=P 10 Guest: PREFIX=G
··· 1 + # Guest requires the device configuration and probing code. 2 + obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o 3 4 # Host requires the other files, which can be a module. 5 obj-$(CONFIG_LGUEST) += lg.o 6 + lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \ 7 + segments.o lguest_user.o 8 + 9 + lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o 10 11 Preparation Preparation!: PREFIX=P 12 Guest: PREFIX=G
+50 -516
drivers/lguest/core.c
··· 11 #include <linux/vmalloc.h> 12 #include <linux/cpu.h> 13 #include <linux/freezer.h> 14 #include <asm/paravirt.h> 15 - #include <asm/desc.h> 16 #include <asm/pgtable.h> 17 #include <asm/uaccess.h> 18 #include <asm/poll.h> 19 - #include <asm/highmem.h> 20 #include <asm/asm-offsets.h> 21 - #include <asm/i387.h> 22 #include "lg.h" 23 24 - /* Found in switcher.S */ 25 - extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; 26 - extern unsigned long default_idt_entries[]; 27 - 28 - /* Every guest maps the core switcher code. */ 29 - #define SHARED_SWITCHER_PAGES \ 30 - DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) 31 - /* Pages for switcher itself, then two pages per cpu */ 32 - #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) 33 - 34 - /* We map at -4M for ease of mapping into the guest (one PTE page). */ 35 - #define SWITCHER_ADDR 0xFFC00000 36 37 static struct vm_struct *switcher_vma; 38 static struct page **switcher_page; 39 40 - static int cpu_had_pge; 41 - static struct { 42 - unsigned long offset; 43 - unsigned short segment; 44 - } lguest_entry; 45 - 46 /* This One Big lock protects all inter-guest data structures. */ 47 DEFINE_MUTEX(lguest_lock); 48 - static DEFINE_PER_CPU(struct lguest *, last_guest); 49 - 50 - /* FIXME: Make dynamic. */ 51 - #define MAX_LGUEST_GUESTS 16 52 - struct lguest lguests[MAX_LGUEST_GUESTS]; 53 - 54 - /* Offset from where switcher.S was compiled to where we've copied it */ 55 - static unsigned long switcher_offset(void) 56 - { 57 - return SWITCHER_ADDR - (unsigned long)start_switcher_text; 58 - } 59 - 60 - /* This cpu's struct lguest_pages. */ 61 - static struct lguest_pages *lguest_pages(unsigned int cpu) 62 - { 63 - return &(((struct lguest_pages *) 64 - (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); 65 - } 66 67 /*H:010 We need to set up the Switcher at a high virtual address. Remember the 68 * Switcher is a few hundred bytes of assembler code which actually changes the ··· 35 * Host since it will be running as the switchover occurs. 36 * 37 * Trying to map memory at a particular address is an unusual thing to do, so 38 - * it's not a simple one-liner. We also set up the per-cpu parts of the 39 - * Switcher here. 40 - */ 41 static __init int map_switcher(void) 42 { 43 int i, err; ··· 92 goto free_vma; 93 } 94 95 - /* Now the switcher is mapped at the right address, we can't fail! 96 - * Copy in the compiled-in Switcher code (from switcher.S). */ 97 memcpy(switcher_vma->addr, start_switcher_text, 98 end_switcher_text - start_switcher_text); 99 - 100 - /* Most of the switcher.S doesn't care that it's been moved; on Intel, 101 - * jumps are relative, and it doesn't access any references to external 102 - * code or data. 103 - * 104 - * The only exception is the interrupt handlers in switcher.S: their 105 - * addresses are placed in a table (default_idt_entries), so we need to 106 - * update the table with the new addresses. switcher_offset() is a 107 - * convenience function which returns the distance between the builtin 108 - * switcher code and the high-mapped copy we just made. */ 109 - for (i = 0; i < IDT_ENTRIES; i++) 110 - default_idt_entries[i] += switcher_offset(); 111 - 112 - /* 113 - * Set up the Switcher's per-cpu areas. 114 - * 115 - * Each CPU gets two pages of its own within the high-mapped region 116 - * (aka. "struct lguest_pages"). 
Much of this can be initialized now, 117 - * but some depends on what Guest we are running (which is set up in 118 - * copy_in_guest_info()). 119 - */ 120 - for_each_possible_cpu(i) { 121 - /* lguest_pages() returns this CPU's two pages. */ 122 - struct lguest_pages *pages = lguest_pages(i); 123 - /* This is a convenience pointer to make the code fit one 124 - * statement to a line. */ 125 - struct lguest_ro_state *state = &pages->state; 126 - 127 - /* The Global Descriptor Table: the Host has a different one 128 - * for each CPU. We keep a descriptor for the GDT which says 129 - * where it is and how big it is (the size is actually the last 130 - * byte, not the size, hence the "-1"). */ 131 - state->host_gdt_desc.size = GDT_SIZE-1; 132 - state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); 133 - 134 - /* All CPUs on the Host use the same Interrupt Descriptor 135 - * Table, so we just use store_idt(), which gets this CPU's IDT 136 - * descriptor. */ 137 - store_idt(&state->host_idt_desc); 138 - 139 - /* The descriptors for the Guest's GDT and IDT can be filled 140 - * out now, too. We copy the GDT & IDT into ->guest_gdt and 141 - * ->guest_idt before actually running the Guest. */ 142 - state->guest_idt_desc.size = sizeof(state->guest_idt)-1; 143 - state->guest_idt_desc.address = (long)&state->guest_idt; 144 - state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; 145 - state->guest_gdt_desc.address = (long)&state->guest_gdt; 146 - 147 - /* We know where we want the stack to be when the Guest enters 148 - * the switcher: in pages->regs. The stack grows upwards, so 149 - * we start it at the end of that structure. */ 150 - state->guest_tss.esp0 = (long)(&pages->regs + 1); 151 - /* And this is the GDT entry to use for the stack: we keep a 152 - * couple of special LGUEST entries. */ 153 - state->guest_tss.ss0 = LGUEST_DS; 154 - 155 - /* x86 can have a finegrained bitmap which indicates what I/O 156 - * ports the process can use. We set it to the end of our 157 - * structure, meaning "none". */ 158 - state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); 159 - 160 - /* Some GDT entries are the same across all Guests, so we can 161 - * set them up now. */ 162 - setup_default_gdt_entries(state); 163 - /* Most IDT entries are the same for all Guests, too.*/ 164 - setup_default_idt_entries(state, default_idt_entries); 165 - 166 - /* The Host needs to be able to use the LGUEST segments on this 167 - * CPU, too, so put them in the Host GDT. */ 168 - get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 169 - get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 170 - } 171 - 172 - /* In the Switcher, we want the %cs segment register to use the 173 - * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so 174 - * it will be undisturbed when we switch. To change %cs and jump we 175 - * need this structure to feed to Intel's "lcall" instruction. */ 176 - lguest_entry.offset = (long)switch_to_guest + switcher_offset(); 177 - lguest_entry.segment = LGUEST_CS; 178 179 printk(KERN_INFO "lguest: mapped switcher at %p\n", 180 switcher_vma->addr); ··· 128 __free_pages(switcher_page[i], 0); 129 } 130 131 - /*H:130 Our Guest is usually so well behaved; it never tries to do things it 132 - * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't 133 - * quite complete, because it doesn't contain replacements for the Intel I/O 134 - * instructions. 
As a result, the Guest sometimes fumbles across one during 135 - * the boot process as it probes for various things which are usually attached 136 - * to a PC. 137 - * 138 - * When the Guest uses one of these instructions, we get trap #13 (General 139 - * Protection Fault) and come here. We see if it's one of those troublesome 140 - * instructions and skip over it. We return true if we did. */ 141 - static int emulate_insn(struct lguest *lg) 142 - { 143 - u8 insn; 144 - unsigned int insnlen = 0, in = 0, shift = 0; 145 - /* The eip contains the *virtual* address of the Guest's instruction: 146 - * guest_pa just subtracts the Guest's page_offset. */ 147 - unsigned long physaddr = guest_pa(lg, lg->regs->eip); 148 - 149 - /* The guest_pa() function only works for Guest kernel addresses, but 150 - * that's all we're trying to do anyway. */ 151 - if (lg->regs->eip < lg->page_offset) 152 - return 0; 153 - 154 - /* Decoding x86 instructions is icky. */ 155 - lgread(lg, &insn, physaddr, 1); 156 - 157 - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits 158 - of the eax register. */ 159 - if (insn == 0x66) { 160 - shift = 16; 161 - /* The instruction is 1 byte so far, read the next byte. */ 162 - insnlen = 1; 163 - lgread(lg, &insn, physaddr + insnlen, 1); 164 - } 165 - 166 - /* We can ignore the lower bit for the moment and decode the 4 opcodes 167 - * we need to emulate. */ 168 - switch (insn & 0xFE) { 169 - case 0xE4: /* in <next byte>,%al */ 170 - insnlen += 2; 171 - in = 1; 172 - break; 173 - case 0xEC: /* in (%dx),%al */ 174 - insnlen += 1; 175 - in = 1; 176 - break; 177 - case 0xE6: /* out %al,<next byte> */ 178 - insnlen += 2; 179 - break; 180 - case 0xEE: /* out %al,(%dx) */ 181 - insnlen += 1; 182 - break; 183 - default: 184 - /* OK, we don't know what this is, can't emulate. */ 185 - return 0; 186 - } 187 - 188 - /* If it was an "IN" instruction, they expect the result to be read 189 - * into %eax, so we change %eax. We always return all-ones, which 190 - * traditionally means "there's nothing there". */ 191 - if (in) { 192 - /* Lower bit tells us whether it's a 16 or 32 bit access */ 193 - if (insn & 0x1) 194 - lg->regs->eax = 0xFFFFFFFF; 195 - else 196 - lg->regs->eax |= (0xFFFF << shift); 197 - } 198 - /* Finally, we've "done" the instruction, so move past it. */ 199 - lg->regs->eip += insnlen; 200 - /* Success! */ 201 - return 1; 202 - } 203 - /*:*/ 204 - 205 /*L:305 206 * Dealing With Guest Memory. 207 * 208 * When the Guest gives us (what it thinks is) a physical address, we can use 209 - * the normal copy_from_user() & copy_to_user() on that address: remember, 210 - * Guest physical == Launcher virtual. 211 * 212 * But we can't trust the Guest: it might be trying to access the Launcher 213 * code. We have to check that the range is below the pfn_limit the Launcher ··· 145 return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); 146 } 147 148 - /* This is a convenient routine to get a 32-bit value from the Guest (a very 149 - * common operation). Here we can see how useful the kill_lguest() routine we 150 - * met in the Launcher can be: we return a random value (0) instead of needing 151 - * to return an error. */ 152 - u32 lgread_u32(struct lguest *lg, unsigned long addr) 153 - { 154 - u32 val = 0; 155 - 156 - /* Don't let them access lguest binary.
*/ 157 - if (!lguest_address_ok(lg, addr, sizeof(val)) 158 - || get_user(val, (u32 __user *)addr) != 0) 159 - kill_guest(lg, "bad read address %#lx", addr); 160 - return val; 161 - } 162 - 163 - /* Same thing for writing a value. */ 164 - void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val) 165 - { 166 - if (!lguest_address_ok(lg, addr, sizeof(val)) 167 - || put_user(val, (u32 __user *)addr) != 0) 168 - kill_guest(lg, "bad write address %#lx", addr); 169 - } 170 - 171 - /* This routine is more generic, and copies a range of Guest bytes into a 172 - * buffer. If the copy_from_user() fails, we fill the buffer with zeroes, so 173 - * the caller doesn't end up using uninitialized kernel memory. */ 174 - void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) 175 { 176 if (!lguest_address_ok(lg, addr, bytes) 177 - || copy_from_user(b, (void __user *)addr, bytes) != 0) { 178 /* copy_from_user should do this, but as we rely on it... */ 179 memset(b, 0, bytes); 180 kill_guest(lg, "bad read address %#lx len %u", addr, bytes); 181 } 182 } 183 184 - /* Similarly, our generic routine to copy into a range of Guest bytes. */ 185 - void lgwrite(struct lguest *lg, unsigned long addr, const void *b, 186 - unsigned bytes) 187 { 188 if (!lguest_address_ok(lg, addr, bytes) 189 - || copy_to_user((void __user *)addr, b, bytes) != 0) 190 kill_guest(lg, "bad write address %#lx len %u", addr, bytes); 191 - } 192 - /* (end of memory access helper routines) :*/ 193 - 194 - static void set_ts(void) 195 - { 196 - u32 cr0; 197 - 198 - cr0 = read_cr0(); 199 - if (!(cr0 & 8)) 200 - write_cr0(cr0|8); 201 - } 202 - 203 - /*S:010 204 - * We are getting close to the Switcher. 205 - * 206 - * Remember that each CPU has two pages which are visible to the Guest when it 207 - * runs on that CPU. This has to contain the state for that Guest: we copy the 208 - * state in just before we run the Guest. 209 - * 210 - * Each Guest has "changed" flags which indicate what has changed in the Guest 211 - * since it last ran. We saw this set in interrupts_and_traps.c and 212 - * segments.c. 213 - */ 214 - static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) 215 - { 216 - /* Copying all this data can be quite expensive. We usually run the 217 - * same Guest we ran last time (and that Guest hasn't run anywhere else 218 - * meanwhile). If that's not the case, we pretend everything in the 219 - * Guest has changed. */ 220 - if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { 221 - __get_cpu_var(last_guest) = lg; 222 - lg->last_pages = pages; 223 - lg->changed = CHANGED_ALL; 224 - } 225 - 226 - /* These copies are pretty cheap, so we do them unconditionally: */ 227 - /* Save the current Host top-level page directory. */ 228 - pages->state.host_cr3 = __pa(current->mm->pgd); 229 - /* Set up the Guest's page tables to see this CPU's pages (and no 230 - * other CPU's pages). */ 231 - map_switcher_in_guest(lg, pages); 232 - /* Set up the two "TSS" members which tell the CPU what stack to use 233 - * for traps which go directly into the Guest (ie. traps at privilege 234 - * level 1). */ 235 - pages->state.guest_tss.esp1 = lg->esp1; 236 - pages->state.guest_tss.ss1 = lg->ss1; 237 - 238 - /* Copy direct-to-Guest trap entries. */ 239 - if (lg->changed & CHANGED_IDT) 240 - copy_traps(lg, pages->state.guest_idt, default_idt_entries); 241 - 242 - /* Copy all GDT entries which the Guest can change.
*/ 243 - if (lg->changed & CHANGED_GDT) 244 - copy_gdt(lg, pages->state.guest_gdt); 245 - /* If only the TLS entries have changed, copy them. */ 246 - else if (lg->changed & CHANGED_GDT_TLS) 247 - copy_gdt_tls(lg, pages->state.guest_gdt); 248 - 249 - /* Mark the Guest as unchanged for next time. */ 250 - lg->changed = 0; 251 - } 252 - 253 - /* Finally: the code to actually call into the Switcher to run the Guest. */ 254 - static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) 255 - { 256 - /* This is a dummy value we need for GCC's sake. */ 257 - unsigned int clobber; 258 - 259 - /* Copy the guest-specific information into this CPU's "struct 260 - * lguest_pages". */ 261 - copy_in_guest_info(lg, pages); 262 - 263 - /* Set the trap number to 256 (impossible value). If we fault while 264 - * switching to the Guest (bad segment registers or bug), this will 265 - * cause us to abort the Guest. */ 266 - lg->regs->trapnum = 256; 267 - 268 - /* Now: we push the "eflags" register on the stack, then do an "lcall". 269 - * This is how we change from using the kernel code segment to using 270 - * the dedicated lguest code segment, as well as jumping into the 271 - * Switcher. 272 - * 273 - * The lcall also pushes the old code segment (KERNEL_CS) onto the 274 - * stack, then the address of this call. This stack layout happens to 275 - * exactly match the stack of an interrupt... */ 276 - asm volatile("pushf; lcall *lguest_entry" 277 - /* This is how we tell GCC that %eax ("a") and %ebx ("b") 278 - * are changed by this routine. The "=" means output. */ 279 - : "=a"(clobber), "=b"(clobber) 280 - /* %eax contains the pages pointer. ("0" refers to the 281 - * 0-th argument above, ie "a"). %ebx contains the 282 - * physical address of the Guest's top-level page 283 - * directory. */ 284 - : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) 285 - /* We tell gcc that all these registers could change, 286 - * which means we don't have to save and restore them in 287 - * the Switcher. */ 288 - : "memory", "%edx", "%ecx", "%edi", "%esi"); 289 } 290 /*:*/ 291 ··· 175 { 176 /* We stop running once the Guest is dead. */ 177 while (!lg->dead) { 178 - /* We need to initialize this, otherwise gcc complains. It's 179 - * not (yet) clever enough to see that it's initialized when we 180 - * need it. */ 181 - unsigned int cr2 = 0; /* Damn gcc */ 182 183 - /* First we run any hypercalls the Guest wants done: either in 184 - * the hypercall ring in "struct lguest_data", or directly by 185 - * using int 31 (LGUEST_TRAP_ENTRY). */ 186 - do_hypercalls(lg); 187 - /* It's possible the Guest did a SEND_DMA hypercall to the 188 * Launcher, in which case we return from the read() now. */ 189 - if (lg->dma_is_pending) { 190 - if (put_user(lg->pending_dma, user) || 191 - put_user(lg->pending_key, user+1)) 192 return -EFAULT; 193 - return sizeof(unsigned long)*2; 194 } 195 196 /* Check for signals */ ··· 222 * the "Do Not Disturb" sign: */ 223 local_irq_disable(); 224 225 - /* Remember the awfully-named TS bit? If the Guest has asked 226 - * to set it we set it now, so we can trap and pass that trap 227 - * to the Guest if it uses the FPU. */ 228 - if (lg->ts) 229 - set_ts(); 230 - 231 - /* SYSENTER is an optimized way of doing system calls. We 232 - * can't allow it because it always jumps to privilege level 0. 
233 - * A normal Guest won't try it because we don't advertise it in 234 - * CPUID, but a malicious Guest (or malicious Guest userspace 235 - * program) could, so we tell the CPU to disable it before 236 - * running the Guest. */ 237 - if (boot_cpu_has(X86_FEATURE_SEP)) 238 - wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 239 - 240 - /* Now we actually run the Guest. It will pop back out when 241 - * something interesting happens, and we can examine its 242 - * registers to see what it was doing. */ 243 - run_guest_once(lg, lguest_pages(raw_smp_processor_id())); 244 - 245 - /* The "regs" pointer contains two extra entries which are not 246 - * really registers: a trap number which says what interrupt or 247 - * trap made the switcher code come back, and an error code 248 - * which some traps set. */ 249 - 250 - /* If the Guest page faulted, then the cr2 register will tell 251 - * us the bad virtual address. We have to grab this now, 252 - * because once we re-enable interrupts an interrupt could 253 - * fault and thus overwrite cr2, or we could even move off to a 254 - * different CPU. */ 255 - if (lg->regs->trapnum == 14) 256 - cr2 = read_cr2(); 257 - /* Similarly, if we took a trap because the Guest used the FPU, 258 - * we have to restore the FPU it expects to see. */ 259 - else if (lg->regs->trapnum == 7) 260 - math_state_restore(); 261 - 262 - /* Restore SYSENTER if it's supposed to be on. */ 263 - if (boot_cpu_has(X86_FEATURE_SEP)) 264 - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 265 266 /* Now we're ready to be interrupted or moved to other CPUs */ 267 local_irq_enable(); 268 269 - /* OK, so what happened? */ 270 - switch (lg->regs->trapnum) { 271 - case 13: /* We've intercepted a GPF. */ 272 - /* Check if this was one of those annoying IN or OUT 273 - * instructions which we need to emulate. If so, we 274 - * just go back into the Guest after we've done it. */ 275 - if (lg->regs->errcode == 0) { 276 - if (emulate_insn(lg)) 277 - continue; 278 - } 279 - break; 280 - case 14: /* We've intercepted a page fault. */ 281 - /* The Guest accessed a virtual address that wasn't 282 - * mapped. This happens a lot: we don't actually set 283 - * up most of the page tables for the Guest at all when 284 - * we start: as it runs it asks for more and more, and 285 - * we set them up as required. In this case, we don't 286 - * even tell the Guest that the fault happened. 287 - * 288 - * The errcode tells whether this was a read or a 289 - * write, and whether kernel or userspace code. */ 290 - if (demand_page(lg, cr2, lg->regs->errcode)) 291 - continue; 292 - 293 - /* OK, it's really not there (or not OK): the Guest 294 - * needs to know. We write out the cr2 value so it 295 - * knows where the fault occurred. 296 - * 297 - * Note that if the Guest were really messed up, this 298 - * could happen before it's done the INITIALIZE 299 - * hypercall, so lg->lguest_data will be NULL, so 300 - * &lg->lguest_data->cr2 will be address 8. Writing 301 - * into that address won't hurt the Host at all, 302 - * though. */ 303 - if (put_user(cr2, &lg->lguest_data->cr2)) 304 - kill_guest(lg, "Writing cr2"); 305 - break; 306 - case 7: /* We've intercepted a Device Not Available fault. */ 307 - /* If the Guest doesn't want to know, we already 308 - * restored the Floating Point Unit, so we just 309 - * continue without telling it. */ 310 - if (!lg->ts) 311 - continue; 312 - break; 313 - case 32 ... 255: 314 - /* These values mean a real interrupt occurred, in 315 - * which case the Host handler has already been run. 
316 - * We just do a friendly check if another process
317 - * should now be run, then fall through to loop
318 - * around: */
319 - cond_resched();
320 - case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
321 - continue;
322 - }
323 -
324 - /* If we get here, it's a trap the Guest wants to know
325 - * about. */
326 - if (deliver_trap(lg, lg->regs->trapnum))
327 - continue;
328 -
329 - /* If the Guest doesn't have a handler (either it hasn't
330 - * registered any yet, or it's one of the faults we don't let
331 - * it handle), it dies with a cryptic error message. */
332 - kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
333 - lg->regs->trapnum, lg->regs->eip,
334 - lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode);
335 }
336 /* The Guest is dead => "No such file or directory" */
337 return -ENOENT;
338 - }
339 -
340 - /* Now we can look at each of the routines this calls, in increasing order of
341 - * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
342 - * deliver_trap() and demand_page(). After all those, we'll be ready to
343 - * examine the Switcher, and our philosophical understanding of the Host/Guest
344 - * duality will be complete. :*/
345 -
346 - int find_free_guest(void)
347 - {
348 - unsigned int i;
349 - for (i = 0; i < MAX_LGUEST_GUESTS; i++)
350 - if (!lguests[i].tsk)
351 - return i;
352 - return -1;
353 - }
354 -
355 - static void adjust_pge(void *on)
356 - {
357 - if (on)
358 - write_cr4(read_cr4() | X86_CR4_PGE);
359 - else
360 - write_cr4(read_cr4() & ~X86_CR4_PGE);
361 }
362
363 /*H:000
··· 257 /* First we put the Switcher up in very high virtual memory. */
258 err = map_switcher();
259 if (err)
260 - return err;
261
262 /* Now we set up the pagetable implementation for the Guests. */
263 err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
264 - if (err) {
265 - unmap_switcher();
266 - return err;
267 - }
268
269 - /* The I/O subsystem needs some things initialized. */
270 - lguest_io_init();
271
272 /* /dev/lguest needs to be registered. */
273 err = lguest_device_init();
274 - if (err) {
275 - free_pagetables();
276 - unmap_switcher();
277 - return err;
278 - }
279
280 - /* Finally, we need to turn off "Page Global Enable". PGE is an
281 - * optimization where page table entries are specially marked to show
282 - * they never change. The Host kernel marks all the kernel pages this
283 - * way because it's always present, even when userspace is running.
284 - *
285 - * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
286 - * switch to the Guest kernel. If you don't disable this on all CPUs,
287 - * you'll get really weird bugs that you'll chase for two days.
288 - *
289 - * I used to turn PGE off every time we switched to the Guest and back
290 - * on when we return, but that slowed the Switcher down noticeably. */
291 -
292 - /* We don't need the complexity of CPUs coming and going while we're
293 - * doing this. */
294 - lock_cpu_hotplug();
295 - if (cpu_has_pge) { /* We have a broader idea of "global". */
296 - /* Remember that this was originally set (for cleanup). */
297 - cpu_had_pge = 1;
298 - /* adjust_pge is a helper function which sets or unsets the PGE
299 - * bit on its CPU, depending on the argument (0 == unset). */
300 - on_each_cpu(adjust_pge, (void *)0, 0, 1);
301 - /* Turn off the feature in the global feature set. */
302 - clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
303 - }
304 - unlock_cpu_hotplug();
305
306 /* All good! 
*/ 307 return 0; 308 } 309 310 /* Cleaning up is just the same code, backwards. With a little French. */ 311 static void __exit fini(void) 312 { 313 lguest_device_remove(); 314 free_pagetables(); 315 unmap_switcher(); 316 317 - /* If we had PGE before we started, turn it back on now. */ 318 - lock_cpu_hotplug(); 319 - if (cpu_had_pge) { 320 - set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); 321 - /* adjust_pge's argument "1" means set PGE. */ 322 - on_each_cpu(adjust_pge, (void *)1, 0, 1); 323 - } 324 - unlock_cpu_hotplug(); 325 } 326 327 /* The Host side of lguest can be a module. This is a nice way for people to 328 * play with it. */
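To make the old read() contract concrete: a pending SEND_DMA reached the
Launcher as the (udma, key) pair which run_guest() wrote with put_user()
above.  A minimal sketch of the consuming side, with "lguest_fd" and
"handle_dma()" as illustrative names only:

	/* Launcher side (sketch): the old /dev/lguest read() yields two
	 * unsigned longs when a Guest sends DMA our way. */
	unsigned long dma_info[2];

	if (read(lguest_fd, dma_info, sizeof(dma_info)) == sizeof(dma_info))
		handle_dma(dma_info[0] /* udma */, dma_info[1] /* key */);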
··· 11 #include <linux/vmalloc.h> 12 #include <linux/cpu.h> 13 #include <linux/freezer.h> 14 + #include <linux/highmem.h> 15 #include <asm/paravirt.h> 16 #include <asm/pgtable.h> 17 #include <asm/uaccess.h> 18 #include <asm/poll.h> 19 #include <asm/asm-offsets.h> 20 #include "lg.h" 21 22 23 static struct vm_struct *switcher_vma; 24 static struct page **switcher_page; 25 26 /* This One Big lock protects all inter-guest data structures. */ 27 DEFINE_MUTEX(lguest_lock); 28 29 /*H:010 We need to set up the Switcher at a high virtual address. Remember the 30 * Switcher is a few hundred bytes of assembler code which actually changes the ··· 73 * Host since it will be running as the switchover occurs. 74 * 75 * Trying to map memory at a particular address is an unusual thing to do, so 76 + * it's not a simple one-liner. */ 77 static __init int map_switcher(void) 78 { 79 int i, err; ··· 132 goto free_vma; 133 } 134 135 + /* Now the Switcher is mapped at the right address, we can't fail! 136 + * Copy in the compiled-in Switcher code (from <arch>_switcher.S). */ 137 memcpy(switcher_vma->addr, start_switcher_text, 138 end_switcher_text - start_switcher_text); 139 140 printk(KERN_INFO "lguest: mapped switcher at %p\n", 141 switcher_vma->addr); ··· 247 __free_pages(switcher_page[i], 0); 248 } 249 250 /*L:305 251 * Dealing With Guest Memory. 252 * 253 * When the Guest gives us (what it thinks is) a physical address, we can use 254 + * the normal copy_from_user() & copy_to_user() on the corresponding place in 255 + * the memory region allocated by the Launcher. 256 * 257 * But we can't trust the Guest: it might be trying to access the Launcher 258 * code. We have to check that the range is below the pfn_limit the Launcher ··· 338 return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); 339 } 340 341 + /* This routine copies memory from the Guest. Here we can see how useful the 342 + * kill_lguest() routine we met in the Launcher can be: we return a random 343 + * value (all zeroes) instead of needing to return an error. */ 344 + void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) 345 { 346 if (!lguest_address_ok(lg, addr, bytes) 347 + || copy_from_user(b, lg->mem_base + addr, bytes) != 0) { 348 /* copy_from_user should do this, but as we rely on it... */ 349 memset(b, 0, bytes); 350 kill_guest(lg, "bad read address %#lx len %u", addr, bytes); 351 } 352 } 353 354 + /* This is the write (copy into guest) version. */ 355 + void __lgwrite(struct lguest *lg, unsigned long addr, const void *b, 356 + unsigned bytes) 357 { 358 if (!lguest_address_ok(lg, addr, bytes) 359 + || copy_to_user(lg->mem_base + addr, b, bytes) != 0) 360 kill_guest(lg, "bad write address %#lx len %u", addr, bytes); 361 } 362 /*:*/ 363 ··· 489 { 490 /* We stop running once the Guest is dead. */ 491 while (!lg->dead) { 492 + /* First we run any hypercalls the Guest wants done. */ 493 + if (lg->hcall) 494 + do_hypercalls(lg); 495 496 + /* It's possible the Guest did a NOTIFY hypercall to the 497 * Launcher, in which case we return from the read() now. */ 498 + if (lg->pending_notify) { 499 + if (put_user(lg->pending_notify, user)) 500 return -EFAULT; 501 + return sizeof(lg->pending_notify); 502 } 503 504 /* Check for signals */ ··· 542 * the "Do Not Disturb" sign: */ 543 local_irq_disable(); 544 545 + /* Actually run the Guest until something happens. 
*/ 546 + lguest_arch_run_guest(lg); 547 548 /* Now we're ready to be interrupted or moved to other CPUs */ 549 local_irq_enable(); 550 551 + /* Now we deal with whatever happened to the Guest. */ 552 + lguest_arch_handle_trap(lg); 553 } 554 + 555 /* The Guest is dead => "No such file or directory" */ 556 return -ENOENT; 557 } 558 559 /*H:000 ··· 701 /* First we put the Switcher up in very high virtual memory. */ 702 err = map_switcher(); 703 if (err) 704 + goto out; 705 706 /* Now we set up the pagetable implementation for the Guests. */ 707 err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); 708 + if (err) 709 + goto unmap; 710 711 + /* We might need to reserve an interrupt vector. */ 712 + err = init_interrupts(); 713 + if (err) 714 + goto free_pgtables; 715 716 /* /dev/lguest needs to be registered. */ 717 err = lguest_device_init(); 718 + if (err) 719 + goto free_interrupts; 720 721 + /* Finally we do some architecture-specific setup. */ 722 + lguest_arch_host_init(); 723 724 /* All good! */ 725 return 0; 726 + 727 + free_interrupts: 728 + free_interrupts(); 729 + free_pgtables: 730 + free_pagetables(); 731 + unmap: 732 + unmap_switcher(); 733 + out: 734 + return err; 735 } 736 737 /* Cleaning up is just the same code, backwards. With a little French. */ 738 static void __exit fini(void) 739 { 740 lguest_device_remove(); 741 + free_interrupts(); 742 free_pagetables(); 743 unmap_switcher(); 744 745 + lguest_arch_host_fini(); 746 } 747 + /*:*/ 748 749 /* The Host side of lguest can be a module. This is a nice way for people to 750 * play with it. */
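The virtio rework shrinks that Launcher contract to a single value.  A
matching sketch of the new consuming side (again, names illustrative),
mirroring the put_user(lg->pending_notify, user) above:

	/* Launcher side (sketch): a Guest NOTIFY arrives as one address. */
	unsigned long notify_addr;

	if (read(lguest_fd, &notify_addr, sizeof(notify_addr))
	    == sizeof(notify_addr))
		handle_notify(notify_addr);

Note also how the new init() unwinds with gotos in exact reverse order of
setup: each failure label undoes everything that succeeded before it.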
+58 -119
drivers/lguest/hypercalls.c
··· 25 #include <linux/mm.h> 26 #include <asm/page.h> 27 #include <asm/pgtable.h> 28 - #include <irq_vectors.h> 29 #include "lg.h" 30 31 - /*H:120 This is the core hypercall routine: where the Guest gets what it 32 - * wants. Or gets killed. Or, in the case of LHCALL_CRASH, both. 33 - * 34 - * Remember from the Guest: %eax == which call to make, and the arguments are 35 - * packed into %edx, %ebx and %ecx if needed. */ 36 - static void do_hcall(struct lguest *lg, struct lguest_regs *regs) 37 { 38 - switch (regs->eax) { 39 case LHCALL_FLUSH_ASYNC: 40 /* This call does nothing, except by breaking out of the Guest 41 * it makes us process all the asynchronous hypercalls. */ ··· 47 char msg[128]; 48 /* If the lgread fails, it will call kill_guest() itself; the 49 * kill_guest() with the message will be ignored. */ 50 - lgread(lg, msg, regs->edx, sizeof(msg)); 51 msg[sizeof(msg)-1] = '\0'; 52 kill_guest(lg, "CRASH: %s", msg); 53 break; ··· 55 case LHCALL_FLUSH_TLB: 56 /* FLUSH_TLB comes in two flavors, depending on the 57 * argument: */ 58 - if (regs->edx) 59 guest_pagetable_clear_all(lg); 60 else 61 guest_pagetable_flush_user(lg); 62 break; 63 - case LHCALL_BIND_DMA: 64 - /* BIND_DMA really wants four arguments, but it's the only call 65 - * which does. So the Guest packs the number of buffers and 66 - * the interrupt number into the final argument, and we decode 67 - * it here. This can legitimately fail, since we currently 68 - * place a limit on the number of DMA pools a Guest can have. 69 - * So we return true or false from this call. */ 70 - regs->eax = bind_dma(lg, regs->edx, regs->ebx, 71 - regs->ecx >> 8, regs->ecx & 0xFF); 72 - break; 73 74 /* All these calls simply pass the arguments through to the right 75 * routines. */ 76 - case LHCALL_SEND_DMA: 77 - send_dma(lg, regs->edx, regs->ebx); 78 - break; 79 - case LHCALL_LOAD_GDT: 80 - load_guest_gdt(lg, regs->edx, regs->ebx); 81 - break; 82 - case LHCALL_LOAD_IDT_ENTRY: 83 - load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx); 84 - break; 85 case LHCALL_NEW_PGTABLE: 86 - guest_new_pagetable(lg, regs->edx); 87 break; 88 case LHCALL_SET_STACK: 89 - guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx); 90 break; 91 case LHCALL_SET_PTE: 92 - guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx)); 93 break; 94 case LHCALL_SET_PMD: 95 - guest_set_pmd(lg, regs->edx, regs->ebx); 96 - break; 97 - case LHCALL_LOAD_TLS: 98 - guest_load_tls(lg, regs->edx); 99 break; 100 case LHCALL_SET_CLOCKEVENT: 101 - guest_set_clockevent(lg, regs->edx); 102 break; 103 - 104 case LHCALL_TS: 105 /* This sets the TS flag, as we saw used in run_guest(). */ 106 - lg->ts = regs->edx; 107 break; 108 case LHCALL_HALT: 109 /* Similarly, this sets the halted flag for run_guest(). */ 110 lg->halted = 1; 111 break; 112 default: 113 - kill_guest(lg, "Bad hypercall %li\n", regs->eax); 114 } 115 } 116 117 - /* Asynchronous hypercalls are easy: we just look in the array in the Guest's 118 - * "struct lguest_data" and see if there are any new ones marked "ready". 119 * 120 * We are careful to do these in order: obviously we respect the order the 121 * Guest put them in the ring, but we also promise the Guest that they will ··· 112 if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) 113 return; 114 115 - 116 /* We process "struct lguest_data"s hcalls[] ring once. */ 117 for (i = 0; i < ARRAY_SIZE(st); i++) { 118 - struct lguest_regs regs; 119 /* We remember where we were up to from last time. 
This makes 120 * sure that the hypercalls are done in the order the Guest 121 * places them in the ring. */ ··· 129 if (++lg->next_hcall == LHCALL_RING_SIZE) 130 lg->next_hcall = 0; 131 132 - /* We copy the hypercall arguments into a fake register 133 - * structure. This makes life simple for do_hcall(). */ 134 - if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax) 135 - || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx) 136 - || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx) 137 - || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) { 138 kill_guest(lg, "Fetching async hypercalls"); 139 break; 140 } 141 142 /* Do the hypercall, same as a normal one. */ 143 - do_hcall(lg, &regs); 144 145 /* Mark the hypercall done. */ 146 if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { ··· 146 break; 147 } 148 149 - /* Stop doing hypercalls if we've just done a DMA to the 150 - * Launcher: it needs to service this first. */ 151 - if (lg->dma_is_pending) 152 break; 153 } 154 } ··· 157 * Guest makes a hypercall, we end up here to set things up: */ 158 static void initialize(struct lguest *lg) 159 { 160 - u32 tsc_speed; 161 162 /* You can't do anything until you're initialized. The Guest knows the 163 * rules, so we're unforgiving here. */ 164 - if (lg->regs->eax != LHCALL_LGUEST_INIT) { 165 - kill_guest(lg, "hypercall %li before LGUEST_INIT", 166 - lg->regs->eax); 167 return; 168 } 169 170 - /* We insist that the Time Stamp Counter exist and doesn't change with 171 - * cpu frequency. Some devious chip manufacturers decided that TSC 172 - * changes could be handled in software. I decided that time going 173 - * backwards might be good for benchmarks, but it's bad for users. 174 - * 175 - * We also insist that the TSC be stable: the kernel detects unreliable 176 - * TSCs for its own purposes, and we use that here. */ 177 - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) 178 - tsc_speed = tsc_khz; 179 - else 180 - tsc_speed = 0; 181 - 182 - /* The pointer to the Guest's "struct lguest_data" is the only 183 - * argument. */ 184 - lg->lguest_data = (struct lguest_data __user *)lg->regs->edx; 185 - /* If we check the address they gave is OK now, we can simply 186 - * copy_to_user/from_user from now on rather than using lgread/lgwrite. 187 - * I put this in to show that I'm not immune to writing stupid 188 - * optimizations. */ 189 - if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) { 190 kill_guest(lg, "bad guest page %p", lg->lguest_data); 191 - return; 192 - } 193 /* The Guest tells us where we're not to deliver interrupts by putting 194 * the range of addresses into "struct lguest_data". */ 195 if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) 196 - || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) 197 - /* We tell the Guest that it can't use the top 4MB of virtual 198 - * addresses used by the Switcher. */ 199 - || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) 200 - || put_user(tsc_speed, &lg->lguest_data->tsc_khz) 201 - /* We also give the Guest a unique id, as used in lguest_net.c. */ 202 - || put_user(lg->guestid, &lg->lguest_data->guestid)) 203 kill_guest(lg, "bad guest page %p", lg->lguest_data); 204 205 /* We write the current time into the Guest's data page once now. */ 206 write_timestamp(lg); 207 208 /* This is the one case where the above accesses might have been the 209 * first write to a Guest page. 
This may have caused a copy-on-write 210 * fault, but the Guest might be referring to the old (read-only) 211 * page. */ 212 guest_pagetable_clear_all(lg); 213 - } 214 - /* Now we've examined the hypercall code; our Guest can make requests. There 215 - * is one other way we can do things for the Guest, as we see in 216 - * emulate_insn(). */ 217 - 218 - /*H:110 Tricky point: we mark the hypercall as "done" once we've done it. 219 - * Normally we don't need to do this: the Guest will run again and update the 220 - * trap number before we come back around the run_guest() loop to 221 - * do_hypercalls(). 222 - * 223 - * However, if we are signalled or the Guest sends DMA to the Launcher, that 224 - * loop will exit without running the Guest. When it comes back it would try 225 - * to re-run the hypercall. */ 226 - static void clear_hcall(struct lguest *lg) 227 - { 228 - lg->regs->trapnum = 255; 229 } 230 231 /*H:100 ··· 195 */ 196 void do_hypercalls(struct lguest *lg) 197 { 198 - /* Not initialized yet? */ 199 if (unlikely(!lg->lguest_data)) { 200 - /* Did the Guest make a hypercall? We might have come back for 201 - * some other reason (an interrupt, a different trap). */ 202 - if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) { 203 - /* Set up the "struct lguest_data" */ 204 - initialize(lg); 205 - /* The hypercall is done. */ 206 - clear_hcall(lg); 207 - } 208 return; 209 } 210 ··· 210 do_async_hcalls(lg); 211 212 /* If we stopped reading the hypercall ring because the Guest did a 213 - * SEND_DMA to the Launcher, we want to return now. Otherwise if the 214 - * Guest asked us to do a hypercall, we do it. */ 215 - if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) { 216 - do_hcall(lg, lg->regs); 217 - /* The hypercall is done. */ 218 - clear_hcall(lg); 219 } 220 } 221 ··· 234 { 235 struct timespec now; 236 ktime_get_real_ts(&now); 237 - if (put_user(now, &lg->lguest_data->time)) 238 kill_guest(lg, "Writing timestamp"); 239 }
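The BIND_DMA packing which do_hcall() decoded above (regs->ecx >> 8 and
regs->ecx & 0xFF) is worth seeing from the Guest's side; a sketch, with
the helper name invented for illustration:

	/* BIND_DMA wants four arguments but only three registers remain,
	 * so the buffer count and the interrupt number share the last. */
	static unsigned long pack_bind_dma_arg(u16 numdmas, u8 interrupt)
	{
		return ((unsigned long)numdmas << 8) | interrupt;
	}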
··· 25 #include <linux/mm.h> 26 #include <asm/page.h> 27 #include <asm/pgtable.h> 28 #include "lg.h" 29 30 + /*H:120 This is the core hypercall routine: where the Guest gets what it wants. 31 + * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ 32 + static void do_hcall(struct lguest *lg, struct hcall_args *args) 33 { 34 + switch (args->arg0) { 35 case LHCALL_FLUSH_ASYNC: 36 /* This call does nothing, except by breaking out of the Guest 37 * it makes us process all the asynchronous hypercalls. */ ··· 51 char msg[128]; 52 /* If the lgread fails, it will call kill_guest() itself; the 53 * kill_guest() with the message will be ignored. */ 54 + __lgread(lg, msg, args->arg1, sizeof(msg)); 55 msg[sizeof(msg)-1] = '\0'; 56 kill_guest(lg, "CRASH: %s", msg); 57 break; ··· 59 case LHCALL_FLUSH_TLB: 60 /* FLUSH_TLB comes in two flavors, depending on the 61 * argument: */ 62 + if (args->arg1) 63 guest_pagetable_clear_all(lg); 64 else 65 guest_pagetable_flush_user(lg); 66 break; 67 68 /* All these calls simply pass the arguments through to the right 69 * routines. */ 70 case LHCALL_NEW_PGTABLE: 71 + guest_new_pagetable(lg, args->arg1); 72 break; 73 case LHCALL_SET_STACK: 74 + guest_set_stack(lg, args->arg1, args->arg2, args->arg3); 75 break; 76 case LHCALL_SET_PTE: 77 + guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3)); 78 break; 79 case LHCALL_SET_PMD: 80 + guest_set_pmd(lg, args->arg1, args->arg2); 81 break; 82 case LHCALL_SET_CLOCKEVENT: 83 + guest_set_clockevent(lg, args->arg1); 84 break; 85 case LHCALL_TS: 86 /* This sets the TS flag, as we saw used in run_guest(). */ 87 + lg->ts = args->arg1; 88 break; 89 case LHCALL_HALT: 90 /* Similarly, this sets the halted flag for run_guest(). */ 91 lg->halted = 1; 92 break; 93 + case LHCALL_NOTIFY: 94 + lg->pending_notify = args->arg1; 95 + break; 96 default: 97 + if (lguest_arch_do_hcall(lg, args)) 98 + kill_guest(lg, "Bad hypercall %li\n", args->arg0); 99 } 100 } 101 + /*:*/ 102 103 + /*H:124 Asynchronous hypercalls are easy: we just look in the array in the 104 + * Guest's "struct lguest_data" to see if any new ones are marked "ready". 105 * 106 * We are careful to do these in order: obviously we respect the order the 107 * Guest put them in the ring, but we also promise the Guest that they will ··· 134 if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) 135 return; 136 137 /* We process "struct lguest_data"s hcalls[] ring once. */ 138 for (i = 0; i < ARRAY_SIZE(st); i++) { 139 + struct hcall_args args; 140 /* We remember where we were up to from last time. This makes 141 * sure that the hypercalls are done in the order the Guest 142 * places them in the ring. */ ··· 152 if (++lg->next_hcall == LHCALL_RING_SIZE) 153 lg->next_hcall = 0; 154 155 + /* Copy the hypercall arguments into a local copy of 156 + * the hcall_args struct. */ 157 + if (copy_from_user(&args, &lg->lguest_data->hcalls[n], 158 + sizeof(struct hcall_args))) { 159 kill_guest(lg, "Fetching async hypercalls"); 160 break; 161 } 162 163 /* Do the hypercall, same as a normal one. */ 164 + do_hcall(lg, &args); 165 166 /* Mark the hypercall done. */ 167 if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { ··· 171 break; 172 } 173 174 + /* Stop doing hypercalls if they want to notify the Launcher: 175 + * it needs to service this first. 
*/ 176 + if (lg->pending_notify)
177 break;
178 }
179 }
···
182 * Guest makes a hypercall, we end up here to set things up: */
183 static void initialize(struct lguest *lg)
184 {
185
186 /* You can't do anything until you're initialized. The Guest knows the
187 * rules, so we're unforgiving here. */
188 + if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) {
189 + kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0);
190 return;
191 }
192
193 + if (lguest_arch_init_hypercalls(lg))
194 kill_guest(lg, "bad guest page %p", lg->lguest_data);
195 +
196 /* The Guest tells us where we're not to deliver interrupts by putting
197 * the range of addresses into "struct lguest_data". */
198 if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
199 + || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
200 kill_guest(lg, "bad guest page %p", lg->lguest_data);
201
202 /* We write the current time into the Guest's data page once now. */
203 write_timestamp(lg);
204 +
205 + /* page_tables.c will also do some setup. */
206 + page_table_guest_data_init(lg);
207
208 /* This is the one case where the above accesses might have been the
209 * first write to a Guest page. This may have caused a copy-on-write
210 * fault, but the Guest might be referring to the old (read-only)
211 * page. */
212 guest_pagetable_clear_all(lg);
213 }
214
215 /*H:100
···
261 */
262 void do_hypercalls(struct lguest *lg)
263 {
264 + /* Not initialized yet? This hypercall must do it. */
265 if (unlikely(!lg->lguest_data)) {
266 + /* Set up the "struct lguest_data" */
267 + initialize(lg);
268 + /* Hcall is done. */
269 + lg->hcall = NULL;
270 return;
271 }
272
···
280 do_async_hcalls(lg);
281
282 /* If we stopped reading the hypercall ring because the Guest did a
283 + * NOTIFY to the Launcher, we want to return now. Otherwise we do
284 + * the hypercall. */
285 + if (!lg->pending_notify) {
286 + do_hcall(lg, lg->hcall);
287 + /* Tricky point: we reset the hcall pointer to mark the
288 + * hypercall as "done". We use the hcall pointer rather than
289 + * the trap number to indicate a hypercall is pending.
290 + * Normally it doesn't matter: the Guest will run again and
291 + * update the trap number before we come back here.
292 + *
293 + * However, if we are signalled or the Guest does a NOTIFY to
294 + * the Launcher, the run_guest() loop will exit without running
295 + * the Guest. When it comes back it would try to re-run the
296 + * hypercall. */
297 + lg->hcall = NULL;
298 }
299 }
300
···
295 {
296 struct timespec now;
297 ktime_get_real_ts(&now);
298 + if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec)))
299 kill_guest(lg, "Writing timestamp");
300 }
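The Guest's half of that asynchronous ring deserves a glance next to the
Host's half above; a sketch under the same "struct lguest_data" layout
(slot allocation omitted for brevity):

	static void queue_async_hcall(unsigned int n, unsigned long call,
				      unsigned long arg1)
	{
		lguest_data.hcalls[n].arg0 = call;
		lguest_data.hcalls[n].arg1 = arg1;
		/* The arguments must be visible before the status change:
		 * anything but 0xFF means "ready", and the Host writes
		 * 0xFF back once the call is done. */
		wmb();
		lguest_data.hcall_status[n] = 0;
	}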
+83 -42
drivers/lguest/interrupts_and_traps.c
··· 12 * them first, so we also have a way of "reflecting" them into the Guest as if 13 * they had been delivered to it directly. :*/ 14 #include <linux/uaccess.h> 15 #include "lg.h" 16 17 /* The address of the interrupt handler is split into two bits: */ 18 static unsigned long idt_address(u32 lo, u32 hi) ··· 45 { 46 /* Stack grows upwards: move stack then write value. */ 47 *gstack -= 4; 48 - lgwrite_u32(lg, *gstack, val); 49 } 50 51 /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or ··· 62 * it). */ 63 static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) 64 { 65 - unsigned long gstack; 66 u32 eflags, ss, irq_enable; 67 68 /* There are two cases for interrupts: one where the Guest is already 69 * in the kernel, and a more complex one where the Guest is in ··· 72 if ((lg->regs->ss&0x3) != GUEST_PL) { 73 /* The Guest told us their kernel stack with the SET_STACK 74 * hypercall: both the virtual address and the segment */ 75 - gstack = guest_pa(lg, lg->esp1); 76 ss = lg->ss1; 77 /* We push the old stack segment and pointer onto the new 78 * stack: when the Guest does an "iret" back from the interrupt 79 * handler the CPU will notice they're dropping privilege ··· 84 push_guest_stack(lg, &gstack, lg->regs->esp); 85 } else { 86 /* We're staying on the same Guest (kernel) stack. */ 87 - gstack = guest_pa(lg, lg->regs->esp); 88 ss = lg->regs->ss; 89 } 90 91 /* Remember that we never let the Guest actually disable interrupts, so ··· 113 /* Now we've pushed all the old state, we change the stack, the code 114 * segment and the address to execute. */ 115 lg->regs->ss = ss; 116 - lg->regs->esp = gstack + lg->page_offset; 117 lg->regs->cs = (__KERNEL_CS|GUEST_PL); 118 lg->regs->eip = idt_address(lo, hi); 119 ··· 176 /* Look at the IDT entry the Guest gave us for this interrupt. The 177 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip 178 * over them. */ 179 - idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; 180 /* If they don't have a handler (yet?), we just ignore it */ 181 if (idt_present(idt->a, idt->b)) { 182 /* OK, mark it no longer pending and deliver it. */ ··· 194 * timer interrupt. */ 195 write_timestamp(lg); 196 } 197 198 /*H:220 Now we've got the routines to deliver interrupts, delivering traps 199 * like page fault is easy. The only trick is that Intel decided that some ··· 249 { 250 /* Trap numbers are always 8 bit, but we set an impossible trap number 251 * for traps inside the Switcher, so check that here. */ 252 - if (num >= ARRAY_SIZE(lg->idt)) 253 return 0; 254 255 /* Early on the Guest hasn't set the IDT entries (or maybe it put a 256 * bogus one in): if we fail here, the Guest will be killed. */ 257 - if (!idt_present(lg->idt[num].a, lg->idt[num].b)) 258 return 0; 259 - set_guest_interrupt(lg, lg->idt[num].a, lg->idt[num].b, has_err(num)); 260 return 1; 261 } 262 ··· 270 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all 271 * the other hypervisors would tease it. 272 * 273 - * This routine determines if a trap can be delivered directly. */ 274 - static int direct_trap(const struct lguest *lg, 275 - const struct desc_struct *trap, 276 - unsigned int num) 277 { 278 /* Hardware interrupts don't go to the Guest at all (except system 279 * call). 
*/ 280 - if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
281 return 0;
282
283 /* The Host needs to see page faults (for shadow paging and to save the
284 * fault address), general protection faults (in/out emulation) and
285 * device not available (TS handling), and of course, the hypercall
286 * trap. */
287 - if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY)
288 - return 0;
289 -
290 - /* Only trap gates (type 15) can go direct to the Guest. Interrupt
291 - * gates (type 14) disable interrupts as they are entered, which we
292 - * never let the Guest do. Not present entries (type 0x0) also can't
293 - * go direct, of course 8) */
294 - return idt_type(trap->a, trap->b) == 0xF;
295 }
296 /*:*/
···
392 * to copy this again. */
393 lg->changed |= CHANGED_IDT;
394
395 - /* The IDT which we keep in "struct lguest" only contains 32 entries
396 - * for the traps and LGUEST_IRQS (32) entries for interrupts. We
397 - * ignore attempts to set handlers for higher interrupt numbers, except
398 - * for the system call "interrupt" at 128: we have a special IDT entry
399 - * for that. */
400 - if (num < ARRAY_SIZE(lg->idt))
401 - set_trap(lg, &lg->idt[num], num, lo, hi);
402 - else if (num == SYSCALL_VECTOR)
403 - set_trap(lg, &lg->syscall_idt, num, lo, hi);
404 }
405
406 /* The default entry for each interrupt points into the Switcher routines which
··· 439
440 /* We can simply copy the direct traps, otherwise we use the default
441 * ones in the Switcher: they will return to the Host. */
442 - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
443 - if (direct_trap(lg, &lg->idt[i], i))
444 - idt[i] = lg->idt[i];
445 else
446 default_idt_entry(&idt[i], i, def[i]);
447 }
448 -
449 - /* Don't forget the system call trap! The IDT entries for other
450 - * interrupts never change, so no need to copy them. */
451 - i = SYSCALL_VECTOR;
452 - if (direct_trap(lg, &lg->syscall_idt, i))
453 - idt[i] = lg->syscall_idt;
454 - else
455 - default_idt_entry(&idt[i], i, def[i]);
456 }
457
458 void guest_set_clockevent(struct lguest *lg, unsigned long delta)
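Condensing the delivery mechanics from set_guest_interrupt() above: once
the right stack is chosen (and, on a privilege change, the old ss/esp
pushed), the frame mimics what real hardware would push.  A sketch, with
the helper name hypothetical and the virtual-IF fixup of eflags omitted:

	static void push_trap_frame(struct lguest *lg, unsigned long *gstack,
				    int has_err)
	{
		push_guest_stack(lg, gstack, lg->regs->eflags);
		push_guest_stack(lg, gstack, lg->regs->cs);
		push_guest_stack(lg, gstack, lg->regs->eip);
		/* Some traps also supply an error code. */
		if (has_err)
			push_guest_stack(lg, gstack, lg->regs->errcode);
	}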
··· 12 * them first, so we also have a way of "reflecting" them into the Guest as if 13 * they had been delivered to it directly. :*/ 14 #include <linux/uaccess.h> 15 + #include <linux/interrupt.h> 16 + #include <linux/module.h> 17 #include "lg.h" 18 + 19 + /* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */ 20 + static unsigned int syscall_vector = SYSCALL_VECTOR; 21 + module_param(syscall_vector, uint, 0444); 22 23 /* The address of the interrupt handler is split into two bits: */ 24 static unsigned long idt_address(u32 lo, u32 hi) ··· 39 { 40 /* Stack grows upwards: move stack then write value. */ 41 *gstack -= 4; 42 + lgwrite(lg, *gstack, u32, val); 43 } 44 45 /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or ··· 56 * it). */ 57 static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) 58 { 59 + unsigned long gstack, origstack; 60 u32 eflags, ss, irq_enable; 61 + unsigned long virtstack; 62 63 /* There are two cases for interrupts: one where the Guest is already 64 * in the kernel, and a more complex one where the Guest is in ··· 65 if ((lg->regs->ss&0x3) != GUEST_PL) { 66 /* The Guest told us their kernel stack with the SET_STACK 67 * hypercall: both the virtual address and the segment */ 68 + virtstack = lg->esp1; 69 ss = lg->ss1; 70 + 71 + origstack = gstack = guest_pa(lg, virtstack); 72 /* We push the old stack segment and pointer onto the new 73 * stack: when the Guest does an "iret" back from the interrupt 74 * handler the CPU will notice they're dropping privilege ··· 75 push_guest_stack(lg, &gstack, lg->regs->esp); 76 } else { 77 /* We're staying on the same Guest (kernel) stack. */ 78 + virtstack = lg->regs->esp; 79 ss = lg->regs->ss; 80 + 81 + origstack = gstack = guest_pa(lg, virtstack); 82 } 83 84 /* Remember that we never let the Guest actually disable interrupts, so ··· 102 /* Now we've pushed all the old state, we change the stack, the code 103 * segment and the address to execute. */ 104 lg->regs->ss = ss; 105 + lg->regs->esp = virtstack + (gstack - origstack); 106 lg->regs->cs = (__KERNEL_CS|GUEST_PL); 107 lg->regs->eip = idt_address(lo, hi); 108 ··· 165 /* Look at the IDT entry the Guest gave us for this interrupt. The 166 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip 167 * over them. */ 168 + idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; 169 /* If they don't have a handler (yet?), we just ignore it */ 170 if (idt_present(idt->a, idt->b)) { 171 /* OK, mark it no longer pending and deliver it. */ ··· 183 * timer interrupt. */ 184 write_timestamp(lg); 185 } 186 + /*:*/ 187 + 188 + /* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent 189 + * me a patch, so we support that too. It'd be a big step for lguest if half 190 + * the Plan 9 user base were to start using it. 191 + * 192 + * Actually now I think of it, it's possible that Ron *is* half the Plan 9 193 + * userbase. Oh well. */ 194 + static bool could_be_syscall(unsigned int num) 195 + { 196 + /* Normal Linux SYSCALL_VECTOR or reserved vector? */ 197 + return num == SYSCALL_VECTOR || num == syscall_vector; 198 + } 199 + 200 + /* The syscall vector it wants must be unused by Host. 
*/ 201 + bool check_syscall_vector(struct lguest *lg) 202 + { 203 + u32 vector; 204 + 205 + if (get_user(vector, &lg->lguest_data->syscall_vec)) 206 + return false; 207 + 208 + return could_be_syscall(vector); 209 + } 210 + 211 + int init_interrupts(void) 212 + { 213 + /* If they want some strange system call vector, reserve it now */ 214 + if (syscall_vector != SYSCALL_VECTOR 215 + && test_and_set_bit(syscall_vector, used_vectors)) { 216 + printk("lg: couldn't reserve syscall %u\n", syscall_vector); 217 + return -EBUSY; 218 + } 219 + return 0; 220 + } 221 + 222 + void free_interrupts(void) 223 + { 224 + if (syscall_vector != SYSCALL_VECTOR) 225 + clear_bit(syscall_vector, used_vectors); 226 + } 227 228 /*H:220 Now we've got the routines to deliver interrupts, delivering traps 229 * like page fault is easy. The only trick is that Intel decided that some ··· 197 { 198 /* Trap numbers are always 8 bit, but we set an impossible trap number 199 * for traps inside the Switcher, so check that here. */ 200 + if (num >= ARRAY_SIZE(lg->arch.idt)) 201 return 0; 202 203 /* Early on the Guest hasn't set the IDT entries (or maybe it put a 204 * bogus one in): if we fail here, the Guest will be killed. */ 205 + if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) 206 return 0; 207 + set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, has_err(num)); 208 return 1; 209 } 210 ··· 218 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all 219 * the other hypervisors would tease it. 220 * 221 + * This routine indicates if a particular trap number could be delivered 222 + * directly. */ 223 + static int direct_trap(unsigned int num) 224 { 225 /* Hardware interrupts don't go to the Guest at all (except system 226 * call). */ 227 + if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) 228 return 0; 229 230 /* The Host needs to see page faults (for shadow paging and to save the 231 * fault address), general protection faults (in/out emulation) and 232 * device not available (TS handling), and of course, the hypercall 233 * trap. */ 234 + return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY; 235 } 236 /*:*/ 237 ··· 348 * to copy this again. */ 349 lg->changed |= CHANGED_IDT; 350 351 + /* Check that the Guest doesn't try to step outside the bounds. */ 352 + if (num >= ARRAY_SIZE(lg->arch.idt)) 353 + kill_guest(lg, "Setting idt entry %u", num); 354 + else 355 + set_trap(lg, &lg->arch.idt[num], num, lo, hi); 356 } 357 358 /* The default entry for each interrupt points into the Switcher routines which ··· 399 400 /* We can simply copy the direct traps, otherwise we use the default 401 * ones in the Switcher: they will return to the Host. */ 402 + for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) { 403 + /* If no Guest can ever override this trap, leave it alone. */ 404 + if (!direct_trap(i)) 405 + continue; 406 + 407 + /* Only trap gates (type 15) can go direct to the Guest. 408 + * Interrupt gates (type 14) disable interrupts as they are 409 + * entered, which we never let the Guest do. Not present 410 + * entries (type 0x0) also can't go direct, of course. */ 411 + if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF) 412 + idt[i] = lg->arch.idt[i]; 413 else 414 + /* Reset it to the default. */ 415 default_idt_entry(&idt[i], i, def[i]); 416 } 417 } 418 419 void guest_set_clockevent(struct lguest *lg, unsigned long delta)
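To tie the two ends of that syscall handshake together: the Host picks
the extra vector when the "lg" module loads (e.g. "modprobe lg
syscall_vector=64"), and a Guest announces the vector it intends to use
so check_syscall_vector() can veto it.  A sketch of the Guest side,
assuming the new syscall_vec field read above:

	/* A Plan 9 style Guest would announce trap 64 (0x40) here before
	 * its first hypercall; a Linux Guest announces the usual 0x80. */
	lguest_data.syscall_vec = 0x40;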
-626
drivers/lguest/io.c
··· 1 - /*P:300 The I/O mechanism in lguest is simple yet flexible, allowing the Guest 2 - * to talk to the Launcher or directly to another Guest. It uses familiar 3 - * concepts of DMA and interrupts, plus some neat code stolen from 4 - * futexes... :*/ 5 - 6 - /* Copyright (C) 2006 Rusty Russell IBM Corporation 7 - * 8 - * This program is free software; you can redistribute it and/or modify 9 - * it under the terms of the GNU General Public License as published by 10 - * the Free Software Foundation; either version 2 of the License, or 11 - * (at your option) any later version. 12 - * 13 - * This program is distributed in the hope that it will be useful, 14 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 - * GNU General Public License for more details. 17 - * 18 - * You should have received a copy of the GNU General Public License 19 - * along with this program; if not, write to the Free Software 20 - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 21 - */ 22 - #include <linux/types.h> 23 - #include <linux/futex.h> 24 - #include <linux/jhash.h> 25 - #include <linux/mm.h> 26 - #include <linux/highmem.h> 27 - #include <linux/uaccess.h> 28 - #include "lg.h" 29 - 30 - /*L:300 31 - * I/O 32 - * 33 - * Getting data in and out of the Guest is quite an art. There are numerous 34 - * ways to do it, and they all suck differently. We try to keep things fairly 35 - * close to "real" hardware so our Guest's drivers don't look like an alien 36 - * visitation in the middle of the Linux code, and yet make sure that Guests 37 - * can talk directly to other Guests, not just the Launcher. 38 - * 39 - * To do this, the Guest gives us a key when it binds or sends DMA buffers. 40 - * The key corresponds to a "physical" address inside the Guest (ie. a virtual 41 - * address inside the Launcher process). We don't, however, use this key 42 - * directly. 43 - * 44 - * We want Guests which share memory to be able to DMA to each other: two 45 - * Launchers can mmap memory the same file, then the Guests can communicate. 46 - * Fortunately, the futex code provides us with a way to get a "union 47 - * futex_key" corresponding to the memory lying at a virtual address: if the 48 - * two processes share memory, the "union futex_key" for that memory will match 49 - * even if the memory is mapped at different addresses in each. So we always 50 - * convert the keys to "union futex_key"s to compare them. 51 - * 52 - * Before we dive into this though, we need to look at another set of helper 53 - * routines used throughout the Host kernel code to access Guest memory. 54 - :*/ 55 - static struct list_head dma_hash[61]; 56 - 57 - /* An unfortunate side effect of the Linux double-linked list implementation is 58 - * that there's no good way to statically initialize an array of linked 59 - * lists. */ 60 - void lguest_io_init(void) 61 - { 62 - unsigned int i; 63 - 64 - for (i = 0; i < ARRAY_SIZE(dma_hash); i++) 65 - INIT_LIST_HEAD(&dma_hash[i]); 66 - } 67 - 68 - /* FIXME: allow multi-page lengths. */ 69 - static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma) 70 - { 71 - unsigned int i; 72 - 73 - for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { 74 - if (!dma->len[i]) 75 - return 1; 76 - if (!lguest_address_ok(lg, dma->addr[i], dma->len[i])) 77 - goto kill; 78 - if (dma->len[i] > PAGE_SIZE) 79 - goto kill; 80 - /* We could do over a page, but is it worth it? 
*/ 81 - if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE) 82 - goto kill; 83 - } 84 - return 1; 85 - 86 - kill: 87 - kill_guest(lg, "bad DMA entry: %u@%#lx", dma->len[i], dma->addr[i]); 88 - return 0; 89 - } 90 - 91 - /*L:330 This is our hash function, using the wonderful Jenkins hash. 92 - * 93 - * The futex key is a union with three parts: an unsigned long word, a pointer, 94 - * and an int "offset". We could use jhash_2words() which takes three u32s. 95 - * (Ok, the hash functions are great: the naming sucks though). 96 - * 97 - * It's nice to be portable to 64-bit platforms, so we use the more generic 98 - * jhash2(), which takes an array of u32, the number of u32s, and an initial 99 - * u32 to roll in. This is uglier, but breaks down to almost the same code on 100 - * 32-bit platforms like this one. 101 - * 102 - * We want a position in the array, so we modulo ARRAY_SIZE(dma_hash) (ie. 61). 103 - */ 104 - static unsigned int hash(const union futex_key *key) 105 - { 106 - return jhash2((u32*)&key->both.word, 107 - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 108 - key->both.offset) 109 - % ARRAY_SIZE(dma_hash); 110 - } 111 - 112 - /* This is a convenience routine to compare two keys. It's a much bemoaned C 113 - * weakness that it doesn't allow '==' on structures or unions, so we have to 114 - * open-code it like this. */ 115 - static inline int key_eq(const union futex_key *a, const union futex_key *b) 116 - { 117 - return (a->both.word == b->both.word 118 - && a->both.ptr == b->both.ptr 119 - && a->both.offset == b->both.offset); 120 - } 121 - 122 - /*L:360 OK, when we need to actually free up a Guest's DMA array we do several 123 - * things, so we have a convenient function to do it. 124 - * 125 - * The caller must hold a read lock on dmainfo owner's current->mm->mmap_sem 126 - * for the drop_futex_key_refs(). */ 127 - static void unlink_dma(struct lguest_dma_info *dmainfo) 128 - { 129 - /* You locked this too, right? */ 130 - BUG_ON(!mutex_is_locked(&lguest_lock)); 131 - /* This is how we know that the entry is free. */ 132 - dmainfo->interrupt = 0; 133 - /* Remove it from the hash table. */ 134 - list_del(&dmainfo->list); 135 - /* Drop the references we were holding (to the inode or mm). */ 136 - drop_futex_key_refs(&dmainfo->key); 137 - } 138 - 139 - /*L:350 This is the routine which we call when the Guest asks to unregister a 140 - * DMA array attached to a given key. Returns true if the array was found. */ 141 - static int unbind_dma(struct lguest *lg, 142 - const union futex_key *key, 143 - unsigned long dmas) 144 - { 145 - int i, ret = 0; 146 - 147 - /* We don't bother with the hash table, just look through all this 148 - * Guest's DMA arrays. */ 149 - for (i = 0; i < LGUEST_MAX_DMA; i++) { 150 - /* In theory it could have more than one array on the same key, 151 - * or one array on multiple keys, so we check both */ 152 - if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { 153 - unlink_dma(&lg->dma[i]); 154 - ret = 1; 155 - break; 156 - } 157 - } 158 - return ret; 159 - } 160 - 161 - /*L:340 BIND_DMA: this is the hypercall which sets up an array of "struct 162 - * lguest_dma" for receiving I/O. 163 - * 164 - * The Guest wants to bind an array of "struct lguest_dma"s to a particular key 165 - * to receive input. This only happens when the Guest is setting up a new 166 - * device, so it doesn't have to be very fast. 167 - * 168 - * It returns 1 on a successful registration (it can fail if we hit the limit 169 - * of registrations for this Guest). 
170 - */
171 - int bind_dma(struct lguest *lg,
172 - unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt)
173 - {
174 - unsigned int i;
175 - int ret = 0;
176 - union futex_key key;
177 - /* Futex code needs the mmap_sem. */
178 - struct rw_semaphore *fshared = &current->mm->mmap_sem;
179 -
180 - /* Invalid interrupt? (We could kill the guest here). */
181 - if (interrupt >= LGUEST_IRQS)
182 - return 0;
183 -
184 - /* We need to grab the Big Lguest Lock, because other Guests may be
185 - * trying to look through this Guest's DMAs to send something while
186 - * we're doing this. */
187 - mutex_lock(&lguest_lock);
188 - down_read(fshared);
189 - if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
190 - kill_guest(lg, "bad dma key %#lx", ukey);
191 - goto unlock;
192 - }
193 -
194 - /* We want to keep this key valid once we drop mmap_sem, so we have to
195 - * hold a reference. */
196 - get_futex_key_refs(&key);
197 -
198 - /* If the Guest specified an interrupt of 0, that means they want to
199 - * unregister this array of "struct lguest_dma"s. */
200 - if (interrupt == 0)
201 - ret = unbind_dma(lg, &key, dmas);
202 - else {
203 - /* Look through this Guest's dma array for an unused entry. */
204 - for (i = 0; i < LGUEST_MAX_DMA; i++) {
205 - /* If the interrupt is non-zero, the entry is already
206 - * used. */
207 - if (lg->dma[i].interrupt)
208 - continue;
209 -
210 - /* OK, a free one! Fill in our details. */
211 - lg->dma[i].dmas = dmas;
212 - lg->dma[i].num_dmas = numdmas;
213 - lg->dma[i].next_dma = 0;
214 - lg->dma[i].key = key;
215 - lg->dma[i].guestid = lg->guestid;
216 - lg->dma[i].interrupt = interrupt;
217 -
218 - /* Now we add it to the hash table: the position
219 - * depends on the futex key that we got. */
220 - list_add(&lg->dma[i].list, &dma_hash[hash(&key)]);
221 - /* Success! */
222 - ret = 1;
223 - goto unlock;
224 - }
225 - }
226 - /* If we didn't find a slot to put the key in, drop the reference
227 - * again. */
228 - drop_futex_key_refs(&key);
229 - unlock:
230 - /* Unlock and out. */
231 - up_read(fshared);
232 - mutex_unlock(&lguest_lock);
233 - return ret;
234 - }
235 -
236 - /*L:385 Note that our routines to access a different Guest's memory are called
237 - * lgread_other() and lgwrite_other(): these names emphasize that they are only
238 - * used when the Guest is *not* the current Guest.
239 - *
240 - * The interface for copying from another process's memory is called
241 - * access_process_vm(), with a final argument of 0 for a read, and 1 for a
242 - * write.
243 - *
244 - * We need lgread_other() to read the destination Guest's "struct lguest_dma"
245 - * array. */
246 - static int lgread_other(struct lguest *lg,
247 - void *buf, u32 addr, unsigned bytes)
248 - {
249 - if (!lguest_address_ok(lg, addr, bytes)
250 - || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) {
251 - memset(buf, 0, bytes);
252 - kill_guest(lg, "bad address in registered DMA struct");
253 - return 0;
254 - }
255 - return 1;
256 - }
257 -
258 - /* "lgwrite()" to another Guest: used to update the destination "used_len" once
259 - * we've transferred data into the buffer. 
*/ 260 - static int lgwrite_other(struct lguest *lg, u32 addr, 261 - const void *buf, unsigned bytes) 262 - { 263 - if (!lguest_address_ok(lg, addr, bytes) 264 - || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1) 265 - != bytes)) { 266 - kill_guest(lg, "bad address writing to registered DMA"); 267 - return 0; 268 - } 269 - return 1; 270 - } 271 - 272 - /*L:400 This is the generic engine which copies from a source "struct 273 - * lguest_dma" from this Guest into another Guest's "struct lguest_dma". The 274 - * destination Guest's pages have already been mapped, as contained in the 275 - * pages array. 276 - * 277 - * If you're wondering if there's a nice "copy from one process to another" 278 - * routine, so was I. But Linux isn't really set up to copy between two 279 - * unrelated processes, so we have to write it ourselves. 280 - */ 281 - static u32 copy_data(struct lguest *srclg, 282 - const struct lguest_dma *src, 283 - const struct lguest_dma *dst, 284 - struct page *pages[]) 285 - { 286 - unsigned int totlen, si, di, srcoff, dstoff; 287 - void *maddr = NULL; 288 - 289 - /* We return the total length transferred. */ 290 - totlen = 0; 291 - 292 - /* We keep indexes into the source and destination "struct lguest_dma", 293 - * and an offset within each region. */ 294 - si = di = 0; 295 - srcoff = dstoff = 0; 296 - 297 - /* We loop until the source or destination is exhausted. */ 298 - while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] 299 - && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { 300 - /* We can only transfer the rest of the src buffer, or as much 301 - * as will fit into the destination buffer. */ 302 - u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff); 303 - 304 - /* For systems using "highmem" we need to use kmap() to access 305 - * the page we want. We often use the same page over and over, 306 - * so rather than kmap() it on every loop, we set the maddr 307 - * pointer to NULL when we need to move to the next 308 - * destination page. */ 309 - if (!maddr) 310 - maddr = kmap(pages[di]); 311 - 312 - /* Copy directly from (this Guest's) source address to the 313 - * destination Guest's kmap()ed buffer. Note that maddr points 314 - * to the start of the page: we need to add the offset of the 315 - * destination address and offset within the buffer. */ 316 - 317 - /* FIXME: This is not completely portable. I looked at 318 - * copy_to_user_page(), and some arch's seem to need special 319 - * flushes. x86 is fine. */ 320 - if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, 321 - (void __user *)src->addr[si], len) != 0) { 322 - /* If a copy failed, it's the source's fault. */ 323 - kill_guest(srclg, "bad address in sending DMA"); 324 - totlen = 0; 325 - break; 326 - } 327 - 328 - /* Increment the total and src & dst offsets */ 329 - totlen += len; 330 - srcoff += len; 331 - dstoff += len; 332 - 333 - /* Presumably we reached the end of the src or dest buffers: */ 334 - if (srcoff == src->len[si]) { 335 - /* Move to the next buffer at offset 0 */ 336 - si++; 337 - srcoff = 0; 338 - } 339 - if (dstoff == dst->len[di]) { 340 - /* We need to unmap that destination page and reset 341 - * maddr ready for the next one. */ 342 - kunmap(pages[di]); 343 - maddr = NULL; 344 - di++; 345 - dstoff = 0; 346 - } 347 - } 348 - 349 - /* If we still had a page mapped at the end, unmap now. 
*/ 350 - if (maddr) 351 - kunmap(pages[di]); 352 - 353 - return totlen; 354 - } 355 - 356 - /*L:390 This is how we transfer a "struct lguest_dma" from the source Guest 357 - * (the current Guest which called SEND_DMA) to another Guest. */ 358 - static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, 359 - struct lguest *dstlg, const struct lguest_dma *dst) 360 - { 361 - int i; 362 - u32 ret; 363 - struct page *pages[LGUEST_MAX_DMA_SECTIONS]; 364 - 365 - /* We check that both source and destination "struct lguest_dma"s are 366 - * within the bounds of the source and destination Guests */ 367 - if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) 368 - return 0; 369 - 370 - /* We need to map the pages which correspond to each parts of 371 - * destination buffer. */ 372 - for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { 373 - if (dst->len[i] == 0) 374 - break; 375 - /* get_user_pages() is a complicated function, especially since 376 - * we only want a single page. But it works, and returns the 377 - * number of pages. Note that we're holding the destination's 378 - * mmap_sem, as get_user_pages() requires. */ 379 - if (get_user_pages(dstlg->tsk, dstlg->mm, 380 - dst->addr[i], 1, 1, 1, pages+i, NULL) 381 - != 1) { 382 - /* This means the destination gave us a bogus buffer */ 383 - kill_guest(dstlg, "Error mapping DMA pages"); 384 - ret = 0; 385 - goto drop_pages; 386 - } 387 - } 388 - 389 - /* Now copy the data until we run out of src or dst. */ 390 - ret = copy_data(srclg, src, dst, pages); 391 - 392 - drop_pages: 393 - while (--i >= 0) 394 - put_page(pages[i]); 395 - return ret; 396 - } 397 - 398 - /*L:380 Transferring data from one Guest to another is not as simple as I'd 399 - * like. We've found the "struct lguest_dma_info" bound to the same address as 400 - * the send, we need to copy into it. 401 - * 402 - * This function returns true if the destination array was empty. */ 403 - static int dma_transfer(struct lguest *srclg, 404 - unsigned long udma, 405 - struct lguest_dma_info *dst) 406 - { 407 - struct lguest_dma dst_dma, src_dma; 408 - struct lguest *dstlg; 409 - u32 i, dma = 0; 410 - 411 - /* From the "struct lguest_dma_info" we found in the hash, grab the 412 - * Guest. */ 413 - dstlg = &lguests[dst->guestid]; 414 - /* Read in the source "struct lguest_dma" handed to SEND_DMA. */ 415 - lgread(srclg, &src_dma, udma, sizeof(src_dma)); 416 - 417 - /* We need the destination's mmap_sem, and we already hold the source's 418 - * mmap_sem for the futex key lookup. Normally this would suggest that 419 - * we could deadlock if the destination Guest was trying to send to 420 - * this source Guest at the same time, which is another reason that all 421 - * I/O is done under the big lguest_lock. */ 422 - down_read(&dstlg->mm->mmap_sem); 423 - 424 - /* Look through the destination DMA array for an available buffer. */ 425 - for (i = 0; i < dst->num_dmas; i++) { 426 - /* We keep a "next_dma" pointer which often helps us avoid 427 - * looking at lots of previously-filled entries. */ 428 - dma = (dst->next_dma + i) % dst->num_dmas; 429 - if (!lgread_other(dstlg, &dst_dma, 430 - dst->dmas + dma * sizeof(struct lguest_dma), 431 - sizeof(dst_dma))) { 432 - goto fail; 433 - } 434 - if (!dst_dma.used_len) 435 - break; 436 - } 437 - 438 - /* If we found a buffer, we do the actual data copy. 
*/ 439 - if (i != dst->num_dmas) {
440 - unsigned long used_lenp;
441 - unsigned int ret;
442 -
443 - ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
444 - /* Put used length in the source "struct lguest_dma"'s used_len
445 - * field. It's a little tricky to figure out where that is,
446 - * though. */
447 - lgwrite_u32(srclg,
448 - udma+offsetof(struct lguest_dma, used_len), ret);
449 - /* Transferring 0 bytes is OK if the source buffer was empty. */
450 - if (ret == 0 && src_dma.len[0] != 0)
451 - goto fail;
452 -
453 - /* The destination Guest might be running on a different CPU:
454 - * we have to make sure that it will see the "used_len" field
455 - * change to non-zero *after* it sees the data we copied into
456 - * the buffer. Hence a write memory barrier. */
457 - wmb();
458 - /* Figuring out where the destination's used_len field for this
459 - * "struct lguest_dma" in the array is also a little ugly. */
460 - used_lenp = dst->dmas
461 - + dma * sizeof(struct lguest_dma)
462 - + offsetof(struct lguest_dma, used_len);
463 - lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
464 - /* Move the cursor for next time. */
465 - dst->next_dma++;
466 - }
467 - up_read(&dstlg->mm->mmap_sem);
468 -
469 - /* We trigger the destination interrupt, even if the destination was
470 - * empty and we didn't transfer anything: this gives them a chance to
471 - * wake up and refill. */
472 - set_bit(dst->interrupt, dstlg->irqs_pending);
473 - /* Wake up the destination process. */
474 - wake_up_process(dstlg->tsk);
475 - /* If we passed the last "struct lguest_dma", the receive had no
476 - * buffers left. */
477 - return i == dst->num_dmas;
478 -
479 - fail:
480 - up_read(&dstlg->mm->mmap_sem);
481 - return 0;
482 - }
483 -
484 - /*L:370 This is the counter-side to the BIND_DMA hypercall; the SEND_DMA
485 - * hypercall. We find out who's listening, and send to them. */
486 - void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma)
487 - {
488 - union futex_key key;
489 - int empty = 0;
490 - struct rw_semaphore *fshared = &current->mm->mmap_sem;
491 -
492 - again:
493 - mutex_lock(&lguest_lock);
494 - down_read(fshared);
495 - /* Get the futex key for the key the Guest gave us */
496 - if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
497 - kill_guest(lg, "bad sending DMA key");
498 - goto unlock;
499 - }
500 - /* Since the key must be a multiple of 4, the futex key uses the lower
501 - * bit of the "offset" field (which would always be 0) to indicate a
502 - * mapping which is shared with other processes (ie. Guests). */
503 - if (key.shared.offset & 1) {
504 - struct lguest_dma_info *i;
505 - /* Look through the hash for other Guests. */
506 - list_for_each_entry(i, &dma_hash[hash(&key)], list) {
507 - /* Don't send to ourselves. */
508 - if (i->guestid == lg->guestid)
509 - continue;
510 - if (!key_eq(&key, &i->key))
511 - continue;
512 -
513 - /* If dma_transfer() tells us the destination has no
514 - * available buffers, we increment "empty". */
515 - empty += dma_transfer(lg, udma, i);
516 - break;
517 - }
518 - /* If the destination is empty, we release our locks and
519 - * give the destination Guest a brief chance to restock. */
520 - if (empty == 1) {
521 - /* Give any recipients one chance to restock. */
522 - up_read(&current->mm->mmap_sem);
523 - mutex_unlock(&lguest_lock);
524 - /* Next time, we won't try again. */
525 - empty++;
526 - goto again;
527 - }
528 - } else {
529 - /* Private mapping: Guest is sending to its Launcher. 
We set 530 - * the "dma_is_pending" flag so that the main loop will exit 531 - * and the Launcher's read() from /dev/lguest will return. */ 532 - lg->dma_is_pending = 1; 533 - lg->pending_dma = udma; 534 - lg->pending_key = ukey; 535 - } 536 - unlock: 537 - up_read(fshared); 538 - mutex_unlock(&lguest_lock); 539 - } 540 - /*:*/ 541 - 542 - void release_all_dma(struct lguest *lg) 543 - { 544 - unsigned int i; 545 - 546 - BUG_ON(!mutex_is_locked(&lguest_lock)); 547 - 548 - down_read(&lg->mm->mmap_sem); 549 - for (i = 0; i < LGUEST_MAX_DMA; i++) { 550 - if (lg->dma[i].interrupt) 551 - unlink_dma(&lg->dma[i]); 552 - } 553 - up_read(&lg->mm->mmap_sem); 554 - } 555 - 556 - /*M:007 We only return a single DMA buffer to the Launcher, but it would be 557 - * more efficient to return a pointer to the entire array of DMA buffers, which 558 - * it can cache and choose one whenever it wants. 559 - * 560 - * Currently the Launcher uses a write to /dev/lguest, and the return value is 561 - * the address of the DMA structure with the interrupt number placed in 562 - * dma->used_len. If we wanted to return the entire array, we need to return 563 - * the address, array size and interrupt number: this seems to require an 564 - * ioctl(). :*/ 565 - 566 - /*L:320 This routine looks for a DMA buffer registered by the Guest on the 567 - * given key (using the BIND_DMA hypercall). */ 568 - unsigned long get_dma_buffer(struct lguest *lg, 569 - unsigned long ukey, unsigned long *interrupt) 570 - { 571 - unsigned long ret = 0; 572 - union futex_key key; 573 - struct lguest_dma_info *i; 574 - struct rw_semaphore *fshared = &current->mm->mmap_sem; 575 - 576 - /* Take the Big Lguest Lock to stop other Guests sending this Guest DMA 577 - * at the same time. */ 578 - mutex_lock(&lguest_lock); 579 - /* To match between Guests sharing the same underlying memory we steal 580 - * code from the futex infrastructure. This requires that we hold the 581 - * "mmap_sem" for our process (the Launcher), and pass it to the futex 582 - * code. */ 583 - down_read(fshared); 584 - 585 - /* This can fail if it's not a valid address, or if the address is not 586 - * divisible by 4 (the futex code needs that, we don't really). */ 587 - if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { 588 - kill_guest(lg, "bad registered DMA buffer"); 589 - goto unlock; 590 - } 591 - /* Search the hash table for matching entries (the Launcher can only 592 - * send to its own Guest for the moment, so the entry must be for this 593 - * Guest) */ 594 - list_for_each_entry(i, &dma_hash[hash(&key)], list) { 595 - if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { 596 - unsigned int j; 597 - /* Look through the registered DMA array for an 598 - * available buffer. */ 599 - for (j = 0; j < i->num_dmas; j++) { 600 - struct lguest_dma dma; 601 - 602 - ret = i->dmas + j * sizeof(struct lguest_dma); 603 - lgread(lg, &dma, ret, sizeof(dma)); 604 - if (dma.used_len == 0) 605 - break; 606 - } 607 - /* Store the interrupt the Guest wants when the buffer 608 - * is used. */ 609 - *interrupt = i->interrupt; 610 - break; 611 - } 612 - } 613 - unlock: 614 - up_read(fshared); 615 - mutex_unlock(&lguest_lock); 616 - return ret; 617 - } 618 - /*:*/ 619 - 620 - /*L:410 This really has completed the Launcher. Not only have we now finished 621 - * the longest chapter in our journey, but this also means we are over halfway 622 - * through! 
623 - * 624 - * Enough prevaricating around the bush: it is time for us to dive into the 625 - * core of the Host, in "make Host". 626 - */
···
+57 -132
drivers/lguest/lg.h
··· 1 #ifndef _LGUEST_H 2 #define _LGUEST_H 3 4 - #include <asm/desc.h> 5 - 6 - #define GDT_ENTRY_LGUEST_CS 10 7 - #define GDT_ENTRY_LGUEST_DS 11 8 - #define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) 9 - #define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) 10 - 11 #ifndef __ASSEMBLY__ 12 #include <linux/types.h> 13 #include <linux/init.h> 14 #include <linux/stringify.h> 15 - #include <linux/binfmts.h> 16 - #include <linux/futex.h> 17 #include <linux/lguest.h> 18 #include <linux/lguest_launcher.h> 19 #include <linux/wait.h> 20 #include <linux/err.h> 21 #include <asm/semaphore.h> 22 - #include "irq_vectors.h" 23 24 - #define GUEST_PL 1 25 - 26 - struct lguest_regs 27 - { 28 - /* Manually saved part. */ 29 - unsigned long ebx, ecx, edx; 30 - unsigned long esi, edi, ebp; 31 - unsigned long gs; 32 - unsigned long eax; 33 - unsigned long fs, ds, es; 34 - unsigned long trapnum, errcode; 35 - /* Trap pushed part */ 36 - unsigned long eip; 37 - unsigned long cs; 38 - unsigned long eflags; 39 - unsigned long esp; 40 - unsigned long ss; 41 - }; 42 43 void free_pagetables(void); 44 int init_pagetables(struct page **switcher_page, unsigned int pages); 45 46 - /* Full 4G segment descriptors, suitable for CS and DS. */ 47 - #define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) 48 - #define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) 49 - 50 - struct lguest_dma_info 51 - { 52 - struct list_head list; 53 - union futex_key key; 54 - unsigned long dmas; 55 - u16 next_dma; 56 - u16 num_dmas; 57 - u16 guestid; 58 - u8 interrupt; /* 0 when not registered */ 59 - }; 60 - 61 - /*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He 62 - * reviewed the original code which used "u32" for all page table entries, and 63 - * insisted that it would be far clearer with explicit typing. I thought it 64 - * was overkill, but he was right: it is much clearer than it was before. 65 - * 66 - * We have separate types for the Guest's ptes & pgds and the shadow ptes & 67 - * pgds. There's already a Linux type for these (pte_t and pgd_t) but they 68 - * change depending on kernel config options (PAE). */ 69 - 70 - /* Each entry is identical: lower 12 bits of flags and upper 20 bits for the 71 - * "page frame number" (0 == first physical page, etc). They are different 72 - * types so the compiler will warn us if we mix them improperly. */ 73 - typedef union { 74 - struct { unsigned flags:12, pfn:20; }; 75 - struct { unsigned long val; } raw; 76 - } spgd_t; 77 - typedef union { 78 - struct { unsigned flags:12, pfn:20; }; 79 - struct { unsigned long val; } raw; 80 - } spte_t; 81 - typedef union { 82 - struct { unsigned flags:12, pfn:20; }; 83 - struct { unsigned long val; } raw; 84 - } gpgd_t; 85 - typedef union { 86 - struct { unsigned flags:12, pfn:20; }; 87 - struct { unsigned long val; } raw; 88 - } gpte_t; 89 - 90 - /* We have two convenient macros to convert a "raw" value as handed to us by 91 - * the Guest into the correct Guest PGD or PTE type. */ 92 - #define mkgpte(_val) ((gpte_t){.raw.val = _val}) 93 - #define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) 94 - /*:*/ 95 - 96 struct pgdir 97 { 98 - unsigned long cr3; 99 - spgd_t *pgdir; 100 - }; 101 - 102 - /* This is a guest-specific page (mapped ro) into the guest. */ 103 - struct lguest_ro_state 104 - { 105 - /* Host information we need to restore when we switch back. 
*/ 106 - u32 host_cr3; 107 - struct Xgt_desc_struct host_idt_desc; 108 - struct Xgt_desc_struct host_gdt_desc; 109 - u32 host_sp; 110 - 111 - /* Fields which are used when guest is running. */ 112 - struct Xgt_desc_struct guest_idt_desc; 113 - struct Xgt_desc_struct guest_gdt_desc; 114 - struct i386_hw_tss guest_tss; 115 - struct desc_struct guest_idt[IDT_ENTRIES]; 116 - struct desc_struct guest_gdt[GDT_ENTRIES]; 117 }; 118 119 /* We have two pages shared with guests, per cpu. */ ··· 47 struct lguest_data __user *lguest_data; 48 struct task_struct *tsk; 49 struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ 50 - u16 guestid; 51 u32 pfn_limit; 52 - u32 page_offset; 53 u32 cr2; 54 int halted; 55 int ts; 56 u32 next_hcall; 57 u32 esp1; 58 u8 ss1; 59 60 /* Do we need to stop what we're doing and return to userspace? */ 61 int break_out; ··· 78 struct task_struct *wake; 79 80 unsigned long noirq_start, noirq_end; 81 - int dma_is_pending; 82 - unsigned long pending_dma; /* struct lguest_dma */ 83 - unsigned long pending_key; /* address they're sending to */ 84 85 unsigned int stack_pages; 86 u32 tsc_khz; 87 88 - struct lguest_dma_info dma[LGUEST_MAX_DMA]; 89 - 90 /* Dead? */ 91 const char *dead; 92 93 - /* The GDT entries copied into lguest_ro_state when running. */ 94 - struct desc_struct gdt[GDT_ENTRIES]; 95 - 96 - /* The IDT entries: some copied into lguest_ro_state when running. */ 97 - struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS]; 98 - struct desc_struct syscall_idt; 99 100 /* Virtual clock device */ 101 struct hrtimer hrt; ··· 95 DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); 96 }; 97 98 - extern struct lguest lguests[]; 99 extern struct mutex lguest_lock; 100 101 /* core.c: */ 102 - u32 lgread_u32(struct lguest *lg, unsigned long addr); 103 - void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val); 104 - void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len); 105 - void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len); 106 - int find_free_guest(void); 107 int lguest_address_ok(const struct lguest *lg, 108 unsigned long addr, unsigned long len); 109 int run_guest(struct lguest *lg, unsigned long __user *user); 110 111 112 /* interrupts_and_traps.c: */ 113 void maybe_do_interrupt(struct lguest *lg); ··· 140 const unsigned long *def); 141 void guest_set_clockevent(struct lguest *lg, unsigned long delta); 142 void init_clockdev(struct lguest *lg); 143 144 /* segments.c: */ 145 void setup_default_gdt_entries(struct lguest_ro_state *state); ··· 156 int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); 157 void free_guest_pagetable(struct lguest *lg); 158 void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); 159 - void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i); 160 void guest_pagetable_clear_all(struct lguest *lg); 161 void guest_pagetable_flush_user(struct lguest *lg); 162 - void guest_set_pte(struct lguest *lg, unsigned long cr3, 163 - unsigned long vaddr, gpte_t val); 164 void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); 165 int demand_page(struct lguest *info, unsigned long cr2, int errcode); 166 void pin_page(struct lguest *lg, unsigned long vaddr); 167 168 /* lguest_user.c: */ 169 int lguest_device_init(void); 170 void lguest_device_remove(void); 171 - 172 - /* io.c: */ 173 - void lguest_io_init(void); 174 - int bind_dma(struct lguest *lg, 175 - unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt); 176 - void send_dma(struct 
lguest *info, unsigned long key, unsigned long udma); 177 - void release_all_dma(struct lguest *lg); 178 - unsigned long get_dma_buffer(struct lguest *lg, unsigned long key, 179 - unsigned long *interrupt); 180 181 /* hypercalls.c: */ 182 void do_hypercalls(struct lguest *lg); ··· 221 } while(0) 222 /* (End of aside) :*/ 223 224 - static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) 225 - { 226 - return vaddr - lg->page_offset; 227 - } 228 #endif /* __ASSEMBLY__ */ 229 #endif /* _LGUEST_H */
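The typed page-table unions deleted above (spgd_t, spte_t, gpgd_t and gpte_t) exist purely so the compiler catches shadow/Guest mix-ups. A tiny sketch against those old definitions, with a hypothetical function name:

	static void typed_pte_sketch(void)
	{
		gpte_t gpte = mkgpte(0x00001007);	/* pfn 1, flag bits 0x007 */
		spte_t spte;

		/* Converting explicitly through .raw.val is allowed... */
		spte.raw.val = gpte.raw.val;
		/* ...but plain "spte = gpte;" would not compile: the unions
		 * are distinct types. */
	}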
··· 1 #ifndef _LGUEST_H 2 #define _LGUEST_H 3 4 #ifndef __ASSEMBLY__ 5 #include <linux/types.h> 6 #include <linux/init.h> 7 #include <linux/stringify.h> 8 #include <linux/lguest.h> 9 #include <linux/lguest_launcher.h> 10 #include <linux/wait.h> 11 #include <linux/err.h> 12 #include <asm/semaphore.h> 13 14 + #include <asm/lguest.h> 15 16 void free_pagetables(void); 17 int init_pagetables(struct page **switcher_page, unsigned int pages); 18 19 struct pgdir 20 { 21 + unsigned long gpgdir; 22 + pgd_t *pgdir; 23 }; 24 25 /* We have two pages shared with guests, per cpu. */ ··· 141 struct lguest_data __user *lguest_data; 142 struct task_struct *tsk; 143 struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ 144 u32 pfn_limit; 145 + /* This provides the offset to the base of guest-physical 146 + * memory in the Launcher. */ 147 + void __user *mem_base; 148 + unsigned long kernel_address; 149 u32 cr2; 150 int halted; 151 int ts; 152 u32 next_hcall; 153 u32 esp1; 154 u8 ss1; 155 + 156 + /* If a hypercall was asked for, this points to the arguments. */ 157 + struct hcall_args *hcall; 158 159 /* Do we need to stop what we're doing and return to userspace? */ 160 int break_out; ··· 167 struct task_struct *wake; 168 169 unsigned long noirq_start, noirq_end; 170 + unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ 171 172 unsigned int stack_pages; 173 u32 tsc_khz; 174 175 /* Dead? */ 176 const char *dead; 177 178 + struct lguest_arch arch; 179 180 /* Virtual clock device */ 181 struct hrtimer hrt; ··· 193 DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); 194 }; 195 196 extern struct mutex lguest_lock; 197 198 /* core.c: */ 199 int lguest_address_ok(const struct lguest *lg, 200 unsigned long addr, unsigned long len); 201 + void __lgread(struct lguest *, void *, unsigned long, unsigned); 202 + void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); 203 + 204 + /*L:306 Using memory-copy operations like that is usually inconvenient, so we 205 + * have the following helper macros which read and write a specific type (often 206 + * an unsigned long). 207 + * 208 + * This reads into a variable of the given type then returns that. */ 209 + #define lgread(lg, addr, type) \ 210 + ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; }) 211 + 212 + /* This checks that the variable is of the given type, then writes it out. */ 213 + #define lgwrite(lg, addr, type, val) \ 214 + do { \ 215 + typecheck(type, val); \ 216 + __lgwrite((lg), (addr), &(val), sizeof(val)); \ 217 + } while(0) 218 + /* (end of memory access helper routines) :*/ 219 + 220 int run_guest(struct lguest *lg, unsigned long __user *user); 221 222 + /* Helper macros to obtain the first 12 or the last 20 bits; this is only the 223 + * first step in the migration to the kernel types. pte_pfn is already defined 224 + * in the kernel. 
*/ 225 + #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) 226 + #define pte_flags(x) (pte_val(x) & ~PAGE_MASK) 227 + #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) 228 229 /* interrupts_and_traps.c: */ 230 void maybe_do_interrupt(struct lguest *lg); ··· 219 const unsigned long *def); 220 void guest_set_clockevent(struct lguest *lg, unsigned long delta); 221 void init_clockdev(struct lguest *lg); 222 + bool check_syscall_vector(struct lguest *lg); 223 + int init_interrupts(void); 224 + void free_interrupts(void); 225 226 /* segments.c: */ 227 void setup_default_gdt_entries(struct lguest_ro_state *state); ··· 232 int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); 233 void free_guest_pagetable(struct lguest *lg); 234 void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); 235 + void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); 236 void guest_pagetable_clear_all(struct lguest *lg); 237 void guest_pagetable_flush_user(struct lguest *lg); 238 + void guest_set_pte(struct lguest *lg, unsigned long gpgdir, 239 + unsigned long vaddr, pte_t val); 240 void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); 241 int demand_page(struct lguest *info, unsigned long cr2, int errcode); 242 void pin_page(struct lguest *lg, unsigned long vaddr); 243 + unsigned long guest_pa(struct lguest *lg, unsigned long vaddr); 244 + void page_table_guest_data_init(struct lguest *lg); 245 + 246 + /* <arch>/core.c: */ 247 + void lguest_arch_host_init(void); 248 + void lguest_arch_host_fini(void); 249 + void lguest_arch_run_guest(struct lguest *lg); 250 + void lguest_arch_handle_trap(struct lguest *lg); 251 + int lguest_arch_init_hypercalls(struct lguest *lg); 252 + int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args); 253 + void lguest_arch_setup_regs(struct lguest *lg, unsigned long start); 254 + 255 + /* <arch>/switcher.S: */ 256 + extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; 257 258 /* lguest_user.c: */ 259 int lguest_device_init(void); 260 void lguest_device_remove(void); 261 262 /* hypercalls.c: */ 263 void do_hypercalls(struct lguest *lg); ··· 292 } while(0) 293 /* (End of aside) :*/ 294 295 #endif /* __ASSEMBLY__ */ 296 #endif /* _LGUEST_H */
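The lgread()/lgwrite() macros above are what replace the old lgread_u32()/lgwrite_u32() pair. A quick usage sketch; the function and the Guest address are hypothetical:

	static void accessor_sketch(struct lguest *lg, unsigned long addr)
	{
		/* Expands to a __lgread() of sizeof(u32) bytes and yields the
		 * value read from Guest memory. */
		u32 v = lgread(lg, addr, u32);

		/* typecheck() rejects anything which isn't really a u32. */
		lgwrite(lg, addr, u32, v);
	}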
+32 -70
drivers/lguest/lguest.c arch/x86/lguest/boot.c
··· 55 #include <linux/clockchips.h> 56 #include <linux/lguest.h> 57 #include <linux/lguest_launcher.h> 58 - #include <linux/lguest_bus.h> 59 #include <asm/paravirt.h> 60 #include <asm/param.h> 61 #include <asm/page.h> ··· 65 #include <asm/e820.h> 66 #include <asm/mce.h> 67 #include <asm/io.h> 68 69 /*G:010 Welcome to the Guest! 70 * ··· 86 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, 87 .noirq_start = (u32)lguest_noirq_start, 88 .noirq_end = (u32)lguest_noirq_end, 89 .blocked_interrupts = { 1 }, /* Block timer interrupts */ 90 }; 91 - struct lguest_device_desc *lguest_devices; 92 static cycle_t clock_base; 93 94 /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first ··· 148 /* Table full, so do normal hcall which will flush table. */ 149 hcall(call, arg1, arg2, arg3); 150 } else { 151 - lguest_data.hcalls[next_call].eax = call; 152 - lguest_data.hcalls[next_call].edx = arg1; 153 - lguest_data.hcalls[next_call].ebx = arg2; 154 - lguest_data.hcalls[next_call].ecx = arg3; 155 /* Arguments must all be written before we mark it to go */ 156 wmb(); 157 lguest_data.hcall_status[next_call] = 0; ··· 161 local_irq_restore(flags); 162 } 163 /*:*/ 164 - 165 - /* Wrappers for the SEND_DMA and BIND_DMA hypercalls. This is mainly because 166 - * Jeff Garzik complained that __pa() should never appear in drivers, and this 167 - * helps remove most of them. But also, it wraps some ugliness. */ 168 - void lguest_send_dma(unsigned long key, struct lguest_dma *dma) 169 - { 170 - /* The hcall might not write this if something goes wrong */ 171 - dma->used_len = 0; 172 - hcall(LHCALL_SEND_DMA, key, __pa(dma), 0); 173 - } 174 - 175 - int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, 176 - unsigned int num, u8 irq) 177 - { 178 - /* This is the only hypercall which actually wants 5 arguments, and we 179 - * only support 4. Fortunately the interrupt number is always less 180 - * than 256, so we can pack it with the number of dmas in the final 181 - * argument. */ 182 - if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq)) 183 - return -ENOMEM; 184 - return 0; 185 - } 186 - 187 - /* Unbinding is the same hypercall as binding, but with 0 num & irq. */ 188 - void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas) 189 - { 190 - hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0); 191 - } 192 - 193 - /* For guests, device memory can be used as normal memory, so we cast away the 194 - * __iomem to quieten sparse. */ 195 - void *lguest_map(unsigned long phys_addr, unsigned long pages) 196 - { 197 - return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages); 198 - } 199 - 200 - void lguest_unmap(void *addr) 201 - { 202 - iounmap((__force void __iomem *)addr); 203 - } 204 205 /*G:033 206 * Here are our first native-instruction replacements: four functions for ··· 642 .mask = CLOCKSOURCE_MASK(64), 643 .mult = 1 << 22, 644 .shift = 22, 645 }; 646 647 /* The "scheduler clock" is just our real clock, adjusted to start at zero */ ··· 724 * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either 725 * way, the "rating" is initialized so high that it's always chosen 726 * over any other clocksource. 
*/ 727 - if (lguest_data.tsc_khz) { 728 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, 729 lguest_clock.shift); 730 - lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS; 731 - } 732 clock_base = lguest_clock_read(); 733 clocksource_register(&lguest_clock); 734 ··· 850 return "LGUEST"; 851 } 852 853 /*G:050 854 * Patching (Powerfully Placating Performance Pedants) 855 * ··· 928 /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops 929 * structures in the kernel provide points for (almost) every routine we have 930 * to override to avoid privileged instructions. */ 931 - __init void lguest_init(void *boot) 932 { 933 - /* Copy boot parameters first: the Launcher put the physical location 934 - * in %esi, and head.S converted that to a virtual address and handed 935 - * it to us. We use "__memcpy" because "memcpy" sometimes tries to do 936 - * tricky things to go faster, and we're not ready for that. */ 937 - __memcpy(&boot_params, boot, PARAM_SIZE); 938 - /* The boot parameters also tell us where the command-line is: save 939 - * that, too. */ 940 - __memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr), 941 - COMMAND_LINE_SIZE); 942 - 943 /* We're under lguest, paravirt is enabled, and we're running at 944 * privilege level 1, not 0 as normal. */ 945 pv_info.name = "lguest"; ··· 1001 1002 /*G:070 Now we've seen all the paravirt_ops, we return to 1003 * lguest_init() where the rest of the fairly chaotic boot setup 1004 - * occurs. 1005 - * 1006 - * The Host expects our first hypercall to tell it where our "struct 1007 - * lguest_data" is, so we do that first. */ 1008 - hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); 1009 1010 /* The native boot code sets up initial page tables immediately after 1011 * the kernel itself, and sets init_pg_tables_end so they're not ··· 1013 /* Load the %fs segment register (the per-cpu segment register) with 1014 * the normal data segment to get through booting. */ 1015 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1016 - 1017 - /* Clear the part of the kernel data which is expected to be zero. 1018 - * Normally it will be anyway, but if we're loading from a bzImage with 1019 - * CONFIG_RELOCATALE=y, the relocations will be sitting here. */ 1020 - memset(__bss_start, 0, __bss_stop - __bss_start); 1021 1022 /* The Host uses the top of the Guest's virtual address space for the 1023 * Host<->Guest Switcher, and it tells us how much it needs in ··· 1050 * virtual console" driver written by the PowerPC people, which we also 1051 * adapted for lguest's use. */ 1052 add_preferred_console("hvc", 0, NULL); 1053 1054 /* Last of all, we set the power management poweroff hook to point to 1055 * the Guest routine to power off. */
··· 55 #include <linux/clockchips.h> 56 #include <linux/lguest.h> 57 #include <linux/lguest_launcher.h> 58 + #include <linux/virtio_console.h> 59 #include <asm/paravirt.h> 60 #include <asm/param.h> 61 #include <asm/page.h> ··· 65 #include <asm/e820.h> 66 #include <asm/mce.h> 67 #include <asm/io.h> 68 + #include <asm/i387.h> 69 70 /*G:010 Welcome to the Guest! 71 * ··· 85 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, 86 .noirq_start = (u32)lguest_noirq_start, 87 .noirq_end = (u32)lguest_noirq_end, 88 + .kernel_address = PAGE_OFFSET, 89 .blocked_interrupts = { 1 }, /* Block timer interrupts */ 90 + .syscall_vec = SYSCALL_VECTOR, 91 }; 92 static cycle_t clock_base; 93 94 /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first ··· 146 /* Table full, so do normal hcall which will flush table. */ 147 hcall(call, arg1, arg2, arg3); 148 } else { 149 + lguest_data.hcalls[next_call].arg0 = call; 150 + lguest_data.hcalls[next_call].arg1 = arg1; 151 + lguest_data.hcalls[next_call].arg2 = arg2; 152 + lguest_data.hcalls[next_call].arg3 = arg3; 153 /* Arguments must all be written before we mark it to go */ 154 wmb(); 155 lguest_data.hcall_status[next_call] = 0; ··· 159 local_irq_restore(flags); 160 } 161 /*:*/ 162 163 /*G:033 164 * Here are our first native-instruction replacements: four functions for ··· 680 .mask = CLOCKSOURCE_MASK(64), 681 .mult = 1 << 22, 682 .shift = 22, 683 + .flags = CLOCK_SOURCE_IS_CONTINUOUS, 684 }; 685 686 /* The "scheduler clock" is just our real clock, adjusted to start at zero */ ··· 761 * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either 762 * way, the "rating" is initialized so high that it's always chosen 763 * over any other clocksource. */ 764 + if (lguest_data.tsc_khz) 765 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, 766 lguest_clock.shift); 767 clock_base = lguest_clock_read(); 768 clocksource_register(&lguest_clock); 769 ··· 889 return "LGUEST"; 890 } 891 892 + /* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to 893 + * produce console output. */ 894 + static __init int early_put_chars(u32 vtermno, const char *buf, int count) 895 + { 896 + char scratch[17]; 897 + unsigned int len = count; 898 + 899 + if (len > sizeof(scratch) - 1) 900 + len = sizeof(scratch) - 1; 901 + scratch[len] = '\0'; 902 + memcpy(scratch, buf, len); 903 + hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0); 904 + 905 + /* This routine returns the number of bytes actually written. */ 906 + return len; 907 + } 908 + 909 /*G:050 910 * Patching (Powerfully Placating Performance Pedants) 911 * ··· 950 /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops 951 * structures in the kernel provide points for (almost) every routine we have 952 * to override to avoid privileged instructions. */ 953 + __init void lguest_init(void) 954 { 955 /* We're under lguest, paravirt is enabled, and we're running at 956 * privilege level 1, not 0 as normal. */ 957 pv_info.name = "lguest"; ··· 1033 1034 /*G:070 Now we've seen all the paravirt_ops, we return to 1035 * lguest_init() where the rest of the fairly chaotic boot setup 1036 + * occurs. */ 1037 1038 /* The native boot code sets up initial page tables immediately after 1039 * the kernel itself, and sets init_pg_tables_end so they're not ··· 1049 /* Load the %fs segment register (the per-cpu segment register) with 1050 * the normal data segment to get through booting. 
*/ 1051 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1052 1053 /* The Host uses the top of the Guest's virtual address space for the 1054 * Host<->Guest Switcher, and it tells us how much it needs in ··· 1091 * virtual console" driver written by the PowerPC people, which we also 1092 * adapted for lguest's use. */ 1093 add_preferred_console("hvc", 0, NULL); 1094 + 1095 + /* Register our very early console. */ 1096 + virtio_cons_early_init(early_put_chars); 1097 1098 /* Last of all, we set the power management poweroff hook to point to 1099 * the Guest routine to power off. */
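Note that early_put_chars() copies at most 16 bytes per LHCALL_NOTIFY and returns how many it wrote, so a caller with a longer message loops on the return value. A sketch; this wrapper is hypothetical, not part of the patch:

	static __init void early_print(const char *s, int len)
	{
		while (len > 0) {
			int done = early_put_chars(0, s, len);

			s += done;
			len -= done;
		}
	}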
+34 -12
drivers/lguest/lguest_asm.S arch/x86/lguest/i386_head.S
··· 1 #include <linux/linkage.h> 2 #include <linux/lguest.h> 3 #include <asm/asm-offsets.h> 4 #include <asm/thread_info.h> 5 #include <asm/processor-flags.h> 6 7 - /*G:020 This is where we begin: we have a magic signature which the launcher 8 - * looks for. The plan is that the Linux boot protocol will be extended with a 9 - * "platform type" field which will guide us here from the normal entry point, 10 - * but for the moment this suffices. The normal boot code uses %esi for the 11 - * boot header, so we do too. We convert it to a virtual address by adding 12 - * PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax). 13 * 14 * The .section line puts this code in .init.text so it will be discarded after 15 * boot. */ 16 .section .init.text, "ax", @progbits 17 - .ascii "GenuineLguest" 18 - /* Set up initial stack. */ 19 - movl $(init_thread_union+THREAD_SIZE),%esp 20 - movl %esi, %eax 21 - addl $__PAGE_OFFSET, %eax 22 - jmp lguest_init 23 24 /*G:055 We create a macro which puts the assembler code between lgstart_ and 25 * lgend_ markers. These templates are put in the .text section: they can't be
··· 1 #include <linux/linkage.h> 2 #include <linux/lguest.h> 3 + #include <asm/lguest_hcall.h> 4 #include <asm/asm-offsets.h> 5 #include <asm/thread_info.h> 6 #include <asm/processor-flags.h> 7 8 + /*G:020 This is where we begin: head.S notes that the boot header's platform 9 + * type field is "1" (lguest), so calls us here. The boot header is in %esi. 10 + * 11 + * WARNING: be very careful here! We're running at addresses equal to physical 12 + * addresses (around 0), not above PAGE_OFFSET as most code expects 13 + * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any 14 + * data. 15 * 16 * The .section line puts this code in .init.text so it will be discarded after 17 * boot. */ 18 .section .init.text, "ax", @progbits 19 + ENTRY(lguest_entry) 20 + /* Make initial hypercall now, so we can set up the pagetables. */ 21 + movl $LHCALL_LGUEST_INIT, %eax 22 + movl $lguest_data - __PAGE_OFFSET, %edx 23 + int $LGUEST_TRAP_ENTRY 24 + 25 + /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl 26 + * instruction uses %esi implicitly. */ 27 + movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi 28 + 29 + /* Copy first 32 entries of page directory to __PAGE_OFFSET entries. 30 + * This means the first 128M of kernel memory will be mapped at 31 + * PAGE_OFFSET where the kernel expects to run. This will get it far 32 + * enough through boot to switch to its own pagetables. */ 33 + movl $32, %ecx 34 + movl %esi, %edi 35 + addl $((__PAGE_OFFSET >> 22) * 4), %edi 36 + rep 37 + movsl 38 + 39 + /* Set up the initial stack so we can run C code. */ 40 + movl $(init_thread_union+THREAD_SIZE),%esp 41 + 42 + /* Jumps are relative, and we're running __PAGE_OFFSET too low at the 43 + * moment. */ 44 + jmp lguest_init+__PAGE_OFFSET 45 46 /*G:055 We create a macro which puts the assembler code between lgstart_ and 47 * lgend_ markers. These templates are put in the .text section: they can't be
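The addl arithmetic above is easier to check in C. Assuming the usual CONFIG_PAGE_OFFSET of 0xC0000000 and 4MB per page-directory entry (no PAE), __PAGE_OFFSET >> 22 is entry 768, which is byte offset 768 * 4 == 3072, and the 32 copied entries cover 32 * 4MB == the 128M the comment mentions. As a sketch:

	/* Hypothetical C rendering of the "rep movsl" loop above. */
	static void pgdir_alias_sketch(unsigned long *pgdir)
	{
		unsigned int i, base = 0xC0000000UL >> 22;	/* == 768 */

		/* Alias entries 0..31 up at the kernel's virtual base. */
		for (i = 0; i < 32; i++)
			pgdir[base + i] = pgdir[i];
	}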
-218
drivers/lguest/lguest_bus.c
··· 1 - /*P:050 Lguest guests use a very simple bus for devices. It's a simple array 2 - * of device descriptors contained just above the top of normal memory. The 3 - * lguest bus is 80% tedious boilerplate code. :*/ 4 - #include <linux/init.h> 5 - #include <linux/bootmem.h> 6 - #include <linux/lguest_bus.h> 7 - #include <asm/io.h> 8 - #include <asm/paravirt.h> 9 - 10 - static ssize_t type_show(struct device *_dev, 11 - struct device_attribute *attr, char *buf) 12 - { 13 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 14 - return sprintf(buf, "%hu", lguest_devices[dev->index].type); 15 - } 16 - static ssize_t features_show(struct device *_dev, 17 - struct device_attribute *attr, char *buf) 18 - { 19 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 20 - return sprintf(buf, "%hx", lguest_devices[dev->index].features); 21 - } 22 - static ssize_t pfn_show(struct device *_dev, 23 - struct device_attribute *attr, char *buf) 24 - { 25 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 26 - return sprintf(buf, "%u", lguest_devices[dev->index].pfn); 27 - } 28 - static ssize_t status_show(struct device *_dev, 29 - struct device_attribute *attr, char *buf) 30 - { 31 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 32 - return sprintf(buf, "%hx", lguest_devices[dev->index].status); 33 - } 34 - static ssize_t status_store(struct device *_dev, struct device_attribute *attr, 35 - const char *buf, size_t count) 36 - { 37 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 38 - if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1) 39 - return -EINVAL; 40 - return count; 41 - } 42 - static struct device_attribute lguest_dev_attrs[] = { 43 - __ATTR_RO(type), 44 - __ATTR_RO(features), 45 - __ATTR_RO(pfn), 46 - __ATTR(status, 0644, status_show, status_store), 47 - __ATTR_NULL 48 - }; 49 - 50 - /*D:130 The generic bus infrastructure requires a function which says whether a 51 - * device matches a driver. For us, it is simple: "struct lguest_driver" 52 - * contains a "device_type" field which indicates what type of device it can 53 - * handle, so we just cast the args and compare: */ 54 - static int lguest_dev_match(struct device *_dev, struct device_driver *_drv) 55 - { 56 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 57 - struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv); 58 - 59 - return (drv->device_type == lguest_devices[dev->index].type); 60 - } 61 - /*:*/ 62 - 63 - struct lguest_bus { 64 - struct bus_type bus; 65 - struct device dev; 66 - }; 67 - 68 - static struct lguest_bus lguest_bus = { 69 - .bus = { 70 - .name = "lguest", 71 - .match = lguest_dev_match, 72 - .dev_attrs = lguest_dev_attrs, 73 - }, 74 - .dev = { 75 - .parent = NULL, 76 - .bus_id = "lguest", 77 - } 78 - }; 79 - 80 - /*D:140 This is the callback which occurs once the bus infrastructure matches 81 - * up a device and driver, ie. in response to add_lguest_device() calling 82 - * device_register(), or register_lguest_driver() calling driver_register(). 83 - * 84 - * At the moment it's always the latter: the devices are added first, since 85 - * scan_devices() is called from a "core_initcall", and the drivers themselves 86 - * called later as a normal "initcall". But it would work the other way too. 87 - * 88 - * So now we have the happy couple, we add the status bit to indicate that we 89 - * found a driver. 
If the driver truly loves the device, it will return 90 - * happiness from its probe function (ok, perhaps this wasn't my greatest 91 - * analogy), and we set the final "driver ok" bit so the Host sees it's all 92 - * green. */ 93 - static int lguest_dev_probe(struct device *_dev) 94 - { 95 - int ret; 96 - struct lguest_device*dev = container_of(_dev,struct lguest_device,dev); 97 - struct lguest_driver*drv = container_of(dev->dev.driver, 98 - struct lguest_driver, drv); 99 - 100 - lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER; 101 - ret = drv->probe(dev); 102 - if (ret == 0) 103 - lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK; 104 - return ret; 105 - } 106 - 107 - /* The last part of the bus infrastructure is the function lguest drivers use 108 - * to register themselves. Firstly, we do nothing if there's no lguest bus 109 - * (ie. this is not a Guest), otherwise we fill in the embedded generic "struct 110 - * driver" fields and call the generic driver_register(). */ 111 - int register_lguest_driver(struct lguest_driver *drv) 112 - { 113 - if (!lguest_devices) 114 - return 0; 115 - 116 - drv->drv.bus = &lguest_bus.bus; 117 - drv->drv.name = drv->name; 118 - drv->drv.owner = drv->owner; 119 - drv->drv.probe = lguest_dev_probe; 120 - 121 - return driver_register(&drv->drv); 122 - } 123 - 124 - /* At the moment we build all the drivers into the kernel because they're so 125 - * simple: 8144 bytes for all three of them as I type this. And as the console 126 - * really needs to be built in, it's actually only 3527 bytes for the network 127 - * and block drivers. 128 - * 129 - * If they get complex it will make sense for them to be modularized, so we 130 - * need to explicitly export the symbol. 131 - * 132 - * I don't think non-GPL modules make sense, so it's a GPL-only export. 133 - */ 134 - EXPORT_SYMBOL_GPL(register_lguest_driver); 135 - 136 - /*D:120 This is the core of the lguest bus: actually adding a new device. 137 - * It's a separate function because it's neater that way, and because an 138 - * earlier version of the code supported hotplug and unplug. They were removed 139 - * early on because they were never used. 140 - * 141 - * As Andrew Tridgell says, "Untested code is buggy code". 142 - * 143 - * It's worth reading this carefully: we start with an index into the array of 144 - * "struct lguest_device_desc"s indicating the device which is new: */ 145 - static void add_lguest_device(unsigned int index) 146 - { 147 - struct lguest_device *new; 148 - 149 - /* Each "struct lguest_device_desc" has a "status" field, which the 150 - * Guest updates as the device is probed. In the worst case, the Host 151 - * can look at these bits to tell what part of device setup failed, 152 - * even if the console isn't available. */ 153 - lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE; 154 - new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL); 155 - if (!new) { 156 - printk(KERN_EMERG "Cannot allocate lguest device %u\n", index); 157 - lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; 158 - return; 159 - } 160 - 161 - /* The "struct lguest_device" setup is pretty straight-forward example 162 - * code. */ 163 - new->index = index; 164 - new->private = NULL; 165 - memset(&new->dev, 0, sizeof(new->dev)); 166 - new->dev.parent = &lguest_bus.dev; 167 - new->dev.bus = &lguest_bus.bus; 168 - sprintf(new->dev.bus_id, "%u", index); 169 - 170 - /* device_register() causes the bus infrastructure to look for a 171 - * matching driver. 
*/ 172 - if (device_register(&new->dev) != 0) { 173 - printk(KERN_EMERG "Cannot register lguest device %u\n", index); 174 - lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; 175 - kfree(new); 176 - } 177 - } 178 - 179 - /*D:110 scan_devices() simply iterates through the device array. The type 0 180 - * is reserved to mean "no device", and anything else means we have found a 181 - * device: add it. */ 182 - static void scan_devices(void) 183 - { 184 - unsigned int i; 185 - 186 - for (i = 0; i < LGUEST_MAX_DEVICES; i++) 187 - if (lguest_devices[i].type) 188 - add_lguest_device(i); 189 - } 190 - 191 - /*D:100 Fairly early in boot, lguest_bus_init() is called to set up the lguest 192 - * bus. We check that we are a Guest by checking paravirt_ops.name: there are 193 - * other ways of checking, but this seems most obvious to me. 194 - * 195 - * So we can access the array of "struct lguest_device_desc"s easily, we map 196 - * that memory and store the pointer in the global "lguest_devices". Then we 197 - * register the bus with the core. Doing two registrations seems clunky to me, 198 - * but it seems to be the correct sysfs incantation. 199 - * 200 - * Finally we call scan_devices() which adds all the devices found in the 201 - * "struct lguest_device_desc" array. */ 202 - static int __init lguest_bus_init(void) 203 - { 204 - if (strcmp(pv_info.name, "lguest") != 0) 205 - return 0; 206 - 207 - /* Devices are in a single page above top of "normal" mem */ 208 - lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); 209 - 210 - if (bus_register(&lguest_bus.bus) != 0 211 - || device_register(&lguest_bus.dev) != 0) 212 - panic("lguest bus registration failed"); 213 - 214 - scan_devices(); 215 - return 0; 216 - } 217 - /* Do this after core stuff, before devices. */ 218 - postcore_initcall(lguest_bus_init);
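For contrast with the virtio replacement which follows, this is roughly what a driver for the bus deleted above looked like. The field names come from the code just removed; the driver itself and its device type number are invented:

	static int demo_probe(struct lguest_device *lgdev)
	{
		/* lgdev->index locates our descriptor in lguest_devices[]. */
		return 0;
	}

	static struct lguest_driver demo_driver = {
		.name = "demo",
		.owner = THIS_MODULE,
		.device_type = 4,	/* invented type number */
		.probe = demo_probe,
	};

	static int __init demo_init(void)
	{
		/* Harmlessly returns 0 on non-lguest kernels, as seen above. */
		return register_lguest_driver(&demo_driver);
	}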
···
+373
drivers/lguest/lguest_device.c
···
··· 1 + /*P:050 Lguest guests use a very simple method to describe devices. It's a 2 + * series of device descriptors contained just above the top of normal 3 + * memory. 4 + * 5 + * We use the standard "virtio" device infrastructure, which provides us with a 6 + * console, a network and a block driver. Each one expects some configuration 7 + * information and a "virtqueue" mechanism to send and receive data. :*/ 8 + #include <linux/init.h> 9 + #include <linux/bootmem.h> 10 + #include <linux/lguest_launcher.h> 11 + #include <linux/virtio.h> 12 + #include <linux/virtio_config.h> 13 + #include <linux/interrupt.h> 14 + #include <linux/virtio_ring.h> 15 + #include <linux/err.h> 16 + #include <asm/io.h> 17 + #include <asm/paravirt.h> 18 + #include <asm/lguest_hcall.h> 19 + 20 + /* The pointer to our (page) of device descriptions. */ 21 + static void *lguest_devices; 22 + 23 + /* Unique numbering for lguest devices. */ 24 + static unsigned int dev_index; 25 + 26 + /* For Guests, device memory can be used as normal memory, so we cast away the 27 + * __iomem to quieten sparse. */ 28 + static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) 29 + { 30 + return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages); 31 + } 32 + 33 + static inline void lguest_unmap(void *addr) 34 + { 35 + iounmap((__force void __iomem *)addr); 36 + } 37 + 38 + /*D:100 Each lguest device is just a virtio device plus a pointer to its entry 39 + * in the lguest_devices page. */ 40 + struct lguest_device { 41 + struct virtio_device vdev; 42 + 43 + /* The entry in the lguest_devices page for this device. */ 44 + struct lguest_device_desc *desc; 45 + }; 46 + 47 + /* Since the virtio infrastructure hands us a pointer to the virtio_device all 48 + * the time, it helps to have a curt macro to get a pointer to the struct 49 + * lguest_device it's enclosed in. */ 50 + #define to_lgdev(vdev) container_of(vdev, struct lguest_device, vdev) 51 + 52 + /*D:130 53 + * Device configurations 54 + * 55 + * The configuration information for a device consists of a series of fields. 56 + * The device will look for these fields during setup. 57 + * 58 + * For us these fields come immediately after that device's descriptor in the 59 + * lguest_devices page. 60 + * 61 + * Each field starts with a "type" byte, a "length" byte, then that number of 62 + * bytes of configuration information. The device descriptor tells us the 63 + * total configuration length so we know when we've reached the last field. */ 64 + 65 + /* type + length bytes */ 66 + #define FHDR_LEN 2 67 + 68 + /* This finds the first field of a given type for a device's configuration. */ 69 + static void *lg_find(struct virtio_device *vdev, u8 type, unsigned int *len) 70 + { 71 + struct lguest_device_desc *desc = to_lgdev(vdev)->desc; 72 + int i; 73 + 74 + for (i = 0; i < desc->config_len; i += FHDR_LEN + desc->config[i+1]) { 75 + if (desc->config[i] == type) { 76 + /* Mark it used, so Host can know we looked at it, and 77 + * also so we won't find the same one twice. */ 78 + desc->config[i] |= 0x80; 79 + /* Remember, the second byte is the length. */ 80 + *len = desc->config[i+1]; 81 + /* We return a pointer to the field header. */ 82 + return desc->config + i; 83 + } 84 + } 85 + 86 + /* Not found: return NULL for failure. */ 87 + return NULL; 88 + } 89 + 90 + /* Once they've found a field, getting a copy of it is easy. 
*/ 91 + static void lg_get(struct virtio_device *vdev, void *token, 92 + void *buf, unsigned len) 93 + { 94 + /* Check they didn't ask for more than the length of the field! */ 95 + BUG_ON(len > ((u8 *)token)[1]); 96 + memcpy(buf, token + FHDR_LEN, len); 97 + } 98 + 99 + /* Setting the contents is also trivial. */ 100 + static void lg_set(struct virtio_device *vdev, void *token, 101 + const void *buf, unsigned len) 102 + { 103 + BUG_ON(len > ((u8 *)token)[1]); 104 + memcpy(token + FHDR_LEN, buf, len); 105 + } 106 + 107 + /* The operations to get and set the status word just access the status field 108 + * of the device descriptor. */ 109 + static u8 lg_get_status(struct virtio_device *vdev) 110 + { 111 + return to_lgdev(vdev)->desc->status; 112 + } 113 + 114 + static void lg_set_status(struct virtio_device *vdev, u8 status) 115 + { 116 + to_lgdev(vdev)->desc->status = status; 117 + } 118 + 119 + /* 120 + * Virtqueues 121 + * 122 + * The other piece of infrastructure virtio needs is a "virtqueue": a way of 123 + * the Guest device registering buffers for the other side to read from or 124 + * write into (ie. send and receive buffers). Each device can have multiple 125 + * virtqueues: for example the console has one queue for sending and one for 126 + * receiving. 127 + * 128 + * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue 129 + * already exists in virtio_ring.c. We just need to connect it up. 130 + * 131 + * We start with the information we need to keep about each virtqueue. 132 + */ 133 + 134 + /*D:140 This is the information we remember about each virtqueue. */ 135 + struct lguest_vq_info 136 + { 137 + /* A copy of the information contained in the device config. */ 138 + struct lguest_vqconfig config; 139 + 140 + /* The address where we mapped the virtio ring, so we can unmap it. */ 141 + void *pages; 142 + }; 143 + 144 + /* When the virtio_ring code wants to prod the Host, it calls us here and we 145 + * make a hypercall. We hand the page number of the virtqueue so the Host 146 + * knows which virtqueue we're talking about. */ 147 + static void lg_notify(struct virtqueue *vq) 148 + { 149 + /* We store our virtqueue information in the "priv" pointer of the 150 + * virtqueue structure. */ 151 + struct lguest_vq_info *lvq = vq->priv; 152 + 153 + hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0); 154 + } 155 + 156 + /* This routine finds the first virtqueue described in the configuration of 157 + * this device and sets it up. 158 + * 159 + * This is kind of an ugly duckling. It'd be nicer to have a standard 160 + * representation of a virtqueue in the configuration space, but it seems that 161 + * everyone wants to do it differently. The KVM guys want the Guest to 162 + * allocate its own pages and tell the Host where they are, but for lguest it's 163 + * simpler for the Host to simply tell us where the pages are. 164 + * 165 + * So we provide devices with a "find virtqueue and set it up" function. */ 166 + static struct virtqueue *lg_find_vq(struct virtio_device *vdev, 167 + bool (*callback)(struct virtqueue *vq)) 168 + { 169 + struct lguest_vq_info *lvq; 170 + struct virtqueue *vq; 171 + unsigned int len; 172 + void *token; 173 + int err; 174 + 175 + /* Look for a field of the correct type to mark a virtqueue. Note that 176 + * if this succeeds, then the type will be changed so it won't be found 177 + * again, and future lg_find_vq() calls will find the next 178 + * virtqueue (if any). 
*/ 179 + token = vdev->config->find(vdev, VIRTIO_CONFIG_F_VIRTQUEUE, &len); 180 + if (!token) 181 + return ERR_PTR(-ENOENT); 182 + 183 + lvq = kmalloc(sizeof(*lvq), GFP_KERNEL); 184 + if (!lvq) 185 + return ERR_PTR(-ENOMEM); 186 + 187 + /* Note: we could use a configuration space inside here, just like we 188 + * do for the device. This would allow expansion in future, because 189 + * our configuration system is designed to be expansible. But this is 190 + * way easier. */ 191 + if (len != sizeof(lvq->config)) { 192 + dev_err(&vdev->dev, "Unexpected virtio config len %u\n", len); 193 + err = -EIO; 194 + goto free_lvq; 195 + } 196 + /* Make a copy of the "struct lguest_vqconfig" field. We need a copy 197 + * because the config space might not be aligned correctly. */ 198 + vdev->config->get(vdev, token, &lvq->config, sizeof(lvq->config)); 199 + 200 + /* Figure out how many pages the ring will take, and map that memory */ 201 + lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT, 202 + DIV_ROUND_UP(vring_size(lvq->config.num), 203 + PAGE_SIZE)); 204 + if (!lvq->pages) { 205 + err = -ENOMEM; 206 + goto free_lvq; 207 + } 208 + 209 + /* OK, tell virtio_ring.c to set up a virtqueue now we know its size 210 + * and we've got a pointer to its pages. */ 211 + vq = vring_new_virtqueue(lvq->config.num, vdev, lvq->pages, 212 + lg_notify, callback); 213 + if (!vq) { 214 + err = -ENOMEM; 215 + goto unmap; 216 + } 217 + 218 + /* Tell the interrupt for this virtqueue to go to the virtio_ring 219 + * interrupt handler. */ 220 + /* FIXME: We used to have a flag for the Host to tell us we could use 221 + * the interrupt as a source of randomness: it'd be nice to have that 222 + * back.. */ 223 + err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, 224 + vdev->dev.bus_id, vq); 225 + if (err) 226 + goto destroy_vring; 227 + 228 + /* Last of all we hook up our 'struct lguest_vq_info" to the 229 + * virtqueue's priv pointer. */ 230 + vq->priv = lvq; 231 + return vq; 232 + 233 + destroy_vring: 234 + vring_del_virtqueue(vq); 235 + unmap: 236 + lguest_unmap(lvq->pages); 237 + free_lvq: 238 + kfree(lvq); 239 + return ERR_PTR(err); 240 + } 241 + /*:*/ 242 + 243 + /* Cleaning up a virtqueue is easy */ 244 + static void lg_del_vq(struct virtqueue *vq) 245 + { 246 + struct lguest_vq_info *lvq = vq->priv; 247 + 248 + /* Tell virtio_ring.c to free the virtqueue. */ 249 + vring_del_virtqueue(vq); 250 + /* Unmap the pages containing the ring. */ 251 + lguest_unmap(lvq->pages); 252 + /* Free our own queue information. */ 253 + kfree(lvq); 254 + } 255 + 256 + /* The ops structure which hooks everything together. */ 257 + static struct virtio_config_ops lguest_config_ops = { 258 + .find = lg_find, 259 + .get = lg_get, 260 + .set = lg_set, 261 + .get_status = lg_get_status, 262 + .set_status = lg_set_status, 263 + .find_vq = lg_find_vq, 264 + .del_vq = lg_del_vq, 265 + }; 266 + 267 + /* The root device for the lguest virtio devices. This makes them appear as 268 + * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */ 269 + static struct device lguest_root = { 270 + .parent = NULL, 271 + .bus_id = "lguest", 272 + }; 273 + 274 + /*D:120 This is the core of the lguest bus: actually adding a new device. 275 + * It's a separate function because it's neater that way, and because an 276 + * earlier version of the code supported hotplug and unplug. They were removed 277 + * early on because they were never used. 278 + * 279 + * As Andrew Tridgell says, "Untested code is buggy code". 
280 + * 281 + * It's worth reading this carefully: we start with a pointer to the new device 282 + * descriptor in the "lguest_devices" page. */ 283 + static void add_lguest_device(struct lguest_device_desc *d) 284 + { 285 + struct lguest_device *ldev; 286 + 287 + ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); 288 + if (!ldev) { 289 + printk(KERN_EMERG "Cannot allocate lguest dev %u\n", 290 + dev_index++); 291 + return; 292 + } 293 + 294 + /* This device's parent is the lguest/ dir. */ 295 + ldev->vdev.dev.parent = &lguest_root; 296 + /* We have a unique device index thanks to the dev_index counter. */ 297 + ldev->vdev.index = dev_index++; 298 + /* The device type comes straight from the descriptor. There's also a 299 + * device vendor field in the virtio_device struct, which we leave as 300 + * 0. */ 301 + ldev->vdev.id.device = d->type; 302 + /* We have a simple set of routines for querying the device's 303 + * configuration information and setting its status. */ 304 + ldev->vdev.config = &lguest_config_ops; 305 + /* And we remember the device's descriptor for lguest_config_ops. */ 306 + ldev->desc = d; 307 + 308 + /* register_virtio_device() sets up the generic fields for the struct 309 + * virtio_device and calls device_register(). This makes the bus 310 + * infrastructure look for a matching driver. */ 311 + if (register_virtio_device(&ldev->vdev) != 0) { 312 + printk(KERN_ERR "Failed to register lguest device %u\n", 313 + ldev->vdev.index); 314 + kfree(ldev); 315 + } 316 + } 317 + 318 + /*D:110 scan_devices() simply iterates through the device page. The type 0 is 319 + * reserved to mean "end of devices". */ 320 + static void scan_devices(void) 321 + { 322 + unsigned int i; 323 + struct lguest_device_desc *d; 324 + 325 + /* We start at the page beginning, and skip over each entry. */ 326 + for (i = 0; i < PAGE_SIZE; i += sizeof(*d) + d->config_len) { 327 + d = lguest_devices + i; 328 + 329 + /* Once we hit a zero, stop. */ 330 + if (d->type == 0) 331 + break; 332 + 333 + add_lguest_device(d); 334 + } 335 + } 336 + 337 + /*D:105 Fairly early in boot, lguest_devices_init() is called to set up the 338 + * lguest device infrastructure. We check that we are a Guest by checking 339 + * pv_info.name: there are other ways of checking, but this seems most 340 + * obvious to me. 341 + * 342 + * So we can access the "struct lguest_device_desc"s easily, we map that memory 343 + * and store the pointer in the global "lguest_devices". Then we register a 344 + * root device from which all our devices will hang (this seems to be the 345 + * correct sysfs incantation). 346 + * 347 + * Finally we call scan_devices() which adds all the devices found in the 348 + * lguest_devices page. */ 349 + static int __init lguest_devices_init(void) 350 + { 351 + if (strcmp(pv_info.name, "lguest") != 0) 352 + return 0; 353 + 354 + if (device_register(&lguest_root) != 0) 355 + panic("Could not register lguest root"); 356 + 357 + /* Devices are in a single page above top of "normal" mem */ 358 + lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); 359 + 360 + scan_devices(); 361 + return 0; 362 + } 363 + /* We do this after core stuff, but before the drivers. */ 364 + postcore_initcall(lguest_devices_init); 365 + 366 + /*D:150 At this point in the journey we would previously have waded through the lguest 367 + * devices themselves: net, block and console. Since they're all now virtio 368 + * devices rather than lguest-specific, I've decided to ignore them. Mostly, 369 + * they're kind of boring. 
But this does mean you'll never experience the 370 + * thrill of reading the forbidden love scene buried deep in the block driver. 371 + * 372 + * "make Launcher" beckons, where we answer questions like "Where do Guests 373 + * come from?", and "What do you do when someone asks for optimization?". */
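To make the "type, length, data" configuration layout concrete, here is a sketch of how a Launcher might emit one entry of the descriptor page which scan_devices() walks. The fields are deliberately packed without alignment (hence the copy in lg_find_vq()); the helper and its values are invented, though type 3 is the virtio console ID:

	static u8 *write_desc_sketch(u8 *p, const struct lguest_vqconfig *vq)
	{
		struct lguest_device_desc d = {
			.type = 3,			/* eg. console */
			.config_len = 2 + sizeof(*vq),	/* one field follows */
		};

		memcpy(p, &d, sizeof(d));		/* the descriptor itself */
		p += sizeof(d);
		*p++ = VIRTIO_CONFIG_F_VIRTQUEUE;	/* field type byte */
		*p++ = sizeof(*vq);			/* field length byte */
		memcpy(p, vq, sizeof(*vq));		/* field data: pfn, num, irq */
		return p + sizeof(*vq);
	}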
+33 -105
drivers/lguest/lguest_user.c
··· 1 /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher 2 * controls and communicates with the Guest. For example, the first write will 3 - * tell us the memory size, pagetable, entry point and kernel address offset. 4 - * A read will run the Guest until a signal is pending (-EINTR), or the Guest 5 - * does a DMA out to the Launcher. Writes are also used to get a DMA buffer 6 - * registered by the Guest and to send the Guest an interrupt. :*/ 7 #include <linux/uaccess.h> 8 #include <linux/miscdevice.h> 9 #include <linux/fs.h> 10 #include "lg.h" 11 12 - /*L:030 setup_regs() doesn't really belong in this file, but it gives us an 13 - * early glimpse deeper into the Host so it's worth having here. 14 - * 15 - * Most of the Guest's registers are left alone: we used get_zeroed_page() to 16 - * allocate the structure, so they will be 0. */ 17 - static void setup_regs(struct lguest_regs *regs, unsigned long start) 18 - { 19 - /* There are four "segment" registers which the Guest needs to boot: 20 - * The "code segment" register (cs) refers to the kernel code segment 21 - * __KERNEL_CS, and the "data", "extra" and "stack" segment registers 22 - * refer to the kernel data segment __KERNEL_DS. 23 - * 24 - * The privilege level is packed into the lower bits. The Guest runs 25 - * at privilege level 1 (GUEST_PL).*/ 26 - regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; 27 - regs->cs = __KERNEL_CS|GUEST_PL; 28 - 29 - /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) 30 - * is supposed to always be "1". Bit 9 (0x200) controls whether 31 - * interrupts are enabled. We always leave interrupts enabled while 32 - * running the Guest. */ 33 - regs->eflags = 0x202; 34 - 35 - /* The "Extended Instruction Pointer" register says where the Guest is 36 - * running. */ 37 - regs->eip = start; 38 - 39 - /* %esi points to our boot information, at physical address 0, so don't 40 - * touch it. */ 41 - } 42 - 43 - /*L:310 To send DMA into the Guest, the Launcher needs to be able to ask for a 44 - * DMA buffer. This is done by writing LHREQ_GETDMA and the key to 45 - * /dev/lguest. */ 46 - static long user_get_dma(struct lguest *lg, const u32 __user *input) 47 - { 48 - unsigned long key, udma, irq; 49 - 50 - /* Fetch the key they wrote to us. */ 51 - if (get_user(key, input) != 0) 52 - return -EFAULT; 53 - /* Look for a free Guest DMA buffer bound to that key. */ 54 - udma = get_dma_buffer(lg, key, &irq); 55 - if (!udma) 56 - return -ENOENT; 57 - 58 - /* We need to tell the Launcher what interrupt the Guest expects after 59 - * the buffer is filled. We stash it in udma->used_len. */ 60 - lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); 61 - 62 - /* The (guest-physical) address of the DMA buffer is returned from 63 - * the write(). */ 64 - return udma; 65 - } 66 - 67 /*L:315 To force the Guest to stop running and return to the Launcher, the 68 * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The 69 * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ 70 - static int break_guest_out(struct lguest *lg, const u32 __user *input) 71 { 72 unsigned long on; 73 ··· 34 35 /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 36 * number to /dev/lguest. 
*/ 37 - static int user_send_irq(struct lguest *lg, const u32 __user *input) 38 { 39 - u32 irq; 40 41 if (get_user(irq, input) != 0) 42 return -EFAULT; ··· 77 return len; 78 } 79 80 - /* If we returned from read() last time because the Guest sent DMA, 81 * clear the flag. */ 82 - if (lg->dma_is_pending) 83 - lg->dma_is_pending = 0; 84 85 /* Run the Guest until something interesting happens. */ 86 return run_guest(lg, (unsigned long __user *)user); 87 } 88 89 - /*L:020 The initialization write supplies 4 32-bit values (in addition to the 90 - * 32-bit LHREQ_INITIALIZE value). These are: 91 * 92 * pfnlimit: The highest (Guest-physical) page number the Guest should be 93 * allowed to access. The Launcher has to live in Guest memory, so it sets ··· 99 * pagetables (which are set up by the Launcher). 100 * 101 * start: The first instruction to execute ("eip" in x86-speak). 102 - * 103 - * page_offset: The PAGE_OFFSET constant in the Guest kernel. We should 104 - * probably wean the code off this, but it's a very useful constant! Any 105 - * address above this is within the Guest kernel, and any kernel address can 106 - * quickly converted from physical to virtual by adding PAGE_OFFSET. It's 107 - * 0xC0000000 (3G) by default, but it's configurable at kernel build time. 108 */ 109 - static int initialize(struct file *file, const u32 __user *input) 110 { 111 /* "struct lguest" contains everything we (the Host) know about a 112 * Guest. */ 113 struct lguest *lg; 114 - int err, i; 115 - u32 args[4]; 116 117 - /* We grab the Big Lguest lock, which protects the global array 118 - * "lguests" and multiple simultaneous initializations. */ 119 mutex_lock(&lguest_lock); 120 /* You can't initialize twice! Close the device and start again... */ 121 if (file->private_data) { ··· 122 goto unlock; 123 } 124 125 - /* Find an unused guest. */ 126 - i = find_free_guest(); 127 - if (i < 0) { 128 - err = -ENOSPC; 129 goto unlock; 130 } 131 - /* OK, we have an index into the "lguest" array: "lg" is a convenient 132 - * pointer. */ 133 - lg = &lguests[i]; 134 135 /* Populate the easy fields of our "struct lguest" */ 136 - lg->guestid = i; 137 - lg->pfn_limit = args[0]; 138 - lg->page_offset = args[3]; 139 140 /* We need a complete page for the Guest registers: they are accessible 141 * to the Guest and we can only grant it access to whole pages. */ ··· 145 /* Initialize the Guest's shadow page tables, using the toplevel 146 * address the Launcher gave us. This allocates memory, so can 147 * fail. */ 148 - err = init_guest_pagetable(lg, args[1]); 149 if (err) 150 goto free_regs; 151 152 /* Now we initialize the Guest's registers, handing it the start 153 * address. */ 154 - setup_regs(lg->regs, args[2]); 155 - 156 - /* There are a couple of GDT entries the Guest expects when first 157 - * booting. */ 158 - setup_guest_gdt(lg); 159 160 /* The timer for lguest's clock needs initialization. */ 161 init_clockdev(lg); ··· 191 /*L:010 The first operation the Launcher does must be a write. All writes 192 * start with a 32 bit number: for the first write this must be 193 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 194 - * writes of other values to get DMA buffers and send interrupts. */ 195 - static ssize_t write(struct file *file, const char __user *input, 196 size_t size, loff_t *off) 197 { 198 /* Once the guest is initialized, we hold the "struct lguest" in the 199 * file private data. 
*/ 200 struct lguest *lg = file->private_data; 201 - u32 req; 202 203 if (get_user(req, input) != 0) 204 return -EFAULT; 205 - input += sizeof(req); 206 207 /* If you haven't initialized, you must do that first. */ 208 if (req != LHREQ_INITIALIZE && !lg) ··· 219 220 switch (req) { 221 case LHREQ_INITIALIZE: 222 - return initialize(file, (const u32 __user *)input); 223 - case LHREQ_GETDMA: 224 - return user_get_dma(lg, (const u32 __user *)input); 225 case LHREQ_IRQ: 226 - return user_send_irq(lg, (const u32 __user *)input); 227 case LHREQ_BREAK: 228 - return break_guest_out(lg, (const u32 __user *)input); 229 default: 230 return -EINVAL; 231 } ··· 249 mutex_lock(&lguest_lock); 250 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ 251 hrtimer_cancel(&lg->hrt); 252 - /* Free any DMA buffers the Guest had bound. */ 253 - release_all_dma(lg); 254 /* Free up the shadow page tables for the Guest. */ 255 free_guest_pagetable(lg); 256 /* Now all the memory cleanups are done, it's safe to release the
··· 1 /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher 2 * controls and communicates with the Guest. For example, the first write will 3 + * tell us the Guest's memory layout, pagetable, entry point and kernel address 4 + * offset. A read will run the Guest until something happens, such as a signal 5 + * or the Guest doing a NOTIFY out to the Launcher. :*/ 6 #include <linux/uaccess.h> 7 #include <linux/miscdevice.h> 8 #include <linux/fs.h> 9 #include "lg.h" 10 11 /*L:315 To force the Guest to stop running and return to the Launcher, the 12 * Waker writes LHREQ_BREAK and the value "1" to /dev/lguest. The 13 * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ 14 + static int break_guest_out(struct lguest *lg, const unsigned long __user *input) 15 { 16 unsigned long on; 17 ··· 90 91 /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 92 * number to /dev/lguest. */ 93 + static int user_send_irq(struct lguest *lg, const unsigned long __user *input) 94 { 95 + unsigned long irq; 96 97 if (get_user(irq, input) != 0) 98 return -EFAULT; ··· 133 return len; 134 } 135 136 + /* If we returned from read() last time because the Guest notified, 137 * clear the flag. */ 138 + if (lg->pending_notify) 139 + lg->pending_notify = 0; 140 141 /* Run the Guest until something interesting happens. */ 142 return run_guest(lg, (unsigned long __user *)user); 143 } 144 145 + /*L:020 The initialization write supplies 4 pointer-sized (32 or 64 bit) 146 + * values (in addition to the LHREQ_INITIALIZE value). These are: 147 + * 148 + * base: The start of the Guest-physical memory inside the Launcher memory. 149 * 150 * pfnlimit: The highest (Guest-physical) page number the Guest should be 151 * allowed to access. The Launcher has to live in Guest memory, so it sets ··· 153 * pagetables (which are set up by the Launcher). 154 * 155 * start: The first instruction to execute ("eip" in x86-speak). 156 */ 157 + static int initialize(struct file *file, const unsigned long __user *input) 158 { 159 /* "struct lguest" contains everything we (the Host) know about a 160 * Guest. */ 161 struct lguest *lg; 162 + int err; 163 + unsigned long args[4]; 164 165 + /* We grab the Big Lguest lock, which protects against multiple 166 + * simultaneous initializations. */ 167 mutex_lock(&lguest_lock); 168 /* You can't initialize twice! Close the device and start again... */ 169 if (file->private_data) { ··· 182 goto unlock; 183 } 184 185 + lg = kzalloc(sizeof(*lg), GFP_KERNEL); 186 + if (!lg) { 187 + err = -ENOMEM; 188 goto unlock; 189 } 190 191 /* Populate the easy fields of our "struct lguest" */ 192 + lg->mem_base = (void __user *)(long)args[0]; 193 + lg->pfn_limit = args[1]; 194 195 /* We need a complete page for the Guest registers: they are accessible 196 * to the Guest and we can only grant it access to whole pages. */ ··· 210 /* Initialize the Guest's shadow page tables, using the toplevel 211 * address the Launcher gave us. This allocates memory, so can 212 * fail. */ 213 + err = init_guest_pagetable(lg, args[2]); 214 if (err) 215 goto free_regs; 216 217 /* Now we initialize the Guest's registers, handing it the start 218 * address. */ 219 + lguest_arch_setup_regs(lg, args[3]); 220 221 /* The timer for lguest's clock needs initialization. */ 222 init_clockdev(lg); ··· 260 /*L:010 The first operation the Launcher does must be a write. All writes 261 * start with a 32 bit number: for the first write this must be 262 * LHREQ_INITIALIZE to set up the Guest.
After that the Launcher can use 263 + * writes of other values to send interrupts. */ 264 + static ssize_t write(struct file *file, const char __user *in, 265 size_t size, loff_t *off) 266 { 267 /* Once the guest is initialized, we hold the "struct lguest" in the 268 * file private data. */ 269 struct lguest *lg = file->private_data; 270 + const unsigned long __user *input = (const unsigned long __user *)in; 271 + unsigned long req; 272 273 if (get_user(req, input) != 0) 274 return -EFAULT; 275 + input++; 276 277 /* If you haven't initialized, you must do that first. */ 278 if (req != LHREQ_INITIALIZE && !lg) ··· 287 288 switch (req) { 289 case LHREQ_INITIALIZE: 290 + return initialize(file, input); 291 case LHREQ_IRQ: 292 + return user_send_irq(lg, input); 293 case LHREQ_BREAK: 294 + return break_guest_out(lg, input); 295 default: 296 return -EINVAL; 297 } ··· 319 mutex_lock(&lguest_lock); 320 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ 321 hrtimer_cancel(&lg->hrt); 322 /* Free up the shadow page tables for the Guest. */ 323 free_guest_pagetable(lg); 324 /* Now all the memory cleanups are done, it's safe to release the
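The protocol this file implements is easiest to see from userspace. Below is a minimal sketch of a Launcher session against /dev/lguest, assuming the LHREQ_* constants from linux/lguest_launcher.h and the read()-returns-a-notify-address behaviour described above; the memory-layout arguments and the interrupt number are placeholders for illustration, not values from the real example launcher.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/lguest_launcher.h>

static void run_lguest_session(unsigned long base, unsigned long pfnlimit,
			       unsigned long pgdir, unsigned long start)
{
	/* The first write: LHREQ_INITIALIZE plus the four values, in the
	 * same order initialize() unpacks them from args[]. */
	unsigned long init[] = { LHREQ_INITIALIZE,
				 base, pfnlimit, pgdir, start };
	unsigned long notify;
	int fd = open("/dev/lguest", O_RDWR);

	if (fd < 0 || write(fd, init, sizeof(init)) < 0) {
		perror("initialize");
		return;
	}

	/* Each read() runs the Guest; it comes back when something
	 * interesting happens, such as the Guest doing a NOTIFY. */
	while (read(fd, &notify, sizeof(notify)) == sizeof(notify)) {
		/* ... service the notification at address "notify", then
		 * (say) poke the Guest with a device interrupt.  "13" is
		 * a placeholder interrupt number. */
		unsigned long irq[] = { LHREQ_IRQ, 13 };
		if (write(fd, irq, sizeof(irq)) < 0)
			break;
	}
	close(fd);
}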
+136 -114
drivers/lguest/page_tables.c
··· 13 #include <linux/random.h> 14 #include <linux/percpu.h> 15 #include <asm/tlbflush.h> 16 #include "lg.h" 17 18 /*M:008 We hold reference to pages, which prevents them from being swapped. ··· 45 * (vii) Setting up the page tables initially. 46 :*/ 47 48 - /* Pages a 4k long, and each page table entry is 4 bytes long, giving us 1024 49 - * (or 2^10) entries per page. */ 50 - #define PTES_PER_PAGE_SHIFT 10 51 - #define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) 52 53 /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 54 * conveniently placed at the top 4MB, so it uses a separate, complete PTE 55 * page. */ 56 - #define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) 57 58 /* We actually need a separate PTE page for each CPU. Remember that after the 59 * Switcher code itself comes two pages for each CPU, and we don't want this 60 * CPU's guest to see the pages of any other CPU. */ 61 - static DEFINE_PER_CPU(spte_t *, switcher_pte_pages); 62 #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 63 64 /*H:320 With our shadow and Guest types established, we need to deal with 65 * them: the page table code is curly enough to need helper functions to keep 66 * it clear and clean. 67 * 68 - * The first helper takes a virtual address, and says which entry in the top 69 - * level page table deals with that address. Since each top level entry deals 70 - * with 4M, this effectively divides by 4M. */ 71 - static unsigned vaddr_to_pgd_index(unsigned long vaddr) 72 - { 73 - return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); 74 - } 75 - 76 - /* There are two functions which return pointers to the shadow (aka "real") 77 * page tables. 78 * 79 * spgd_addr() takes the virtual address and returns a pointer to the top-level 80 * page directory entry for that address. Since we keep track of several page 81 * tables, the "i" argument tells us which one we're interested in (it's 82 * usually the current one). */ 83 - static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) 84 { 85 - unsigned int index = vaddr_to_pgd_index(vaddr); 86 87 /* We kill any Guest trying to touch the Switcher addresses. */ 88 if (index >= SWITCHER_PGD_INDEX) { ··· 84 /* This routine then takes the PGD entry given above, which contains the 85 * address of the PTE page. It then returns a pointer to the PTE entry for the 86 * given address. */ 87 - static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) 88 { 89 - spte_t *page = __va(spgd.pfn << PAGE_SHIFT); 90 /* You should never call this if the PGD entry wasn't valid */ 91 - BUG_ON(!(spgd.flags & _PAGE_PRESENT)); 92 - return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; 93 } 94 95 /* These two functions just like the above two, except they access the Guest 96 * page tables. Hence they return a Guest address. */ 97 static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) 98 { 99 - unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); 100 - return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t); 101 } 102 103 static unsigned long gpte_addr(struct lguest *lg, 104 - gpgd_t gpgd, unsigned long vaddr) 105 { 106 - unsigned long gpage = gpgd.pfn << PAGE_SHIFT; 107 - BUG_ON(!(gpgd.flags & _PAGE_PRESENT)); 108 - return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); 109 } 110 111 /*H:350 This routine takes a page number given by the Guest and converts it to ··· 138 * entry can be a little tricky. 
The flags are (almost) the same, but the 139 * Guest PTE contains a virtual page number: the CPU needs the real page 140 * number. */ 141 - static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) 142 { 143 - spte_t spte; 144 - unsigned long pfn; 145 146 /* The Guest sets the global flag, because it thinks that it is using 147 * PGE. We only told it to use PGE so it would tell us whether it was 148 * flushing a kernel mapping or a userspace mapping. We don't actually 149 * use the global bit, so throw it away. */ 150 - spte.flags = (gpte.flags & ~_PAGE_GLOBAL); 151 152 /* We need a temporary "unsigned long" variable to hold the answer from 153 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 154 * fit in spte.pfn. get_pfn() finds the real physical number of the 155 * page, given the virtual number. */ 156 - pfn = get_pfn(gpte.pfn, write); 157 if (pfn == -1UL) { 158 - kill_guest(lg, "failed to get page %u", gpte.pfn); 159 /* When we destroy the Guest, we'll go through the shadow page 160 * tables and release_pte() them. Make sure we don't think 161 * this one is valid! */ 162 - spte.flags = 0; 163 } 164 - /* Now we assign the page number, and our shadow PTE is complete. */ 165 - spte.pfn = pfn; 166 - return spte; 167 } 168 169 /*H:460 And to complete the chain, release_pte() looks like this: */ 170 - static void release_pte(spte_t pte) 171 { 172 /* Remember that get_user_pages() took a reference to the page, in 173 * get_pfn()? We have to put it back now. */ 174 - if (pte.flags & _PAGE_PRESENT) 175 - put_page(pfn_to_page(pte.pfn)); 176 } 177 /*:*/ 178 179 - static void check_gpte(struct lguest *lg, gpte_t gpte) 180 { 181 - if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit) 182 kill_guest(lg, "bad page table entry"); 183 } 184 185 - static void check_gpgd(struct lguest *lg, gpgd_t gpgd) 186 { 187 - if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit) 188 kill_guest(lg, "bad page directory entry"); 189 } 190 ··· 202 * true. */ 203 int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) 204 { 205 - gpgd_t gpgd; 206 - spgd_t *spgd; 207 unsigned long gpte_ptr; 208 - gpte_t gpte; 209 - spte_t *spte; 210 211 /* First step: get the top-level Guest page table entry. */ 212 - gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); 213 /* Toplevel not present? We can't map it in. */ 214 - if (!(gpgd.flags & _PAGE_PRESENT)) 215 return 0; 216 217 /* Now look at the matching shadow entry. */ 218 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 219 - if (!(spgd->flags & _PAGE_PRESENT)) { 220 /* No shadow entry: allocate a new shadow PTE page. */ 221 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 222 /* This is not really the Guest's fault, but killing it is ··· 229 check_gpgd(lg, gpgd); 230 /* And we copy the flags to the shadow PGD entry. The page 231 * number in the shadow PGD is the page we just allocated. */ 232 - spgd->raw.val = (__pa(ptepage) | gpgd.flags); 233 } 234 235 /* OK, now we look at the lower level in the Guest page table: keep its 236 * address, because we might update it later. */ 237 gpte_ptr = gpte_addr(lg, gpgd, vaddr); 238 - gpte = mkgpte(lgread_u32(lg, gpte_ptr)); 239 240 /* If this page isn't in the Guest page tables, we can't page it in. */ 241 - if (!(gpte.flags & _PAGE_PRESENT)) 242 return 0; 243 244 /* Check they're not trying to write to a page the Guest wants 245 * read-only (bit 2 of errcode == write). 
*/ 246 - if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) 247 return 0; 248 249 /* User access to a kernel page? (bit 3 == user access) */ 250 - if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) 251 return 0; 252 253 /* Check that the Guest PTE flags are OK, and the page number is below 254 * the pfn_limit (ie. not mapping the Launcher binary). */ 255 check_gpte(lg, gpte); 256 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 257 - gpte.flags |= _PAGE_ACCESSED; 258 if (errcode & 2) 259 - gpte.flags |= _PAGE_DIRTY; 260 261 /* Get the pointer to the shadow PTE entry we're going to set. */ 262 spte = spte_addr(lg, *spgd, vaddr); ··· 267 268 /* If this is a write, we insist that the Guest page is writable (the 269 * final arg to gpte_to_spte()). */ 270 - if (gpte.flags & _PAGE_DIRTY) 271 *spte = gpte_to_spte(lg, gpte, 1); 272 - else { 273 /* If this is a read, don't set the "writable" bit in the page 274 * table entry, even if the Guest says it's writable. That way 275 * we come back here when a write does actually ocur, so we can 276 * update the Guest's _PAGE_DIRTY flag. */ 277 - gpte_t ro_gpte = gpte; 278 - ro_gpte.flags &= ~_PAGE_RW; 279 - *spte = gpte_to_spte(lg, ro_gpte, 0); 280 - } 281 282 /* Finally, we write the Guest PTE entry back: we've set the 283 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 284 - lgwrite_u32(lg, gpte_ptr, gpte.raw.val); 285 286 /* We succeeded in mapping the page! */ 287 return 1; ··· 294 * mapped by the shadow page tables, and is it writable? */ 295 static int page_writable(struct lguest *lg, unsigned long vaddr) 296 { 297 - spgd_t *spgd; 298 unsigned long flags; 299 300 /* Look at the top level entry: is it present? */ 301 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 302 - if (!(spgd->flags & _PAGE_PRESENT)) 303 return 0; 304 305 /* Check the flags on the pte entry itself: it must be present and 306 * writable. */ 307 - flags = spte_addr(lg, *spgd, vaddr)->flags; 308 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 309 } 310 ··· 319 } 320 321 /*H:450 If we chase down the release_pgd() code, it looks like this: */ 322 - static void release_pgd(struct lguest *lg, spgd_t *spgd) 323 { 324 /* If the entry's not present, there's nothing to release. */ 325 - if (spgd->flags & _PAGE_PRESENT) { 326 unsigned int i; 327 /* Converting the pfn to find the actual PTE page is easy: turn 328 * the page number into a physical address, then convert to a 329 * virtual address (easy for kernel pages like this one). */ 330 - spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); 331 /* For each entry in the page, we might need to release it. */ 332 - for (i = 0; i < PTES_PER_PAGE; i++) 333 release_pte(ptepage[i]); 334 /* Now we can free the page of PTEs */ 335 free_page((long)ptepage); 336 /* And zero out the PGD entry we we never release it twice. */ 337 - spgd->raw.val = 0; 338 } 339 } 340 ··· 346 { 347 unsigned int i; 348 /* Release every pgd entry up to the kernel's address. */ 349 - for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) 350 release_pgd(lg, lg->pgdirs[idx].pgdir + i); 351 } 352 ··· 359 } 360 /*:*/ 361 362 /* We keep several page tables. This is a simple routine to find the page 363 * table (if any) corresponding to this top-level address the Guest has given 364 * us. */ ··· 385 { 386 unsigned int i; 387 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 388 - if (lg->pgdirs[i].cr3 == pgtable) 389 break; 390 return i; 391 } ··· 394 * allocate a new one (and so the kernel parts are not there), we set 395 * blank_pgdir. 
*/ 396 static unsigned int new_pgdir(struct lguest *lg, 397 - unsigned long cr3, 398 int *blank_pgdir) 399 { 400 unsigned int next; ··· 404 next = random32() % ARRAY_SIZE(lg->pgdirs); 405 /* If it's never been allocated at all before, try now. */ 406 if (!lg->pgdirs[next].pgdir) { 407 - lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); 408 /* If the allocation fails, just keep using the one we have */ 409 if (!lg->pgdirs[next].pgdir) 410 next = lg->pgdidx; ··· 414 *blank_pgdir = 1; 415 } 416 /* Record which Guest toplevel this shadows. */ 417 - lg->pgdirs[next].cr3 = cr3; 418 /* Release all the non-kernel mappings. */ 419 flush_user_mappings(lg, next); 420 ··· 481 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. 482 */ 483 static void do_set_pte(struct lguest *lg, int idx, 484 - unsigned long vaddr, gpte_t gpte) 485 { 486 /* Look up the matching shadow page directot entry. */ 487 - spgd_t *spgd = spgd_addr(lg, idx, vaddr); 488 489 /* If the top level isn't present, there's no entry to update. */ 490 - if (spgd->flags & _PAGE_PRESENT) { 491 /* Otherwise, we start by releasing the existing entry. */ 492 - spte_t *spte = spte_addr(lg, *spgd, vaddr); 493 release_pte(*spte); 494 495 /* If they're setting this entry as dirty or accessed, we might 496 * as well put that entry they've given us in now. This shaves 497 * 10% off a copy-on-write micro-benchmark. */ 498 - if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 499 check_gpte(lg, gpte); 500 - *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); 501 } else 502 /* Otherwise we can demand_page() it in later. */ 503 - spte->raw.val = 0; 504 } 505 } 506 ··· 516 * The benefit is that when we have to track a new page table, we can copy keep 517 * all the kernel mappings. This speeds up context switch immensely. */ 518 void guest_set_pte(struct lguest *lg, 519 - unsigned long cr3, unsigned long vaddr, gpte_t gpte) 520 { 521 /* Kernel mappings must be changed on all top levels. Slow, but 522 * doesn't happen often. */ 523 - if (vaddr >= lg->page_offset) { 524 unsigned int i; 525 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 526 if (lg->pgdirs[i].pgdir) 527 do_set_pte(lg, i, vaddr, gpte); 528 } else { 529 /* Is this page table one we have a shadow for? */ 530 - int pgdir = find_pgdir(lg, cr3); 531 if (pgdir != ARRAY_SIZE(lg->pgdirs)) 532 /* If so, do the update. */ 533 do_set_pte(lg, pgdir, vaddr, gpte); ··· 548 * 549 * So with that in mind here's our code to to update a (top-level) PGD entry: 550 */ 551 - void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) 552 { 553 int pgdir; 554 ··· 558 return; 559 560 /* If they're talking about a page table we have a shadow for... */ 561 - pgdir = find_pgdir(lg, cr3); 562 if (pgdir < ARRAY_SIZE(lg->pgdirs)) 563 /* ... throw it away. */ 564 release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); ··· 570 * its first page table is. We set some things up here: */ 571 int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) 572 { 573 - /* In flush_user_mappings() we loop from 0 to 574 - * "vaddr_to_pgd_index(lg->page_offset)". This assumes it won't hit 575 - * the Switcher mappings, so check that now. */ 576 - if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) 577 - return -EINVAL; 578 /* We start on the first shadow page table, and give it a blank PGD 579 * page. 
*/ 580 lg->pgdidx = 0; 581 - lg->pgdirs[lg->pgdidx].cr3 = pgtable; 582 - lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); 583 if (!lg->pgdirs[lg->pgdidx].pgdir) 584 return -ENOMEM; 585 return 0; 586 } 587 588 /* When a Guest dies, our cleanup is fairly simple. */ ··· 617 * for each CPU already set up, we just need to hook them in. */ 618 void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) 619 { 620 - spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 621 - spgd_t switcher_pgd; 622 - spte_t regs_pte; 623 624 /* Make the last PGD entry for this Guest point to the Switcher's PTE 625 * page for this CPU (with appropriate flags). */ 626 - switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; 627 - switcher_pgd.flags = _PAGE_KERNEL; 628 lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 629 630 /* We also change the Switcher PTE page. When we're running the Guest, ··· 634 * CPU's "struct lguest_pages": if we make sure the Guest's register 635 * page is already mapped there, we don't have to copy them out 636 * again. */ 637 - regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; 638 - regs_pte.flags = _PAGE_KERNEL; 639 - switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] 640 - = regs_pte; 641 } 642 /*:*/ 643 ··· 656 unsigned int pages) 657 { 658 unsigned int i; 659 - spte_t *pte = switcher_pte_page(cpu); 660 661 /* The first entries are easy: they map the Switcher code. */ 662 for (i = 0; i < pages; i++) { 663 - pte[i].pfn = page_to_pfn(switcher_page[i]); 664 - pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; 665 } 666 667 /* The only other thing we map is this CPU's pair of pages. */ 668 i = pages + cpu*2; 669 670 /* First page (Guest registers) is writable from the Guest */ 671 - pte[i].pfn = page_to_pfn(switcher_page[i]); 672 - pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; 673 /* The second page contains the "struct lguest_ro_state", and is 674 * read-only. */ 675 - pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); 676 - pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; 677 } 678 679 /*H:510 At boot or module load time, init_pagetables() allocates and populates ··· 684 unsigned int i; 685 686 for_each_possible_cpu(i) { 687 - switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL); 688 if (!switcher_pte_page(i)) { 689 free_switcher_pte_pages(); 690 return -ENOMEM;
··· 13 #include <linux/random.h> 14 #include <linux/percpu.h> 15 #include <asm/tlbflush.h> 16 + #include <asm/uaccess.h> 17 #include "lg.h" 18 19 /*M:008 We hold a reference to pages, which prevents them from being swapped. ··· 44 * (vii) Setting up the page tables initially. 45 :*/ 46 47 48 /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 49 * conveniently placed at the top 4MB, so it uses a separate, complete PTE 50 * page. */ 51 + #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 52 53 /* We actually need a separate PTE page for each CPU. Remember that after the 54 * Switcher code itself comes two pages for each CPU, and we don't want this 55 * CPU's guest to see the pages of any other CPU. */ 56 + static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); 57 #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 58 59 /*H:320 With our shadow and Guest types established, we need to deal with 60 * them: the page table code is curly enough to need helper functions to keep 61 * it clear and clean. 62 * 63 + * There are two functions which return pointers to the shadow (aka "real") 64 * page tables. 65 * 66 * spgd_addr() takes the virtual address and returns a pointer to the top-level 67 * page directory entry for that address. Since we keep track of several page 68 * tables, the "i" argument tells us which one we're interested in (it's 69 * usually the current one). */ 70 + static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) 71 { 72 + unsigned int index = pgd_index(vaddr); 73 74 /* We kill any Guest trying to touch the Switcher addresses. */ 75 if (index >= SWITCHER_PGD_INDEX) { ··· 95 /* This routine then takes the PGD entry given above, which contains the 96 * address of the PTE page. It then returns a pointer to the PTE entry for the 97 * given address. */ 98 + static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) 99 { 100 + pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 101 /* You should never call this if the PGD entry wasn't valid */ 102 + BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); 103 + return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; 104 } 105 106 /* These two functions are just like the above two, except they access the Guest 107 * page tables. Hence they return a Guest address. */ 108 static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) 109 { 110 + unsigned int index = vaddr >> (PGDIR_SHIFT); 111 + return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t); 112 } 113 114 static unsigned long gpte_addr(struct lguest *lg, 115 + pgd_t gpgd, unsigned long vaddr) 116 { 117 + unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 118 + BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); 119 + return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); 120 } 121 122 /*H:350 This routine takes a page number given by the Guest and converts it to ··· 149 * entry can be a little tricky. The flags are (almost) the same, but the 150 * Guest PTE contains a virtual page number: the CPU needs the real page 151 * number. */ 152 + static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) 153 { 154 + unsigned long pfn, base, flags; 155 156 /* The Guest sets the global flag, because it thinks that it is using 157 * PGE. We only told it to use PGE so it would tell us whether it was 158 * flushing a kernel mapping or a userspace mapping. We don't actually 159 * use the global bit, so throw it away.
*/ 160 + flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); 161 + 162 + /* The Guest's pages are offset inside the Launcher. */ 163 + base = (unsigned long)lg->mem_base / PAGE_SIZE; 164 165 /* We need a temporary "unsigned long" variable to hold the answer from 166 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 167 * fit in spte.pfn. get_pfn() finds the real physical number of the 168 * page, given the virtual number. */ 169 + pfn = get_pfn(base + pte_pfn(gpte), write); 170 if (pfn == -1UL) { 171 + kill_guest(lg, "failed to get page %lu", pte_pfn(gpte)); 172 /* When we destroy the Guest, we'll go through the shadow page 173 * tables and release_pte() them. Make sure we don't think 174 * this one is valid! */ 175 + flags = 0; 176 } 177 + /* Now we assemble our shadow PTE from the page number and flags. */ 178 + return pfn_pte(pfn, __pgprot(flags)); 179 } 180 181 /*H:460 And to complete the chain, release_pte() looks like this: */ 182 + static void release_pte(pte_t pte) 183 { 184 /* Remember that get_user_pages() took a reference to the page, in 185 * get_pfn()? We have to put it back now. */ 186 + if (pte_flags(pte) & _PAGE_PRESENT) 187 + put_page(pfn_to_page(pte_pfn(pte))); 188 } 189 /*:*/ 190 191 + static void check_gpte(struct lguest *lg, pte_t gpte) 192 { 193 + if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) 194 + || pte_pfn(gpte) >= lg->pfn_limit) 195 kill_guest(lg, "bad page table entry"); 196 } 197 198 + static void check_gpgd(struct lguest *lg, pgd_t gpgd) 199 { 200 + if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit) 201 kill_guest(lg, "bad page directory entry"); 202 } 203 ··· 211 * true. */ 212 int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) 213 { 214 + pgd_t gpgd; 215 + pgd_t *spgd; 216 unsigned long gpte_ptr; 217 + pte_t gpte; 218 + pte_t *spte; 219 220 /* First step: get the top-level Guest page table entry. */ 221 + gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); 222 /* Toplevel not present? We can't map it in. */ 223 + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) 224 return 0; 225 226 /* Now look at the matching shadow entry. */ 227 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 228 + if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 229 /* No shadow entry: allocate a new shadow PTE page. */ 230 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 231 /* This is not really the Guest's fault, but killing it is ··· 238 check_gpgd(lg, gpgd); 239 /* And we copy the flags to the shadow PGD entry. The page 240 * number in the shadow PGD is the page we just allocated. */ 241 + *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); 242 } 243 244 /* OK, now we look at the lower level in the Guest page table: keep its 245 * address, because we might update it later. */ 246 gpte_ptr = gpte_addr(lg, gpgd, vaddr); 247 + gpte = lgread(lg, gpte_ptr, pte_t); 248 249 /* If this page isn't in the Guest page tables, we can't page it in. */ 250 + if (!(pte_flags(gpte) & _PAGE_PRESENT)) 251 return 0; 252 253 /* Check they're not trying to write to a page the Guest wants 254 * read-only (bit 2 of errcode == write). */ 255 + if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 256 return 0; 257 258 /* User access to a kernel page? (bit 3 == user access) */ 259 + if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 260 return 0; 261 262 /* Check that the Guest PTE flags are OK, and the page number is below 263 * the pfn_limit (ie. not mapping the Launcher binary). 
*/ 264 check_gpte(lg, gpte); 265 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 266 + gpte = pte_mkyoung(gpte); 267 + 268 if (errcode & 2) 269 + gpte = pte_mkdirty(gpte); 270 271 /* Get the pointer to the shadow PTE entry we're going to set. */ 272 spte = spte_addr(lg, *spgd, vaddr); ··· 275 276 /* If this is a write, we insist that the Guest page is writable (the 277 * final arg to gpte_to_spte()). */ 278 + if (pte_dirty(gpte)) 279 *spte = gpte_to_spte(lg, gpte, 1); 280 + else 281 /* If this is a read, don't set the "writable" bit in the page 282 * table entry, even if the Guest says it's writable. That way 283 * we come back here when a write does actually occur, so we can 284 * update the Guest's _PAGE_DIRTY flag. */ 285 + *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); 286 287 /* Finally, we write the Guest PTE entry back: we've set the 288 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 289 + lgwrite(lg, gpte_ptr, pte_t, gpte); 290 291 /* We succeeded in mapping the page! */ 292 return 1; ··· 305 * mapped by the shadow page tables, and is it writable? */ 306 static int page_writable(struct lguest *lg, unsigned long vaddr) 307 { 308 + pgd_t *spgd; 309 unsigned long flags; 310 311 /* Look at the top level entry: is it present? */ 312 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 313 + if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) 314 return 0; 315 316 /* Check the flags on the pte entry itself: it must be present and 317 * writable. */ 318 + flags = pte_flags(*(spte_addr(lg, *spgd, vaddr))); 319 + 320 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 321 } 322 ··· 329 } 330 331 /*H:450 If we chase down the release_pgd() code, it looks like this: */ 332 + static void release_pgd(struct lguest *lg, pgd_t *spgd) 333 { 334 /* If the entry's not present, there's nothing to release. */ 335 + if (pgd_flags(*spgd) & _PAGE_PRESENT) { 336 unsigned int i; 337 /* Converting the pfn to find the actual PTE page is easy: turn 338 * the page number into a physical address, then convert to a 339 * virtual address (easy for kernel pages like this one). */ 340 + pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 341 /* For each entry in the page, we might need to release it. */ 342 + for (i = 0; i < PTRS_PER_PTE; i++) 343 release_pte(ptepage[i]); 344 /* Now we can free the page of PTEs */ 345 free_page((long)ptepage); 346 /* And zero out the PGD entry so we never release it twice. */ 347 + *spgd = __pgd(0); 348 } 349 } 350 ··· 356 { 357 unsigned int i; 358 /* Release every pgd entry up to the kernel's address. */ 359 + for (i = 0; i < pgd_index(lg->kernel_address); i++) 360 release_pgd(lg, lg->pgdirs[idx].pgdir + i); 361 } 362 ··· 369 } 370 /*:*/ 371 372 + /* We walk down the guest page tables to get a guest-physical address */ 373 + unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) 374 + { 375 + pgd_t gpgd; 376 + pte_t gpte; 377 + 378 + /* First step: get the top-level Guest page table entry. */ 379 + gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); 380 + /* Toplevel not present? We can't map it in. */ 381 + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) 382 + kill_guest(lg, "Bad address %#lx", vaddr); 383 + 384 + gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t); 385 + if (!(pte_flags(gpte) & _PAGE_PRESENT)) 386 + kill_guest(lg, "Bad address %#lx", vaddr); 387 + 388 + return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 389 + } 390 + 391 /* We keep several page tables. This is a simple routine to find the page 392 * table (if any) corresponding to this top-level address the Guest has given 393 * us. */ ··· 376 { 377 unsigned int i; 378 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 379 + if (lg->pgdirs[i].gpgdir == pgtable) 380 break; 381 return i; 382 } ··· 385 * allocate a new one (and so the kernel parts are not there), we set 386 * blank_pgdir. */ 387 static unsigned int new_pgdir(struct lguest *lg, 388 + unsigned long gpgdir, 389 int *blank_pgdir) 390 { 391 unsigned int next; ··· 395 next = random32() % ARRAY_SIZE(lg->pgdirs); 396 /* If it's never been allocated at all before, try now. */ 397 if (!lg->pgdirs[next].pgdir) { 398 + lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); 399 /* If the allocation fails, just keep using the one we have */ 400 if (!lg->pgdirs[next].pgdir) 401 next = lg->pgdidx; ··· 405 *blank_pgdir = 1; 406 } 407 /* Record which Guest toplevel this shadows. */ 408 + lg->pgdirs[next].gpgdir = gpgdir; 409 /* Release all the non-kernel mappings. */ 410 flush_user_mappings(lg, next); 411 ··· 472 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. 473 */ 474 static void do_set_pte(struct lguest *lg, int idx, 475 + unsigned long vaddr, pte_t gpte) 476 { 477 /* Look up the matching shadow page directory entry. */ 478 + pgd_t *spgd = spgd_addr(lg, idx, vaddr); 479 480 /* If the top level isn't present, there's no entry to update. */ 481 + if (pgd_flags(*spgd) & _PAGE_PRESENT) { 482 /* Otherwise, we start by releasing the existing entry. */ 483 + pte_t *spte = spte_addr(lg, *spgd, vaddr); 484 release_pte(*spte); 485 486 /* If they're setting this entry as dirty or accessed, we might 487 * as well put that entry they've given us in now. This shaves 488 * 10% off a copy-on-write micro-benchmark. */ 489 + if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 490 check_gpte(lg, gpte); 491 + *spte = gpte_to_spte(lg, gpte, 492 + pte_flags(gpte) & _PAGE_DIRTY); 493 } else 494 /* Otherwise we can demand_page() it in later. */ 495 + *spte = __pte(0); 496 } 497 } 498 ··· 506 * The benefit is that when we have to track a new page table, we can keep 507 * all the kernel mappings. This speeds up context switch immensely. */ 508 void guest_set_pte(struct lguest *lg, 509 + unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 510 { 511 /* Kernel mappings must be changed on all top levels. Slow, but 512 * doesn't happen often. */ 513 + if (vaddr >= lg->kernel_address) { 514 unsigned int i; 515 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 516 if (lg->pgdirs[i].pgdir) 517 do_set_pte(lg, i, vaddr, gpte); 518 } else { 519 /* Is this page table one we have a shadow for? */ 520 + int pgdir = find_pgdir(lg, gpgdir); 521 if (pgdir != ARRAY_SIZE(lg->pgdirs)) 522 /* If so, do the update. */ 523 do_set_pte(lg, pgdir, vaddr, gpte); ··· 538 * 539 * So with that in mind here's our code to update a (top-level) PGD entry: 540 */ 541 + void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) 542 { 543 int pgdir; 544 ··· 548 return; 549 550 /* If they're talking about a page table we have a shadow for... */ 551 + pgdir = find_pgdir(lg, gpgdir); 552 if (pgdir < ARRAY_SIZE(lg->pgdirs)) 553 /* ... throw it away. */ 554 release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); ··· 560 * its first page table is. We set some things up here: */ 561 int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) 562 { 563 /* We start on the first shadow page table, and give it a blank PGD 564 * page.
*/ 565 lg->pgdidx = 0; 566 + lg->pgdirs[lg->pgdidx].gpgdir = pgtable; 567 + lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL); 568 if (!lg->pgdirs[lg->pgdidx].pgdir) 569 return -ENOMEM; 570 return 0; 571 + } 572 + 573 + /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 574 + void page_table_guest_data_init(struct lguest *lg) 575 + { 576 + /* We get the kernel address: above this is all kernel memory. */ 577 + if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) 578 + /* We tell the Guest that it can't use the top 4MB of virtual 579 + * addresses used by the Switcher. */ 580 + || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) 581 + || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) 582 + kill_guest(lg, "bad guest page %p", lg->lguest_data); 583 + 584 + /* In flush_user_mappings() we loop from 0 to 585 + * "pgd_index(lg->kernel_address)". This assumes it won't hit the 586 + * Switcher mappings, so check that now. */ 587 + if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) 588 + kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); 589 } 590 591 /* When a Guest dies, our cleanup is fairly simple. */ ··· 594 * for each CPU already set up, we just need to hook them in. */ 595 void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) 596 { 597 + pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 598 + pgd_t switcher_pgd; 599 + pte_t regs_pte; 600 601 /* Make the last PGD entry for this Guest point to the Switcher's PTE 602 * page for this CPU (with appropriate flags). */ 603 + switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); 604 + 605 lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 606 607 /* We also change the Switcher PTE page. When we're running the Guest, ··· 611 * CPU's "struct lguest_pages": if we make sure the Guest's register 612 * page is already mapped there, we don't have to copy them out 613 * again. */ 614 + regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); 615 + switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; 616 } 617 /*:*/ 618 ··· 635 unsigned int pages) 636 { 637 unsigned int i; 638 + pte_t *pte = switcher_pte_page(cpu); 639 640 /* The first entries are easy: they map the Switcher code. */ 641 for (i = 0; i < pages; i++) { 642 + pte[i] = mk_pte(switcher_page[i], 643 + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); 644 } 645 646 /* The only other thing we map is this CPU's pair of pages. */ 647 i = pages + cpu*2; 648 649 /* First page (Guest registers) is writable from the Guest */ 650 + pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), 651 + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); 652 + 653 /* The second page contains the "struct lguest_ro_state", and is 654 * read-only. */ 655 + pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), 656 + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); 657 } 658 659 /*H:510 At boot or module load time, init_pagetables() allocates and populates ··· 662 unsigned int i; 663 664 for_each_possible_cpu(i) { 665 + switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); 666 if (!switcher_pte_page(i)) { 667 free_switcher_pte_pages(); 668 return -ENOMEM;
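The new guest_pa() above is the clearest illustration of the Guest's two-level i386 page table layout: 10 bits of directory index, 10 bits of table index, 12 bits of page offset. Here is the same walk as a self-contained sketch over a flat snapshot of Guest-physical memory; "mem" stands in for the lgread() accessors, and returning 0 stands in for kill_guest(), so treat it as an illustration of the layout rather than the kernel code itself.

#include <stdint.h>

#define PGDIR_SHIFT	22		/* each top-level entry covers 4MB */
#define PAGE_SHIFT	12
#define PTRS_PER_PTE	1024
#define PAGE_MASK	0xFFFFF000u
#define _PAGE_PRESENT	0x1u

/* "mem" is Guest-physical memory viewed as 32-bit words; "gpgdir" is the
 * Guest-physical address of the top-level page, as in lg->pgdirs[].gpgdir. */
static uint32_t walk_guest_pa(const uint32_t *mem, uint32_t gpgdir,
			      uint32_t vaddr)
{
	/* Top level: index by the top 10 bits, as gpgd_addr() does. */
	uint32_t pgd = mem[(gpgdir >> 2) + (vaddr >> PGDIR_SHIFT)];
	uint32_t pte;

	if (!(pgd & _PAGE_PRESENT))
		return 0;	/* guest_pa() would kill_guest() here */

	/* Second level: the PGD holds the frame of a 1024-entry PTE page;
	 * index by the middle 10 bits, as gpte_addr() does. */
	pte = mem[((pgd & PAGE_MASK) >> 2)
		  + ((vaddr >> PAGE_SHIFT) % PTRS_PER_PTE)];
	if (!(pte & _PAGE_PRESENT))
		return 0;

	/* Frame number from the PTE, offset from the low 12 bits. */
	return (pte & PAGE_MASK) | (vaddr & ~PAGE_MASK);
}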
+14 -14
drivers/lguest/segments.c
··· 73 /* Segment descriptors contain a privilege level: the Guest is 74 * sometimes careless and leaves this as 0, even though it's 75 * running at privilege level 1. If so, we fix it here. */ 76 - if ((lg->gdt[i].b & 0x00006000) == 0) 77 - lg->gdt[i].b |= (GUEST_PL << 13); 78 79 /* Each descriptor has an "accessed" bit. If we don't set it 80 * now, the CPU will try to set it when the Guest first loads 81 * that entry into a segment register. But the GDT isn't 82 * writable by the Guest, so bad things can happen. */ 83 - lg->gdt[i].b |= 0x00000100; 84 } 85 } 86 ··· 106 void setup_guest_gdt(struct lguest *lg) 107 { 108 /* Start with full 0-4G segments... */ 109 - lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 110 - lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 111 /* ...except the Guest is allowed to use them, so set the privilege 112 * level appropriately in the flags. */ 113 - lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); 114 - lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 115 } 116 117 /* Like the IDT, we never simply use the GDT the Guest gives us. We set up the ··· 126 unsigned int i; 127 128 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) 129 - gdt[i] = lg->gdt[i]; 130 } 131 132 /* This is the full version */ ··· 138 * replaced. See ignored_gdt() above. */ 139 for (i = 0; i < GDT_ENTRIES; i++) 140 if (!ignored_gdt(i)) 141 - gdt[i] = lg->gdt[i]; 142 } 143 144 /* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */ ··· 146 { 147 /* We assume the Guest has the same number of GDT entries as the 148 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ 149 - if (num > ARRAY_SIZE(lg->gdt)) 150 kill_guest(lg, "too many gdt entries %i", num); 151 152 /* We read the whole thing in, then fix it up. */ 153 - lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); 154 - fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt)); 155 /* Mark that the GDT changed so the core knows it has to copy it again, 156 * even if the Guest is run on the same CPU. */ 157 lg->changed |= CHANGED_GDT; ··· 159 160 void guest_load_tls(struct lguest *lg, unsigned long gtls) 161 { 162 - struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN]; 163 164 - lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 165 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 166 lg->changed |= CHANGED_GDT_TLS; 167 }
··· 73 /* Segment descriptors contain a privilege level: the Guest is 74 * sometimes careless and leaves this as 0, even though it's 75 * running at privilege level 1. If so, we fix it here. */ 76 + if ((lg->arch.gdt[i].b & 0x00006000) == 0) 77 + lg->arch.gdt[i].b |= (GUEST_PL << 13); 78 79 /* Each descriptor has an "accessed" bit. If we don't set it 80 * now, the CPU will try to set it when the Guest first loads 81 * that entry into a segment register. But the GDT isn't 82 * writable by the Guest, so bad things can happen. */ 83 + lg->arch.gdt[i].b |= 0x00000100; 84 } 85 } 86 ··· 106 void setup_guest_gdt(struct lguest *lg) 107 { 108 /* Start with full 0-4G segments... */ 109 + lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 110 + lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 111 /* ...except the Guest is allowed to use them, so set the privilege 112 * level appropriately in the flags. */ 113 + lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); 114 + lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 115 } 116 117 /* Like the IDT, we never simply use the GDT the Guest gives us. We set up the ··· 126 unsigned int i; 127 128 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) 129 + gdt[i] = lg->arch.gdt[i]; 130 } 131 132 /* This is the full version */ ··· 138 * replaced. See ignored_gdt() above. */ 139 for (i = 0; i < GDT_ENTRIES; i++) 140 if (!ignored_gdt(i)) 141 + gdt[i] = lg->arch.gdt[i]; 142 } 143 144 /* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */ ··· 146 { 147 /* We assume the Guest has the same number of GDT entries as the 148 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ 149 + if (num > ARRAY_SIZE(lg->arch.gdt)) 150 kill_guest(lg, "too many gdt entries %i", num); 151 152 /* We read the whole thing in, then fix it up. */ 153 + __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0])); 154 + fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt)); 155 /* Mark that the GDT changed so the core knows it has to copy it again, 156 * even if the Guest is run on the same CPU. */ 157 lg->changed |= CHANGED_GDT; ··· 159 160 void guest_load_tls(struct lguest *lg, unsigned long gtls) 161 { 162 + struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; 163 164 + __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 165 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 166 lg->changed |= CHANGED_GDT_TLS; 167 }
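The privilege-level and "accessed" fixups in fixup_gdt_table() above are pure bit surgery on the high word of an i386 segment descriptor. A standalone sketch of just that surgery, assuming the same two-word descriptor layout as the code above:

#include <stdint.h>

#define GUEST_PL 1			/* the Guest runs at privilege level 1 */

struct demo_desc { uint32_t a, b; };	/* low and high words of a descriptor */

static void demo_fixup(struct demo_desc *d)
{
	/* Bits 13-14 of the high word are the Descriptor Privilege Level.
	 * Zero would let the Guest act as supervisor, so force it to 1. */
	if ((d->b & 0x00006000) == 0)
		d->b |= (GUEST_PL << 13);

	/* Bit 8 is the "accessed" bit.  Pre-set it so the CPU never has to
	 * write it back to the (Guest-read-only) GDT on a segment load. */
	d->b |= 0x00000100;
}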
+4 -3
drivers/lguest/switcher.S drivers/lguest/x86/switcher_32.S
··· 48 #include <linux/linkage.h> 49 #include <asm/asm-offsets.h> 50 #include <asm/page.h> 51 - #include "lg.h" 52 53 // We mark the start of the code to copy 54 // It's placed in .text tho it's never run here ··· 133 // The Guest's register page has been mapped 134 // Writable onto our %esp (stack) -- 135 // We can simply pop off all Guest regs. 136 popl %ebx 137 popl %ecx 138 popl %edx ··· 141 popl %edi 142 popl %ebp 143 popl %gs 144 - popl %eax 145 popl %fs 146 popl %ds 147 popl %es ··· 168 pushl %es; \ 169 pushl %ds; \ 170 pushl %fs; \ 171 - pushl %eax; \ 172 pushl %gs; \ 173 pushl %ebp; \ 174 pushl %edi; \ ··· 175 pushl %edx; \ 176 pushl %ecx; \ 177 pushl %ebx; \ 178 /* Our stack and our code are using segments \ 179 * Set in the TSS and IDT \ 180 * Yet if we were to touch data we'd use \
··· 48 #include <linux/linkage.h> 49 #include <asm/asm-offsets.h> 50 #include <asm/page.h> 51 + #include <asm/segment.h> 52 + #include <asm/lguest.h> 53 54 // We mark the start of the code to copy 55 // It's placed in .text tho it's never run here ··· 132 // The Guest's register page has been mapped 133 // Writable onto our %esp (stack) -- 134 // We can simply pop off all Guest regs. 135 + popl %eax 136 popl %ebx 137 popl %ecx 138 popl %edx ··· 139 popl %edi 140 popl %ebp 141 popl %gs 142 popl %fs 143 popl %ds 144 popl %es ··· 167 pushl %es; \ 168 pushl %ds; \ 169 pushl %fs; \ 170 pushl %gs; \ 171 pushl %ebp; \ 172 pushl %edi; \ ··· 175 pushl %edx; \ 176 pushl %ecx; \ 177 pushl %ebx; \ 178 + pushl %eax; \ 179 /* Our stack and our code are using segments \ 180 * Set in the TSS and IDT \ 181 * Yet if we were to touch data we'd use \
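The reshuffle of %eax above is not cosmetic: the Guest's registers are restored by popping them straight off the mapped register page, so the pop order is the in-memory layout of the saved registers. Popping %eax first lines the first words up with the registers the hypercall ABI uses, which is what lets "struct hcall_args" be overlaid on the register page in x86/core.c below. A sketch of the layout the new order implies; the fields after the segment registers are inferred from the trap-handling comments, so treat this as an assumption rather than the real struct lguest_regs:

struct demo_regs {
	/* Popped first, in this order: */
	unsigned long eax, ebx, ecx, edx;
	unsigned long esi, edi, ebp, gs;	/* %esi is in the elided part */
	unsigned long fs, ds, es;
	/* Not popped: the trap number and error code the Switcher saved
	 * (the "two extra entries" mentioned in x86/core.c below)... */
	unsigned long trapnum, errcode;
	/* ...and the frame the CPU's "iret" consumes on the way back in. */
	unsigned long eip, cs, eflags, esp, ss;
};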
+577
drivers/lguest/x86/core.c
···
··· 1 + /* 2 + * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 3 + * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI. 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms of the GNU General Public License as published by 7 + * the Free Software Foundation; either version 2 of the License, or 8 + * (at your option) any later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but 11 + * WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 13 + * NON INFRINGEMENT. See the GNU General Public License for more 14 + * details. 15 + * 16 + * You should have received a copy of the GNU General Public License 17 + * along with this program; if not, write to the Free Software 18 + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 + */ 20 + #include <linux/kernel.h> 21 + #include <linux/start_kernel.h> 22 + #include <linux/string.h> 23 + #include <linux/console.h> 24 + #include <linux/screen_info.h> 25 + #include <linux/irq.h> 26 + #include <linux/interrupt.h> 27 + #include <linux/clocksource.h> 28 + #include <linux/clockchips.h> 29 + #include <linux/cpu.h> 30 + #include <linux/lguest.h> 31 + #include <linux/lguest_launcher.h> 32 + #include <asm/paravirt.h> 33 + #include <asm/param.h> 34 + #include <asm/page.h> 35 + #include <asm/pgtable.h> 36 + #include <asm/desc.h> 37 + #include <asm/setup.h> 38 + #include <asm/lguest.h> 39 + #include <asm/uaccess.h> 40 + #include <asm/i387.h> 41 + #include "../lg.h" 42 + 43 + static int cpu_had_pge; 44 + 45 + static struct { 46 + unsigned long offset; 47 + unsigned short segment; 48 + } lguest_entry; 49 + 50 + /* Offset from where switcher.S was compiled to where we've copied it */ 51 + static unsigned long switcher_offset(void) 52 + { 53 + return SWITCHER_ADDR - (unsigned long)start_switcher_text; 54 + } 55 + 56 + /* This cpu's struct lguest_pages. */ 57 + static struct lguest_pages *lguest_pages(unsigned int cpu) 58 + { 59 + return &(((struct lguest_pages *) 60 + (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); 61 + } 62 + 63 + static DEFINE_PER_CPU(struct lguest *, last_guest); 64 + 65 + /*S:010 66 + * We are getting close to the Switcher. 67 + * 68 + * Remember that each CPU has two pages which are visible to the Guest when it 69 + * runs on that CPU. This has to contain the state for that Guest: we copy the 70 + * state in just before we run the Guest. 71 + * 72 + * Each Guest has "changed" flags which indicate what has changed in the Guest 73 + * since it last ran. We saw this set in interrupts_and_traps.c and 74 + * segments.c. 75 + */ 76 + static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) 77 + { 78 + /* Copying all this data can be quite expensive. We usually run the 79 + * same Guest we ran last time (and that Guest hasn't run anywhere else 80 + * meanwhile). If that's not the case, we pretend everything in the 81 + * Guest has changed. */ 82 + if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { 83 + __get_cpu_var(last_guest) = lg; 84 + lg->last_pages = pages; 85 + lg->changed = CHANGED_ALL; 86 + } 87 + 88 + /* These copies are pretty cheap, so we do them unconditionally: */ 89 + /* Save the current Host top-level page directory. */ 90 + pages->state.host_cr3 = __pa(current->mm->pgd); 91 + /* Set up the Guest's page tables to see this CPU's pages (and no 92 + * other CPU's pages). 
*/ 93 + map_switcher_in_guest(lg, pages); 94 + /* Set up the two "TSS" members which tell the CPU what stack to use 95 + * for traps which go directly into the Guest (ie. traps at privilege 96 + * level 1). */ 97 + pages->state.guest_tss.esp1 = lg->esp1; 98 + pages->state.guest_tss.ss1 = lg->ss1; 99 + 100 + /* Copy direct-to-Guest trap entries. */ 101 + if (lg->changed & CHANGED_IDT) 102 + copy_traps(lg, pages->state.guest_idt, default_idt_entries); 103 + 104 + /* Copy all GDT entries which the Guest can change. */ 105 + if (lg->changed & CHANGED_GDT) 106 + copy_gdt(lg, pages->state.guest_gdt); 107 + /* If only the TLS entries have changed, copy them. */ 108 + else if (lg->changed & CHANGED_GDT_TLS) 109 + copy_gdt_tls(lg, pages->state.guest_gdt); 110 + 111 + /* Mark the Guest as unchanged for next time. */ 112 + lg->changed = 0; 113 + } 114 + 115 + /* Finally: the code to actually call into the Switcher to run the Guest. */ 116 + static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) 117 + { 118 + /* This is a dummy value we need for GCC's sake. */ 119 + unsigned int clobber; 120 + 121 + /* Copy the guest-specific information into this CPU's "struct 122 + * lguest_pages". */ 123 + copy_in_guest_info(lg, pages); 124 + 125 + /* Set the trap number to 256 (impossible value). If we fault while 126 + * switching to the Guest (bad segment registers or bug), this will 127 + * cause us to abort the Guest. */ 128 + lg->regs->trapnum = 256; 129 + 130 + /* Now: we push the "eflags" register on the stack, then do an "lcall". 131 + * This is how we change from using the kernel code segment to using 132 + * the dedicated lguest code segment, as well as jumping into the 133 + * Switcher. 134 + * 135 + * The lcall also pushes the old code segment (KERNEL_CS) onto the 136 + * stack, then the address of this call. This stack layout happens to 137 + * exactly match the stack of an interrupt... */ 138 + asm volatile("pushf; lcall *lguest_entry" 139 + /* This is how we tell GCC that %eax ("a") and %ebx ("b") 140 + * are changed by this routine. The "=" means output. */ 141 + : "=a"(clobber), "=b"(clobber) 142 + /* %eax contains the pages pointer. ("0" refers to the 143 + * 0-th argument above, ie "a"). %ebx contains the 144 + * physical address of the Guest's top-level page 145 + * directory. */ 146 + : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) 147 + /* We tell gcc that all these registers could change, 148 + * which means we don't have to save and restore them in 149 + * the Switcher. */ 150 + : "memory", "%edx", "%ecx", "%edi", "%esi"); 151 + } 152 + /*:*/ 153 + 154 + /*H:040 This is the i386-specific code to set up and run the Guest. Interrupts 155 + * are disabled: we own the CPU. */ 156 + void lguest_arch_run_guest(struct lguest *lg) 157 + { 158 + /* Remember the awfully-named TS bit? If the Guest has asked 159 + * to set it we set it now, so we can trap and pass that trap 160 + * to the Guest if it uses the FPU. */ 161 + if (lg->ts) 162 + lguest_set_ts(); 163 + 164 + /* SYSENTER is an optimized way of doing system calls. We 165 + * can't allow it because it always jumps to privilege level 0. 166 + * A normal Guest won't try it because we don't advertise it in 167 + * CPUID, but a malicious Guest (or malicious Guest userspace 168 + * program) could, so we tell the CPU to disable it before 169 + * running the Guest. */ 170 + if (boot_cpu_has(X86_FEATURE_SEP)) 171 + wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 172 + 173 + /* Now we actually run the Guest.
It will pop back out when 174 + * something interesting happens, and we can examine its 175 + * registers to see what it was doing. */ 176 + run_guest_once(lg, lguest_pages(raw_smp_processor_id())); 177 + 178 + /* The "regs" pointer contains two extra entries which are not 179 + * really registers: a trap number which says what interrupt or 180 + * trap made the switcher code come back, and an error code 181 + * which some traps set. */ 182 + 183 + /* If the Guest page faulted, then the cr2 register will tell 184 + * us the bad virtual address. We have to grab this now, 185 + * because once we re-enable interrupts an interrupt could 186 + * fault and thus overwrite cr2, or we could even move off to a 187 + * different CPU. */ 188 + if (lg->regs->trapnum == 14) 189 + lg->arch.last_pagefault = read_cr2(); 190 + /* Similarly, if we took a trap because the Guest used the FPU, 191 + * we have to restore the FPU it expects to see. */ 192 + else if (lg->regs->trapnum == 7) 193 + math_state_restore(); 194 + 195 + /* Restore SYSENTER if it's supposed to be on. */ 196 + if (boot_cpu_has(X86_FEATURE_SEP)) 197 + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 198 + } 199 + 200 + /*H:130 Our Guest is usually so well behaved; it never tries to do things it 201 + * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't 202 + * quite complete, because it doesn't contain replacements for the Intel I/O 203 + * instructions. As a result, the Guest sometimes fumbles across one during 204 + * the boot process as it probes for various things which are usually attached 205 + * to a PC. 206 + * 207 + * When the Guest uses one of these instructions, we get trap #13 (General 208 + * Protection Fault) and come here. We see if it's one of those troublesome 209 + * instructions and skip over it. We return true if we did. */ 210 + static int emulate_insn(struct lguest *lg) 211 + { 212 + u8 insn; 213 + unsigned int insnlen = 0, in = 0, shift = 0; 214 + /* The eip contains the *virtual* address of the Guest's instruction: 215 + * guest_pa just subtracts the Guest's page_offset. */ 216 + unsigned long physaddr = guest_pa(lg, lg->regs->eip); 217 + 218 + /* This must be the Guest kernel trying to do something, not userspace! 219 + * The bottom two bits of the CS segment register are the privilege 220 + * level. */ 221 + if ((lg->regs->cs & 3) != GUEST_PL) 222 + return 0; 223 + 224 + /* Decoding x86 instructions is icky. */ 225 + insn = lgread(lg, physaddr, u8); 226 + 227 + /* 0x66 is an "operand prefix". It means it's using the upper 16 bits 228 + of the eax register. */ 229 + if (insn == 0x66) { 230 + shift = 16; 231 + /* The instruction is 1 byte so far, read the next byte. */ 232 + insnlen = 1; 233 + insn = lgread(lg, physaddr + insnlen, u8); 234 + } 235 + 236 + /* We can ignore the lower bit for the moment and decode the 4 opcodes 237 + * we need to emulate. */ 238 + switch (insn & 0xFE) { 239 + case 0xE4: /* in <next byte>,%al */ 240 + insnlen += 2; 241 + in = 1; 242 + break; 243 + case 0xEC: /* in (%dx),%al */ 244 + insnlen += 1; 245 + in = 1; 246 + break; 247 + case 0xE6: /* out %al,<next byte> */ 248 + insnlen += 2; 249 + break; 250 + case 0xEE: /* out %al,(%dx) */ 251 + insnlen += 1; 252 + break; 253 + default: 254 + /* OK, we don't know what this is, can't emulate. */ 255 + return 0; 256 + } 257 + 258 + /* If it was an "IN" instruction, they expect the result to be read 259 + * into %eax, so we change %eax. We always return all-ones, which 260 + * traditionally means "there's nothing there". 
*/
261 + if (in) {
262 + /* Lower bit tells us whether it's a 16 or 32 bit access */
263 + if (insn & 0x1)
264 + lg->regs->eax = 0xFFFFFFFF;
265 + else
266 + lg->regs->eax |= (0xFFFF << shift);
267 + }
268 + /* Finally, we've "done" the instruction, so move past it. */
269 + lg->regs->eip += insnlen;
270 + /* Success! */
271 + return 1;
272 + }
273 +
274 + /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
275 + void lguest_arch_handle_trap(struct lguest *lg)
276 + {
277 + switch (lg->regs->trapnum) {
278 + case 13: /* We've intercepted a GPF. */
279 + /* Check if this was one of those annoying IN or OUT
280 + * instructions which we need to emulate. If so, we
281 + * just go back into the Guest after we've done it. */
282 + if (lg->regs->errcode == 0) {
283 + if (emulate_insn(lg))
284 + return;
285 + }
286 + break;
287 + case 14: /* We've intercepted a page fault. */
288 + /* The Guest accessed a virtual address that wasn't
289 + * mapped. This happens a lot: we don't actually set
290 + * up most of the page tables for the Guest at all when
291 + * we start: as it runs it asks for more and more, and
292 + * we set them up as required. In this case, we don't
293 + * even tell the Guest that the fault happened.
294 + *
295 + * The errcode tells whether this was a read or a
296 + * write, and whether kernel or userspace code. */
297 + if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
298 + return;
299 +
300 + /* OK, it's really not there (or not OK): the Guest
301 + * needs to know. We write out the cr2 value so it
302 + * knows where the fault occurred.
303 + *
304 + * Note that if the Guest were really messed up, this
305 + * could happen before it's done the INITIALIZE
306 + * hypercall, so lg->lguest_data will be NULL. */
307 + if (lg->lguest_data &&
308 + put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
309 + kill_guest(lg, "Writing cr2");
310 + break;
311 + case 7: /* We've intercepted a Device Not Available fault. */
312 + /* If the Guest doesn't want to know, we already
313 + * restored the Floating Point Unit, so we just
314 + * continue without telling it. */
315 + if (!lg->ts)
316 + return;
317 + break;
318 + case 32 ... 255:
319 + /* These values mean a real interrupt occurred, in which case
320 + * the Host handler has already been run. We just do a
321 + * friendly check if another process should now be run, then
322 + * return to run the Guest again. */
323 + cond_resched();
324 + return;
325 + case LGUEST_TRAP_ENTRY:
326 + /* Our 'struct hcall_args' maps directly over our regs: we set
327 + * up the pointer now to indicate a hypercall is pending. */
328 + lg->hcall = (struct hcall_args *)lg->regs;
329 + return;
330 + }
331 +
332 + /* We didn't handle the trap, so it needs to go to the Guest. */
333 + if (!deliver_trap(lg, lg->regs->trapnum))
334 + /* If the Guest doesn't have a handler (either it hasn't
335 + * registered any yet, or it's one of the faults we don't let
336 + * it handle), it dies with a cryptic error message. */
337 + kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
338 + lg->regs->trapnum, lg->regs->eip,
339 + lg->regs->trapnum == 14 ? lg->arch.last_pagefault
340 + : lg->regs->errcode);
341 + }
342 +
343 + /* Now we can look at each of the routines this calls, in increasing order of
344 + * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
345 + * deliver_trap() and demand_page(). After all those, we'll be ready to
346 + * examine the Switcher, and our philosophical understanding of the Host/Guest
347 + * duality will be complete. :*/
348 + static void adjust_pge(void *on)
349 + {
350 + if (on)
351 + write_cr4(read_cr4() | X86_CR4_PGE);
352 + else
353 + write_cr4(read_cr4() & ~X86_CR4_PGE);
354 + }
355 +
356 + /*H:020 Now the Switcher is mapped and everything else is ready, we need to do
357 + * some more i386-specific initialization. */
358 + void __init lguest_arch_host_init(void)
359 + {
360 + int i;
361 +
362 + /* Most of the i386/switcher.S doesn't care that it's been moved; on
363 + * Intel, jumps are relative, and it doesn't access any references to
364 + * external code or data.
365 + *
366 + * The only exception is the interrupt handlers in switcher.S: their
367 + * addresses are placed in a table (default_idt_entries), so we need to
368 + * update the table with the new addresses. switcher_offset() is a
369 + * convenience function which returns the distance between the builtin
370 + * switcher code and the high-mapped copy we just made. */
371 + for (i = 0; i < IDT_ENTRIES; i++)
372 + default_idt_entries[i] += switcher_offset();
373 +
374 + /*
375 + * Set up the Switcher's per-cpu areas.
376 + *
377 + * Each CPU gets two pages of its own within the high-mapped region
378 + * (aka. "struct lguest_pages"). Much of this can be initialized now,
379 + * but some depends on what Guest we are running (which is set up in
380 + * copy_in_guest_info()).
381 + */
382 + for_each_possible_cpu(i) {
383 + /* lguest_pages() returns this CPU's two pages. */
384 + struct lguest_pages *pages = lguest_pages(i);
385 + /* This is a convenience pointer to make the code fit one
386 + * statement to a line. */
387 + struct lguest_ro_state *state = &pages->state;
388 +
389 + /* The Global Descriptor Table: the Host has a different one
390 + * for each CPU. We keep a descriptor for the GDT which says
391 + * where it is and how big it is (the size is actually the last
392 + * byte, not the size, hence the "-1"). */
393 + state->host_gdt_desc.size = GDT_SIZE-1;
394 + state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
395 +
396 + /* All CPUs on the Host use the same Interrupt Descriptor
397 + * Table, so we just use store_idt(), which gets this CPU's IDT
398 + * descriptor. */
399 + store_idt(&state->host_idt_desc);
400 +
401 + /* The descriptors for the Guest's GDT and IDT can be filled
402 + * out now, too. We copy the GDT & IDT into ->guest_gdt and
403 + * ->guest_idt before actually running the Guest. */
404 + state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
405 + state->guest_idt_desc.address = (long)&state->guest_idt;
406 + state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
407 + state->guest_gdt_desc.address = (long)&state->guest_gdt;
408 +
409 + /* We know where we want the stack to be when the Guest enters
410 + * the switcher: in pages->regs. The stack grows downwards, so
411 + * we start it at the end of that structure. */
412 + state->guest_tss.esp0 = (long)(&pages->regs + 1);
413 + /* And this is the GDT entry to use for the stack: we keep a
414 + * couple of special LGUEST entries. */
415 + state->guest_tss.ss0 = LGUEST_DS;
416 +
417 + /* x86 can have a fine-grained bitmap which indicates what I/O
418 + * ports the process can use. We set it to the end of our
419 + * structure, meaning "none". */
420 + state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
421 +
422 + /* Some GDT entries are the same across all Guests, so we can
423 + * set them up now. */
424 + setup_default_gdt_entries(state);
425 + /* Most IDT entries are the same for all Guests, too. */
426 + setup_default_idt_entries(state, default_idt_entries);
427 +
428 + /* The Host needs to be able to use the LGUEST segments on this
429 + * CPU, too, so put them in the Host GDT. */
430 + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
431 + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
432 + }
433 +
434 + /* In the Switcher, we want the %cs segment register to use the
435 + * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
436 + * it will be undisturbed when we switch. To change %cs and jump we
437 + * need this structure to feed to Intel's "lcall" instruction. */
438 + lguest_entry.offset = (long)switch_to_guest + switcher_offset();
439 + lguest_entry.segment = LGUEST_CS;
440 +
441 + /* Finally, we need to turn off "Page Global Enable". PGE is an
442 + * optimization where page table entries are specially marked to show
443 + * they never change. The Host kernel marks all the kernel pages this
444 + * way because it's always present, even when userspace is running.
445 + *
446 + * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
447 + * switch to the Guest kernel. If you don't disable this on all CPUs,
448 + * you'll get really weird bugs that you'll chase for two days.
449 + *
450 + * I used to turn PGE off every time we switched to the Guest and back
451 + * on when we returned, but that slowed the Switcher down noticeably. */
452 +
453 + /* We don't need the complexity of CPUs coming and going while we're
454 + * doing this. */
455 + lock_cpu_hotplug();
456 + if (cpu_has_pge) { /* We have a broader idea of "global". */
457 + /* Remember that this was originally set (for cleanup). */
458 + cpu_had_pge = 1;
459 + /* adjust_pge is a helper function which sets or unsets the PGE
460 + * bit on its CPU, depending on the argument (0 == unset). */
461 + on_each_cpu(adjust_pge, (void *)0, 0, 1);
462 + /* Turn off the feature in the global feature set. */
463 + clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
464 + }
465 + unlock_cpu_hotplug();
466 + }
467 + /*:*/
468 +
469 + void __exit lguest_arch_host_fini(void)
470 + {
471 + /* If we had PGE before we started, turn it back on now. */
472 + lock_cpu_hotplug();
473 + if (cpu_had_pge) {
474 + set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
475 + /* adjust_pge's argument "1" means set PGE. */
476 + on_each_cpu(adjust_pge, (void *)1, 0, 1);
477 + }
478 + unlock_cpu_hotplug();
479 + }
480 +
481 +
482 + /*H:122 The i386-specific hypercalls simply farm out to the right functions. */
483 + int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
484 + {
485 + switch (args->arg0) {
486 + case LHCALL_LOAD_GDT:
487 + load_guest_gdt(lg, args->arg1, args->arg2);
488 + break;
489 + case LHCALL_LOAD_IDT_ENTRY:
490 + load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3);
491 + break;
492 + case LHCALL_LOAD_TLS:
493 + guest_load_tls(lg, args->arg1);
494 + break;
495 + default:
496 + /* Bad Guest. Bad!
*/ 497 + return -EIO; 498 + } 499 + return 0; 500 + } 501 + 502 + /*H:126 i386-specific hypercall initialization: */ 503 + int lguest_arch_init_hypercalls(struct lguest *lg) 504 + { 505 + u32 tsc_speed; 506 + 507 + /* The pointer to the Guest's "struct lguest_data" is the only 508 + * argument. We check that address now. */ 509 + if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data))) 510 + return -EFAULT; 511 + 512 + /* Having checked it, we simply set lg->lguest_data to point straight 513 + * into the Launcher's memory at the right place and then use 514 + * copy_to_user/from_user from now on, instead of lgread/write. I put 515 + * this in to show that I'm not immune to writing stupid 516 + * optimizations. */ 517 + lg->lguest_data = lg->mem_base + lg->hcall->arg1; 518 + 519 + /* We insist that the Time Stamp Counter exist and doesn't change with 520 + * cpu frequency. Some devious chip manufacturers decided that TSC 521 + * changes could be handled in software. I decided that time going 522 + * backwards might be good for benchmarks, but it's bad for users. 523 + * 524 + * We also insist that the TSC be stable: the kernel detects unreliable 525 + * TSCs for its own purposes, and we use that here. */ 526 + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) 527 + tsc_speed = tsc_khz; 528 + else 529 + tsc_speed = 0; 530 + if (put_user(tsc_speed, &lg->lguest_data->tsc_khz)) 531 + return -EFAULT; 532 + 533 + /* The interrupt code might not like the system call vector. */ 534 + if (!check_syscall_vector(lg)) 535 + kill_guest(lg, "bad syscall vector"); 536 + 537 + return 0; 538 + } 539 + /* Now we've examined the hypercall code; our Guest can make requests. There 540 + * is one other way we can do things for the Guest, as we see in 541 + * emulate_insn(). :*/ 542 + 543 + /*L:030 lguest_arch_setup_regs() 544 + * 545 + * Most of the Guest's registers are left alone: we used get_zeroed_page() to 546 + * allocate the structure, so they will be 0. */ 547 + void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) 548 + { 549 + struct lguest_regs *regs = lg->regs; 550 + 551 + /* There are four "segment" registers which the Guest needs to boot: 552 + * The "code segment" register (cs) refers to the kernel code segment 553 + * __KERNEL_CS, and the "data", "extra" and "stack" segment registers 554 + * refer to the kernel data segment __KERNEL_DS. 555 + * 556 + * The privilege level is packed into the lower bits. The Guest runs 557 + * at privilege level 1 (GUEST_PL).*/ 558 + regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; 559 + regs->cs = __KERNEL_CS|GUEST_PL; 560 + 561 + /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) 562 + * is supposed to always be "1". Bit 9 (0x200) controls whether 563 + * interrupts are enabled. We always leave interrupts enabled while 564 + * running the Guest. */ 565 + regs->eflags = 0x202; 566 + 567 + /* The "Extended Instruction Pointer" register says where the Guest is 568 + * running. */ 569 + regs->eip = start; 570 + 571 + /* %esi points to our boot information, at physical address 0, so don't 572 + * touch it. */ 573 + /* There are a couple of GDT entries the Guest expects when first 574 + * booting. */ 575 + 576 + setup_guest_gdt(lg); 577 + }
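
The IN/OUT decode in emulate_insn() above is self-contained enough to try outside the kernel. Here is a minimal user-space sketch of the same check; the function and struct names are mine, and only the opcode values, the 0x66 operand-size prefix and the low-bit size convention come from the code above.

#include <stdio.h>

struct decoded {
	unsigned int len;	/* total instruction length in bytes */
	int in;			/* 1 for IN, 0 for OUT */
};

/* Returns 1 if bytes[] starts with one of the four I/O forms the Host
 * emulates, 0 otherwise. */
static int decode_io_insn(const unsigned char *bytes, struct decoded *d)
{
	unsigned int prefix = 0;

	/* 0x66 is the operand-size prefix; the opcode follows it. */
	if (bytes[0] == 0x66)
		prefix = 1;

	d->in = 0;
	/* Mask off the low bit: it only selects the operand size. */
	switch (bytes[prefix] & 0xFE) {
	case 0xE4:			/* in <next byte>,%al */
		d->len = prefix + 2;
		d->in = 1;
		break;
	case 0xEC:			/* in (%dx),%al */
		d->len = prefix + 1;
		d->in = 1;
		break;
	case 0xE6:			/* out %al,<next byte> */
		d->len = prefix + 2;
		break;
	case 0xEE:			/* out %al,(%dx) */
		d->len = prefix + 1;
		break;
	default:
		return 0;		/* unknown: can't emulate */
	}
	return 1;
}

int main(void)
{
	const unsigned char insn[] = { 0xE4, 0x71 };	/* inb $0x71,%al */
	struct decoded d;

	if (decode_io_insn(insn, &d))
		printf("%s, length %u\n", d.in ? "in" : "out", d.len);
	return 0;
}

Feeding it { 0xE4, 0x71 } prints "in, length 2", which is exactly how far the real code advances eip before returning the all-ones answer.
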
+6
drivers/net/Kconfig
··· 3100 config NET_POLL_CONTROLLER 3101 def_bool NETPOLL 3102 3103 endif # NETDEVICES
··· 3100 config NET_POLL_CONTROLLER 3101 def_bool NETPOLL 3102 3103 + config VIRTIO_NET 3104 + tristate "Virtio network driver (EXPERIMENTAL)" 3105 + depends on EXPERIMENTAL && VIRTIO 3106 + ---help--- 3107 + This is the virtual network driver for lguest. Say Y or M. 3108 + 3109 endif # NETDEVICES
+1 -1
drivers/net/Makefile
··· 183 obj-$(CONFIG_HPLANCE) += hplance.o 7990.o 184 obj-$(CONFIG_MVME147_NET) += mvme147.o 7990.o 185 obj-$(CONFIG_EQUALIZER) += eql.o 186 - obj-$(CONFIG_LGUEST_NET) += lguest_net.o 187 obj-$(CONFIG_MIPS_JAZZ_SONIC) += jazzsonic.o 188 obj-$(CONFIG_MIPS_AU1X00_ENET) += au1000_eth.o 189 obj-$(CONFIG_MIPS_SIM_NET) += mipsnet.o ··· 242 243 obj-$(CONFIG_NETXEN_NIC) += netxen/ 244 obj-$(CONFIG_NIU) += niu.o
··· 183 obj-$(CONFIG_HPLANCE) += hplance.o 7990.o 184 obj-$(CONFIG_MVME147_NET) += mvme147.o 7990.o 185 obj-$(CONFIG_EQUALIZER) += eql.o 186 obj-$(CONFIG_MIPS_JAZZ_SONIC) += jazzsonic.o 187 obj-$(CONFIG_MIPS_AU1X00_ENET) += au1000_eth.o 188 obj-$(CONFIG_MIPS_SIM_NET) += mipsnet.o ··· 243 244 obj-$(CONFIG_NETXEN_NIC) += netxen/ 245 obj-$(CONFIG_NIU) += niu.o 246 + obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
-555
drivers/net/lguest_net.c
··· 1 - /*D:500 2 - * The Guest network driver. 3 - * 4 - * This is very simple a virtual network driver, and our last Guest driver. 5 - * The only trick is that it can talk directly to multiple other recipients 6 - * (ie. other Guests on the same network). It can also be used with only the 7 - * Host on the network. 8 - :*/ 9 - 10 - /* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation 11 - * 12 - * This program is free software; you can redistribute it and/or modify 13 - * it under the terms of the GNU General Public License as published by 14 - * the Free Software Foundation; either version 2 of the License, or 15 - * (at your option) any later version. 16 - * 17 - * This program is distributed in the hope that it will be useful, 18 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 - * GNU General Public License for more details. 21 - * 22 - * You should have received a copy of the GNU General Public License 23 - * along with this program; if not, write to the Free Software 24 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 - */ 26 - //#define DEBUG 27 - #include <linux/netdevice.h> 28 - #include <linux/etherdevice.h> 29 - #include <linux/module.h> 30 - #include <linux/mm_types.h> 31 - #include <linux/io.h> 32 - #include <linux/lguest_bus.h> 33 - 34 - #define SHARED_SIZE PAGE_SIZE 35 - #define MAX_LANS 4 36 - #define NUM_SKBS 8 37 - 38 - /*M:011 Network code master Jeff Garzik points out numerous shortcomings in 39 - * this driver if it aspires to greatness. 40 - * 41 - * Firstly, it doesn't use "NAPI": the networking's New API, and is poorer for 42 - * it. As he says "NAPI means system-wide load leveling, across multiple 43 - * network interfaces. Lack of NAPI can mean competition at higher loads." 44 - * 45 - * He also points out that we don't implement set_mac_address, so users cannot 46 - * change the devices hardware address. When I asked why one would want to: 47 - * "Bonding, and situations where you /do/ want the MAC address to "leak" out 48 - * of the host onto the wider net." 49 - * 50 - * Finally, he would like module unloading: "It is not unrealistic to think of 51 - * [un|re|]loading the net support module in an lguest guest. And, adding 52 - * module support makes the programmer more responsible, because they now have 53 - * to learn to clean up after themselves. Any driver that cannot clean up 54 - * after itself is an incomplete driver in my book." 55 - :*/ 56 - 57 - /*D:530 The "struct lguestnet_info" contains all the information we need to 58 - * know about the network device. */ 59 - struct lguestnet_info 60 - { 61 - /* The mapped device page(s) (an array of "struct lguest_net"). */ 62 - struct lguest_net *peer; 63 - /* The physical address of the device page(s) */ 64 - unsigned long peer_phys; 65 - /* The size of the device page(s). */ 66 - unsigned long mapsize; 67 - 68 - /* The lguest_device I come from */ 69 - struct lguest_device *lgdev; 70 - 71 - /* My peerid (ie. my slot in the array). */ 72 - unsigned int me; 73 - 74 - /* Receive queue: the network packets waiting to be filled. */ 75 - struct sk_buff *skb[NUM_SKBS]; 76 - struct lguest_dma dma[NUM_SKBS]; 77 - }; 78 - /*:*/ 79 - 80 - /* How many bytes left in this page. */ 81 - static unsigned int rest_of_page(void *data) 82 - { 83 - return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE); 84 - } 85 - 86 - /*D:570 Each peer (ie. 
Guest or Host) on the network binds their receive 87 - * buffers to a different key: we simply use the physical address of the 88 - * device's memory page plus the peer number. The Host insists that all keys 89 - * be a multiple of 4, so we multiply the peer number by 4. */ 90 - static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum) 91 - { 92 - return info->peer_phys + 4 * peernum; 93 - } 94 - 95 - /* This is the routine which sets up a "struct lguest_dma" to point to a 96 - * network packet, similar to req_to_dma() in lguest_blk.c. The structure of a 97 - * "struct sk_buff" has grown complex over the years: it consists of a "head" 98 - * linear section pointed to by "skb->data", and possibly an array of 99 - * "fragments" in the case of a non-linear packet. 100 - * 101 - * Our receive buffers don't use fragments at all but outgoing skbs might, so 102 - * we handle it. */ 103 - static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen, 104 - struct lguest_dma *dma) 105 - { 106 - unsigned int i, seg; 107 - 108 - /* First, we put the linear region into the "struct lguest_dma". Each 109 - * entry can't go over a page boundary, so even though all our packets 110 - * are 1514 bytes or less, we might need to use two entries here: */ 111 - for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) { 112 - dma->addr[seg] = virt_to_phys(skb->data + i); 113 - dma->len[seg] = min((unsigned)(headlen - i), 114 - rest_of_page(skb->data + i)); 115 - } 116 - 117 - /* Now we handle the fragments: at least they're guaranteed not to go 118 - * over a page. skb_shinfo(skb) returns a pointer to the structure 119 - * which tells us about the number of fragments and the fragment 120 - * array. */ 121 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) { 122 - const skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 123 - /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */ 124 - if (seg == LGUEST_MAX_DMA_SECTIONS) { 125 - /* We will end up sending a truncated packet should 126 - * this ever happen. Plus, a cool log message! */ 127 - printk("Woah dude! Megapacket!\n"); 128 - break; 129 - } 130 - dma->addr[seg] = page_to_phys(f->page) + f->page_offset; 131 - dma->len[seg] = f->size; 132 - } 133 - 134 - /* If after all that we didn't use the entire "struct lguest_dma" 135 - * array, we terminate it with a 0 length. */ 136 - if (seg < LGUEST_MAX_DMA_SECTIONS) 137 - dma->len[seg] = 0; 138 - } 139 - 140 - /* 141 - * Packet transmission. 142 - * 143 - * Our packet transmission is a little unusual. A real network card would just 144 - * send out the packet and leave the receivers to decide if they're interested. 145 - * Instead, we look through the network device memory page and see if any of 146 - * the ethernet addresses match the packet destination, and if so we send it to 147 - * that Guest. 148 - * 149 - * This is made a little more complicated in two cases. The first case is 150 - * broadcast packets: for that we send the packet to all Guests on the network, 151 - * one at a time. The second case is "promiscuous" mode, where a Guest wants 152 - * to see all the packets on the network. We need a way for the Guest to tell 153 - * us it wants to see all packets, so it sets the "multicast" bit on its 154 - * published MAC address, which is never valid in a real ethernet address. 155 - */ 156 - #define PROMISC_BIT 0x01 157 - 158 - /* This is the callback which is summoned whenever the network device's 159 - * multicast or promiscuous state changes. 
If the card is in promiscuous mode, 160 - * we advertise that in our ethernet address in the device's memory. We do the 161 - * same if Linux wants any or all multicast traffic. */ 162 - static void lguestnet_set_multicast(struct net_device *dev) 163 - { 164 - struct lguestnet_info *info = netdev_priv(dev); 165 - 166 - if ((dev->flags & (IFF_PROMISC|IFF_ALLMULTI)) || dev->mc_count) 167 - info->peer[info->me].mac[0] |= PROMISC_BIT; 168 - else 169 - info->peer[info->me].mac[0] &= ~PROMISC_BIT; 170 - } 171 - 172 - /* A simple test function to see if a peer wants to see all packets.*/ 173 - static int promisc(struct lguestnet_info *info, unsigned int peer) 174 - { 175 - return info->peer[peer].mac[0] & PROMISC_BIT; 176 - } 177 - 178 - /* Another simple function to see if a peer's advertised ethernet address 179 - * matches a packet's destination ethernet address. */ 180 - static int mac_eq(const unsigned char mac[ETH_ALEN], 181 - struct lguestnet_info *info, unsigned int peer) 182 - { 183 - /* Ignore multicast bit, which peer turns on to mean promisc. */ 184 - if ((info->peer[peer].mac[0] & (~PROMISC_BIT)) != mac[0]) 185 - return 0; 186 - return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0; 187 - } 188 - 189 - /* This is the function which actually sends a packet once we've decided a 190 - * peer wants it: */ 191 - static void transfer_packet(struct net_device *dev, 192 - struct sk_buff *skb, 193 - unsigned int peernum) 194 - { 195 - struct lguestnet_info *info = netdev_priv(dev); 196 - struct lguest_dma dma; 197 - 198 - /* We use our handy "struct lguest_dma" packing function to prepare 199 - * the skb for sending. */ 200 - skb_to_dma(skb, skb_headlen(skb), &dma); 201 - pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len); 202 - 203 - /* This is the actual send call which copies the packet. */ 204 - lguest_send_dma(peer_key(info, peernum), &dma); 205 - 206 - /* Check that the entire packet was transmitted. If not, it could mean 207 - * that the other Guest registered a short receive buffer, but this 208 - * driver should never do that. More likely, the peer is dead. */ 209 - if (dma.used_len != skb->len) { 210 - dev->stats.tx_carrier_errors++; 211 - pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n", 212 - peernum, dma.used_len, skb->len, 213 - (void *)dma.addr[0], dma.len[0]); 214 - } else { 215 - /* On success we update the stats. */ 216 - dev->stats.tx_bytes += skb->len; 217 - dev->stats.tx_packets++; 218 - } 219 - } 220 - 221 - /* Another helper function to tell is if a slot in the device memory is unused. 222 - * Since we always set the Local Assignment bit in the ethernet address, the 223 - * first byte can never be 0. */ 224 - static int unused_peer(const struct lguest_net peer[], unsigned int num) 225 - { 226 - return peer[num].mac[0] == 0; 227 - } 228 - 229 - /* Finally, here is the routine which handles an outgoing packet. It's called 230 - * "start_xmit" for traditional reasons. */ 231 - static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev) 232 - { 233 - unsigned int i; 234 - int broadcast; 235 - struct lguestnet_info *info = netdev_priv(dev); 236 - /* Extract the destination ethernet address from the packet. */ 237 - const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; 238 - DECLARE_MAC_BUF(mac); 239 - 240 - pr_debug("%s: xmit %s\n", dev->name, print_mac(mac, dest)); 241 - 242 - /* If it's a multicast packet, we broadcast to everyone. 
That's not 243 - * very efficient, but there are very few applications which actually 244 - * use multicast, which is a shame really. 245 - * 246 - * As etherdevice.h points out: "By definition the broadcast address is 247 - * also a multicast address." So we don't have to test for broadcast 248 - * packets separately. */ 249 - broadcast = is_multicast_ether_addr(dest); 250 - 251 - /* Look through all the published ethernet addresses to see if we 252 - * should send this packet. */ 253 - for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) { 254 - /* We don't send to ourselves (we actually can't SEND_DMA to 255 - * ourselves anyway), and don't send to unused slots.*/ 256 - if (i == info->me || unused_peer(info->peer, i)) 257 - continue; 258 - 259 - /* If it's broadcast we send it. If they want every packet we 260 - * send it. If the destination matches their address we send 261 - * it. Otherwise we go to the next peer. */ 262 - if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i)) 263 - continue; 264 - 265 - pr_debug("lguestnet %s: sending from %i to %i\n", 266 - dev->name, info->me, i); 267 - /* Our routine which actually does the transfer. */ 268 - transfer_packet(dev, skb, i); 269 - } 270 - 271 - /* An xmit routine is expected to dispose of the packet, so we do. */ 272 - dev_kfree_skb(skb); 273 - 274 - /* As per kernel convention, 0 means success. This is why I love 275 - * networking: even if we never sent to anyone, that's still 276 - * success! */ 277 - return 0; 278 - } 279 - 280 - /*D:560 281 - * Packet receiving. 282 - * 283 - * First, here's a helper routine which fills one of our array of receive 284 - * buffers: */ 285 - static int fill_slot(struct net_device *dev, unsigned int slot) 286 - { 287 - struct lguestnet_info *info = netdev_priv(dev); 288 - 289 - /* We can receive ETH_DATA_LEN (1500) byte packets, plus a standard 290 - * ethernet header of ETH_HLEN (14) bytes. */ 291 - info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN); 292 - if (!info->skb[slot]) { 293 - printk("%s: could not fill slot %i\n", dev->name, slot); 294 - return -ENOMEM; 295 - } 296 - 297 - /* skb_to_dma() is a helper which sets up the "struct lguest_dma" to 298 - * point to the data in the skb: we also use it for sending out a 299 - * packet. */ 300 - skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]); 301 - 302 - /* This is a Write Memory Barrier: it ensures that the entry in the 303 - * receive buffer array is written *before* we set the "used_len" entry 304 - * to 0. If the Host were looking at the receive buffer array from a 305 - * different CPU, it could potentially see "used_len = 0" and not see 306 - * the updated receive buffer information. This would be a horribly 307 - * nasty bug, so make sure the compiler and CPU know this has to happen 308 - * first. */ 309 - wmb(); 310 - /* Writing 0 to "used_len" tells the Host it can use this receive 311 - * buffer now. */ 312 - info->dma[slot].used_len = 0; 313 - return 0; 314 - } 315 - 316 - /* This is the actual receive routine. When we receive an interrupt from the 317 - * Host to tell us a packet has been delivered, we arrive here: */ 318 - static irqreturn_t lguestnet_rcv(int irq, void *dev_id) 319 - { 320 - struct net_device *dev = dev_id; 321 - struct lguestnet_info *info = netdev_priv(dev); 322 - unsigned int i, done = 0; 323 - 324 - /* Look through our entire receive array for an entry which has data 325 - * in it. 
*/ 326 - for (i = 0; i < ARRAY_SIZE(info->dma); i++) { 327 - unsigned int length; 328 - struct sk_buff *skb; 329 - 330 - length = info->dma[i].used_len; 331 - if (length == 0) 332 - continue; 333 - 334 - /* We've found one! Remember the skb (we grabbed the length 335 - * above), and immediately refill the slot we've taken it 336 - * from. */ 337 - done++; 338 - skb = info->skb[i]; 339 - fill_slot(dev, i); 340 - 341 - /* This shouldn't happen: micropackets could be sent by a 342 - * badly-behaved Guest on the network, but the Host will never 343 - * stuff more data in the buffer than the buffer length. */ 344 - if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) { 345 - pr_debug(KERN_WARNING "%s: unbelievable skb len: %i\n", 346 - dev->name, length); 347 - dev_kfree_skb(skb); 348 - continue; 349 - } 350 - 351 - /* skb_put(), what a great function! I've ranted about this 352 - * function before (http://lkml.org/lkml/1999/9/26/24). You 353 - * call it after you've added data to the end of an skb (in 354 - * this case, it was the Host which wrote the data). */ 355 - skb_put(skb, length); 356 - 357 - /* The ethernet header contains a protocol field: we use the 358 - * standard helper to extract it, and place the result in 359 - * skb->protocol. The helper also sets up skb->pkt_type and 360 - * eats up the ethernet header from the front of the packet. */ 361 - skb->protocol = eth_type_trans(skb, dev); 362 - 363 - /* If this device doesn't need checksums for sending, we also 364 - * don't need to check the packets when they come in. */ 365 - if (dev->features & NETIF_F_NO_CSUM) 366 - skb->ip_summed = CHECKSUM_UNNECESSARY; 367 - 368 - /* As a last resort for debugging the driver or the lguest I/O 369 - * subsystem, you can uncomment the "#define DEBUG" at the top 370 - * of this file, which turns all the pr_debug() into printk() 371 - * and floods the logs. */ 372 - pr_debug("Receiving skb proto 0x%04x len %i type %i\n", 373 - ntohs(skb->protocol), skb->len, skb->pkt_type); 374 - 375 - /* Update the packet and byte counts (visible from ifconfig, 376 - * and good for debugging). */ 377 - dev->stats.rx_bytes += skb->len; 378 - dev->stats.rx_packets++; 379 - 380 - /* Hand our fresh network packet into the stack's "network 381 - * interface receive" routine. That will free the packet 382 - * itself when it's finished. */ 383 - netif_rx(skb); 384 - } 385 - 386 - /* If we found any packets, we assume the interrupt was for us. */ 387 - return done ? IRQ_HANDLED : IRQ_NONE; 388 - } 389 - 390 - /*D:550 This is where we start: when the device is brought up by dhcpd or 391 - * ifconfig. At this point we advertise our MAC address to the rest of the 392 - * network, and register receive buffers ready for incoming packets. */ 393 - static int lguestnet_open(struct net_device *dev) 394 - { 395 - int i; 396 - struct lguestnet_info *info = netdev_priv(dev); 397 - 398 - /* Copy our MAC address into the device page, so others on the network 399 - * can find us. */ 400 - memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN); 401 - 402 - /* We might already be in promisc mode (dev->flags & IFF_PROMISC). Our 403 - * set_multicast callback handles this already, so we call it now. */ 404 - lguestnet_set_multicast(dev); 405 - 406 - /* Allocate packets and put them into our "struct lguest_dma" array. 407 - * If we fail to allocate all the packets we could still limp along, 408 - * but it's a sign of real stress so we should probably give up now. 
*/ 409 - for (i = 0; i < ARRAY_SIZE(info->dma); i++) { 410 - if (fill_slot(dev, i) != 0) 411 - goto cleanup; 412 - } 413 - 414 - /* Finally we tell the Host where our array of "struct lguest_dma" 415 - * receive buffers is, binding it to the key corresponding to the 416 - * device's physical memory plus our peerid. */ 417 - if (lguest_bind_dma(peer_key(info,info->me), info->dma, 418 - NUM_SKBS, lgdev_irq(info->lgdev)) != 0) 419 - goto cleanup; 420 - return 0; 421 - 422 - cleanup: 423 - while (--i >= 0) 424 - dev_kfree_skb(info->skb[i]); 425 - return -ENOMEM; 426 - } 427 - /*:*/ 428 - 429 - /* The close routine is called when the device is no longer in use: we clean up 430 - * elegantly. */ 431 - static int lguestnet_close(struct net_device *dev) 432 - { 433 - unsigned int i; 434 - struct lguestnet_info *info = netdev_priv(dev); 435 - 436 - /* Clear all trace of our existence out of the device memory by setting 437 - * the slot which held our MAC address to 0 (unused). */ 438 - memset(&info->peer[info->me], 0, sizeof(info->peer[info->me])); 439 - 440 - /* Unregister our array of receive buffers */ 441 - lguest_unbind_dma(peer_key(info, info->me), info->dma); 442 - for (i = 0; i < ARRAY_SIZE(info->dma); i++) 443 - dev_kfree_skb(info->skb[i]); 444 - return 0; 445 - } 446 - 447 - /*D:510 The network device probe function is basically a standard ethernet 448 - * device setup. It reads the "struct lguest_device_desc" and sets the "struct 449 - * net_device". Oh, the line-by-line excitement! Let's skip over it. :*/ 450 - static int lguestnet_probe(struct lguest_device *lgdev) 451 - { 452 - int err, irqf = IRQF_SHARED; 453 - struct net_device *dev; 454 - struct lguestnet_info *info; 455 - struct lguest_device_desc *desc = &lguest_devices[lgdev->index]; 456 - 457 - pr_debug("lguest_net: probing for device %i\n", lgdev->index); 458 - 459 - dev = alloc_etherdev(sizeof(struct lguestnet_info)); 460 - if (!dev) 461 - return -ENOMEM; 462 - 463 - /* Ethernet defaults with some changes */ 464 - ether_setup(dev); 465 - dev->set_mac_address = NULL; 466 - 467 - dev->dev_addr[0] = 0x02; /* set local assignment bit (IEEE802) */ 468 - dev->dev_addr[1] = 0x00; 469 - memcpy(&dev->dev_addr[2], &lguest_data.guestid, 2); 470 - dev->dev_addr[4] = 0x00; 471 - dev->dev_addr[5] = 0x00; 472 - 473 - dev->open = lguestnet_open; 474 - dev->stop = lguestnet_close; 475 - dev->hard_start_xmit = lguestnet_start_xmit; 476 - 477 - /* We don't actually support multicast yet, but turning on/off 478 - * promisc also calls dev->set_multicast_list. */ 479 - dev->set_multicast_list = lguestnet_set_multicast; 480 - SET_NETDEV_DEV(dev, &lgdev->dev); 481 - 482 - /* The network code complains if you have "scatter-gather" capability 483 - * if you don't also handle checksums (it seem that would be 484 - * "illogical"). So we use a lie of omission and don't tell it that we 485 - * can handle scattered packets unless we also don't want checksums, 486 - * even though to us they're completely independent. */ 487 - if (desc->features & LGUEST_NET_F_NOCSUM) 488 - dev->features = NETIF_F_SG|NETIF_F_NO_CSUM; 489 - 490 - info = netdev_priv(dev); 491 - info->mapsize = PAGE_SIZE * desc->num_pages; 492 - info->peer_phys = ((unsigned long)desc->pfn << PAGE_SHIFT); 493 - info->lgdev = lgdev; 494 - info->peer = lguest_map(info->peer_phys, desc->num_pages); 495 - if (!info->peer) { 496 - err = -ENOMEM; 497 - goto free; 498 - } 499 - 500 - /* This stores our peerid (upper bits reserved for future). 
*/ 501 - info->me = (desc->features & (info->mapsize-1)); 502 - 503 - err = register_netdev(dev); 504 - if (err) { 505 - pr_debug("lguestnet: registering device failed\n"); 506 - goto unmap; 507 - } 508 - 509 - if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS) 510 - irqf |= IRQF_SAMPLE_RANDOM; 511 - if (request_irq(lgdev_irq(lgdev), lguestnet_rcv, irqf, "lguestnet", 512 - dev) != 0) { 513 - pr_debug("lguestnet: cannot get irq %i\n", lgdev_irq(lgdev)); 514 - goto unregister; 515 - } 516 - 517 - pr_debug("lguestnet: registered device %s\n", dev->name); 518 - /* Finally, we put the "struct net_device" in the generic "struct 519 - * lguest_device"s private pointer. Again, it's not necessary, but 520 - * makes sure the cool kernel kids don't tease us. */ 521 - lgdev->private = dev; 522 - return 0; 523 - 524 - unregister: 525 - unregister_netdev(dev); 526 - unmap: 527 - lguest_unmap(info->peer); 528 - free: 529 - free_netdev(dev); 530 - return err; 531 - } 532 - 533 - static struct lguest_driver lguestnet_drv = { 534 - .name = "lguestnet", 535 - .owner = THIS_MODULE, 536 - .device_type = LGUEST_DEVICE_T_NET, 537 - .probe = lguestnet_probe, 538 - }; 539 - 540 - static __init int lguestnet_init(void) 541 - { 542 - return register_lguest_driver(&lguestnet_drv); 543 - } 544 - module_init(lguestnet_init); 545 - 546 - MODULE_DESCRIPTION("Lguest network driver"); 547 - MODULE_LICENSE("GPL"); 548 - 549 - /*D:580 550 - * This is the last of the Drivers, and with this we have covered the many and 551 - * wonderous and fine (and boring) details of the Guest. 552 - * 553 - * "make Launcher" beckons, where we answer questions like "Where do Guests 554 - * come from?", and "What do you do when someone asks for optimization?" 555 - */
···
+435
drivers/net/virtio_net.c
···
··· 1 + /* A simple network driver using virtio. 2 + * 3 + * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms of the GNU General Public License as published by 7 + * the Free Software Foundation; either version 2 of the License, or 8 + * (at your option) any later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, 11 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 + * GNU General Public License for more details. 14 + * 15 + * You should have received a copy of the GNU General Public License 16 + * along with this program; if not, write to the Free Software 17 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 + */ 19 + //#define DEBUG 20 + #include <linux/netdevice.h> 21 + #include <linux/etherdevice.h> 22 + #include <linux/module.h> 23 + #include <linux/virtio.h> 24 + #include <linux/virtio_net.h> 25 + #include <linux/scatterlist.h> 26 + 27 + /* FIXME: MTU in config. */ 28 + #define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN) 29 + 30 + struct virtnet_info 31 + { 32 + struct virtio_device *vdev; 33 + struct virtqueue *rvq, *svq; 34 + struct net_device *dev; 35 + struct napi_struct napi; 36 + 37 + /* Number of input buffers, and max we've ever had. */ 38 + unsigned int num, max; 39 + 40 + /* Receive & send queues. */ 41 + struct sk_buff_head recv; 42 + struct sk_buff_head send; 43 + }; 44 + 45 + static inline struct virtio_net_hdr *skb_vnet_hdr(struct sk_buff *skb) 46 + { 47 + return (struct virtio_net_hdr *)skb->cb; 48 + } 49 + 50 + static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb) 51 + { 52 + sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr)); 53 + } 54 + 55 + static bool skb_xmit_done(struct virtqueue *rvq) 56 + { 57 + struct virtnet_info *vi = rvq->vdev->priv; 58 + 59 + /* In case we were waiting for output buffers. 
*/ 60 + netif_wake_queue(vi->dev); 61 + return true; 62 + } 63 + 64 + static void receive_skb(struct net_device *dev, struct sk_buff *skb, 65 + unsigned len) 66 + { 67 + struct virtio_net_hdr *hdr = skb_vnet_hdr(skb); 68 + 69 + if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) { 70 + pr_debug("%s: short packet %i\n", dev->name, len); 71 + dev->stats.rx_length_errors++; 72 + goto drop; 73 + } 74 + len -= sizeof(struct virtio_net_hdr); 75 + BUG_ON(len > MAX_PACKET_LEN); 76 + 77 + skb_trim(skb, len); 78 + skb->protocol = eth_type_trans(skb, dev); 79 + pr_debug("Receiving skb proto 0x%04x len %i type %i\n", 80 + ntohs(skb->protocol), skb->len, skb->pkt_type); 81 + dev->stats.rx_bytes += skb->len; 82 + dev->stats.rx_packets++; 83 + 84 + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { 85 + pr_debug("Needs csum!\n"); 86 + skb->ip_summed = CHECKSUM_PARTIAL; 87 + skb->csum_start = hdr->csum_start; 88 + skb->csum_offset = hdr->csum_offset; 89 + if (skb->csum_start > skb->len - 2 90 + || skb->csum_offset > skb->len - 2) { 91 + if (net_ratelimit()) 92 + printk(KERN_WARNING "%s: csum=%u/%u len=%u\n", 93 + dev->name, skb->csum_start, 94 + skb->csum_offset, skb->len); 95 + goto frame_err; 96 + } 97 + } 98 + 99 + if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { 100 + pr_debug("GSO!\n"); 101 + switch (hdr->gso_type) { 102 + case VIRTIO_NET_HDR_GSO_TCPV4: 103 + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; 104 + break; 105 + case VIRTIO_NET_HDR_GSO_TCPV4_ECN: 106 + skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN; 107 + break; 108 + case VIRTIO_NET_HDR_GSO_UDP: 109 + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 110 + break; 111 + case VIRTIO_NET_HDR_GSO_TCPV6: 112 + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; 113 + break; 114 + default: 115 + if (net_ratelimit()) 116 + printk(KERN_WARNING "%s: bad gso type %u.\n", 117 + dev->name, hdr->gso_type); 118 + goto frame_err; 119 + } 120 + 121 + skb_shinfo(skb)->gso_size = hdr->gso_size; 122 + if (skb_shinfo(skb)->gso_size == 0) { 123 + if (net_ratelimit()) 124 + printk(KERN_WARNING "%s: zero gso size.\n", 125 + dev->name); 126 + goto frame_err; 127 + } 128 + 129 + /* Header must be checked, and gso_segs computed. */ 130 + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 131 + skb_shinfo(skb)->gso_segs = 0; 132 + } 133 + 134 + netif_receive_skb(skb); 135 + return; 136 + 137 + frame_err: 138 + dev->stats.rx_frame_errors++; 139 + drop: 140 + dev_kfree_skb(skb); 141 + } 142 + 143 + static void try_fill_recv(struct virtnet_info *vi) 144 + { 145 + struct sk_buff *skb; 146 + struct scatterlist sg[1+MAX_SKB_FRAGS]; 147 + int num, err; 148 + 149 + for (;;) { 150 + skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN); 151 + if (unlikely(!skb)) 152 + break; 153 + 154 + skb_put(skb, MAX_PACKET_LEN); 155 + vnet_hdr_to_sg(sg, skb); 156 + num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; 157 + skb_queue_head(&vi->recv, skb); 158 + 159 + err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb); 160 + if (err) { 161 + skb_unlink(skb, &vi->recv); 162 + kfree_skb(skb); 163 + break; 164 + } 165 + vi->num++; 166 + } 167 + if (unlikely(vi->num > vi->max)) 168 + vi->max = vi->num; 169 + vi->rvq->vq_ops->kick(vi->rvq); 170 + } 171 + 172 + static bool skb_recv_done(struct virtqueue *rvq) 173 + { 174 + struct virtnet_info *vi = rvq->vdev->priv; 175 + netif_rx_schedule(vi->dev, &vi->napi); 176 + /* Suppress further interrupts. 
*/ 177 + return false; 178 + } 179 + 180 + static int virtnet_poll(struct napi_struct *napi, int budget) 181 + { 182 + struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi); 183 + struct sk_buff *skb = NULL; 184 + unsigned int len, received = 0; 185 + 186 + again: 187 + while (received < budget && 188 + (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) { 189 + __skb_unlink(skb, &vi->recv); 190 + receive_skb(vi->dev, skb, len); 191 + vi->num--; 192 + received++; 193 + } 194 + 195 + /* FIXME: If we oom and completely run out of inbufs, we need 196 + * to start a timer trying to fill more. */ 197 + if (vi->num < vi->max / 2) 198 + try_fill_recv(vi); 199 + 200 + /* All done? */ 201 + if (!skb) { 202 + netif_rx_complete(vi->dev, napi); 203 + if (unlikely(!vi->rvq->vq_ops->restart(vi->rvq)) 204 + && netif_rx_reschedule(vi->dev, napi)) 205 + goto again; 206 + } 207 + 208 + return received; 209 + } 210 + 211 + static void free_old_xmit_skbs(struct virtnet_info *vi) 212 + { 213 + struct sk_buff *skb; 214 + unsigned int len; 215 + 216 + while ((skb = vi->svq->vq_ops->get_buf(vi->svq, &len)) != NULL) { 217 + pr_debug("Sent skb %p\n", skb); 218 + __skb_unlink(skb, &vi->send); 219 + vi->dev->stats.tx_bytes += len; 220 + vi->dev->stats.tx_packets++; 221 + kfree_skb(skb); 222 + } 223 + } 224 + 225 + static int start_xmit(struct sk_buff *skb, struct net_device *dev) 226 + { 227 + struct virtnet_info *vi = netdev_priv(dev); 228 + int num, err; 229 + struct scatterlist sg[1+MAX_SKB_FRAGS]; 230 + struct virtio_net_hdr *hdr; 231 + const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; 232 + DECLARE_MAC_BUF(mac); 233 + 234 + pr_debug("%s: xmit %p %s\n", dev->name, skb, print_mac(mac, dest)); 235 + 236 + free_old_xmit_skbs(vi); 237 + 238 + /* Encode metadata header at front. */ 239 + hdr = skb_vnet_hdr(skb); 240 + if (skb->ip_summed == CHECKSUM_PARTIAL) { 241 + hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; 242 + hdr->csum_start = skb->csum_start - skb_headroom(skb); 243 + hdr->csum_offset = skb->csum_offset; 244 + } else { 245 + hdr->flags = 0; 246 + hdr->csum_offset = hdr->csum_start = 0; 247 + } 248 + 249 + if (skb_is_gso(skb)) { 250 + hdr->gso_size = skb_shinfo(skb)->gso_size; 251 + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) 252 + hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN; 253 + else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) 254 + hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; 255 + else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) 256 + hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; 257 + else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) 258 + hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; 259 + else 260 + BUG(); 261 + } else { 262 + hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; 263 + hdr->gso_size = 0; 264 + } 265 + 266 + vnet_hdr_to_sg(sg, skb); 267 + num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; 268 + __skb_queue_head(&vi->send, skb); 269 + err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb); 270 + if (err) { 271 + pr_debug("%s: virtio not prepared to send\n", dev->name); 272 + skb_unlink(skb, &vi->send); 273 + netif_stop_queue(dev); 274 + return NETDEV_TX_BUSY; 275 + } 276 + vi->svq->vq_ops->kick(vi->svq); 277 + 278 + return 0; 279 + } 280 + 281 + static int virtnet_open(struct net_device *dev) 282 + { 283 + struct virtnet_info *vi = netdev_priv(dev); 284 + 285 + try_fill_recv(vi); 286 + 287 + /* If we didn't even get one input buffer, we're useless. 
*/ 288 + if (vi->num == 0) 289 + return -ENOMEM; 290 + 291 + napi_enable(&vi->napi); 292 + return 0; 293 + } 294 + 295 + static int virtnet_close(struct net_device *dev) 296 + { 297 + struct virtnet_info *vi = netdev_priv(dev); 298 + struct sk_buff *skb; 299 + 300 + napi_disable(&vi->napi); 301 + 302 + /* networking core has neutered skb_xmit_done/skb_recv_done, so don't 303 + * worry about races vs. get(). */ 304 + vi->rvq->vq_ops->shutdown(vi->rvq); 305 + while ((skb = __skb_dequeue(&vi->recv)) != NULL) { 306 + kfree_skb(skb); 307 + vi->num--; 308 + } 309 + vi->svq->vq_ops->shutdown(vi->svq); 310 + while ((skb = __skb_dequeue(&vi->send)) != NULL) 311 + kfree_skb(skb); 312 + 313 + BUG_ON(vi->num != 0); 314 + return 0; 315 + } 316 + 317 + static int virtnet_probe(struct virtio_device *vdev) 318 + { 319 + int err; 320 + unsigned int len; 321 + struct net_device *dev; 322 + struct virtnet_info *vi; 323 + void *token; 324 + 325 + /* Allocate ourselves a network device with room for our info */ 326 + dev = alloc_etherdev(sizeof(struct virtnet_info)); 327 + if (!dev) 328 + return -ENOMEM; 329 + 330 + /* Set up network device as normal. */ 331 + ether_setup(dev); 332 + dev->open = virtnet_open; 333 + dev->stop = virtnet_close; 334 + dev->hard_start_xmit = start_xmit; 335 + dev->features = NETIF_F_HIGHDMA; 336 + SET_NETDEV_DEV(dev, &vdev->dev); 337 + 338 + /* Do we support "hardware" checksums? */ 339 + token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_F, &len); 340 + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_NO_CSUM)) { 341 + /* This opens up the world of extra features. */ 342 + dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; 343 + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4)) 344 + dev->features |= NETIF_F_TSO; 345 + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_UFO)) 346 + dev->features |= NETIF_F_UFO; 347 + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4_ECN)) 348 + dev->features |= NETIF_F_TSO_ECN; 349 + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO6)) 350 + dev->features |= NETIF_F_TSO6; 351 + } 352 + 353 + /* Configuration may specify what MAC to use. Otherwise random. */ 354 + token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_MAC_F, &len); 355 + if (token) { 356 + dev->addr_len = len; 357 + vdev->config->get(vdev, token, dev->dev_addr, len); 358 + } else 359 + random_ether_addr(dev->dev_addr); 360 + 361 + /* Set up our device-specific information */ 362 + vi = netdev_priv(dev); 363 + netif_napi_add(dev, &vi->napi, virtnet_poll, 16); 364 + vi->dev = dev; 365 + vi->vdev = vdev; 366 + 367 + /* We expect two virtqueues, receive then send. */ 368 + vi->rvq = vdev->config->find_vq(vdev, skb_recv_done); 369 + if (IS_ERR(vi->rvq)) { 370 + err = PTR_ERR(vi->rvq); 371 + goto free; 372 + } 373 + 374 + vi->svq = vdev->config->find_vq(vdev, skb_xmit_done); 375 + if (IS_ERR(vi->svq)) { 376 + err = PTR_ERR(vi->svq); 377 + goto free_recv; 378 + } 379 + 380 + /* Initialize our empty receive and send queues. 
*/ 381 + skb_queue_head_init(&vi->recv); 382 + skb_queue_head_init(&vi->send); 383 + 384 + err = register_netdev(dev); 385 + if (err) { 386 + pr_debug("virtio_net: registering device failed\n"); 387 + goto free_send; 388 + } 389 + pr_debug("virtnet: registered device %s\n", dev->name); 390 + vdev->priv = vi; 391 + return 0; 392 + 393 + free_send: 394 + vdev->config->del_vq(vi->svq); 395 + free_recv: 396 + vdev->config->del_vq(vi->rvq); 397 + free: 398 + free_netdev(dev); 399 + return err; 400 + } 401 + 402 + static void virtnet_remove(struct virtio_device *vdev) 403 + { 404 + unregister_netdev(vdev->priv); 405 + free_netdev(vdev->priv); 406 + } 407 + 408 + static struct virtio_device_id id_table[] = { 409 + { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, 410 + { 0 }, 411 + }; 412 + 413 + static struct virtio_driver virtio_net = { 414 + .driver.name = KBUILD_MODNAME, 415 + .driver.owner = THIS_MODULE, 416 + .id_table = id_table, 417 + .probe = virtnet_probe, 418 + .remove = __devexit_p(virtnet_remove), 419 + }; 420 + 421 + static int __init init(void) 422 + { 423 + return register_virtio_driver(&virtio_net); 424 + } 425 + 426 + static void __exit fini(void) 427 + { 428 + unregister_virtio_driver(&virtio_net); 429 + } 430 + module_init(init); 431 + module_exit(fini); 432 + 433 + MODULE_DEVICE_TABLE(virtio, id_table); 434 + MODULE_DESCRIPTION("Virtio network driver"); 435 + MODULE_LICENSE("GPL");
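
start_xmit() above packs everything the receiving side needs to finish a packet (checksum placement, GSO type and size) into a small header carried in skb->cb. A user-space sketch of that encoding follows; the struct only mirrors the fields the driver touches, and its layout and the constant values are assumptions for illustration, not copied from linux/virtio_net.h.

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins; the real values live in linux/virtio_net.h. */
#define HDR_F_NEEDS_CSUM	1
#define HDR_GSO_NONE		0

/* Only the fields the driver reads and writes; the real layout may differ. */
struct vnet_hdr_sketch {
	uint8_t  flags;
	uint8_t  gso_type;
	uint16_t gso_size;	/* bytes per segment when GSO is on */
	uint16_t csum_start;	/* checksum from this offset... */
	uint16_t csum_offset;	/* ...stored this far past csum_start */
};

/* Mirror of the non-GSO branch of start_xmit(): either ask the peer to
 * finish a partial checksum, or mark the packet as needing nothing. */
static void encode_tx_hdr(struct vnet_hdr_sketch *h, int partial_csum,
			  uint16_t start, uint16_t offset)
{
	if (partial_csum) {
		h->flags = HDR_F_NEEDS_CSUM;
		h->csum_start = start;
		h->csum_offset = offset;
	} else {
		h->flags = 0;
		h->csum_start = h->csum_offset = 0;
	}
	h->gso_type = HDR_GSO_NONE;
	h->gso_size = 0;
}

int main(void)
{
	struct vnet_hdr_sketch h;

	/* e.g. TCP over IPv4: checksum from the TCP header at byte 34,
	 * result placed 16 bytes into it. */
	encode_tx_hdr(&h, 1, 34, 16);
	printf("flags=%u csum_start=%u csum_offset=%u\n",
	       h.flags, h.csum_start, h.csum_offset);
	return 0;
}

The receive path in receive_skb() is the mirror image: it trusts these fields only after range-checking them against the packet length, which is why the csum_start/csum_offset sanity check exists.
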
+8
drivers/virtio/Kconfig
···
··· 1 + # Virtio always gets selected by whoever wants it. 2 + config VIRTIO 3 + bool 4 + 5 + # Similarly the virtio ring implementation. 6 + config VIRTIO_RING 7 + bool 8 + depends on VIRTIO
+2
drivers/virtio/Makefile
···
··· 1 + obj-$(CONFIG_VIRTIO) += virtio.o 2 + obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
+13
drivers/virtio/config.c
···
··· 1 + /* Configuration space parsing helpers for virtio. 2 + * 3 + * The configuration is [type][len][... len bytes ...] fields. 4 + * 5 + * Copyright 2007 Rusty Russell, IBM Corporation. 6 + * GPL v2 or later. 7 + */ 8 + #include <linux/err.h> 9 + #include <linux/virtio.h> 10 + #include <linux/virtio_config.h> 11 + #include <linux/bug.h> 12 + #include <asm/system.h> 13 +
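
The header comment describes the config space as a sequence of [type][len][... len bytes ...] fields. Here is a sketch of the linear scan a find()-style helper would do over such a buffer; it assumes one-byte type and length fields, which is my reading of the comment rather than something the patch spells out.

#include <stdio.h>
#include <stddef.h>

/* Scan a [type][len][payload] buffer for a field of the given type.
 * Returns a pointer to the payload and sets *len, or NULL. */
static const unsigned char *config_find(const unsigned char *buf,
					size_t size, unsigned char type,
					unsigned int *len)
{
	size_t i = 0;

	while (i + 2 <= size) {
		unsigned char t = buf[i];
		unsigned char l = buf[i + 1];

		if (i + 2 + l > size)
			break;			/* truncated field */
		if (t == type) {
			*len = l;
			return buf + i + 2;	/* point at the payload */
		}
		i += 2 + l;			/* skip to the next field */
	}
	return NULL;
}

int main(void)
{
	/* Two fields: type 1 carrying 2 bytes, type 2 carrying 1 byte. */
	const unsigned char space[] = { 1, 2, 0xAA, 0xBB, 2, 1, 0x42 };
	unsigned int len;
	const unsigned char *p = config_find(space, sizeof(space), 2, &len);

	if (p)
		printf("type 2: len=%u first byte=0x%02x\n", len, p[0]);
	return 0;
}

__virtio_config_val() in virtio.c below is the type-safe wrapper over exactly this kind of lookup: find the field, insist the length matches, then copy it out.
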
+189
drivers/virtio/virtio.c
···
··· 1 + #include <linux/virtio.h>
2 + #include <linux/spinlock.h>
3 + #include <linux/virtio_config.h>
4 +
5 + static ssize_t device_show(struct device *_d,
6 + struct device_attribute *attr, char *buf)
7 + {
8 + struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
9 + return sprintf(buf, "%hu", dev->id.device);
10 + }
11 + static ssize_t vendor_show(struct device *_d,
12 + struct device_attribute *attr, char *buf)
13 + {
14 + struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
15 + return sprintf(buf, "%hu", dev->id.vendor);
16 + }
17 + static ssize_t status_show(struct device *_d,
18 + struct device_attribute *attr, char *buf)
19 + {
20 + struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
21 + return sprintf(buf, "0x%08x", dev->config->get_status(dev));
22 + }
23 + static ssize_t modalias_show(struct device *_d,
24 + struct device_attribute *attr, char *buf)
25 + {
26 + struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
27 +
28 + return sprintf(buf, "virtio:d%08Xv%08X\n",
29 + dev->id.device, dev->id.vendor);
30 + }
31 + static struct device_attribute virtio_dev_attrs[] = {
32 + __ATTR_RO(device),
33 + __ATTR_RO(vendor),
34 + __ATTR_RO(status),
35 + __ATTR_RO(modalias),
36 + __ATTR_NULL
37 + };
38 +
39 + static inline int virtio_id_match(const struct virtio_device *dev,
40 + const struct virtio_device_id *id)
41 + {
42 + if (id->device != dev->id.device)
43 + return 0;
44 +
45 + return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor;
46 + }
47 +
48 + /* This looks through all the IDs a driver claims to support. If any of them
49 + * match, we return 1 and the kernel will call virtio_dev_probe(). */
50 + static int virtio_dev_match(struct device *_dv, struct device_driver *_dr)
51 + {
52 + unsigned int i;
53 + struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);
54 + const struct virtio_device_id *ids;
55 +
56 + ids = container_of(_dr, struct virtio_driver, driver)->id_table;
57 + for (i = 0; ids[i].device; i++)
58 + if (virtio_id_match(dev, &ids[i]))
59 + return 1;
60 + return 0;
61 + }
62 +
63 + static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env)
64 + {
65 + struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);
66 +
67 + return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X",
68 + dev->id.device, dev->id.vendor);
69 + }
70 +
71 + static struct bus_type virtio_bus = {
72 + .name = "virtio",
73 + .match = virtio_dev_match,
74 + .dev_attrs = virtio_dev_attrs,
75 + .uevent = virtio_uevent,
76 + };
77 +
78 + static void add_status(struct virtio_device *dev, unsigned status)
79 + {
80 + dev->config->set_status(dev, dev->config->get_status(dev) | status);
81 + }
82 +
83 + static int virtio_dev_probe(struct device *_d)
84 + {
85 + int err;
86 + struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
87 + struct virtio_driver *drv = container_of(dev->dev.driver,
88 + struct virtio_driver, driver);
89 +
90 + add_status(dev, VIRTIO_CONFIG_S_DRIVER);
91 + err = drv->probe(dev);
92 + if (err)
93 + add_status(dev, VIRTIO_CONFIG_S_FAILED);
94 + else
95 + add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
96 + return err;
97 + }
98 +
99 + int register_virtio_driver(struct virtio_driver *driver)
100 + {
101 + driver->driver.bus = &virtio_bus;
102 + driver->driver.probe = virtio_dev_probe;
103 + return driver_register(&driver->driver);
104 + }
105 + EXPORT_SYMBOL_GPL(register_virtio_driver);
106 +
107 + void unregister_virtio_driver(struct
virtio_driver *driver)
108 + {
109 + driver_unregister(&driver->driver);
110 + }
111 + EXPORT_SYMBOL_GPL(unregister_virtio_driver);
112 +
113 + int register_virtio_device(struct virtio_device *dev)
114 + {
115 + int err;
116 +
117 + dev->dev.bus = &virtio_bus;
118 + sprintf(dev->dev.bus_id, "%u", dev->index);
119 +
120 + /* Acknowledge that we've seen the device. */
121 + add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
122 +
123 + /* device_register() causes the bus infrastructure to look for a
124 + * matching driver. */
125 + err = device_register(&dev->dev);
126 + if (err)
127 + add_status(dev, VIRTIO_CONFIG_S_FAILED);
128 + return err;
129 + }
130 + EXPORT_SYMBOL_GPL(register_virtio_device);
131 +
132 + void unregister_virtio_device(struct virtio_device *dev)
133 + {
134 + device_unregister(&dev->dev);
135 + }
136 + EXPORT_SYMBOL_GPL(unregister_virtio_device);
137 +
138 + int __virtio_config_val(struct virtio_device *vdev,
139 + u8 type, void *val, size_t size)
140 + {
141 + void *token;
142 + unsigned int len;
143 +
144 + token = vdev->config->find(vdev, type, &len);
145 + if (!token)
146 + return -ENOENT;
147 +
148 + if (len != size)
149 + return -EIO;
150 +
151 + vdev->config->get(vdev, token, val, size);
152 + return 0;
153 + }
154 + EXPORT_SYMBOL_GPL(__virtio_config_val);
155 +
156 + int virtio_use_bit(struct virtio_device *vdev,
157 + void *token, unsigned int len, unsigned int bitnum)
158 + {
159 + unsigned long bits[16];
160 +
161 + /* This makes it convenient to pass-through find() results. */
162 + if (!token)
163 + return 0;
164 +
165 + /* bit not in range of the feature half of this bitfield? */
166 + if (bitnum >= len * 8 / 2)
167 + return 0;
168 +
169 + /* Giant feature bitfields are silly. */
170 + BUG_ON(len > sizeof(bits));
171 + vdev->config->get(vdev, token, bits, len);
172 +
173 + if (!test_bit(bitnum, bits))
174 + return 0;
175 +
176 + /* Set acknowledge bit, and write it back. */
177 + set_bit(bitnum + len * 8 / 2, bits);
178 + vdev->config->set(vdev, token, bits, len);
179 + return 1;
180 + }
181 + EXPORT_SYMBOL_GPL(virtio_use_bit);
182 +
183 + static int virtio_init(void)
184 + {
185 + if (bus_register(&virtio_bus) != 0)
186 + panic("virtio bus registration failed");
187 + return 0;
188 + }
189 + core_initcall(virtio_init);
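
virtio_use_bit() above treats the feature field as two halves: the device publishes feature bits in the lower half, and the driver writes back an acknowledgment bit in the upper half, offset by len * 8 / 2. A self-contained model of that handshake (the function name and the byte-wise layout here are illustrative):

#include <stdio.h>

/* Test a feature bit in the lower half of 'field'; on success, set the
 * matching acknowledge bit in the upper half, as the driver does. */
static int use_bit(unsigned char *field, unsigned int len_bytes,
		   unsigned int bitnum)
{
	unsigned int half_bits = len_bytes * 8 / 2;

	if (bitnum >= half_bits)
		return 0;			/* outside the feature half */
	if (!(field[bitnum / 8] & (1u << (bitnum % 8))))
		return 0;			/* device doesn't offer it */

	/* Acknowledge: set the mirror bit in the second half. */
	bitnum += half_bits;
	field[bitnum / 8] |= 1u << (bitnum % 8);
	return 1;
}

int main(void)
{
	unsigned char field[2] = { 0x05, 0x00 };	/* offers bits 0 and 2 */

	printf("bit 0: %d\n", use_bit(field, sizeof(field), 0));	/* 1 */
	printf("bit 1: %d\n", use_bit(field, sizeof(field), 1));	/* 0 */
	printf("ack half now 0x%02x\n", field[1]);			/* 0x01 */
	return 0;
}

This is what virtnet_probe() is doing with its virtio_use_bit(vdev, token, len, VIRTIO_NET_F_*) calls: each feature the driver turns on is simultaneously acknowledged back to the Host.
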
+313
drivers/virtio/virtio_ring.c
···
··· 1 + /* Virtio ring implementation. 2 + * 3 + * Copyright 2007 Rusty Russell IBM Corporation 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms of the GNU General Public License as published by 7 + * the Free Software Foundation; either version 2 of the License, or 8 + * (at your option) any later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, 11 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 + * GNU General Public License for more details. 14 + * 15 + * You should have received a copy of the GNU General Public License 16 + * along with this program; if not, write to the Free Software 17 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 + */ 19 + #include <linux/virtio.h> 20 + #include <linux/virtio_ring.h> 21 + #include <linux/device.h> 22 + 23 + #ifdef DEBUG 24 + /* For development, we want to crash whenever the ring is screwed. */ 25 + #define BAD_RING(vq, fmt...) \ 26 + do { dev_err(&vq->vq.vdev->dev, fmt); BUG(); } while(0) 27 + #define START_USE(vq) \ 28 + do { if ((vq)->in_use) panic("in_use = %i\n", (vq)->in_use); (vq)->in_use = __LINE__; mb(); } while(0) 29 + #define END_USE(vq) \ 30 + do { BUG_ON(!(vq)->in_use); (vq)->in_use = 0; mb(); } while(0) 31 + #else 32 + #define BAD_RING(vq, fmt...) \ 33 + do { dev_err(&vq->vq.vdev->dev, fmt); (vq)->broken = true; } while(0) 34 + #define START_USE(vq) 35 + #define END_USE(vq) 36 + #endif 37 + 38 + struct vring_virtqueue 39 + { 40 + struct virtqueue vq; 41 + 42 + /* Actual memory layout for this queue */ 43 + struct vring vring; 44 + 45 + /* Other side has made a mess, don't try any more. */ 46 + bool broken; 47 + 48 + /* Number of free buffers */ 49 + unsigned int num_free; 50 + /* Head of free buffer list. */ 51 + unsigned int free_head; 52 + /* Number we've added since last sync. */ 53 + unsigned int num_added; 54 + 55 + /* Last used index we've seen. */ 56 + unsigned int last_used_idx; 57 + 58 + /* How to notify other side. FIXME: commonalize hcalls! */ 59 + void (*notify)(struct virtqueue *vq); 60 + 61 + #ifdef DEBUG 62 + /* They're supposed to lock for us. */ 63 + unsigned int in_use; 64 + #endif 65 + 66 + /* Tokens for callbacks. */ 67 + void *data[]; 68 + }; 69 + 70 + #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) 71 + 72 + static int vring_add_buf(struct virtqueue *_vq, 73 + struct scatterlist sg[], 74 + unsigned int out, 75 + unsigned int in, 76 + void *data) 77 + { 78 + struct vring_virtqueue *vq = to_vvq(_vq); 79 + unsigned int i, avail, head, uninitialized_var(prev); 80 + 81 + BUG_ON(data == NULL); 82 + BUG_ON(out + in > vq->vring.num); 83 + BUG_ON(out + in == 0); 84 + 85 + START_USE(vq); 86 + 87 + if (vq->num_free < out + in) { 88 + pr_debug("Can't add buf len %i - avail = %i\n", 89 + out + in, vq->num_free); 90 + END_USE(vq); 91 + return -ENOSPC; 92 + } 93 + 94 + /* We're about to use some buffers from the free list. 
*/ 95 + vq->num_free -= out + in; 96 + 97 + head = vq->free_head; 98 + for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { 99 + vq->vring.desc[i].flags = VRING_DESC_F_NEXT; 100 + vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT) 101 + + sg->offset; 102 + vq->vring.desc[i].len = sg->length; 103 + prev = i; 104 + sg++; 105 + } 106 + for (; in; i = vq->vring.desc[i].next, in--) { 107 + vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; 108 + vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT) 109 + + sg->offset; 110 + vq->vring.desc[i].len = sg->length; 111 + prev = i; 112 + sg++; 113 + } 114 + /* Last one doesn't continue. */ 115 + vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; 116 + 117 + /* Update free pointer */ 118 + vq->free_head = i; 119 + 120 + /* Set token. */ 121 + vq->data[head] = data; 122 + 123 + /* Put entry in available array (but don't update avail->idx until they 124 + * do sync). FIXME: avoid modulus here? */ 125 + avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num; 126 + vq->vring.avail->ring[avail] = head; 127 + 128 + pr_debug("Added buffer head %i to %p\n", head, vq); 129 + END_USE(vq); 130 + return 0; 131 + } 132 + 133 + static void vring_kick(struct virtqueue *_vq) 134 + { 135 + struct vring_virtqueue *vq = to_vvq(_vq); 136 + START_USE(vq); 137 + /* Descriptors and available array need to be set before we expose the 138 + * new available array entries. */ 139 + wmb(); 140 + 141 + vq->vring.avail->idx += vq->num_added; 142 + vq->num_added = 0; 143 + 144 + /* Need to update avail index before checking if we should notify */ 145 + mb(); 146 + 147 + if (!(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY)) 148 + /* Prod other side to tell it about changes. */ 149 + vq->notify(&vq->vq); 150 + 151 + END_USE(vq); 152 + } 153 + 154 + static void detach_buf(struct vring_virtqueue *vq, unsigned int head) 155 + { 156 + unsigned int i; 157 + 158 + /* Clear data ptr. */ 159 + vq->data[head] = NULL; 160 + 161 + /* Put back on free list: find end */ 162 + i = head; 163 + while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) { 164 + i = vq->vring.desc[i].next; 165 + vq->num_free++; 166 + } 167 + 168 + vq->vring.desc[i].next = vq->free_head; 169 + vq->free_head = head; 170 + /* Plus final descriptor */ 171 + vq->num_free++; 172 + } 173 + 174 + /* FIXME: We need to tell other side about removal, to synchronize. */ 175 + static void vring_shutdown(struct virtqueue *_vq) 176 + { 177 + struct vring_virtqueue *vq = to_vvq(_vq); 178 + unsigned int i; 179 + 180 + for (i = 0; i < vq->vring.num; i++) 181 + detach_buf(vq, i); 182 + } 183 + 184 + static inline bool more_used(const struct vring_virtqueue *vq) 185 + { 186 + return vq->last_used_idx != vq->vring.used->idx; 187 + } 188 + 189 + static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len) 190 + { 191 + struct vring_virtqueue *vq = to_vvq(_vq); 192 + void *ret; 193 + unsigned int i; 194 + 195 + START_USE(vq); 196 + 197 + if (!more_used(vq)) { 198 + pr_debug("No more buffers in queue\n"); 199 + END_USE(vq); 200 + return NULL; 201 + } 202 + 203 + i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id; 204 + *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len; 205 + 206 + if (unlikely(i >= vq->vring.num)) { 207 + BAD_RING(vq, "id %u out of range\n", i); 208 + return NULL; 209 + } 210 + if (unlikely(!vq->data[i])) { 211 + BAD_RING(vq, "id %u is not a head!\n", i); 212 + return NULL; 213 + } 214 + 215 + /* detach_buf clears data, so grab it now. 
*/ 216 + ret = vq->data[i]; 217 + detach_buf(vq, i); 218 + vq->last_used_idx++; 219 + END_USE(vq); 220 + return ret; 221 + } 222 + 223 + static bool vring_restart(struct virtqueue *_vq) 224 + { 225 + struct vring_virtqueue *vq = to_vvq(_vq); 226 + 227 + START_USE(vq); 228 + BUG_ON(!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)); 229 + 230 + /* We optimistically turn back on interrupts, then check if there was 231 + * more to do. */ 232 + vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; 233 + mb(); 234 + if (unlikely(more_used(vq))) { 235 + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; 236 + END_USE(vq); 237 + return false; 238 + } 239 + 240 + END_USE(vq); 241 + return true; 242 + } 243 + 244 + irqreturn_t vring_interrupt(int irq, void *_vq) 245 + { 246 + struct vring_virtqueue *vq = to_vvq(_vq); 247 + 248 + if (!more_used(vq)) { 249 + pr_debug("virtqueue interrupt with no work for %p\n", vq); 250 + return IRQ_NONE; 251 + } 252 + 253 + if (unlikely(vq->broken)) 254 + return IRQ_HANDLED; 255 + 256 + pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); 257 + if (vq->vq.callback && !vq->vq.callback(&vq->vq)) 258 + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; 259 + 260 + return IRQ_HANDLED; 261 + } 262 + 263 + static struct virtqueue_ops vring_vq_ops = { 264 + .add_buf = vring_add_buf, 265 + .get_buf = vring_get_buf, 266 + .kick = vring_kick, 267 + .restart = vring_restart, 268 + .shutdown = vring_shutdown, 269 + }; 270 + 271 + struct virtqueue *vring_new_virtqueue(unsigned int num, 272 + struct virtio_device *vdev, 273 + void *pages, 274 + void (*notify)(struct virtqueue *), 275 + bool (*callback)(struct virtqueue *)) 276 + { 277 + struct vring_virtqueue *vq; 278 + unsigned int i; 279 + 280 + vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL); 281 + if (!vq) 282 + return NULL; 283 + 284 + vring_init(&vq->vring, num, pages); 285 + vq->vq.callback = callback; 286 + vq->vq.vdev = vdev; 287 + vq->vq.vq_ops = &vring_vq_ops; 288 + vq->notify = notify; 289 + vq->broken = false; 290 + vq->last_used_idx = 0; 291 + vq->num_added = 0; 292 + #ifdef DEBUG 293 + vq->in_use = false; 294 + #endif 295 + 296 + /* No callback? Tell other side not to bother us. */ 297 + if (!callback) 298 + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; 299 + 300 + /* Put everything in free lists. */ 301 + vq->num_free = num; 302 + vq->free_head = 0; 303 + for (i = 0; i < num-1; i++) 304 + vq->vring.desc[i].next = i+1; 305 + 306 + return &vq->vq; 307 + } 308 + 309 + void vring_del_virtqueue(struct virtqueue *vq) 310 + { 311 + kfree(to_vvq(vq)); 312 + } 313 +
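A transport typically builds one of these per device virtqueue: allocate the ring pages, hand them to vring_new_virtqueue() with a notify hook, and point the device's interrupt at vring_interrupt(). A sketch under those assumptions, where lg_notify stands in for the transport's real kick (eg. a hypercall):

#include <linux/err.h>
#include <linux/interrupt.h>
#include <linux/virtio.h>
#include <linux/virtio_ring.h>

static void lg_notify(struct virtqueue *vq)
{
        /* Transport-specific: tell the other side the avail ring moved. */
}

static struct virtqueue *setup_vq(struct virtio_device *vdev,
                                  unsigned int num, void *pages, int irq,
                                  bool (*callback)(struct virtqueue *))
{
        struct virtqueue *vq;
        int err;

        vq = vring_new_virtqueue(num, vdev, pages, lg_notify, callback);
        if (!vq)
                return ERR_PTR(-ENOMEM);

        /* vring_interrupt() runs the vq callback if there are used buffers. */
        err = request_irq(irq, vring_interrupt, 0, "virtio", vq);
        if (err) {
                vring_del_virtqueue(vq);
                return ERR_PTR(err);
        }
        return vq;
}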
+3
include/asm-x86/Kbuild
··· 1 include include/asm-generic/Kbuild.asm 2 3 header-y += boot.h 4 header-y += debugreg.h 5 header-y += ldt.h 6 header-y += msr-index.h ··· 15 unifdef-y += a.out_64.h 16 unifdef-y += byteorder_32.h 17 unifdef-y += byteorder_64.h 18 unifdef-y += elf_32.h 19 unifdef-y += elf_64.h 20 unifdef-y += mce.h 21 unifdef-y += msgbuf_32.h 22 unifdef-y += msgbuf_64.h
··· 1 include include/asm-generic/Kbuild.asm 2 3 header-y += boot.h 4 + header-y += bootparam.h 5 header-y += debugreg.h 6 header-y += ldt.h 7 header-y += msr-index.h ··· 14 unifdef-y += a.out_64.h 15 unifdef-y += byteorder_32.h 16 unifdef-y += byteorder_64.h 17 + unifdef-y += e820.h 18 unifdef-y += elf_32.h 19 unifdef-y += elf_64.h 20 + unifdef-y += ist.h 21 unifdef-y += mce.h 22 unifdef-y += msgbuf_32.h 23 unifdef-y += msgbuf_64.h
+54 -54
include/asm-x86/bootparam.h
··· 10 #include <video/edid.h> 11 12 struct setup_header { 13 - u8 setup_sects; 14 - u16 root_flags; 15 - u32 syssize; 16 - u16 ram_size; 17 #define RAMDISK_IMAGE_START_MASK 0x07FF 18 #define RAMDISK_PROMPT_FLAG 0x8000 19 #define RAMDISK_LOAD_FLAG 0x4000 20 - u16 vid_mode; 21 - u16 root_dev; 22 - u16 boot_flag; 23 - u16 jump; 24 - u32 header; 25 - u16 version; 26 - u32 realmode_swtch; 27 - u16 start_sys; 28 - u16 kernel_version; 29 - u8 type_of_loader; 30 - u8 loadflags; 31 #define LOADED_HIGH (1<<0) 32 #define KEEP_SEGMENTS (1<<6) 33 #define CAN_USE_HEAP (1<<7) 34 - u16 setup_move_size; 35 - u32 code32_start; 36 - u32 ramdisk_image; 37 - u32 ramdisk_size; 38 - u32 bootsect_kludge; 39 - u16 heap_end_ptr; 40 - u16 _pad1; 41 - u32 cmd_line_ptr; 42 - u32 initrd_addr_max; 43 - u32 kernel_alignment; 44 - u8 relocatable_kernel; 45 - u8 _pad2[3]; 46 - u32 cmdline_size; 47 - u32 hardware_subarch; 48 - u64 hardware_subarch_data; 49 } __attribute__((packed)); 50 51 struct sys_desc_table { 52 - u16 length; 53 - u8 table[14]; 54 }; 55 56 struct efi_info { 57 - u32 _pad1; 58 - u32 efi_systab; 59 - u32 efi_memdesc_size; 60 - u32 efi_memdesc_version; 61 - u32 efi_memmap; 62 - u32 efi_memmap_size; 63 - u32 _pad2[2]; 64 }; 65 66 /* The so-called "zeropage" */ 67 struct boot_params { 68 struct screen_info screen_info; /* 0x000 */ 69 struct apm_bios_info apm_bios_info; /* 0x040 */ 70 - u8 _pad2[12]; /* 0x054 */ 71 struct ist_info ist_info; /* 0x060 */ 72 - u8 _pad3[16]; /* 0x070 */ 73 - u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ 74 - u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ 75 struct sys_desc_table sys_desc_table; /* 0x0a0 */ 76 - u8 _pad4[144]; /* 0x0b0 */ 77 struct edid_info edid_info; /* 0x140 */ 78 struct efi_info efi_info; /* 0x1c0 */ 79 - u32 alt_mem_k; /* 0x1e0 */ 80 - u32 scratch; /* Scratch field! */ /* 0x1e4 */ 81 - u8 e820_entries; /* 0x1e8 */ 82 - u8 eddbuf_entries; /* 0x1e9 */ 83 - u8 edd_mbr_sig_buf_entries; /* 0x1ea */ 84 - u8 _pad6[6]; /* 0x1eb */ 85 struct setup_header hdr; /* setup header */ /* 0x1f1 */ 86 - u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; 87 - u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ 88 struct e820entry e820_map[E820MAX]; /* 0x2d0 */ 89 - u8 _pad8[48]; /* 0xcd0 */ 90 struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */ 91 - u8 _pad9[276]; /* 0xeec */ 92 } __attribute__((packed)); 93 94 #endif /* _ASM_BOOTPARAM_H */
··· 10 #include <video/edid.h> 11 12 struct setup_header { 13 + __u8 setup_sects; 14 + __u16 root_flags; 15 + __u32 syssize; 16 + __u16 ram_size; 17 #define RAMDISK_IMAGE_START_MASK 0x07FF 18 #define RAMDISK_PROMPT_FLAG 0x8000 19 #define RAMDISK_LOAD_FLAG 0x4000 20 + __u16 vid_mode; 21 + __u16 root_dev; 22 + __u16 boot_flag; 23 + __u16 jump; 24 + __u32 header; 25 + __u16 version; 26 + __u32 realmode_swtch; 27 + __u16 start_sys; 28 + __u16 kernel_version; 29 + __u8 type_of_loader; 30 + __u8 loadflags; 31 #define LOADED_HIGH (1<<0) 32 #define KEEP_SEGMENTS (1<<6) 33 #define CAN_USE_HEAP (1<<7) 34 + __u16 setup_move_size; 35 + __u32 code32_start; 36 + __u32 ramdisk_image; 37 + __u32 ramdisk_size; 38 + __u32 bootsect_kludge; 39 + __u16 heap_end_ptr; 40 + __u16 _pad1; 41 + __u32 cmd_line_ptr; 42 + __u32 initrd_addr_max; 43 + __u32 kernel_alignment; 44 + __u8 relocatable_kernel; 45 + __u8 _pad2[3]; 46 + __u32 cmdline_size; 47 + __u32 hardware_subarch; 48 + __u64 hardware_subarch_data; 49 } __attribute__((packed)); 50 51 struct sys_desc_table { 52 + __u16 length; 53 + __u8 table[14]; 54 }; 55 56 struct efi_info { 57 + __u32 _pad1; 58 + __u32 efi_systab; 59 + __u32 efi_memdesc_size; 60 + __u32 efi_memdesc_version; 61 + __u32 efi_memmap; 62 + __u32 efi_memmap_size; 63 + __u32 _pad2[2]; 64 }; 65 66 /* The so-called "zeropage" */ 67 struct boot_params { 68 struct screen_info screen_info; /* 0x000 */ 69 struct apm_bios_info apm_bios_info; /* 0x040 */ 70 + __u8 _pad2[12]; /* 0x054 */ 71 struct ist_info ist_info; /* 0x060 */ 72 + __u8 _pad3[16]; /* 0x070 */ 73 + __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ 74 + __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ 75 struct sys_desc_table sys_desc_table; /* 0x0a0 */ 76 + __u8 _pad4[144]; /* 0x0b0 */ 77 struct edid_info edid_info; /* 0x140 */ 78 struct efi_info efi_info; /* 0x1c0 */ 79 + __u32 alt_mem_k; /* 0x1e0 */ 80 + __u32 scratch; /* Scratch field! */ /* 0x1e4 */ 81 + __u8 e820_entries; /* 0x1e8 */ 82 + __u8 eddbuf_entries; /* 0x1e9 */ 83 + __u8 edd_mbr_sig_buf_entries; /* 0x1ea */ 84 + __u8 _pad6[6]; /* 0x1eb */ 85 struct setup_header hdr; /* setup header */ /* 0x1f1 */ 86 + __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; 87 + __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ 88 struct e820entry e820_map[E820MAX]; /* 0x2d0 */ 89 + __u8 _pad8[48]; /* 0xcd0 */ 90 struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */ 91 + __u8 _pad9[276]; /* 0xeec */ 92 } __attribute__((packed)); 93 94 #endif /* _ASM_BOOTPARAM_H */
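The u8 to __u8 churn is what makes this header exportable: the double-underscore types come from <linux/types.h> and exist outside #ifdef __KERNEL__, where plain u8/u16 do not. With the Kbuild entries above in place, userspace (the Launcher, say) can include it directly; a sketch, assuming the exported headers are on the include path:

#include <stdio.h>
#include <asm/bootparam.h>      /* pulls in screen_info.h, e820.h, edd.h... */

int main(void)
{
        /* The packed layout is now visible to userspace verbatim. */
        printf("struct boot_params: %zu bytes (setup_header at 0x1f1)\n",
               sizeof(struct boot_params));
        return 0;
}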
+28
include/asm-x86/e820.h
··· 1 #ifdef CONFIG_X86_32 2 # include "e820_32.h" 3 #else 4 # include "e820_64.h" 5 #endif
··· 1 + #ifndef __ASM_E820_H 2 + #define __ASM_E820_H 3 + #define E820MAP 0x2d0 /* our map */ 4 + #define E820MAX 128 /* number of entries in E820MAP */ 5 + #define E820NR 0x1e8 /* # entries in E820MAP */ 6 + 7 + #define E820_RAM 1 8 + #define E820_RESERVED 2 9 + #define E820_ACPI 3 10 + #define E820_NVS 4 11 + 12 + #ifndef __ASSEMBLY__ 13 + struct e820entry { 14 + __u64 addr; /* start of memory segment */ 15 + __u64 size; /* size of memory segment */ 16 + __u32 type; /* type of memory segment */ 17 + } __attribute__((packed)); 18 + 19 + struct e820map { 20 + __u32 nr_map; 21 + struct e820entry map[E820MAX]; 22 + }; 23 + #endif /* __ASSEMBLY__ */ 24 + 25 + #ifdef __KERNEL__ 26 #ifdef CONFIG_X86_32 27 # include "e820_32.h" 28 #else 29 # include "e820_64.h" 30 #endif 31 + #endif /* __KERNEL__ */ 32 + 33 + #endif /* __ASM_E820_H */
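Hoisting struct e820entry and the E820_* types out from under __KERNEL__ (and out of the 32/64 variants) is what lets the Launcher hand the Guest its memory map directly. A sketch of the simplest possible map, a single RAM region, roughly what the example launcher does:

#include <asm/bootparam.h>
#include <asm/e820.h>

/* Tell the Guest it has one flat region of RAM, from 0 to "mem" bytes. */
static void setup_e820(struct boot_params *boot, unsigned long mem)
{
        boot->e820_entries = 1;
        boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
}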
-21
include/asm-x86/e820_32.h
··· 12 #ifndef __E820_HEADER 13 #define __E820_HEADER 14 15 - #define E820MAP 0x2d0 /* our map */ 16 - #define E820MAX 128 /* number of entries in E820MAP */ 17 - #define E820NR 0x1e8 /* # entries in E820MAP */ 18 - 19 - #define E820_RAM 1 20 - #define E820_RESERVED 2 21 - #define E820_ACPI 3 22 - #define E820_NVS 4 23 - 24 #define HIGH_MEMORY (1024*1024) 25 26 #ifndef __ASSEMBLY__ 27 - 28 - struct e820entry { 29 - u64 addr; /* start of memory segment */ 30 - u64 size; /* size of memory segment */ 31 - u32 type; /* type of memory segment */ 32 - } __attribute__((packed)); 33 - 34 - struct e820map { 35 - u32 nr_map; 36 - struct e820entry map[E820MAX]; 37 - }; 38 39 extern struct e820map e820; 40 ··· 36 #endif 37 38 #endif/*!__ASSEMBLY__*/ 39 - 40 #endif/*__E820_HEADER*/
··· 12 #ifndef __E820_HEADER 13 #define __E820_HEADER 14 15 #define HIGH_MEMORY (1024*1024) 16 17 #ifndef __ASSEMBLY__ 18 19 extern struct e820map e820; 20 ··· 56 #endif 57 58 #endif/*!__ASSEMBLY__*/ 59 #endif/*__E820_HEADER*/
-20
include/asm-x86/e820_64.h
··· 11 #ifndef __E820_HEADER 12 #define __E820_HEADER 13 14 - #define E820MAP 0x2d0 /* our map */ 15 - #define E820MAX 128 /* number of entries in E820MAP */ 16 - #define E820NR 0x1e8 /* # entries in E820MAP */ 17 - 18 - #define E820_RAM 1 19 - #define E820_RESERVED 2 20 - #define E820_ACPI 3 21 - #define E820_NVS 4 22 - 23 #ifndef __ASSEMBLY__ 24 - struct e820entry { 25 - u64 addr; /* start of memory segment */ 26 - u64 size; /* size of memory segment */ 27 - u32 type; /* type of memory segment */ 28 - } __attribute__((packed)); 29 - 30 - struct e820map { 31 - u32 nr_map; 32 - struct e820entry map[E820MAX]; 33 - }; 34 - 35 extern unsigned long find_e820_area(unsigned long start, unsigned long end, 36 unsigned size); 37 extern void add_memory_region(unsigned long start, unsigned long size,
··· 11 #ifndef __E820_HEADER 12 #define __E820_HEADER 13 14 #ifndef __ASSEMBLY__ 15 extern unsigned long find_e820_area(unsigned long start, unsigned long end, 16 unsigned size); 17 extern void add_memory_region(unsigned long start, unsigned long size,
+6 -6
include/asm-x86/ist.h
··· 17 */ 18 19 20 - #ifdef __KERNEL__ 21 - 22 #include <linux/types.h> 23 24 struct ist_info { 25 - u32 signature; 26 - u32 command; 27 - u32 event; 28 - u32 perf_level; 29 }; 30 31 extern struct ist_info ist_info; 32
··· 17 */ 18 19 20 #include <linux/types.h> 21 22 struct ist_info { 23 + __u32 signature; 24 + __u32 command; 25 + __u32 event; 26 + __u32 perf_level; 27 }; 28 + 29 + #ifdef __KERNEL__ 30 31 extern struct ist_info ist_info; 32
+86
include/asm-x86/lguest.h
···
··· 1 + #ifndef _X86_LGUEST_H 2 + #define _X86_LGUEST_H 3 + 4 + #define GDT_ENTRY_LGUEST_CS 10 5 + #define GDT_ENTRY_LGUEST_DS 11 6 + #define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) 7 + #define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) 8 + 9 + #ifndef __ASSEMBLY__ 10 + #include <asm/desc.h> 11 + 12 + #define GUEST_PL 1 13 + 14 + /* Every guest maps the core switcher code. */ 15 + #define SHARED_SWITCHER_PAGES \ 16 + DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) 17 + /* Pages for switcher itself, then two pages per cpu */ 18 + #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) 19 + 20 + /* We map at -4M for ease of mapping into the guest (one PTE page). */ 21 + #define SWITCHER_ADDR 0xFFC00000 22 + 23 + /* Found in switcher.S */ 24 + extern unsigned long default_idt_entries[]; 25 + 26 + struct lguest_regs 27 + { 28 + /* Manually saved part. */ 29 + unsigned long eax, ebx, ecx, edx; 30 + unsigned long esi, edi, ebp; 31 + unsigned long gs; 32 + unsigned long fs, ds, es; 33 + unsigned long trapnum, errcode; 34 + /* Trap pushed part */ 35 + unsigned long eip; 36 + unsigned long cs; 37 + unsigned long eflags; 38 + unsigned long esp; 39 + unsigned long ss; 40 + }; 41 + 42 + /* This is a guest-specific page (mapped ro) into the guest. */ 43 + struct lguest_ro_state 44 + { 45 + /* Host information we need to restore when we switch back. */ 46 + u32 host_cr3; 47 + struct Xgt_desc_struct host_idt_desc; 48 + struct Xgt_desc_struct host_gdt_desc; 49 + u32 host_sp; 50 + 51 + /* Fields which are used when guest is running. */ 52 + struct Xgt_desc_struct guest_idt_desc; 53 + struct Xgt_desc_struct guest_gdt_desc; 54 + struct i386_hw_tss guest_tss; 55 + struct desc_struct guest_idt[IDT_ENTRIES]; 56 + struct desc_struct guest_gdt[GDT_ENTRIES]; 57 + }; 58 + 59 + struct lguest_arch 60 + { 61 + /* The GDT entries copied into lguest_ro_state when running. */ 62 + struct desc_struct gdt[GDT_ENTRIES]; 63 + 64 + /* The IDT entries: some copied into lguest_ro_state when running. */ 65 + struct desc_struct idt[IDT_ENTRIES]; 66 + 67 + /* The address of the last guest-visible pagefault (ie. cr2). */ 68 + unsigned long last_pagefault; 69 + }; 70 + 71 + static inline void lguest_set_ts(void) 72 + { 73 + u32 cr0; 74 + 75 + cr0 = read_cr0(); 76 + if (!(cr0 & 8)) 77 + write_cr0(cr0|8); 78 + } 79 + 80 + /* Full 4G segment descriptors, suitable for CS and DS. */ 81 + #define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) 82 + #define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) 83 + 84 + #endif /* __ASSEMBLY__ */ 85 + 86 + #endif
+71
include/asm-x86/lguest_hcall.h
···
··· 1 + /* Architecture specific portion of the lguest hypercalls */ 2 + #ifndef _X86_LGUEST_HCALL_H 3 + #define _X86_LGUEST_HCALL_H 4 + 5 + #define LHCALL_FLUSH_ASYNC 0 6 + #define LHCALL_LGUEST_INIT 1 7 + #define LHCALL_CRASH 2 8 + #define LHCALL_LOAD_GDT 3 9 + #define LHCALL_NEW_PGTABLE 4 10 + #define LHCALL_FLUSH_TLB 5 11 + #define LHCALL_LOAD_IDT_ENTRY 6 12 + #define LHCALL_SET_STACK 7 13 + #define LHCALL_TS 8 14 + #define LHCALL_SET_CLOCKEVENT 9 15 + #define LHCALL_HALT 10 16 + #define LHCALL_SET_PTE 14 17 + #define LHCALL_SET_PMD 15 18 + #define LHCALL_LOAD_TLS 16 19 + #define LHCALL_NOTIFY 17 20 + 21 + /*G:031 First, how does our Guest contact the Host to ask for privileged 22 + * operations? There are two ways: the direct way is to make a "hypercall", 23 + * to make requests of the Host Itself. 24 + * 25 + * Our hypercall mechanism uses the highest unused trap code (traps 32 and 26 + * above are used by real hardware interrupts). Seventeen hypercalls are 27 + * available: the hypercall number is put in the %eax register, and the 28 + * arguments (when required) are placed in %edx, %ebx and %ecx. If a return 29 + * value makes sense, it's returned in %eax. 30 + * 31 + * Grossly invalid calls result in Sudden Death at the hands of the vengeful 32 + * Host, rather than returning failure. This reflects Winston Churchill's 33 + * definition of a gentleman: "someone who is only rude intentionally". */ 34 + #define LGUEST_TRAP_ENTRY 0x1F 35 + 36 + #ifndef __ASSEMBLY__ 37 + #include <asm/hw_irq.h> 38 + 39 + static inline unsigned long 40 + hcall(unsigned long call, 41 + unsigned long arg1, unsigned long arg2, unsigned long arg3) 42 + { 43 + /* "int" is the Intel instruction to trigger a trap. */ 44 + asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) 45 + /* The call is in %eax (aka "a"), and can be replaced */ 46 + : "=a"(call) 47 + /* The other arguments are in %eax, %edx, %ebx & %ecx */ 48 + : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) 49 + /* "memory" means this might write somewhere in memory. 50 + * This isn't true for all calls, but it's safe to tell 51 + * gcc that it might happen so it doesn't get clever. */ 52 + : "memory"); 53 + return call; 54 + } 55 + /*:*/ 56 + 57 + void async_hcall(unsigned long call, 58 + unsigned long arg1, unsigned long arg2, unsigned long arg3); 59 + 60 + /* Can't use our min() macro here: needs to be a constant */ 61 + #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 62 + 63 + #define LHCALL_RING_SIZE 64 64 + struct hcall_args 65 + { 66 + /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ 67 + unsigned long arg0, arg2, arg3, arg1; 68 + }; 69 + 70 + #endif /* !__ASSEMBLY__ */ 71 + #endif /* _X86_LGUEST_HCALL_H */
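Both calling styles in action; a Guest-side sketch using LHCALL_FLUSH_TLB (the argument meanings are the Host's business, zeros here purely for illustration):

/* Synchronous: trap into the Host immediately and wait for it. */
hcall(LHCALL_FLUSH_TLB, 0, 0, 0);

/* Asynchronous: queue the call in lguest_data.hcalls[] (see
 * include/linux/lguest.h below); the Host consumes the ring next time we
 * trap into it, so this suits calls whose effect can be deferred. */
async_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);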
+5
include/linux/Kbuild
··· 186 unifdef-y += dccp.h 187 unifdef-y += dirent.h 188 unifdef-y += dlm.h 189 unifdef-y += elfcore.h 190 unifdef-y += errno.h 191 unifdef-y += errqueue.h ··· 307 unifdef-y += rtnetlink.h 308 unifdef-y += scc.h 309 unifdef-y += sched.h 310 unifdef-y += sdla.h 311 unifdef-y += selinux_netlink.h 312 unifdef-y += sem.h ··· 343 unifdef-y += utsname.h 344 unifdef-y += videodev2.h 345 unifdef-y += videodev.h 346 unifdef-y += wait.h 347 unifdef-y += wanrouter.h 348 unifdef-y += watchdog.h
··· 186 unifdef-y += dccp.h 187 unifdef-y += dirent.h 188 unifdef-y += dlm.h 189 + unifdef-y += edd.h 190 unifdef-y += elfcore.h 191 unifdef-y += errno.h 192 unifdef-y += errqueue.h ··· 306 unifdef-y += rtnetlink.h 307 unifdef-y += scc.h 308 unifdef-y += sched.h 309 + unifdef-y += screen_info.h 310 unifdef-y += sdla.h 311 unifdef-y += selinux_netlink.h 312 unifdef-y += sem.h ··· 341 unifdef-y += utsname.h 342 unifdef-y += videodev2.h 343 unifdef-y += videodev.h 344 + unifdef-y += virtio_config.h 345 + unifdef-y += virtio_blk.h 346 + unifdef-y += virtio_net.h 347 unifdef-y += wait.h 348 unifdef-y += wanrouter.h 349 unifdef-y += watchdog.h
+15 -15
include/linux/apm_bios.h
··· 16 * General Public License for more details. 17 */ 18 19 - typedef unsigned short apm_event_t; 20 - typedef unsigned short apm_eventinfo_t; 21 22 #ifdef __KERNEL__ 23 24 - #include <linux/types.h> 25 26 #define APM_CS (GDT_ENTRY_APMBIOS_BASE * 8) 27 #define APM_CS_16 (APM_CS + 8) 28 #define APM_DS (APM_CS_16 + 8) 29 - 30 - struct apm_bios_info { 31 - u16 version; 32 - u16 cseg; 33 - u32 offset; 34 - u16 cseg_16; 35 - u16 dseg; 36 - u16 flags; 37 - u16 cseg_len; 38 - u16 cseg_16_len; 39 - u16 dseg_len; 40 - }; 41 42 /* Results of APM Installation Check */ 43 #define APM_16_BIT_SUPPORT 0x0001
··· 16 * General Public License for more details. 17 */ 18 19 + #include <linux/types.h> 20 + 21 + struct apm_bios_info { 22 + __u16 version; 23 + __u16 cseg; 24 + __u32 offset; 25 + __u16 cseg_16; 26 + __u16 dseg; 27 + __u16 flags; 28 + __u16 cseg_len; 29 + __u16 cseg_16_len; 30 + __u16 dseg_len; 31 + }; 32 33 #ifdef __KERNEL__ 34 35 + typedef unsigned short apm_event_t; 36 + typedef unsigned short apm_eventinfo_t; 37 38 #define APM_CS (GDT_ENTRY_APMBIOS_BASE * 8) 39 #define APM_CS_16 (APM_CS + 8) 40 #define APM_DS (APM_CS_16 + 8) 41 42 /* Results of APM Installation Check */ 43 #define APM_16_BIT_SUPPORT 0x0001
+69 -68
include/linux/edd.h
··· 67 #define EDD_INFO_USE_INT13_FN50 (1 << 7) 68 69 struct edd_device_params { 70 - u16 length; 71 - u16 info_flags; 72 - u32 num_default_cylinders; 73 - u32 num_default_heads; 74 - u32 sectors_per_track; 75 - u64 number_of_sectors; 76 - u16 bytes_per_sector; 77 - u32 dpte_ptr; /* 0xFFFFFFFF for our purposes */ 78 - u16 key; /* = 0xBEDD */ 79 - u8 device_path_info_length; /* = 44 */ 80 - u8 reserved2; 81 - u16 reserved3; 82 - u8 host_bus_type[4]; 83 - u8 interface_type[8]; 84 union { 85 struct { 86 - u16 base_address; 87 - u16 reserved1; 88 - u32 reserved2; 89 } __attribute__ ((packed)) isa; 90 struct { 91 - u8 bus; 92 - u8 slot; 93 - u8 function; 94 - u8 channel; 95 - u32 reserved; 96 } __attribute__ ((packed)) pci; 97 /* pcix is same as pci */ 98 struct { 99 - u64 reserved; 100 } __attribute__ ((packed)) ibnd; 101 struct { 102 - u64 reserved; 103 } __attribute__ ((packed)) xprs; 104 struct { 105 - u64 reserved; 106 } __attribute__ ((packed)) htpt; 107 struct { 108 - u64 reserved; 109 } __attribute__ ((packed)) unknown; 110 } interface_path; 111 union { 112 struct { 113 - u8 device; 114 - u8 reserved1; 115 - u16 reserved2; 116 - u32 reserved3; 117 - u64 reserved4; 118 } __attribute__ ((packed)) ata; 119 struct { 120 - u8 device; 121 - u8 lun; 122 - u8 reserved1; 123 - u8 reserved2; 124 - u32 reserved3; 125 - u64 reserved4; 126 } __attribute__ ((packed)) atapi; 127 struct { 128 - u16 id; 129 - u64 lun; 130 - u16 reserved1; 131 - u32 reserved2; 132 } __attribute__ ((packed)) scsi; 133 struct { 134 - u64 serial_number; 135 - u64 reserved; 136 } __attribute__ ((packed)) usb; 137 struct { 138 - u64 eui; 139 - u64 reserved; 140 } __attribute__ ((packed)) i1394; 141 struct { 142 - u64 wwid; 143 - u64 lun; 144 } __attribute__ ((packed)) fibre; 145 struct { 146 - u64 identity_tag; 147 - u64 reserved; 148 } __attribute__ ((packed)) i2o; 149 struct { 150 - u32 array_number; 151 - u32 reserved1; 152 - u64 reserved2; 153 } __attribute__ ((packed)) raid; 154 struct { 155 - u8 device; 156 - u8 reserved1; 157 - u16 reserved2; 158 - u32 reserved3; 159 - u64 reserved4; 160 } __attribute__ ((packed)) sata; 161 struct { 162 - u64 reserved1; 163 - u64 reserved2; 164 } __attribute__ ((packed)) unknown; 165 } device_path; 166 - u8 reserved4; 167 - u8 checksum; 168 } __attribute__ ((packed)); 169 170 struct edd_info { 171 - u8 device; 172 - u8 version; 173 - u16 interface_support; 174 - u16 legacy_max_cylinder; 175 - u8 legacy_max_head; 176 - u8 legacy_sectors_per_track; 177 struct edd_device_params params; 178 } __attribute__ ((packed)); 179 ··· 184 unsigned char edd_info_nr; 185 }; 186 187 extern struct edd edd; 188 - 189 #endif /*!__ASSEMBLY__ */ 190 191 #endif /* _LINUX_EDD_H */
··· 67 #define EDD_INFO_USE_INT13_FN50 (1 << 7) 68 69 struct edd_device_params { 70 + __u16 length; 71 + __u16 info_flags; 72 + __u32 num_default_cylinders; 73 + __u32 num_default_heads; 74 + __u32 sectors_per_track; 75 + __u64 number_of_sectors; 76 + __u16 bytes_per_sector; 77 + __u32 dpte_ptr; /* 0xFFFFFFFF for our purposes */ 78 + __u16 key; /* = 0xBEDD */ 79 + __u8 device_path_info_length; /* = 44 */ 80 + __u8 reserved2; 81 + __u16 reserved3; 82 + __u8 host_bus_type[4]; 83 + __u8 interface_type[8]; 84 union { 85 struct { 86 + __u16 base_address; 87 + __u16 reserved1; 88 + __u32 reserved2; 89 } __attribute__ ((packed)) isa; 90 struct { 91 + __u8 bus; 92 + __u8 slot; 93 + __u8 function; 94 + __u8 channel; 95 + __u32 reserved; 96 } __attribute__ ((packed)) pci; 97 /* pcix is same as pci */ 98 struct { 99 + __u64 reserved; 100 } __attribute__ ((packed)) ibnd; 101 struct { 102 + __u64 reserved; 103 } __attribute__ ((packed)) xprs; 104 struct { 105 + __u64 reserved; 106 } __attribute__ ((packed)) htpt; 107 struct { 108 + __u64 reserved; 109 } __attribute__ ((packed)) unknown; 110 } interface_path; 111 union { 112 struct { 113 + __u8 device; 114 + __u8 reserved1; 115 + __u16 reserved2; 116 + __u32 reserved3; 117 + __u64 reserved4; 118 } __attribute__ ((packed)) ata; 119 struct { 120 + __u8 device; 121 + __u8 lun; 122 + __u8 reserved1; 123 + __u8 reserved2; 124 + __u32 reserved3; 125 + __u64 reserved4; 126 } __attribute__ ((packed)) atapi; 127 struct { 128 + __u16 id; 129 + __u64 lun; 130 + __u16 reserved1; 131 + __u32 reserved2; 132 } __attribute__ ((packed)) scsi; 133 struct { 134 + __u64 serial_number; 135 + __u64 reserved; 136 } __attribute__ ((packed)) usb; 137 struct { 138 + __u64 eui; 139 + __u64 reserved; 140 } __attribute__ ((packed)) i1394; 141 struct { 142 + __u64 wwid; 143 + __u64 lun; 144 } __attribute__ ((packed)) fibre; 145 struct { 146 + __u64 identity_tag; 147 + __u64 reserved; 148 } __attribute__ ((packed)) i2o; 149 struct { 150 + __u32 array_number; 151 + __u32 reserved1; 152 + __u64 reserved2; 153 } __attribute__ ((packed)) raid; 154 struct { 155 + __u8 device; 156 + __u8 reserved1; 157 + __u16 reserved2; 158 + __u32 reserved3; 159 + __u64 reserved4; 160 } __attribute__ ((packed)) sata; 161 struct { 162 + __u64 reserved1; 163 + __u64 reserved2; 164 } __attribute__ ((packed)) unknown; 165 } device_path; 166 + __u8 reserved4; 167 + __u8 checksum; 168 } __attribute__ ((packed)); 169 170 struct edd_info { 171 + __u8 device; 172 + __u8 version; 173 + __u16 interface_support; 174 + __u16 legacy_max_cylinder; 175 + __u8 legacy_max_head; 176 + __u8 legacy_sectors_per_track; 177 struct edd_device_params params; 178 } __attribute__ ((packed)); 179 ··· 184 unsigned char edd_info_nr; 185 }; 186 187 + #ifdef __KERNEL__ 188 extern struct edd edd; 189 + #endif /* __KERNEL__ */ 190 #endif /*!__ASSEMBLY__ */ 191 192 #endif /* _LINUX_EDD_H */
+12 -68
include/linux/lguest.h
··· 1 /* Things the lguest guest needs to know. Note: like all lguest interfaces, 2 * this is subject to wild and random change between versions. */ 3 - #ifndef _ASM_LGUEST_H 4 - #define _ASM_LGUEST_H 5 6 #ifndef __ASSEMBLY__ 7 #include <asm/irq.h> 8 - 9 - #define LHCALL_FLUSH_ASYNC 0 10 - #define LHCALL_LGUEST_INIT 1 11 - #define LHCALL_CRASH 2 12 - #define LHCALL_LOAD_GDT 3 13 - #define LHCALL_NEW_PGTABLE 4 14 - #define LHCALL_FLUSH_TLB 5 15 - #define LHCALL_LOAD_IDT_ENTRY 6 16 - #define LHCALL_SET_STACK 7 17 - #define LHCALL_TS 8 18 - #define LHCALL_SET_CLOCKEVENT 9 19 - #define LHCALL_HALT 10 20 - #define LHCALL_BIND_DMA 12 21 - #define LHCALL_SEND_DMA 13 22 - #define LHCALL_SET_PTE 14 23 - #define LHCALL_SET_PMD 15 24 - #define LHCALL_LOAD_TLS 16 25 26 #define LG_CLOCK_MIN_DELTA 100UL 27 #define LG_CLOCK_MAX_DELTA ULONG_MAX 28 - 29 - /*G:031 First, how does our Guest contact the Host to ask for privileged 30 - * operations? There are two ways: the direct way is to make a "hypercall", 31 - * to make requests of the Host Itself. 32 - * 33 - * Our hypercall mechanism uses the highest unused trap code (traps 32 and 34 - * above are used by real hardware interrupts). Seventeen hypercalls are 35 - * available: the hypercall number is put in the %eax register, and the 36 - * arguments (when required) are placed in %edx, %ebx and %ecx. If a return 37 - * value makes sense, it's returned in %eax. 38 - * 39 - * Grossly invalid calls result in Sudden Death at the hands of the vengeful 40 - * Host, rather than returning failure. This reflects Winston Churchill's 41 - * definition of a gentleman: "someone who is only rude intentionally". */ 42 - #define LGUEST_TRAP_ENTRY 0x1F 43 - 44 - static inline unsigned long 45 - hcall(unsigned long call, 46 - unsigned long arg1, unsigned long arg2, unsigned long arg3) 47 - { 48 - /* "int" is the Intel instruction to trigger a trap. */ 49 - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) 50 - /* The call is in %eax (aka "a"), and can be replaced */ 51 - : "=a"(call) 52 - /* The other arguments are in %eax, %edx, %ebx & %ecx */ 53 - : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) 54 - /* "memory" means this might write somewhere in memory. 55 - * This isn't true for all calls, but it's safe to tell 56 - * gcc that it might happen so it doesn't get clever. */ 57 - : "memory"); 58 - return call; 59 - } 60 - /*:*/ 61 - 62 - void async_hcall(unsigned long call, 63 - unsigned long arg1, unsigned long arg2, unsigned long arg3); 64 - 65 - /* Can't use our min() macro here: needs to be a constant */ 66 - #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 67 - 68 - #define LHCALL_RING_SIZE 64 69 - struct hcall_ring 70 - { 71 - u32 eax, edx, ebx, ecx; 72 - }; 73 74 /*G:032 The second method of communicating with the Host is to via "struct 75 * lguest_data". The Guest's very first hypercall is to tell the Host where ··· 37 /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ 38 u8 hcall_status[LHCALL_RING_SIZE]; 39 /* The actual registers for the hypercalls. */ 40 - struct hcall_ring hcalls[LHCALL_RING_SIZE]; 41 42 /* Fields initialized by the Host at boot: */ 43 /* Memory not to try to access */ 44 unsigned long reserve_mem; 45 - /* ID of this Guest (used by network driver to set ethernet address) */ 46 - u16 guestid; 47 /* KHz for the TSC clock. 
*/ 48 u32 tsc_khz; 49 50 /* Fields initialized by the Guest at boot: */ 51 /* Instruction range to suppress interrupts even if enabled */ 52 unsigned long noirq_start, noirq_end; 53 }; 54 extern struct lguest_data lguest_data; 55 #endif /* __ASSEMBLY__ */ 56 - #endif /* _ASM_LGUEST_H */
··· 1 /* Things the lguest guest needs to know. Note: like all lguest interfaces, 2 * this is subject to wild and random change between versions. */ 3 + #ifndef _LINUX_LGUEST_H 4 + #define _LINUX_LGUEST_H 5 6 #ifndef __ASSEMBLY__ 7 + #include <linux/time.h> 8 #include <asm/irq.h> 9 + #include <asm/lguest_hcall.h> 10 11 #define LG_CLOCK_MIN_DELTA 100UL 12 #define LG_CLOCK_MAX_DELTA ULONG_MAX 13 14 /*G:032 The second method of communicating with the Host is via "struct 15 * lguest_data". The Guest's very first hypercall is to tell the Host where ··· 97 /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ 98 u8 hcall_status[LHCALL_RING_SIZE]; 99 /* The actual registers for the hypercalls. */ 100 + struct hcall_args hcalls[LHCALL_RING_SIZE]; 101 102 /* Fields initialized by the Host at boot: */ 103 /* Memory not to try to access */ 104 unsigned long reserve_mem; 105 /* KHz for the TSC clock. */ 106 u32 tsc_khz; 107 + /* Page where the top-level pagetable is */ 108 + unsigned long pgdir; 109 110 /* Fields initialized by the Guest at boot: */ 111 /* Instruction range to suppress interrupts even if enabled */ 112 unsigned long noirq_start, noirq_end; 113 + /* Address above which page tables are all identical. */ 114 + unsigned long kernel_address; 115 + /* The vector to try to use for system calls (0x40 or 0x80). */ 116 + unsigned int syscall_vec; 117 }; 118 extern struct lguest_data lguest_data; 119 #endif /* __ASSEMBLY__ */ 120 + #endif /* _LINUX_LGUEST_H */
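As with noirq_start/noirq_end, the Guest is expected to fill the new fields in before the Host needs them. A hedged sketch of boot-time setup; the values on the right are illustrative stand-ins, not the actual guest code:

/* Guest boot sketch: publish our layout and preferences to the Host. */
lguest_data.kernel_address = PAGE_OFFSET;  /* mappings identical above here */
lguest_data.syscall_vec = 0x80;            /* the traditional int $0x80 */
lguest_data.noirq_start = (u32)lguest_noirq_start;  /* hypothetical symbols */
lguest_data.noirq_end = (u32)lguest_noirq_end;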
-51
include/linux/lguest_bus.h
··· 1 - #ifndef _ASM_LGUEST_DEVICE_H 2 - #define _ASM_LGUEST_DEVICE_H 3 - /* Everything you need to know about lguest devices. */ 4 - #include <linux/device.h> 5 - #include <linux/lguest.h> 6 - #include <linux/lguest_launcher.h> 7 - 8 - struct lguest_device { 9 - /* Unique busid, and index into lguest_page->devices[] */ 10 - unsigned int index; 11 - 12 - struct device dev; 13 - 14 - /* Driver can hang data off here. */ 15 - void *private; 16 - }; 17 - 18 - /*D:380 Since interrupt numbers are arbitrary, we use a convention: each device 19 - * can use the interrupt number corresponding to its index. The +1 is because 20 - * interrupt 0 is not usable (it's actually the timer interrupt). */ 21 - static inline int lgdev_irq(const struct lguest_device *dev) 22 - { 23 - return dev->index + 1; 24 - } 25 - /*:*/ 26 - 27 - /* dma args must not be vmalloced! */ 28 - void lguest_send_dma(unsigned long key, struct lguest_dma *dma); 29 - int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, 30 - unsigned int num, u8 irq); 31 - void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas); 32 - 33 - /* Map the virtual device space */ 34 - void *lguest_map(unsigned long phys_addr, unsigned long pages); 35 - void lguest_unmap(void *); 36 - 37 - struct lguest_driver { 38 - const char *name; 39 - struct module *owner; 40 - u16 device_type; 41 - int (*probe)(struct lguest_device *dev); 42 - void (*remove)(struct lguest_device *dev); 43 - 44 - struct device_driver drv; 45 - }; 46 - 47 - extern int register_lguest_driver(struct lguest_driver *drv); 48 - extern void unregister_lguest_driver(struct lguest_driver *drv); 49 - 50 - extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */ 51 - #endif /* _ASM_LGUEST_DEVICE_H */
···
+22 -90
include/linux/lguest_launcher.h
··· 1 #ifndef _ASM_LGUEST_USER 2 #define _ASM_LGUEST_USER 3 /* Everything the "lguest" userspace program needs to know. */ 4 /* They can register up to 32 arrays of lguest_dma. */ 5 #define LGUEST_MAX_DMA 32 6 /* At most we can dma 16 lguest_dma in one op. */ ··· 9 10 /* How many devices? Assume each one wants up to two dma arrays per device. */ 11 #define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2) 12 - 13 - /*D:200 14 - * Lguest I/O 15 - * 16 - * The lguest I/O mechanism is the only way Guests can talk to devices. There 17 - * are two hypercalls involved: SEND_DMA for output and BIND_DMA for input. In 18 - * each case, "struct lguest_dma" describes the buffer: this contains 16 19 - * addr/len pairs, and if there are fewer buffer elements the len array is 20 - * terminated with a 0. 21 - * 22 - * I/O is organized by keys: BIND_DMA attaches buffers to a particular key, and 23 - * SEND_DMA transfers to buffers bound to particular key. By convention, keys 24 - * correspond to a physical address within the device's page. This means that 25 - * devices will never accidentally end up with the same keys, and allows the 26 - * Host use The Futex Trick (as we'll see later in our journey). 27 - * 28 - * SEND_DMA simply indicates a key to send to, and the physical address of the 29 - * "struct lguest_dma" to send. The Host will write the number of bytes 30 - * transferred into the "struct lguest_dma"'s used_len member. 31 - * 32 - * BIND_DMA indicates a key to bind to, a pointer to an array of "struct 33 - * lguest_dma"s ready for receiving, the size of that array, and an interrupt 34 - * to trigger when data is received. The Host will only allow transfers into 35 - * buffers with a used_len of zero: it then sets used_len to the number of 36 - * bytes transferred and triggers the interrupt for the Guest to process the 37 - * new input. */ 38 - struct lguest_dma 39 - { 40 - /* 0 if free to be used, filled by the Host. */ 41 - u32 used_len; 42 - unsigned long addr[LGUEST_MAX_DMA_SECTIONS]; 43 - u16 len[LGUEST_MAX_DMA_SECTIONS]; 44 - }; 45 - /*:*/ 46 - 47 - /*D:460 This is the layout of a block device memory page. The Launcher sets up 48 - * the num_sectors initially to tell the Guest the size of the disk. The Guest 49 - * puts the type, sector and length of the request in the first three fields, 50 - * then DMAs to the Host. The Host processes the request, sets up the result, 51 - * then DMAs back to the Guest. */ 52 - struct lguest_block_page 53 - { 54 - /* 0 is a read, 1 is a write. */ 55 - int type; 56 - u32 sector; /* Offset in device = sector * 512. */ 57 - u32 bytes; /* Length expected to be read/written in bytes */ 58 - /* 0 = pending, 1 = done, 2 = done, error */ 59 - int result; 60 - u32 num_sectors; /* Disk length = num_sectors * 512 */ 61 - }; 62 - 63 - /*D:520 The network device is basically a memory page where all the Guests on 64 - * the network publish their MAC (ethernet) addresses: it's an array of "struct 65 - * lguest_net": */ 66 - struct lguest_net 67 - { 68 - /* Simply the mac address (with multicast bit meaning promisc). */ 69 - unsigned char mac[6]; 70 - }; 71 - /*:*/ 72 73 /* Where the Host expects the Guest to SEND_DMA console output to. */ 74 #define LGUEST_CONSOLE_DMA_KEY 0 ··· 22 * complex burden for the Host and suboptimal for the Guest, so we have our own 23 * "lguest" bus and simple drivers. 
24 * 25 - * Devices are described by an array of LGUEST_MAX_DEVICES of these structs, 26 - * placed by the Launcher just above the top of physical memory: 27 */ 28 struct lguest_device_desc { 29 - /* The device type: console, network, disk etc. */ 30 - u16 type; 31 - #define LGUEST_DEVICE_T_CONSOLE 1 32 - #define LGUEST_DEVICE_T_NET 2 33 - #define LGUEST_DEVICE_T_BLOCK 3 34 35 - /* The specific features of this device: these depends on device type 36 - * except for LGUEST_DEVICE_F_RANDOMNESS. */ 37 - u16 features; 38 - #define LGUEST_NET_F_NOCSUM 0x4000 /* Don't bother checksumming */ 39 - #define LGUEST_DEVICE_F_RANDOMNESS 0x8000 /* IRQ is fairly random */ 40 - 41 - /* This is how the Guest reports status of the device: the Host can set 42 - * LGUEST_DEVICE_S_REMOVED to indicate removal, but the rest are only 43 - * ever manipulated by the Guest, and only ever set. */ 44 - u16 status; 45 - /* 256 and above are device specific. */ 46 - #define LGUEST_DEVICE_S_ACKNOWLEDGE 1 /* We have seen device. */ 47 - #define LGUEST_DEVICE_S_DRIVER 2 /* We have found a driver */ 48 - #define LGUEST_DEVICE_S_DRIVER_OK 4 /* Driver says OK! */ 49 - #define LGUEST_DEVICE_S_REMOVED 8 /* Device has gone away. */ 50 - #define LGUEST_DEVICE_S_REMOVED_ACK 16 /* Driver has been told. */ 51 - #define LGUEST_DEVICE_S_FAILED 128 /* Something actually failed */ 52 - 53 - /* Each device exists somewhere in Guest physical memory, over some 54 - * number of pages. */ 55 - u16 num_pages; 56 - u32 pfn; 57 }; 58 /*:*/ 59 ··· 52 enum lguest_req 53 { 54 LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */ 55 - LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */ 56 LHREQ_IRQ, /* + irq */ 57 LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ 58 };
··· 1 #ifndef _ASM_LGUEST_USER 2 #define _ASM_LGUEST_USER 3 /* Everything the "lguest" userspace program needs to know. */ 4 + #include <linux/types.h> 5 /* They can register up to 32 arrays of lguest_dma. */ 6 #define LGUEST_MAX_DMA 32 7 /* At most we can dma 16 lguest_dma in one op. */ ··· 8 9 /* How many devices? Assume each one wants up to two dma arrays per device. */ 10 #define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2) 11 12 /* Where the Host expects the Guest to SEND_DMA console output to. */ 13 #define LGUEST_CONSOLE_DMA_KEY 0 ··· 81 * complex burden for the Host and suboptimal for the Guest, so we have our own 82 * "lguest" bus and simple drivers. 83 * 84 + * Devices are described by a simplified ID, a status byte, and some "config" 85 + * bytes which describe this device's configuration. This is placed by the 86 + * Launcher just above the top of physical memory: 87 */ 88 struct lguest_device_desc { 89 + /* The device type: console, network, disk etc. Type 0 terminates. */ 90 + __u8 type; 91 + /* The number of bytes of the config array. */ 92 + __u8 config_len; 93 + /* A status byte, written by the Guest. */ 94 + __u8 status; 95 + __u8 config[0]; 96 + }; 97 98 + /*D:135 This is how we expect the device configuration field for a virtqueue 99 + * (type VIRTIO_CONFIG_F_VIRTQUEUE) to be laid out: */ 100 + struct lguest_vqconfig { 101 + /* The number of entries in the virtio_ring */ 102 + __u16 num; 103 + /* The interrupt we get when something happens. */ 104 + __u16 irq; 105 + /* The page number of the virtio ring for this device. */ 106 + __u32 pfn; 107 }; 108 /*:*/ 109 ··· 120 enum lguest_req 121 { 122 LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */ 123 + LHREQ_GETDMA, /* No longer used */ 124 LHREQ_IRQ, /* + irq */ 125 LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ 126 };
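Since each descriptor carries its own config_len and a type of 0 terminates the table, walking the device page needs no device count. A Launcher-side sketch, assuming descriptors are packed back-to-back with their config bytes (as config[0] suggests) and with descs standing in for the mapped descriptor page:

#include <stdio.h>
#include <linux/lguest_launcher.h>

static void scan_devices(struct lguest_device_desc *descs)
{
        unsigned int offset = 0;
        struct lguest_device_desc *d;

        for (;;) {
                d = (void *)descs + offset;
                if (d->type == 0)       /* type 0 terminates the table */
                        break;
                printf("device type %u: %u config bytes, status %u\n",
                       d->type, d->config_len, d->status);
                /* The config[] bytes sit immediately after the descriptor. */
                offset += sizeof(*d) + d->config_len;
        }
}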
+6
include/linux/mod_devicetable.h
··· 361 #define SSB_ANY_ID 0xFFFF 362 #define SSB_ANY_REV 0xFF 363 364 #endif /* LINUX_MOD_DEVICETABLE_H */
··· 361 #define SSB_ANY_ID 0xFFFF 362 #define SSB_ANY_REV 0xFF 363 364 + struct virtio_device_id { 365 + __u32 device; 366 + __u32 vendor; 367 + }; 368 + #define VIRTIO_DEV_ANY_ID 0xffffffff 369 + 370 #endif /* LINUX_MOD_DEVICETABLE_H */
+46 -35
include/linux/screen_info.h
··· 8 */ 9 10 struct screen_info { 11 - u8 orig_x; /* 0x00 */ 12 - u8 orig_y; /* 0x01 */ 13 - u16 ext_mem_k; /* 0x02 */ 14 - u16 orig_video_page; /* 0x04 */ 15 - u8 orig_video_mode; /* 0x06 */ 16 - u8 orig_video_cols; /* 0x07 */ 17 - u16 unused2; /* 0x08 */ 18 - u16 orig_video_ega_bx; /* 0x0a */ 19 - u16 unused3; /* 0x0c */ 20 - u8 orig_video_lines; /* 0x0e */ 21 - u8 orig_video_isVGA; /* 0x0f */ 22 - u16 orig_video_points; /* 0x10 */ 23 24 /* VESA graphic mode -- linear frame buffer */ 25 - u16 lfb_width; /* 0x12 */ 26 - u16 lfb_height; /* 0x14 */ 27 - u16 lfb_depth; /* 0x16 */ 28 - u32 lfb_base; /* 0x18 */ 29 - u32 lfb_size; /* 0x1c */ 30 - u16 cl_magic, cl_offset; /* 0x20 */ 31 - u16 lfb_linelength; /* 0x24 */ 32 - u8 red_size; /* 0x26 */ 33 - u8 red_pos; /* 0x27 */ 34 - u8 green_size; /* 0x28 */ 35 - u8 green_pos; /* 0x29 */ 36 - u8 blue_size; /* 0x2a */ 37 - u8 blue_pos; /* 0x2b */ 38 - u8 rsvd_size; /* 0x2c */ 39 - u8 rsvd_pos; /* 0x2d */ 40 - u16 vesapm_seg; /* 0x2e */ 41 - u16 vesapm_off; /* 0x30 */ 42 - u16 pages; /* 0x32 */ 43 - u16 vesa_attributes; /* 0x34 */ 44 - u32 capabilities; /* 0x36 */ 45 - u8 _reserved[6]; /* 0x3a */ 46 } __attribute__((packed)); 47 - 48 - extern struct screen_info screen_info; 49 50 #define VIDEO_TYPE_MDA 0x10 /* Monochrome Text Display */ 51 #define VIDEO_TYPE_CGA 0x11 /* CGA Display */ ··· 62 #define VIDEO_TYPE_SUNPCI 0x51 /* Sun PCI based frame buffer. */ 63 64 #define VIDEO_TYPE_PMAC 0x60 /* PowerMacintosh frame buffer. */ 65 66 #endif /* _SCREEN_INFO_H */
··· 8 */ 9 10 struct screen_info { 11 + __u8 orig_x; /* 0x00 */ 12 + __u8 orig_y; /* 0x01 */ 13 + __u16 ext_mem_k; /* 0x02 */ 14 + __u16 orig_video_page; /* 0x04 */ 15 + __u8 orig_video_mode; /* 0x06 */ 16 + __u8 orig_video_cols; /* 0x07 */ 17 + __u16 unused2; /* 0x08 */ 18 + __u16 orig_video_ega_bx;/* 0x0a */ 19 + __u16 unused3; /* 0x0c */ 20 + __u8 orig_video_lines; /* 0x0e */ 21 + __u8 orig_video_isVGA; /* 0x0f */ 22 + __u16 orig_video_points;/* 0x10 */ 23 24 /* VESA graphic mode -- linear frame buffer */ 25 + __u16 lfb_width; /* 0x12 */ 26 + __u16 lfb_height; /* 0x14 */ 27 + __u16 lfb_depth; /* 0x16 */ 28 + __u32 lfb_base; /* 0x18 */ 29 + __u32 lfb_size; /* 0x1c */ 30 + __u16 cl_magic, cl_offset; /* 0x20 */ 31 + __u16 lfb_linelength; /* 0x24 */ 32 + __u8 red_size; /* 0x26 */ 33 + __u8 red_pos; /* 0x27 */ 34 + __u8 green_size; /* 0x28 */ 35 + __u8 green_pos; /* 0x29 */ 36 + __u8 blue_size; /* 0x2a */ 37 + __u8 blue_pos; /* 0x2b */ 38 + __u8 rsvd_size; /* 0x2c */ 39 + __u8 rsvd_pos; /* 0x2d */ 40 + __u16 vesapm_seg; /* 0x2e */ 41 + __u16 vesapm_off; /* 0x30 */ 42 + __u16 pages; /* 0x32 */ 43 + __u16 vesa_attributes; /* 0x34 */ 44 + __u32 capabilities; /* 0x36 */ 45 + __u8 _reserved[6]; /* 0x3a */ 46 } __attribute__((packed)); 47 48 #define VIDEO_TYPE_MDA 0x10 /* Monochrome Text Display */ 49 #define VIDEO_TYPE_CGA 0x11 /* CGA Display */ ··· 64 #define VIDEO_TYPE_SUNPCI 0x51 /* Sun PCI based frame buffer. */ 65 66 #define VIDEO_TYPE_PMAC 0x60 /* PowerMacintosh frame buffer. */ 67 + 68 + #ifdef __KERNEL__ 69 + extern struct screen_info screen_info; 70 + 71 + #define ORIG_X (screen_info.orig_x) 72 + #define ORIG_Y (screen_info.orig_y) 73 + #define ORIG_VIDEO_MODE (screen_info.orig_video_mode) 74 + #define ORIG_VIDEO_COLS (screen_info.orig_video_cols) 75 + #define ORIG_VIDEO_EGA_BX (screen_info.orig_video_ega_bx) 76 + #define ORIG_VIDEO_LINES (screen_info.orig_video_lines) 77 + #define ORIG_VIDEO_ISVGA (screen_info.orig_video_isVGA) 78 + #define ORIG_VIDEO_POINTS (screen_info.orig_video_points) 79 + #endif /* __KERNEL__ */ 80 81 #endif /* _SCREEN_INFO_H */
+110
include/linux/virtio.h
···
··· 1 + #ifndef _LINUX_VIRTIO_H 2 + #define _LINUX_VIRTIO_H 3 + /* Everything a virtio driver needs to work with any particular virtio 4 + * implementation. */ 5 + #include <linux/types.h> 6 + #include <linux/scatterlist.h> 7 + #include <linux/spinlock.h> 8 + #include <linux/device.h> 9 + #include <linux/mod_devicetable.h> 10 + 11 + /** 12 + * virtqueue - a queue to register buffers for sending or receiving. 13 + * @callback: the function to call when buffers are consumed (can be NULL). 14 + * If this returns false, callbacks are suppressed until vq_ops->restart 15 + * is called. 16 + * @vdev: the virtio device this queue was created for. 17 + * @vq_ops: the operations for this virtqueue (see below). 18 + * @priv: a pointer for the virtqueue implementation to use. 19 + */ 20 + struct virtqueue 21 + { 22 + bool (*callback)(struct virtqueue *vq); 23 + struct virtio_device *vdev; 24 + struct virtqueue_ops *vq_ops; 25 + void *priv; 26 + }; 27 + 28 + /** 29 + * virtqueue_ops - operations for virtqueue abstraction layer 30 + * @add_buf: expose buffer to other end 31 + * vq: the struct virtqueue we're talking about. 32 + * sg: the description of the buffer(s). 33 + * out_num: the number of sg readable by other side 34 + * in_num: the number of sg which are writable (after readable ones) 35 + * data: the token identifying the buffer. 36 + * Returns 0 or an error. 37 + * @kick: update after add_buf 38 + * vq: the struct virtqueue 39 + * After one or more add_buf calls, invoke this to kick the other side. 40 + * @get_buf: get the next used buffer 41 + * vq: the struct virtqueue we're talking about. 42 + * len: the length written into the buffer 43 + * Returns NULL or the "data" token handed to add_buf. 44 + * @restart: restart callbacks after callback returned false. 45 + * vq: the struct virtqueue we're talking about. 46 + * This returns "false" (and doesn't re-enable) if there are pending 47 + * buffers in the queue, to avoid a race. 48 + * @shutdown: "unadd" all buffers. 49 + * vq: the struct virtqueue we're talking about. 50 + * Remove everything from the queue. 51 + * 52 + * Locking rules are straightforward: the driver is responsible for 53 + * locking. No two operations may be invoked simultaneously. 54 + * 55 + * All operations can be called in any context. 56 + */ 57 + struct virtqueue_ops { 58 + int (*add_buf)(struct virtqueue *vq, 59 + struct scatterlist sg[], 60 + unsigned int out_num, 61 + unsigned int in_num, 62 + void *data); 63 + 64 + void (*kick)(struct virtqueue *vq); 65 + 66 + void *(*get_buf)(struct virtqueue *vq, unsigned int *len); 67 + 68 + bool (*restart)(struct virtqueue *vq); 69 + 70 + void (*shutdown)(struct virtqueue *vq); 71 + }; 72 + 73 + /** 74 + * virtio_device - representation of a device using virtio 75 + * @index: unique position on the virtio bus 76 + * @dev: underlying device. 77 + * @id: the device type identification (used to match it with a driver). 78 + * @config: the configuration ops for this device. 79 + * @priv: private pointer for the driver's use. 80 + */ 81 + struct virtio_device 82 + { 83 + int index; 84 + struct device dev; 85 + struct virtio_device_id id; 86 + struct virtio_config_ops *config; 87 + void *priv; 88 + }; 89 + 90 + int register_virtio_device(struct virtio_device *dev); 91 + void unregister_virtio_device(struct virtio_device *dev); 92 + 93 + /** 94 + * virtio_driver - operations for a virtio I/O driver 95 + * @driver: underlying device driver (populate name and owner). 96 + * @id_table: the ids serviced by this driver. 
97 + * @probe: the function to call when a device is found. Returns a token for 98 + * remove, or PTR_ERR(). 99 + * @remove: the function when a device is removed. 100 + */ 101 + struct virtio_driver { 102 + struct device_driver driver; 103 + const struct virtio_device_id *id_table; 104 + int (*probe)(struct virtio_device *dev); 105 + void (*remove)(struct virtio_device *dev); 106 + }; 107 + 108 + int register_virtio_driver(struct virtio_driver *drv); 109 + void unregister_virtio_driver(struct virtio_driver *drv); 110 + #endif /* _LINUX_VIRTIO_H */
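Tying the two structures together, a driver is just an id_table plus probe/remove hung off the common bus. A minimal skeleton (the names are illustrative, not an in-tree driver):

#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>

static struct virtio_device_id id_table[] = {
        { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },  /* any vendor's net device */
        { 0 },
};

static int example_probe(struct virtio_device *vdev)
{
        /* Read config fields, find_vq() the virtqueues, start the device. */
        return 0;
}

static void example_remove(struct virtio_device *vdev)
{
        /* Shut the virtqueues down, free anything hung off vdev->priv. */
}

static struct virtio_driver example_driver = {
        .driver.name =  "example_virtio",
        .driver.owner = THIS_MODULE,
        .id_table =     id_table,
        .probe =        example_probe,
        .remove =       example_remove,
};

static int __init example_init(void)
{
        return register_virtio_driver(&example_driver);
}
module_init(example_init);

static void __exit example_exit(void)
{
        unregister_virtio_driver(&example_driver);
}
module_exit(example_exit);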
+51
include/linux/virtio_blk.h
···
··· 1 + #ifndef _LINUX_VIRTIO_BLK_H 2 + #define _LINUX_VIRTIO_BLK_H 3 + #include <linux/virtio_config.h> 4 + 5 + /* The ID for virtio_block */ 6 + #define VIRTIO_ID_BLOCK 2 7 + 8 + /* Feature bits */ 9 + #define VIRTIO_CONFIG_BLK_F 0x40 10 + #define VIRTIO_BLK_F_BARRIER 1 /* Does host support barriers? */ 11 + 12 + /* The capacity (in 512-byte sectors). */ 13 + #define VIRTIO_CONFIG_BLK_F_CAPACITY 0x41 14 + /* The maximum segment size. */ 15 + #define VIRTIO_CONFIG_BLK_F_SIZE_MAX 0x42 16 + /* The maximum number of segments. */ 17 + #define VIRTIO_CONFIG_BLK_F_SEG_MAX 0x43 18 + 19 + /* These two define direction. */ 20 + #define VIRTIO_BLK_T_IN 0 21 + #define VIRTIO_BLK_T_OUT 1 22 + 23 + /* This bit says it's a scsi command, not an actual read or write. */ 24 + #define VIRTIO_BLK_T_SCSI_CMD 2 25 + 26 + /* Barrier before this op. */ 27 + #define VIRTIO_BLK_T_BARRIER 0x80000000 28 + 29 + /* This is the first element of the read scatter-gather list. */ 30 + struct virtio_blk_outhdr 31 + { 32 + /* VIRTIO_BLK_T* */ 33 + __u32 type; 34 + /* io priority. */ 35 + __u32 ioprio; 36 + /* Sector (ie. 512 byte offset) */ 37 + __u64 sector; 38 + /* Where to put reply. */ 39 + __u64 id; 40 + }; 41 + 42 + #define VIRTIO_BLK_S_OK 0 43 + #define VIRTIO_BLK_S_IOERR 1 44 + #define VIRTIO_BLK_S_UNSUPP 2 45 + 46 + /* This is the first element of the write scatter-gather list */ 47 + struct virtio_blk_inhdr 48 + { 49 + unsigned char status; 50 + }; 51 + #endif /* _LINUX_VIRTIO_BLK_H */
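A request therefore travels as one descriptor chain: an outhdr the device reads, then the data, then the one-byte inhdr the device writes with a VIRTIO_BLK_S_* status. A sketch of queueing a read under those assumptions (the real driver maps a struct request rather than a flat buffer):

#include <linux/scatterlist.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>

static int queue_read(struct virtqueue *vq, struct virtio_blk_outhdr *out,
                      struct virtio_blk_inhdr *in, void *buf, size_t len,
                      u64 sector, void *token)
{
        struct scatterlist sg[3];

        out->type = VIRTIO_BLK_T_IN;    /* a read: data flows into the guest */
        out->ioprio = 0;
        out->sector = sector;

        sg_init_table(sg, 3);
        sg_set_buf(&sg[0], out, sizeof(*out));  /* readable by the device */
        sg_set_buf(&sg[1], buf, len);           /* writable: the data */
        sg_set_buf(&sg[2], in, sizeof(*in));    /* writable: status byte */

        /* One "out" sg (the header), two "in" sgs; token identifies it
         * when get_buf() hands the completed request back. */
        return vq->vq_ops->add_buf(vq, sg, 1, 2, token);
}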
+111
include/linux/virtio_config.h
···
··· 1 + #ifndef _LINUX_VIRTIO_CONFIG_H 2 + #define _LINUX_VIRTIO_CONFIG_H 3 + /* Virtio devices use a standardized configuration space to define their 4 + * features and pass configuration information, but each implementation can 5 + * store and access that space differently. */ 6 + #include <linux/types.h> 7 + 8 + /* Status byte for guest to report progress, and synchronize config. */ 9 + /* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */ 10 + #define VIRTIO_CONFIG_S_ACKNOWLEDGE 1 11 + /* We have found a driver for the device. */ 12 + #define VIRTIO_CONFIG_S_DRIVER 2 13 + /* Driver has used its parts of the config, and is happy */ 14 + #define VIRTIO_CONFIG_S_DRIVER_OK 4 15 + /* We've given up on this device. */ 16 + #define VIRTIO_CONFIG_S_FAILED 0x80 17 + 18 + /* Feature byte (actually 7 bits available): */ 19 + /* Requirements/features of the virtio implementation. */ 20 + #define VIRTIO_CONFIG_F_VIRTIO 1 21 + /* Requirements/features of the virtqueue (may have more than one). */ 22 + #define VIRTIO_CONFIG_F_VIRTQUEUE 2 23 + 24 + #ifdef __KERNEL__ 25 + struct virtio_device; 26 + 27 + /** 28 + * virtio_config_ops - operations for configuring a virtio device 29 + * @find: search for the next configuration field of the given type. 30 + * vdev: the virtio_device 31 + * type: the feature type 32 + * len: the (returned) length of the field if found. 33 + * Returns a token if found, or NULL. Never returns the same field twice 34 + * (ie. it's used up). 35 + * @get: read the value of a configuration field after find(). 36 + * vdev: the virtio_device 37 + * token: the token returned from find(). 38 + * buf: the buffer to write the field value into. 39 + * len: the length of the buffer (given by find()). 40 + * Note that contents are conventionally little-endian. 41 + * @set: write the value of a configuration field after find(). 42 + * vdev: the virtio_device 43 + * token: the token returned from find(). 44 + * buf: the buffer to read the field value from. 45 + * len: the length of the buffer (given by find()). 46 + * Note that contents are conventionally little-endian. 47 + * @get_status: read the status byte 48 + * vdev: the virtio_device 49 + * Returns the status byte 50 + * @set_status: write the status byte 51 + * vdev: the virtio_device 52 + * status: the new status byte 53 + * @find_vq: find the first VIRTIO_CONFIG_F_VIRTQUEUE and create a virtqueue. 54 + * vdev: the virtio_device 55 + * callback: the virtqueue callback 56 + * Returns the new virtqueue or ERR_PTR(). 57 + * @del_vq: free a virtqueue found by find_vq(). 58 + */ 59 + struct virtio_config_ops 60 + { 61 + void *(*find)(struct virtio_device *vdev, u8 type, unsigned *len); 62 + void (*get)(struct virtio_device *vdev, void *token, 63 + void *buf, unsigned len); 64 + void (*set)(struct virtio_device *vdev, void *token, 65 + const void *buf, unsigned len); 66 + u8 (*get_status)(struct virtio_device *vdev); 67 + void (*set_status)(struct virtio_device *vdev, u8 status); 68 + struct virtqueue *(*find_vq)(struct virtio_device *vdev, 69 + bool (*callback)(struct virtqueue *)); 70 + void (*del_vq)(struct virtqueue *vq); 71 + }; 72 + 73 + /** 74 + * virtio_config_val - get a single virtio config and mark it used. 75 + * @config: the virtio config space 76 + * @type: the type to search for. 77 + * @val: a pointer to the value to fill in. 78 + * 79 + * Once used, the config type is marked with VIRTIO_CONFIG_F_USED so it can't 80 + * be found again. This version does endian conversion. 
*/ 81 + #define virtio_config_val(vdev, type, v) ({ \ 82 + int _err = __virtio_config_val((vdev),(type),(v),sizeof(*(v))); \ 83 + \ 84 + BUILD_BUG_ON(sizeof(*(v)) != 1 && sizeof(*(v)) != 2 \ 85 + && sizeof(*(v)) != 4 && sizeof(*(v)) != 8); \ 86 + if (!_err) { \ 87 + switch (sizeof(*(v))) { \ 88 + case 2: le16_to_cpus((__u16 *) v); break; \ 89 + case 4: le32_to_cpus((__u32 *) v); break; \ 90 + case 8: le64_to_cpus((__u64 *) v); break; \ 91 + } \ 92 + } \ 93 + _err; \ 94 + }) 95 + 96 + int __virtio_config_val(struct virtio_device *dev, 97 + u8 type, void *val, size_t size); 98 + 99 + /** 100 + * virtio_use_bit - helper to use a feature bit in a bitfield value. 101 + * @dev: the virtio device 102 + * @token: the token as returned from vdev->config->find(). 103 + * @len: the length of the field. 104 + * @bitnum: the bit to test. 105 + * 106 + * If handed a NULL token, it returns false, otherwise returns bit status. 107 + * If it's one, it sets the mirroring acknowledgement bit. */ 108 + int virtio_use_bit(struct virtio_device *vdev, 109 + void *token, unsigned int len, unsigned int bitnum); 110 + #endif /* __KERNEL__ */ 111 + #endif /* _LINUX_VIRTIO_CONFIG_H */
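Typical use combines find()'s use-it-once semantics with the endian-converting macro; for instance, reading the capacity field that virtio_blk.h defines above. A sketch of what a block driver's probe would do:

#include <linux/virtio.h>
#include <linux/virtio_blk.h>

/* Returns capacity in 512-byte sectors, or 0 if the Host didn't say. */
static u64 read_capacity(struct virtio_device *vdev)
{
        u64 capacity;

        /* __virtio_config_val() gives -ENOENT if the field is missing and
         * -EIO on a size mismatch; the macro then fixes up endianness. */
        if (virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &capacity))
                return 0;
        return capacity;
}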
+12
include/linux/virtio_console.h
···
··· 1 + #ifndef _LINUX_VIRTIO_CONSOLE_H 2 + #define _LINUX_VIRTIO_CONSOLE_H 3 + #include <linux/virtio_config.h> 4 + 5 + /* The ID for virtio console */ 6 + #define VIRTIO_ID_CONSOLE 3 7 + 8 + #ifdef __KERNEL__ 9 + int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)); 10 + #endif /* __KERNEL__ */ 11 + 12 + #endif /* _LINUX_VIRTIO_CONSOLE_H */
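The single kernel-side hook here lets a paravirtual host (lguest, for instance) get console output going before the full virtio driver probes: you hand virtio_cons_early_init() an output routine, and the console core uses it for early output. A minimal sketch, where my_put_chars() and its transport to the Host are entirely hypothetical:

	/* Hypothetical output routine: deliver the buffer to the Host
	 * (hypercall, shared page, ...) and report how much was taken. */
	static int my_put_chars(u32 vtermno, const char *buf, int count)
	{
		/* ... transport-specific delivery goes here ... */
		return count;
	}

	static int __init my_early_console_init(void)
	{
		return virtio_cons_early_init(my_put_chars);
	}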
+36
include/linux/virtio_net.h
···
··· 1 + #ifndef _LINUX_VIRTIO_NET_H 2 + #define _LINUX_VIRTIO_NET_H 3 + #include <linux/virtio_config.h> 4 + 5 + /* The ID for virtio_net */ 6 + #define VIRTIO_ID_NET 1 7 + 8 + /* The bitmap of config for virtio net */ 9 + #define VIRTIO_CONFIG_NET_F 0x40 10 + #define VIRTIO_NET_F_NO_CSUM 0 11 + #define VIRTIO_NET_F_TSO4 1 12 + #define VIRTIO_NET_F_UFO 2 13 + #define VIRTIO_NET_F_TSO4_ECN 3 14 + #define VIRTIO_NET_F_TSO6 4 15 + 16 + /* The config defining mac address. */ 17 + #define VIRTIO_CONFIG_NET_MAC_F 0x41 18 + 19 + /* This is the first element of the scatter-gather list. If you don't 20 + * specify GSO or CSUM features, you can simply ignore the header. */ 21 + struct virtio_net_hdr 22 + { 23 + #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset 24 + __u8 flags; 25 + #define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame 26 + #define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO) 27 + /* FIXME: Do we need this? If they said they can handle ECN, do they care? */ 28 + #define VIRTIO_NET_HDR_GSO_TCPV4_ECN 2 // GSO frame, IPv4 TCP w/ ECN 29 + #define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO) 30 + #define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP 31 + __u8 gso_type; 32 + __u16 gso_size; 33 + __u16 csum_start; 34 + __u16 csum_offset; 35 + }; 36 + #endif /* _LINUX_VIRTIO_NET_H */
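Since the header comment says a sender without checksum or GSO features can simply ignore the header, the common case is an all-zero header; for offloaded packets the fields describe the work left for the other side. A sketch of both cases (the MSS and offsets are illustrative values, not requirements):

	/* Plain packet: no flags, no GSO, all other fields ignored. */
	struct virtio_net_hdr plain_hdr = {
		.gso_type = VIRTIO_NET_HDR_GSO_NONE,
	};

	/* IPv4 TSO frame with a partial checksum: csum_start/csum_offset
	 * locate the TCP checksum, gso_size is the per-segment payload. */
	struct virtio_net_hdr tso_hdr = {
		.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM,
		.gso_type = VIRTIO_NET_HDR_GSO_TCPV4,
		.gso_size = 1448,	/* illustrative MSS */
		.csum_start = 14 + 20,	/* Ethernet + IPv4 headers */
		.csum_offset = 16,	/* checksum field in TCP header */
	};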
+119
include/linux/virtio_ring.h
···
··· 1 + #ifndef _LINUX_VIRTIO_RING_H 2 + #define _LINUX_VIRTIO_RING_H 3 + /* An interface for efficient virtio implementation, currently for use by KVM 4 + * and lguest, but hopefully others soon. Do NOT change this since it will 5 + * break existing servers and clients. 6 + * 7 + * This header is BSD licensed so anyone can use the definitions to implement 8 + * compatible drivers/servers. 9 + * 10 + * Copyright Rusty Russell IBM Corporation 2007. */ 11 + #include <linux/types.h> 12 + 13 + /* This marks a buffer as continuing via the next field. */ 14 + #define VRING_DESC_F_NEXT 1 15 + /* This marks a buffer as write-only (otherwise read-only). */ 16 + #define VRING_DESC_F_WRITE 2 17 + 18 + /* This means don't notify other side when buffer added. */ 19 + #define VRING_USED_F_NO_NOTIFY 1 20 + /* This means don't interrupt guest when buffer consumed. */ 21 + #define VRING_AVAIL_F_NO_INTERRUPT 1 22 + 23 + /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ 24 + struct vring_desc 25 + { 26 + /* Address (guest-physical). */ 27 + __u64 addr; 28 + /* Length. */ 29 + __u32 len; 30 + /* The flags as indicated above. */ 31 + __u16 flags; 32 + /* We chain unused descriptors via this, too */ 33 + __u16 next; 34 + }; 35 + 36 + struct vring_avail 37 + { 38 + __u16 flags; 39 + __u16 idx; 40 + __u16 ring[]; 41 + }; 42 + 43 + /* u32 is used here for ids for padding reasons. */ 44 + struct vring_used_elem 45 + { 46 + /* Index of start of used descriptor chain. */ 47 + __u32 id; 48 + /* Total length of the descriptor chain which was used (written to) */ 49 + __u32 len; 50 + }; 51 + 52 + struct vring_used 53 + { 54 + __u16 flags; 55 + __u16 idx; 56 + struct vring_used_elem ring[]; 57 + }; 58 + 59 + struct vring { 60 + unsigned int num; 61 + 62 + struct vring_desc *desc; 63 + 64 + struct vring_avail *avail; 65 + 66 + struct vring_used *used; 67 + }; 68 + 69 + /* The standard layout for the ring is a contiguous chunk of memory which looks 70 + * like this. The used fields will be aligned to a "num+1" boundary. 71 + * 72 + * struct vring 73 + * { 74 + * // The actual descriptors (16 bytes each) 75 + * struct vring_desc desc[num]; 76 + * 77 + * // A ring of available descriptor heads with free-running index. 78 + * __u16 avail_flags; 79 + * __u16 avail_idx; 80 + * __u16 available[num]; 81 + * 82 + * // Padding so a correctly-chosen num value will cache-align used_idx. 83 + * char pad[sizeof(struct vring_desc) - sizeof(avail_flags)]; 84 + * 85 + * // A ring of used descriptor heads with free-running index. 
86 + * __u16 used_flags; 87 + * __u16 used_idx; 88 + * struct vring_used_elem used[num]; 89 + * }; 90 + */ 91 + static inline void vring_init(struct vring *vr, unsigned int num, void *p) 92 + { 93 + vr->num = num; 94 + vr->desc = p; 95 + vr->avail = p + num*sizeof(struct vring_desc); 96 + vr->used = p + (num+1)*(sizeof(struct vring_desc) + sizeof(__u16)); 97 + } 98 + 99 + static inline unsigned vring_size(unsigned int num) 100 + { 101 + return (num + 1) * (sizeof(struct vring_desc) + sizeof(__u16)) 102 + + sizeof(__u32) + num * sizeof(struct vring_used_elem); 103 + } 104 + 105 + #ifdef __KERNEL__ 106 + #include <linux/irqreturn.h> 107 + struct virtio_device; 108 + struct virtqueue; 109 + 110 + struct virtqueue *vring_new_virtqueue(unsigned int num, 111 + struct virtio_device *vdev, 112 + void *pages, 113 + void (*notify)(struct virtqueue *vq), 114 + bool (*callback)(struct virtqueue *vq)); 115 + void vring_del_virtqueue(struct virtqueue *vq); 116 + 117 + irqreturn_t vring_interrupt(int irq, void *_vq); 118 + #endif /* __KERNEL__ */ 119 + #endif /* _LINUX_VIRTIO_RING_H */
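Putting the two helpers together: whoever owns the ring allocates vring_size(num) bytes of zeroed, page-aligned memory and lets vring_init() carve out the three parts, exactly as in the layout comment. A sketch, where alloc_ring_pages() stands in for whatever allocator the transport actually uses:

	struct vring vr;
	unsigned int num = 256;

	/* alloc_ring_pages() is hypothetical: any zeroed, page-aligned
	 * allocation of vring_size(num) bytes will do. */
	void *p = alloc_ring_pages(vring_size(num));

	vring_init(&vr, num, p);
	/* vr.desc, vr.avail and vr.used now point into the one contiguous
	 * block, in the order the layout comment above describes. */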
+1
include/video/Kbuild
··· 1 unifdef-y += sisfb.h uvesafb.h
··· 1 unifdef-y += sisfb.h uvesafb.h 2 + unifdef-y += edid.h
+4 -5
include/video/edid.h
··· 1 #ifndef __linux_video_edid_h__ 2 #define __linux_video_edid_h__ 3 4 - #ifdef __KERNEL__ 5 6 - 7 - #ifdef CONFIG_X86 8 struct edid_info { 9 unsigned char dummy[128]; 10 }; 11 12 extern struct edid_info edid_info; 13 - #endif /* CONFIG_X86 */ 14 - 15 #endif /* __KERNEL__ */ 16 17 #endif /* __linux_video_edid_h__ */
··· 1 #ifndef __linux_video_edid_h__ 2 #define __linux_video_edid_h__ 3 4 + #if !defined(__KERNEL__) || defined(CONFIG_X86) 5 6 struct edid_info { 7 unsigned char dummy[128]; 8 }; 9 10 + #ifdef __KERNEL__ 11 extern struct edid_info edid_info; 12 #endif /* __KERNEL__ */ 13 + 14 + #endif 15 16 #endif /* __linux_video_edid_h__ */
+18
scripts/mod/file2alias.c
··· 525 return 1; 526 } 527 528 /* Ignore any prefix, eg. v850 prepends _ */ 529 static inline int sym_is(const char *symbol, const char *name) 530 { ··· 665 do_table(symval, sym->st_size, 666 sizeof(struct ssb_device_id), "ssb", 667 do_ssb_entry, mod); 668 free(zeros); 669 } 670
··· 525 return 1; 526 } 527 528 + /* Looks like: virtio:dNvN */ 529 + static int do_virtio_entry(const char *filename, struct virtio_device_id *id, 530 + char *alias) 531 + { 532 + id->device = TO_NATIVE(id->device); 533 + id->vendor = TO_NATIVE(id->vendor); 534 + 535 + strcpy(alias, "virtio:"); 536 + ADD(alias, "d", 1, id->device); 537 + ADD(alias, "v", id->vendor != VIRTIO_DEV_ANY_ID, id->vendor); 538 + 539 + return 1; 540 + } 541 + 542 /* Ignore any prefix, eg. v850 prepends _ */ 543 static inline int sym_is(const char *symbol, const char *name) 544 { ··· 651 do_table(symval, sym->st_size, 652 sizeof(struct ssb_device_id), "ssb", 653 do_ssb_entry, mod); 654 + else if (sym_is(symname, "__mod_virtio_device_table")) 655 + do_table(symval, sym->st_size, 656 + sizeof(struct virtio_device_id), "virtio", 657 + do_virtio_entry, mod); 658 free(zeros); 659 } 660
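The alias format falls straight out of the ADD() calls: the 32-bit device field is always printed as eight hex digits, while a vendor of VIRTIO_DEV_ANY_ID becomes the "*" wildcard. So a driver claiming any vendor's network device, e.g. (assuming the device-then-vendor virtio_device_id layout do_virtio_entry() expects):

	static struct virtio_device_id id_table[] = {
		{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
		{ 0 },
	};
	MODULE_DEVICE_TABLE(virtio, id_table);

ends up with the modalias "virtio:d00000001v*", which module autoprobing can match against what the virtio bus reports for a device.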