Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest

* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest: (45 commits)
Use "struct boot_params" in example launcher
Loading bzImage directly.
Revert lguest magic and use hook in head.S
Update lguest documentation to reflect the new virtual block device name.
generalize lgread_u32/lgwrite_u32.
Example launcher handle guests not being ready for input
Update example launcher for virtio
Lguest support for Virtio
Remove old lguest I/O infrastructure.
Remove old lguest bus and drivers.
Virtio helper routines for a descriptor ringbuffer implementation
Module autoprobing support for virtio drivers.
Virtio console driver
Block driver using virtio.
Net driver using virtio
Virtio interface
Boot with virtual == physical to get closer to native Linux.
Allow guest to specify syscall vector to use.
Rename "cr3" to "gpgdir" to avoid x86-specific naming.
Pagetables to use normal kernel types
...

+4817 -4396 (merge total)

Documentation/lguest/Makefile (+3 -23)
 # This creates the demonstration utility "lguest" which runs a Linux guest.
-
-# For those people that have a separate object dir, look there for .config
-KBUILD_OUTPUT := ../..
-ifdef O
-ifeq ("$(origin O)", "command line")
-KBUILD_OUTPUT := $(O)
-endif
-endif
-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-include $(KBUILD_OUTPUT)/.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
-
-CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
+CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include
 LDLIBS:=-lz
-# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
-# not others (eg. FC7).
-LDFLAGS+=-static
-all: lguest.lds lguest
 
-# The linker script on x86 is so complex the only way of creating one
-# which will link our binary in the right place is to mangle the
-# default one.
-lguest.lds:
-	$(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
+all: lguest
 
 clean:
-	rm -f lguest.lds lguest
+	rm -f lguest
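With the linker-script magic gone, the launcher builds like any ordinary host
program. For readers following along, something like this should work (options
per Documentation/lguest/lguest.txt; the file names here are illustrative):

	cd Documentation/lguest && make
	./lguest 64 vmlinux --block=rootfile root=/dev/vda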
Documentation/lguest/lguest.c (+879 -744)
 /*P:100 This is the Launcher code, a simple program which lays out the
  * "physical" memory for the new Guest by mapping the kernel image and the
  * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
- *
- * The only trick: the Makefile links it at a high address so it will be clear
- * of the guest memory region.  It means that each Guest cannot have more than
- * about 2.5G of memory on a normally configured Host. :*/
+ :*/
 #define _LARGEFILE64_SOURCE
 #define _GNU_SOURCE
 #include <stdio.h>
···
 #include <stdlib.h>
 #include <elf.h>
 #include <sys/mman.h>
+#include <sys/param.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/wait.h>
···
 #include <termios.h>
 #include <getopt.h>
 #include <zlib.h>
-/*L:110 We can ignore the 28 include files we need for this program, but I do
+#include <assert.h>
+#include <sched.h>
+/*L:110 We can ignore the 30 include files we need for this program, but I do
  * want to draw attention to the use of kernel-style types.
  *
  * As Linus said, "C is a Spartan language, and so should your naming be."  I
···
 typedef uint32_t u32;
 typedef uint16_t u16;
 typedef uint8_t u8;
-#include "../../include/linux/lguest_launcher.h"
-#include "../../include/asm-x86/e820_32.h"
+#include "linux/lguest_launcher.h"
+#include "linux/pci_ids.h"
+#include "linux/virtio_config.h"
+#include "linux/virtio_net.h"
+#include "linux/virtio_blk.h"
+#include "linux/virtio_console.h"
+#include "linux/virtio_ring.h"
+#include "asm-x86/bootparam.h"
 /*:*/
 
 #define PAGE_PRESENT 0x7	/* Present, RW, Execute */
···
 #ifndef SIOCBRADDIF
 #define SIOCBRADDIF	0x89a2	/* add interface to bridge */
 #endif
+/* We can have up to 256 pages for devices. */
+#define DEVICE_PAGES 256
+/* This fits nicely in a single 4096-byte page. */
+#define VIRTQUEUE_NUM 127
 
 /*L:120 verbose is both a global flag and a macro.  The C preprocessor allows
  * this, and although I wouldn't recommend it, it works quite nicely here. */
···
 
 /* The pipe to send commands to the waker process */
 static int waker_fd;
-/* The top of guest physical memory. */
-static u32 top;
+/* The pointer to the start of guest memory. */
+static void *guest_base;
+/* The maximum guest physical address allowed, and maximum possible. */
+static unsigned long guest_limit, guest_max;
 
 /* This is our list of devices. */
 struct device_list
···
 	fd_set infds;
 	int max_infd;
 
+	/* Counter to assign interrupt numbers. */
+	unsigned int next_irq;
+
+	/* Counter to print out convenient device numbers. */
+	unsigned int device_num;
+
 	/* The descriptor page for the devices. */
-	struct lguest_device_desc *descs;
+	u8 *descpage;
+
+	/* The tail of the last descriptor. */
+	unsigned int desc_used;
 
 	/* A single linked list of devices. */
 	struct device *dev;
···
 	struct device **lastdev;
 };
 
+/* The list of Guest devices, based on command line arguments. */
+static struct device_list devices;
+
 /* The device structure describes a single device. */
 struct device
 {
 	/* The linked-list pointer. */
 	struct device *next;
-	/* The descriptor for this device, as mapped into the Guest. */
+
+	/* This device's descriptor, as mapped into the Guest. */
 	struct lguest_device_desc *desc;
-	/* The memory page(s) of this device, if any.  Also mapped in Guest. */
-	void *mem;
+
+	/* The name of this device, for --verbose. */
+	const char *name;
 
 	/* If handle_input is set, it wants to be called when this file
 	 * descriptor is ready. */
 	int fd;
 	bool (*handle_input)(int fd, struct device *me);
 
-	/* If handle_output is set, it wants to be called when the Guest sends
-	 * DMA to this key. */
-	unsigned long watch_key;
-	u32 (*handle_output)(int fd, const struct iovec *iov,
-			     unsigned int num, struct device *me);
+	/* Any queues attached to this device */
+	struct virtqueue *vq;
 
 	/* Device-specific data. */
 	void *priv;
 };
+
+/* The virtqueue structure describes a queue attached to a device. */
+struct virtqueue
+{
+	struct virtqueue *next;
+
+	/* Which device owns me. */
+	struct device *dev;
+
+	/* The configuration for this queue. */
+	struct lguest_vqconfig config;
+
+	/* The actual ring of buffers. */
+	struct vring vring;
+
+	/* Last available index we saw. */
+	u16 last_avail_idx;
+
+	/* The routine to call when the Guest pings us. */
+	void (*handle_output)(int fd, struct virtqueue *me);
+};
+
+/* Since guest is UP and we don't run at the same time, we don't need barriers.
+ * But I include them in the code in case others copy it. */
+#define wmb()
+
+/* Convert an iovec element to the given type.
+ *
+ * This is a fairly ugly trick: we need to know the size of the type and
+ * alignment requirement to check the pointer is kosher.  It's also nice to
+ * have the name of the type in case we report failure.
+ *
+ * Typing those three things all the time is cumbersome and error prone, so we
+ * have a macro which sets them all up and passes to the real function. */
+#define convert(iov, type) \
+	((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
+
+static void *_convert(struct iovec *iov, size_t size, size_t align,
+		      const char *name)
+{
+	if (iov->iov_len != size)
+		errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
+	if ((unsigned long)iov->iov_base % align != 0)
+		errx(1, "Bad alignment %p for %s", iov->iov_base, name);
+	return iov->iov_base;
+}
+
+/* The virtio configuration space is defined to be little-endian.  x86 is
+ * little-endian too, but it's nice to be explicit so we have these helpers. */
+#define cpu_to_le16(v16) (v16)
+#define cpu_to_le32(v32) (v32)
+#define cpu_to_le64(v64) (v64)
+#define le16_to_cpu(v16) (v16)
+#define le32_to_cpu(v32) (v32)
+#define le64_to_cpu(v64) (v64)
+
+/*L:100 The Launcher code itself takes us out into userspace, that scary place
+ * where pointers run wild and free!  Unfortunately, like most userspace
+ * programs, it's quite boring (which is why everyone likes to hack on the
+ * kernel!).  Perhaps if you make up an Lguest Drinking Game at this point, it
+ * will get you through this section.  Or, maybe not.
+ *
+ * The Launcher sets up a big chunk of memory to be the Guest's "physical"
+ * memory and stores it in "guest_base".  In other words, Guest physical ==
+ * Launcher virtual with an offset.
+ *
+ * This can be tough to get your head around, but usually it just means that we
+ * use these trivial conversion functions when the Guest gives us its
+ * "physical" addresses: */
+static void *from_guest_phys(unsigned long addr)
+{
+	return guest_base + addr;
+}
+
+static unsigned long to_guest_phys(const void *addr)
+{
+	return (addr - guest_base);
+}
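Since every later routine leans on this identity, a tiny worked example may
help; the base address below is purely illustrative (mmap chooses the real
one):

	/* Illustrative: if mmap() put guest_base at 0x8000000, then Guest
	 * "physical" 0x100000 is Launcher-virtual 0x8100000 and back again:
	 *   from_guest_phys(0x100000) == guest_base + 0x100000
	 *   to_guest_phys(guest_base + 0x100000) == 0x100000
	 * Every address the Guest hands us goes through exactly this pair. */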
 
 /*L:130
  * Loading the Kernel.
···
 	return fd;
 }
 
-/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */
-static void *map_zeroed_pages(unsigned long addr, unsigned int num)
+/* map_zeroed_pages() takes a number of pages. */
+static void *map_zeroed_pages(unsigned int num)
 {
-	/* We cache the /dev/zero file-descriptor so we only open it once. */
-	static int fd = -1;
-
-	if (fd == -1)
-		fd = open_or_die("/dev/zero", O_RDONLY);
+	int fd = open_or_die("/dev/zero", O_RDONLY);
+	void *addr;
 
 	/* We use a private mapping (ie. if we write to the page, it will be
-	 * copied), and obviously we insist that it be mapped where we ask. */
-	if (mmap((void *)addr, getpagesize() * num,
-		 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0)
-	    != (void *)addr)
-		err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
+	 * copied). */
+	addr = mmap(NULL, getpagesize() * num,
+		    PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
+	if (addr == MAP_FAILED)
+		err(1, "Mmaping %u pages of /dev/zero", num);
 
-	/* Returning the address is just a courtesy: can simplify callers. */
-	return (void *)addr;
+	return addr;
 }
 
-/* To find out where to start we look for the magic Guest string, which marks
- * the code we see in lguest_asm.S.  This is a hack which we are currently
- * plotting to replace with the normal Linux entry point. */
-static unsigned long entry_point(void *start, void *end,
-				 unsigned long page_offset)
+/* Get some more pages for a device. */
+static void *get_pages(unsigned int num)
 {
-	void *p;
+	void *addr = from_guest_phys(guest_limit);
 
-	/* The scan gives us the physical starting address.  We want the
-	 * virtual address in this case, and fortunately, we already figured
-	 * out the physical-virtual difference and passed it here in
-	 * "page_offset". */
-	for (p = start; p < end; p++)
-		if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
-			return (long)p + strlen("GenuineLguest") + page_offset;
+	guest_limit += num * getpagesize();
+	if (guest_limit > guest_max)
+		errx(1, "Not enough memory for devices");
+	return addr;
+}
 
-	err(1, "Is this image a genuine lguest?");
+/* This routine is used to load the kernel or initrd.  It tries mmap, but if
+ * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
+ * it falls back to reading the memory in. */
+static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
+{
+	ssize_t r;
+
+	/* We map writable even though some segments are marked read-only.
+	 * The kernel really wants to be writable: it patches its own
+	 * instructions.
+	 *
+	 * MAP_PRIVATE means that the page won't be copied until a write is
+	 * done to it.  This allows us to share untouched memory between
+	 * Guests. */
+	if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
+		 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
+		return;
+
+	/* pread does a seek and a read in one shot: saves a few lines. */
+	r = pread(fd, addr, len, offset);
+	if (r != len)
+		err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
 }
 
 /* This routine takes an open vmlinux image, which is in ELF, and maps it into
···
 * by all modern binaries on Linux including the kernel.
 *
 * The ELF headers give *two* addresses: a physical address, and a virtual
- * address.  The Guest kernel expects to be placed in memory at the physical
- * address, and the page tables set up so it will correspond to that virtual
- * address.  We return the difference between the virtual and physical
- * addresses in the "page_offset" pointer.
+ * address.  We use the physical address; the Guest will map itself to the
+ * virtual address.
 *
 * We return the starting address. */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
-			     unsigned long *page_offset)
+static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
 {
-	void *addr;
 	Elf32_Phdr phdr[ehdr->e_phnum];
 	unsigned int i;
-	unsigned long start = -1UL, end = 0;
 
 	/* Sanity checks on the main ELF header: an x86 executable with a
 	 * reasonable number of correctly-sized program headers. */
···
 	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
 		err(1, "Reading program headers");
 
-	/* We don't know page_offset yet. */
-	*page_offset = 0;
-
 	/* Try all the headers: there are usually only three.  A read-only one,
 	 * a read-write one, and a "note" section which isn't loadable. */
 	for (i = 0; i < ehdr->e_phnum; i++) {
···
 		verbose("Section %i: size %i addr %p\n",
 			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
 
-		/* We expect a simple linear address space: every segment must
-		 * have the same difference between virtual (p_vaddr) and
-		 * physical (p_paddr) address. */
-		if (!*page_offset)
-			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
-		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
-			errx(1, "Page offset of section %i different", i);
-
-		/* We track the first and last address we mapped, so we can
-		 * tell entry_point() where to scan. */
-		if (phdr[i].p_paddr < start)
-			start = phdr[i].p_paddr;
-		if (phdr[i].p_paddr + phdr[i].p_filesz > end)
-			end = phdr[i].p_paddr + phdr[i].p_filesz;
-
-		/* We map this section of the file at its physical address.  We
-		 * map it read & write even if the header says this segment is
-		 * read-only.  The kernel really wants to be writable: it
-		 * patches its own instructions which would normally be
-		 * read-only.
-		 *
-		 * MAP_PRIVATE means that the page won't be copied until a
-		 * write is done to it.  This allows us to share much of the
-		 * kernel memory between Guests. */
-		addr = mmap((void *)phdr[i].p_paddr,
-			    phdr[i].p_filesz,
-			    PROT_READ|PROT_WRITE|PROT_EXEC,
-			    MAP_FIXED|MAP_PRIVATE,
-			    elf_fd, phdr[i].p_offset);
-		if (addr != (void *)phdr[i].p_paddr)
-			err(1, "Mmaping vmlinux seg %i gave %p not %p",
-			    i, addr, (void *)phdr[i].p_paddr);
+		/* We map this section of the file at its physical address. */
+		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
+		       phdr[i].p_offset, phdr[i].p_filesz);
 	}
 
-	return entry_point((void *)start, (void *)end, *page_offset);
-}
-
-/*L:170 Prepare to be SHOCKED and AMAZED.  And possibly a trifle nauseated.
- *
- * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects
- * to be.  We don't know what that option was, but we can figure it out
- * approximately by looking at the addresses in the code.  I chose the common
- * case of reading a memory location into the %eax register:
- *
- *  movl <some-address>, %eax
- *
- * This gets encoded as five bytes: "0xA1 <4-byte-address>".  For example,
- * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
- *
- * In this example we can guess that the kernel was compiled with
- * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number).  If the
- * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
- * kernel isn't that bloated yet.
- *
- * Unfortunately, x86 has variable-length instructions, so finding this
- * particular instruction properly involves writing a disassembler.  Instead,
- * we rely on statistics.  We look for "0xA1" and tally the different bytes
- * which occur 4 bytes later (the "0xC0" in our example above).  When one of
- * those bytes appears three times, we can be reasonably confident that it
- * forms the start of CONFIG_PAGE_OFFSET.
- *
- * This is amazingly reliable. */
-static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
-{
-	unsigned int i, possibilities[256] = { 0 };
-
-	for (i = 0; i + 4 < len; i++) {
-		/* mov 0xXXXXXXXX,%eax */
-		if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
-			return (unsigned long)img[i+4] << 24;
-	}
-	errx(1, "could not determine page offset");
-}
-
-/*L:160 Unfortunately the entire ELF image isn't compressed: the segments
- * which need loading are extracted and compressed raw.  This denies us the
- * information we need to make a fully-general loader. */
-static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
-{
-	gzFile f;
-	int ret, len = 0;
-	/* A bzImage always gets loaded at physical address 1M.  This is
-	 * actually configurable as CONFIG_PHYSICAL_START, but as the comment
-	 * there says, "Don't change this unless you know what you are doing".
-	 * Indeed. */
-	void *img = (void *)0x100000;
-
-	/* gzdopen takes our file descriptor (carefully placed at the start of
-	 * the GZIP header we found) and returns a gzFile. */
-	f = gzdopen(fd, "rb");
-	/* We read it into memory in 64k chunks until we hit the end. */
-	while ((ret = gzread(f, img + len, 65536)) > 0)
-		len += ret;
-	if (ret < 0)
-		err(1, "reading image from bzImage");
-
-	verbose("Unpacked size %i addr %p\n", len, img);
-
-	/* Without the ELF header, we can't tell virtual-physical gap.  This is
-	 * CONFIG_PAGE_OFFSET, and people do actually change it.  Fortunately,
-	 * I have a clever way of figuring it out from the code itself. */
-	*page_offset = intuit_page_offset(img, len);
-
-	return entry_point(img, img + len, *page_offset);
+	/* The entry point is given in the ELF header. */
+	return ehdr->e_entry;
 }
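As an aside, you can see the two addresses map_elf() is choosing between with
a stock binutils tool: readelf -l vmlinux prints the program headers, and on a
typical i386 kernel the LOAD segments show a VirtAddr around 0xc0100000 with a
PhysAddr of 0x00100000 (the exact values depend on CONFIG_PAGE_OFFSET).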
 
 /*L:150 A bzImage, unlike an ELF file, is not meant to be loaded.  You're
- * supposed to jump into it and it will unpack itself.  We can't do that
- * because the Guest can't run the unpacking code, and adding features to
- * lguest kills puppies, so we don't want to.
+ * supposed to jump into it and it will unpack itself.  We used to have to
+ * perform some hairy magic because the unpacking code scared me.
  *
- * The bzImage is formed by putting the decompressing code in front of the
- * compressed kernel code.  So we can simply scan through it looking for the
- * first "gzip" header, and start decompressing from there. */
-static unsigned long load_bzimage(int fd, unsigned long *page_offset)
+ * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
+ * a small patch to jump over the tricky bits in the Guest, so now we just read
+ * the funky header so we know where in the file to load, and away we go! */
+static unsigned long load_bzimage(int fd)
 {
-	unsigned char c;
-	int state = 0;
+	struct boot_params boot;
+	int r;
+	/* Modern bzImages get loaded at 1M. */
+	void *p = from_guest_phys(0x100000);
 
-	/* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
-	while (read(fd, &c, 1) == 1) {
-		switch (state) {
-		case 0:
-			if (c == 0x1F)
-				state++;
-			break;
-		case 1:
-			if (c == 0x8B)
-				state++;
-			else
-				state = 0;
-			break;
-		case 2 ... 8:
-			state++;
-			break;
-		case 9:
-			/* Seek back to the start of the gzip header. */
-			lseek(fd, -10, SEEK_CUR);
-			/* One final check: "compressed under UNIX". */
-			if (c != 0x03)
-				state = -1;
-			else
-				return unpack_bzimage(fd, page_offset);
-		}
-	}
-	errx(1, "Could not find kernel in bzImage");
+	/* Go back to the start of the file and read the header.  It should be
+	 * a Linux boot header (see Documentation/i386/boot.txt) */
+	lseek(fd, 0, SEEK_SET);
+	read(fd, &boot, sizeof(boot));
+
+	/* Inside the setup_hdr, we expect the magic "HdrS" */
+	if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
+		errx(1, "This doesn't look like a bzImage to me");
+
+	/* Skip over the extra sectors of the header. */
+	lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
+
+	/* Now read everything into memory in nice big chunks. */
+	while ((r = read(fd, p, 65536)) > 0)
+		p += r;
+
+	/* Finally, code32_start tells us where to enter the kernel. */
+	return boot.hdr.code32_start;
 }
 
 /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
  * come wrapped up in the self-decompressing "bzImage" format.  With some funky
  * coding, we can load those, too. */
-static unsigned long load_kernel(int fd, unsigned long *page_offset)
+static unsigned long load_kernel(int fd)
 {
 	Elf32_Ehdr hdr;
 
···
 
 	/* If it's an ELF file, it starts with "\177ELF" */
 	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-		return map_elf(fd, &hdr, page_offset);
+		return map_elf(fd, &hdr);
 
 	/* Otherwise we assume it's a bzImage, and try to unpack it */
-	return load_bzimage(fd, page_offset);
+	return load_bzimage(fd);
 }
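For readers without asm-x86/bootparam.h handy: the fields load_bzimage() uses
sit at fixed offsets defined by the boot protocol (Documentation/i386/boot.txt).
A minimal sketch of the same check using raw offsets; the buffer and variable
names here are illustrative, not part of the launcher:

	unsigned char hdr[0x218];
	if (pread(fd, hdr, sizeof(hdr), 0) != sizeof(hdr))
		err(1, "reading bzImage header");
	if (memcmp(hdr + 0x202, "HdrS", 4) != 0)	/* header magic */
		errx(1, "not a bzImage");
	/* setup_sects lives at 0x1f1 (the boot protocol says 0 means 4). */
	unsigned long kernel_off = ((hdr[0x1f1] ? hdr[0x1f1] : 4) + 1) * 512;
	/* code32_start, the 32-bit entry point, lives at 0x214. */
	unsigned long entry = *(u32 *)(hdr + 0x214);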
 
 /* This is a trivial little helper to align pages.  Andi Kleen hated it because
···
 	int ifd;
 	struct stat st;
 	unsigned long len;
-	void *iaddr;
 
 	ifd = open_or_die(name, O_RDONLY);
 	/* fstat() is needed to get the file size. */
 	if (fstat(ifd, &st) < 0)
 		err(1, "fstat() on initrd '%s'", name);
 
-	/* The length needs to be rounded up to a page size: mmap needs the
-	 * address to be page aligned. */
+	/* We map the initrd at the top of memory, but mmap wants it to be
+	 * page-aligned, so we round the size up for that. */
 	len = page_align(st.st_size);
-	/* We map the initrd at the top of memory. */
-	iaddr = mmap((void *)mem - len, st.st_size,
-		     PROT_READ|PROT_EXEC|PROT_WRITE,
-		     MAP_FIXED|MAP_PRIVATE, ifd, 0);
-	if (iaddr != (void *)mem - len)
-		err(1, "Mmaping initrd '%s' returned %p not %p",
-		    name, iaddr, (void *)mem - len);
+	map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
 	/* Once a file is mapped, you can close the file descriptor.  It's a
 	 * little odd, but quite useful. */
 	close(ifd);
-	verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
+	verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
 
 	/* We return the initrd size. */
 	return len;
 }
 
-/* Once we know how much memory we have, and the address the Guest kernel
- * expects, we can construct simple linear page tables which will get the Guest
- * far enough into the boot to create its own.
+/* Once we know how much memory we have, we can construct simple linear page
+ * tables which set virtual == physical, and which will get the Guest far
+ * enough into the boot to create its own.
  *
  * We lay them out of the way, just below the initrd (which is why we need to
  * know its size). */
 static unsigned long setup_pagetables(unsigned long mem,
-				      unsigned long initrd_size,
-				      unsigned long page_offset)
+				      unsigned long initrd_size)
 {
-	u32 *pgdir, *linear;
+	unsigned long *pgdir, *linear;
 	unsigned int mapped_pages, i, linear_pages;
-	unsigned int ptes_per_page = getpagesize()/sizeof(u32);
+	unsigned int ptes_per_page = getpagesize()/sizeof(void *);
 
-	/* Ideally we map all physical memory starting at page_offset.
-	 * However, if page_offset is 0xC0000000 we can only map 1G of physical
-	 * (0xC0000000 + 1G overflows). */
-	if (mem <= -page_offset)
-		mapped_pages = mem/getpagesize();
-	else
-		mapped_pages = -page_offset/getpagesize();
+	mapped_pages = mem/getpagesize();
 
 	/* Each PTE page can map ptes_per_page pages: how many do we need? */
 	linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
 
 	/* We put the toplevel page directory page at the top of memory. */
-	pgdir = (void *)mem - initrd_size - getpagesize();
+	pgdir = from_guest_phys(mem) - initrd_size - getpagesize();
 
 	/* Now we use the next linear_pages pages as pte pages */
 	linear = (void *)pgdir - linear_pages*getpagesize();
···
 	for (i = 0; i < mapped_pages; i++)
 		linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
 
-	/* The top level points to the linear page table pages above.  The
-	 * entry representing page_offset points to the first one, and they
-	 * continue from there. */
+	/* The top level points to the linear page table pages above. */
 	for (i = 0; i < mapped_pages; i += ptes_per_page) {
-		pgdir[(i + page_offset/getpagesize())/ptes_per_page]
-			= (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
+		pgdir[i/ptes_per_page]
+			= ((to_guest_phys(linear) + i*sizeof(void *))
+			   | PAGE_PRESENT);
 	}
 
-	verbose("Linear mapping of %u pages in %u pte pages at %p\n",
-		mapped_pages, linear_pages, linear);
+	verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
+		mapped_pages, linear_pages, to_guest_phys(linear));
 
 	/* We return the top level (guest-physical) address: the kernel needs
 	 * to know where it is. */
-	return (unsigned long)pgdir;
+	return to_guest_phys(pgdir);
 }
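A worked example of the arithmetic (numbers illustrative, assuming a 32-bit
launcher with 4096-byte pages and 64MB of guest memory):

	/* mapped_pages  = 64MB / 4096            = 16384
	 * ptes_per_page = 4096 / sizeof(void *)  = 1024
	 * linear_pages  = (16384 + 1023) / 1024  = 16 PTE pages
	 * So pgdir[0..15] point at the 16 PTE pages, and PTE j of page k
	 * maps virtual == physical address (k*1024 + j) * 4096. */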
 
 /* Simple routine to roll all the commandline arguments together with spaces
···
 
 /* This is where we actually tell the kernel to initialize the Guest.  We saw
  * the arguments it expects when we looked at initialize() in lguest_user.c:
- * the top physical page to allow, the top level pagetable, the entry point and
- * the page_offset constant for the Guest. */
-static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
+ * the base of guest "physical" memory, the top physical page to allow, the
+ * top level pagetable and the entry point for the Guest. */
+static int tell_kernel(unsigned long pgdir, unsigned long start)
 {
-	u32 args[] = { LHREQ_INITIALIZE,
-		       top/getpagesize(), pgdir, start, page_offset };
+	unsigned long args[] = { LHREQ_INITIALIZE,
+				 (unsigned long)guest_base,
+				 guest_limit / getpagesize(), pgdir, start };
 	int fd;
 
+	verbose("Guest: %p - %p (%#lx)\n",
+		guest_base, guest_base + guest_limit, guest_limit);
 	fd = open_or_die("/dev/lguest", O_RDWR);
 	if (write(fd, args, sizeof(args)) < 0)
 		err(1, "Writing to /dev/lguest");
···
 }
 /*:*/
 
-static void set_fd(int fd, struct device_list *devices)
+static void add_device_fd(int fd)
 {
-	FD_SET(fd, &devices->infds);
-	if (fd > devices->max_infd)
-		devices->max_infd = fd;
+	FD_SET(fd, &devices.infds);
+	if (fd > devices.max_infd)
+		devices.max_infd = fd;
 }
 
 /*L:200
···
 *
 * This, of course, is merely a different *kind* of icky.
 */
-static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices)
+static void wake_parent(int pipefd, int lguest_fd)
 {
 	/* Add the pipe from the Launcher to the fdset in the device_list, so
 	 * we watch it, too. */
-	set_fd(pipefd, devices);
+	add_device_fd(pipefd);
 
 	for (;;) {
-		fd_set rfds = devices->infds;
-		u32 args[] = { LHREQ_BREAK, 1 };
+		fd_set rfds = devices.infds;
+		unsigned long args[] = { LHREQ_BREAK, 1 };
 
 		/* Wait until input is ready from one of the devices. */
-		select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
+		select(devices.max_infd+1, &rfds, NULL, NULL, NULL);
 		/* Is it a message from the Launcher? */
 		if (FD_ISSET(pipefd, &rfds)) {
-			int ignorefd;
+			int fd;
 			/* If read() returns 0, it means the Launcher has
 			 * exited.  We silently follow. */
-			if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
+			if (read(pipefd, &fd, sizeof(fd)) == 0)
 				exit(0);
-			/* Otherwise it's telling us there's a problem with one
-			 * of the devices, and we should ignore that file
-			 * descriptor from now on. */
-			FD_CLR(ignorefd, &devices->infds);
+			/* Otherwise it's telling us to change what file
+			 * descriptors we're to listen to. */
+			if (fd >= 0)
+				FD_SET(fd, &devices.infds);
+			else
+				FD_CLR(-fd - 1, &devices.infds);
 		} else /* Send LHREQ_BREAK command. */
 			write(lguest_fd, args, sizeof(args));
 	}
 }
 
 /* This routine just sets up a pipe to the Waker process. */
-static int setup_waker(int lguest_fd, struct device_list *device_list)
+static int setup_waker(int lguest_fd)
 {
 	int pipefd[2], child;
 
···
 	if (child == 0) {
 		/* Close the "writing" end of our copy of the pipe */
 		close(pipefd[1]);
-		wake_parent(pipefd[0], lguest_fd, device_list);
+		wake_parent(pipefd[0], lguest_fd);
 	}
 	/* Close the reading end of our copy of the pipe. */
 	close(pipefd[0]);
···
 {
 	/* We have to separately check addr and addr+size, because size could
 	 * be huge and addr + size might wrap around. */
-	if (addr >= top || addr + size >= top)
-		errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
+	if (addr >= guest_limit || addr + size >= guest_limit)
+		errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
 	/* We return a pointer for the caller's convenience, now we know it's
 	 * safe to use. */
-	return (void *)addr;
+	return from_guest_phys(addr);
 }
 /* A macro which transparently hands the line number to the real function. */
 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
 
-/* The Guest has given us the address of a "struct lguest_dma".  We check it's
- * OK and convert it to an iovec (which is a simple array of ptr/size
- * pairs). */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+/* This function returns the next descriptor in the chain, or vq->vring.num. */
+static unsigned next_desc(struct virtqueue *vq, unsigned int i)
 {
-	unsigned int i;
-	struct lguest_dma *udma;
+	unsigned int next;
 
-	/* First we make sure that the array memory itself is valid. */
-	udma = check_pointer(dma, sizeof(*udma));
-	/* Now we check each element */
-	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
-		/* A zero length ends the array. */
-		if (!udma->len[i])
-			break;
+	/* If this descriptor says it doesn't chain, we're done. */
+	if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
+		return vq->vring.num;
 
-		iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]);
-		iov[i].iov_len = udma->len[i];
-	}
-	*num = i;
+	/* Check they're not leading us off end of descriptors. */
+	next = vq->vring.desc[i].next;
+	/* Make sure compiler knows to grab that: we don't want it changing! */
+	wmb();
 
-	/* We return the pointer to where the caller should write the amount of
-	 * the buffer used. */
-	return &udma->used_len;
+	if (next >= vq->vring.num)
+		errx(1, "Desc next is %u", next);
+
+	return next;
 }
 
-/* This routine gets a DMA buffer from the Guest for a given key, and converts
- * it to an iovec array.  It returns the interrupt the Guest wants when we're
- * finished, and a pointer to the "used_len" field to fill in. */
-static u32 *get_dma_buffer(int fd, void *key,
-			   struct iovec iov[], unsigned int *num, u32 *irq)
+/* This looks in the virtqueue for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->vring.num (which
+ * is never a valid descriptor number) if none was found. */
+static unsigned get_vq_desc(struct virtqueue *vq,
+			    struct iovec iov[],
+			    unsigned int *out_num, unsigned int *in_num)
 {
-	u32 buf[] = { LHREQ_GETDMA, (u32)key };
-	unsigned long udma;
-	u32 *res;
+	unsigned int i, head;
 
-	/* Ask the kernel for a DMA buffer corresponding to this key. */
-	udma = write(fd, buf, sizeof(buf));
-	/* They haven't registered any, or they're all used? */
-	if (udma == (unsigned long)-1)
-		return NULL;
+	/* Check it isn't doing very strange things with descriptor numbers. */
+	if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num)
+		errx(1, "Guest moved used index from %u to %u",
+		     vq->last_avail_idx, vq->vring.avail->idx);
 
-	/* Convert it into our iovec array */
-	res = dma2iov(udma, iov, num);
-	/* The kernel stashes irq in ->used_len to get it out to us. */
-	*irq = *res;
-	/* Return a pointer to ((struct lguest_dma *)udma)->used_len. */
-	return res;
+	/* If there's nothing new since last we looked, return invalid. */
+	if (vq->vring.avail->idx == vq->last_avail_idx)
+		return vq->vring.num;
+
+	/* Grab the next descriptor number they're advertising, and increment
+	 * the index we've seen. */
+	head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num];
+
+	/* If their number is silly, that's a fatal mistake. */
+	if (head >= vq->vring.num)
+		errx(1, "Guest says index %u is available", head);
+
+	/* When we start there are neither input nor output descriptors. */
+	*out_num = *in_num = 0;
+
+	i = head;
+	do {
+		/* Grab the first descriptor, and check it's OK. */
+		iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
+		iov[*out_num + *in_num].iov_base
+			= check_pointer(vq->vring.desc[i].addr,
+					vq->vring.desc[i].len);
+		/* If this is an input descriptor, increment that count. */
+		if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
+			(*in_num)++;
+		else {
+			/* If it's an output descriptor, they're all supposed
+			 * to come before any input descriptors. */
+			if (*in_num)
+				errx(1, "Descriptor has out after in");
+			(*out_num)++;
+		}
+
+		/* If we've got too many, that implies a descriptor loop. */
+		if (*out_num + *in_num > vq->vring.num)
+			errx(1, "Looped descriptor");
+	} while ((i = next_desc(vq, i)) != vq->vring.num);
+
+	return head;
 }
 
-/* This is a convenient routine to send the Guest an interrupt. */
-static void trigger_irq(int fd, u32 irq)
+/* Once we've used one of their buffers, we tell them about it.  We'll then
+ * want to send them an interrupt, using trigger_irq(). */
+static void add_used(struct virtqueue *vq, unsigned int head, int len)
 {
-	u32 buf[] = { LHREQ_IRQ, irq };
+	struct vring_used_elem *used;
+
+	/* Get a pointer to the next entry in the used ring. */
+	used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
+	used->id = head;
+	used->len = len;
+	/* Make sure buffer is written before we update index. */
+	wmb();
+	vq->vring.used->idx++;
+}
+
+/* This actually sends the interrupt for this virtqueue */
+static void trigger_irq(int fd, struct virtqueue *vq)
+{
+	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
+
+	if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+		return;
+
+	/* Send the Guest an interrupt to tell them we used something up. */
 	if (write(fd, buf, sizeof(buf)) != 0)
-		err(1, "Triggering irq %i", irq);
+		err(1, "Triggering irq %i", vq->config.irq);
 }
 
-/* This simply sets up an iovec array where we can put data to be discarded.
- * This happens when the Guest doesn't want or can't handle the input: we have
- * to get rid of it somewhere, and if we bury it in the ceiling space it will
- * start to smell after a week. */
-static void discard_iovec(struct iovec *iov, unsigned int *num)
+/* And here's the combo meal deal.  Supersize me! */
+static void add_used_and_trigger(int fd, struct virtqueue *vq,
+				 unsigned int head, int len)
 {
-	static char discard_buf[1024];
-	*num = 1;
-	iov->iov_base = discard_buf;
-	iov->iov_len = sizeof(discard_buf);
+	add_used(vq, head, len);
+	trigger_irq(fd, vq);
 }
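get_vq_desc() and add_used() are the consumer half of the ring protocol; the
sketch below shows the producer half a Guest driver performs, for orientation
only (a real guest uses the virtio API and a real memory barrier, and kicks
the host via LHCALL_NOTIFY; the function and parameter names are made up):

	/* Illustrative guest-side sketch: publish descriptor "head" holding
	 * one output buffer, then reap completions written by add_used(). */
	static void guest_ring_example(struct vring *vr, u16 head,
				       unsigned long buf_gphys, u32 len,
				       u16 *last_used)
	{
		vr->desc[head].addr = buf_gphys;  /* guest-physical address */
		vr->desc[head].len = len;
		vr->desc[head].flags = 0;	  /* output, no chaining */

		vr->avail->ring[vr->avail->idx % vr->num] = head;
		wmb();				  /* descriptor before index */
		vr->avail->idx++;
		/* ...notify the host here... */

		/* Later: the host bumped used->idx, so consume the entry. */
		while (*last_used != vr->used->idx) {
			struct vring_used_elem *e =
				&vr->used->ring[*last_used % vr->num];
			verbose("buffer %u done, %u bytes\n", e->id, e->len);
			(*last_used)++;
		}
	}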
 
 /* Here is the input terminal setting we save, and the routine to restore them
···
 /* This is the routine which handles console input (ie. stdin). */
 static bool handle_console_input(int fd, struct device *dev)
 {
-	u32 irq = 0, *lenp;
 	int len;
-	unsigned int num;
-	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+	unsigned int head, in_num, out_num;
+	struct iovec iov[dev->vq->vring.num];
 	struct console_abort *abort = dev->priv;
 
-	/* First we get the console buffer from the Guest.  The key is dev->mem
-	 * which was set to 0 in setup_console(). */
-	lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
-	if (!lenp) {
-		/* If it's not ready for input, warn and set up to discard. */
-		warn("console: no dma buffer!");
-		discard_iovec(iov, &num);
-	}
+	/* First we need a console buffer from the Guest's input virtqueue. */
+	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+
+	/* If they're not ready for input, stop listening to this file
+	 * descriptor.  We'll start again once they add an input buffer. */
+	if (head == dev->vq->vring.num)
+		return false;
+
+	if (out_num)
+		errx(1, "Output buffers in console in queue?");
 
 	/* This is why we convert to iovecs: the readv() call uses them, and so
 	 * it reads straight into the Guest's buffer. */
-	len = readv(dev->fd, iov, num);
+	len = readv(dev->fd, iov, in_num);
 	if (len <= 0) {
 		/* This implies that the console is closed, is /dev/null, or
-		 * something went terribly wrong.  We still go through the rest
-		 * of the logic, though, especially the exit handling below. */
+		 * something went terribly wrong. */
 		warnx("Failed to get console input, ignoring console.");
-		len = 0;
+		/* Put the input terminal back. */
+		restore_term();
+		/* Remove callback from input vq, so it doesn't restart us. */
+		dev->vq->handle_output = NULL;
+		/* Stop listening to this fd: don't call us again. */
+		return false;
 	}
 
-	/* If we read the data into the Guest, fill in the length and send the
-	 * interrupt. */
-	if (lenp) {
-		*lenp = len;
-		trigger_irq(fd, irq);
-	}
+	/* Tell the Guest about the new input. */
+	add_used_and_trigger(fd, dev->vq, head, len);
 
 	/* Three ^C within one second?  Exit.
 	 *
···
 		struct timeval now;
 		gettimeofday(&now, NULL);
 		if (now.tv_sec <= abort->start.tv_sec+1) {
-			u32 args[] = { LHREQ_BREAK, 0 };
+			unsigned long args[] = { LHREQ_BREAK, 0 };
 			/* Close the fd so Waker will know it has to
 			 * exit. */
 			close(waker_fd);
···
 		/* Any other key resets the abort counter. */
 		abort->count = 0;
 
-	/* Now, if we didn't read anything, put the input terminal back and
-	 * return failure (meaning, don't call us again). */
-	if (!len) {
-		restore_term();
-		return false;
-	}
 	/* Everything went OK! */
 	return true;
 }
 
-/* Handling console output is much simpler than input. */
-static u32 handle_console_output(int fd, const struct iovec *iov,
-				 unsigned num, struct device *dev)
+/* Handling output for console is simple: we just get all the output buffers
+ * and write them to stdout. */
+static void handle_console_output(int fd, struct virtqueue *vq)
 {
-	/* Whatever the Guest sends, write it to standard output.  Return the
-	 * number of bytes written. */
-	return writev(STDOUT_FILENO, iov, num);
+	unsigned int head, out, in;
+	int len;
+	struct iovec iov[vq->vring.num];
+
+	/* Keep getting output buffers from the Guest until we run out. */
+	while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
+		if (in)
+			errx(1, "Input buffers in output queue?");
+		len = writev(STDOUT_FILENO, iov, out);
+		add_used_and_trigger(fd, vq, head, len);
+	}
 }
 
-/* Guest->Host network output is also pretty easy. */
-static u32 handle_tun_output(int fd, const struct iovec *iov,
-			     unsigned num, struct device *dev)
+/* Handling output for network is also simple: we get all the output buffers
+ * and write them (ignoring the first element) to this device's file
+ * descriptor. */
+static void handle_net_output(int fd, struct virtqueue *vq)
 {
-	/* We put a flag in the "priv" pointer of the network device, and set
-	 * it as soon as we see output.  We'll see why in handle_tun_input() */
-	*(bool *)dev->priv = true;
-	/* Whatever packet the Guest sent us, write it out to the tun
-	 * device. */
-	return writev(dev->fd, iov, num);
+	unsigned int head, out, in;
+	int len;
+	struct iovec iov[vq->vring.num];
+
+	/* Keep getting output buffers from the Guest until we run out. */
+	while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
+		if (in)
+			errx(1, "Input buffers in output queue?");
+		/* Check header, but otherwise ignore it (we said we supported
+		 * no features). */
+		(void)convert(&iov[0], struct virtio_net_hdr);
+		len = writev(vq->dev->fd, iov+1, out-1);
+		add_used_and_trigger(fd, vq, head, len);
+	}
 }
 
-/* This matches the peer_key() in lguest_net.c.  The key for any given slot
- * is the address of the network device's page plus 4 * the slot number. */
-static unsigned long peer_offset(unsigned int peernum)
-{
-	return 4 * peernum;
-}
-
-/* This is where we handle a packet coming in from the tun device */
+/* This is where we handle a packet coming in from the tun device to our
+ * Guest. */
 static bool handle_tun_input(int fd, struct device *dev)
 {
-	u32 irq = 0, *lenp;
+	unsigned int head, in_num, out_num;
 	int len;
-	unsigned num;
-	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+	struct iovec iov[dev->vq->vring.num];
+	struct virtio_net_hdr *hdr;
 
-	/* First we get a buffer the Guest has bound to its key. */
-	lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
-			      &irq);
-	if (!lenp) {
+	/* First we need a network buffer from the Guest's recv virtqueue. */
+	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+	if (head == dev->vq->vring.num) {
 		/* Now, it's expected that if we try to send a packet too
-		 * early, the Guest won't be ready yet.  This is why we set a
-		 * flag when the Guest sends its first packet.  If it's sent a
-		 * packet we assume it should be ready to receive them.
-		 *
-		 * Actually, this is what the status bits in the descriptor are
-		 * for: we should *use* them.  FIXME! */
-		if (*(bool *)dev->priv)
+		 * early, the Guest won't be ready yet.  Wait until the device
+		 * status says it's ready. */
+		/* FIXME: Actually want DRIVER_ACTIVE here. */
+		if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK)
 			warn("network: no dma buffer!");
-		discard_iovec(iov, &num);
-	}
+		/* We'll turn this back on if input buffers are registered. */
+		return false;
+	} else if (out_num)
+		errx(1, "Output buffers in network recv queue?");
+
+	/* First element is the header: we set it to 0 (no features). */
+	hdr = convert(&iov[0], struct virtio_net_hdr);
+	hdr->flags = 0;
+	hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
 
 	/* Read the packet from the device directly into the Guest's buffer. */
-	len = readv(dev->fd, iov, num);
+	len = readv(dev->fd, iov+1, in_num-1);
 	if (len <= 0)
 		err(1, "reading network");
 
-	/* Write the used_len, and trigger the interrupt for the Guest */
-	if (lenp) {
-		*lenp = len;
-		trigger_irq(fd, irq);
-	}
+	/* Tell the Guest about the new packet. */
+	add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len);
+
 	verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
-		((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
-		lenp ? "sent" : "discarded");
+		((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
+		head != dev->vq->vring.num ? "sent" : "discarded");
+
 	/* All good. */
 	return true;
 }
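The header element both network routines peel off is tiny; for reference, in
this era linux/virtio_net.h defines it roughly as follows (reproduced here for
orientation only; the header file is authoritative):

	struct virtio_net_hdr {
		__u8 flags;		/* e.g. VIRTIO_NET_HDR_F_NEEDS_CSUM */
		__u8 gso_type;		/* VIRTIO_NET_HDR_GSO_NONE here */
		__u16 hdr_len;
		__u16 gso_size;
		__u16 csum_start;
		__u16 csum_offset;
	};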
 
-/* The last device handling routine is block output: the Guest has sent a DMA
- * to the block device.  It will have placed the command it wants in the
- * "struct lguest_block_page". */
-static u32 handle_block_output(int fd, const struct iovec *iov,
-			       unsigned num, struct device *dev)
+/* This callback ensures we try again, in case we stopped console or net
+ * delivery because Guest didn't have any buffers. */
+static void enable_fd(int fd, struct virtqueue *vq)
 {
-	struct lguest_block_page *p = dev->mem;
-	u32 irq, *lenp;
-	unsigned int len, reply_num;
-	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
-	off64_t device_len, off = (off64_t)p->sector * 512;
-
-	/* First we extract the device length from the dev->priv pointer. */
-	device_len = *(off64_t *)dev->priv;
-
-	/* We first check that the read or write is within the length of the
-	 * block file. */
-	if (off >= device_len)
-		err(1, "Bad offset %llu vs %llu", off, device_len);
-	/* Move to the right location in the block file.  This shouldn't fail,
-	 * but best to check. */
-	if (lseek64(dev->fd, off, SEEK_SET) != off)
-		err(1, "Bad seek to sector %i", p->sector);
-
-	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
-
-	/* They were supposed to bind a reply buffer at key equal to the start
-	 * of the block device memory.  We need this to tell them when the
-	 * request is finished. */
-	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
-	if (!lenp)
-		err(1, "Block request didn't give us a dma buffer");
-
-	if (p->type) {
-		/* A write request.  The DMA they sent contained the data, so
-		 * write it out. */
-		len = writev(dev->fd, iov, num);
-		/* Grr... Now we know how long the "struct lguest_dma" they
-		 * sent was, we make sure they didn't try to write over the end
-		 * of the block file (possibly extending it). */
-		if (off + len > device_len) {
-			/* Trim it back to the correct length */
-			ftruncate64(dev->fd, device_len);
-			/* Die, bad Guest, die. */
-			errx(1, "Write past end %llu+%u", off, len);
-		}
-		/* The reply length is 0: we just send back an empty DMA to
-		 * interrupt them and tell them the write is finished. */
-		*lenp = 0;
-	} else {
-		/* A read request.  They sent an empty DMA to start the
-		 * request, and we put the read contents into the reply
-		 * buffer. */
-		len = readv(dev->fd, reply, reply_num);
-		*lenp = len;
-	}
-
-	/* The result is 1 (done), 2 if there was an error (short read or
-	 * write). */
-	p->result = 1 + (p->bytes != len);
-	/* Now tell them we've used their reply buffer. */
-	trigger_irq(fd, irq);
-
-	/* We're supposed to return the number of bytes of the output buffer we
-	 * used.  But the block device uses the "result" field instead, so we
-	 * don't bother. */
-	return 0;
+	add_device_fd(vq->dev->fd);
+	/* Tell waker to listen to it again */
+	write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
 }
 
-/* This is the generic routine we call when the Guest sends some DMA out. */
-static void handle_output(int fd, unsigned long dma, unsigned long key,
-			  struct device_list *devices)
+/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
+static void handle_output(int fd, unsigned long addr)
 {
 	struct device *i;
-	u32 *lenp;
-	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
-	unsigned num = 0;
+	struct virtqueue *vq;
 
-	/* Convert the "struct lguest_dma" they're sending to a "struct
-	 * iovec". */
-	lenp = dma2iov(dma, iov, &num);
-
-	/* Check each device: if they expect output to this key, tell them to
-	 * handle it. */
-	for (i = devices->dev; i; i = i->next) {
-		if (i->handle_output && key == i->watch_key) {
-			/* We write the result straight into the used_len field
-			 * for them. */
-			*lenp = i->handle_output(fd, iov, num, i);
-			return;
+	/* Check each virtqueue. */
+	for (i = devices.dev; i; i = i->next) {
+		for (vq = i->vq; vq; vq = vq->next) {
+			if (vq->config.pfn == addr/getpagesize()
+			    && vq->handle_output) {
+				verbose("Output to %s\n", vq->dev->name);
+				vq->handle_output(fd, vq);
+				return;
+			}
 		}
 	}
 
-	/* This can happen: the kernel sends any SEND_DMA which doesn't match
-	 * another Guest to us.  It could be that another Guest just left a
-	 * network, for example.  But it's unusual. */
-	warnx("Pending dma %p, key %p", (void *)dma, (void *)key);
+	/* Early console write is done using notify on a nul-terminated string
+	 * in Guest memory. */
+	if (addr >= guest_limit)
+		errx(1, "Bad NOTIFY %#lx", addr);
+
+	write(STDOUT_FILENO, from_guest_phys(addr),
+	      strnlen(from_guest_phys(addr), guest_limit - addr));
 }
 
 /* This is called when the waker wakes us up: check for incoming file
 * descriptors. */
-static void handle_input(int fd, struct device_list *devices)
+static void handle_input(int fd)
 {
 	/* select() wants a zeroed timeval to mean "don't wait". */
 	struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
 
 	for (;;) {
 		struct device *i;
-		fd_set fds = devices->infds;
+		fd_set fds = devices.infds;
 
 		/* If nothing is ready, we're done. */
-		if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
+		if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
 			break;
 
 		/* Otherwise, call the device(s) which have readable
 		 * file descriptors and a method of handling them. */
-		for (i = devices->dev; i; i = i->next) {
+		for (i = devices.dev; i; i = i->next) {
 			if (i->handle_input && FD_ISSET(i->fd, &fds)) {
+				int dev_fd;
+				if (i->handle_input(fd, i))
+					continue;
+
 				/* If handle_input() returns false, it means we
-				 * should no longer service it.
-				 * handle_console_input() does this. */
-				if (!i->handle_input(fd, i)) {
-					/* Clear it from the set of input file
-					 * descriptors kept at the head of the
-					 * device list. */
-					FD_CLR(i->fd, &devices->infds);
-					/* Tell waker to ignore it too... */
-					write(waker_fd, &i->fd, sizeof(i->fd));
-				}
+				 * should no longer service it.  Networking and
+				 * console do this when there's no input
+				 * buffers to deliver into.  Console also uses
+				 * it when it discovers that stdin is
+				 * closed. */
+				FD_CLR(i->fd, &devices.infds);
+				/* Tell waker to ignore it too, by sending a
+				 * negative fd number (-1, since 0 is a valid
+				 * FD number). */
+				dev_fd = -i->fd - 1;
+				write(waker_fd, &dev_fd, sizeof(dev_fd));
 			}
 		}
 	}
···
 * routines to allocate them.
 *
 * This routine allocates a new "struct lguest_device_desc" from the descriptor
- * table in the devices array just above the Guest's normal memory. */
-static struct lguest_device_desc *
-new_dev_desc(struct lguest_device_desc *descs,
-	     u16 type, u16 features, u16 num_pages)
+ * table just above the Guest's normal memory.  It returns a pointer to that
+ * descriptor. */
+static struct lguest_device_desc *new_dev_desc(u16 type)
 {
-	unsigned int i;
+	struct lguest_device_desc *d;
 
-	for (i = 0; i < LGUEST_MAX_DEVICES; i++) {
-		if (!descs[i].type) {
-			descs[i].type = type;
-			descs[i].features = features;
-			descs[i].num_pages = num_pages;
-			/* If they said the device needs memory, we allocate
-			 * that now, bumping up the top of Guest memory. */
-			if (num_pages) {
-				map_zeroed_pages(top, num_pages);
-				descs[i].pfn = top/getpagesize();
-				top += num_pages*getpagesize();
-			}
-			return &descs[i];
-		}
-	}
-	errx(1, "too many devices");
+	/* We only have one page for all the descriptors. */
+	if (devices.desc_used + sizeof(*d) > getpagesize())
+		errx(1, "Too many devices");
+
+	/* We don't need to set config_len or status: page is 0 already. */
+	d = (void *)devices.descpage + devices.desc_used;
+	d->type = type;
+	devices.desc_used += sizeof(*d);
+
+	return d;
 }
*/ 1020 + devices.descpage[devices.desc_used++] = type;
1021 + devices.descpage[devices.desc_used++] = len;
1022 + memcpy(devices.descpage + devices.desc_used, c, len);
1023 + devices.desc_used += len;
1024 +
1025 + /* Update the device descriptor length: two byte head then data. */
1026 + dev->desc->config_len += 2 + len;
1027 + }
1028 +
1029 + /* This routine adds a virtqueue to a device. We specify how many descriptors
1030 + * the virtqueue is to have. */
1031 + static void add_virtqueue(struct device *dev, unsigned int num_descs,
1032 + void (*handle_output)(int fd, struct virtqueue *me))
1033 + {
1034 + unsigned int pages;
1035 + struct virtqueue **i, *vq = malloc(sizeof(*vq));
1036 + void *p;
1037 +
1038 + /* First we need some pages for this virtqueue. */
1039 + pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize();
1040 + p = get_pages(pages);
1041 +
1042 + /* Initialize the configuration. */
1043 + vq->config.num = num_descs;
1044 + vq->config.irq = devices.next_irq++;
1045 + vq->config.pfn = to_guest_phys(p) / getpagesize();
1046 +
1047 + /* Initialize the vring. */
1048 + vring_init(&vq->vring, num_descs, p);
1049 +
1050 + /* Add the configuration information to this device's descriptor. */
1051 + add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE,
1052 + sizeof(vq->config), &vq->config);
1053 +
1054 + /* Add to tail of list, so dev->vq is first vq, dev->vq->next is
1055 + * second. */
1056 + for (i = &dev->vq; *i; i = &(*i)->next);
1057 + *i = vq;
1058 +
1059 + /* Link virtqueue back to device. */
1060 + vq->dev = dev;
1061 +
1062 + /* Set up handler. */
1063 + vq->handle_output = handle_output;
1064 + if (!handle_output)
1065 + vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1066 + }
1067 +
1068 + /* This routine does all the creation and setup of a new device, including
1069 + * calling new_dev_desc() to allocate the descriptor and device memory. */
1070 + static struct device *new_device(const char *name, u16 type, int fd,
1071 + bool (*handle_input)(int, struct device *))
1007 1072 {
1008 1073 struct device *dev = malloc(sizeof(*dev));
1009 1074
··· 1061 1026 * easier, but the user expects the devices to be arranged on the bus
1062 1027 * in command-line order. The first network device on the command line
1063 1028 * is eth0, the first block device /dev/lgba, etc. */
1064 - *devices->lastdev = dev;
1029 + *devices.lastdev = dev;
1065 1030 dev->next = NULL;
1066 - devices->lastdev = &dev->next;
1031 + devices.lastdev = &dev->next;
1067 1032
1068 1033 /* Now we populate the fields one at a time. */
1069 1034 dev->fd = fd;
1070 1035 /* If we have an input handler for this file descriptor, then we add it
1071 1036 * to the device_list's fdset and maxfd. */
1072 1037 if (handle_input)
1073 - set_fd(dev->fd, devices);
1074 - dev->desc = new_dev_desc(devices->descs, type, features, num_pages);
1075 - dev->mem = (void *)(dev->desc->pfn * getpagesize());
1038 + add_device_fd(dev->fd);
1039 + dev->desc = new_dev_desc(type);
1076 1040 dev->handle_input = handle_input;
1077 - dev->watch_key = (unsigned long)dev->mem + watch_off;
1078 - dev->handle_output = handle_output;
1041 + dev->name = name;
1079 1042 return dev;
1080 1043 }
1081 1044
1082 1045 /* Our first setup routine is the console. It's a fairly simple device, but
1083 1046 * UNIX tty handling makes it uglier than it could be. 
*/ 1084 - static void setup_console(struct device_list *devices)
1047 + static void setup_console(void)
1085 1048 {
1086 1049 struct device *dev;
1087 1050
··· 1095 1062 atexit(restore_term);
1096 1063 }
1097 1064
1098 - /* We don't currently require any memory for the console, so we ask for
1099 - * 0 pages. */
1100 - dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
1101 - STDIN_FILENO, handle_console_input,
1102 - LGUEST_CONSOLE_DMA_KEY, handle_console_output);
1065 + dev = new_device("console", VIRTIO_ID_CONSOLE,
1066 + STDIN_FILENO, handle_console_input);
1103 1067 /* We store the console state in dev->priv, and initialize it. */
1104 1068 dev->priv = malloc(sizeof(struct console_abort));
1105 1069 ((struct console_abort *)dev->priv)->count = 0;
1106 - verbose("device %p: console\n",
1107 - (void *)(dev->desc->pfn * getpagesize()));
1108 - }
1109 1070
1110 - /* Setting up a block file is also fairly straightforward. */
1111 - static void setup_block_file(const char *filename, struct device_list *devices)
1112 - {
1113 - int fd;
1114 - struct device *dev;
1115 - off64_t *device_len;
1116 - struct lguest_block_page *p;
1071 + /* The console needs two virtqueues: the input then the output. When
1072 + * they put something in the input queue, we make sure we're listening to
1073 + * stdin. When they put something in the output queue, we write it to
1074 + * stdout. */
1075 + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1076 + add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
1117 1077
1118 - /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We
1119 - * open with O_DIRECT because otherwise our benchmarks go much too
1120 - * fast. */
1121 - fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
1122 -
1123 - /* We want one page, and have no input handler (the block file never
1124 - * has anything interesting to say to us). Our timing will be quite
1125 - * random, so it should be a reasonable randomness source. */
1126 - dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
1127 - LGUEST_DEVICE_F_RANDOMNESS,
1128 - fd, NULL, 0, handle_block_output);
1129 -
1130 - /* We store the device size in the private area */
1131 - device_len = dev->priv = malloc(sizeof(*device_len));
1132 - /* This is the safe way of establishing the size of our device: it
1133 - * might be a normal file or an actual block device like /dev/hdb. */
1134 - *device_len = lseek64(fd, 0, SEEK_END);
1135 -
1136 - /* The device memory is a "struct lguest_block_page". It's zeroed
1137 - * already, we just need to put in the device size. Block devices
1138 - * think in sectors (ie. 512 byte chunks), so we translate here. */
1139 - p = dev->mem;
1140 - p->num_sectors = *device_len/512;
1141 - verbose("device %p: block %i sectors\n",
1142 - (void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
1143 - }
1144 -
1145 - /*
1146 - * Network Devices.
1147 - *
1148 - * Setting up network devices is quite a pain, because we have three types.
1149 - * First, we have the inter-Guest network. This is a file which is mapped into
1150 - * the address space of the Guests who are on the network. Because it is a
1151 - * shared mapping, the same page underlies all the devices, and they can send
1152 - * DMA to each other.
1153 - *
1154 - * Remember from our network driver, the Guest is told what slot in the page it
1155 - * is to use. We use exclusive fnctl locks to reserve a slot. If another
1156 - * Guest is using a slot, the lock will fail and we try another. 
Because fnctl
1157 - * locks are cleaned up automatically when we die, this cleverly means that our
1158 - * reservation on the slot will vanish if we crash. */
1159 - static unsigned int find_slot(int netfd, const char *filename)
1160 - {
1161 - struct flock fl;
1162 -
1163 - fl.l_type = F_WRLCK;
1164 - fl.l_whence = SEEK_SET;
1165 - fl.l_len = 1;
1166 - /* Try a 1 byte lock in each possible position number */
1167 - for (fl.l_start = 0;
1168 - fl.l_start < getpagesize()/sizeof(struct lguest_net);
1169 - fl.l_start++) {
1170 - /* If we succeed, return the slot number. */
1171 - if (fcntl(netfd, F_SETLK, &fl) == 0)
1172 - return fl.l_start;
1173 - }
1174 - errx(1, "No free slots in network file %s", filename);
1175 - }
1176 -
1177 - /* This function sets up the network file */
1178 - static void setup_net_file(const char *filename,
1179 - struct device_list *devices)
1180 - {
1181 - int netfd;
1182 - struct device *dev;
1183 -
1184 - /* We don't use open_or_die() here: for friendliness we create the file
1185 - * if it doesn't already exist. */
1186 - netfd = open(filename, O_RDWR, 0);
1187 - if (netfd < 0) {
1188 - if (errno == ENOENT) {
1189 - netfd = open(filename, O_RDWR|O_CREAT, 0600);
1190 - if (netfd >= 0) {
1191 - /* If we succeeded, initialize the file with a
1192 - * blank page. */
1193 - char page[getpagesize()];
1194 - memset(page, 0, sizeof(page));
1195 - write(netfd, page, sizeof(page));
1196 - }
1197 - }
1198 - if (netfd < 0)
1199 - err(1, "cannot open net file '%s'", filename);
1200 - }
1201 -
1202 - /* We need 1 page, and the features indicate the slot to use and that
1203 - * no checksum is needed. We never touch this device again; it's
1204 - * between the Guests on the network, so we don't register input or
1205 - * output handlers. */
1206 - dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
1207 - find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
1208 - -1, NULL, 0, NULL);
1209 -
1210 - /* Map the shared file. */
1211 - if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
1212 - MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
1213 - err(1, "could not mmap '%s'", filename);
1214 - verbose("device %p: shared net %s, peer %i\n",
1215 - (void *)(dev->desc->pfn * getpagesize()), filename,
1216 - dev->desc->features & ~LGUEST_NET_F_NOCSUM);
1078 + verbose("device %u: console\n", devices.device_num++);
1217 1079 }
1218 1080 /*:*/
1081 +
1082 + /*M:010 Inter-guest networking is an interesting area. Simplest is to have a
1083 + * --sharenet=<name> option which opens or creates a named pipe. This can be
1084 + * used to send packets to another guest in a 1:1 manner.
1085 + *
1086 + * More sophisticated is to use one of the tools developed for projects like UML
1087 + * to do networking.
1088 + *
1089 + * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
1090 + * completely generic ("here's my vring, attach to your vring") and would work
1091 + * for any traffic. Of course, namespace and permissions issues need to be
1092 + * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
1093 + * multiple inter-guest channels behind one interface, although it would
1094 + * require some manner of hotplugging new virtio channels.
1095 + *
1096 + * Finally, we could implement a virtio network switch in the kernel. 
:*/ 1219 1097
1220 1098 static u32 str2ip(const char *ipaddr)
1221 1099 {
··· 1161 1217
1162 1218 /* This sets up the Host end of the network device with an IP address, brings
1163 1219 * it up so packets will flow, then copies the MAC address into the hwaddr
1164 - * pointer (in practice, the Host's slot in the network device's memory). */
1220 + * pointer. */
1165 1221 static void configure_device(int fd, const char *devname, u32 ipaddr,
1166 1222 unsigned char hwaddr[6])
1167 1223 {
··· 1187 1243 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
1188 1244 }
1189 1245
1190 - /*L:195 The other kind of network is a Host<->Guest network. This can either
1191 - * use briding or routing, but the principle is the same: it uses the "tun"
1192 - * device to inject packets into the Host as if they came in from a normal
1193 - * network card. We just shunt packets between the Guest and the tun
1194 - * device. */
1195 - static void setup_tun_net(const char *arg, struct device_list *devices)
1246 + /*L:195 Our network is a Host<->Guest network. This can either use bridging or
1247 + * routing, but the principle is the same: it uses the "tun" device to inject
1248 + * packets into the Host as if they came in from a normal network card. We
1249 + * just shunt packets between the Guest and the tun device. */
1250 + static void setup_tun_net(const char *arg)
1196 1251 {
1197 1252 struct device *dev;
1198 1253 struct ifreq ifr;
1199 1254 int netfd, ipfd;
1200 1255 u32 ip;
1201 1256 const char *br_name = NULL;
1257 + u8 hwaddr[6];
1202 1258
1203 1259 /* We open the /dev/net/tun device and tell it we want a tap device. A
1204 1260 * tap device is like a tun device, only somehow different. To tell
··· 1214 1270 * device: trust us! */
1215 1271 ioctl(netfd, TUNSETNOCSUM, 1);
1216 1272
1217 - /* We create the net device with 1 page, using the features field of
1218 - * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and
1219 - * that the device has fairly random timing. We do *not* specify
1220 - * LGUEST_NET_F_NOCSUM: these packets can reach the real world.
1221 - *
1222 - * We will put our MAC address is slot 0 for the Guest to see, so
1223 - * it will send packets to us using the key "peer_offset(0)": */
1224 - dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
1225 - NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
1226 - handle_tun_input, peer_offset(0), handle_tun_output);
1273 + /* First we create a new network device. */
1274 + dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
1227 1275
1228 - /* We keep a flag which says whether we've seen packets come out from
1229 - * this network device. */
1230 - dev->priv = malloc(sizeof(bool));
1231 - *(bool *)dev->priv = false;
1276 + /* Network devices need a receive and a send queue, just like
1277 + * console. */
1278 + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1279 + add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
1232 1280
1233 1281 /* We need a socket to perform the magic network ioctls to bring up the
1234 1282 * tap interface, connect to the bridge etc. Any socket will do! */
··· 1236 1300 } else /* It is an IP address to set up the device with */
1237 1301 ip = str2ip(arg);
1238 1302
1239 - /* We are peer 0, ie. first slot, so we hand dev->mem to this routine
1240 - * to write the MAC address at the start of the device memory. */
1241 - configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
1303 + /* Set up the tun device, and get the MAC address for the interface. 
*/ 1304 + configure_device(ipfd, ifr.ifr_name, ip, hwaddr);
1242 1305
1243 - /* Set "promisc" bit: we want every single packet if we're going to
1244 - * bridge to other machines (and otherwise it doesn't matter). */
1245 - *((u8 *)dev->mem) |= 0x1;
1306 + /* Tell Guest what MAC address to use. */
1307 + add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr);
1246 1308
1309 + /* We don't need the socket any more; setup is done. */
1247 1310 close(ipfd);
1248 1311
1249 - verbose("device %p: tun net %u.%u.%u.%u\n",
1250 - (void *)(dev->desc->pfn * getpagesize()),
1251 - (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip);
1312 + verbose("device %u: tun net %u.%u.%u.%u\n",
1313 + devices.device_num++,
1314 + (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip);
1252 1315 if (br_name)
1253 1316 verbose("attached to bridge: %s\n", br_name);
1317 + }
1318 +
1319 +
1320 + /*
1321 + * Block device.
1322 + *
1323 + * Serving a block device is really easy: the Guest asks for a block number and
1324 + * we read or write that position in the file.
1325 + *
1326 + * Unfortunately, this is amazingly slow: the Guest waits until the read is
1327 + * finished before running anything else, even if it could be doing useful
1328 + * work. We could use async I/O, except it's reputed to suck so hard that
1329 + * characters actually go missing from your code when you try to use it.
1330 + *
1331 + * So we farm the I/O out to a thread, and communicate with it via a pipe. */
1332 +
1333 + /* This hangs off device->priv, with the data. */
1334 + struct vblk_info
1335 + {
1336 + /* The size of the file. */
1337 + off64_t len;
1338 +
1339 + /* The file descriptor for the file. */
1340 + int fd;
1341 +
1342 + /* IO thread listens on this file descriptor [0]. */
1343 + int workpipe[2];
1344 +
1345 + /* IO thread writes to this file descriptor to mark it done, then
1346 + * Launcher triggers interrupt to Guest. */
1347 + int done_fd;
1348 + };
1349 +
1350 + /* This is the core of the I/O thread. It returns true if it did something. */
1351 + static bool service_io(struct device *dev)
1352 + {
1353 + struct vblk_info *vblk = dev->priv;
1354 + unsigned int head, out_num, in_num, wlen;
1355 + int ret;
1356 + struct virtio_blk_inhdr *in;
1357 + struct virtio_blk_outhdr *out;
1358 + struct iovec iov[dev->vq->vring.num];
1359 + off64_t off;
1360 +
1361 + head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
1362 + if (head == dev->vq->vring.num)
1363 + return false;
1364 +
1365 + if (out_num == 0 || in_num == 0)
1366 + errx(1, "Bad virtblk cmd %u out=%u in=%u",
1367 + head, out_num, in_num);
1368 +
1369 + out = convert(&iov[0], struct virtio_blk_outhdr);
1370 + in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
1371 + off = out->sector * 512;
1372 +
1373 + /* This is how we implement barriers. Pretty poor, no? */
1374 + if (out->type & VIRTIO_BLK_T_BARRIER)
1375 + fdatasync(vblk->fd);
1376 +
1377 + if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
1378 + fprintf(stderr, "Scsi commands unsupported\n");
1379 + in->status = VIRTIO_BLK_S_UNSUPP;
1380 + wlen = sizeof(in);
1381 + } else if (out->type & VIRTIO_BLK_T_OUT) {
1382 + /* Write */
1383 +
1384 + /* Move to the right location in the block file. This can fail
1385 + * if they try to write past end. */
1386 + if (lseek64(vblk->fd, off, SEEK_SET) != off)
1387 + err(1, "Bad seek to sector %llu", out->sector);
1388 +
1389 + ret = writev(vblk->fd, iov+1, out_num-1);
1390 + verbose("WRITE to sector %llu: %i\n", out->sector, ret);
1391 +
1392 + /* Grr... 
Now we know how long the descriptor they sent was, we 1393 + * make sure they didn't try to write over the end of the block 1394 + * file (possibly extending it). */ 1395 + if (ret > 0 && off + ret > vblk->len) { 1396 + /* Trim it back to the correct length */ 1397 + ftruncate64(vblk->fd, vblk->len); 1398 + /* Die, bad Guest, die. */ 1399 + errx(1, "Write past end %llu+%u", off, ret); 1400 + } 1401 + wlen = sizeof(in); 1402 + in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 1403 + } else { 1404 + /* Read */ 1405 + 1406 + /* Move to the right location in the block file. This can fail 1407 + * if they try to read past end. */ 1408 + if (lseek64(vblk->fd, off, SEEK_SET) != off) 1409 + err(1, "Bad seek to sector %llu", out->sector); 1410 + 1411 + ret = readv(vblk->fd, iov+1, in_num-1); 1412 + verbose("READ from sector %llu: %i\n", out->sector, ret); 1413 + if (ret >= 0) { 1414 + wlen = sizeof(in) + ret; 1415 + in->status = VIRTIO_BLK_S_OK; 1416 + } else { 1417 + wlen = sizeof(in); 1418 + in->status = VIRTIO_BLK_S_IOERR; 1419 + } 1420 + } 1421 + 1422 + /* We can't trigger an IRQ, because we're not the Launcher. It does 1423 + * that when we tell it we're done. */ 1424 + add_used(dev->vq, head, wlen); 1425 + return true; 1426 + } 1427 + 1428 + /* This is the thread which actually services the I/O. */ 1429 + static int io_thread(void *_dev) 1430 + { 1431 + struct device *dev = _dev; 1432 + struct vblk_info *vblk = dev->priv; 1433 + char c; 1434 + 1435 + /* Close other side of workpipe so we get 0 read when main dies. */ 1436 + close(vblk->workpipe[1]); 1437 + /* Close the other side of the done_fd pipe. */ 1438 + close(dev->fd); 1439 + 1440 + /* When this read fails, it means Launcher died, so we follow. */ 1441 + while (read(vblk->workpipe[0], &c, 1) == 1) { 1442 + /* We acknowledge each request immediately, to reduce latency, 1443 + * rather than waiting until we've done them all. I haven't 1444 + * measured to see if it makes any difference. */ 1445 + while (service_io(dev)) 1446 + write(vblk->done_fd, &c, 1); 1447 + } 1448 + return 0; 1449 + } 1450 + 1451 + /* When the thread says some I/O is done, we interrupt the Guest. */ 1452 + static bool handle_io_finish(int fd, struct device *dev) 1453 + { 1454 + char c; 1455 + 1456 + /* If child died, presumably it printed message. */ 1457 + if (read(dev->fd, &c, 1) != 1) 1458 + exit(1); 1459 + 1460 + /* It did some work, so trigger the irq. */ 1461 + trigger_irq(fd, dev->vq); 1462 + return true; 1463 + } 1464 + 1465 + /* When the Guest submits some I/O, we wake the I/O thread. */ 1466 + static void handle_virtblk_output(int fd, struct virtqueue *vq) 1467 + { 1468 + struct vblk_info *vblk = vq->dev->priv; 1469 + char c = 0; 1470 + 1471 + /* Wake up I/O thread and tell it to go to work! */ 1472 + if (write(vblk->workpipe[1], &c, 1) != 1) 1473 + /* Presumably it indicated why it died. */ 1474 + exit(1); 1475 + } 1476 + 1477 + /* This creates a virtual block device. */ 1478 + static void setup_block_file(const char *filename) 1479 + { 1480 + int p[2]; 1481 + struct device *dev; 1482 + struct vblk_info *vblk; 1483 + void *stack; 1484 + u64 cap; 1485 + unsigned int val; 1486 + 1487 + /* This is the pipe the I/O thread will use to tell us I/O is done. */ 1488 + pipe(p); 1489 + 1490 + /* The device responds to return from I/O thread. */ 1491 + dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); 1492 + 1493 + /* The device has a virtqueue. 
*/ 1494 + add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); 1495 + 1496 + /* Allocate the room for our own bookkeeping */ 1497 + vblk = dev->priv = malloc(sizeof(*vblk)); 1498 + 1499 + /* First we open the file and store the length. */ 1500 + vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); 1501 + vblk->len = lseek64(vblk->fd, 0, SEEK_END); 1502 + 1503 + /* Tell Guest how many sectors this device has. */ 1504 + cap = cpu_to_le64(vblk->len / 512); 1505 + add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap); 1506 + 1507 + /* Tell Guest not to put in too many descriptors at once: two are used 1508 + * for the in and out elements. */ 1509 + val = cpu_to_le32(VIRTQUEUE_NUM - 2); 1510 + add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val); 1511 + 1512 + /* The I/O thread writes to this end of the pipe when done. */ 1513 + vblk->done_fd = p[1]; 1514 + 1515 + /* This is how we tell the I/O thread about more work. */ 1516 + pipe(vblk->workpipe); 1517 + 1518 + /* Create stack for thread and run it */ 1519 + stack = malloc(32768); 1520 + if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) 1521 + err(1, "Creating clone"); 1522 + 1523 + /* We don't need to keep the I/O thread's end of the pipes open. */ 1524 + close(vblk->done_fd); 1525 + close(vblk->workpipe[0]); 1526 + 1527 + verbose("device %u: virtblock %llu sectors\n", 1528 + devices.device_num, cap); 1254 1529 } 1255 1530 /* That's the end of device setup. */ 1256 1531 1257 1532 /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves 1258 1533 * its input and output, and finally, lays it to rest. */ 1259 - static void __attribute__((noreturn)) 1260 - run_guest(int lguest_fd, struct device_list *device_list) 1534 + static void __attribute__((noreturn)) run_guest(int lguest_fd) 1261 1535 { 1262 1536 for (;;) { 1263 - u32 args[] = { LHREQ_BREAK, 0 }; 1264 - unsigned long arr[2]; 1537 + unsigned long args[] = { LHREQ_BREAK, 0 }; 1538 + unsigned long notify_addr; 1265 1539 int readval; 1266 1540 1267 1541 /* We read from the /dev/lguest device to run the Guest. */ 1268 - readval = read(lguest_fd, arr, sizeof(arr)); 1542 + readval = read(lguest_fd, &notify_addr, sizeof(notify_addr)); 1269 1543 1270 - /* The read can only really return sizeof(arr) (the Guest did a 1271 - * SEND_DMA to us), or an error. */ 1272 - 1273 - /* For a successful read, arr[0] is the address of the "struct 1274 - * lguest_dma", and arr[1] is the key the Guest sent to. */ 1275 - if (readval == sizeof(arr)) { 1276 - handle_output(lguest_fd, arr[0], arr[1], device_list); 1544 + /* One unsigned long means the Guest did HCALL_NOTIFY */ 1545 + if (readval == sizeof(notify_addr)) { 1546 + verbose("Notify on address %#lx\n", notify_addr); 1547 + handle_output(lguest_fd, notify_addr); 1277 1548 continue; 1278 1549 /* ENOENT means the Guest died. Reading tells us why. */ 1279 1550 } else if (errno == ENOENT) { ··· 1494 1351 1495 1352 /* Service input, then unset the BREAK which releases 1496 1353 * the Waker. 
*/ 1497 - handle_input(lguest_fd, device_list); 1354 + handle_input(lguest_fd); 1498 1355 if (write(lguest_fd, args, sizeof(args)) < 0) 1499 1356 err(1, "Resetting break"); 1500 1357 } ··· 1508 1365 1509 1366 static struct option opts[] = { 1510 1367 { "verbose", 0, NULL, 'v' }, 1511 - { "sharenet", 1, NULL, 's' }, 1512 1368 { "tunnet", 1, NULL, 't' }, 1513 1369 { "block", 1, NULL, 'b' }, 1514 1370 { "initrd", 1, NULL, 'i' }, ··· 1516 1374 static void usage(void) 1517 1375 { 1518 1376 errx(1, "Usage: lguest [--verbose] " 1519 - "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n" 1377 + "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n" 1520 1378 "|--block=<filename>|--initrd=<filename>]...\n" 1521 1379 "<mem-in-mb> vmlinux [args...]"); 1522 1380 } 1523 1381 1524 - /*L:100 The Launcher code itself takes us out into userspace, that scary place 1525 - * where pointers run wild and free! Unfortunately, like most userspace 1526 - * programs, it's quite boring (which is why everyone like to hack on the 1527 - * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it 1528 - * will get you through this section. Or, maybe not. 1529 - * 1530 - * The Launcher binary sits up high, usually starting at address 0xB8000000. 1531 - * Everything below this is the "physical" memory for the Guest. For example, 1532 - * if the Guest were to write a "1" at physical address 0, we would see a "1" 1533 - * in the Launcher at "(int *)0". Guest physical == Launcher virtual. 1534 - * 1535 - * This can be tough to get your head around, but usually it just means that we 1536 - * don't need to do any conversion when the Guest gives us it's "physical" 1537 - * addresses. 1538 - */ 1382 + /*L:105 The main routine is where the real work begins: */ 1539 1383 int main(int argc, char *argv[]) 1540 1384 { 1541 - /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size 1542 - * of the (optional) initrd. */ 1543 - unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; 1385 + /* Memory, top-level pagetable, code startpoint and size of the 1386 + * (optional) initrd. */ 1387 + unsigned long mem = 0, pgdir, start, initrd_size = 0; 1544 1388 /* A temporary and the /dev/lguest file descriptor. */ 1545 1389 int i, c, lguest_fd; 1546 - /* The list of Guest devices, based on command line arguments. */ 1547 - struct device_list device_list; 1548 - /* The boot information for the Guest: at guest-physical address 0. */ 1549 - void *boot = (void *)0; 1390 + /* The boot information for the Guest. */ 1391 + struct boot_params *boot; 1550 1392 /* If they specify an initrd file to load. */ 1551 1393 const char *initrd_name = NULL; 1552 1394 ··· 1538 1412 * device receive input from a file descriptor, we keep an fdset 1539 1413 * (infds) and the maximum fd number (max_infd) with the head of the 1540 1414 * list. We also keep a pointer to the last device, for easy appending 1541 - * to the list. */ 1542 - device_list.max_infd = -1; 1543 - device_list.dev = NULL; 1544 - device_list.lastdev = &device_list.dev; 1545 - FD_ZERO(&device_list.infds); 1415 + * to the list. Finally, we keep the next interrupt number to hand out 1416 + * (1: remember that 0 is used by the timer). */ 1417 + FD_ZERO(&devices.infds); 1418 + devices.max_infd = -1; 1419 + devices.lastdev = &devices.dev; 1420 + devices.next_irq = 1; 1546 1421 1547 1422 /* We need to know how much memory so we can set up the device 1548 1423 * descriptor and memory pages for the devices as we parse the command ··· 1551 1424 * of memory now. 
*/ 1552 1425 for (i = 1; i < argc; i++) { 1553 1426 if (argv[i][0] != '-') { 1554 - mem = top = atoi(argv[i]) * 1024 * 1024; 1555 - device_list.descs = map_zeroed_pages(top, 1); 1556 - top += getpagesize(); 1427 + mem = atoi(argv[i]) * 1024 * 1024; 1428 + /* We start by mapping anonymous pages over all of 1429 + * guest-physical memory range. This fills it with 0, 1430 + * and ensures that the Guest won't be killed when it 1431 + * tries to access it. */ 1432 + guest_base = map_zeroed_pages(mem / getpagesize() 1433 + + DEVICE_PAGES); 1434 + guest_limit = mem; 1435 + guest_max = mem + DEVICE_PAGES*getpagesize(); 1436 + devices.descpage = get_pages(1); 1557 1437 break; 1558 1438 } 1559 1439 } ··· 1571 1437 case 'v': 1572 1438 verbose = true; 1573 1439 break; 1574 - case 's': 1575 - setup_net_file(optarg, &device_list); 1576 - break; 1577 1440 case 't': 1578 - setup_tun_net(optarg, &device_list); 1441 + setup_tun_net(optarg); 1579 1442 break; 1580 1443 case 'b': 1581 - setup_block_file(optarg, &device_list); 1444 + setup_block_file(optarg); 1582 1445 break; 1583 1446 case 'i': 1584 1447 initrd_name = optarg; ··· 1590 1459 if (optind + 2 > argc) 1591 1460 usage(); 1592 1461 1593 - /* We always have a console device */ 1594 - setup_console(&device_list); 1462 + verbose("Guest base is at %p\n", guest_base); 1595 1463 1596 - /* We start by mapping anonymous pages over all of guest-physical 1597 - * memory range. This fills it with 0, and ensures that the Guest 1598 - * won't be killed when it tries to access it. */ 1599 - map_zeroed_pages(0, mem / getpagesize()); 1464 + /* We always have a console device */ 1465 + setup_console(); 1600 1466 1601 1467 /* Now we load the kernel */ 1602 - start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), 1603 - &page_offset); 1468 + start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); 1469 + 1470 + /* Boot information is stashed at physical address 0 */ 1471 + boot = from_guest_phys(0); 1604 1472 1605 1473 /* Map the initrd image if requested (at top of physical memory) */ 1606 1474 if (initrd_name) { 1607 1475 initrd_size = load_initrd(initrd_name, mem); 1608 1476 /* These are the location in the Linux boot header where the 1609 1477 * start and size of the initrd are expected to be found. */ 1610 - *(unsigned long *)(boot+0x218) = mem - initrd_size; 1611 - *(unsigned long *)(boot+0x21c) = initrd_size; 1478 + boot->hdr.ramdisk_image = mem - initrd_size; 1479 + boot->hdr.ramdisk_size = initrd_size; 1612 1480 /* The bootloader type 0xFF means "unknown"; that's OK. */ 1613 - *(unsigned char *)(boot+0x210) = 0xFF; 1481 + boot->hdr.type_of_loader = 0xFF; 1614 1482 } 1615 1483 1616 1484 /* Set up the initial linear pagetables, starting below the initrd. */ 1617 - pgdir = setup_pagetables(mem, initrd_size, page_offset); 1485 + pgdir = setup_pagetables(mem, initrd_size); 1618 1486 1619 1487 /* The Linux boot header contains an "E820" memory map: ours is a 1620 1488 * simple, single region. */ 1621 - *(char*)(boot+E820NR) = 1; 1622 - *((struct e820entry *)(boot+E820MAP)) 1623 - = ((struct e820entry) { 0, mem, E820_RAM }); 1489 + boot->e820_entries = 1; 1490 + boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); 1624 1491 /* The boot header contains a command line pointer: we put the command 1625 - * line after the boot header (at address 4096) */ 1626 - *(void **)(boot + 0x228) = boot + 4096; 1627 - concat(boot + 4096, argv+optind+2); 1492 + * line after the boot header. 
*/ 1493 + boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 1494 + concat((char *)(boot + 1), argv+optind+2); 1628 1495 1629 - /* The guest type value of "1" tells the Guest it's under lguest. */ 1630 - *(int *)(boot + 0x23c) = 1; 1496 + /* Boot protocol version: 2.07 supports the fields for lguest. */ 1497 + boot->hdr.version = 0x207; 1498 + 1499 + /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ 1500 + boot->hdr.hardware_subarch = 1; 1501 + 1502 + /* Tell the entry path not to try to reload segment registers. */ 1503 + boot->hdr.loadflags |= KEEP_SEGMENTS; 1631 1504 1632 1505 /* We tell the kernel to initialize the Guest: this returns the open 1633 1506 * /dev/lguest file descriptor. */ 1634 - lguest_fd = tell_kernel(pgdir, start, page_offset); 1507 + lguest_fd = tell_kernel(pgdir, start); 1635 1508 1636 1509 /* We fork off a child process, which wakes the Launcher whenever one 1637 1510 * of the input file descriptors needs attention. Otherwise we would 1638 1511 * run the Guest until it tries to output something. */ 1639 - waker_fd = setup_waker(lguest_fd, &device_list); 1512 + waker_fd = setup_waker(lguest_fd); 1640 1513 1641 1514 /* Finally, run the Guest. This doesn't return. */ 1642 - run_guest(lguest_fd, &device_list); 1515 + run_guest(lguest_fd); 1643 1516 } 1644 1517 /*:*/ 1645 1518
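A quick aside before we leave the Launcher: the device page that new_dev_desc() and add_desc_field() build up is just a packed sequence of descriptors, each followed by its [type, len, data] config fields. To make that concrete, here is a small stand-alone sketch of a walker for that layout. The struct follows the comments above (and the lguest_launcher.h in this merge), but the walker itself is illustrative, not code from the merge:

#include <stdint.h>
#include <stdio.h>

/* One descriptor, as described above: a type, the number of config
 * bytes which follow, and a status byte for the Guest to write. */
struct lguest_device_desc {
	uint16_t type;
	uint8_t config_len;
	uint8_t status;
	uint8_t config[];
};

/* Walk the single page of descriptors the Launcher built. */
static void walk_descriptors(const uint8_t *page, unsigned int used)
{
	unsigned int off = 0, c;

	while (off < used) {
		const struct lguest_device_desc *d
			= (const void *)(page + off);

		printf("device type %u, %u config bytes\n",
		       (unsigned)d->type, (unsigned)d->config_len);
		/* Each config field is a two-byte header then the data,
		 * exactly as add_desc_field() lays it down. */
		for (c = 0; c < d->config_len; c += 2 + d->config[c + 1])
			printf("  field type %u, len %u\n",
			       (unsigned)d->config[c],
			       (unsigned)d->config[c + 1]);
		off += sizeof(*d) + d->config_len;
	}
}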
+30 -40
Documentation/lguest/lguest.txt
··· 6 6 Linux developers and users to experiment with virtualization with the
7 7 minimum of complexity. Nonetheless, it should have sufficient
8 8 features to make it useful for specific tasks, and, of course, you are
9 - encouraged to fork and enhance it.
9 + encouraged to fork and enhance it (see drivers/lguest/README).
10 10
11 11 Features:
12 12
··· 23 23
24 24 Running Lguest:
25 25
26 - - Lguest runs the same kernel as guest and host. You can configure
27 - them differently, but usually it's easiest not to.
26 + - The easiest way to run lguest is to use the same kernel as guest and host.
27 + You can configure them differently, but usually it's easiest not to.
28 28
29 29 You will need to configure your kernel with the following options:
30 30
31 - CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1]
32 - CONFIG_TUN=y/m ("Universal TUN/TAP device driver support")
33 - CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers")
34 - CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)")
35 - CONFIG_LGUEST=y/m ("Linux hypervisor example code")
31 + "General setup":
32 + "Prompt for development and/or incomplete code/drivers" = Y
33 + (CONFIG_EXPERIMENTAL=y)
36 34
37 - and I recommend:
38 - CONFIG_HZ=100 ("Timer frequency")[2]
35 + "Processor type and features":
36 + "Paravirtualized guest support" = Y
37 + "Lguest guest support" = Y
38 + "High Memory Support" = off/4GB
39 + "Alignment value to which kernel should be aligned" = 0x100000
40 + (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
41 + CONFIG_PHYSICAL_ALIGN=0x100000)
42 +
43 + "Device Drivers":
44 + "Network device support"
45 + "Universal TUN/TAP device driver support" = M/Y
46 + (CONFIG_TUN=m)
47 + "Virtualization"
48 + "Linux hypervisor example code" = M/Y
49 + (CONFIG_LGUEST=m)
39 50
40 51 - A tool called "lguest" is available in this directory: type "make"
41 52 to build it. If you didn't build your kernel in-tree, use "make
··· 62 51 dd if=/dev/zero of=rootfile bs=1M count=2048
63 52 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
64 53
54 + Make sure that you install a getty on /dev/hvc0 if you want to log in on the
55 + console!
56 +
65 57 - "modprobe lg" if you built it as a module.
66 58
67 59 - Run an lguest as root:
68 60
69 - Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba
61 + Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda
70 62
71 63 Explanation:
72 - 64m: the amount of memory to use.
64 + 64: the amount of memory to use, in MB.
73 65
74 66 vmlinux: the kernel image found in the top of your build directory. You
75 67 can also use a standard bzImage.
··· 80 66 --tunnet=192.168.19.1: configures a "tap" device for networking with this
81 67 IP address.
82 68
83 - --block=rootfile: a file or block device which becomes /dev/lgba
69 + --block=rootfile: a file or block device which becomes /dev/vda
84 70 inside the guest.
85 71
86 - root=/dev/lgba: this (and anything else on the command line) are
72 + root=/dev/vda: this (and anything else on the command line) are
87 73 kernel boot parameters.
88 74
89 75 - Configuring networking. I usually have the host masquerade, using
··· 113 99 "--sharenet=<filename>": any two guests using the same file are on
114 100 the same network. This file is created if it does not exist. 
115 101 116 - Lguest I/O model: 102 + There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest 117 103 118 - Lguest uses a simplified DMA model plus shared memory for I/O. Guests 119 - can communicate with each other if they share underlying memory 120 - (usually by the lguest program mmaping the same file), but they can 121 - use any non-shared memory to communicate with the lguest process. 122 - 123 - Guests can register DMA buffers at any key (must be a valid physical 124 - address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq) 125 - hypercall. "dmabufs" is the physical address of an array of "num" 126 - "struct lguest_dma": each contains a used_len, and an array of 127 - physical addresses and lengths. When a transfer occurs, the 128 - "used_len" field of one of the buffers which has used_len 0 will be 129 - set to the length transferred and the irq will fire. 130 - 131 - Using an irq value of 0 unbinds the dma buffers. 132 - 133 - To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used, 134 - and the bytes used is written to the used_len field. This can be 0 if 135 - noone else has bound a DMA buffer to that key or some other error. 136 - DMA buffers bound by the same guest are ignored. 137 - 138 - Cheers! 104 + Good luck! 139 105 Rusty Russell rusty@rustcorp.com.au. 140 - 141 - [1] These are on various places on the TODO list, waiting for you to 142 - get annoyed enough at the limitation to fix it. 143 - [2] Lguest is not yet tickless when idle. See [1].
+22 -10
arch/i386/Kconfig
··· 227 227 If in doubt, say "Y". 228 228 229 229 config PARAVIRT 230 - bool "Paravirtualization support (EXPERIMENTAL)" 231 - depends on EXPERIMENTAL 230 + bool 232 231 depends on !(X86_VISWS || X86_VOYAGER) 233 232 help 234 - Paravirtualization is a way of running multiple instances of 235 - Linux on the same machine, under a hypervisor. This option 236 - changes the kernel so it can modify itself when it is run 237 - under a hypervisor, improving performance significantly. 238 - However, when run without a hypervisor the kernel is 239 - theoretically slower. If in doubt, say N. 233 + This changes the kernel so it can modify itself when it is run 234 + under a hypervisor, potentially improving performance significantly 235 + over full virtualization. However, when run without a hypervisor 236 + the kernel is theoretically slower and slightly larger. 237 + 238 + menuconfig PARAVIRT_GUEST 239 + bool "Paravirtualized guest support" 240 + help 241 + Say Y here to get to see options related to running Linux under 242 + various hypervisors. This option alone does not add any kernel code. 243 + 244 + If you say N, all options in this submenu will be skipped and disabled. 245 + 246 + if PARAVIRT_GUEST 240 247 241 248 source "arch/x86/xen/Kconfig" 242 249 243 250 config VMI 244 - bool "VMI Paravirt-ops support" 245 - depends on PARAVIRT 251 + bool "VMI Guest support" 252 + select PARAVIRT 253 + depends on !(X86_VISWS || X86_VOYAGER) 246 254 help 247 255 VMI provides a paravirtualized interface to the VMware ESX server 248 256 (it could be used by other hypervisors in theory too, but is not 249 257 at the moment), by linking the kernel to a GPL-ed ROM module 250 258 provided by the hypervisor. 259 + 260 + source "arch/x86/lguest/Kconfig" 261 + 262 + endif 251 263 252 264 config ACPI_SRAT 253 265 bool
+3
arch/i386/Makefile
··· 99 99 # Xen paravirtualization support 100 100 core-$(CONFIG_XEN) += arch/x86/xen/ 101 101 102 + # lguest paravirtualization support 103 + core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ 104 + 102 105 # default subarch .h files 103 106 mflags-y += -Iinclude/asm-x86/mach-default 104 107
+1
arch/x86/kernel/asm-offsets_32.c
··· 136 136 #ifdef CONFIG_LGUEST_GUEST 137 137 BLANK(); 138 138 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 139 + OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); 139 140 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 140 141 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); 141 142 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
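The LGUEST_DATA_pgdir line added above works like every other entry in asm-offsets_32.c: the OFFSET() macro makes the compiler emit the structure offset as a magic "->" marker in its assembly output, which the build then turns into asm-offsets.h so that i386_head.S can use the constant. A sketch of the mechanism; the macro bodies here are reconstructed from memory of the kernel's helpers, so treat them as approximate:

#include <stddef.h>

/* The "->NAME value" lines land in the compiler's .s output and are
 * scraped into asm-offsets.h by the build (approximation). */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))

/* Hypothetical stand-in for the real struct lguest_data. */
struct lguest_data_example {
	int irq_enabled;
	unsigned long pgdir;
};

static void make_offsets(void)
{
	/* Emits "->LGUEST_DATA_pgdir <offset>" into the assembly output. */
	OFFSET(LGUEST_DATA_pgdir, lguest_data_example, pgdir);
}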
+14
arch/x86/lguest/Kconfig
··· 1 + config LGUEST_GUEST 2 + bool "Lguest guest support" 3 + select PARAVIRT 4 + depends on !X86_PAE 5 + select VIRTIO 6 + select VIRTIO_RING 7 + select VIRTIO_CONSOLE 8 + help 9 + Lguest is a tiny in-kernel hypervisor. Selecting this will 10 + allow your kernel to boot under lguest. This option will increase 11 + your kernel size by about 6k. If in doubt, say N. 12 + 13 + If you say Y here, make sure you say Y (or M) to the virtio block 14 + and net drivers which lguest needs.
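This option is the Guest half of the hardware_subarch handshake: the Launcher above stores 1 in boot->hdr.hardware_subarch, and the head.S hook from this merge ("Revert lguest magic and use hook in head.S") jumps through a table to the lguest entry point instead of the native one. The real dispatch is a few instructions of assembly; here is the idea rendered as C, with the names invented for illustration:

/* Conceptual only: the real code lives in assembly in the i386 head. */
typedef void (*subarch_entry_t)(void);

extern void default_entry(void);	/* hypothetical native boot path */
extern void lguest_entry(void);		/* provided by i386_head.S */

static subarch_entry_t subarch_entries[] = {
	default_entry,	/* 0: ordinary PC */
	lguest_entry,	/* 1: running under lguest */
};

static void boot_dispatch(unsigned long subarch)
{
	/* Values we don't know about fall back to the native path. */
	if (subarch >= sizeof(subarch_entries) / sizeof(subarch_entries[0]))
		subarch = 0;
	subarch_entries[subarch]();
}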
+1
arch/x86/lguest/Makefile
··· 1 + obj-y := i386_head.o boot.o
+3 -2
arch/x86/xen/Kconfig
··· 3 3 # 4 4 5 5 config XEN 6 - bool "Enable support for Xen hypervisor" 7 - depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES 6 + bool "Xen guest support" 7 + select PARAVIRT 8 + depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) 8 9 help 9 10 This is the Linux Xen port. Enabling this will allow the 10 11 kernel to boot in a paravirtualized environment under the
+1 -1
drivers/Kconfig
··· 94 94 95 95 source "drivers/uio/Kconfig" 96 96 97 - source "drivers/lguest/Kconfig" 97 + source "drivers/virtio/Kconfig" 98 98 endmenu
+1
drivers/Makefile
··· 91 91 obj-$(CONFIG_PPC_PS3) += ps3/ 92 92 obj-$(CONFIG_OF) += of/ 93 93 obj-$(CONFIG_SSB) += ssb/ 94 + obj-$(CONFIG_VIRTIO) += virtio/
+6
drivers/block/Kconfig
··· 425 425 block device driver. It communicates with a back-end driver 426 426 in another domain which drives the actual block device. 427 427 428 + config VIRTIO_BLK 429 + tristate "Virtio block driver (EXPERIMENTAL)" 430 + depends on EXPERIMENTAL && VIRTIO 431 + ---help--- 432 + This is the virtual block driver for lguest. Say Y or M. 433 + 428 434 endif # BLK_DEV
+1 -1
drivers/block/Makefile
··· 25 25 obj-$(CONFIG_BLK_DEV_UMEM) += umem.o 26 26 obj-$(CONFIG_BLK_DEV_NBD) += nbd.o 27 27 obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o 28 + obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o 28 29 29 30 obj-$(CONFIG_VIODASD) += viodasd.o 30 31 obj-$(CONFIG_BLK_DEV_SX8) += sx8.o 31 32 obj-$(CONFIG_BLK_DEV_UB) += ub.o 32 33 33 34 obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 34 - obj-$(CONFIG_LGUEST_BLOCK) += lguest_blk.o
-421
drivers/block/lguest_blk.c
··· 1 - /*D:400 2 - * The Guest block driver 3 - * 4 - * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc. 5 - * The mechanism is simple: we place the information about the request in the 6 - * device page, then use SEND_DMA (containing the data for a write, or an empty 7 - * "ping" DMA for a read). 8 - :*/ 9 - /* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation 10 - * 11 - * This program is free software; you can redistribute it and/or modify 12 - * it under the terms of the GNU General Public License as published by 13 - * the Free Software Foundation; either version 2 of the License, or 14 - * (at your option) any later version. 15 - * 16 - * This program is distributed in the hope that it will be useful, 17 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 - * GNU General Public License for more details. 20 - * 21 - * You should have received a copy of the GNU General Public License 22 - * along with this program; if not, write to the Free Software 23 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 24 - */ 25 - //#define DEBUG 26 - #include <linux/init.h> 27 - #include <linux/types.h> 28 - #include <linux/blkdev.h> 29 - #include <linux/interrupt.h> 30 - #include <linux/lguest_bus.h> 31 - 32 - static char next_block_index = 'a'; 33 - 34 - /*D:420 Here is the structure which holds all the information we need about 35 - * each Guest block device. 36 - * 37 - * I'm sure at this stage, you're wondering "hey, where was the adventure I was 38 - * promised?" and thinking "Rusty sucks, I shall say nasty things about him on 39 - * my blog". I think Real adventures have boring bits, too, and you're in the 40 - * middle of one. But it gets better. Just not quite yet. */ 41 - struct blockdev 42 - { 43 - /* The block queue infrastructure wants a spinlock: it is held while it 44 - * calls our block request function. We grab it in our interrupt 45 - * handler so the responses don't mess with new requests. */ 46 - spinlock_t lock; 47 - 48 - /* The disk structure registered with kernel. */ 49 - struct gendisk *disk; 50 - 51 - /* The major device number for this disk, and the interrupt. We only 52 - * really keep them here for completeness; we'd need them if we 53 - * supported device unplugging. */ 54 - int major; 55 - int irq; 56 - 57 - /* The physical address of this device's memory page */ 58 - unsigned long phys_addr; 59 - /* The mapped memory page for convenient acces. */ 60 - struct lguest_block_page *lb_page; 61 - 62 - /* We only have a single request outstanding at a time: this is it. */ 63 - struct lguest_dma dma; 64 - struct request *req; 65 - }; 66 - 67 - /*D:495 We originally used end_request() throughout the driver, but it turns 68 - * out that end_request() is deprecated, and doesn't actually end the request 69 - * (which seems like a good reason to deprecate it!). It simply ends the first 70 - * bio. So if we had 3 bios in a "struct request" we would do all 3, 71 - * end_request(), do 2, end_request(), do 1 and end_request(): twice as much 72 - * work as we needed to do. 73 - * 74 - * This reinforced to me that I do not understand the block layer. 75 - * 76 - * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a 77 - * request. This improved disk speed by 130%. 
*/ 78 - static void end_entire_request(struct request *req, int uptodate) 79 - { 80 - if (end_that_request_first(req, uptodate, req->hard_nr_sectors)) 81 - BUG(); 82 - add_disk_randomness(req->rq_disk); 83 - blkdev_dequeue_request(req); 84 - end_that_request_last(req, uptodate); 85 - } 86 - 87 - /* I'm told there are only two stories in the world worth telling: love and 88 - * hate. So there used to be a love scene here like this: 89 - * 90 - * Launcher: We could make beautiful I/O together, you and I. 91 - * Guest: My, that's a big disk! 92 - * 93 - * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */ 94 - 95 - /*D:490 This is the interrupt handler, called when a block read or write has 96 - * been completed for us. */ 97 - static irqreturn_t lgb_irq(int irq, void *_bd) 98 - { 99 - /* We handed our "struct blockdev" as the argument to request_irq(), so 100 - * it is passed through to us here. This tells us which device we're 101 - * dealing with in case we have more than one. */ 102 - struct blockdev *bd = _bd; 103 - unsigned long flags; 104 - 105 - /* We weren't doing anything? Strange, but could happen if we shared 106 - * interrupts (we don't!). */ 107 - if (!bd->req) { 108 - pr_debug("No work!\n"); 109 - return IRQ_NONE; 110 - } 111 - 112 - /* Not done yet? That's equally strange. */ 113 - if (!bd->lb_page->result) { 114 - pr_debug("No result!\n"); 115 - return IRQ_NONE; 116 - } 117 - 118 - /* We have to grab the lock before ending the request. */ 119 - spin_lock_irqsave(&bd->lock, flags); 120 - /* "result" is 1 for success, 2 for failure: end_entire_request() wants 121 - * to know whether this succeeded or not. */ 122 - end_entire_request(bd->req, bd->lb_page->result == 1); 123 - /* Clear out request, it's done. */ 124 - bd->req = NULL; 125 - /* Reset incoming DMA for next time. */ 126 - bd->dma.used_len = 0; 127 - /* Ready for more reads or writes */ 128 - blk_start_queue(bd->disk->queue); 129 - spin_unlock_irqrestore(&bd->lock, flags); 130 - 131 - /* The interrupt was for us, we dealt with it. */ 132 - return IRQ_HANDLED; 133 - } 134 - 135 - /*D:480 The block layer's "struct request" contains a number of "struct bio"s, 136 - * each of which contains "struct bio_vec"s, each of which contains a page, an 137 - * offset and a length. 138 - * 139 - * Fortunately there are iterators to help us walk through the "struct 140 - * request". Even more fortunately, there were plenty of places to steal the 141 - * code from. We pack the "struct request" into our "struct lguest_dma" and 142 - * return the total length. */ 143 - static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma) 144 - { 145 - unsigned int i = 0, len = 0; 146 - struct req_iterator iter; 147 - struct bio_vec *bvec; 148 - 149 - rq_for_each_segment(bvec, req, iter) { 150 - /* We told the block layer not to give us too many. */ 151 - BUG_ON(i == LGUEST_MAX_DMA_SECTIONS); 152 - /* If we had a zero-length segment, it would look like 153 - * the end of the data referred to by the "struct 154 - * lguest_dma", so make sure that doesn't happen. 
*/ 155 - BUG_ON(!bvec->bv_len); 156 - /* Convert page & offset to a physical address */ 157 - dma->addr[i] = page_to_phys(bvec->bv_page) 158 - + bvec->bv_offset; 159 - dma->len[i] = bvec->bv_len; 160 - len += bvec->bv_len; 161 - i++; 162 - } 163 - /* If the array isn't full, we mark the end with a 0 length */ 164 - if (i < LGUEST_MAX_DMA_SECTIONS) 165 - dma->len[i] = 0; 166 - return len; 167 - } 168 - 169 - /* This creates an empty DMA, useful for prodding the Host without sending data 170 - * (ie. when we want to do a read) */ 171 - static void empty_dma(struct lguest_dma *dma) 172 - { 173 - dma->len[0] = 0; 174 - } 175 - 176 - /*D:470 Setting up a request is fairly easy: */ 177 - static void setup_req(struct blockdev *bd, 178 - int type, struct request *req, struct lguest_dma *dma) 179 - { 180 - /* The type is 1 (write) or 0 (read). */ 181 - bd->lb_page->type = type; 182 - /* The sector on disk where the read or write starts. */ 183 - bd->lb_page->sector = req->sector; 184 - /* The result is initialized to 0 (unfinished). */ 185 - bd->lb_page->result = 0; 186 - /* The current request (so we can end it in the interrupt handler). */ 187 - bd->req = req; 188 - /* The number of bytes: returned as a side-effect of req_to_dma(), 189 - * which packs the block layer's "struct request" into our "struct 190 - * lguest_dma" */ 191 - bd->lb_page->bytes = req_to_dma(req, dma); 192 - } 193 - 194 - /*D:450 Write is pretty straightforward: we pack the request into a "struct 195 - * lguest_dma", then use SEND_DMA to send the request. */ 196 - static void do_write(struct blockdev *bd, struct request *req) 197 - { 198 - struct lguest_dma send; 199 - 200 - pr_debug("lgb: WRITE sector %li\n", (long)req->sector); 201 - setup_req(bd, 1, req, &send); 202 - 203 - lguest_send_dma(bd->phys_addr, &send); 204 - } 205 - 206 - /* Read is similar to write, except we pack the request into our receive 207 - * "struct lguest_dma" and send through an empty DMA just to tell the Host that 208 - * there's a request pending. */ 209 - static void do_read(struct blockdev *bd, struct request *req) 210 - { 211 - struct lguest_dma ping; 212 - 213 - pr_debug("lgb: READ sector %li\n", (long)req->sector); 214 - setup_req(bd, 0, req, &bd->dma); 215 - 216 - empty_dma(&ping); 217 - lguest_send_dma(bd->phys_addr, &ping); 218 - } 219 - 220 - /*D:440 This where requests come in: we get handed the request queue and are 221 - * expected to pull a "struct request" off it until we've finished them or 222 - * we're waiting for a reply: */ 223 - static void do_lgb_request(struct request_queue *q) 224 - { 225 - struct blockdev *bd; 226 - struct request *req; 227 - 228 - again: 229 - /* This sometimes returns NULL even on the very first time around. I 230 - * wonder if it's something to do with letting elves handle the request 231 - * queue... */ 232 - req = elv_next_request(q); 233 - if (!req) 234 - return; 235 - 236 - /* We attached the struct blockdev to the disk: get it back */ 237 - bd = req->rq_disk->private_data; 238 - /* Sometimes we get repeated requests after blk_stop_queue(), but we 239 - * can only handle one at a time. */ 240 - if (bd->req) 241 - return; 242 - 243 - /* We only do reads and writes: no tricky business! 
*/ 244 - if (!blk_fs_request(req)) { 245 - pr_debug("Got non-command 0x%08x\n", req->cmd_type); 246 - req->errors++; 247 - end_entire_request(req, 0); 248 - goto again; 249 - } 250 - 251 - if (rq_data_dir(req) == WRITE) 252 - do_write(bd, req); 253 - else 254 - do_read(bd, req); 255 - 256 - /* We've put out the request, so stop any more coming in until we get 257 - * an interrupt, which takes us to lgb_irq() to re-enable the queue. */ 258 - blk_stop_queue(q); 259 - } 260 - 261 - /*D:430 This is the "struct block_device_operations" we attach to the disk at 262 - * the end of lguestblk_probe(). It doesn't seem to want much. */ 263 - static struct block_device_operations lguestblk_fops = { 264 - .owner = THIS_MODULE, 265 - }; 266 - 267 - /*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure 268 - * quite why. I do know that the IDE code sent two or three of the maintainers 269 - * insane, perhaps this is the fringe of the same disease? 270 - * 271 - * As in the console code, the probe function gets handed the generic 272 - * lguest_device from lguest_bus.c: */ 273 - static int lguestblk_probe(struct lguest_device *lgdev) 274 - { 275 - struct blockdev *bd; 276 - int err; 277 - int irqflags = IRQF_SHARED; 278 - 279 - /* First we allocate our own "struct blockdev" and initialize the easy 280 - * fields. */ 281 - bd = kmalloc(sizeof(*bd), GFP_KERNEL); 282 - if (!bd) 283 - return -ENOMEM; 284 - 285 - spin_lock_init(&bd->lock); 286 - bd->irq = lgdev_irq(lgdev); 287 - bd->req = NULL; 288 - bd->dma.used_len = 0; 289 - bd->dma.len[0] = 0; 290 - /* The descriptor in the lguest_devices array provided by the Host 291 - * gives the Guest the physical page number of the device's page. */ 292 - bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT); 293 - 294 - /* We use lguest_map() to get a pointer to the device page */ 295 - bd->lb_page = lguest_map(bd->phys_addr, 1); 296 - if (!bd->lb_page) { 297 - err = -ENOMEM; 298 - goto out_free_bd; 299 - } 300 - 301 - /* We need a major device number: 0 means "assign one dynamically". */ 302 - bd->major = register_blkdev(0, "lguestblk"); 303 - if (bd->major < 0) { 304 - err = bd->major; 305 - goto out_unmap; 306 - } 307 - 308 - /* This allocates a "struct gendisk" where we pack all the information 309 - * about the disk which the rest of Linux sees. The argument is the 310 - * number of minor devices desired: we need one minor for the main 311 - * disk, and one for each partition. Of course, we can't possibly know 312 - * how many partitions are on the disk (add_disk does that). 313 - */ 314 - bd->disk = alloc_disk(16); 315 - if (!bd->disk) { 316 - err = -ENOMEM; 317 - goto out_unregister_blkdev; 318 - } 319 - 320 - /* Every disk needs a queue for requests to come in: we set up the 321 - * queue with a callback function (the core of our driver) and the lock 322 - * to use. */ 323 - bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock); 324 - if (!bd->disk->queue) { 325 - err = -ENOMEM; 326 - goto out_put_disk; 327 - } 328 - 329 - /* We can only handle a certain number of pointers in our SEND_DMA 330 - * call, so we set that with blk_queue_max_hw_segments(). This is not 331 - * to be confused with blk_queue_max_phys_segments() of course! I 332 - * know, who could possibly confuse the two? 333 - * 334 - * Well, it's simple to tell them apart: this one seems to work and the 335 - * other one didn't. 
*/ 336 - blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS); 337 - 338 - /* Due to technical limitations of our Host (and simple coding) we 339 - * can't have a single buffer which crosses a page boundary. Tell it 340 - * here. This means that our maximum request size is 16 341 - * (LGUEST_MAX_DMA_SECTIONS) pages. */ 342 - blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1); 343 - 344 - /* We name our disk: this becomes the device name when udev does its 345 - * magic thing and creates the device node, such as /dev/lgba. 346 - * next_block_index is a global which starts at 'a'. Unfortunately 347 - * this simple increment logic means that the 27th disk will be called 348 - * "/dev/lgb{". In that case, I recommend having at least 29 disks, so 349 - * your /dev directory will be balanced. */ 350 - sprintf(bd->disk->disk_name, "lgb%c", next_block_index++); 351 - 352 - /* We look to the device descriptor again to see if this device's 353 - * interrupts are expected to be random. If they are, we tell the irq 354 - * subsystem. At the moment this bit is always set. */ 355 - if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS) 356 - irqflags |= IRQF_SAMPLE_RANDOM; 357 - 358 - /* Now we have the name and irqflags, we can request the interrupt; we 359 - * give it the "struct blockdev" we have set up to pass to lgb_irq() 360 - * when there is an interrupt. */ 361 - err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd); 362 - if (err) 363 - goto out_cleanup_queue; 364 - 365 - /* We bind our one-entry DMA pool to the key for this block device so 366 - * the Host can reply to our requests. The key is equal to the 367 - * physical address of the device's page, which is conveniently 368 - * unique. */ 369 - err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq); 370 - if (err) 371 - goto out_free_irq; 372 - 373 - /* We finish our disk initialization and add the disk to the system. */ 374 - bd->disk->major = bd->major; 375 - bd->disk->first_minor = 0; 376 - bd->disk->private_data = bd; 377 - bd->disk->fops = &lguestblk_fops; 378 - /* This is initialized to the disk size by the Launcher. */ 379 - set_capacity(bd->disk, bd->lb_page->num_sectors); 380 - add_disk(bd->disk); 381 - 382 - printk(KERN_INFO "%s: device %i at major %d\n", 383 - bd->disk->disk_name, lgdev->index, bd->major); 384 - 385 - /* We don't need to keep the "struct blockdev" around, but if we ever 386 - * implemented device removal, we'd need this. */ 387 - lgdev->private = bd; 388 - return 0; 389 - 390 - out_free_irq: 391 - free_irq(bd->irq, bd); 392 - out_cleanup_queue: 393 - blk_cleanup_queue(bd->disk->queue); 394 - out_put_disk: 395 - put_disk(bd->disk); 396 - out_unregister_blkdev: 397 - unregister_blkdev(bd->major, "lguestblk"); 398 - out_unmap: 399 - lguest_unmap(bd->lb_page); 400 - out_free_bd: 401 - kfree(bd); 402 - return err; 403 - } 404 - 405 - /*D:410 The boilerplate code for registering the lguest block driver is just 406 - * like the console: */ 407 - static struct lguest_driver lguestblk_drv = { 408 - .name = "lguestblk", 409 - .owner = THIS_MODULE, 410 - .device_type = LGUEST_DEVICE_T_BLOCK, 411 - .probe = lguestblk_probe, 412 - }; 413 - 414 - static __init int lguestblk_init(void) 415 - { 416 - return register_lguest_driver(&lguestblk_drv); 417 - } 418 - module_init(lguestblk_init); 419 - 420 - MODULE_DESCRIPTION("Lguest block driver"); 421 - MODULE_LICENSE("GPL");
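That's the last of the old I/O model. The virtio replacement below never deals in device pages or DMA keys: a block request travels as a scatter-gather chain of [out header][data buffers][one status byte], which the Launcher's service_io() above maps straight onto readv()/writev(). A stand-alone sketch of that shape, using the virtio_blk.h structures from this merge (the build_read() helper is invented for illustration):

#include <stdint.h>
#include <string.h>
#include <sys/uio.h>

#define VIRTIO_BLK_T_IN 0	/* a read, as in virtio_blk.h */

struct virtio_blk_outhdr {	/* Guest -> Host: what to do */
	uint32_t type;
	uint32_t ioprio;
	uint64_t sector;
};

struct virtio_blk_inhdr {	/* Host -> Guest: how it went */
	unsigned char status;
};

/* Build the three-element chain for a one-sector read: the device only
 * reads iov[0], and only writes iov[1] (the data) and iov[2] (status). */
static void build_read(struct iovec iov[3], struct virtio_blk_outhdr *out,
		       struct virtio_blk_inhdr *in, void *buf,
		       uint64_t sector)
{
	memset(out, 0, sizeof(*out));
	out->type = VIRTIO_BLK_T_IN;
	out->sector = sector;
	iov[0] = (struct iovec){ .iov_base = out, .iov_len = sizeof(*out) };
	iov[1] = (struct iovec){ .iov_base = buf, .iov_len = 512 };
	iov[2] = (struct iovec){ .iov_base = in, .iov_len = sizeof(*in) };
}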
+308
drivers/block/virtio_blk.c
··· 1 + //#define DEBUG 2 + #include <linux/spinlock.h> 3 + #include <linux/blkdev.h> 4 + #include <linux/hdreg.h> 5 + #include <linux/virtio.h> 6 + #include <linux/virtio_blk.h> 7 + #include <linux/virtio_blk.h> 8 + 9 + static unsigned char virtblk_index = 'a'; 10 + struct virtio_blk 11 + { 12 + spinlock_t lock; 13 + 14 + struct virtio_device *vdev; 15 + struct virtqueue *vq; 16 + 17 + /* The disk structure for the kernel. */ 18 + struct gendisk *disk; 19 + 20 + /* Request tracking. */ 21 + struct list_head reqs; 22 + 23 + mempool_t *pool; 24 + 25 + /* Scatterlist: can be too big for stack. */ 26 + struct scatterlist sg[3+MAX_PHYS_SEGMENTS]; 27 + }; 28 + 29 + struct virtblk_req 30 + { 31 + struct list_head list; 32 + struct request *req; 33 + struct virtio_blk_outhdr out_hdr; 34 + struct virtio_blk_inhdr in_hdr; 35 + }; 36 + 37 + static bool blk_done(struct virtqueue *vq) 38 + { 39 + struct virtio_blk *vblk = vq->vdev->priv; 40 + struct virtblk_req *vbr; 41 + unsigned int len; 42 + unsigned long flags; 43 + 44 + spin_lock_irqsave(&vblk->lock, flags); 45 + while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) { 46 + int uptodate; 47 + switch (vbr->in_hdr.status) { 48 + case VIRTIO_BLK_S_OK: 49 + uptodate = 1; 50 + break; 51 + case VIRTIO_BLK_S_UNSUPP: 52 + uptodate = -ENOTTY; 53 + break; 54 + default: 55 + uptodate = 0; 56 + break; 57 + } 58 + 59 + end_dequeued_request(vbr->req, uptodate); 60 + list_del(&vbr->list); 61 + mempool_free(vbr, vblk->pool); 62 + } 63 + /* In case queue is stopped waiting for more buffers. */ 64 + blk_start_queue(vblk->disk->queue); 65 + spin_unlock_irqrestore(&vblk->lock, flags); 66 + return true; 67 + } 68 + 69 + static bool do_req(struct request_queue *q, struct virtio_blk *vblk, 70 + struct request *req) 71 + { 72 + unsigned long num, out, in; 73 + struct virtblk_req *vbr; 74 + 75 + vbr = mempool_alloc(vblk->pool, GFP_ATOMIC); 76 + if (!vbr) 77 + /* When another request finishes we'll try again. */ 78 + return false; 79 + 80 + vbr->req = req; 81 + if (blk_fs_request(vbr->req)) { 82 + vbr->out_hdr.type = 0; 83 + vbr->out_hdr.sector = vbr->req->sector; 84 + vbr->out_hdr.ioprio = vbr->req->ioprio; 85 + } else if (blk_pc_request(vbr->req)) { 86 + vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD; 87 + vbr->out_hdr.sector = 0; 88 + vbr->out_hdr.ioprio = vbr->req->ioprio; 89 + } else { 90 + /* We don't put anything else in the queue. */ 91 + BUG(); 92 + } 93 + 94 + if (blk_barrier_rq(vbr->req)) 95 + vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER; 96 + 97 + /* We have to zero this, otherwise blk_rq_map_sg gets upset. 
*/ 98 + memset(vblk->sg, 0, sizeof(vblk->sg)); 99 + sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr)); 100 + num = blk_rq_map_sg(q, vbr->req, vblk->sg+1); 101 + sg_set_buf(&vblk->sg[num+1], &vbr->in_hdr, sizeof(vbr->in_hdr)); 102 + 103 + if (rq_data_dir(vbr->req) == WRITE) { 104 + vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; 105 + out = 1 + num; 106 + in = 1; 107 + } else { 108 + vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 109 + out = 1; 110 + in = 1 + num; 111 + } 112 + 113 + if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) { 114 + mempool_free(vbr, vblk->pool); 115 + return false; 116 + } 117 + 118 + list_add_tail(&vbr->list, &vblk->reqs); 119 + return true; 120 + } 121 + 122 + static void do_virtblk_request(struct request_queue *q) 123 + { 124 + struct virtio_blk *vblk = NULL; 125 + struct request *req; 126 + unsigned int issued = 0; 127 + 128 + while ((req = elv_next_request(q)) != NULL) { 129 + vblk = req->rq_disk->private_data; 130 + BUG_ON(req->nr_phys_segments > ARRAY_SIZE(vblk->sg)); 131 + 132 + /* If this request fails, stop queue and wait for something to 133 + finish to restart it. */ 134 + if (!do_req(q, vblk, req)) { 135 + blk_stop_queue(q); 136 + break; 137 + } 138 + blkdev_dequeue_request(req); 139 + issued++; 140 + } 141 + 142 + if (issued) 143 + vblk->vq->vq_ops->kick(vblk->vq); 144 + } 145 + 146 + static int virtblk_ioctl(struct inode *inode, struct file *filp, 147 + unsigned cmd, unsigned long data) 148 + { 149 + return scsi_cmd_ioctl(filp, inode->i_bdev->bd_disk->queue, 150 + inode->i_bdev->bd_disk, cmd, 151 + (void __user *)data); 152 + } 153 + 154 + static struct block_device_operations virtblk_fops = { 155 + .ioctl = virtblk_ioctl, 156 + .owner = THIS_MODULE, 157 + }; 158 + 159 + static int virtblk_probe(struct virtio_device *vdev) 160 + { 161 + struct virtio_blk *vblk; 162 + int err, major; 163 + void *token; 164 + unsigned int len; 165 + u64 cap; 166 + u32 v; 167 + 168 + vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); 169 + if (!vblk) { 170 + err = -ENOMEM; 171 + goto out; 172 + } 173 + 174 + INIT_LIST_HEAD(&vblk->reqs); 175 + spin_lock_init(&vblk->lock); 176 + vblk->vdev = vdev; 177 + 178 + /* We expect one virtqueue, for output. */ 179 + vblk->vq = vdev->config->find_vq(vdev, blk_done); 180 + if (IS_ERR(vblk->vq)) { 181 + err = PTR_ERR(vblk->vq); 182 + goto out_free_vblk; 183 + } 184 + 185 + vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req)); 186 + if (!vblk->pool) { 187 + err = -ENOMEM; 188 + goto out_free_vq; 189 + } 190 + 191 + major = register_blkdev(0, "virtblk"); 192 + if (major < 0) { 193 + err = major; 194 + goto out_mempool; 195 + } 196 + 197 + /* FIXME: How many partitions? How long is a piece of string? 
*/ 198 + vblk->disk = alloc_disk(1 << 4); 199 + if (!vblk->disk) { 200 + err = -ENOMEM; 201 + goto out_unregister_blkdev; 202 + } 203 + 204 + vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); 205 + if (!vblk->disk->queue) { 206 + err = -ENOMEM; 207 + goto out_put_disk; 208 + } 209 + 210 + sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++); 211 + vblk->disk->major = major; 212 + vblk->disk->first_minor = 0; 213 + vblk->disk->private_data = vblk; 214 + vblk->disk->fops = &virtblk_fops; 215 + 216 + /* If barriers are supported, tell block layer that queue is ordered */ 217 + token = vdev->config->find(vdev, VIRTIO_CONFIG_BLK_F, &len); 218 + if (virtio_use_bit(vdev, token, len, VIRTIO_BLK_F_BARRIER)) 219 + blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); 220 + 221 + err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &cap); 222 + if (err) { 223 + dev_err(&vdev->dev, "Bad/missing capacity in config\n"); 224 + goto out_put_disk; 225 + } 226 + 227 + /* If capacity is too big, truncate with warning. */ 228 + if ((sector_t)cap != cap) { 229 + dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n", 230 + (unsigned long long)cap); 231 + cap = (sector_t)-1; 232 + } 233 + set_capacity(vblk->disk, cap); 234 + 235 + err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SIZE_MAX, &v); 236 + if (!err) 237 + blk_queue_max_segment_size(vblk->disk->queue, v); 238 + else if (err != -ENOENT) { 239 + dev_err(&vdev->dev, "Bad SIZE_MAX in config\n"); 240 + goto out_put_disk; 241 + } 242 + 243 + err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SEG_MAX, &v); 244 + if (!err) 245 + blk_queue_max_hw_segments(vblk->disk->queue, v); 246 + else if (err != -ENOENT) { 247 + dev_err(&vdev->dev, "Bad SEG_MAX in config\n"); 248 + goto out_put_disk; 249 + } 250 + 251 + add_disk(vblk->disk); 252 + return 0; 253 + 254 + out_put_disk: 255 + put_disk(vblk->disk); 256 + out_unregister_blkdev: 257 + unregister_blkdev(major, "virtblk"); 258 + out_mempool: 259 + mempool_destroy(vblk->pool); 260 + out_free_vq: 261 + vdev->config->del_vq(vblk->vq); 262 + out_free_vblk: 263 + kfree(vblk); 264 + out: 265 + return err; 266 + } 267 + 268 + static void virtblk_remove(struct virtio_device *vdev) 269 + { 270 + struct virtio_blk *vblk = vdev->priv; 271 + int major = vblk->disk->major; 272 + 273 + BUG_ON(!list_empty(&vblk->reqs)); 274 + blk_cleanup_queue(vblk->disk->queue); 275 + put_disk(vblk->disk); 276 + unregister_blkdev(major, "virtblk"); 277 + mempool_destroy(vblk->pool); 278 + kfree(vblk); 279 + } 280 + 281 + static struct virtio_device_id id_table[] = { 282 + { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, 283 + { 0 }, 284 + }; 285 + 286 + static struct virtio_driver virtio_blk = { 287 + .driver.name = KBUILD_MODNAME, 288 + .driver.owner = THIS_MODULE, 289 + .id_table = id_table, 290 + .probe = virtblk_probe, 291 + .remove = __devexit_p(virtblk_remove), 292 + }; 293 + 294 + static int __init init(void) 295 + { 296 + return register_virtio_driver(&virtio_blk); 297 + } 298 + 299 + static void __exit fini(void) 300 + { 301 + unregister_virtio_driver(&virtio_blk); 302 + } 303 + module_init(init); 304 + module_exit(fini); 305 + 306 + MODULE_DEVICE_TABLE(virtio, id_table); 307 + MODULE_DESCRIPTION("Virtio block driver"); 308 + MODULE_LICENSE("GPL");
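The heart of do_req() above is how one block request becomes one scatterlist: slot 0 carries the out header, the data segments follow, and the last slot is the in header the device fills with a status byte. The out/in split handed to add_buf() then tells the Host which direction each part travels. A small user-space model of just that split — the toy_* structs are simplified stand-ins for illustration, not the real virtio_blk ABI:

	#include <stdio.h>

	struct toy_outhdr { unsigned type, ioprio; unsigned long long sector; };
	struct toy_inhdr { unsigned char status; };

	#define TOY_T_IN  0	/* read: device writes the data segments */
	#define TOY_T_OUT 1	/* write: device reads the data segments */

	static void toy_layout(unsigned type, unsigned data_segs)
	{
		unsigned out, in;

		if (type == TOY_T_OUT) {
			out = 1 + data_segs;	/* header + data go to device */
			in = 1;			/* only the status comes back */
		} else {
			out = 1;		/* only the header goes out */
			in = 1 + data_segs;	/* data + status come back */
		}
		printf("%s: %u out, %u in (%zu-byte hdr, %zu-byte status)\n",
		       type == TOY_T_OUT ? "WRITE" : "READ", out, in,
		       sizeof(struct toy_outhdr), sizeof(struct toy_inhdr));
	}

	int main(void)
	{
		toy_layout(TOY_T_IN, 4);	/* a 4-segment read */
		toy_layout(TOY_T_OUT, 4);	/* a 4-segment write */
		return 0;
	}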
+4
drivers/char/Kconfig
··· 613 613 help 614 614 Xen virtual console device driver 615 615 616 + config VIRTIO_CONSOLE 617 + bool 618 + select HVC_DRIVER 619 + 616 620 config HVCS 617 621 tristate "IBM Hypervisor Virtual Console Server support" 618 622 depends on PPC_PSERIES
+1 -1
drivers/char/Makefile
··· 42 42 obj-$(CONFIG_N_HDLC) += n_hdlc.o 43 43 obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o 44 44 obj-$(CONFIG_SX) += sx.o generic_serial.o 45 - obj-$(CONFIG_LGUEST_GUEST) += hvc_lguest.o 46 45 obj-$(CONFIG_RIO) += rio/ generic_serial.o 47 46 obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o 48 47 obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o ··· 49 50 obj-$(CONFIG_HVC_BEAT) += hvc_beat.o 50 51 obj-$(CONFIG_HVC_DRIVER) += hvc_console.o 51 52 obj-$(CONFIG_HVC_XEN) += hvc_xen.o 53 + obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o 52 54 obj-$(CONFIG_RAW_DRIVER) += raw.o 53 55 obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o 54 56 obj-$(CONFIG_MSPEC) += mspec.o
-177
drivers/char/hvc_lguest.c
··· 1 - /*D:300 2 - * The Guest console driver 3 - * 4 - * This is a trivial console driver: we use lguest's DMA mechanism to send 5 - * bytes out, and register a DMA buffer to receive bytes in. It is assumed to 6 - * be present and available from the very beginning of boot. 7 - * 8 - * Writing console drivers is one of the few remaining Dark Arts in Linux. 9 - * Fortunately for us, the path of virtual consoles has been well-trodden by 10 - * the PowerPC folks, who wrote "hvc_console.c" to generically support any 11 - * virtual console. We use that infrastructure which only requires us to write 12 - * the basic put_chars and get_chars functions and call the right register 13 - * functions. 14 - :*/ 15 - 16 - /*M:002 The console can be flooded: while the Guest is processing input the 17 - * Host can send more. Buffering in the Host could alleviate this, but it is a 18 - * difficult problem in general. :*/ 19 - /* Copyright (C) 2006 Rusty Russell, IBM Corporation 20 - * 21 - * This program is free software; you can redistribute it and/or modify 22 - * it under the terms of the GNU General Public License as published by 23 - * the Free Software Foundation; either version 2 of the License, or 24 - * (at your option) any later version. 25 - * 26 - * This program is distributed in the hope that it will be useful, 27 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 28 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 - * GNU General Public License for more details. 30 - * 31 - * You should have received a copy of the GNU General Public License 32 - * along with this program; if not, write to the Free Software 33 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 34 - */ 35 - #include <linux/err.h> 36 - #include <linux/init.h> 37 - #include <linux/lguest_bus.h> 38 - #include <asm/paravirt.h> 39 - #include "hvc_console.h" 40 - 41 - /*D:340 This is our single console input buffer, with associated "struct 42 - * lguest_dma" referring to it. Note the 0-terminated length array, and the 43 - * use of physical address for the buffer itself. */ 44 - static char inbuf[256]; 45 - static struct lguest_dma cons_input = { .used_len = 0, 46 - .addr[0] = __pa(inbuf), 47 - .len[0] = sizeof(inbuf), 48 - .len[1] = 0 }; 49 - 50 - /*D:310 The put_chars() callback is pretty straightforward. 51 - * 52 - * First we put the pointer and length in a "struct lguest_dma": we only have 53 - * one pointer, so we set the second length to 0. Then we use SEND_DMA to send 54 - * the data to (Host) buffers attached to the console key. Usually a device's 55 - * key is a physical address within the device's memory, but because the 56 - * console device doesn't have any associated physical memory, we use the 57 - * LGUEST_CONSOLE_DMA_KEY constant (aka 0). */ 58 - static int put_chars(u32 vtermno, const char *buf, int count) 59 - { 60 - struct lguest_dma dma; 61 - 62 - /* FIXME: DMA buffers in a "struct lguest_dma" are not allowed 63 - * to go over page boundaries. This never seems to happen, 64 - * but if it did we'd need to fix this code. */ 65 - dma.len[0] = count; 66 - dma.len[1] = 0; 67 - dma.addr[0] = __pa(buf); 68 - 69 - lguest_send_dma(LGUEST_CONSOLE_DMA_KEY, &dma); 70 - /* We're expected to return the amount of data we wrote: all of it. */ 71 - return count; 72 - } 73 - 74 - /*D:350 get_chars() is the callback from the hvc_console infrastructure when 75 - * an interrupt is received. 76 - * 77 - * Firstly we see if our buffer has been filled: if not, we return. 
The rest 78 - * of the code deals with the fact that the hvc_console() infrastructure only 79 - * asks us for 16 bytes at a time. We keep a "cons_offset" variable for 80 - * partially-read buffers. */ 81 - static int get_chars(u32 vtermno, char *buf, int count) 82 - { 83 - static int cons_offset; 84 - 85 - /* Nothing left to see here... */ 86 - if (!cons_input.used_len) 87 - return 0; 88 - 89 - /* You want more than we have to give? Well, try wanting less! */ 90 - if (cons_input.used_len - cons_offset < count) 91 - count = cons_input.used_len - cons_offset; 92 - 93 - /* Copy across to their buffer and increment offset. */ 94 - memcpy(buf, inbuf + cons_offset, count); 95 - cons_offset += count; 96 - 97 - /* Finished? Zero offset, and reset cons_input so Host will use it 98 - * again. */ 99 - if (cons_offset == cons_input.used_len) { 100 - cons_offset = 0; 101 - cons_input.used_len = 0; 102 - } 103 - return count; 104 - } 105 - /*:*/ 106 - 107 - static struct hv_ops lguest_cons = { 108 - .get_chars = get_chars, 109 - .put_chars = put_chars, 110 - }; 111 - 112 - /*D:320 Console drivers are initialized very early so boot messages can go 113 - * out. At this stage, the console is output-only. Our driver checks we're a 114 - * Guest, and if so hands hvc_instantiate() the console number (0), priority 115 - * (0), and the struct hv_ops containing the put_chars() function. */ 116 - static int __init cons_init(void) 117 - { 118 - if (strcmp(pv_info.name, "lguest") != 0) 119 - return 0; 120 - 121 - return hvc_instantiate(0, 0, &lguest_cons); 122 - } 123 - console_initcall(cons_init); 124 - 125 - /*D:370 To set up and manage our virtual console, we call hvc_alloc() and 126 - * stash the result in the private pointer of the "struct lguest_device". 127 - * Since we never remove the console device we never need this pointer again, 128 - * but using ->private is considered good form, and you never know who's going 129 - * to copy your driver. 130 - * 131 - * Once the console is set up, we bind our input buffer ready for input. */ 132 - static int lguestcons_probe(struct lguest_device *lgdev) 133 - { 134 - int err; 135 - 136 - /* The first argument of hvc_alloc() is the virtual console number, so 137 - * we use zero. The second argument is the interrupt number. 138 - * 139 - * The third argument is a "struct hv_ops" containing the put_chars() 140 - * and get_chars() pointers. The final argument is the output buffer 141 - * size: we use 256 and expect the Host to have room for us to send 142 - * that much. */ 143 - lgdev->private = hvc_alloc(0, lgdev_irq(lgdev), &lguest_cons, 256); 144 - if (IS_ERR(lgdev->private)) 145 - return PTR_ERR(lgdev->private); 146 - 147 - /* We bind a single DMA buffer at key LGUEST_CONSOLE_DMA_KEY. 148 - * "cons_input" is that statically-initialized global DMA buffer we saw 149 - * above, and we also give the interrupt we want. */ 150 - err = lguest_bind_dma(LGUEST_CONSOLE_DMA_KEY, &cons_input, 1, 151 - lgdev_irq(lgdev)); 152 - if (err) 153 - printk("lguest console: failed to bind buffer.\n"); 154 - return err; 155 - } 156 - /* Note the use of lgdev_irq() for the interrupt number. We tell hvc_alloc() 157 - * to expect input when this interrupt is triggered, and then tell 158 - * lguest_bind_dma() that is the interrupt to send us when input comes in. */ 159 - 160 - /*D:360 From now on the console driver follows standard Guest driver form: 161 - * register_lguest_driver() registers the device type and probe function, and 162 - * the probe function sets up the device. 
163 - * 164 - * The standard "struct lguest_driver": */ 165 - static struct lguest_driver lguestcons_drv = { 166 - .name = "lguestcons", 167 - .owner = THIS_MODULE, 168 - .device_type = LGUEST_DEVICE_T_CONSOLE, 169 - .probe = lguestcons_probe, 170 - }; 171 - 172 - /* The standard init function */ 173 - static int __init hvc_lguest_init(void) 174 - { 175 - return register_lguest_driver(&lguestcons_drv); 176 - } 177 - module_init(hvc_lguest_init);
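The driver removed above and its virtio replacement below share one fiddly bit of bookkeeping: hvc_console asks for at most 16 bytes at a time, so a filled input buffer must be handed out in slices across several get_chars() calls. The old code kept a cons_offset counter; the new code advances an "in" pointer and shrinks "in_len". A user-space sketch of the pointer style (all names local to this sketch):

	#include <stdio.h>
	#include <string.h>

	static char *in;		/* next unread byte */
	static unsigned in_len;		/* bytes remaining in the buffer */

	static int toy_get_chars(char *buf, int count)
	{
		if (!in_len)
			return 0;		/* nothing buffered */
		/* You want more than we have to give?  Want less! */
		if (in_len < count)
			count = in_len;
		memcpy(buf, in, count);
		in += count;
		in_len -= count;
		return count;
	}

	int main(void)
	{
		static char inbuf[] = "a longer line of console input\n";
		char slice[16];
		int n;

		in = inbuf;
		in_len = strlen(inbuf);

		/* hvc-style: drain the buffer in 16-byte slices. */
		while ((n = toy_get_chars(slice, sizeof(slice))) > 0)
			printf("got %d bytes: %.*s\n", n, n, slice);
		return 0;
	}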
+225
drivers/char/virtio_console.c
··· 1 + /*D:300 2 + * The Guest console driver 3 + * 4 + * Writing console drivers is one of the few remaining Dark Arts in Linux. 5 + * Fortunately for us, the path of virtual consoles has been well-trodden by 6 + * the PowerPC folks, who wrote "hvc_console.c" to generically support any 7 + * virtual console. We use that infrastructure which only requires us to write 8 + * the basic put_chars and get_chars functions and call the right register 9 + * functions. 10 + :*/ 11 + 12 + /*M:002 The console can be flooded: while the Guest is processing input the 13 + * Host can send more. Buffering in the Host could alleviate this, but it is a 14 + * difficult problem in general. :*/ 15 + /* Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation 16 + * 17 + * This program is free software; you can redistribute it and/or modify 18 + * it under the terms of the GNU General Public License as published by 19 + * the Free Software Foundation; either version 2 of the License, or 20 + * (at your option) any later version. 21 + * 22 + * This program is distributed in the hope that it will be useful, 23 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 + * GNU General Public License for more details. 26 + * 27 + * You should have received a copy of the GNU General Public License 28 + * along with this program; if not, write to the Free Software 29 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 30 + */ 31 + #include <linux/err.h> 32 + #include <linux/init.h> 33 + #include <linux/virtio.h> 34 + #include <linux/virtio_console.h> 35 + #include "hvc_console.h" 36 + 37 + /*D:340 These represent our input and output console queues, and the virtio 38 + * operations for them. */ 39 + static struct virtqueue *in_vq, *out_vq; 40 + static struct virtio_device *vdev; 41 + 42 + /* This is our input buffer, and how much data is left in it. */ 43 + static unsigned int in_len; 44 + static char *in, *inbuf; 45 + 46 + /* The operations for our console. */ 47 + static struct hv_ops virtio_cons; 48 + 49 + /*D:310 The put_chars() callback is pretty straightforward. 50 + * 51 + * We turn the characters into a scatter-gather list, add it to the output 52 + * queue and then kick the Host. Then we sit here waiting for it to finish: 53 + * inefficient in theory, but in practice implementations will do it 54 + * immediately (lguest's Launcher does). */ 55 + static int put_chars(u32 vtermno, const char *buf, int count) 56 + { 57 + struct scatterlist sg[1]; 58 + unsigned int len; 59 + 60 + /* This is a convenient routine to initialize a single-elem sg list */ 61 + sg_init_one(sg, buf, count); 62 + 63 + /* add_buf wants a token to identify this buffer: we hand it any 64 + * non-NULL pointer, since there's only ever one buffer. */ 65 + if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) == 0) { 66 + /* Tell Host to go! */ 67 + out_vq->vq_ops->kick(out_vq); 68 + /* Chill out until it's done with the buffer. */ 69 + while (!out_vq->vq_ops->get_buf(out_vq, &len)) 70 + cpu_relax(); 71 + } 72 + 73 + /* We're expected to return the amount of data we wrote: all of it. */ 74 + return count; 75 + } 76 + 77 + /* Create a scatter-gather list representing our input buffer and put it in the 78 + * queue. */ 79 + static void add_inbuf(void) 80 + { 81 + struct scatterlist sg[1]; 82 + sg_init_one(sg, inbuf, PAGE_SIZE); 83 + 84 + /* We should always be able to add one buffer to an empty queue. 
*/ 85 + if (in_vq->vq_ops->add_buf(in_vq, sg, 0, 1, inbuf) != 0) 86 + BUG(); 87 + in_vq->vq_ops->kick(in_vq); 88 + } 89 + 90 + /*D:350 get_chars() is the callback from the hvc_console infrastructure when 91 + * an interrupt is received. 92 + * 93 + * Most of the code deals with the fact that the hvc_console() infrastructure 94 + * only asks us for 16 bytes at a time. We keep in_offset and in_used fields 95 + * for partially-filled buffers. */ 96 + static int get_chars(u32 vtermno, char *buf, int count) 97 + { 98 + /* If we don't have an input queue yet, we can't get input. */ 99 + BUG_ON(!in_vq); 100 + 101 + /* No buffer? Try to get one. */ 102 + if (!in_len) { 103 + in = in_vq->vq_ops->get_buf(in_vq, &in_len); 104 + if (!in) 105 + return 0; 106 + } 107 + 108 + /* You want more than we have to give? Well, try wanting less! */ 109 + if (in_len < count) 110 + count = in_len; 111 + 112 + /* Copy across to their buffer and increment offset. */ 113 + memcpy(buf, in, count); 114 + in += count; 115 + in_len -= count; 116 + 117 + /* Finished? Re-register buffer so Host will use it again. */ 118 + if (in_len == 0) 119 + add_inbuf(); 120 + 121 + return count; 122 + } 123 + /*:*/ 124 + 125 + /*D:320 Console drivers are initialized very early so boot messages can go out, 126 + * so we do things slightly differently from the generic virtio initialization 127 + * of the net and block drivers. 128 + * 129 + * At this stage, the console is output-only. It's too early to set up a 130 + * virtqueue, so we let the drivers do some boutique early-output thing. */ 131 + int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)) 132 + { 133 + virtio_cons.put_chars = put_chars; 134 + return hvc_instantiate(0, 0, &virtio_cons); 135 + } 136 + 137 + /*D:370 Once we're further in boot, we get probed like any other virtio device. 138 + * At this stage we set up the output virtqueue. 139 + * 140 + * To set up and manage our virtual console, we call hvc_alloc(). Since we 141 + * never remove the console device we never need this pointer again. 142 + * 143 + * Finally we put our input buffer in the input queue, ready to receive. */ 144 + static int virtcons_probe(struct virtio_device *dev) 145 + { 146 + int err; 147 + struct hvc_struct *hvc; 148 + 149 + vdev = dev; 150 + 151 + /* This is the scratch page we use to receive console input */ 152 + inbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 153 + if (!inbuf) { 154 + err = -ENOMEM; 155 + goto fail; 156 + } 157 + 158 + /* Find the input queue. */ 159 + /* FIXME: This is why we want to wean off hvc: we do nothing 160 + * when input comes in. */ 161 + in_vq = vdev->config->find_vq(vdev, NULL); 162 + if (IS_ERR(in_vq)) { 163 + err = PTR_ERR(in_vq); 164 + goto free; 165 + } 166 + 167 + out_vq = vdev->config->find_vq(vdev, NULL); 168 + if (IS_ERR(out_vq)) { 169 + err = PTR_ERR(out_vq); 170 + goto free_in_vq; 171 + } 172 + 173 + /* Start using the new console output. */ 174 + virtio_cons.get_chars = get_chars; 175 + virtio_cons.put_chars = put_chars; 176 + 177 + /* The first argument of hvc_alloc() is the virtual console number, so 178 + * we use zero. The second argument is the interrupt number; we 179 + * currently leave this as zero: it would be better not to use the 180 + * hvc mechanism and fix this (FIXME!). 181 + * 182 + * The third argument is a "struct hv_ops" containing the put_chars() 183 + * and get_chars() pointers. The final argument is the output buffer 184 + * size: we can do any size, so we put PAGE_SIZE here. 
*/ 185 + hvc = hvc_alloc(0, 0, &virtio_cons, PAGE_SIZE); 186 + if (IS_ERR(hvc)) { 187 + err = PTR_ERR(hvc); 188 + goto free_out_vq; 189 + } 190 + 191 + /* Register the input buffer the first time. */ 192 + add_inbuf(); 193 + return 0; 194 + 195 + free_out_vq: 196 + vdev->config->del_vq(out_vq); 197 + free_in_vq: 198 + vdev->config->del_vq(in_vq); 199 + free: 200 + kfree(inbuf); 201 + fail: 202 + return err; 203 + } 204 + 205 + static struct virtio_device_id id_table[] = { 206 + { VIRTIO_ID_CONSOLE, VIRTIO_DEV_ANY_ID }, 207 + { 0 }, 208 + }; 209 + 210 + static struct virtio_driver virtio_console = { 211 + .driver.name = KBUILD_MODNAME, 212 + .driver.owner = THIS_MODULE, 213 + .id_table = id_table, 214 + .probe = virtcons_probe, 215 + }; 216 + 217 + static int __init init(void) 218 + { 219 + return register_virtio_driver(&virtio_console); 220 + } 221 + module_init(init); 222 + 223 + MODULE_DEVICE_TABLE(virtio, id_table); 224 + MODULE_DESCRIPTION("Virtio console driver"); 225 + MODULE_LICENSE("GPL");
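Note how put_chars() above is deliberately synchronous: queue the one buffer, kick the Host, then spin until get_buf() hands the buffer back. A user-space sketch of that handshake, with an invented toy_vq standing in for the virtqueue and the Launcher's side collapsed into toy_kick():

	#include <stdio.h>
	#include <string.h>

	struct toy_vq {
		const char *buf;	/* the one in-flight buffer */
		int done;		/* has the host consumed it? */
	};

	/* Stand-in for the host: lguest's Launcher writes the bytes out as
	 * soon as it is kicked, so the buffer is done immediately. */
	static void toy_kick(struct toy_vq *vq)
	{
		printf("host: %s", vq->buf);
		vq->done = 1;
	}

	static const char *toy_get_buf(struct toy_vq *vq)
	{
		if (!vq->done)
			return NULL;
		vq->done = 0;
		return vq->buf;
	}

	static int toy_put_chars(struct toy_vq *vq, const char *buf)
	{
		vq->buf = buf;		/* "add_buf": queue the buffer */
		toy_kick(vq);		/* tell the host to go */
		/* Chill out until it's done with the buffer. */
		while (!toy_get_buf(vq))
			;
		/* We're expected to return how much we wrote: all of it. */
		return strlen(buf);
	}

	int main(void)
	{
		struct toy_vq vq = { 0 };
		toy_put_chars(&vq, "hello from the guest console\n");
		return 0;
	}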
+4
drivers/kvm/Kconfig
··· 47 47 Provides support for KVM on AMD processors equipped with the AMD-V 48 48 (SVM) extensions. 49 49 50 + # OK, it's a little counter-intuitive to do this, but it puts it neatly under 51 + # the virtualization menu. 52 + source drivers/lguest/Kconfig 53 + 50 54 endif # VIRTUALIZATION
+1 -12
drivers/lguest/Kconfig
··· 1 1 config LGUEST 2 2 tristate "Linux hypervisor example code" 3 - depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE && FUTEX 4 - select LGUEST_GUEST 3 + depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX && !(X86_VISWS || X86_VOYAGER) 5 4 select HVC_DRIVER 6 5 ---help--- 7 6 This is a very simple module which allows you to run ··· 17 18 The guest needs code built-in, even if the host has lguest 18 19 support as a module. The drivers are tiny, so we build them 19 20 in too. 20 - 21 - config LGUEST_NET 22 - tristate 23 - default y 24 - depends on LGUEST_GUEST && NET 25 - 26 - config LGUEST_BLOCK 27 - tristate 28 - default y 29 - depends on LGUEST_GUEST && BLOCK
+6 -4
drivers/lguest/Makefile
··· 1 - # Guest requires the paravirt_ops replacement and the bus driver. 2 - obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o 1 + # Guest requires the device configuration and probing code. 2 + obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o 3 3 4 4 # Host requires the other files, which can be a module. 5 5 obj-$(CONFIG_LGUEST) += lg.o 6 - lg-y := core.o hypercalls.o page_tables.o interrupts_and_traps.o \ 7 - segments.o io.o lguest_user.o switcher.o 6 + lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \ 7 + segments.o lguest_user.o 8 + 9 + lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o 8 10 9 11 Preparation Preparation!: PREFIX=P 10 12 Guest: PREFIX=G
+50 -516
drivers/lguest/core.c
··· 11 11 #include <linux/vmalloc.h> 12 12 #include <linux/cpu.h> 13 13 #include <linux/freezer.h> 14 + #include <linux/highmem.h> 14 15 #include <asm/paravirt.h> 15 - #include <asm/desc.h> 16 16 #include <asm/pgtable.h> 17 17 #include <asm/uaccess.h> 18 18 #include <asm/poll.h> 19 - #include <asm/highmem.h> 20 19 #include <asm/asm-offsets.h> 21 - #include <asm/i387.h> 22 20 #include "lg.h" 23 21 24 - /* Found in switcher.S */ 25 - extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; 26 - extern unsigned long default_idt_entries[]; 27 - 28 - /* Every guest maps the core switcher code. */ 29 - #define SHARED_SWITCHER_PAGES \ 30 - DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) 31 - /* Pages for switcher itself, then two pages per cpu */ 32 - #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) 33 - 34 - /* We map at -4M for ease of mapping into the guest (one PTE page). */ 35 - #define SWITCHER_ADDR 0xFFC00000 36 22 37 23 static struct vm_struct *switcher_vma; 38 24 static struct page **switcher_page; 39 25 40 - static int cpu_had_pge; 41 - static struct { 42 - unsigned long offset; 43 - unsigned short segment; 44 - } lguest_entry; 45 - 46 26 /* This One Big lock protects all inter-guest data structures. */ 47 27 DEFINE_MUTEX(lguest_lock); 48 - static DEFINE_PER_CPU(struct lguest *, last_guest); 49 - 50 - /* FIXME: Make dynamic. */ 51 - #define MAX_LGUEST_GUESTS 16 52 - struct lguest lguests[MAX_LGUEST_GUESTS]; 53 - 54 - /* Offset from where switcher.S was compiled to where we've copied it */ 55 - static unsigned long switcher_offset(void) 56 - { 57 - return SWITCHER_ADDR - (unsigned long)start_switcher_text; 58 - } 59 - 60 - /* This cpu's struct lguest_pages. */ 61 - static struct lguest_pages *lguest_pages(unsigned int cpu) 62 - { 63 - return &(((struct lguest_pages *) 64 - (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); 65 - } 66 28 67 29 /*H:010 We need to set up the Switcher at a high virtual address. Remember the 68 30 * Switcher is a few hundred bytes of assembler code which actually changes the ··· 35 73 * Host since it will be running as the switchover occurs. 36 74 * 37 75 * Trying to map memory at a particular address is an unusual thing to do, so 38 - * it's not a simple one-liner. We also set up the per-cpu parts of the 39 - * Switcher here. 40 - */ 76 + * it's not a simple one-liner. */ 41 77 static __init int map_switcher(void) 42 78 { 43 79 int i, err; ··· 92 132 goto free_vma; 93 133 } 94 134 95 - /* Now the switcher is mapped at the right address, we can't fail! 96 - * Copy in the compiled-in Switcher code (from switcher.S). */ 135 + /* Now the Switcher is mapped at the right address, we can't fail! 136 + * Copy in the compiled-in Switcher code (from <arch>_switcher.S). */ 97 137 memcpy(switcher_vma->addr, start_switcher_text, 98 138 end_switcher_text - start_switcher_text); 99 - 100 - /* Most of the switcher.S doesn't care that it's been moved; on Intel, 101 - * jumps are relative, and it doesn't access any references to external 102 - * code or data. 103 - * 104 - * The only exception is the interrupt handlers in switcher.S: their 105 - * addresses are placed in a table (default_idt_entries), so we need to 106 - * update the table with the new addresses. switcher_offset() is a 107 - * convenience function which returns the distance between the builtin 108 - * switcher code and the high-mapped copy we just made. 
*/ 109 - for (i = 0; i < IDT_ENTRIES; i++) 110 - default_idt_entries[i] += switcher_offset(); 111 - 112 - /* 113 - * Set up the Switcher's per-cpu areas. 114 - * 115 - * Each CPU gets two pages of its own within the high-mapped region 116 - * (aka. "struct lguest_pages"). Much of this can be initialized now, 117 - * but some depends on what Guest we are running (which is set up in 118 - * copy_in_guest_info()). 119 - */ 120 - for_each_possible_cpu(i) { 121 - /* lguest_pages() returns this CPU's two pages. */ 122 - struct lguest_pages *pages = lguest_pages(i); 123 - /* This is a convenience pointer to make the code fit one 124 - * statement to a line. */ 125 - struct lguest_ro_state *state = &pages->state; 126 - 127 - /* The Global Descriptor Table: the Host has a different one 128 - * for each CPU. We keep a descriptor for the GDT which says 129 - * where it is and how big it is (the size is actually the last 130 - * byte, not the size, hence the "-1"). */ 131 - state->host_gdt_desc.size = GDT_SIZE-1; 132 - state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); 133 - 134 - /* All CPUs on the Host use the same Interrupt Descriptor 135 - * Table, so we just use store_idt(), which gets this CPU's IDT 136 - * descriptor. */ 137 - store_idt(&state->host_idt_desc); 138 - 139 - /* The descriptors for the Guest's GDT and IDT can be filled 140 - * out now, too. We copy the GDT & IDT into ->guest_gdt and 141 - * ->guest_idt before actually running the Guest. */ 142 - state->guest_idt_desc.size = sizeof(state->guest_idt)-1; 143 - state->guest_idt_desc.address = (long)&state->guest_idt; 144 - state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; 145 - state->guest_gdt_desc.address = (long)&state->guest_gdt; 146 - 147 - /* We know where we want the stack to be when the Guest enters 148 - * the switcher: in pages->regs. The stack grows upwards, so 149 - * we start it at the end of that structure. */ 150 - state->guest_tss.esp0 = (long)(&pages->regs + 1); 151 - /* And this is the GDT entry to use for the stack: we keep a 152 - * couple of special LGUEST entries. */ 153 - state->guest_tss.ss0 = LGUEST_DS; 154 - 155 - /* x86 can have a finegrained bitmap which indicates what I/O 156 - * ports the process can use. We set it to the end of our 157 - * structure, meaning "none". */ 158 - state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); 159 - 160 - /* Some GDT entries are the same across all Guests, so we can 161 - * set them up now. */ 162 - setup_default_gdt_entries(state); 163 - /* Most IDT entries are the same for all Guests, too.*/ 164 - setup_default_idt_entries(state, default_idt_entries); 165 - 166 - /* The Host needs to be able to use the LGUEST segments on this 167 - * CPU, too, so put them in the Host GDT. */ 168 - get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 169 - get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 170 - } 171 - 172 - /* In the Switcher, we want the %cs segment register to use the 173 - * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so 174 - * it will be undisturbed when we switch. To change %cs and jump we 175 - * need this structure to feed to Intel's "lcall" instruction. 
*/ 176 - lguest_entry.offset = (long)switch_to_guest + switcher_offset(); 177 - lguest_entry.segment = LGUEST_CS; 178 139 179 140 printk(KERN_INFO "lguest: mapped switcher at %p\n", 180 141 switcher_vma->addr); ··· 128 247 __free_pages(switcher_page[i], 0); 129 248 } 130 249 131 - /*H:130 Our Guest is usually so well behaved; it never tries to do things it 132 - * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't 133 - * quite complete, because it doesn't contain replacements for the Intel I/O 134 - * instructions. As a result, the Guest sometimes fumbles across one during 135 - * the boot process as it probes for various things which are usually attached 136 - * to a PC. 137 - * 138 - * When the Guest uses one of these instructions, we get trap #13 (General 139 - * Protection Fault) and come here. We see if it's one of those troublesome 140 - * instructions and skip over it. We return true if we did. */ 141 - static int emulate_insn(struct lguest *lg) 142 - { 143 - u8 insn; 144 - unsigned int insnlen = 0, in = 0, shift = 0; 145 - /* The eip contains the *virtual* address of the Guest's instruction: 146 - * guest_pa just subtracts the Guest's page_offset. */ 147 - unsigned long physaddr = guest_pa(lg, lg->regs->eip); 148 - 149 - /* The guest_pa() function only works for Guest kernel addresses, but 150 - * that's all we're trying to do anyway. */ 151 - if (lg->regs->eip < lg->page_offset) 152 - return 0; 153 - 154 - /* Decoding x86 instructions is icky. */ 155 - lgread(lg, &insn, physaddr, 1); 156 - 157 - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits 158 - of the eax register. */ 159 - if (insn == 0x66) { 160 - shift = 16; 161 - /* The instruction is 1 byte so far, read the next byte. */ 162 - insnlen = 1; 163 - lgread(lg, &insn, physaddr + insnlen, 1); 164 - } 165 - 166 - /* We can ignore the lower bit for the moment and decode the 4 opcodes 167 - * we need to emulate. */ 168 - switch (insn & 0xFE) { 169 - case 0xE4: /* in <next byte>,%al */ 170 - insnlen += 2; 171 - in = 1; 172 - break; 173 - case 0xEC: /* in (%dx),%al */ 174 - insnlen += 1; 175 - in = 1; 176 - break; 177 - case 0xE6: /* out %al,<next byte> */ 178 - insnlen += 2; 179 - break; 180 - case 0xEE: /* out %al,(%dx) */ 181 - insnlen += 1; 182 - break; 183 - default: 184 - /* OK, we don't know what this is, can't emulate. */ 185 - return 0; 186 - } 187 - 188 - /* If it was an "IN" instruction, they expect the result to be read 189 - * into %eax, so we change %eax. We always return all-ones, which 190 - * traditionally means "there's nothing there". */ 191 - if (in) { 192 - /* Lower bit tells is whether it's a 16 or 32 bit access */ 193 - if (insn & 0x1) 194 - lg->regs->eax = 0xFFFFFFFF; 195 - else 196 - lg->regs->eax |= (0xFFFF << shift); 197 - } 198 - /* Finally, we've "done" the instruction, so move past it. */ 199 - lg->regs->eip += insnlen; 200 - /* Success! */ 201 - return 1; 202 - } 203 - /*:*/ 204 - 205 250 /*L:305 206 251 * Dealing With Guest Memory. 207 252 * 208 253 * When the Guest gives us (what it thinks is) a physical address, we can use 209 - * the normal copy_from_user() & copy_to_user() on that address: remember, 210 - * Guest physical == Launcher virtual. 254 + * the normal copy_from_user() & copy_to_user() on the corresponding place in 255 + * the memory region allocated by the Launcher. 211 256 * 212 257 * But we can't trust the Guest: it might be trying to access the Launcher 213 258 * code. 
We have to check that the range is below the pfn_limit the Launcher ··· 145 338 return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); 146 339 } 147 340 148 - /* This is a convenient routine to get a 32-bit value from the Guest (a very 149 - * common operation). Here we can see how useful the kill_lguest() routine we 150 - * met in the Launcher can be: we return a random value (0) instead of needing 151 - * to return an error. */ 152 - u32 lgread_u32(struct lguest *lg, unsigned long addr) 153 - { 154 - u32 val = 0; 155 - 156 - /* Don't let them access lguest binary. */ 157 - if (!lguest_address_ok(lg, addr, sizeof(val)) 158 - || get_user(val, (u32 __user *)addr) != 0) 159 - kill_guest(lg, "bad read address %#lx", addr); 160 - return val; 161 - } 162 - 163 - /* Same thing for writing a value. */ 164 - void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val) 165 - { 166 - if (!lguest_address_ok(lg, addr, sizeof(val)) 167 - || put_user(val, (u32 __user *)addr) != 0) 168 - kill_guest(lg, "bad write address %#lx", addr); 169 - } 170 - 171 - /* This routine is more generic, and copies a range of Guest bytes into a 172 - * buffer. If the copy_from_user() fails, we fill the buffer with zeroes, so 173 - * the caller doesn't end up using uninitialized kernel memory. */ 174 - void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) 341 + /* This routine copies memory from the Guest. Here we can see how useful the 342 + * kill_lguest() routine we met in the Launcher can be: we return a random 343 + * value (all zeroes) instead of needing to return an error. */ 344 + void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) 175 345 { 176 346 if (!lguest_address_ok(lg, addr, bytes) 177 - || copy_from_user(b, (void __user *)addr, bytes) != 0) { 347 + || copy_from_user(b, lg->mem_base + addr, bytes) != 0) { 178 348 /* copy_from_user should do this, but as we rely on it... */ 179 349 memset(b, 0, bytes); 180 350 kill_guest(lg, "bad read address %#lx len %u", addr, bytes); 181 351 } 182 352 } 183 353 184 - /* Similarly, our generic routine to copy into a range of Guest bytes. */ 185 - void lgwrite(struct lguest *lg, unsigned long addr, const void *b, 186 - unsigned bytes) 354 + /* This is the write (copy into guest) version. */ 355 + void __lgwrite(struct lguest *lg, unsigned long addr, const void *b, 356 + unsigned bytes) 187 357 { 188 358 if (!lguest_address_ok(lg, addr, bytes) 189 - || copy_to_user((void __user *)addr, b, bytes) != 0) 359 + || copy_to_user(lg->mem_base + addr, b, bytes) != 0) 190 360 kill_guest(lg, "bad write address %#lx len %u", addr, bytes); 191 - } 192 - /* (end of memory access helper routines) :*/ 193 - 194 - static void set_ts(void) 195 - { 196 - u32 cr0; 197 - 198 - cr0 = read_cr0(); 199 - if (!(cr0 & 8)) 200 - write_cr0(cr0|8); 201 - } 202 - 203 - /*S:010 204 - * We are getting close to the Switcher. 205 - * 206 - * Remember that each CPU has two pages which are visible to the Guest when it 207 - * runs on that CPU. This has to contain the state for that Guest: we copy the 208 - * state in just before we run the Guest. 209 - * 210 - * Each Guest has "changed" flags which indicate what has changed in the Guest 211 - * since it last ran. We saw this set in interrupts_and_traps.c and 212 - * segments.c. 213 - */ 214 - static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) 215 - { 216 - /* Copying all this data can be quite expensive. 
We usually run the 217 - * same Guest we ran last time (and that Guest hasn't run anywhere else 218 - * meanwhile). If that's not the case, we pretend everything in the 219 - * Guest has changed. */ 220 - if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { 221 - __get_cpu_var(last_guest) = lg; 222 - lg->last_pages = pages; 223 - lg->changed = CHANGED_ALL; 224 - } 225 - 226 - /* These copies are pretty cheap, so we do them unconditionally: */ 227 - /* Save the current Host top-level page directory. */ 228 - pages->state.host_cr3 = __pa(current->mm->pgd); 229 - /* Set up the Guest's page tables to see this CPU's pages (and no 230 - * other CPU's pages). */ 231 - map_switcher_in_guest(lg, pages); 232 - /* Set up the two "TSS" members which tell the CPU what stack to use 233 - * for traps which do directly into the Guest (ie. traps at privilege 234 - * level 1). */ 235 - pages->state.guest_tss.esp1 = lg->esp1; 236 - pages->state.guest_tss.ss1 = lg->ss1; 237 - 238 - /* Copy direct-to-Guest trap entries. */ 239 - if (lg->changed & CHANGED_IDT) 240 - copy_traps(lg, pages->state.guest_idt, default_idt_entries); 241 - 242 - /* Copy all GDT entries which the Guest can change. */ 243 - if (lg->changed & CHANGED_GDT) 244 - copy_gdt(lg, pages->state.guest_gdt); 245 - /* If only the TLS entries have changed, copy them. */ 246 - else if (lg->changed & CHANGED_GDT_TLS) 247 - copy_gdt_tls(lg, pages->state.guest_gdt); 248 - 249 - /* Mark the Guest as unchanged for next time. */ 250 - lg->changed = 0; 251 - } 252 - 253 - /* Finally: the code to actually call into the Switcher to run the Guest. */ 254 - static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) 255 - { 256 - /* This is a dummy value we need for GCC's sake. */ 257 - unsigned int clobber; 258 - 259 - /* Copy the guest-specific information into this CPU's "struct 260 - * lguest_pages". */ 261 - copy_in_guest_info(lg, pages); 262 - 263 - /* Set the trap number to 256 (impossible value). If we fault while 264 - * switching to the Guest (bad segment registers or bug), this will 265 - * cause us to abort the Guest. */ 266 - lg->regs->trapnum = 256; 267 - 268 - /* Now: we push the "eflags" register on the stack, then do an "lcall". 269 - * This is how we change from using the kernel code segment to using 270 - * the dedicated lguest code segment, as well as jumping into the 271 - * Switcher. 272 - * 273 - * The lcall also pushes the old code segment (KERNEL_CS) onto the 274 - * stack, then the address of this call. This stack layout happens to 275 - * exactly match the stack of an interrupt... */ 276 - asm volatile("pushf; lcall *lguest_entry" 277 - /* This is how we tell GCC that %eax ("a") and %ebx ("b") 278 - * are changed by this routine. The "=" means output. */ 279 - : "=a"(clobber), "=b"(clobber) 280 - /* %eax contains the pages pointer. ("0" refers to the 281 - * 0-th argument above, ie "a"). %ebx contains the 282 - * physical address of the Guest's top-level page 283 - * directory. */ 284 - : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) 285 - /* We tell gcc that all these registers could change, 286 - * which means we don't have to save and restore them in 287 - * the Switcher. */ 288 - : "memory", "%edx", "%ecx", "%edi", "%esi"); 289 361 } 290 362 /*:*/ 291 363 ··· 175 489 { 176 490 /* We stop running once the Guest is dead. */ 177 491 while (!lg->dead) { 178 - /* We need to initialize this, otherwise gcc complains. It's 179 - * not (yet) clever enough to see that it's initialized when we 180 - * need it. 
*/ 181 - unsigned int cr2 = 0; /* Damn gcc */ 492 + /* First we run any hypercalls the Guest wants done. */ 493 + if (lg->hcall) 494 + do_hypercalls(lg); 182 495 183 - /* First we run any hypercalls the Guest wants done: either in 184 - * the hypercall ring in "struct lguest_data", or directly by 185 - * using int 31 (LGUEST_TRAP_ENTRY). */ 186 - do_hypercalls(lg); 187 - /* It's possible the Guest did a SEND_DMA hypercall to the 496 + /* It's possible the Guest did a NOTIFY hypercall to the 188 497 * Launcher, in which case we return from the read() now. */ 189 - if (lg->dma_is_pending) { 190 - if (put_user(lg->pending_dma, user) || 191 - put_user(lg->pending_key, user+1)) 498 + if (lg->pending_notify) { 499 + if (put_user(lg->pending_notify, user)) 192 500 return -EFAULT; 193 - return sizeof(unsigned long)*2; 501 + return sizeof(lg->pending_notify); 194 502 } 195 503 196 504 /* Check for signals */ ··· 222 542 * the "Do Not Disturb" sign: */ 223 543 local_irq_disable(); 224 544 225 - /* Remember the awfully-named TS bit? If the Guest has asked 226 - * to set it we set it now, so we can trap and pass that trap 227 - * to the Guest if it uses the FPU. */ 228 - if (lg->ts) 229 - set_ts(); 230 - 231 - /* SYSENTER is an optimized way of doing system calls. We 232 - * can't allow it because it always jumps to privilege level 0. 233 - * A normal Guest won't try it because we don't advertise it in 234 - * CPUID, but a malicious Guest (or malicious Guest userspace 235 - * program) could, so we tell the CPU to disable it before 236 - * running the Guest. */ 237 - if (boot_cpu_has(X86_FEATURE_SEP)) 238 - wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 239 - 240 - /* Now we actually run the Guest. It will pop back out when 241 - * something interesting happens, and we can examine its 242 - * registers to see what it was doing. */ 243 - run_guest_once(lg, lguest_pages(raw_smp_processor_id())); 244 - 245 - /* The "regs" pointer contains two extra entries which are not 246 - * really registers: a trap number which says what interrupt or 247 - * trap made the switcher code come back, and an error code 248 - * which some traps set. */ 249 - 250 - /* If the Guest page faulted, then the cr2 register will tell 251 - * us the bad virtual address. We have to grab this now, 252 - * because once we re-enable interrupts an interrupt could 253 - * fault and thus overwrite cr2, or we could even move off to a 254 - * different CPU. */ 255 - if (lg->regs->trapnum == 14) 256 - cr2 = read_cr2(); 257 - /* Similarly, if we took a trap because the Guest used the FPU, 258 - * we have to restore the FPU it expects to see. */ 259 - else if (lg->regs->trapnum == 7) 260 - math_state_restore(); 261 - 262 - /* Restore SYSENTER if it's supposed to be on. */ 263 - if (boot_cpu_has(X86_FEATURE_SEP)) 264 - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 545 + /* Actually run the Guest until something happens. */ 546 + lguest_arch_run_guest(lg); 265 547 266 548 /* Now we're ready to be interrupted or moved to other CPUs */ 267 549 local_irq_enable(); 268 550 269 - /* OK, so what happened? */ 270 - switch (lg->regs->trapnum) { 271 - case 13: /* We've intercepted a GPF. */ 272 - /* Check if this was one of those annoying IN or OUT 273 - * instructions which we need to emulate. If so, we 274 - * just go back into the Guest after we've done it. */ 275 - if (lg->regs->errcode == 0) { 276 - if (emulate_insn(lg)) 277 - continue; 278 - } 279 - break; 280 - case 14: /* We've intercepted a page fault. 
*/ 281 - /* The Guest accessed a virtual address that wasn't 282 - * mapped. This happens a lot: we don't actually set 283 - * up most of the page tables for the Guest at all when 284 - * we start: as it runs it asks for more and more, and 285 - * we set them up as required. In this case, we don't 286 - * even tell the Guest that the fault happened. 287 - * 288 - * The errcode tells whether this was a read or a 289 - * write, and whether kernel or userspace code. */ 290 - if (demand_page(lg, cr2, lg->regs->errcode)) 291 - continue; 292 - 293 - /* OK, it's really not there (or not OK): the Guest 294 - * needs to know. We write out the cr2 value so it 295 - * knows where the fault occurred. 296 - * 297 - * Note that if the Guest were really messed up, this 298 - * could happen before it's done the INITIALIZE 299 - * hypercall, so lg->lguest_data will be NULL, so 300 - * &lg->lguest_data->cr2 will be address 8. Writing 301 - * into that address won't hurt the Host at all, 302 - * though. */ 303 - if (put_user(cr2, &lg->lguest_data->cr2)) 304 - kill_guest(lg, "Writing cr2"); 305 - break; 306 - case 7: /* We've intercepted a Device Not Available fault. */ 307 - /* If the Guest doesn't want to know, we already 308 - * restored the Floating Point Unit, so we just 309 - * continue without telling it. */ 310 - if (!lg->ts) 311 - continue; 312 - break; 313 - case 32 ... 255: 314 - /* These values mean a real interrupt occurred, in 315 - * which case the Host handler has already been run. 316 - * We just do a friendly check if another process 317 - * should now be run, then fall through to loop 318 - * around: */ 319 - cond_resched(); 320 - case LGUEST_TRAP_ENTRY: /* Handled at top of loop */ 321 - continue; 322 - } 323 - 324 - /* If we get here, it's a trap the Guest wants to know 325 - * about. */ 326 - if (deliver_trap(lg, lg->regs->trapnum)) 327 - continue; 328 - 329 - /* If the Guest doesn't have a handler (either it hasn't 330 - * registered any yet, or it's one of the faults we don't let 331 - * it handle), it dies with a cryptic error message. */ 332 - kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", 333 - lg->regs->trapnum, lg->regs->eip, 334 - lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode); 551 + /* Now we deal with whatever happened to the Guest. */ 552 + lguest_arch_handle_trap(lg); 335 553 } 554 + 336 555 /* The Guest is dead => "No such file or directory" */ 337 556 return -ENOENT; 338 - } 339 - 340 - /* Now we can look at each of the routines this calls, in increasing order of 341 - * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), 342 - * deliver_trap() and demand_page(). After all those, we'll be ready to 343 - * examine the Switcher, and our philosophical understanding of the Host/Guest 344 - * duality will be complete. :*/ 345 - 346 - int find_free_guest(void) 347 - { 348 - unsigned int i; 349 - for (i = 0; i < MAX_LGUEST_GUESTS; i++) 350 - if (!lguests[i].tsk) 351 - return i; 352 - return -1; 353 - } 354 - 355 - static void adjust_pge(void *on) 356 - { 357 - if (on) 358 - write_cr4(read_cr4() | X86_CR4_PGE); 359 - else 360 - write_cr4(read_cr4() & ~X86_CR4_PGE); 361 557 } 362 558 363 559 /*H:000 ··· 257 701 /* First we put the Switcher up in very high virtual memory. */ 258 702 err = map_switcher(); 259 703 if (err) 260 - return err; 704 + goto out; 261 705 262 706 /* Now we set up the pagetable implementation for the Guests. 
*/ 263 707 err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); 264 - if (err) { 265 - unmap_switcher(); 266 - return err; 267 - } 708 + if (err) 709 + goto unmap; 268 710 269 - /* The I/O subsystem needs some things initialized. */ 270 - lguest_io_init(); 711 + /* We might need to reserve an interrupt vector. */ 712 + err = init_interrupts(); 713 + if (err) 714 + goto free_pgtables; 271 715 272 716 /* /dev/lguest needs to be registered. */ 273 717 err = lguest_device_init(); 274 - if (err) { 275 - free_pagetables(); 276 - unmap_switcher(); 277 - return err; 278 - } 718 + if (err) 719 + goto free_interrupts; 279 720 280 - /* Finally, we need to turn off "Page Global Enable". PGE is an 281 - * optimization where page table entries are specially marked to show 282 - * they never change. The Host kernel marks all the kernel pages this 283 - * way because it's always present, even when userspace is running. 284 - * 285 - * Lguest breaks this: unbeknownst to the rest of the Host kernel, we 286 - * switch to the Guest kernel. If you don't disable this on all CPUs, 287 - * you'll get really weird bugs that you'll chase for two days. 288 - * 289 - * I used to turn PGE off every time we switched to the Guest and back 290 - * on when we return, but that slowed the Switcher down noticibly. */ 291 - 292 - /* We don't need the complexity of CPUs coming and going while we're 293 - * doing this. */ 294 - lock_cpu_hotplug(); 295 - if (cpu_has_pge) { /* We have a broader idea of "global". */ 296 - /* Remember that this was originally set (for cleanup). */ 297 - cpu_had_pge = 1; 298 - /* adjust_pge is a helper function which sets or unsets the PGE 299 - * bit on its CPU, depending on the argument (0 == unset). */ 300 - on_each_cpu(adjust_pge, (void *)0, 0, 1); 301 - /* Turn off the feature in the global feature set. */ 302 - clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); 303 - } 304 - unlock_cpu_hotplug(); 721 + /* Finally we do some architecture-specific setup. */ 722 + lguest_arch_host_init(); 305 723 306 724 /* All good! */ 307 725 return 0; 726 + 727 + free_interrupts: 728 + free_interrupts(); 729 + free_pgtables: 730 + free_pagetables(); 731 + unmap: 732 + unmap_switcher(); 733 + out: 734 + return err; 308 735 } 309 736 310 737 /* Cleaning up is just the same code, backwards. With a little French. */ 311 738 static void __exit fini(void) 312 739 { 313 740 lguest_device_remove(); 741 + free_interrupts(); 314 742 free_pagetables(); 315 743 unmap_switcher(); 316 744 317 - /* If we had PGE before we started, turn it back on now. */ 318 - lock_cpu_hotplug(); 319 - if (cpu_had_pge) { 320 - set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); 321 - /* adjust_pge's argument "1" means set PGE. */ 322 - on_each_cpu(adjust_pge, (void *)1, 0, 1); 323 - } 324 - unlock_cpu_hotplug(); 745 + lguest_arch_host_fini(); 325 746 } 747 + /*:*/ 326 748 327 749 /* The Host side of lguest can be a module. This is a nice way for people to 328 750 * play with it. */
+58 -119
drivers/lguest/hypercalls.c
··· 25 25 #include <linux/mm.h> 26 26 #include <asm/page.h> 27 27 #include <asm/pgtable.h> 28 - #include <irq_vectors.h> 29 28 #include "lg.h" 30 29 31 - /*H:120 This is the core hypercall routine: where the Guest gets what it 32 - * wants. Or gets killed. Or, in the case of LHCALL_CRASH, both. 33 - * 34 - * Remember from the Guest: %eax == which call to make, and the arguments are 35 - * packed into %edx, %ebx and %ecx if needed. */ 36 - static void do_hcall(struct lguest *lg, struct lguest_regs *regs) 30 + /*H:120 This is the core hypercall routine: where the Guest gets what it wants. 31 + * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ 32 + static void do_hcall(struct lguest *lg, struct hcall_args *args) 37 33 { 38 - switch (regs->eax) { 34 + switch (args->arg0) { 39 35 case LHCALL_FLUSH_ASYNC: 40 36 /* This call does nothing, except by breaking out of the Guest 41 37 * it makes us process all the asynchronous hypercalls. */ ··· 47 51 char msg[128]; 48 52 /* If the lgread fails, it will call kill_guest() itself; the 49 53 * kill_guest() with the message will be ignored. */ 50 - lgread(lg, msg, regs->edx, sizeof(msg)); 54 + __lgread(lg, msg, args->arg1, sizeof(msg)); 51 55 msg[sizeof(msg)-1] = '\0'; 52 56 kill_guest(lg, "CRASH: %s", msg); 53 57 break; ··· 55 59 case LHCALL_FLUSH_TLB: 56 60 /* FLUSH_TLB comes in two flavors, depending on the 57 61 * argument: */ 58 - if (regs->edx) 62 + if (args->arg1) 59 63 guest_pagetable_clear_all(lg); 60 64 else 61 65 guest_pagetable_flush_user(lg); 62 66 break; 63 - case LHCALL_BIND_DMA: 64 - /* BIND_DMA really wants four arguments, but it's the only call 65 - * which does. So the Guest packs the number of buffers and 66 - * the interrupt number into the final argument, and we decode 67 - * it here. This can legitimately fail, since we currently 68 - * place a limit on the number of DMA pools a Guest can have. 69 - * So we return true or false from this call. */ 70 - regs->eax = bind_dma(lg, regs->edx, regs->ebx, 71 - regs->ecx >> 8, regs->ecx & 0xFF); 72 - break; 73 67 74 68 /* All these calls simply pass the arguments through to the right 75 69 * routines. */ 76 - case LHCALL_SEND_DMA: 77 - send_dma(lg, regs->edx, regs->ebx); 78 - break; 79 - case LHCALL_LOAD_GDT: 80 - load_guest_gdt(lg, regs->edx, regs->ebx); 81 - break; 82 - case LHCALL_LOAD_IDT_ENTRY: 83 - load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx); 84 - break; 85 70 case LHCALL_NEW_PGTABLE: 86 - guest_new_pagetable(lg, regs->edx); 71 + guest_new_pagetable(lg, args->arg1); 87 72 break; 88 73 case LHCALL_SET_STACK: 89 - guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx); 74 + guest_set_stack(lg, args->arg1, args->arg2, args->arg3); 90 75 break; 91 76 case LHCALL_SET_PTE: 92 - guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx)); 77 + guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3)); 93 78 break; 94 79 case LHCALL_SET_PMD: 95 - guest_set_pmd(lg, regs->edx, regs->ebx); 96 - break; 97 - case LHCALL_LOAD_TLS: 98 - guest_load_tls(lg, regs->edx); 80 + guest_set_pmd(lg, args->arg1, args->arg2); 99 81 break; 100 82 case LHCALL_SET_CLOCKEVENT: 101 - guest_set_clockevent(lg, regs->edx); 83 + guest_set_clockevent(lg, args->arg1); 102 84 break; 103 - 104 85 case LHCALL_TS: 105 86 /* This sets the TS flag, as we saw used in run_guest(). */ 106 - lg->ts = regs->edx; 87 + lg->ts = args->arg1; 107 88 break; 108 89 case LHCALL_HALT: 109 90 /* Similarly, this sets the halted flag for run_guest(). 
*/ 110 91 lg->halted = 1; 111 92 break; 93 + case LHCALL_NOTIFY: 94 + lg->pending_notify = args->arg1; 95 + break; 112 96 default: 113 - kill_guest(lg, "Bad hypercall %li\n", regs->eax); 97 + if (lguest_arch_do_hcall(lg, args)) 98 + kill_guest(lg, "Bad hypercall %li\n", args->arg0); 114 99 } 115 100 } 101 + /*:*/ 116 102 117 - /* Asynchronous hypercalls are easy: we just look in the array in the Guest's 118 - * "struct lguest_data" and see if there are any new ones marked "ready". 103 + /*H:124 Asynchronous hypercalls are easy: we just look in the array in the 104 + * Guest's "struct lguest_data" to see if any new ones are marked "ready". 119 105 * 120 106 * We are careful to do these in order: obviously we respect the order the 121 107 * Guest put them in the ring, but we also promise the Guest that they will ··· 112 134 if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) 113 135 return; 114 136 115 - 116 137 /* We process "struct lguest_data"s hcalls[] ring once. */ 117 138 for (i = 0; i < ARRAY_SIZE(st); i++) { 118 - struct lguest_regs regs; 139 + struct hcall_args args; 119 140 /* We remember where we were up to from last time. This makes 120 141 * sure that the hypercalls are done in the order the Guest 121 142 * places them in the ring. */ ··· 129 152 if (++lg->next_hcall == LHCALL_RING_SIZE) 130 153 lg->next_hcall = 0; 131 154 132 - /* We copy the hypercall arguments into a fake register 133 - * structure. This makes life simple for do_hcall(). */ 134 - if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax) 135 - || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx) 136 - || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx) 137 - || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) { 155 + /* Copy the hypercall arguments into a local copy of 156 + * the hcall_args struct. */ 157 + if (copy_from_user(&args, &lg->lguest_data->hcalls[n], 158 + sizeof(struct hcall_args))) { 138 159 kill_guest(lg, "Fetching async hypercalls"); 139 160 break; 140 161 } 141 162 142 163 /* Do the hypercall, same as a normal one. */ 143 - do_hcall(lg, &regs); 164 + do_hcall(lg, &args); 144 165 145 166 /* Mark the hypercall done. */ 146 167 if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { ··· 146 171 break; 147 172 } 148 173 149 - /* Stop doing hypercalls if we've just done a DMA to the 150 - * Launcher: it needs to service this first. */ 151 - if (lg->dma_is_pending) 174 + /* Stop doing hypercalls if they want to notify the Launcher: 175 + * it needs to service this first. */ 176 + if (lg->pending_notify) 152 177 break; 153 178 } 154 179 } ··· 157 182 * Guest makes a hypercall, we end up here to set things up: */ 158 183 static void initialize(struct lguest *lg) 159 184 { 160 - u32 tsc_speed; 161 185 162 186 /* You can't do anything until you're initialized. The Guest knows the 163 187 * rules, so we're unforgiving here. */ 164 - if (lg->regs->eax != LHCALL_LGUEST_INIT) { 165 - kill_guest(lg, "hypercall %li before LGUEST_INIT", 166 - lg->regs->eax); 188 + if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { 189 + kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0); 167 190 return; 168 191 } 169 192 170 - /* We insist that the Time Stamp Counter exist and doesn't change with 171 - * cpu frequency. Some devious chip manufacturers decided that TSC 172 - * changes could be handled in software. I decided that time going 173 - * backwards might be good for benchmarks, but it's bad for users. 
174 - * 175 - * We also insist that the TSC be stable: the kernel detects unreliable 176 - * TSCs for its own purposes, and we use that here. */ 177 - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) 178 - tsc_speed = tsc_khz; 179 - else 180 - tsc_speed = 0; 181 - 182 - /* The pointer to the Guest's "struct lguest_data" is the only 183 - * argument. */ 184 - lg->lguest_data = (struct lguest_data __user *)lg->regs->edx; 185 - /* If we check the address they gave is OK now, we can simply 186 - * copy_to_user/from_user from now on rather than using lgread/lgwrite. 187 - * I put this in to show that I'm not immune to writing stupid 188 - * optimizations. */ 189 - if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) { 193 + if (lguest_arch_init_hypercalls(lg)) 190 194 kill_guest(lg, "bad guest page %p", lg->lguest_data); 191 - return; 192 - } 195 + 193 196 /* The Guest tells us where we're not to deliver interrupts by putting 194 197 * the range of addresses into "struct lguest_data". */ 195 198 if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) 196 - || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) 197 - /* We tell the Guest that it can't use the top 4MB of virtual 198 - * addresses used by the Switcher. */ 199 - || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) 200 - || put_user(tsc_speed, &lg->lguest_data->tsc_khz) 201 - /* We also give the Guest a unique id, as used in lguest_net.c. */ 202 - || put_user(lg->guestid, &lg->lguest_data->guestid)) 199 + || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) 203 200 kill_guest(lg, "bad guest page %p", lg->lguest_data); 204 201 205 202 /* We write the current time into the Guest's data page once now. */ 206 203 write_timestamp(lg); 204 + 205 + /* page_tables.c will also do some setup. */ 206 + page_table_guest_data_init(lg); 207 207 208 208 /* This is the one case where the above accesses might have been the 209 209 * first write to a Guest page. This may have caused a copy-on-write 210 210 * fault, but the Guest might be referring to the old (read-only) 211 211 * page. */ 212 212 guest_pagetable_clear_all(lg); 213 - } 214 - /* Now we've examined the hypercall code; our Guest can make requests. There 215 - * is one other way we can do things for the Guest, as we see in 216 - * emulate_insn(). */ 217 - 218 - /*H:110 Tricky point: we mark the hypercall as "done" once we've done it. 219 - * Normally we don't need to do this: the Guest will run again and update the 220 - * trap number before we come back around the run_guest() loop to 221 - * do_hypercalls(). 222 - * 223 - * However, if we are signalled or the Guest sends DMA to the Launcher, that 224 - * loop will exit without running the Guest. When it comes back it would try 225 - * to re-run the hypercall. */ 226 - static void clear_hcall(struct lguest *lg) 227 - { 228 - lg->regs->trapnum = 255; 229 213 } 230 214 231 215 /*H:100 ··· 195 261 */ 196 262 void do_hypercalls(struct lguest *lg) 197 263 { 198 - /* Not initialized yet? */ 264 + /* Not initialized yet? This hypercall must do it. */ 199 265 if (unlikely(!lg->lguest_data)) { 200 - /* Did the Guest make a hypercall? We might have come back for 201 - * some other reason (an interrupt, a different trap). */ 202 - if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) { 203 - /* Set up the "struct lguest_data" */ 204 - initialize(lg); 205 - /* The hypercall is done. */ 206 - clear_hcall(lg); 207 - } 266 + /* Set up the "struct lguest_data" */ 267 + initialize(lg); 268 + /* Hcall is done. 
*/ 269 + lg->hcall = NULL; 208 270 return; 209 271 } 210 272 ··· 210 280 do_async_hcalls(lg); 211 281 212 282 /* If we stopped reading the hypercall ring because the Guest did a 213 - * SEND_DMA to the Launcher, we want to return now. Otherwise if the 214 - * Guest asked us to do a hypercall, we do it. */ 215 - if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) { 216 - do_hcall(lg, lg->regs); 217 - /* The hypercall is done. */ 218 - clear_hcall(lg); 283 + * NOTIFY to the Launcher, we want to return now. Otherwise we do 284 + * the hypercall. */ 285 + if (!lg->pending_notify) { 286 + do_hcall(lg, lg->hcall); 287 + /* Tricky point: we reset the hcall pointer to mark the 288 + * hypercall as "done". We use the hcall pointer rather than 289 + * the trap number to indicate a hypercall is pending. 290 + * Normally it doesn't matter: the Guest will run again and 291 + * update the trap number before we come back here. 292 + * 293 + * However, if we are signalled or the Guest sends a NOTIFY to the 294 + * Launcher, the run_guest() loop will exit without running the 295 + * Guest. When it comes back it would try to re-run the 296 + * hypercall. */ 297 + lg->hcall = NULL; 219 298 } 220 299 } 221 300 ··· 234 295 { 235 296 struct timespec now; 236 297 ktime_get_real_ts(&now); 237 - if (put_user(now, &lg->lguest_data->time)) 298 + if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec))) 238 299 kill_guest(lg, "Writing timestamp"); 239 300 }
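The ring discipline in do_async_hcalls() deserves spelling out: a status byte of 0xFF marks a slot as free, the Guest fills in the arguments and then flips the status to 0 ("ready"), and the Host consumes slots strictly in order from its next_hcall cursor, writing 0xFF back when done. A condensed sketch of the Host side, with the copy_from_user()/put_user() plumbing elided (status[] and hcalls[] here stand in for the fields of "struct lguest_data"):

        unsigned int i;

        for (i = 0; i < LHCALL_RING_SIZE; i++) {
                unsigned int n = next_hcall;            /* where we got up to */

                if (status[n] == 0xFF)                  /* slot free: ring is drained */
                        break;
                if (++next_hcall == LHCALL_RING_SIZE)   /* wrap the cursor */
                        next_hcall = 0;

                do_hcall(lg, &hcalls[n]);               /* same path as a trap-based call */
                status[n] = 0xFF;                       /* hand the slot back to the Guest */
        }

The strict in-order consumption is what lets the Guest batch calls with lazy_hcall() and still reason about what state the Host has seen.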
+83 -42
drivers/lguest/interrupts_and_traps.c
··· 12 12 * them first, so we also have a way of "reflecting" them into the Guest as if 13 13 * they had been delivered to it directly. :*/ 14 14 #include <linux/uaccess.h> 15 + #include <linux/interrupt.h> 16 + #include <linux/module.h> 15 17 #include "lg.h" 18 + 19 + /* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */ 20 + static unsigned int syscall_vector = SYSCALL_VECTOR; 21 + module_param(syscall_vector, uint, 0444); 16 22 17 23 /* The address of the interrupt handler is split into two bits: */ 18 24 static unsigned long idt_address(u32 lo, u32 hi) ··· 45 39 { 46 40 /* Stack grows downwards: move stack then write value. */ 47 41 *gstack -= 4; 48 - lgwrite_u32(lg, *gstack, val); 42 + lgwrite(lg, *gstack, u32, val); 49 43 } 50 44 51 45 /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or ··· 62 56 * it). */ 63 57 static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) 64 58 { 65 - unsigned long gstack; 59 + unsigned long gstack, origstack; 66 60 u32 eflags, ss, irq_enable; 61 + unsigned long virtstack; 67 62 68 63 /* There are two cases for interrupts: one where the Guest is already 69 64 * in the kernel, and a more complex one where the Guest is in ··· 72 65 if ((lg->regs->ss&0x3) != GUEST_PL) { 73 66 /* The Guest told us their kernel stack with the SET_STACK 74 67 * hypercall: both the virtual address and the segment */ 75 - gstack = guest_pa(lg, lg->esp1); 68 + virtstack = lg->esp1; 76 69 ss = lg->ss1; 70 + 71 + origstack = gstack = guest_pa(lg, virtstack); 77 72 /* We push the old stack segment and pointer onto the new 78 73 * stack: when the Guest does an "iret" back from the interrupt 79 74 * handler the CPU will notice they're dropping privilege ··· 84 75 push_guest_stack(lg, &gstack, lg->regs->esp); 85 76 } else { 86 77 /* We're staying on the same Guest (kernel) stack. */ 87 - gstack = guest_pa(lg, lg->regs->esp); 78 + virtstack = lg->regs->esp; 88 79 ss = lg->regs->ss; 80 + 81 + origstack = gstack = guest_pa(lg, virtstack); 89 82 } 90 83 91 84 /* Remember that we never let the Guest actually disable interrupts, so ··· 113 102 /* Now we've pushed all the old state, we change the stack, the code 114 103 * segment and the address to execute. */ 115 104 lg->regs->ss = ss; 116 - lg->regs->esp = gstack + lg->page_offset; 105 + lg->regs->esp = virtstack + (gstack - origstack); 117 106 lg->regs->cs = (__KERNEL_CS|GUEST_PL); 118 107 lg->regs->eip = idt_address(lo, hi); 119 108 ··· 176 165 /* Look at the IDT entry the Guest gave us for this interrupt. The 177 166 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip 178 167 * over them. */ 179 - idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; 168 + idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; 180 169 /* If they don't have a handler (yet?), we just ignore it */ 181 170 if (idt_present(idt->a, idt->b)) { 182 171 /* OK, mark it no longer pending and deliver it. */ ··· 194 183 * timer interrupt. */ 195 184 write_timestamp(lg); 196 185 } 186 + /*:*/ 187 + 188 + /* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent 189 + * me a patch, so we support that too. It'd be a big step for lguest if half 190 + * the Plan 9 user base were to start using it. 191 + * 192 + * Actually now I think of it, it's possible that Ron *is* half the Plan 9 193 + * userbase. Oh well. */ 194 + static bool could_be_syscall(unsigned int num) 195 + { 196 + /* Normal Linux SYSCALL_VECTOR or reserved vector?
*/ 197 + return num == SYSCALL_VECTOR || num == syscall_vector; 198 + } 199 + 200 + /* The syscall vector it wants must be unused by Host. */ 201 + bool check_syscall_vector(struct lguest *lg) 202 + { 203 + u32 vector; 204 + 205 + if (get_user(vector, &lg->lguest_data->syscall_vec)) 206 + return false; 207 + 208 + return could_be_syscall(vector); 209 + } 210 + 211 + int init_interrupts(void) 212 + { 213 + /* If they want some strange system call vector, reserve it now */ 214 + if (syscall_vector != SYSCALL_VECTOR 215 + && test_and_set_bit(syscall_vector, used_vectors)) { 216 + printk("lg: couldn't reserve syscall %u\n", syscall_vector); 217 + return -EBUSY; 218 + } 219 + return 0; 220 + } 221 + 222 + void free_interrupts(void) 223 + { 224 + if (syscall_vector != SYSCALL_VECTOR) 225 + clear_bit(syscall_vector, used_vectors); 226 + } 197 227 198 228 /*H:220 Now we've got the routines to deliver interrupts, delivering traps 199 229 * like page fault is easy. The only trick is that Intel decided that some ··· 249 197 { 250 198 /* Trap numbers are always 8 bit, but we set an impossible trap number 251 199 * for traps inside the Switcher, so check that here. */ 252 - if (num >= ARRAY_SIZE(lg->idt)) 200 + if (num >= ARRAY_SIZE(lg->arch.idt)) 253 201 return 0; 254 202 255 203 /* Early on the Guest hasn't set the IDT entries (or maybe it put a 256 204 * bogus one in): if we fail here, the Guest will be killed. */ 257 - if (!idt_present(lg->idt[num].a, lg->idt[num].b)) 205 + if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) 258 206 return 0; 259 - set_guest_interrupt(lg, lg->idt[num].a, lg->idt[num].b, has_err(num)); 207 + set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, has_err(num)); 260 208 return 1; 261 209 } 262 210 ··· 270 218 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all 271 219 * the other hypervisors would tease it. 272 220 * 273 - * This routine determines if a trap can be delivered directly. */ 274 - static int direct_trap(const struct lguest *lg, 275 - const struct desc_struct *trap, 276 - unsigned int num) 221 + * This routine indicates if a particular trap number could be delivered 222 + * directly. */ 223 + static int direct_trap(unsigned int num) 277 224 { 278 225 /* Hardware interrupts don't go to the Guest at all (except system 279 226 * call). */ 280 - if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR) 227 + if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) 281 228 return 0; 282 229 283 230 /* The Host needs to see page faults (for shadow paging and to save the 284 231 * fault address), general protection faults (in/out emulation) and 285 232 * device not available (TS handling), and of course, the hypercall 286 233 * trap. */ 287 - if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY) 288 - return 0; 289 - 290 - /* Only trap gates (type 15) can go direct to the Guest. Interrupt 291 - * gates (type 14) disable interrupts as they are entered, which we 292 - * never let the Guest do. Not present entries (type 0x0) also can't 293 - * go direct, of course 8) */ 294 - return idt_type(trap->a, trap->b) == 0xF; 234 + return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY; 295 235 } 296 236 /*:*/ 297 237 ··· 392 348 * to copy this again. */ 393 349 lg->changed |= CHANGED_IDT; 394 350 395 - /* The IDT which we keep in "struct lguest" only contains 32 entries 396 - * for the traps and LGUEST_IRQS (32) entries for interrupts. 
We 397 - * ignore attempts to set handlers for higher interrupt numbers, except 398 - * for the system call "interrupt" at 128: we have a special IDT entry 399 - * for that. */ 400 - if (num < ARRAY_SIZE(lg->idt)) 401 - set_trap(lg, &lg->idt[num], num, lo, hi); 402 - else if (num == SYSCALL_VECTOR) 403 - set_trap(lg, &lg->syscall_idt, num, lo, hi); 351 + /* Check that the Guest doesn't try to step outside the bounds. */ 352 + if (num >= ARRAY_SIZE(lg->arch.idt)) 353 + kill_guest(lg, "Setting idt entry %u", num); 354 + else 355 + set_trap(lg, &lg->arch.idt[num], num, lo, hi); 404 356 } 405 357 406 358 /* The default entry for each interrupt points into the Switcher routines which ··· 439 399 440 400 /* We can simply copy the direct traps, otherwise we use the default 441 401 * ones in the Switcher: they will return to the Host. */ 442 - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { 443 - if (direct_trap(lg, &lg->idt[i], i)) 444 - idt[i] = lg->idt[i]; 402 + for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) { 403 + /* If no Guest can ever override this trap, leave it alone. */ 404 + if (!direct_trap(i)) 405 + continue; 406 + 407 + /* Only trap gates (type 15) can go direct to the Guest. 408 + * Interrupt gates (type 14) disable interrupts as they are 409 + * entered, which we never let the Guest do. Not present 410 + * entries (type 0x0) also can't go direct, of course. */ 411 + if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF) 412 + idt[i] = lg->arch.idt[i]; 445 413 else 414 + /* Reset it to the default. */ 446 415 default_idt_entry(&idt[i], i, def[i]); 447 416 } 448 - 449 - /* Don't forget the system call trap! The IDT entries for other 450 - * interupts never change, so no need to copy them. */ 451 - i = SYSCALL_VECTOR; 452 - if (direct_trap(lg, &lg->syscall_idt, i)) 453 - idt[i] = lg->syscall_idt; 454 - else 455 - default_idt_entry(&idt[i], i, def[i]); 456 417 } 457 418 458 419 void guest_set_clockevent(struct lguest *lg, unsigned long delta)
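All of the idt_present()/idt_type()/idt_address() tests in this file decode the standard i386 gate descriptor, in which the handler offset is split across the two 32-bit words and the present bit and 4-bit type are packed into the second word. A sketch of that decoding, assuming the usual i386 layout (the example_* names are illustrative; the real helpers live at the top of this file):

        /* lo = selector(31:16) | offset(15:0)
         * hi = offset(31:16) | P(15) | DPL(14:13) | type(11:8) */
        static unsigned long example_idt_address(u32 lo, u32 hi)
        {
                return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
        }

        static unsigned int example_idt_type(u32 lo, u32 hi)
        {
                return (hi >> 8) & 0xF; /* 0xF = trap gate, 0xE = interrupt gate */
        }

        static bool example_idt_present(u32 lo, u32 hi)
        {
                return hi & 0x8000;     /* the P bit */
        }

This is why copy_traps() tests for type 0xF: trap gates leave interrupts enabled on entry, which is the only behaviour the Host is willing to hand straight to the Guest.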
-626
drivers/lguest/io.c
··· 1 - /*P:300 The I/O mechanism in lguest is simple yet flexible, allowing the Guest 2 - * to talk to the Launcher or directly to another Guest. It uses familiar 3 - * concepts of DMA and interrupts, plus some neat code stolen from 4 - * futexes... :*/ 5 - 6 - /* Copyright (C) 2006 Rusty Russell IBM Corporation 7 - * 8 - * This program is free software; you can redistribute it and/or modify 9 - * it under the terms of the GNU General Public License as published by 10 - * the Free Software Foundation; either version 2 of the License, or 11 - * (at your option) any later version. 12 - * 13 - * This program is distributed in the hope that it will be useful, 14 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 - * GNU General Public License for more details. 17 - * 18 - * You should have received a copy of the GNU General Public License 19 - * along with this program; if not, write to the Free Software 20 - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 21 - */ 22 - #include <linux/types.h> 23 - #include <linux/futex.h> 24 - #include <linux/jhash.h> 25 - #include <linux/mm.h> 26 - #include <linux/highmem.h> 27 - #include <linux/uaccess.h> 28 - #include "lg.h" 29 - 30 - /*L:300 31 - * I/O 32 - * 33 - * Getting data in and out of the Guest is quite an art. There are numerous 34 - * ways to do it, and they all suck differently. We try to keep things fairly 35 - * close to "real" hardware so our Guest's drivers don't look like an alien 36 - * visitation in the middle of the Linux code, and yet make sure that Guests 37 - * can talk directly to other Guests, not just the Launcher. 38 - * 39 - * To do this, the Guest gives us a key when it binds or sends DMA buffers. 40 - * The key corresponds to a "physical" address inside the Guest (ie. a virtual 41 - * address inside the Launcher process). We don't, however, use this key 42 - * directly. 43 - * 44 - * We want Guests which share memory to be able to DMA to each other: two 45 - * Launchers can mmap memory the same file, then the Guests can communicate. 46 - * Fortunately, the futex code provides us with a way to get a "union 47 - * futex_key" corresponding to the memory lying at a virtual address: if the 48 - * two processes share memory, the "union futex_key" for that memory will match 49 - * even if the memory is mapped at different addresses in each. So we always 50 - * convert the keys to "union futex_key"s to compare them. 51 - * 52 - * Before we dive into this though, we need to look at another set of helper 53 - * routines used throughout the Host kernel code to access Guest memory. 54 - :*/ 55 - static struct list_head dma_hash[61]; 56 - 57 - /* An unfortunate side effect of the Linux double-linked list implementation is 58 - * that there's no good way to statically initialize an array of linked 59 - * lists. */ 60 - void lguest_io_init(void) 61 - { 62 - unsigned int i; 63 - 64 - for (i = 0; i < ARRAY_SIZE(dma_hash); i++) 65 - INIT_LIST_HEAD(&dma_hash[i]); 66 - } 67 - 68 - /* FIXME: allow multi-page lengths. */ 69 - static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma) 70 - { 71 - unsigned int i; 72 - 73 - for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { 74 - if (!dma->len[i]) 75 - return 1; 76 - if (!lguest_address_ok(lg, dma->addr[i], dma->len[i])) 77 - goto kill; 78 - if (dma->len[i] > PAGE_SIZE) 79 - goto kill; 80 - /* We could do over a page, but is it worth it? 
*/ 81 - if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE) 82 - goto kill; 83 - } 84 - return 1; 85 - 86 - kill: 87 - kill_guest(lg, "bad DMA entry: %u@%#lx", dma->len[i], dma->addr[i]); 88 - return 0; 89 - } 90 - 91 - /*L:330 This is our hash function, using the wonderful Jenkins hash. 92 - * 93 - * The futex key is a union with three parts: an unsigned long word, a pointer, 94 - * and an int "offset". We could use jhash_2words() which takes three u32s. 95 - * (Ok, the hash functions are great: the naming sucks though). 96 - * 97 - * It's nice to be portable to 64-bit platforms, so we use the more generic 98 - * jhash2(), which takes an array of u32, the number of u32s, and an initial 99 - * u32 to roll in. This is uglier, but breaks down to almost the same code on 100 - * 32-bit platforms like this one. 101 - * 102 - * We want a position in the array, so we modulo ARRAY_SIZE(dma_hash) (ie. 61). 103 - */ 104 - static unsigned int hash(const union futex_key *key) 105 - { 106 - return jhash2((u32*)&key->both.word, 107 - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 108 - key->both.offset) 109 - % ARRAY_SIZE(dma_hash); 110 - } 111 - 112 - /* This is a convenience routine to compare two keys. It's a much bemoaned C 113 - * weakness that it doesn't allow '==' on structures or unions, so we have to 114 - * open-code it like this. */ 115 - static inline int key_eq(const union futex_key *a, const union futex_key *b) 116 - { 117 - return (a->both.word == b->both.word 118 - && a->both.ptr == b->both.ptr 119 - && a->both.offset == b->both.offset); 120 - } 121 - 122 - /*L:360 OK, when we need to actually free up a Guest's DMA array we do several 123 - * things, so we have a convenient function to do it. 124 - * 125 - * The caller must hold a read lock on dmainfo owner's current->mm->mmap_sem 126 - * for the drop_futex_key_refs(). */ 127 - static void unlink_dma(struct lguest_dma_info *dmainfo) 128 - { 129 - /* You locked this too, right? */ 130 - BUG_ON(!mutex_is_locked(&lguest_lock)); 131 - /* This is how we know that the entry is free. */ 132 - dmainfo->interrupt = 0; 133 - /* Remove it from the hash table. */ 134 - list_del(&dmainfo->list); 135 - /* Drop the references we were holding (to the inode or mm). */ 136 - drop_futex_key_refs(&dmainfo->key); 137 - } 138 - 139 - /*L:350 This is the routine which we call when the Guest asks to unregister a 140 - * DMA array attached to a given key. Returns true if the array was found. */ 141 - static int unbind_dma(struct lguest *lg, 142 - const union futex_key *key, 143 - unsigned long dmas) 144 - { 145 - int i, ret = 0; 146 - 147 - /* We don't bother with the hash table, just look through all this 148 - * Guest's DMA arrays. */ 149 - for (i = 0; i < LGUEST_MAX_DMA; i++) { 150 - /* In theory it could have more than one array on the same key, 151 - * or one array on multiple keys, so we check both */ 152 - if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { 153 - unlink_dma(&lg->dma[i]); 154 - ret = 1; 155 - break; 156 - } 157 - } 158 - return ret; 159 - } 160 - 161 - /*L:340 BIND_DMA: this is the hypercall which sets up an array of "struct 162 - * lguest_dma" for receiving I/O. 163 - * 164 - * The Guest wants to bind an array of "struct lguest_dma"s to a particular key 165 - * to receive input. This only happens when the Guest is setting up a new 166 - * device, so it doesn't have to be very fast. 167 - * 168 - * It returns 1 on a successful registration (it can fail if we hit the limit 169 - * of registrations for this Guest). 
170 - */ 171 - int bind_dma(struct lguest *lg, 172 - unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt) 173 - { 174 - unsigned int i; 175 - int ret = 0; 176 - union futex_key key; 177 - /* Futex code needs the mmap_sem. */ 178 - struct rw_semaphore *fshared = &current->mm->mmap_sem; 179 - 180 - /* Invalid interrupt? (We could kill the guest here). */ 181 - if (interrupt >= LGUEST_IRQS) 182 - return 0; 183 - 184 - /* We need to grab the Big Lguest Lock, because other Guests may be 185 - * trying to look through this Guest's DMAs to send something while 186 - * we're doing this. */ 187 - mutex_lock(&lguest_lock); 188 - down_read(fshared); 189 - if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { 190 - kill_guest(lg, "bad dma key %#lx", ukey); 191 - goto unlock; 192 - } 193 - 194 - /* We want to keep this key valid once we drop mmap_sem, so we have to 195 - * hold a reference. */ 196 - get_futex_key_refs(&key); 197 - 198 - /* If the Guest specified an interrupt of 0, that means they want to 199 - * unregister this array of "struct lguest_dma"s. */ 200 - if (interrupt == 0) 201 - ret = unbind_dma(lg, &key, dmas); 202 - else { 203 - /* Look through this Guest's dma array for an unused entry. */ 204 - for (i = 0; i < LGUEST_MAX_DMA; i++) { 205 - /* If the interrupt is non-zero, the entry is already 206 - * used. */ 207 - if (lg->dma[i].interrupt) 208 - continue; 209 - 210 - /* OK, a free one! Fill on our details. */ 211 - lg->dma[i].dmas = dmas; 212 - lg->dma[i].num_dmas = numdmas; 213 - lg->dma[i].next_dma = 0; 214 - lg->dma[i].key = key; 215 - lg->dma[i].guestid = lg->guestid; 216 - lg->dma[i].interrupt = interrupt; 217 - 218 - /* Now we add it to the hash table: the position 219 - * depends on the futex key that we got. */ 220 - list_add(&lg->dma[i].list, &dma_hash[hash(&key)]); 221 - /* Success! */ 222 - ret = 1; 223 - goto unlock; 224 - } 225 - } 226 - /* If we didn't find a slot to put the key in, drop the reference 227 - * again. */ 228 - drop_futex_key_refs(&key); 229 - unlock: 230 - /* Unlock and out. */ 231 - up_read(fshared); 232 - mutex_unlock(&lguest_lock); 233 - return ret; 234 - } 235 - 236 - /*L:385 Note that our routines to access a different Guest's memory are called 237 - * lgread_other() and lgwrite_other(): these names emphasize that they are only 238 - * used when the Guest is *not* the current Guest. 239 - * 240 - * The interface for copying from another process's memory is called 241 - * access_process_vm(), with a final argument of 0 for a read, and 1 for a 242 - * write. 243 - * 244 - * We need lgread_other() to read the destination Guest's "struct lguest_dma" 245 - * array. */ 246 - static int lgread_other(struct lguest *lg, 247 - void *buf, u32 addr, unsigned bytes) 248 - { 249 - if (!lguest_address_ok(lg, addr, bytes) 250 - || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) { 251 - memset(buf, 0, bytes); 252 - kill_guest(lg, "bad address in registered DMA struct"); 253 - return 0; 254 - } 255 - return 1; 256 - } 257 - 258 - /* "lgwrite()" to another Guest: used to update the destination "used_len" once 259 - * we've transferred data into the buffer. 
*/ 260 - static int lgwrite_other(struct lguest *lg, u32 addr, 261 - const void *buf, unsigned bytes) 262 - { 263 - if (!lguest_address_ok(lg, addr, bytes) 264 - || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1) 265 - != bytes)) { 266 - kill_guest(lg, "bad address writing to registered DMA"); 267 - return 0; 268 - } 269 - return 1; 270 - } 271 - 272 - /*L:400 This is the generic engine which copies from a source "struct 273 - * lguest_dma" from this Guest into another Guest's "struct lguest_dma". The 274 - * destination Guest's pages have already been mapped, as contained in the 275 - * pages array. 276 - * 277 - * If you're wondering if there's a nice "copy from one process to another" 278 - * routine, so was I. But Linux isn't really set up to copy between two 279 - * unrelated processes, so we have to write it ourselves. 280 - */ 281 - static u32 copy_data(struct lguest *srclg, 282 - const struct lguest_dma *src, 283 - const struct lguest_dma *dst, 284 - struct page *pages[]) 285 - { 286 - unsigned int totlen, si, di, srcoff, dstoff; 287 - void *maddr = NULL; 288 - 289 - /* We return the total length transferred. */ 290 - totlen = 0; 291 - 292 - /* We keep indexes into the source and destination "struct lguest_dma", 293 - * and an offset within each region. */ 294 - si = di = 0; 295 - srcoff = dstoff = 0; 296 - 297 - /* We loop until the source or destination is exhausted. */ 298 - while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] 299 - && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { 300 - /* We can only transfer the rest of the src buffer, or as much 301 - * as will fit into the destination buffer. */ 302 - u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff); 303 - 304 - /* For systems using "highmem" we need to use kmap() to access 305 - * the page we want. We often use the same page over and over, 306 - * so rather than kmap() it on every loop, we set the maddr 307 - * pointer to NULL when we need to move to the next 308 - * destination page. */ 309 - if (!maddr) 310 - maddr = kmap(pages[di]); 311 - 312 - /* Copy directly from (this Guest's) source address to the 313 - * destination Guest's kmap()ed buffer. Note that maddr points 314 - * to the start of the page: we need to add the offset of the 315 - * destination address and offset within the buffer. */ 316 - 317 - /* FIXME: This is not completely portable. I looked at 318 - * copy_to_user_page(), and some arch's seem to need special 319 - * flushes. x86 is fine. */ 320 - if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, 321 - (void __user *)src->addr[si], len) != 0) { 322 - /* If a copy failed, it's the source's fault. */ 323 - kill_guest(srclg, "bad address in sending DMA"); 324 - totlen = 0; 325 - break; 326 - } 327 - 328 - /* Increment the total and src & dst offsets */ 329 - totlen += len; 330 - srcoff += len; 331 - dstoff += len; 332 - 333 - /* Presumably we reached the end of the src or dest buffers: */ 334 - if (srcoff == src->len[si]) { 335 - /* Move to the next buffer at offset 0 */ 336 - si++; 337 - srcoff = 0; 338 - } 339 - if (dstoff == dst->len[di]) { 340 - /* We need to unmap that destination page and reset 341 - * maddr ready for the next one. */ 342 - kunmap(pages[di]); 343 - maddr = NULL; 344 - di++; 345 - dstoff = 0; 346 - } 347 - } 348 - 349 - /* If we still had a page mapped at the end, unmap now. 
*/ 350 - if (maddr) 351 - kunmap(pages[di]); 352 - 353 - return totlen; 354 - } 355 - 356 - /*L:390 This is how we transfer a "struct lguest_dma" from the source Guest 357 - * (the current Guest which called SEND_DMA) to another Guest. */ 358 - static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, 359 - struct lguest *dstlg, const struct lguest_dma *dst) 360 - { 361 - int i; 362 - u32 ret; 363 - struct page *pages[LGUEST_MAX_DMA_SECTIONS]; 364 - 365 - /* We check that both source and destination "struct lguest_dma"s are 366 - * within the bounds of the source and destination Guests */ 367 - if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) 368 - return 0; 369 - 370 - /* We need to map the pages which correspond to each parts of 371 - * destination buffer. */ 372 - for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { 373 - if (dst->len[i] == 0) 374 - break; 375 - /* get_user_pages() is a complicated function, especially since 376 - * we only want a single page. But it works, and returns the 377 - * number of pages. Note that we're holding the destination's 378 - * mmap_sem, as get_user_pages() requires. */ 379 - if (get_user_pages(dstlg->tsk, dstlg->mm, 380 - dst->addr[i], 1, 1, 1, pages+i, NULL) 381 - != 1) { 382 - /* This means the destination gave us a bogus buffer */ 383 - kill_guest(dstlg, "Error mapping DMA pages"); 384 - ret = 0; 385 - goto drop_pages; 386 - } 387 - } 388 - 389 - /* Now copy the data until we run out of src or dst. */ 390 - ret = copy_data(srclg, src, dst, pages); 391 - 392 - drop_pages: 393 - while (--i >= 0) 394 - put_page(pages[i]); 395 - return ret; 396 - } 397 - 398 - /*L:380 Transferring data from one Guest to another is not as simple as I'd 399 - * like. We've found the "struct lguest_dma_info" bound to the same address as 400 - * the send, we need to copy into it. 401 - * 402 - * This function returns true if the destination array was empty. */ 403 - static int dma_transfer(struct lguest *srclg, 404 - unsigned long udma, 405 - struct lguest_dma_info *dst) 406 - { 407 - struct lguest_dma dst_dma, src_dma; 408 - struct lguest *dstlg; 409 - u32 i, dma = 0; 410 - 411 - /* From the "struct lguest_dma_info" we found in the hash, grab the 412 - * Guest. */ 413 - dstlg = &lguests[dst->guestid]; 414 - /* Read in the source "struct lguest_dma" handed to SEND_DMA. */ 415 - lgread(srclg, &src_dma, udma, sizeof(src_dma)); 416 - 417 - /* We need the destination's mmap_sem, and we already hold the source's 418 - * mmap_sem for the futex key lookup. Normally this would suggest that 419 - * we could deadlock if the destination Guest was trying to send to 420 - * this source Guest at the same time, which is another reason that all 421 - * I/O is done under the big lguest_lock. */ 422 - down_read(&dstlg->mm->mmap_sem); 423 - 424 - /* Look through the destination DMA array for an available buffer. */ 425 - for (i = 0; i < dst->num_dmas; i++) { 426 - /* We keep a "next_dma" pointer which often helps us avoid 427 - * looking at lots of previously-filled entries. */ 428 - dma = (dst->next_dma + i) % dst->num_dmas; 429 - if (!lgread_other(dstlg, &dst_dma, 430 - dst->dmas + dma * sizeof(struct lguest_dma), 431 - sizeof(dst_dma))) { 432 - goto fail; 433 - } 434 - if (!dst_dma.used_len) 435 - break; 436 - } 437 - 438 - /* If we found a buffer, we do the actual data copy. 
*/ 439 - if (i != dst->num_dmas) { 440 - unsigned long used_lenp; 441 - unsigned int ret; 442 - 443 - ret = do_dma(srclg, &src_dma, dstlg, &dst_dma); 444 - /* Put used length in the source "struct lguest_dma"'s used_len 445 - * field. It's a little tricky to figure out where that is, 446 - * though. */ 447 - lgwrite_u32(srclg, 448 - udma+offsetof(struct lguest_dma, used_len), ret); 449 - /* Tranferring 0 bytes is OK if the source buffer was empty. */ 450 - if (ret == 0 && src_dma.len[0] != 0) 451 - goto fail; 452 - 453 - /* The destination Guest might be running on a different CPU: 454 - * we have to make sure that it will see the "used_len" field 455 - * change to non-zero *after* it sees the data we copied into 456 - * the buffer. Hence a write memory barrier. */ 457 - wmb(); 458 - /* Figuring out where the destination's used_len field for this 459 - * "struct lguest_dma" in the array is also a little ugly. */ 460 - used_lenp = dst->dmas 461 - + dma * sizeof(struct lguest_dma) 462 - + offsetof(struct lguest_dma, used_len); 463 - lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret)); 464 - /* Move the cursor for next time. */ 465 - dst->next_dma++; 466 - } 467 - up_read(&dstlg->mm->mmap_sem); 468 - 469 - /* We trigger the destination interrupt, even if the destination was 470 - * empty and we didn't transfer anything: this gives them a chance to 471 - * wake up and refill. */ 472 - set_bit(dst->interrupt, dstlg->irqs_pending); 473 - /* Wake up the destination process. */ 474 - wake_up_process(dstlg->tsk); 475 - /* If we passed the last "struct lguest_dma", the receive had no 476 - * buffers left. */ 477 - return i == dst->num_dmas; 478 - 479 - fail: 480 - up_read(&dstlg->mm->mmap_sem); 481 - return 0; 482 - } 483 - 484 - /*L:370 This is the counter-side to the BIND_DMA hypercall; the SEND_DMA 485 - * hypercall. We find out who's listening, and send to them. */ 486 - void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) 487 - { 488 - union futex_key key; 489 - int empty = 0; 490 - struct rw_semaphore *fshared = &current->mm->mmap_sem; 491 - 492 - again: 493 - mutex_lock(&lguest_lock); 494 - down_read(fshared); 495 - /* Get the futex key for the key the Guest gave us */ 496 - if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { 497 - kill_guest(lg, "bad sending DMA key"); 498 - goto unlock; 499 - } 500 - /* Since the key must be a multiple of 4, the futex key uses the lower 501 - * bit of the "offset" field (which would always be 0) to indicate a 502 - * mapping which is shared with other processes (ie. Guests). */ 503 - if (key.shared.offset & 1) { 504 - struct lguest_dma_info *i; 505 - /* Look through the hash for other Guests. */ 506 - list_for_each_entry(i, &dma_hash[hash(&key)], list) { 507 - /* Don't send to ourselves. */ 508 - if (i->guestid == lg->guestid) 509 - continue; 510 - if (!key_eq(&key, &i->key)) 511 - continue; 512 - 513 - /* If dma_transfer() tells us the destination has no 514 - * available buffers, we increment "empty". */ 515 - empty += dma_transfer(lg, udma, i); 516 - break; 517 - } 518 - /* If the destination is empty, we release our locks and 519 - * give the destination Guest a brief chance to restock. */ 520 - if (empty == 1) { 521 - /* Give any recipients one chance to restock. */ 522 - up_read(&current->mm->mmap_sem); 523 - mutex_unlock(&lguest_lock); 524 - /* Next time, we won't try again. */ 525 - empty++; 526 - goto again; 527 - } 528 - } else { 529 - /* Private mapping: Guest is sending to its Launcher. 
We set 530 - * the "dma_is_pending" flag so that the main loop will exit 531 - * and the Launcher's read() from /dev/lguest will return. */ 532 - lg->dma_is_pending = 1; 533 - lg->pending_dma = udma; 534 - lg->pending_key = ukey; 535 - } 536 - unlock: 537 - up_read(fshared); 538 - mutex_unlock(&lguest_lock); 539 - } 540 - /*:*/ 541 - 542 - void release_all_dma(struct lguest *lg) 543 - { 544 - unsigned int i; 545 - 546 - BUG_ON(!mutex_is_locked(&lguest_lock)); 547 - 548 - down_read(&lg->mm->mmap_sem); 549 - for (i = 0; i < LGUEST_MAX_DMA; i++) { 550 - if (lg->dma[i].interrupt) 551 - unlink_dma(&lg->dma[i]); 552 - } 553 - up_read(&lg->mm->mmap_sem); 554 - } 555 - 556 - /*M:007 We only return a single DMA buffer to the Launcher, but it would be 557 - * more efficient to return a pointer to the entire array of DMA buffers, which 558 - * it can cache and choose one whenever it wants. 559 - * 560 - * Currently the Launcher uses a write to /dev/lguest, and the return value is 561 - * the address of the DMA structure with the interrupt number placed in 562 - * dma->used_len. If we wanted to return the entire array, we need to return 563 - * the address, array size and interrupt number: this seems to require an 564 - * ioctl(). :*/ 565 - 566 - /*L:320 This routine looks for a DMA buffer registered by the Guest on the 567 - * given key (using the BIND_DMA hypercall). */ 568 - unsigned long get_dma_buffer(struct lguest *lg, 569 - unsigned long ukey, unsigned long *interrupt) 570 - { 571 - unsigned long ret = 0; 572 - union futex_key key; 573 - struct lguest_dma_info *i; 574 - struct rw_semaphore *fshared = &current->mm->mmap_sem; 575 - 576 - /* Take the Big Lguest Lock to stop other Guests sending this Guest DMA 577 - * at the same time. */ 578 - mutex_lock(&lguest_lock); 579 - /* To match between Guests sharing the same underlying memory we steal 580 - * code from the futex infrastructure. This requires that we hold the 581 - * "mmap_sem" for our process (the Launcher), and pass it to the futex 582 - * code. */ 583 - down_read(fshared); 584 - 585 - /* This can fail if it's not a valid address, or if the address is not 586 - * divisible by 4 (the futex code needs that, we don't really). */ 587 - if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { 588 - kill_guest(lg, "bad registered DMA buffer"); 589 - goto unlock; 590 - } 591 - /* Search the hash table for matching entries (the Launcher can only 592 - * send to its own Guest for the moment, so the entry must be for this 593 - * Guest) */ 594 - list_for_each_entry(i, &dma_hash[hash(&key)], list) { 595 - if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { 596 - unsigned int j; 597 - /* Look through the registered DMA array for an 598 - * available buffer. */ 599 - for (j = 0; j < i->num_dmas; j++) { 600 - struct lguest_dma dma; 601 - 602 - ret = i->dmas + j * sizeof(struct lguest_dma); 603 - lgread(lg, &dma, ret, sizeof(dma)); 604 - if (dma.used_len == 0) 605 - break; 606 - } 607 - /* Store the interrupt the Guest wants when the buffer 608 - * is used. */ 609 - *interrupt = i->interrupt; 610 - break; 611 - } 612 - } 613 - unlock: 614 - up_read(fshared); 615 - mutex_unlock(&lguest_lock); 616 - return ret; 617 - } 618 - /*:*/ 619 - 620 - /*L:410 This really has completed the Launcher. Not only have we now finished 621 - * the longest chapter in our journey, but this also means we are over halfway 622 - * through! 
623 - * 624 - * Enough prevaricating around the bush: it is time for us to dive into the 625 - * core of the Host, in "make Host". 626 - */
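All of that futex-key machinery is replaced by a single, much blunter primitive: LHCALL_NOTIFY. Instead of binding arrays of buffers to keys, the Guest simply hands the Host the physical address of a page it wants looked at; the Host records it in lg->pending_notify and breaks out of the run loop so the Launcher's read() returns. A hypothetical Guest-side wrapper, assuming the hcall() helper from boot.c below:

        /* Where a driver used to lguest_send_dma(key, dma), the virtio
         * transport boils down to "Host, look at this page": */
        static void example_notify(void *buf)
        {
                hcall(LHCALL_NOTIFY, __pa(buf), 0, 0);
        }

boot.c's early_put_chars() uses exactly this to get console output working before any virtqueue exists.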
+57 -132
drivers/lguest/lg.h
··· 1 1 #ifndef _LGUEST_H 2 2 #define _LGUEST_H 3 3 4 - #include <asm/desc.h> 5 - 6 - #define GDT_ENTRY_LGUEST_CS 10 7 - #define GDT_ENTRY_LGUEST_DS 11 8 - #define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) 9 - #define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) 10 - 11 4 #ifndef __ASSEMBLY__ 12 5 #include <linux/types.h> 13 6 #include <linux/init.h> 14 7 #include <linux/stringify.h> 15 - #include <linux/binfmts.h> 16 - #include <linux/futex.h> 17 8 #include <linux/lguest.h> 18 9 #include <linux/lguest_launcher.h> 19 10 #include <linux/wait.h> 20 11 #include <linux/err.h> 21 12 #include <asm/semaphore.h> 22 - #include "irq_vectors.h" 23 13 24 - #define GUEST_PL 1 25 - 26 - struct lguest_regs 27 - { 28 - /* Manually saved part. */ 29 - unsigned long ebx, ecx, edx; 30 - unsigned long esi, edi, ebp; 31 - unsigned long gs; 32 - unsigned long eax; 33 - unsigned long fs, ds, es; 34 - unsigned long trapnum, errcode; 35 - /* Trap pushed part */ 36 - unsigned long eip; 37 - unsigned long cs; 38 - unsigned long eflags; 39 - unsigned long esp; 40 - unsigned long ss; 41 - }; 14 + #include <asm/lguest.h> 42 15 43 16 void free_pagetables(void); 44 17 int init_pagetables(struct page **switcher_page, unsigned int pages); 45 18 46 - /* Full 4G segment descriptors, suitable for CS and DS. */ 47 - #define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) 48 - #define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) 49 - 50 - struct lguest_dma_info 51 - { 52 - struct list_head list; 53 - union futex_key key; 54 - unsigned long dmas; 55 - u16 next_dma; 56 - u16 num_dmas; 57 - u16 guestid; 58 - u8 interrupt; /* 0 when not registered */ 59 - }; 60 - 61 - /*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He 62 - * reviewed the original code which used "u32" for all page table entries, and 63 - * insisted that it would be far clearer with explicit typing. I thought it 64 - * was overkill, but he was right: it is much clearer than it was before. 65 - * 66 - * We have separate types for the Guest's ptes & pgds and the shadow ptes & 67 - * pgds. There's already a Linux type for these (pte_t and pgd_t) but they 68 - * change depending on kernel config options (PAE). */ 69 - 70 - /* Each entry is identical: lower 12 bits of flags and upper 20 bits for the 71 - * "page frame number" (0 == first physical page, etc). They are different 72 - * types so the compiler will warn us if we mix them improperly. */ 73 - typedef union { 74 - struct { unsigned flags:12, pfn:20; }; 75 - struct { unsigned long val; } raw; 76 - } spgd_t; 77 - typedef union { 78 - struct { unsigned flags:12, pfn:20; }; 79 - struct { unsigned long val; } raw; 80 - } spte_t; 81 - typedef union { 82 - struct { unsigned flags:12, pfn:20; }; 83 - struct { unsigned long val; } raw; 84 - } gpgd_t; 85 - typedef union { 86 - struct { unsigned flags:12, pfn:20; }; 87 - struct { unsigned long val; } raw; 88 - } gpte_t; 89 - 90 - /* We have two convenient macros to convert a "raw" value as handed to us by 91 - * the Guest into the correct Guest PGD or PTE type. */ 92 - #define mkgpte(_val) ((gpte_t){.raw.val = _val}) 93 - #define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) 94 - /*:*/ 95 - 96 19 struct pgdir 97 20 { 98 - unsigned long cr3; 99 - spgd_t *pgdir; 100 - }; 101 - 102 - /* This is a guest-specific page (mapped ro) into the guest. */ 103 - struct lguest_ro_state 104 - { 105 - /* Host information we need to restore when we switch back. 
*/ 106 - u32 host_cr3; 107 - struct Xgt_desc_struct host_idt_desc; 108 - struct Xgt_desc_struct host_gdt_desc; 109 - u32 host_sp; 110 - 111 - /* Fields which are used when guest is running. */ 112 - struct Xgt_desc_struct guest_idt_desc; 113 - struct Xgt_desc_struct guest_gdt_desc; 114 - struct i386_hw_tss guest_tss; 115 - struct desc_struct guest_idt[IDT_ENTRIES]; 116 - struct desc_struct guest_gdt[GDT_ENTRIES]; 21 + unsigned long gpgdir; 22 + pgd_t *pgdir; 117 23 }; 118 24 119 25 /* We have two pages shared with guests, per cpu. */ ··· 47 141 struct lguest_data __user *lguest_data; 48 142 struct task_struct *tsk; 49 143 struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ 50 - u16 guestid; 51 144 u32 pfn_limit; 52 - u32 page_offset; 145 + /* This provides the offset to the base of guest-physical 146 + * memory in the Launcher. */ 147 + void __user *mem_base; 148 + unsigned long kernel_address; 53 149 u32 cr2; 54 150 int halted; 55 151 int ts; 56 152 u32 next_hcall; 57 153 u32 esp1; 58 154 u8 ss1; 155 + 156 + /* If a hypercall was asked for, this points to the arguments. */ 157 + struct hcall_args *hcall; 59 158 60 159 /* Do we need to stop what we're doing and return to userspace? */ 61 160 int break_out; ··· 78 167 struct task_struct *wake; 79 168 80 169 unsigned long noirq_start, noirq_end; 81 - int dma_is_pending; 82 - unsigned long pending_dma; /* struct lguest_dma */ 83 - unsigned long pending_key; /* address they're sending to */ 170 + unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ 84 171 85 172 unsigned int stack_pages; 86 173 u32 tsc_khz; 87 174 88 - struct lguest_dma_info dma[LGUEST_MAX_DMA]; 89 - 90 175 /* Dead? */ 91 176 const char *dead; 92 177 93 - /* The GDT entries copied into lguest_ro_state when running. */ 94 - struct desc_struct gdt[GDT_ENTRIES]; 95 - 96 - /* The IDT entries: some copied into lguest_ro_state when running. */ 97 - struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS]; 98 - struct desc_struct syscall_idt; 178 + struct lguest_arch arch; 99 179 100 180 /* Virtual clock device */ 101 181 struct hrtimer hrt; ··· 95 193 DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); 96 194 }; 97 195 98 - extern struct lguest lguests[]; 99 196 extern struct mutex lguest_lock; 100 197 101 198 /* core.c: */ 102 - u32 lgread_u32(struct lguest *lg, unsigned long addr); 103 - void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val); 104 - void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len); 105 - void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len); 106 - int find_free_guest(void); 107 199 int lguest_address_ok(const struct lguest *lg, 108 200 unsigned long addr, unsigned long len); 201 + void __lgread(struct lguest *, void *, unsigned long, unsigned); 202 + void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); 203 + 204 + /*L:306 Using memory-copy operations like that is usually inconvenient, so we 205 + * have the following helper macros which read and write a specific type (often 206 + * an unsigned long). 207 + * 208 + * This reads into a variable of the given type then returns that. */ 209 + #define lgread(lg, addr, type) \ 210 + ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; }) 211 + 212 + /* This checks that the variable is of the given type, then writes it out. 
*/ 213 + #define lgwrite(lg, addr, type, val) \ 214 + do { \ 215 + typecheck(type, val); \ 216 + __lgwrite((lg), (addr), &(val), sizeof(val)); \ 217 + } while(0) 218 + /* (end of memory access helper routines) :*/ 219 + 109 220 int run_guest(struct lguest *lg, unsigned long __user *user); 110 221 222 + /* Helper macros to obtain the first 12 or the last 20 bits, this is only the 223 + * first step in the migration to the kernel types. pte_pfn is already defined 224 + * in the kernel. */ 225 + #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) 226 + #define pte_flags(x) (pte_val(x) & ~PAGE_MASK) 227 + #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) 111 228 112 229 /* interrupts_and_traps.c: */ 113 230 void maybe_do_interrupt(struct lguest *lg); ··· 140 219 const unsigned long *def); 141 220 void guest_set_clockevent(struct lguest *lg, unsigned long delta); 142 221 void init_clockdev(struct lguest *lg); 222 + bool check_syscall_vector(struct lguest *lg); 223 + int init_interrupts(void); 224 + void free_interrupts(void); 143 225 144 226 /* segments.c: */ 145 227 void setup_default_gdt_entries(struct lguest_ro_state *state); ··· 156 232 int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); 157 233 void free_guest_pagetable(struct lguest *lg); 158 234 void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); 159 - void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i); 235 + void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); 160 236 void guest_pagetable_clear_all(struct lguest *lg); 161 237 void guest_pagetable_flush_user(struct lguest *lg); 162 - void guest_set_pte(struct lguest *lg, unsigned long cr3, 163 - unsigned long vaddr, gpte_t val); 238 + void guest_set_pte(struct lguest *lg, unsigned long gpgdir, 239 + unsigned long vaddr, pte_t val); 164 240 void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); 165 241 int demand_page(struct lguest *info, unsigned long cr2, int errcode); 166 242 void pin_page(struct lguest *lg, unsigned long vaddr); 243 + unsigned long guest_pa(struct lguest *lg, unsigned long vaddr); 244 + void page_table_guest_data_init(struct lguest *lg); 245 + 246 + /* <arch>/core.c: */ 247 + void lguest_arch_host_init(void); 248 + void lguest_arch_host_fini(void); 249 + void lguest_arch_run_guest(struct lguest *lg); 250 + void lguest_arch_handle_trap(struct lguest *lg); 251 + int lguest_arch_init_hypercalls(struct lguest *lg); 252 + int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args); 253 + void lguest_arch_setup_regs(struct lguest *lg, unsigned long start); 254 + 255 + /* <arch>/switcher.S: */ 256 + extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; 167 257 168 258 /* lguest_user.c: */ 169 259 int lguest_device_init(void); 170 260 void lguest_device_remove(void); 171 - 172 - /* io.c: */ 173 - void lguest_io_init(void); 174 - int bind_dma(struct lguest *lg, 175 - unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt); 176 - void send_dma(struct lguest *info, unsigned long key, unsigned long udma); 177 - void release_all_dma(struct lguest *lg); 178 - unsigned long get_dma_buffer(struct lguest *lg, unsigned long key, 179 - unsigned long *interrupt); 180 261 181 262 /* hypercalls.c: */ 182 263 void do_hypercalls(struct lguest *lg); ··· 221 292 } while(0) 222 293 /* (End of aside) :*/ 223 294 224 - static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) 225 - { 226 - return vaddr - lg->page_offset; 227 - } 228 295 #endif /* __ASSEMBLY__ */ 229 
296 #endif /* _LGUEST_H */
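The statement expression in lgread() lets a macro behave like a value-returning function, and the typecheck() in lgwrite() turns a mismatched argument type into a compile-time warning. Typical uses (the second line is push_guest_stack() from interrupts_and_traps.c; the first is an illustrative read, with lg and addr assumed in scope):

        u32 val = lgread(lg, addr, u32);        /* read one u32 from Guest memory */
        lgwrite(lg, *gstack, u32, val);         /* typecheck, then write it out */

Both expand to a __lgread()/__lgwrite() of sizeof(u32) bytes, so the address-validity check lives in one place rather than in every caller.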
+32 -70
drivers/lguest/lguest.c arch/x86/lguest/boot.c
··· 55 55 #include <linux/clockchips.h> 56 56 #include <linux/lguest.h> 57 57 #include <linux/lguest_launcher.h> 58 - #include <linux/lguest_bus.h> 58 + #include <linux/virtio_console.h> 59 59 #include <asm/paravirt.h> 60 60 #include <asm/param.h> 61 61 #include <asm/page.h> ··· 65 65 #include <asm/e820.h> 66 66 #include <asm/mce.h> 67 67 #include <asm/io.h> 68 + #include <asm/i387.h> 68 69 69 70 /*G:010 Welcome to the Guest! 70 71 * ··· 86 85 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, 87 86 .noirq_start = (u32)lguest_noirq_start, 88 87 .noirq_end = (u32)lguest_noirq_end, 88 + .kernel_address = PAGE_OFFSET, 89 89 .blocked_interrupts = { 1 }, /* Block timer interrupts */ 90 + .syscall_vec = SYSCALL_VECTOR, 90 91 }; 91 - struct lguest_device_desc *lguest_devices; 92 92 static cycle_t clock_base; 93 93 94 94 /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first ··· 148 146 /* Table full, so do normal hcall which will flush table. */ 149 147 hcall(call, arg1, arg2, arg3); 150 148 } else { 151 - lguest_data.hcalls[next_call].eax = call; 152 - lguest_data.hcalls[next_call].edx = arg1; 153 - lguest_data.hcalls[next_call].ebx = arg2; 154 - lguest_data.hcalls[next_call].ecx = arg3; 149 + lguest_data.hcalls[next_call].arg0 = call; 150 + lguest_data.hcalls[next_call].arg1 = arg1; 151 + lguest_data.hcalls[next_call].arg2 = arg2; 152 + lguest_data.hcalls[next_call].arg3 = arg3; 155 153 /* Arguments must all be written before we mark it to go */ 156 154 wmb(); 157 155 lguest_data.hcall_status[next_call] = 0; ··· 161 159 local_irq_restore(flags); 162 160 } 163 161 /*:*/ 164 - 165 - /* Wrappers for the SEND_DMA and BIND_DMA hypercalls. This is mainly because 166 - * Jeff Garzik complained that __pa() should never appear in drivers, and this 167 - * helps remove most of them. But also, it wraps some ugliness. */ 168 - void lguest_send_dma(unsigned long key, struct lguest_dma *dma) 169 - { 170 - /* The hcall might not write this if something goes wrong */ 171 - dma->used_len = 0; 172 - hcall(LHCALL_SEND_DMA, key, __pa(dma), 0); 173 - } 174 - 175 - int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, 176 - unsigned int num, u8 irq) 177 - { 178 - /* This is the only hypercall which actually wants 5 arguments, and we 179 - * only support 4. Fortunately the interrupt number is always less 180 - * than 256, so we can pack it with the number of dmas in the final 181 - * argument. */ 182 - if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq)) 183 - return -ENOMEM; 184 - return 0; 185 - } 186 - 187 - /* Unbinding is the same hypercall as binding, but with 0 num & irq. */ 188 - void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas) 189 - { 190 - hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0); 191 - } 192 - 193 - /* For guests, device memory can be used as normal memory, so we cast away the 194 - * __iomem to quieten sparse. 
*/ 195 - void *lguest_map(unsigned long phys_addr, unsigned long pages) 196 - { 197 - return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages); 198 - } 199 - 200 - void lguest_unmap(void *addr) 201 - { 202 - iounmap((__force void __iomem *)addr); 203 - } 204 162 205 163 /*G:033 206 164 * Here are our first native-instruction replacements: four functions for ··· 642 680 .mask = CLOCKSOURCE_MASK(64), 643 681 .mult = 1 << 22, 644 682 .shift = 22, 683 + .flags = CLOCK_SOURCE_IS_CONTINUOUS, 645 684 }; 646 685 647 686 /* The "scheduler clock" is just our real clock, adjusted to start at zero */ ··· 724 761 * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either 725 762 * way, the "rating" is initialized so high that it's always chosen 726 763 * over any other clocksource. */ 727 - if (lguest_data.tsc_khz) { 764 + if (lguest_data.tsc_khz) 728 765 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, 729 766 lguest_clock.shift); 730 - lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS; 731 - } 732 767 clock_base = lguest_clock_read(); 733 768 clocksource_register(&lguest_clock); 734 769 ··· 850 889 return "LGUEST"; 851 890 } 852 891 892 + /* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to 893 + * produce console output. */ 894 + static __init int early_put_chars(u32 vtermno, const char *buf, int count) 895 + { 896 + char scratch[17]; 897 + unsigned int len = count; 898 + 899 + if (len > sizeof(scratch) - 1) 900 + len = sizeof(scratch) - 1; 901 + scratch[len] = '\0'; 902 + memcpy(scratch, buf, len); 903 + hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0); 904 + 905 + /* This routine returns the number of bytes actually written. */ 906 + return len; 907 + } 908 + 853 909 /*G:050 854 910 * Patching (Powerfully Placating Performance Pedants) 855 911 * ··· 928 950 /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops 929 951 * structures in the kernel provide points for (almost) every routine we have 930 952 * to override to avoid privileged instructions. */ 931 - __init void lguest_init(void *boot) 953 + __init void lguest_init(void) 932 954 { 933 - /* Copy boot parameters first: the Launcher put the physical location 934 - * in %esi, and head.S converted that to a virtual address and handed 935 - * it to us. We use "__memcpy" because "memcpy" sometimes tries to do 936 - * tricky things to go faster, and we're not ready for that. */ 937 - __memcpy(&boot_params, boot, PARAM_SIZE); 938 - /* The boot parameters also tell us where the command-line is: save 939 - * that, too. */ 940 - __memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr), 941 - COMMAND_LINE_SIZE); 942 - 943 955 /* We're under lguest, paravirt is enabled, and we're running at 944 956 * privilege level 1, not 0 as normal. */ 945 957 pv_info.name = "lguest"; ··· 1001 1033 1002 1034 /*G:070 Now we've seen all the paravirt_ops, we return to 1003 1035 * lguest_init() where the rest of the fairly chaotic boot setup 1004 - * occurs. 1005 - * 1006 - * The Host expects our first hypercall to tell it where our "struct 1007 - * lguest_data" is, so we do that first. */ 1008 - hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); 1036 + * occurs. */ 1009 1037 1010 1038 /* The native boot code sets up initial page tables immediately after 1011 1039 * the kernel itself, and sets init_pg_tables_end so they're not ··· 1013 1049 /* Load the %fs segment register (the per-cpu segment register) with 1014 1050 * the normal data segment to get through booting. 
*/ 1015 1051 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1016 - 1017 - /* Clear the part of the kernel data which is expected to be zero. 1018 - * Normally it will be anyway, but if we're loading from a bzImage with 1019 - * CONFIG_RELOCATALE=y, the relocations will be sitting here. */ 1020 - memset(__bss_start, 0, __bss_stop - __bss_start); 1021 1052 1022 1053 /* The Host uses the top of the Guest's virtual address space for the 1023 1054 * Host<->Guest Switcher, and it tells us how much it needs in ··· 1050 1091 * virtual console" driver written by the PowerPC people, which we also 1051 1092 * adapted for lguest's use. */ 1052 1093 add_preferred_console("hvc", 0, NULL); 1094 + 1095 + /* Register our very early console. */ 1096 + virtio_cons_early_init(early_put_chars); 1053 1097 1054 1098 /* Last of all, we set the power management poweroff hook to point to 1055 1099 * the Guest routine to power off. */
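The status-byte handshake that async_hcall() relies on is easy to model outside the kernel. Here is a minimal userspace sketch of the consuming (Host) side, assuming LHCALL_RING_SIZE is 64 and using a hypothetical process_hcall() dispatcher; the real Host also consumes entries strictly in order, which this simple scan glosses over. 0xFF marks a free slot, 0 marks one the Guest has filled, exactly as the diff above assumes.

#include <stdint.h>
#include <stdio.h>

#define LHCALL_RING_SIZE 64	/* assumed value, for illustration */

struct hcall_args {
	unsigned long arg0, arg1, arg2, arg3;	/* arg0 is the call number */
};

static struct hcall_args hcalls[LHCALL_RING_SIZE];
static uint8_t hcall_status[LHCALL_RING_SIZE] = {
	[0 ... LHCALL_RING_SIZE-1] = 0xFF	/* 0xFF == slot free */
};

/* Hypothetical stand-in for the Host's hypercall dispatcher. */
static void process_hcall(const struct hcall_args *args)
{
	printf("hypercall %lu(%lu, %lu, %lu)\n",
	       args->arg0, args->arg1, args->arg2, args->arg3);
}

/* Walk the ring, run every entry the Guest marked ready (status 0), and
 * mark each slot free again so the Guest can reuse it. */
static void drain_hcall_ring(void)
{
	unsigned int i;

	for (i = 0; i < LHCALL_RING_SIZE; i++) {
		if (hcall_status[i] != 0)
			continue;	/* slot not ready */
		process_hcall(&hcalls[i]);
		hcall_status[i] = 0xFF;	/* slot free again */
	}
}

int main(void)
{
	/* Queue one fake call the way async_hcall() would, then drain. */
	hcalls[0] = (struct hcall_args){ 1, 2, 3, 4 };
	hcall_status[0] = 0;
	drain_hcall_ring();
	return 0;
}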
+34 -12
drivers/lguest/lguest_asm.S arch/x86/lguest/i386_head.S
··· 1 1 #include <linux/linkage.h>
2 2 #include <linux/lguest.h>
3 + #include <asm/lguest_hcall.h>
3 4 #include <asm/asm-offsets.h>
4 5 #include <asm/thread_info.h>
5 6 #include <asm/processor-flags.h>
6 7 
7 - /*G:020 This is where we begin: we have a magic signature which the launcher
8 - * looks for. The plan is that the Linux boot protocol will be extended with a
9 - * "platform type" field which will guide us here from the normal entry point,
10 - * but for the moment this suffices. The normal boot code uses %esi for the
11 - * boot header, so we do too. We convert it to a virtual address by adding
12 - * PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax).
8 + /*G:020 This is where we begin: head.S notes that the boot header's platform
9 + * type field is "1" (lguest), so calls us here. The boot header is in %esi.
10 + *
11 + * WARNING: be very careful here! We're running at addresses equal to physical
12 + * addresses (around 0), not above PAGE_OFFSET as most code expects
13 + * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
14 + * data.
13 15 *
14 16 * The .section line puts this code in .init.text so it will be discarded after
15 17 * boot. */
16 18 .section .init.text, "ax", @progbits
17 - .ascii "GenuineLguest"
18 - /* Set up initial stack. */
19 - movl $(init_thread_union+THREAD_SIZE),%esp
20 - movl %esi, %eax
21 - addl $__PAGE_OFFSET, %eax
22 - jmp lguest_init
19 + ENTRY(lguest_entry)
20 + /* Make initial hypercall now, so we can set up the pagetables. */
21 + movl $LHCALL_LGUEST_INIT, %eax
22 + movl $lguest_data - __PAGE_OFFSET, %edx
23 + int $LGUEST_TRAP_ENTRY
24 + 
25 + /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
26 + * instruction uses %esi implicitly. */
27 + movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
28 + 
29 + /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
30 + * This means the first 128M of kernel memory will be mapped at
31 + * PAGE_OFFSET where the kernel expects to run. This will get it far
32 + * enough through boot to switch to its own pagetables. */
33 + movl $32, %ecx
34 + movl %esi, %edi
35 + addl $((__PAGE_OFFSET >> 22) * 4), %edi
36 + rep
37 + movsl
38 + 
39 + /* Set up the initial stack so we can run C code. */
40 + movl $(init_thread_union+THREAD_SIZE),%esp
41 + 
42 + /* Jumps are relative, and we're running __PAGE_OFFSET too low at the
43 + * moment. */
44 + jmp lguest_init+__PAGE_OFFSET
23 45 
24 46 /*G:055 We create a macro which puts the assembler code between lgstart_ and
25 47 * lgend_ markers. These templates are put in the .text section: they can't be
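The arithmetic in that addl line is worth checking. A throwaway sketch, assuming the usual 32-bit constants (__PAGE_OFFSET of 0xC0000000, 1024 four-byte PGD entries covering 4MB each):

#include <stdio.h>

#define __PAGE_OFFSET 0xC0000000UL

int main(void)
{
	/* Each PGD entry maps 4MB (22 bits), so PAGE_OFFSET's entry is: */
	unsigned long index = __PAGE_OFFSET >> 22;	/* entry 768 */
	/* Entries are 4 bytes each, hence "(__PAGE_OFFSET >> 22) * 4": */
	unsigned long offset = index * 4;		/* 3072 bytes in */
	/* Copying 32 such entries maps 32 * 4MB = 128MB, as the comment
	 * above promises. */
	unsigned long mapped_mb = (32UL << 22) >> 20;

	printf("PGD entry %lu, byte offset %lu, %luMB mapped\n",
	       index, offset, mapped_mb);
	return 0;
}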
-218
drivers/lguest/lguest_bus.c
··· 1 - /*P:050 Lguest guests use a very simple bus for devices. It's a simple array 2 - * of device descriptors contained just above the top of normal memory. The 3 - * lguest bus is 80% tedious boilerplate code. :*/ 4 - #include <linux/init.h> 5 - #include <linux/bootmem.h> 6 - #include <linux/lguest_bus.h> 7 - #include <asm/io.h> 8 - #include <asm/paravirt.h> 9 - 10 - static ssize_t type_show(struct device *_dev, 11 - struct device_attribute *attr, char *buf) 12 - { 13 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 14 - return sprintf(buf, "%hu", lguest_devices[dev->index].type); 15 - } 16 - static ssize_t features_show(struct device *_dev, 17 - struct device_attribute *attr, char *buf) 18 - { 19 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 20 - return sprintf(buf, "%hx", lguest_devices[dev->index].features); 21 - } 22 - static ssize_t pfn_show(struct device *_dev, 23 - struct device_attribute *attr, char *buf) 24 - { 25 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 26 - return sprintf(buf, "%u", lguest_devices[dev->index].pfn); 27 - } 28 - static ssize_t status_show(struct device *_dev, 29 - struct device_attribute *attr, char *buf) 30 - { 31 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 32 - return sprintf(buf, "%hx", lguest_devices[dev->index].status); 33 - } 34 - static ssize_t status_store(struct device *_dev, struct device_attribute *attr, 35 - const char *buf, size_t count) 36 - { 37 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 38 - if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1) 39 - return -EINVAL; 40 - return count; 41 - } 42 - static struct device_attribute lguest_dev_attrs[] = { 43 - __ATTR_RO(type), 44 - __ATTR_RO(features), 45 - __ATTR_RO(pfn), 46 - __ATTR(status, 0644, status_show, status_store), 47 - __ATTR_NULL 48 - }; 49 - 50 - /*D:130 The generic bus infrastructure requires a function which says whether a 51 - * device matches a driver. For us, it is simple: "struct lguest_driver" 52 - * contains a "device_type" field which indicates what type of device it can 53 - * handle, so we just cast the args and compare: */ 54 - static int lguest_dev_match(struct device *_dev, struct device_driver *_drv) 55 - { 56 - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 57 - struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv); 58 - 59 - return (drv->device_type == lguest_devices[dev->index].type); 60 - } 61 - /*:*/ 62 - 63 - struct lguest_bus { 64 - struct bus_type bus; 65 - struct device dev; 66 - }; 67 - 68 - static struct lguest_bus lguest_bus = { 69 - .bus = { 70 - .name = "lguest", 71 - .match = lguest_dev_match, 72 - .dev_attrs = lguest_dev_attrs, 73 - }, 74 - .dev = { 75 - .parent = NULL, 76 - .bus_id = "lguest", 77 - } 78 - }; 79 - 80 - /*D:140 This is the callback which occurs once the bus infrastructure matches 81 - * up a device and driver, ie. in response to add_lguest_device() calling 82 - * device_register(), or register_lguest_driver() calling driver_register(). 83 - * 84 - * At the moment it's always the latter: the devices are added first, since 85 - * scan_devices() is called from a "core_initcall", and the drivers themselves 86 - * called later as a normal "initcall". But it would work the other way too. 87 - * 88 - * So now we have the happy couple, we add the status bit to indicate that we 89 - * found a driver. 
If the driver truly loves the device, it will return 90 - * happiness from its probe function (ok, perhaps this wasn't my greatest 91 - * analogy), and we set the final "driver ok" bit so the Host sees it's all 92 - * green. */ 93 - static int lguest_dev_probe(struct device *_dev) 94 - { 95 - int ret; 96 - struct lguest_device*dev = container_of(_dev,struct lguest_device,dev); 97 - struct lguest_driver*drv = container_of(dev->dev.driver, 98 - struct lguest_driver, drv); 99 - 100 - lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER; 101 - ret = drv->probe(dev); 102 - if (ret == 0) 103 - lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK; 104 - return ret; 105 - } 106 - 107 - /* The last part of the bus infrastructure is the function lguest drivers use 108 - * to register themselves. Firstly, we do nothing if there's no lguest bus 109 - * (ie. this is not a Guest), otherwise we fill in the embedded generic "struct 110 - * driver" fields and call the generic driver_register(). */ 111 - int register_lguest_driver(struct lguest_driver *drv) 112 - { 113 - if (!lguest_devices) 114 - return 0; 115 - 116 - drv->drv.bus = &lguest_bus.bus; 117 - drv->drv.name = drv->name; 118 - drv->drv.owner = drv->owner; 119 - drv->drv.probe = lguest_dev_probe; 120 - 121 - return driver_register(&drv->drv); 122 - } 123 - 124 - /* At the moment we build all the drivers into the kernel because they're so 125 - * simple: 8144 bytes for all three of them as I type this. And as the console 126 - * really needs to be built in, it's actually only 3527 bytes for the network 127 - * and block drivers. 128 - * 129 - * If they get complex it will make sense for them to be modularized, so we 130 - * need to explicitly export the symbol. 131 - * 132 - * I don't think non-GPL modules make sense, so it's a GPL-only export. 133 - */ 134 - EXPORT_SYMBOL_GPL(register_lguest_driver); 135 - 136 - /*D:120 This is the core of the lguest bus: actually adding a new device. 137 - * It's a separate function because it's neater that way, and because an 138 - * earlier version of the code supported hotplug and unplug. They were removed 139 - * early on because they were never used. 140 - * 141 - * As Andrew Tridgell says, "Untested code is buggy code". 142 - * 143 - * It's worth reading this carefully: we start with an index into the array of 144 - * "struct lguest_device_desc"s indicating the device which is new: */ 145 - static void add_lguest_device(unsigned int index) 146 - { 147 - struct lguest_device *new; 148 - 149 - /* Each "struct lguest_device_desc" has a "status" field, which the 150 - * Guest updates as the device is probed. In the worst case, the Host 151 - * can look at these bits to tell what part of device setup failed, 152 - * even if the console isn't available. */ 153 - lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE; 154 - new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL); 155 - if (!new) { 156 - printk(KERN_EMERG "Cannot allocate lguest device %u\n", index); 157 - lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; 158 - return; 159 - } 160 - 161 - /* The "struct lguest_device" setup is pretty straight-forward example 162 - * code. */ 163 - new->index = index; 164 - new->private = NULL; 165 - memset(&new->dev, 0, sizeof(new->dev)); 166 - new->dev.parent = &lguest_bus.dev; 167 - new->dev.bus = &lguest_bus.bus; 168 - sprintf(new->dev.bus_id, "%u", index); 169 - 170 - /* device_register() causes the bus infrastructure to look for a 171 - * matching driver. 
*/ 172 - if (device_register(&new->dev) != 0) { 173 - printk(KERN_EMERG "Cannot register lguest device %u\n", index); 174 - lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; 175 - kfree(new); 176 - } 177 - } 178 - 179 - /*D:110 scan_devices() simply iterates through the device array. The type 0 180 - * is reserved to mean "no device", and anything else means we have found a 181 - * device: add it. */ 182 - static void scan_devices(void) 183 - { 184 - unsigned int i; 185 - 186 - for (i = 0; i < LGUEST_MAX_DEVICES; i++) 187 - if (lguest_devices[i].type) 188 - add_lguest_device(i); 189 - } 190 - 191 - /*D:100 Fairly early in boot, lguest_bus_init() is called to set up the lguest 192 - * bus. We check that we are a Guest by checking paravirt_ops.name: there are 193 - * other ways of checking, but this seems most obvious to me. 194 - * 195 - * So we can access the array of "struct lguest_device_desc"s easily, we map 196 - * that memory and store the pointer in the global "lguest_devices". Then we 197 - * register the bus with the core. Doing two registrations seems clunky to me, 198 - * but it seems to be the correct sysfs incantation. 199 - * 200 - * Finally we call scan_devices() which adds all the devices found in the 201 - * "struct lguest_device_desc" array. */ 202 - static int __init lguest_bus_init(void) 203 - { 204 - if (strcmp(pv_info.name, "lguest") != 0) 205 - return 0; 206 - 207 - /* Devices are in a single page above top of "normal" mem */ 208 - lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); 209 - 210 - if (bus_register(&lguest_bus.bus) != 0 211 - || device_register(&lguest_bus.dev) != 0) 212 - panic("lguest bus registration failed"); 213 - 214 - scan_devices(); 215 - return 0; 216 - } 217 - /* Do this after core stuff, before devices. */ 218 - postcore_initcall(lguest_bus_init);
+373
drivers/lguest/lguest_device.c
··· 1 + /*P:050 Lguest guests use a very simple method to describe devices. It's a 2 + * series of device descriptors contained just above the top of normal 3 + * memory. 4 + * 5 + * We use the standard "virtio" device infrastructure, which provides us with a 6 + * console, a network and a block driver. Each one expects some configuration 7 + * information and a "virtqueue" mechanism to send and receive data. :*/ 8 + #include <linux/init.h> 9 + #include <linux/bootmem.h> 10 + #include <linux/lguest_launcher.h> 11 + #include <linux/virtio.h> 12 + #include <linux/virtio_config.h> 13 + #include <linux/interrupt.h> 14 + #include <linux/virtio_ring.h> 15 + #include <linux/err.h> 16 + #include <asm/io.h> 17 + #include <asm/paravirt.h> 18 + #include <asm/lguest_hcall.h> 19 + 20 + /* The pointer to our (page) of device descriptions. */ 21 + static void *lguest_devices; 22 + 23 + /* Unique numbering for lguest devices. */ 24 + static unsigned int dev_index; 25 + 26 + /* For Guests, device memory can be used as normal memory, so we cast away the 27 + * __iomem to quieten sparse. */ 28 + static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) 29 + { 30 + return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages); 31 + } 32 + 33 + static inline void lguest_unmap(void *addr) 34 + { 35 + iounmap((__force void __iomem *)addr); 36 + } 37 + 38 + /*D:100 Each lguest device is just a virtio device plus a pointer to its entry 39 + * in the lguest_devices page. */ 40 + struct lguest_device { 41 + struct virtio_device vdev; 42 + 43 + /* The entry in the lguest_devices page for this device. */ 44 + struct lguest_device_desc *desc; 45 + }; 46 + 47 + /* Since the virtio infrastructure hands us a pointer to the virtio_device all 48 + * the time, it helps to have a curt macro to get a pointer to the struct 49 + * lguest_device it's enclosed in. */ 50 + #define to_lgdev(vdev) container_of(vdev, struct lguest_device, vdev) 51 + 52 + /*D:130 53 + * Device configurations 54 + * 55 + * The configuration information for a device consists of a series of fields. 56 + * The device will look for these fields during setup. 57 + * 58 + * For us these fields come immediately after that device's descriptor in the 59 + * lguest_devices page. 60 + * 61 + * Each field starts with a "type" byte, a "length" byte, then that number of 62 + * bytes of configuration information. The device descriptor tells us the 63 + * total configuration length so we know when we've reached the last field. */ 64 + 65 + /* type + length bytes */ 66 + #define FHDR_LEN 2 67 + 68 + /* This finds the first field of a given type for a device's configuration. */ 69 + static void *lg_find(struct virtio_device *vdev, u8 type, unsigned int *len) 70 + { 71 + struct lguest_device_desc *desc = to_lgdev(vdev)->desc; 72 + int i; 73 + 74 + for (i = 0; i < desc->config_len; i += FHDR_LEN + desc->config[i+1]) { 75 + if (desc->config[i] == type) { 76 + /* Mark it used, so Host can know we looked at it, and 77 + * also so we won't find the same one twice. */ 78 + desc->config[i] |= 0x80; 79 + /* Remember, the second byte is the length. */ 80 + *len = desc->config[i+1]; 81 + /* We return a pointer to the field header. */ 82 + return desc->config + i; 83 + } 84 + } 85 + 86 + /* Not found: return NULL for failure. */ 87 + return NULL; 88 + } 89 + 90 + /* Once they've found a field, getting a copy of it is easy. 
*/ 91 + static void lg_get(struct virtio_device *vdev, void *token, 92 + void *buf, unsigned len) 93 + { 94 + /* Check they didn't ask for more than the length of the field! */ 95 + BUG_ON(len > ((u8 *)token)[1]); 96 + memcpy(buf, token + FHDR_LEN, len); 97 + } 98 + 99 + /* Setting the contents is also trivial. */ 100 + static void lg_set(struct virtio_device *vdev, void *token, 101 + const void *buf, unsigned len) 102 + { 103 + BUG_ON(len > ((u8 *)token)[1]); 104 + memcpy(token + FHDR_LEN, buf, len); 105 + } 106 + 107 + /* The operations to get and set the status word just access the status field 108 + * of the device descriptor. */ 109 + static u8 lg_get_status(struct virtio_device *vdev) 110 + { 111 + return to_lgdev(vdev)->desc->status; 112 + } 113 + 114 + static void lg_set_status(struct virtio_device *vdev, u8 status) 115 + { 116 + to_lgdev(vdev)->desc->status = status; 117 + } 118 + 119 + /* 120 + * Virtqueues 121 + * 122 + * The other piece of infrastructure virtio needs is a "virtqueue": a way of 123 + * the Guest device registering buffers for the other side to read from or 124 + * write into (ie. send and receive buffers). Each device can have multiple 125 + * virtqueues: for example the console has one queue for sending and one for 126 + * receiving. 127 + * 128 + * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue 129 + * already exists in virtio_ring.c. We just need to connect it up. 130 + * 131 + * We start with the information we need to keep about each virtqueue. 132 + */ 133 + 134 + /*D:140 This is the information we remember about each virtqueue. */ 135 + struct lguest_vq_info 136 + { 137 + /* A copy of the information contained in the device config. */ 138 + struct lguest_vqconfig config; 139 + 140 + /* The address where we mapped the virtio ring, so we can unmap it. */ 141 + void *pages; 142 + }; 143 + 144 + /* When the virtio_ring code wants to prod the Host, it calls us here and we 145 + * make a hypercall. We hand the page number of the virtqueue so the Host 146 + * knows which virtqueue we're talking about. */ 147 + static void lg_notify(struct virtqueue *vq) 148 + { 149 + /* We store our virtqueue information in the "priv" pointer of the 150 + * virtqueue structure. */ 151 + struct lguest_vq_info *lvq = vq->priv; 152 + 153 + hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0); 154 + } 155 + 156 + /* This routine finds the first virtqueue described in the configuration of 157 + * this device and sets it up. 158 + * 159 + * This is kind of an ugly duckling. It'd be nicer to have a standard 160 + * representation of a virtqueue in the configuration space, but it seems that 161 + * everyone wants to do it differently. The KVM guys want the Guest to 162 + * allocate its own pages and tell the Host where they are, but for lguest it's 163 + * simpler for the Host to simply tell us where the pages are. 164 + * 165 + * So we provide devices with a "find virtqueue and set it up" function. */ 166 + static struct virtqueue *lg_find_vq(struct virtio_device *vdev, 167 + bool (*callback)(struct virtqueue *vq)) 168 + { 169 + struct lguest_vq_info *lvq; 170 + struct virtqueue *vq; 171 + unsigned int len; 172 + void *token; 173 + int err; 174 + 175 + /* Look for a field of the correct type to mark a virtqueue. Note that 176 + * if this succeeds, then the type will be changed so it won't be found 177 + * again, and future lg_find_vq() calls will find the next 178 + * virtqueue (if any). 
*/
179 + token = vdev->config->find(vdev, VIRTIO_CONFIG_F_VIRTQUEUE, &len);
180 + if (!token)
181 + return ERR_PTR(-ENOENT);
182 + 
183 + lvq = kmalloc(sizeof(*lvq), GFP_KERNEL);
184 + if (!lvq)
185 + return ERR_PTR(-ENOMEM);
186 + 
187 + /* Note: we could use a configuration space inside here, just like we
188 + * do for the device. This would allow expansion in future, because
189 + * our configuration system is designed to be expansible. But this is
190 + * way easier. */
191 + if (len != sizeof(lvq->config)) {
192 + dev_err(&vdev->dev, "Unexpected virtio config len %u\n", len);
193 + err = -EIO;
194 + goto free_lvq;
195 + }
196 + /* Make a copy of the "struct lguest_vqconfig" field. We need a copy
197 + * because the config space might not be aligned correctly. */
198 + vdev->config->get(vdev, token, &lvq->config, sizeof(lvq->config));
199 + 
200 + /* Figure out how many pages the ring will take, and map that memory */
201 + lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
202 + DIV_ROUND_UP(vring_size(lvq->config.num),
203 + PAGE_SIZE));
204 + if (!lvq->pages) {
205 + err = -ENOMEM;
206 + goto free_lvq;
207 + }
208 + 
209 + /* OK, tell virtio_ring.c to set up a virtqueue now we know its size
210 + * and we've got a pointer to its pages. */
211 + vq = vring_new_virtqueue(lvq->config.num, vdev, lvq->pages,
212 + lg_notify, callback);
213 + if (!vq) {
214 + err = -ENOMEM;
215 + goto unmap;
216 + }
217 + 
218 + /* Tell the interrupt for this virtqueue to go to the virtio_ring
219 + * interrupt handler. */
220 + /* FIXME: We used to have a flag for the Host to tell us we could use
221 + * the interrupt as a source of randomness: it'd be nice to have that
222 + * back. */
223 + err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
224 + vdev->dev.bus_id, vq);
225 + if (err)
226 + goto destroy_vring;
227 + 
228 + /* Last of all we hook up our "struct lguest_vq_info" to the
229 + * virtqueue's priv pointer. */
230 + vq->priv = lvq;
231 + return vq;
232 + 
233 + destroy_vring:
234 + vring_del_virtqueue(vq);
235 + unmap:
236 + lguest_unmap(lvq->pages);
237 + free_lvq:
238 + kfree(lvq);
239 + return ERR_PTR(err);
240 + }
241 + /*:*/
242 + 
243 + /* Cleaning up a virtqueue is easy */
244 + static void lg_del_vq(struct virtqueue *vq)
245 + {
246 + struct lguest_vq_info *lvq = vq->priv;
247 + 
248 + /* Tell virtio_ring.c to free the virtqueue. */
249 + vring_del_virtqueue(vq);
250 + /* Unmap the pages containing the ring. */
251 + lguest_unmap(lvq->pages);
252 + /* Free our own queue information. */
253 + kfree(lvq);
254 + }
255 + 
256 + /* The ops structure which hooks everything together. */
257 + static struct virtio_config_ops lguest_config_ops = {
258 + .find = lg_find,
259 + .get = lg_get,
260 + .set = lg_set,
261 + .get_status = lg_get_status,
262 + .set_status = lg_set_status,
263 + .find_vq = lg_find_vq,
264 + .del_vq = lg_del_vq,
265 + };
266 + 
267 + /* The root device for the lguest virtio devices. This makes them appear as
268 + * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */
269 + static struct device lguest_root = {
270 + .parent = NULL,
271 + .bus_id = "lguest",
272 + };
273 + 
274 + /*D:120 This is the core of the lguest bus: actually adding a new device.
275 + * It's a separate function because it's neater that way, and because an
276 + * earlier version of the code supported hotplug and unplug. They were removed
277 + * early on because they were never used.
278 + * 
279 + * As Andrew Tridgell says, "Untested code is buggy code".
280 + *
281 + * It's worth reading this carefully: we start with a pointer to the new device
282 + * descriptor in the "lguest_devices" page. */
283 + static void add_lguest_device(struct lguest_device_desc *d)
284 + {
285 + struct lguest_device *ldev;
286 + 
287 + ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
288 + if (!ldev) {
289 + printk(KERN_EMERG "Cannot allocate lguest dev %u\n",
290 + dev_index++);
291 + return;
292 + }
293 + 
294 + /* This device's parent is the lguest/ dir. */
295 + ldev->vdev.dev.parent = &lguest_root;
296 + /* We have a unique device index thanks to the dev_index counter. */
297 + ldev->vdev.index = dev_index++;
298 + /* The device type comes straight from the descriptor. There's also a
299 + * device vendor field in the virtio_device struct, which we leave as
300 + * 0. */
301 + ldev->vdev.id.device = d->type;
302 + /* We have a simple set of routines for querying the device's
303 + * configuration information and setting its status. */
304 + ldev->vdev.config = &lguest_config_ops;
305 + /* And we remember the device's descriptor for lguest_config_ops. */
306 + ldev->desc = d;
307 + 
308 + /* register_virtio_device() sets up the generic fields for the struct
309 + * virtio_device and calls device_register(). This makes the bus
310 + * infrastructure look for a matching driver. */
311 + if (register_virtio_device(&ldev->vdev) != 0) {
312 + printk(KERN_ERR "Failed to register lguest device %u\n",
313 + ldev->vdev.index);
314 + kfree(ldev);
315 + }
316 + }
317 + 
318 + /*D:110 scan_devices() simply iterates through the device page. The type 0 is
319 + * reserved to mean "end of devices". */
320 + static void scan_devices(void)
321 + {
322 + unsigned int i;
323 + struct lguest_device_desc *d;
324 + 
325 + /* We start at the page beginning, and skip over each entry. */
326 + for (i = 0; i < PAGE_SIZE; i += sizeof(*d) + d->config_len) {
327 + d = lguest_devices + i;
328 + 
329 + /* Once we hit a zero, stop. */
330 + if (d->type == 0)
331 + break;
332 + 
333 + add_lguest_device(d);
334 + }
335 + }
336 + 
337 + /*D:105 Fairly early in boot, lguest_devices_init() is called to set up the
338 + * lguest device infrastructure. We check that we are a Guest by checking
339 + * pv_info.name: there are other ways of checking, but this seems most
340 + * obvious to me.
341 + *
342 + * So we can access the "struct lguest_device_desc"s easily, we map that memory
343 + * and store the pointer in the global "lguest_devices". Then we register a
344 + * root device from which all our devices will hang (this seems to be the
345 + * correct sysfs incantation).
346 + *
347 + * Finally we call scan_devices() which adds all the devices found in the
348 + * lguest_devices page. */
349 + static int __init lguest_devices_init(void)
350 + {
351 + if (strcmp(pv_info.name, "lguest") != 0)
352 + return 0;
353 + 
354 + if (device_register(&lguest_root) != 0)
355 + panic("Could not register lguest root");
356 + 
357 + /* Devices are in a single page above top of "normal" mem */
358 + lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
359 + 
360 + scan_devices();
361 + return 0;
362 + }
363 + /* We do this after core stuff, but before the drivers. */
364 + postcore_initcall(lguest_devices_init);
365 + 
366 + /*D:150 At this point in the journey we used to now wade through the lguest
367 + * devices themselves: net, block and console. Since they're all now virtio
368 + * devices rather than lguest-specific, I've decided to ignore them. Mostly,
369 + * they're kind of boring.
But this does mean you'll never experience the 370 + * thrill of reading the forbidden love scene buried deep in the block driver. 371 + * 372 + * "make Launcher" beckons, where we answer questions like "Where do Guests 373 + * come from?", and "What do you do when someone asks for optimization?". */
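The type/length/value walk that lg_find() does is simple enough to try in userspace. A small sketch with a hand-built config blob; the field types (1 and 2) and their contents are invented for illustration, but the loop has the same shape as the one above.

#include <stdint.h>
#include <stdio.h>

#define FHDR_LEN 2	/* a type byte plus a length byte */

/* Two made-up fields: type 1 carrying 4 bytes, type 2 carrying 1 byte. */
static const uint8_t config[] = {
	1, 4, 'e', 't', 'h', '0',
	2, 1, 42,
};

int main(void)
{
	unsigned int i;

	/* Skip from field header to field header, as lg_find() does. */
	for (i = 0; i + FHDR_LEN <= sizeof(config);
	     i += FHDR_LEN + config[i + 1])
		printf("field type %u, length %u\n",
		       (unsigned)config[i], (unsigned)config[i + 1]);
	return 0;
}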
+33 -105
drivers/lguest/lguest_user.c
··· 1 1 /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
2 2 * controls and communicates with the Guest. For example, the first write will
3 - * tell us the memory size, pagetable, entry point and kernel address offset.
4 - * A read will run the Guest until a signal is pending (-EINTR), or the Guest
5 - * does a DMA out to the Launcher. Writes are also used to get a DMA buffer
6 - * registered by the Guest and to send the Guest an interrupt. :*/
3 + * tell us the Guest's memory layout, pagetable, entry point and kernel address
4 + * offset. A read will run the Guest until something happens, such as a signal
5 + * or the Guest doing a NOTIFY out to the Launcher. :*/
7 6 #include <linux/uaccess.h>
8 7 #include <linux/miscdevice.h>
9 8 #include <linux/fs.h>
10 9 #include "lg.h"
11 10 
12 - /*L:030 setup_regs() doesn't really belong in this file, but it gives us an
13 - * early glimpse deeper into the Host so it's worth having here.
14 - *
15 - * Most of the Guest's registers are left alone: we used get_zeroed_page() to
16 - * allocate the structure, so they will be 0. */
17 - static void setup_regs(struct lguest_regs *regs, unsigned long start)
18 - {
19 - /* There are four "segment" registers which the Guest needs to boot:
20 - * The "code segment" register (cs) refers to the kernel code segment
21 - * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
22 - * refer to the kernel data segment __KERNEL_DS.
23 - *
24 - * The privilege level is packed into the lower bits. The Guest runs
25 - * at privilege level 1 (GUEST_PL).*/
26 - regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
27 - regs->cs = __KERNEL_CS|GUEST_PL;
28 - 
29 - /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002)
30 - * is supposed to always be "1". Bit 9 (0x200) controls whether
31 - * interrupts are enabled. We always leave interrupts enabled while
32 - * running the Guest. */
33 - regs->eflags = 0x202;
34 - 
35 - /* The "Extended Instruction Pointer" register says where the Guest is
36 - * running. */
37 - regs->eip = start;
38 - 
39 - /* %esi points to our boot information, at physical address 0, so don't
40 - * touch it. */
41 - }
42 - 
43 - /*L:310 To send DMA into the Guest, the Launcher needs to be able to ask for a
44 - * DMA buffer. This is done by writing LHREQ_GETDMA and the key to
45 - * /dev/lguest. */
46 - static long user_get_dma(struct lguest *lg, const u32 __user *input)
47 - {
48 - unsigned long key, udma, irq;
49 - 
50 - /* Fetch the key they wrote to us. */
51 - if (get_user(key, input) != 0)
52 - return -EFAULT;
53 - /* Look for a free Guest DMA buffer bound to that key. */
54 - udma = get_dma_buffer(lg, key, &irq);
55 - if (!udma)
56 - return -ENOENT;
57 - 
58 - /* We need to tell the Launcher what interrupt the Guest expects after
59 - * the buffer is filled. We stash it in udma->used_len. */
60 - lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
61 - 
62 - /* The (guest-physical) address of the DMA buffer is returned from
63 - * the write(). */
64 - return udma;
65 - }
66 - 
67 11 /*L:315 To force the Guest to stop running and return to the Launcher, the
68 12 * Waker writes LHREQ_BREAK and the value "1" to /dev/lguest. The
69 13 * Launcher then writes LHREQ_BREAK and "0" to release the Waker.
*/ 70 - static int break_guest_out(struct lguest *lg, const u32 __user *input) 14 + static int break_guest_out(struct lguest *lg, const unsigned long __user *input) 71 15 { 72 16 unsigned long on; 73 17 ··· 34 90 35 91 /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 36 92 * number to /dev/lguest. */ 37 - static int user_send_irq(struct lguest *lg, const u32 __user *input) 93 + static int user_send_irq(struct lguest *lg, const unsigned long __user *input) 38 94 { 39 - u32 irq; 95 + unsigned long irq; 40 96 41 97 if (get_user(irq, input) != 0) 42 98 return -EFAULT; ··· 77 133 return len; 78 134 } 79 135 80 - /* If we returned from read() last time because the Guest sent DMA, 136 + /* If we returned from read() last time because the Guest notified, 81 137 * clear the flag. */ 82 - if (lg->dma_is_pending) 83 - lg->dma_is_pending = 0; 138 + if (lg->pending_notify) 139 + lg->pending_notify = 0; 84 140 85 141 /* Run the Guest until something interesting happens. */ 86 142 return run_guest(lg, (unsigned long __user *)user); 87 143 } 88 144 89 - /*L:020 The initialization write supplies 4 32-bit values (in addition to the 90 - * 32-bit LHREQ_INITIALIZE value). These are: 145 + /*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) 146 + * values (in addition to the LHREQ_INITIALIZE value). These are: 147 + * 148 + * base: The start of the Guest-physical memory inside the Launcher memory. 91 149 * 92 150 * pfnlimit: The highest (Guest-physical) page number the Guest should be 93 151 * allowed to access. The Launcher has to live in Guest memory, so it sets ··· 99 153 * pagetables (which are set up by the Launcher). 100 154 * 101 155 * start: The first instruction to execute ("eip" in x86-speak). 102 - * 103 - * page_offset: The PAGE_OFFSET constant in the Guest kernel. We should 104 - * probably wean the code off this, but it's a very useful constant! Any 105 - * address above this is within the Guest kernel, and any kernel address can 106 - * quickly converted from physical to virtual by adding PAGE_OFFSET. It's 107 - * 0xC0000000 (3G) by default, but it's configurable at kernel build time. 108 156 */ 109 - static int initialize(struct file *file, const u32 __user *input) 157 + static int initialize(struct file *file, const unsigned long __user *input) 110 158 { 111 159 /* "struct lguest" contains everything we (the Host) know about a 112 160 * Guest. */ 113 161 struct lguest *lg; 114 - int err, i; 115 - u32 args[4]; 162 + int err; 163 + unsigned long args[4]; 116 164 117 - /* We grab the Big Lguest lock, which protects the global array 118 - * "lguests" and multiple simultaneous initializations. */ 165 + /* We grab the Big Lguest lock, which protects against multiple 166 + * simultaneous initializations. */ 119 167 mutex_lock(&lguest_lock); 120 168 /* You can't initialize twice! Close the device and start again... */ 121 169 if (file->private_data) { ··· 122 182 goto unlock; 123 183 } 124 184 125 - /* Find an unused guest. */ 126 - i = find_free_guest(); 127 - if (i < 0) { 128 - err = -ENOSPC; 185 + lg = kzalloc(sizeof(*lg), GFP_KERNEL); 186 + if (!lg) { 187 + err = -ENOMEM; 129 188 goto unlock; 130 189 } 131 - /* OK, we have an index into the "lguest" array: "lg" is a convenient 132 - * pointer. 
*/ 133 - lg = &lguests[i]; 134 190 135 191 /* Populate the easy fields of our "struct lguest" */ 136 - lg->guestid = i; 137 - lg->pfn_limit = args[0]; 138 - lg->page_offset = args[3]; 192 + lg->mem_base = (void __user *)(long)args[0]; 193 + lg->pfn_limit = args[1]; 139 194 140 195 /* We need a complete page for the Guest registers: they are accessible 141 196 * to the Guest and we can only grant it access to whole pages. */ ··· 145 210 /* Initialize the Guest's shadow page tables, using the toplevel 146 211 * address the Launcher gave us. This allocates memory, so can 147 212 * fail. */ 148 - err = init_guest_pagetable(lg, args[1]); 213 + err = init_guest_pagetable(lg, args[2]); 149 214 if (err) 150 215 goto free_regs; 151 216 152 217 /* Now we initialize the Guest's registers, handing it the start 153 218 * address. */ 154 - setup_regs(lg->regs, args[2]); 155 - 156 - /* There are a couple of GDT entries the Guest expects when first 157 - * booting. */ 158 - setup_guest_gdt(lg); 219 + lguest_arch_setup_regs(lg, args[3]); 159 220 160 221 /* The timer for lguest's clock needs initialization. */ 161 222 init_clockdev(lg); ··· 191 260 /*L:010 The first operation the Launcher does must be a write. All writes 192 261 * start with a 32 bit number: for the first write this must be 193 262 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 194 - * writes of other values to get DMA buffers and send interrupts. */ 195 - static ssize_t write(struct file *file, const char __user *input, 263 + * writes of other values to send interrupts. */ 264 + static ssize_t write(struct file *file, const char __user *in, 196 265 size_t size, loff_t *off) 197 266 { 198 267 /* Once the guest is initialized, we hold the "struct lguest" in the 199 268 * file private data. */ 200 269 struct lguest *lg = file->private_data; 201 - u32 req; 270 + const unsigned long __user *input = (const unsigned long __user *)in; 271 + unsigned long req; 202 272 203 273 if (get_user(req, input) != 0) 204 274 return -EFAULT; 205 - input += sizeof(req); 275 + input++; 206 276 207 277 /* If you haven't initialized, you must do that first. */ 208 278 if (req != LHREQ_INITIALIZE && !lg) ··· 219 287 220 288 switch (req) { 221 289 case LHREQ_INITIALIZE: 222 - return initialize(file, (const u32 __user *)input); 223 - case LHREQ_GETDMA: 224 - return user_get_dma(lg, (const u32 __user *)input); 290 + return initialize(file, input); 225 291 case LHREQ_IRQ: 226 - return user_send_irq(lg, (const u32 __user *)input); 292 + return user_send_irq(lg, input); 227 293 case LHREQ_BREAK: 228 - return break_guest_out(lg, (const u32 __user *)input); 294 + return break_guest_out(lg, input); 229 295 default: 230 296 return -EINVAL; 231 297 } ··· 249 319 mutex_lock(&lguest_lock); 250 320 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ 251 321 hrtimer_cancel(&lg->hrt); 252 - /* Free any DMA buffers the Guest had bound. */ 253 - release_all_dma(lg); 254 322 /* Free up the shadow page tables for the Guest. */ 255 323 free_guest_pagetable(lg); 256 324 /* Now all the memory cleanups are done, it's safe to release the
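Seen from the Launcher's side, that first write now looks roughly like the sketch below: one pointer-sized request word followed by the four arguments in the order initialize() reads them. The numeric values are placeholders, and LHREQ_INITIALIZE is assumed to be 0 as in linux/lguest_launcher.h.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define LHREQ_INITIALIZE 0UL	/* assumed value, see lguest_launcher.h */

int main(void)
{
	/* One pointer-sized request word, then the four arguments. */
	unsigned long args[] = {
		LHREQ_INITIALIZE,
		0x8000000UL,	/* base: guest memory inside the Launcher */
		0x20000UL,	/* pfnlimit: highest guest page number */
		0x4000UL,	/* pgdir: guest-physical toplevel pagetable */
		0x100000UL,	/* start: first eip to run */
	};
	int fd = open("/dev/lguest", O_RDWR);

	if (fd < 0 || write(fd, args, sizeof(args)) < 0) {
		perror("lguest initialize");
		return 1;
	}
	close(fd);
	return 0;
}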
+136 -114
drivers/lguest/page_tables.c
··· 13 13 #include <linux/random.h>
14 14 #include <linux/percpu.h>
15 15 #include <asm/tlbflush.h>
16 + #include <asm/uaccess.h>
16 17 #include "lg.h"
17 18 
18 19 /*M:008 We hold reference to pages, which prevents them from being swapped.
··· 45 44 * (vii) Setting up the page tables initially.
46 45 :*/
47 46 
48 - /* Pages a 4k long, and each page table entry is 4 bytes long, giving us 1024
49 - * (or 2^10) entries per page. */
50 - #define PTES_PER_PAGE_SHIFT 10
51 - #define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
52 47 
53 48 /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is
54 49 * conveniently placed at the top 4MB, so it uses a separate, complete PTE
55 50 * page. */
56 - #define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)
51 + #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
57 52 
58 53 /* We actually need a separate PTE page for each CPU. Remember that after the
59 54 * Switcher code itself comes two pages for each CPU, and we don't want this
60 55 * CPU's guest to see the pages of any other CPU. */
61 - static DEFINE_PER_CPU(spte_t *, switcher_pte_pages);
56 + static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
62 57 #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
63 58 
64 59 /*H:320 With our shadow and Guest types established, we need to deal with
65 60 * them: the page table code is curly enough to need helper functions to keep
66 61 * it clear and clean.
67 62 *
68 - * The first helper takes a virtual address, and says which entry in the top
69 - * level page table deals with that address. Since each top level entry deals
70 - * with 4M, this effectively divides by 4M. */
71 - static unsigned vaddr_to_pgd_index(unsigned long vaddr)
72 - {
73 - return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
74 - }
75 - 
76 - /* There are two functions which return pointers to the shadow (aka "real")
63 + * There are two functions which return pointers to the shadow (aka "real")
77 64 * page tables.
78 65 *
79 66 * spgd_addr() takes the virtual address and returns a pointer to the top-level
80 67 * page directory entry for that address. Since we keep track of several page
81 68 * tables, the "i" argument tells us which one we're interested in (it's
82 69 * usually the current one). */
83 - static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
70 + static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
84 71 {
85 - unsigned int index = vaddr_to_pgd_index(vaddr);
72 + unsigned int index = pgd_index(vaddr);
86 73 
87 74 /* We kill any Guest trying to touch the Switcher addresses. */
88 75 if (index >= SWITCHER_PGD_INDEX) {
··· 84 95 /* This routine then takes the PGD entry given above, which contains the
85 96 * address of the PTE page. It then returns a pointer to the PTE entry for the
86 97 * given address. */
87 - static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr)
98 + static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
88 99 {
89 - spte_t *page = __va(spgd.pfn << PAGE_SHIFT);
100 + pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
90 101 /* You should never call this if the PGD entry wasn't valid */
91 - BUG_ON(!(spgd.flags & _PAGE_PRESENT));
92 - return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
102 + BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
103 + return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE];
93 104 }
94 105 
95 106 /* These two functions are just like the above two, except they access the Guest
96 107 * page tables. Hence they return a Guest address.
*/ 97 108 static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) 98 109 { 99 - unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); 100 - return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t); 110 + unsigned int index = vaddr >> (PGDIR_SHIFT); 111 + return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t); 101 112 } 102 113 103 114 static unsigned long gpte_addr(struct lguest *lg, 104 - gpgd_t gpgd, unsigned long vaddr) 115 + pgd_t gpgd, unsigned long vaddr) 105 116 { 106 - unsigned long gpage = gpgd.pfn << PAGE_SHIFT; 107 - BUG_ON(!(gpgd.flags & _PAGE_PRESENT)); 108 - return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); 117 + unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 118 + BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); 119 + return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); 109 120 } 110 121 111 122 /*H:350 This routine takes a page number given by the Guest and converts it to ··· 138 149 * entry can be a little tricky. The flags are (almost) the same, but the 139 150 * Guest PTE contains a virtual page number: the CPU needs the real page 140 151 * number. */ 141 - static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) 152 + static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) 142 153 { 143 - spte_t spte; 144 - unsigned long pfn; 154 + unsigned long pfn, base, flags; 145 155 146 156 /* The Guest sets the global flag, because it thinks that it is using 147 157 * PGE. We only told it to use PGE so it would tell us whether it was 148 158 * flushing a kernel mapping or a userspace mapping. We don't actually 149 159 * use the global bit, so throw it away. */ 150 - spte.flags = (gpte.flags & ~_PAGE_GLOBAL); 160 + flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); 161 + 162 + /* The Guest's pages are offset inside the Launcher. */ 163 + base = (unsigned long)lg->mem_base / PAGE_SIZE; 151 164 152 165 /* We need a temporary "unsigned long" variable to hold the answer from 153 166 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 154 167 * fit in spte.pfn. get_pfn() finds the real physical number of the 155 168 * page, given the virtual number. */ 156 - pfn = get_pfn(gpte.pfn, write); 169 + pfn = get_pfn(base + pte_pfn(gpte), write); 157 170 if (pfn == -1UL) { 158 - kill_guest(lg, "failed to get page %u", gpte.pfn); 171 + kill_guest(lg, "failed to get page %lu", pte_pfn(gpte)); 159 172 /* When we destroy the Guest, we'll go through the shadow page 160 173 * tables and release_pte() them. Make sure we don't think 161 174 * this one is valid! */ 162 - spte.flags = 0; 175 + flags = 0; 163 176 } 164 - /* Now we assign the page number, and our shadow PTE is complete. */ 165 - spte.pfn = pfn; 166 - return spte; 177 + /* Now we assemble our shadow PTE from the page number and flags. */ 178 + return pfn_pte(pfn, __pgprot(flags)); 167 179 } 168 180 169 181 /*H:460 And to complete the chain, release_pte() looks like this: */ 170 - static void release_pte(spte_t pte) 182 + static void release_pte(pte_t pte) 171 183 { 172 184 /* Remember that get_user_pages() took a reference to the page, in 173 185 * get_pfn()? We have to put it back now. 
*/ 174 - if (pte.flags & _PAGE_PRESENT) 175 - put_page(pfn_to_page(pte.pfn)); 186 + if (pte_flags(pte) & _PAGE_PRESENT) 187 + put_page(pfn_to_page(pte_pfn(pte))); 176 188 } 177 189 /*:*/ 178 190 179 - static void check_gpte(struct lguest *lg, gpte_t gpte) 191 + static void check_gpte(struct lguest *lg, pte_t gpte) 180 192 { 181 - if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit) 193 + if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) 194 + || pte_pfn(gpte) >= lg->pfn_limit) 182 195 kill_guest(lg, "bad page table entry"); 183 196 } 184 197 185 - static void check_gpgd(struct lguest *lg, gpgd_t gpgd) 198 + static void check_gpgd(struct lguest *lg, pgd_t gpgd) 186 199 { 187 - if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit) 200 + if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit) 188 201 kill_guest(lg, "bad page directory entry"); 189 202 } 190 203 ··· 202 211 * true. */ 203 212 int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) 204 213 { 205 - gpgd_t gpgd; 206 - spgd_t *spgd; 214 + pgd_t gpgd; 215 + pgd_t *spgd; 207 216 unsigned long gpte_ptr; 208 - gpte_t gpte; 209 - spte_t *spte; 217 + pte_t gpte; 218 + pte_t *spte; 210 219 211 220 /* First step: get the top-level Guest page table entry. */ 212 - gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); 221 + gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); 213 222 /* Toplevel not present? We can't map it in. */ 214 - if (!(gpgd.flags & _PAGE_PRESENT)) 223 + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) 215 224 return 0; 216 225 217 226 /* Now look at the matching shadow entry. */ 218 227 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 219 - if (!(spgd->flags & _PAGE_PRESENT)) { 228 + if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 220 229 /* No shadow entry: allocate a new shadow PTE page. */ 221 230 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 222 231 /* This is not really the Guest's fault, but killing it is ··· 229 238 check_gpgd(lg, gpgd); 230 239 /* And we copy the flags to the shadow PGD entry. The page 231 240 * number in the shadow PGD is the page we just allocated. */ 232 - spgd->raw.val = (__pa(ptepage) | gpgd.flags); 241 + *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); 233 242 } 234 243 235 244 /* OK, now we look at the lower level in the Guest page table: keep its 236 245 * address, because we might update it later. */ 237 246 gpte_ptr = gpte_addr(lg, gpgd, vaddr); 238 - gpte = mkgpte(lgread_u32(lg, gpte_ptr)); 247 + gpte = lgread(lg, gpte_ptr, pte_t); 239 248 240 249 /* If this page isn't in the Guest page tables, we can't page it in. */ 241 - if (!(gpte.flags & _PAGE_PRESENT)) 250 + if (!(pte_flags(gpte) & _PAGE_PRESENT)) 242 251 return 0; 243 252 244 253 /* Check they're not trying to write to a page the Guest wants 245 254 * read-only (bit 2 of errcode == write). */ 246 - if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) 255 + if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 247 256 return 0; 248 257 249 258 /* User access to a kernel page? (bit 3 == user access) */ 250 - if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) 259 + if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 251 260 return 0; 252 261 253 262 /* Check that the Guest PTE flags are OK, and the page number is below 254 263 * the pfn_limit (ie. not mapping the Launcher binary). 
*/
255 264 check_gpte(lg, gpte);
256 265 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
257 - gpte.flags |= _PAGE_ACCESSED;
266 + gpte = pte_mkyoung(gpte);
267 + 
258 268 if (errcode & 2)
259 - gpte.flags |= _PAGE_DIRTY;
269 + gpte = pte_mkdirty(gpte);
260 270 
261 271 /* Get the pointer to the shadow PTE entry we're going to set. */
262 272 spte = spte_addr(lg, *spgd, vaddr);
··· 267 275 
268 276 /* If this is a write, we insist that the Guest page is writable (the
269 277 * final arg to gpte_to_spte()). */
270 - if (gpte.flags & _PAGE_DIRTY)
278 + if (pte_dirty(gpte))
271 279 *spte = gpte_to_spte(lg, gpte, 1);
272 - else {
280 + else
273 281 /* If this is a read, don't set the "writable" bit in the page
274 282 * table entry, even if the Guest says it's writable. That way
275 283 * we come back here when a write does actually occur, so we can
276 284 * update the Guest's _PAGE_DIRTY flag. */
277 - gpte_t ro_gpte = gpte;
278 - ro_gpte.flags &= ~_PAGE_RW;
279 - *spte = gpte_to_spte(lg, ro_gpte, 0);
280 - }
285 + *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);
281 286 
282 287 /* Finally, we write the Guest PTE entry back: we've set the
283 288 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
284 - lgwrite_u32(lg, gpte_ptr, gpte.raw.val);
289 + lgwrite(lg, gpte_ptr, pte_t, gpte);
285 290 
286 291 /* We succeeded in mapping the page! */
287 292 return 1;
··· 294 305 * mapped by the shadow page tables, and is it writable? */
295 306 static int page_writable(struct lguest *lg, unsigned long vaddr)
296 307 {
297 - spgd_t *spgd;
308 + pgd_t *spgd;
298 309 unsigned long flags;
299 310 
300 311 /* Look at the top level entry: is it present? */
301 312 spgd = spgd_addr(lg, lg->pgdidx, vaddr);
302 - if (!(spgd->flags & _PAGE_PRESENT))
313 + if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
303 314 return 0;
304 315 
305 316 /* Check the flags on the pte entry itself: it must be present and
306 317 * writable. */
307 - flags = spte_addr(lg, *spgd, vaddr)->flags;
318 + flags = pte_flags(*(spte_addr(lg, *spgd, vaddr)));
319 + 
308 320 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
309 321 }
310 322 
··· 319 329 }
320 330 
321 331 /*H:450 If we chase down the release_pgd() code, it looks like this: */
322 - static void release_pgd(struct lguest *lg, spgd_t *spgd)
332 + static void release_pgd(struct lguest *lg, pgd_t *spgd)
323 333 {
324 334 /* If the entry's not present, there's nothing to release. */
325 - if (spgd->flags & _PAGE_PRESENT) {
335 + if (pgd_flags(*spgd) & _PAGE_PRESENT) {
326 336 unsigned int i;
327 337 /* Converting the pfn to find the actual PTE page is easy: turn
328 338 * the page number into a physical address, then convert to a
329 339 * virtual address (easy for kernel pages like this one). */
330 - spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT);
340 + pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
331 341 /* For each entry in the page, we might need to release it. */
332 - for (i = 0; i < PTES_PER_PAGE; i++)
342 + for (i = 0; i < PTRS_PER_PTE; i++)
333 343 release_pte(ptepage[i]);
334 344 /* Now we can free the page of PTEs */
335 345 free_page((long)ptepage);
336 346 /* And zero out the PGD entry so we never release it twice. */
337 - spgd->raw.val = 0;
347 + *spgd = __pgd(0);
338 348 }
339 349 }
340 350 
··· 346 356 {
347 357 unsigned int i;
348 358 /* Release every pgd entry up to the kernel's address.
*/
349 - for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
359 + for (i = 0; i < pgd_index(lg->kernel_address); i++)
350 360 release_pgd(lg, lg->pgdirs[idx].pgdir + i);
351 361 }
352 362 
··· 359 369 }
360 370 /*:*/
361 371 
372 + /* We walk down the guest page tables to get a guest-physical address */
373 + unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
374 + {
375 + pgd_t gpgd;
376 + pte_t gpte;
377 + 
378 + /* First step: get the top-level Guest page table entry. */
379 + gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
380 + /* Toplevel not present? We can't map it in. */
381 + if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
382 + kill_guest(lg, "Bad address %#lx", vaddr);
383 + 
384 + gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t);
385 + if (!(pte_flags(gpte) & _PAGE_PRESENT))
386 + kill_guest(lg, "Bad address %#lx", vaddr);
387 + 
388 + return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
389 + }
390 + 
362 391 /* We keep several page tables. This is a simple routine to find the page
363 392 * table (if any) corresponding to this top-level address the Guest has given
364 393 * us. */
··· 385 376 {
386 377 unsigned int i;
387 378 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
388 - if (lg->pgdirs[i].cr3 == pgtable)
379 + if (lg->pgdirs[i].gpgdir == pgtable)
389 380 break;
390 381 return i;
391 382 }
··· 394 385 * allocate a new one (and so the kernel parts are not there), we set
395 386 * blank_pgdir. */
396 387 static unsigned int new_pgdir(struct lguest *lg,
397 - unsigned long cr3,
388 + unsigned long gpgdir,
398 389 int *blank_pgdir)
399 390 {
400 391 unsigned int next;
··· 404 395 next = random32() % ARRAY_SIZE(lg->pgdirs);
405 396 /* If it's never been allocated at all before, try now. */
406 397 if (!lg->pgdirs[next].pgdir) {
407 - lg->pgdirs[next].pgdir = (spte_t *)get_zeroed_page(GFP_KERNEL);
398 + lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
408 399 /* If the allocation fails, just keep using the one we have */
409 400 if (!lg->pgdirs[next].pgdir)
410 401 next = lg->pgdidx;
··· 414 405 *blank_pgdir = 1;
415 406 }
416 407 /* Record which Guest toplevel this shadows. */
417 - lg->pgdirs[next].cr3 = cr3;
408 + lg->pgdirs[next].gpgdir = gpgdir;
418 409 /* Release all the non-kernel mappings. */
419 410 flush_user_mappings(lg, next);
420 411 
··· 481 472 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
482 473 */
483 474 static void do_set_pte(struct lguest *lg, int idx,
484 - unsigned long vaddr, gpte_t gpte)
475 + unsigned long vaddr, pte_t gpte)
485 476 {
486 477 /* Look up the matching shadow page directory entry. */
487 - spgd_t *spgd = spgd_addr(lg, idx, vaddr);
478 + pgd_t *spgd = spgd_addr(lg, idx, vaddr);
488 479 
489 480 /* If the top level isn't present, there's no entry to update. */
490 - if (spgd->flags & _PAGE_PRESENT) {
481 + if (pgd_flags(*spgd) & _PAGE_PRESENT) {
491 482 /* Otherwise, we start by releasing the existing entry. */
492 - spte_t *spte = spte_addr(lg, *spgd, vaddr);
483 + pte_t *spte = spte_addr(lg, *spgd, vaddr);
493 484 release_pte(*spte);
494 485 
495 486 /* If they're setting this entry as dirty or accessed, we might
496 487 * as well put that entry they've given us in now. This shaves
497 488 * 10% off a copy-on-write micro-benchmark.
*/
498 489 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
499 490 check_gpte(lg, gpte);
500 - *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY);
491 + *spte = gpte_to_spte(lg, gpte,
492 + pte_flags(gpte) & _PAGE_DIRTY);
501 493 } else
502 494 /* Otherwise we can demand_page() it in later. */
503 - spte->raw.val = 0;
495 + *spte = __pte(0);
504 496 }
505 497 }
506 498 
··· 516 506 * The benefit is that when we have to track a new page table, we can keep
517 507 * all the kernel mappings. This speeds up context switch immensely. */
518 508 void guest_set_pte(struct lguest *lg,
519 - unsigned long cr3, unsigned long vaddr, gpte_t gpte)
509 + unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
520 510 {
521 511 /* Kernel mappings must be changed on all top levels. Slow, but
522 512 * doesn't happen often. */
523 - if (vaddr >= lg->page_offset) {
513 + if (vaddr >= lg->kernel_address) {
524 514 unsigned int i;
525 515 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
526 516 if (lg->pgdirs[i].pgdir)
527 517 do_set_pte(lg, i, vaddr, gpte);
528 518 } else {
529 519 /* Is this page table one we have a shadow for? */
530 - int pgdir = find_pgdir(lg, cr3);
520 + int pgdir = find_pgdir(lg, gpgdir);
531 521 if (pgdir != ARRAY_SIZE(lg->pgdirs))
532 522 /* If so, do the update. */
533 523 do_set_pte(lg, pgdir, vaddr, gpte);
··· 548 538 *
549 539 * So with that in mind here's our code to update a (top-level) PGD entry:
550 540 */
551 - void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
541 + void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
552 542 {
553 543 int pgdir;
554 544 
··· 558 548 return;
559 549 
560 550 /* If they're talking about a page table we have a shadow for... */
561 - pgdir = find_pgdir(lg, cr3);
551 + pgdir = find_pgdir(lg, gpgdir);
562 552 if (pgdir < ARRAY_SIZE(lg->pgdirs))
563 553 /* ... throw it away. */
564 554 release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
··· 570 560 * its first page table is. We set some things up here: */
571 561 int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
572 562 {
573 - /* In flush_user_mappings() we loop from 0 to
574 - * "vaddr_to_pgd_index(lg->page_offset)". This assumes it won't hit
575 - * the Switcher mappings, so check that now. */
576 - if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
577 - return -EINVAL;
578 563 /* We start on the first shadow page table, and give it a blank PGD
579 564 * page. */
580 565 lg->pgdidx = 0;
581 - lg->pgdirs[lg->pgdidx].cr3 = pgtable;
566 + lg->pgdirs[lg->pgdidx].gpgdir = pgtable;
582 - lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL);
567 + lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL);
583 568 if (!lg->pgdirs[lg->pgdidx].pgdir)
584 569 return -ENOMEM;
585 570 return 0;
571 + }
572 + 
573 + /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
574 + void page_table_guest_data_init(struct lguest *lg)
575 + {
576 + /* We get the kernel address: above this is all kernel memory. */
577 + if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
578 + /* We tell the Guest that it can't use the top 4MB of virtual
579 + * addresses used by the Switcher.
*/ 580 + || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) 581 + || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) 582 + kill_guest(lg, "bad guest page %p", lg->lguest_data); 583 + 584 + /* In flush_user_mappings() we loop from 0 to 585 + * "pgd_index(lg->kernel_address)". This assumes it won't hit the 586 + * Switcher mappings, so check that now. */ 587 + if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) 588 + kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); 586 589 } 587 590 588 591 /* When a Guest dies, our cleanup is fairly simple. */ ··· 617 594 * for each CPU already set up, we just need to hook them in. */ 618 595 void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) 619 596 { 620 - spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 621 - spgd_t switcher_pgd; 622 - spte_t regs_pte; 597 + pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 598 + pgd_t switcher_pgd; 599 + pte_t regs_pte; 623 600 624 601 /* Make the last PGD entry for this Guest point to the Switcher's PTE 625 602 * page for this CPU (with appropriate flags). */ 626 - switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; 627 - switcher_pgd.flags = _PAGE_KERNEL; 603 + switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); 604 + 628 605 lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 629 606 630 607 /* We also change the Switcher PTE page. When we're running the Guest, ··· 634 611 * CPU's "struct lguest_pages": if we make sure the Guest's register 635 612 * page is already mapped there, we don't have to copy them out 636 613 * again. */ 637 - regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; 638 - regs_pte.flags = _PAGE_KERNEL; 639 - switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] 640 - = regs_pte; 614 + regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); 615 + switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; 641 616 } 642 617 /*:*/ 643 618 ··· 656 635 unsigned int pages) 657 636 { 658 637 unsigned int i; 659 - spte_t *pte = switcher_pte_page(cpu); 638 + pte_t *pte = switcher_pte_page(cpu); 660 639 661 640 /* The first entries are easy: they map the Switcher code. */ 662 641 for (i = 0; i < pages; i++) { 663 - pte[i].pfn = page_to_pfn(switcher_page[i]); 664 - pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; 642 + pte[i] = mk_pte(switcher_page[i], 643 + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); 665 644 } 666 645 667 646 /* The only other thing we map is this CPU's pair of pages. */ 668 647 i = pages + cpu*2; 669 648 670 649 /* First page (Guest registers) is writable from the Guest */ 671 - pte[i].pfn = page_to_pfn(switcher_page[i]); 672 - pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; 650 + pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), 651 + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); 652 + 673 653 /* The second page contains the "struct lguest_ro_state", and is 674 654 * read-only. 
*/ 675 - pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); 676 - pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; 655 + pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), 656 + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); 677 657 } 678 658 679 659 /*H:510 At boot or module load time, init_pagetables() allocates and populates ··· 684 662 unsigned int i; 685 663 686 664 for_each_possible_cpu(i) { 687 - switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL); 665 + switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); 688 666 if (!switcher_pte_page(i)) { 689 667 free_switcher_pte_pages(); 690 668 return -ENOMEM;
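A note on the arithmetic the page-table code above leans on: with standard two-level i386 paging, the top 10 bits of a virtual address select the PGD entry, the next 10 bits select the PTE within that page table, and the low 12 bits are the byte offset. That is why pgd_index(lg->kernel_address) can be compared directly against SWITCHER_PGD_INDEX, and why the Switcher's top-4MB reservation is exactly one PGD slot (a single entry maps 4MB). A stand-alone sketch of the split (user-space C; the helper names here are mine, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT   12
#define PAGE_SIZE    (1UL << PAGE_SHIFT)
#define PTRS_PER_PTE 1024

/* Top 10 bits: which PGD entry covers this virtual address? */
static unsigned int pgd_index_sketch(uint32_t vaddr)
{
        return vaddr >> 22;
}

/* Middle 10 bits: which PTE within that page table? */
static unsigned int pte_index_sketch(uint32_t vaddr)
{
        return (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}

int main(void)
{
        uint32_t vaddr = 0xC0123456; /* a typical i386 kernel address */

        printf("vaddr %#x -> pgd %u, pte %u, offset %#lx\n",
               vaddr, pgd_index_sketch(vaddr), pte_index_sketch(vaddr),
               (unsigned long)(vaddr & (PAGE_SIZE - 1)));
        return 0;
}

For 0xC0123456 this prints pgd 768, pte 291, offset 0x456: the same walk demand_page() performs against the shadow tables.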
+14 -14
drivers/lguest/segments.c
··· 73 73 /* Segment descriptors contain a privilege level: the Guest is 74 74 * sometimes careless and leaves this as 0, even though it's 75 75 * running at privilege level 1. If so, we fix it here. */ 76 - if ((lg->gdt[i].b & 0x00006000) == 0) 77 - lg->gdt[i].b |= (GUEST_PL << 13); 76 + if ((lg->arch.gdt[i].b & 0x00006000) == 0) 77 + lg->arch.gdt[i].b |= (GUEST_PL << 13); 78 78 79 79 /* Each descriptor has an "accessed" bit. If we don't set it 80 80 * now, the CPU will try to set it when the Guest first loads 81 81 * that entry into a segment register. But the GDT isn't 82 82 * writable by the Guest, so bad things can happen. */ 83 - lg->gdt[i].b |= 0x00000100; 83 + lg->arch.gdt[i].b |= 0x00000100; 84 84 } 85 85 } 86 86 ··· 106 106 void setup_guest_gdt(struct lguest *lg) 107 107 { 108 108 /* Start with full 0-4G segments... */ 109 - lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 110 - lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 109 + lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 110 + lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 111 111 /* ...except the Guest is allowed to use them, so set the privilege 112 112 * level appropriately in the flags. */ 113 - lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); 114 - lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 113 + lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); 114 + lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 115 115 } 116 116 117 117 /* Like the IDT, we never simply use the GDT the Guest gives us. We set up the ··· 126 126 unsigned int i; 127 127 128 128 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) 129 - gdt[i] = lg->gdt[i]; 129 + gdt[i] = lg->arch.gdt[i]; 130 130 } 131 131 132 132 /* This is the full version */ ··· 138 138 * replaced. See ignored_gdt() above. */ 139 139 for (i = 0; i < GDT_ENTRIES; i++) 140 140 if (!ignored_gdt(i)) 141 - gdt[i] = lg->gdt[i]; 141 + gdt[i] = lg->arch.gdt[i]; 142 142 } 143 143 144 144 /* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */ ··· 146 146 { 147 147 /* We assume the Guest has the same number of GDT entries as the 148 148 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ 149 - if (num > ARRAY_SIZE(lg->gdt)) 149 + if (num > ARRAY_SIZE(lg->arch.gdt)) 150 150 kill_guest(lg, "too many gdt entries %i", num); 151 151 152 152 /* We read the whole thing in, then fix it up. */ 153 - lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); 154 - fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt)); 153 + __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0])); 154 + fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt)); 155 155 /* Mark that the GDT changed so the core knows it has to copy it again, 156 156 * even if the Guest is run on the same CPU. */ 157 157 lg->changed |= CHANGED_GDT; ··· 159 159 160 160 void guest_load_tls(struct lguest *lg, unsigned long gtls) 161 161 { 162 - struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN]; 162 + struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; 163 163 164 - lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 164 + __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 165 165 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 166 166 lg->changed |= CHANGED_GDT_TLS; 167 167 }
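fixup_gdt_table() above is pure bit-twiddling on the high word ("b") of a descriptor: the DPL occupies bits 13-14 (hence the 0x00006000 mask and GUEST_PL << 13), and the accessed flag is bit 8 (0x00000100). The same two fixups in a stand-alone sketch (the sample descriptor value is illustrative, not taken from a real Guest GDT):

#include <stdint.h>
#include <stdio.h>

#define GUEST_PL 1      /* the Guest runs at privilege level 1 */

/* High 32 bits of a GDT descriptor, like the "b" field above. */
static uint32_t fixup_descriptor(uint32_t b)
{
        /* If the DPL (bits 13-14) was left as 0, raise it to GUEST_PL. */
        if ((b & 0x00006000) == 0)
                b |= (uint32_t)GUEST_PL << 13;

        /* Pre-set the accessed bit so the CPU never has to write it. */
        b |= 0x00000100;
        return b;
}

int main(void)
{
        /* 0x00C09A00: a flat 32-bit code segment with DPL 0. */
        printf("%#010x\n", fixup_descriptor(0x00C09A00));
        return 0;
}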
+4 -3
drivers/lguest/switcher.S drivers/lguest/x86/switcher_32.S
··· 48 48 #include <linux/linkage.h> 49 49 #include <asm/asm-offsets.h> 50 50 #include <asm/page.h> 51 - #include "lg.h" 51 + #include <asm/segment.h> 52 + #include <asm/lguest.h> 52 53 53 54 // We mark the start of the code to copy 54 55 // It's placed in .text tho it's never run here ··· 133 132 // The Guest's register page has been mapped 134 133 // Writable onto our %esp (stack) -- 135 134 // We can simply pop off all Guest regs. 135 + popl %eax 136 136 popl %ebx 137 137 popl %ecx 138 138 popl %edx ··· 141 139 popl %edi 142 140 popl %ebp 143 141 popl %gs 144 - popl %eax 145 142 popl %fs 146 143 popl %ds 147 144 popl %es ··· 168 167 pushl %es; \ 169 168 pushl %ds; \ 170 169 pushl %fs; \ 171 - pushl %eax; \ 172 170 pushl %gs; \ 173 171 pushl %ebp; \ 174 172 pushl %edi; \ ··· 175 175 pushl %edx; \ 176 176 pushl %ecx; \ 177 177 pushl %ebx; \ 178 + pushl %eax; \ 178 179 /* Our stack and our code are using segments \ 179 180 * Set in the TSS and IDT \ 180 181 * Yet if we were to touch data we'd use \
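Why the %eax shuffle above matters: the Guest's register page is mapped onto %esp, so each popl consumes the next field of the in-memory register frame, and the pushl sequence must remain the exact mirror of the popl sequence. Popping %eax first therefore implies a frame that begins with eax, roughly like this sketch (field order inferred from the push/pop order shown, with %esi falling in the elided middle; an illustration, not a quote of struct lguest_regs):

#include <stddef.h>
#include <stdio.h>

/* Register frame implied by the pushl/popl order above: pops consume
 * it from the lowest address upwards, so %eax (popped first) is first. */
struct guest_regs_sketch {
        unsigned long eax, ebx, ecx, edx;
        unsigned long esi, edi, ebp;
        unsigned long gs, fs, ds, es;
};

int main(void)
{
        /* eax at offset 0 is exactly what the first "popl %eax" sees. */
        printf("eax at %zu, es at %zu\n",
               offsetof(struct guest_regs_sketch, eax),
               offsetof(struct guest_regs_sketch, es));
        return 0;
}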
+577
drivers/lguest/x86/core.c
··· 1 + /* 2 + * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 3 + * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI. 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms of the GNU General Public License as published by 7 + * the Free Software Foundation; either version 2 of the License, or 8 + * (at your option) any later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but 11 + * WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 13 + * NON INFRINGEMENT. See the GNU General Public License for more 14 + * details. 15 + * 16 + * You should have received a copy of the GNU General Public License 17 + * along with this program; if not, write to the Free Software 18 + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 + */ 20 + #include <linux/kernel.h> 21 + #include <linux/start_kernel.h> 22 + #include <linux/string.h> 23 + #include <linux/console.h> 24 + #include <linux/screen_info.h> 25 + #include <linux/irq.h> 26 + #include <linux/interrupt.h> 27 + #include <linux/clocksource.h> 28 + #include <linux/clockchips.h> 29 + #include <linux/cpu.h> 30 + #include <linux/lguest.h> 31 + #include <linux/lguest_launcher.h> 32 + #include <asm/paravirt.h> 33 + #include <asm/param.h> 34 + #include <asm/page.h> 35 + #include <asm/pgtable.h> 36 + #include <asm/desc.h> 37 + #include <asm/setup.h> 38 + #include <asm/lguest.h> 39 + #include <asm/uaccess.h> 40 + #include <asm/i387.h> 41 + #include "../lg.h" 42 + 43 + static int cpu_had_pge; 44 + 45 + static struct { 46 + unsigned long offset; 47 + unsigned short segment; 48 + } lguest_entry; 49 + 50 + /* Offset from where switcher.S was compiled to where we've copied it */ 51 + static unsigned long switcher_offset(void) 52 + { 53 + return SWITCHER_ADDR - (unsigned long)start_switcher_text; 54 + } 55 + 56 + /* This cpu's struct lguest_pages. */ 57 + static struct lguest_pages *lguest_pages(unsigned int cpu) 58 + { 59 + return &(((struct lguest_pages *) 60 + (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); 61 + } 62 + 63 + static DEFINE_PER_CPU(struct lguest *, last_guest); 64 + 65 + /*S:010 66 + * We are getting close to the Switcher. 67 + * 68 + * Remember that each CPU has two pages which are visible to the Guest when it 69 + * runs on that CPU. This has to contain the state for that Guest: we copy the 70 + * state in just before we run the Guest. 71 + * 72 + * Each Guest has "changed" flags which indicate what has changed in the Guest 73 + * since it last ran. We saw this set in interrupts_and_traps.c and 74 + * segments.c. 75 + */ 76 + static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) 77 + { 78 + /* Copying all this data can be quite expensive. We usually run the 79 + * same Guest we ran last time (and that Guest hasn't run anywhere else 80 + * meanwhile). If that's not the case, we pretend everything in the 81 + * Guest has changed. */ 82 + if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { 83 + __get_cpu_var(last_guest) = lg; 84 + lg->last_pages = pages; 85 + lg->changed = CHANGED_ALL; 86 + } 87 + 88 + /* These copies are pretty cheap, so we do them unconditionally: */ 89 + /* Save the current Host top-level page directory. */ 90 + pages->state.host_cr3 = __pa(current->mm->pgd); 91 + /* Set up the Guest's page tables to see this CPU's pages (and no 92 + * other CPU's pages). 
*/ 93 + map_switcher_in_guest(lg, pages);
 94 + /* Set up the two "TSS" members which tell the CPU what stack to use
 95 + * for traps which go directly into the Guest (ie. traps at privilege
 96 + * level 1). */
 97 + pages->state.guest_tss.esp1 = lg->esp1;
 98 + pages->state.guest_tss.ss1 = lg->ss1;
 99 +
 100 + /* Copy direct-to-Guest trap entries. */
 101 + if (lg->changed & CHANGED_IDT)
 102 + copy_traps(lg, pages->state.guest_idt, default_idt_entries);
 103 +
 104 + /* Copy all GDT entries which the Guest can change. */
 105 + if (lg->changed & CHANGED_GDT)
 106 + copy_gdt(lg, pages->state.guest_gdt);
 107 + /* If only the TLS entries have changed, copy them. */
 108 + else if (lg->changed & CHANGED_GDT_TLS)
 109 + copy_gdt_tls(lg, pages->state.guest_gdt);
 110 +
 111 + /* Mark the Guest as unchanged for next time. */
 112 + lg->changed = 0;
 113 + }
 114 +
 115 + /* Finally: the code to actually call into the Switcher to run the Guest. */
 116 + static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
 117 + {
 118 + /* This is a dummy value we need for GCC's sake. */
 119 + unsigned int clobber;
 120 +
 121 + /* Copy the guest-specific information into this CPU's "struct
 122 + * lguest_pages". */
 123 + copy_in_guest_info(lg, pages);
 124 +
 125 + /* Set the trap number to 256 (impossible value). If we fault while
 126 + * switching to the Guest (bad segment registers or bug), this will
 127 + * cause us to abort the Guest. */
 128 + lg->regs->trapnum = 256;
 129 +
 130 + /* Now: we push the "eflags" register on the stack, then do an "lcall".
 131 + * This is how we change from using the kernel code segment to using
 132 + * the dedicated lguest code segment, as well as jumping into the
 133 + * Switcher.
 134 + *
 135 + * The lcall also pushes the old code segment (KERNEL_CS) onto the
 136 + * stack, then the address of this call. This stack layout happens to
 137 + * exactly match the stack of an interrupt... */
 138 + asm volatile("pushf; lcall *lguest_entry"
 139 + /* This is how we tell GCC that %eax ("a") and %ebx ("b")
 140 + * are changed by this routine. The "=" means output. */
 141 + : "=a"(clobber), "=b"(clobber)
 142 + /* %eax contains the pages pointer. ("0" refers to the
 143 + * 0-th argument above, ie "a"). %ebx contains the
 144 + * physical address of the Guest's top-level page
 145 + * directory. */
 146 + : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
 147 + /* We tell gcc that all these registers could change,
 148 + * which means we don't have to save and restore them in
 149 + * the Switcher. */
 150 + : "memory", "%edx", "%ecx", "%edi", "%esi");
 151 + }
 152 + /*:*/
 153 +
 154 + /*H:040 This is the i386-specific code to set up and run the Guest. Interrupts
 155 + * are disabled: we own the CPU. */
 156 + void lguest_arch_run_guest(struct lguest *lg)
 157 + {
 158 + /* Remember the awfully-named TS bit? If the Guest has asked
 159 + * to set it we set it now, so we can trap and pass that trap
 160 + * to the Guest if it uses the FPU. */
 161 + if (lg->ts)
 162 + lguest_set_ts();
 163 +
 164 + /* SYSENTER is an optimized way of doing system calls. We
 165 + * can't allow it because it always jumps to privilege level 0.
 166 + * A normal Guest won't try it because we don't advertise it in
 167 + * CPUID, but a malicious Guest (or malicious Guest userspace
 168 + * program) could, so we tell the CPU to disable it before
 169 + * running the Guest. */
 170 + if (boot_cpu_has(X86_FEATURE_SEP))
 171 + wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
 172 +
 173 + /* Now we actually run the Guest. 
It will pop back out when 174 + * something interesting happens, and we can examine its 175 + * registers to see what it was doing. */ 176 + run_guest_once(lg, lguest_pages(raw_smp_processor_id())); 177 + 178 + /* The "regs" pointer contains two extra entries which are not 179 + * really registers: a trap number which says what interrupt or 180 + * trap made the switcher code come back, and an error code 181 + * which some traps set. */ 182 + 183 + /* If the Guest page faulted, then the cr2 register will tell 184 + * us the bad virtual address. We have to grab this now, 185 + * because once we re-enable interrupts an interrupt could 186 + * fault and thus overwrite cr2, or we could even move off to a 187 + * different CPU. */ 188 + if (lg->regs->trapnum == 14) 189 + lg->arch.last_pagefault = read_cr2(); 190 + /* Similarly, if we took a trap because the Guest used the FPU, 191 + * we have to restore the FPU it expects to see. */ 192 + else if (lg->regs->trapnum == 7) 193 + math_state_restore(); 194 + 195 + /* Restore SYSENTER if it's supposed to be on. */ 196 + if (boot_cpu_has(X86_FEATURE_SEP)) 197 + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 198 + } 199 + 200 + /*H:130 Our Guest is usually so well behaved; it never tries to do things it 201 + * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't 202 + * quite complete, because it doesn't contain replacements for the Intel I/O 203 + * instructions. As a result, the Guest sometimes fumbles across one during 204 + * the boot process as it probes for various things which are usually attached 205 + * to a PC. 206 + * 207 + * When the Guest uses one of these instructions, we get trap #13 (General 208 + * Protection Fault) and come here. We see if it's one of those troublesome 209 + * instructions and skip over it. We return true if we did. */ 210 + static int emulate_insn(struct lguest *lg) 211 + { 212 + u8 insn; 213 + unsigned int insnlen = 0, in = 0, shift = 0; 214 + /* The eip contains the *virtual* address of the Guest's instruction: 215 + * guest_pa just subtracts the Guest's page_offset. */ 216 + unsigned long physaddr = guest_pa(lg, lg->regs->eip); 217 + 218 + /* This must be the Guest kernel trying to do something, not userspace! 219 + * The bottom two bits of the CS segment register are the privilege 220 + * level. */ 221 + if ((lg->regs->cs & 3) != GUEST_PL) 222 + return 0; 223 + 224 + /* Decoding x86 instructions is icky. */ 225 + insn = lgread(lg, physaddr, u8); 226 + 227 + /* 0x66 is an "operand prefix". It means it's using the upper 16 bits 228 + of the eax register. */ 229 + if (insn == 0x66) { 230 + shift = 16; 231 + /* The instruction is 1 byte so far, read the next byte. */ 232 + insnlen = 1; 233 + insn = lgread(lg, physaddr + insnlen, u8); 234 + } 235 + 236 + /* We can ignore the lower bit for the moment and decode the 4 opcodes 237 + * we need to emulate. */ 238 + switch (insn & 0xFE) { 239 + case 0xE4: /* in <next byte>,%al */ 240 + insnlen += 2; 241 + in = 1; 242 + break; 243 + case 0xEC: /* in (%dx),%al */ 244 + insnlen += 1; 245 + in = 1; 246 + break; 247 + case 0xE6: /* out %al,<next byte> */ 248 + insnlen += 2; 249 + break; 250 + case 0xEE: /* out %al,(%dx) */ 251 + insnlen += 1; 252 + break; 253 + default: 254 + /* OK, we don't know what this is, can't emulate. */ 255 + return 0; 256 + } 257 + 258 + /* If it was an "IN" instruction, they expect the result to be read 259 + * into %eax, so we change %eax. We always return all-ones, which 260 + * traditionally means "there's nothing there". 
*/ 261 + if (in) {
 262 + /* Lower bit tells us whether it's a 16 or 32 bit access */
 263 + if (insn & 0x1)
 264 + lg->regs->eax = 0xFFFFFFFF;
 265 + else
 266 + lg->regs->eax |= (0xFFFF << shift);
 267 + }
 268 + /* Finally, we've "done" the instruction, so move past it. */
 269 + lg->regs->eip += insnlen;
 270 + /* Success! */
 271 + return 1;
 272 + }
 273 +
 274 + /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
 275 + void lguest_arch_handle_trap(struct lguest *lg)
 276 + {
 277 + switch (lg->regs->trapnum) {
 278 + case 13: /* We've intercepted a GPF. */
 279 + /* Check if this was one of those annoying IN or OUT
 280 + * instructions which we need to emulate. If so, we
 281 + * just go back into the Guest after we've done it. */
 282 + if (lg->regs->errcode == 0) {
 283 + if (emulate_insn(lg))
 284 + return;
 285 + }
 286 + break;
 287 + case 14: /* We've intercepted a page fault. */
 288 + /* The Guest accessed a virtual address that wasn't
 289 + * mapped. This happens a lot: we don't actually set
 290 + * up most of the page tables for the Guest at all when
 291 + * we start: as it runs it asks for more and more, and
 292 + * we set them up as required. In this case, we don't
 293 + * even tell the Guest that the fault happened.
 294 + *
 295 + * The errcode tells whether this was a read or a
 296 + * write, and whether kernel or userspace code. */
 297 + if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
 298 + return;
 299 +
 300 + /* OK, it's really not there (or not OK): the Guest
 301 + * needs to know. We write out the cr2 value so it
 302 + * knows where the fault occurred.
 303 + *
 304 + * Note that if the Guest were really messed up, this
 305 + * could happen before it's done the INITIALIZE
 306 + * hypercall, so lg->lguest_data will be NULL. */
 307 + if (lg->lguest_data &&
 308 + put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
 309 + kill_guest(lg, "Writing cr2");
 310 + break;
 311 + case 7: /* We've intercepted a Device Not Available fault. */
 312 + /* If the Guest doesn't want to know, we already
 313 + * restored the Floating Point Unit, so we just
 314 + * continue without telling it. */
 315 + if (!lg->ts)
 316 + return;
 317 + break;
 318 + case 32 ... 255:
 319 + /* These values mean a real interrupt occurred, in which case
 320 + * the Host handler has already been run. We just do a
 321 + * friendly check if another process should now be run, then
 322 + * return to run the Guest again. */
 323 + cond_resched();
 324 + return;
 325 + case LGUEST_TRAP_ENTRY:
 326 + /* Our 'struct hcall_args' maps directly over our regs: we set
 327 + * up the pointer now to indicate a hypercall is pending. */
 328 + lg->hcall = (struct hcall_args *)lg->regs;
 329 + return;
 330 + }
 331 +
 332 + /* We didn't handle the trap, so it needs to go to the Guest. */
 333 + if (!deliver_trap(lg, lg->regs->trapnum))
 334 + /* If the Guest doesn't have a handler (either it hasn't
 335 + * registered any yet, or it's one of the faults we don't let
 336 + * it handle), it dies with a cryptic error message. */
 337 + kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
 338 + lg->regs->trapnum, lg->regs->eip,
 339 + lg->regs->trapnum == 14 ? lg->arch.last_pagefault
 340 + : lg->regs->errcode);
 341 + }
 342 +
 343 + /* Now we can look at each of the routines this calls, in increasing order of
 344 + * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
 345 + * deliver_trap() and demand_page(). 
After all those, we'll be ready to 346 + * examine the Switcher, and our philosophical understanding of the Host/Guest
 347 + * duality will be complete. :*/
 348 + static void adjust_pge(void *on)
 349 + {
 350 + if (on)
 351 + write_cr4(read_cr4() | X86_CR4_PGE);
 352 + else
 353 + write_cr4(read_cr4() & ~X86_CR4_PGE);
 354 + }
 355 +
 356 + /*H:020 Now the Switcher is mapped and everything else is ready, we need to do
 357 + * some more i386-specific initialization. */
 358 + void __init lguest_arch_host_init(void)
 359 + {
 360 + int i;
 361 +
 362 + /* Most of the i386/switcher.S doesn't care that it's been moved; on
 363 + * Intel, jumps are relative, and it doesn't access any references to
 364 + * external code or data.
 365 + *
 366 + * The only exception is the interrupt handlers in switcher.S: their
 367 + * addresses are placed in a table (default_idt_entries), so we need to
 368 + * update the table with the new addresses. switcher_offset() is a
 369 + * convenience function which returns the distance between the builtin
 370 + * switcher code and the high-mapped copy we just made. */
 371 + for (i = 0; i < IDT_ENTRIES; i++)
 372 + default_idt_entries[i] += switcher_offset();
 373 +
 374 + /*
 375 + * Set up the Switcher's per-cpu areas.
 376 + *
 377 + * Each CPU gets two pages of its own within the high-mapped region
 378 + * (aka. "struct lguest_pages"). Much of this can be initialized now,
 379 + * but some depends on what Guest we are running (which is set up in
 380 + * copy_in_guest_info()).
 381 + */
 382 + for_each_possible_cpu(i) {
 383 + /* lguest_pages() returns this CPU's two pages. */
 384 + struct lguest_pages *pages = lguest_pages(i);
 385 + /* This is a convenience pointer to make the code fit one
 386 + * statement to a line. */
 387 + struct lguest_ro_state *state = &pages->state;
 388 +
 389 + /* The Global Descriptor Table: the Host has a different one
 390 + * for each CPU. We keep a descriptor for the GDT which says
 391 + * where it is and how big it is (the size is actually the last
 392 + * byte, not the size, hence the "-1"). */
 393 + state->host_gdt_desc.size = GDT_SIZE-1;
 394 + state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
 395 +
 396 + /* All CPUs on the Host use the same Interrupt Descriptor
 397 + * Table, so we just use store_idt(), which gets this CPU's IDT
 398 + * descriptor. */
 399 + store_idt(&state->host_idt_desc);
 400 +
 401 + /* The descriptors for the Guest's GDT and IDT can be filled
 402 + * out now, too. We copy the GDT & IDT into ->guest_gdt and
 403 + * ->guest_idt before actually running the Guest. */
 404 + state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
 405 + state->guest_idt_desc.address = (long)&state->guest_idt;
 406 + state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
 407 + state->guest_gdt_desc.address = (long)&state->guest_gdt;
 408 +
 409 + /* We know where we want the stack to be when the Guest enters
 410 + * the switcher: in pages->regs. The stack grows downwards, so
 411 + * we start it at the end of that structure. */
 412 + state->guest_tss.esp0 = (long)(&pages->regs + 1);
 413 + /* And this is the GDT entry to use for the stack: we keep a
 414 + * couple of special LGUEST entries. */
 415 + state->guest_tss.ss0 = LGUEST_DS;
 416 +
 417 + /* x86 can have a fine-grained bitmap which indicates what I/O
 418 + * ports the process can use. We set it to the end of our
 419 + * structure, meaning "none". 
*/ 420 + state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
 421 +
 422 + /* Some GDT entries are the same across all Guests, so we can
 423 + * set them up now. */
 424 + setup_default_gdt_entries(state);
 425 + /* Most IDT entries are the same for all Guests, too. */
 426 + setup_default_idt_entries(state, default_idt_entries);
 427 +
 428 + /* The Host needs to be able to use the LGUEST segments on this
 429 + * CPU, too, so put them in the Host GDT. */
 430 + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
 431 + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
 432 + }
 433 +
 434 + /* In the Switcher, we want the %cs segment register to use the
 435 + * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
 436 + * it will be undisturbed when we switch. To change %cs and jump we
 437 + * need this structure to feed to Intel's "lcall" instruction. */
 438 + lguest_entry.offset = (long)switch_to_guest + switcher_offset();
 439 + lguest_entry.segment = LGUEST_CS;
 440 +
 441 + /* Finally, we need to turn off "Page Global Enable". PGE is an
 442 + * optimization where page table entries are specially marked to show
 443 + * they never change. The Host kernel marks all the kernel pages this
 444 + * way because it's always present, even when userspace is running.
 445 + *
 446 + * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
 447 + * switch to the Guest kernel. If you don't disable this on all CPUs,
 448 + * you'll get really weird bugs that you'll chase for two days.
 449 + *
 450 + * I used to turn PGE off every time we switched to the Guest and back
 451 + * on when we returned, but that slowed the Switcher down noticeably. */
 452 +
 453 + /* We don't need the complexity of CPUs coming and going while we're
 454 + * doing this. */
 455 + lock_cpu_hotplug();
 456 + if (cpu_has_pge) { /* We have a broader idea of "global". */
 457 + /* Remember that this was originally set (for cleanup). */
 458 + cpu_had_pge = 1;
 459 + /* adjust_pge is a helper function which sets or unsets the PGE
 460 + * bit on its CPU, depending on the argument (0 == unset). */
 461 + on_each_cpu(adjust_pge, (void *)0, 0, 1);
 462 + /* Turn off the feature in the global feature set. */
 463 + clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
 464 + }
 465 + unlock_cpu_hotplug();
 466 + };
 467 + /*:*/
 468 +
 469 + void __exit lguest_arch_host_fini(void)
 470 + {
 471 + /* If we had PGE before we started, turn it back on now. */
 472 + lock_cpu_hotplug();
 473 + if (cpu_had_pge) {
 474 + set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
 475 + /* adjust_pge's argument "1" means set PGE. */
 476 + on_each_cpu(adjust_pge, (void *)1, 0, 1);
 477 + }
 478 + unlock_cpu_hotplug();
 479 + }
 480 +
 481 +
 482 + /*H:122 The i386-specific hypercalls simply farm out to the right functions. */
 483 + int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
 484 + {
 485 + switch (args->arg0) {
 486 + case LHCALL_LOAD_GDT:
 487 + load_guest_gdt(lg, args->arg1, args->arg2);
 488 + break;
 489 + case LHCALL_LOAD_IDT_ENTRY:
 490 + load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3);
 491 + break;
 492 + case LHCALL_LOAD_TLS:
 493 + guest_load_tls(lg, args->arg1);
 494 + break;
 495 + default:
 496 + /* Bad Guest. Bad! 
*/ 497 + return -EIO; 498 + } 499 + return 0; 500 + } 501 + 502 + /*H:126 i386-specific hypercall initialization: */ 503 + int lguest_arch_init_hypercalls(struct lguest *lg) 504 + { 505 + u32 tsc_speed; 506 + 507 + /* The pointer to the Guest's "struct lguest_data" is the only 508 + * argument. We check that address now. */ 509 + if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data))) 510 + return -EFAULT; 511 + 512 + /* Having checked it, we simply set lg->lguest_data to point straight 513 + * into the Launcher's memory at the right place and then use 514 + * copy_to_user/from_user from now on, instead of lgread/write. I put 515 + * this in to show that I'm not immune to writing stupid 516 + * optimizations. */ 517 + lg->lguest_data = lg->mem_base + lg->hcall->arg1; 518 + 519 + /* We insist that the Time Stamp Counter exist and doesn't change with 520 + * cpu frequency. Some devious chip manufacturers decided that TSC 521 + * changes could be handled in software. I decided that time going 522 + * backwards might be good for benchmarks, but it's bad for users. 523 + * 524 + * We also insist that the TSC be stable: the kernel detects unreliable 525 + * TSCs for its own purposes, and we use that here. */ 526 + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) 527 + tsc_speed = tsc_khz; 528 + else 529 + tsc_speed = 0; 530 + if (put_user(tsc_speed, &lg->lguest_data->tsc_khz)) 531 + return -EFAULT; 532 + 533 + /* The interrupt code might not like the system call vector. */ 534 + if (!check_syscall_vector(lg)) 535 + kill_guest(lg, "bad syscall vector"); 536 + 537 + return 0; 538 + } 539 + /* Now we've examined the hypercall code; our Guest can make requests. There 540 + * is one other way we can do things for the Guest, as we see in 541 + * emulate_insn(). :*/ 542 + 543 + /*L:030 lguest_arch_setup_regs() 544 + * 545 + * Most of the Guest's registers are left alone: we used get_zeroed_page() to 546 + * allocate the structure, so they will be 0. */ 547 + void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) 548 + { 549 + struct lguest_regs *regs = lg->regs; 550 + 551 + /* There are four "segment" registers which the Guest needs to boot: 552 + * The "code segment" register (cs) refers to the kernel code segment 553 + * __KERNEL_CS, and the "data", "extra" and "stack" segment registers 554 + * refer to the kernel data segment __KERNEL_DS. 555 + * 556 + * The privilege level is packed into the lower bits. The Guest runs 557 + * at privilege level 1 (GUEST_PL).*/ 558 + regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; 559 + regs->cs = __KERNEL_CS|GUEST_PL; 560 + 561 + /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) 562 + * is supposed to always be "1". Bit 9 (0x200) controls whether 563 + * interrupts are enabled. We always leave interrupts enabled while 564 + * running the Guest. */ 565 + regs->eflags = 0x202; 566 + 567 + /* The "Extended Instruction Pointer" register says where the Guest is 568 + * running. */ 569 + regs->eip = start; 570 + 571 + /* %esi points to our boot information, at physical address 0, so don't 572 + * touch it. */ 573 + /* There are a couple of GDT entries the Guest expects when first 574 + * booting. */ 575 + 576 + setup_guest_gdt(lg); 577 + }
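emulate_insn() above is the only place the Host decodes raw x86, and it only needs to recognize the four port-I/O forms: masking off the low (operand width) bit folds each IN/OUT pair together. The same classification as a stand-alone sketch (a mirror for illustration, not the kernel's function):

#include <stdint.h>
#include <stdio.h>

/* Classify the four I/O instruction forms emulate_insn() can skip.
 * Returns the total instruction length, or 0 if unrecognized.
 * "prefixed" is true when a 0x66 operand-size prefix was consumed. */
static unsigned int io_insn_len(uint8_t insn, int prefixed)
{
        unsigned int len = prefixed ? 1 : 0;

        switch (insn & 0xFE) {
        case 0xE4:      /* in  <imm8>,%al */
        case 0xE6:      /* out %al,<imm8> */
                return len + 2; /* opcode plus immediate byte */
        case 0xEC:      /* in  (%dx),%al  */
        case 0xEE:      /* out %al,(%dx)  */
                return len + 1; /* opcode only */
        default:
                return 0;       /* not one of ours: can't emulate */
        }
}

int main(void)
{
        printf("in (%%dx),%%al is %u bytes\n", io_insn_len(0xEC, 0));
        return 0;
}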
+6
drivers/net/Kconfig
··· 3100 3100 config NET_POLL_CONTROLLER 3101 3101 def_bool NETPOLL 3102 3102 3103 + config VIRTIO_NET 3104 + tristate "Virtio network driver (EXPERIMENTAL)" 3105 + depends on EXPERIMENTAL && VIRTIO 3106 + ---help--- 3107 + This is the virtual network driver for lguest. Say Y or M. 3108 + 3103 3109 endif # NETDEVICES
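For completeness, selecting the new driver in a guest kernel is ordinary Kconfig business; with the dependencies above satisfied, a .config fragment might look like this (assuming the VIRTIO symbol is provided elsewhere, e.g. selected by the lguest guest support):

CONFIG_EXPERIMENTAL=y
CONFIG_VIRTIO=y
CONFIG_VIRTIO_NET=m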
+1 -1
drivers/net/Makefile
··· 183 183 obj-$(CONFIG_HPLANCE) += hplance.o 7990.o 184 184 obj-$(CONFIG_MVME147_NET) += mvme147.o 7990.o 185 185 obj-$(CONFIG_EQUALIZER) += eql.o 186 - obj-$(CONFIG_LGUEST_NET) += lguest_net.o 187 186 obj-$(CONFIG_MIPS_JAZZ_SONIC) += jazzsonic.o 188 187 obj-$(CONFIG_MIPS_AU1X00_ENET) += au1000_eth.o 189 188 obj-$(CONFIG_MIPS_SIM_NET) += mipsnet.o ··· 242 243 243 244 obj-$(CONFIG_NETXEN_NIC) += netxen/ 244 245 obj-$(CONFIG_NIU) += niu.o 246 + obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
-555
drivers/net/lguest_net.c
··· 1 - /*D:500
 2 - * The Guest network driver.
 3 - *
 4 - * This is a very simple virtual network driver, and our last Guest driver.
 5 - * The only trick is that it can talk directly to multiple other recipients
 6 - * (ie. other Guests on the same network). It can also be used with only the
 7 - * Host on the network.
 8 - :*/
 9 -
 10 - /* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 11 - *
 12 - * This program is free software; you can redistribute it and/or modify
 13 - * it under the terms of the GNU General Public License as published by
 14 - * the Free Software Foundation; either version 2 of the License, or
 15 - * (at your option) any later version.
 16 - *
 17 - * This program is distributed in the hope that it will be useful,
 18 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
 19 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 20 - * GNU General Public License for more details.
 21 - *
 22 - * You should have received a copy of the GNU General Public License
 23 - * along with this program; if not, write to the Free Software
 24 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 25 - */
 26 - //#define DEBUG
 27 - #include <linux/netdevice.h>
 28 - #include <linux/etherdevice.h>
 29 - #include <linux/module.h>
 30 - #include <linux/mm_types.h>
 31 - #include <linux/io.h>
 32 - #include <linux/lguest_bus.h>
 33 -
 34 - #define SHARED_SIZE PAGE_SIZE
 35 - #define MAX_LANS 4
 36 - #define NUM_SKBS 8
 37 -
 38 - /*M:011 Network code master Jeff Garzik points out numerous shortcomings in
 39 - * this driver if it aspires to greatness.
 40 - *
 41 - * Firstly, it doesn't use "NAPI": the networking's New API, and is poorer for
 42 - * it. As he says "NAPI means system-wide load leveling, across multiple
 43 - * network interfaces. Lack of NAPI can mean competition at higher loads."
 44 - *
 45 - * He also points out that we don't implement set_mac_address, so users cannot
 46 - * change the device's hardware address. When I asked why one would want to:
 47 - * "Bonding, and situations where you /do/ want the MAC address to "leak" out
 48 - * of the host onto the wider net."
 49 - *
 50 - * Finally, he would like module unloading: "It is not unrealistic to think of
 51 - * [un|re|]loading the net support module in an lguest guest. And, adding
 52 - * module support makes the programmer more responsible, because they now have
 53 - * to learn to clean up after themselves. Any driver that cannot clean up
 54 - * after itself is an incomplete driver in my book."
 55 - :*/
 56 -
 57 - /*D:530 The "struct lguestnet_info" contains all the information we need to
 58 - * know about the network device. */
 59 - struct lguestnet_info
 60 - {
 61 - /* The mapped device page(s) (an array of "struct lguest_net"). */
 62 - struct lguest_net *peer;
 63 - /* The physical address of the device page(s) */
 64 - unsigned long peer_phys;
 65 - /* The size of the device page(s). */
 66 - unsigned long mapsize;
 67 -
 68 - /* The lguest_device I come from */
 69 - struct lguest_device *lgdev;
 70 -
 71 - /* My peerid (ie. my slot in the array). */
 72 - unsigned int me;
 73 -
 74 - /* Receive queue: the network packets waiting to be filled. */
 75 - struct sk_buff *skb[NUM_SKBS];
 76 - struct lguest_dma dma[NUM_SKBS];
 77 - };
 78 - /*:*/
 79 -
 80 - /* How many bytes are left in this page. */
 81 - static unsigned int rest_of_page(void *data)
 82 - {
 83 - return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
 84 - }
 85 -
 86 - /*D:570 Each peer (ie. 
Guest or Host) on the network binds their receive 87 - * buffers to a different key: we simply use the physical address of the 88 - * device's memory page plus the peer number. The Host insists that all keys 89 - * be a multiple of 4, so we multiply the peer number by 4. */ 90 - static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum) 91 - { 92 - return info->peer_phys + 4 * peernum; 93 - } 94 - 95 - /* This is the routine which sets up a "struct lguest_dma" to point to a 96 - * network packet, similar to req_to_dma() in lguest_blk.c. The structure of a 97 - * "struct sk_buff" has grown complex over the years: it consists of a "head" 98 - * linear section pointed to by "skb->data", and possibly an array of 99 - * "fragments" in the case of a non-linear packet. 100 - * 101 - * Our receive buffers don't use fragments at all but outgoing skbs might, so 102 - * we handle it. */ 103 - static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen, 104 - struct lguest_dma *dma) 105 - { 106 - unsigned int i, seg; 107 - 108 - /* First, we put the linear region into the "struct lguest_dma". Each 109 - * entry can't go over a page boundary, so even though all our packets 110 - * are 1514 bytes or less, we might need to use two entries here: */ 111 - for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) { 112 - dma->addr[seg] = virt_to_phys(skb->data + i); 113 - dma->len[seg] = min((unsigned)(headlen - i), 114 - rest_of_page(skb->data + i)); 115 - } 116 - 117 - /* Now we handle the fragments: at least they're guaranteed not to go 118 - * over a page. skb_shinfo(skb) returns a pointer to the structure 119 - * which tells us about the number of fragments and the fragment 120 - * array. */ 121 - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) { 122 - const skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 123 - /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */ 124 - if (seg == LGUEST_MAX_DMA_SECTIONS) { 125 - /* We will end up sending a truncated packet should 126 - * this ever happen. Plus, a cool log message! */ 127 - printk("Woah dude! Megapacket!\n"); 128 - break; 129 - } 130 - dma->addr[seg] = page_to_phys(f->page) + f->page_offset; 131 - dma->len[seg] = f->size; 132 - } 133 - 134 - /* If after all that we didn't use the entire "struct lguest_dma" 135 - * array, we terminate it with a 0 length. */ 136 - if (seg < LGUEST_MAX_DMA_SECTIONS) 137 - dma->len[seg] = 0; 138 - } 139 - 140 - /* 141 - * Packet transmission. 142 - * 143 - * Our packet transmission is a little unusual. A real network card would just 144 - * send out the packet and leave the receivers to decide if they're interested. 145 - * Instead, we look through the network device memory page and see if any of 146 - * the ethernet addresses match the packet destination, and if so we send it to 147 - * that Guest. 148 - * 149 - * This is made a little more complicated in two cases. The first case is 150 - * broadcast packets: for that we send the packet to all Guests on the network, 151 - * one at a time. The second case is "promiscuous" mode, where a Guest wants 152 - * to see all the packets on the network. We need a way for the Guest to tell 153 - * us it wants to see all packets, so it sets the "multicast" bit on its 154 - * published MAC address, which is never valid in a real ethernet address. 155 - */ 156 - #define PROMISC_BIT 0x01 157 - 158 - /* This is the callback which is summoned whenever the network device's 159 - * multicast or promiscuous state changes. 
If the card is in promiscuous mode, 160 - * we advertise that in our ethernet address in the device's memory. We do the 161 - * same if Linux wants any or all multicast traffic. */ 162 - static void lguestnet_set_multicast(struct net_device *dev) 163 - { 164 - struct lguestnet_info *info = netdev_priv(dev); 165 - 166 - if ((dev->flags & (IFF_PROMISC|IFF_ALLMULTI)) || dev->mc_count) 167 - info->peer[info->me].mac[0] |= PROMISC_BIT; 168 - else 169 - info->peer[info->me].mac[0] &= ~PROMISC_BIT; 170 - } 171 - 172 - /* A simple test function to see if a peer wants to see all packets.*/ 173 - static int promisc(struct lguestnet_info *info, unsigned int peer) 174 - { 175 - return info->peer[peer].mac[0] & PROMISC_BIT; 176 - } 177 - 178 - /* Another simple function to see if a peer's advertised ethernet address 179 - * matches a packet's destination ethernet address. */ 180 - static int mac_eq(const unsigned char mac[ETH_ALEN], 181 - struct lguestnet_info *info, unsigned int peer) 182 - { 183 - /* Ignore multicast bit, which peer turns on to mean promisc. */ 184 - if ((info->peer[peer].mac[0] & (~PROMISC_BIT)) != mac[0]) 185 - return 0; 186 - return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0; 187 - } 188 - 189 - /* This is the function which actually sends a packet once we've decided a 190 - * peer wants it: */ 191 - static void transfer_packet(struct net_device *dev, 192 - struct sk_buff *skb, 193 - unsigned int peernum) 194 - { 195 - struct lguestnet_info *info = netdev_priv(dev); 196 - struct lguest_dma dma; 197 - 198 - /* We use our handy "struct lguest_dma" packing function to prepare 199 - * the skb for sending. */ 200 - skb_to_dma(skb, skb_headlen(skb), &dma); 201 - pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len); 202 - 203 - /* This is the actual send call which copies the packet. */ 204 - lguest_send_dma(peer_key(info, peernum), &dma); 205 - 206 - /* Check that the entire packet was transmitted. If not, it could mean 207 - * that the other Guest registered a short receive buffer, but this 208 - * driver should never do that. More likely, the peer is dead. */ 209 - if (dma.used_len != skb->len) { 210 - dev->stats.tx_carrier_errors++; 211 - pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n", 212 - peernum, dma.used_len, skb->len, 213 - (void *)dma.addr[0], dma.len[0]); 214 - } else { 215 - /* On success we update the stats. */ 216 - dev->stats.tx_bytes += skb->len; 217 - dev->stats.tx_packets++; 218 - } 219 - } 220 - 221 - /* Another helper function to tell is if a slot in the device memory is unused. 222 - * Since we always set the Local Assignment bit in the ethernet address, the 223 - * first byte can never be 0. */ 224 - static int unused_peer(const struct lguest_net peer[], unsigned int num) 225 - { 226 - return peer[num].mac[0] == 0; 227 - } 228 - 229 - /* Finally, here is the routine which handles an outgoing packet. It's called 230 - * "start_xmit" for traditional reasons. */ 231 - static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev) 232 - { 233 - unsigned int i; 234 - int broadcast; 235 - struct lguestnet_info *info = netdev_priv(dev); 236 - /* Extract the destination ethernet address from the packet. */ 237 - const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; 238 - DECLARE_MAC_BUF(mac); 239 - 240 - pr_debug("%s: xmit %s\n", dev->name, print_mac(mac, dest)); 241 - 242 - /* If it's a multicast packet, we broadcast to everyone. 
That's not 243 - * very efficient, but there are very few applications which actually 244 - * use multicast, which is a shame really. 245 - * 246 - * As etherdevice.h points out: "By definition the broadcast address is 247 - * also a multicast address." So we don't have to test for broadcast 248 - * packets separately. */ 249 - broadcast = is_multicast_ether_addr(dest); 250 - 251 - /* Look through all the published ethernet addresses to see if we 252 - * should send this packet. */ 253 - for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) { 254 - /* We don't send to ourselves (we actually can't SEND_DMA to 255 - * ourselves anyway), and don't send to unused slots.*/ 256 - if (i == info->me || unused_peer(info->peer, i)) 257 - continue; 258 - 259 - /* If it's broadcast we send it. If they want every packet we 260 - * send it. If the destination matches their address we send 261 - * it. Otherwise we go to the next peer. */ 262 - if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i)) 263 - continue; 264 - 265 - pr_debug("lguestnet %s: sending from %i to %i\n", 266 - dev->name, info->me, i); 267 - /* Our routine which actually does the transfer. */ 268 - transfer_packet(dev, skb, i); 269 - } 270 - 271 - /* An xmit routine is expected to dispose of the packet, so we do. */ 272 - dev_kfree_skb(skb); 273 - 274 - /* As per kernel convention, 0 means success. This is why I love 275 - * networking: even if we never sent to anyone, that's still 276 - * success! */ 277 - return 0; 278 - } 279 - 280 - /*D:560 281 - * Packet receiving. 282 - * 283 - * First, here's a helper routine which fills one of our array of receive 284 - * buffers: */ 285 - static int fill_slot(struct net_device *dev, unsigned int slot) 286 - { 287 - struct lguestnet_info *info = netdev_priv(dev); 288 - 289 - /* We can receive ETH_DATA_LEN (1500) byte packets, plus a standard 290 - * ethernet header of ETH_HLEN (14) bytes. */ 291 - info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN); 292 - if (!info->skb[slot]) { 293 - printk("%s: could not fill slot %i\n", dev->name, slot); 294 - return -ENOMEM; 295 - } 296 - 297 - /* skb_to_dma() is a helper which sets up the "struct lguest_dma" to 298 - * point to the data in the skb: we also use it for sending out a 299 - * packet. */ 300 - skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]); 301 - 302 - /* This is a Write Memory Barrier: it ensures that the entry in the 303 - * receive buffer array is written *before* we set the "used_len" entry 304 - * to 0. If the Host were looking at the receive buffer array from a 305 - * different CPU, it could potentially see "used_len = 0" and not see 306 - * the updated receive buffer information. This would be a horribly 307 - * nasty bug, so make sure the compiler and CPU know this has to happen 308 - * first. */ 309 - wmb(); 310 - /* Writing 0 to "used_len" tells the Host it can use this receive 311 - * buffer now. */ 312 - info->dma[slot].used_len = 0; 313 - return 0; 314 - } 315 - 316 - /* This is the actual receive routine. When we receive an interrupt from the 317 - * Host to tell us a packet has been delivered, we arrive here: */ 318 - static irqreturn_t lguestnet_rcv(int irq, void *dev_id) 319 - { 320 - struct net_device *dev = dev_id; 321 - struct lguestnet_info *info = netdev_priv(dev); 322 - unsigned int i, done = 0; 323 - 324 - /* Look through our entire receive array for an entry which has data 325 - * in it. 
*/ 326 - for (i = 0; i < ARRAY_SIZE(info->dma); i++) { 327 - unsigned int length; 328 - struct sk_buff *skb; 329 - 330 - length = info->dma[i].used_len; 331 - if (length == 0) 332 - continue; 333 - 334 - /* We've found one! Remember the skb (we grabbed the length 335 - * above), and immediately refill the slot we've taken it 336 - * from. */ 337 - done++; 338 - skb = info->skb[i]; 339 - fill_slot(dev, i); 340 - 341 - /* This shouldn't happen: micropackets could be sent by a 342 - * badly-behaved Guest on the network, but the Host will never 343 - * stuff more data in the buffer than the buffer length. */ 344 - if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) { 345 - pr_debug(KERN_WARNING "%s: unbelievable skb len: %i\n", 346 - dev->name, length); 347 - dev_kfree_skb(skb); 348 - continue; 349 - } 350 - 351 - /* skb_put(), what a great function! I've ranted about this 352 - * function before (http://lkml.org/lkml/1999/9/26/24). You 353 - * call it after you've added data to the end of an skb (in 354 - * this case, it was the Host which wrote the data). */ 355 - skb_put(skb, length); 356 - 357 - /* The ethernet header contains a protocol field: we use the 358 - * standard helper to extract it, and place the result in 359 - * skb->protocol. The helper also sets up skb->pkt_type and 360 - * eats up the ethernet header from the front of the packet. */ 361 - skb->protocol = eth_type_trans(skb, dev); 362 - 363 - /* If this device doesn't need checksums for sending, we also 364 - * don't need to check the packets when they come in. */ 365 - if (dev->features & NETIF_F_NO_CSUM) 366 - skb->ip_summed = CHECKSUM_UNNECESSARY; 367 - 368 - /* As a last resort for debugging the driver or the lguest I/O 369 - * subsystem, you can uncomment the "#define DEBUG" at the top 370 - * of this file, which turns all the pr_debug() into printk() 371 - * and floods the logs. */ 372 - pr_debug("Receiving skb proto 0x%04x len %i type %i\n", 373 - ntohs(skb->protocol), skb->len, skb->pkt_type); 374 - 375 - /* Update the packet and byte counts (visible from ifconfig, 376 - * and good for debugging). */ 377 - dev->stats.rx_bytes += skb->len; 378 - dev->stats.rx_packets++; 379 - 380 - /* Hand our fresh network packet into the stack's "network 381 - * interface receive" routine. That will free the packet 382 - * itself when it's finished. */ 383 - netif_rx(skb); 384 - } 385 - 386 - /* If we found any packets, we assume the interrupt was for us. */ 387 - return done ? IRQ_HANDLED : IRQ_NONE; 388 - } 389 - 390 - /*D:550 This is where we start: when the device is brought up by dhcpd or 391 - * ifconfig. At this point we advertise our MAC address to the rest of the 392 - * network, and register receive buffers ready for incoming packets. */ 393 - static int lguestnet_open(struct net_device *dev) 394 - { 395 - int i; 396 - struct lguestnet_info *info = netdev_priv(dev); 397 - 398 - /* Copy our MAC address into the device page, so others on the network 399 - * can find us. */ 400 - memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN); 401 - 402 - /* We might already be in promisc mode (dev->flags & IFF_PROMISC). Our 403 - * set_multicast callback handles this already, so we call it now. */ 404 - lguestnet_set_multicast(dev); 405 - 406 - /* Allocate packets and put them into our "struct lguest_dma" array. 407 - * If we fail to allocate all the packets we could still limp along, 408 - * but it's a sign of real stress so we should probably give up now. 
*/ 409 - for (i = 0; i < ARRAY_SIZE(info->dma); i++) { 410 - if (fill_slot(dev, i) != 0) 411 - goto cleanup; 412 - } 413 - 414 - /* Finally we tell the Host where our array of "struct lguest_dma" 415 - * receive buffers is, binding it to the key corresponding to the 416 - * device's physical memory plus our peerid. */ 417 - if (lguest_bind_dma(peer_key(info,info->me), info->dma, 418 - NUM_SKBS, lgdev_irq(info->lgdev)) != 0) 419 - goto cleanup; 420 - return 0; 421 - 422 - cleanup: 423 - while (--i >= 0) 424 - dev_kfree_skb(info->skb[i]); 425 - return -ENOMEM; 426 - } 427 - /*:*/ 428 - 429 - /* The close routine is called when the device is no longer in use: we clean up 430 - * elegantly. */ 431 - static int lguestnet_close(struct net_device *dev) 432 - { 433 - unsigned int i; 434 - struct lguestnet_info *info = netdev_priv(dev); 435 - 436 - /* Clear all trace of our existence out of the device memory by setting 437 - * the slot which held our MAC address to 0 (unused). */ 438 - memset(&info->peer[info->me], 0, sizeof(info->peer[info->me])); 439 - 440 - /* Unregister our array of receive buffers */ 441 - lguest_unbind_dma(peer_key(info, info->me), info->dma); 442 - for (i = 0; i < ARRAY_SIZE(info->dma); i++) 443 - dev_kfree_skb(info->skb[i]); 444 - return 0; 445 - } 446 - 447 - /*D:510 The network device probe function is basically a standard ethernet 448 - * device setup. It reads the "struct lguest_device_desc" and sets the "struct 449 - * net_device". Oh, the line-by-line excitement! Let's skip over it. :*/ 450 - static int lguestnet_probe(struct lguest_device *lgdev) 451 - { 452 - int err, irqf = IRQF_SHARED; 453 - struct net_device *dev; 454 - struct lguestnet_info *info; 455 - struct lguest_device_desc *desc = &lguest_devices[lgdev->index]; 456 - 457 - pr_debug("lguest_net: probing for device %i\n", lgdev->index); 458 - 459 - dev = alloc_etherdev(sizeof(struct lguestnet_info)); 460 - if (!dev) 461 - return -ENOMEM; 462 - 463 - /* Ethernet defaults with some changes */ 464 - ether_setup(dev); 465 - dev->set_mac_address = NULL; 466 - 467 - dev->dev_addr[0] = 0x02; /* set local assignment bit (IEEE802) */ 468 - dev->dev_addr[1] = 0x00; 469 - memcpy(&dev->dev_addr[2], &lguest_data.guestid, 2); 470 - dev->dev_addr[4] = 0x00; 471 - dev->dev_addr[5] = 0x00; 472 - 473 - dev->open = lguestnet_open; 474 - dev->stop = lguestnet_close; 475 - dev->hard_start_xmit = lguestnet_start_xmit; 476 - 477 - /* We don't actually support multicast yet, but turning on/off 478 - * promisc also calls dev->set_multicast_list. */ 479 - dev->set_multicast_list = lguestnet_set_multicast; 480 - SET_NETDEV_DEV(dev, &lgdev->dev); 481 - 482 - /* The network code complains if you have "scatter-gather" capability 483 - * if you don't also handle checksums (it seem that would be 484 - * "illogical"). So we use a lie of omission and don't tell it that we 485 - * can handle scattered packets unless we also don't want checksums, 486 - * even though to us they're completely independent. */ 487 - if (desc->features & LGUEST_NET_F_NOCSUM) 488 - dev->features = NETIF_F_SG|NETIF_F_NO_CSUM; 489 - 490 - info = netdev_priv(dev); 491 - info->mapsize = PAGE_SIZE * desc->num_pages; 492 - info->peer_phys = ((unsigned long)desc->pfn << PAGE_SHIFT); 493 - info->lgdev = lgdev; 494 - info->peer = lguest_map(info->peer_phys, desc->num_pages); 495 - if (!info->peer) { 496 - err = -ENOMEM; 497 - goto free; 498 - } 499 - 500 - /* This stores our peerid (upper bits reserved for future). 
*/ 501 - info->me = (desc->features & (info->mapsize-1));
 502 -
 503 - err = register_netdev(dev);
 504 - if (err) {
 505 - pr_debug("lguestnet: registering device failed\n");
 506 - goto unmap;
 507 - }
 508 -
 509 - if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
 510 - irqf |= IRQF_SAMPLE_RANDOM;
 511 - if (request_irq(lgdev_irq(lgdev), lguestnet_rcv, irqf, "lguestnet",
 512 - dev) != 0) {
 513 - pr_debug("lguestnet: cannot get irq %i\n", lgdev_irq(lgdev));
 514 - goto unregister;
 515 - }
 516 -
 517 - pr_debug("lguestnet: registered device %s\n", dev->name);
 518 - /* Finally, we put the "struct net_device" in the generic "struct
 519 - * lguest_device"s private pointer. Again, it's not necessary, but
 520 - * makes sure the cool kernel kids don't tease us. */
 521 - lgdev->private = dev;
 522 - return 0;
 523 -
 524 - unregister:
 525 - unregister_netdev(dev);
 526 - unmap:
 527 - lguest_unmap(info->peer);
 528 - free:
 529 - free_netdev(dev);
 530 - return err;
 531 - }
 532 -
 533 - static struct lguest_driver lguestnet_drv = {
 534 - .name = "lguestnet",
 535 - .owner = THIS_MODULE,
 536 - .device_type = LGUEST_DEVICE_T_NET,
 537 - .probe = lguestnet_probe,
 538 - };
 539 -
 540 - static __init int lguestnet_init(void)
 541 - {
 542 - return register_lguest_driver(&lguestnet_drv);
 543 - }
 544 - module_init(lguestnet_init);
 545 -
 546 - MODULE_DESCRIPTION("Lguest network driver");
 547 - MODULE_LICENSE("GPL");
 548 -
 549 - /*D:580
 550 - * This is the last of the Drivers, and with this we have covered the many and
 551 - * wondrous and fine (and boring) details of the Guest.
 552 - *
 553 - * "make Launcher" beckons, where we answer questions like "Where do Guests
 554 - * come from?", and "What do you do when someone asks for optimization?"
 555 - */
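One detail of the freshly-deleted code worth remembering: a "struct lguest_dma" entry could not cross a page boundary, so skb_to_dma() walked the skb head in page-sized steps via rest_of_page(). The splitting logic in isolation (a user-space sketch; the starting address is made up):

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* How many bytes remain in the page containing "addr"? */
static unsigned long rest_of_page(unsigned long addr)
{
        return PAGE_SIZE - (addr % PAGE_SIZE);
}

int main(void)
{
        /* Split a 1514-byte packet the way skb_to_dma() split the skb
         * head: one DMA entry per page crossed. */
        unsigned long addr = 0x12345f00UL, len = 1514, i, seg = 0;

        for (i = 0; i < len; seg++, i += rest_of_page(addr + i)) {
                unsigned long chunk = rest_of_page(addr + i);

                if (chunk > len - i)
                        chunk = len - i;
                printf("seg %lu: addr %#lx len %lu\n", seg, addr + i, chunk);
        }
        return 0;
}

A 1514-byte frame starting 256 bytes before a page boundary comes out as two entries (256 then 1258), which is why even small packets could need more than one DMA section.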
+435
drivers/net/virtio_net.c
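The new driver below never touches ring internals; everything goes through the virtqueue operations: add_buf() exposes a scatter-gather list whose entry 0 is a small metadata header, kick() notifies the Host, and get_buf() reaps completions. A sketch of just the framing step, with simplified stand-in types rather than the real virtio_net.h layout:

#include <stdio.h>

/* Simplified stand-ins: every packet on the ring is a metadata header
 * followed by the ethernet frame, described as a scatter-gather list
 * whose first entry is the header (as vnet_hdr_to_sg() does below). */
struct vnet_hdr_sketch { unsigned char flags, gso_type; unsigned short hdr_len; };
struct sg_sketch { const void *addr; unsigned int len; };

static unsigned int frame_to_sg(struct sg_sketch *sg,
                                const struct vnet_hdr_sketch *hdr,
                                const void *pkt, unsigned int pktlen)
{
        sg[0].addr = hdr; sg[0].len = sizeof(*hdr);    /* vnet_hdr_to_sg() */
        sg[1].addr = pkt; sg[1].len = pktlen;          /* skb_to_sgvec()   */
        return 2;                                      /* entries used     */
}

int main(void)
{
        struct vnet_hdr_sketch hdr = { 0 };
        struct sg_sketch sg[2];
        unsigned int n = frame_to_sg(sg, &hdr, "hello", 5);

        printf("%u sg entries: %u header bytes + %u payload bytes\n",
               n, sg[0].len, sg[1].len);
        return 0;
}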
··· 1 + /* A simple network driver using virtio. 2 + * 3 + * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms of the GNU General Public License as published by 7 + * the Free Software Foundation; either version 2 of the License, or 8 + * (at your option) any later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, 11 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 + * GNU General Public License for more details. 14 + * 15 + * You should have received a copy of the GNU General Public License 16 + * along with this program; if not, write to the Free Software 17 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 + */ 19 + //#define DEBUG 20 + #include <linux/netdevice.h> 21 + #include <linux/etherdevice.h> 22 + #include <linux/module.h> 23 + #include <linux/virtio.h> 24 + #include <linux/virtio_net.h> 25 + #include <linux/scatterlist.h> 26 + 27 + /* FIXME: MTU in config. */ 28 + #define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN) 29 + 30 + struct virtnet_info 31 + { 32 + struct virtio_device *vdev; 33 + struct virtqueue *rvq, *svq; 34 + struct net_device *dev; 35 + struct napi_struct napi; 36 + 37 + /* Number of input buffers, and max we've ever had. */ 38 + unsigned int num, max; 39 + 40 + /* Receive & send queues. */ 41 + struct sk_buff_head recv; 42 + struct sk_buff_head send; 43 + }; 44 + 45 + static inline struct virtio_net_hdr *skb_vnet_hdr(struct sk_buff *skb) 46 + { 47 + return (struct virtio_net_hdr *)skb->cb; 48 + } 49 + 50 + static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb) 51 + { 52 + sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr)); 53 + } 54 + 55 + static bool skb_xmit_done(struct virtqueue *rvq) 56 + { 57 + struct virtnet_info *vi = rvq->vdev->priv; 58 + 59 + /* In case we were waiting for output buffers. 
*/ 60 + netif_wake_queue(vi->dev); 61 + return true; 62 + } 63 + 64 + static void receive_skb(struct net_device *dev, struct sk_buff *skb, 65 + unsigned len) 66 + { 67 + struct virtio_net_hdr *hdr = skb_vnet_hdr(skb); 68 + 69 + if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) { 70 + pr_debug("%s: short packet %i\n", dev->name, len); 71 + dev->stats.rx_length_errors++; 72 + goto drop; 73 + } 74 + len -= sizeof(struct virtio_net_hdr); 75 + BUG_ON(len > MAX_PACKET_LEN); 76 + 77 + skb_trim(skb, len); 78 + skb->protocol = eth_type_trans(skb, dev); 79 + pr_debug("Receiving skb proto 0x%04x len %i type %i\n", 80 + ntohs(skb->protocol), skb->len, skb->pkt_type); 81 + dev->stats.rx_bytes += skb->len; 82 + dev->stats.rx_packets++; 83 + 84 + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { 85 + pr_debug("Needs csum!\n"); 86 + skb->ip_summed = CHECKSUM_PARTIAL; 87 + skb->csum_start = hdr->csum_start; 88 + skb->csum_offset = hdr->csum_offset; 89 + if (skb->csum_start > skb->len - 2 90 + || skb->csum_offset > skb->len - 2) { 91 + if (net_ratelimit()) 92 + printk(KERN_WARNING "%s: csum=%u/%u len=%u\n", 93 + dev->name, skb->csum_start, 94 + skb->csum_offset, skb->len); 95 + goto frame_err; 96 + } 97 + } 98 + 99 + if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { 100 + pr_debug("GSO!\n"); 101 + switch (hdr->gso_type) { 102 + case VIRTIO_NET_HDR_GSO_TCPV4: 103 + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; 104 + break; 105 + case VIRTIO_NET_HDR_GSO_TCPV4_ECN: 106 + skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN; 107 + break; 108 + case VIRTIO_NET_HDR_GSO_UDP: 109 + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 110 + break; 111 + case VIRTIO_NET_HDR_GSO_TCPV6: 112 + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; 113 + break; 114 + default: 115 + if (net_ratelimit()) 116 + printk(KERN_WARNING "%s: bad gso type %u.\n", 117 + dev->name, hdr->gso_type); 118 + goto frame_err; 119 + } 120 + 121 + skb_shinfo(skb)->gso_size = hdr->gso_size; 122 + if (skb_shinfo(skb)->gso_size == 0) { 123 + if (net_ratelimit()) 124 + printk(KERN_WARNING "%s: zero gso size.\n", 125 + dev->name); 126 + goto frame_err; 127 + } 128 + 129 + /* Header must be checked, and gso_segs computed. */ 130 + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 131 + skb_shinfo(skb)->gso_segs = 0; 132 + } 133 + 134 + netif_receive_skb(skb); 135 + return; 136 + 137 + frame_err: 138 + dev->stats.rx_frame_errors++; 139 + drop: 140 + dev_kfree_skb(skb); 141 + } 142 + 143 + static void try_fill_recv(struct virtnet_info *vi) 144 + { 145 + struct sk_buff *skb; 146 + struct scatterlist sg[1+MAX_SKB_FRAGS]; 147 + int num, err; 148 + 149 + for (;;) { 150 + skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN); 151 + if (unlikely(!skb)) 152 + break; 153 + 154 + skb_put(skb, MAX_PACKET_LEN); 155 + vnet_hdr_to_sg(sg, skb); 156 + num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; 157 + skb_queue_head(&vi->recv, skb); 158 + 159 + err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb); 160 + if (err) { 161 + skb_unlink(skb, &vi->recv); 162 + kfree_skb(skb); 163 + break; 164 + } 165 + vi->num++; 166 + } 167 + if (unlikely(vi->num > vi->max)) 168 + vi->max = vi->num; 169 + vi->rvq->vq_ops->kick(vi->rvq); 170 + } 171 + 172 + static bool skb_recv_done(struct virtqueue *rvq) 173 + { 174 + struct virtnet_info *vi = rvq->vdev->priv; 175 + netif_rx_schedule(vi->dev, &vi->napi); 176 + /* Suppress further interrupts. 
*/ 177 + return false; 178 + } 179 + 180 + static int virtnet_poll(struct napi_struct *napi, int budget) 181 + { 182 + struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi); 183 + struct sk_buff *skb = NULL; 184 + unsigned int len, received = 0; 185 + 186 + again: 187 + while (received < budget && 188 + (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) { 189 + __skb_unlink(skb, &vi->recv); 190 + receive_skb(vi->dev, skb, len); 191 + vi->num--; 192 + received++; 193 + } 194 + 195 + /* FIXME: If we oom and completely run out of inbufs, we need 196 + * to start a timer trying to fill more. */ 197 + if (vi->num < vi->max / 2) 198 + try_fill_recv(vi); 199 + 200 + /* All done? */ 201 + if (!skb) { 202 + netif_rx_complete(vi->dev, napi); 203 + if (unlikely(!vi->rvq->vq_ops->restart(vi->rvq)) 204 + && netif_rx_reschedule(vi->dev, napi)) 205 + goto again; 206 + } 207 + 208 + return received; 209 + } 210 + 211 + static void free_old_xmit_skbs(struct virtnet_info *vi) 212 + { 213 + struct sk_buff *skb; 214 + unsigned int len; 215 + 216 + while ((skb = vi->svq->vq_ops->get_buf(vi->svq, &len)) != NULL) { 217 + pr_debug("Sent skb %p\n", skb); 218 + __skb_unlink(skb, &vi->send); 219 + vi->dev->stats.tx_bytes += len; 220 + vi->dev->stats.tx_packets++; 221 + kfree_skb(skb); 222 + } 223 + } 224 + 225 + static int start_xmit(struct sk_buff *skb, struct net_device *dev) 226 + { 227 + struct virtnet_info *vi = netdev_priv(dev); 228 + int num, err; 229 + struct scatterlist sg[1+MAX_SKB_FRAGS]; 230 + struct virtio_net_hdr *hdr; 231 + const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; 232 + DECLARE_MAC_BUF(mac); 233 + 234 + pr_debug("%s: xmit %p %s\n", dev->name, skb, print_mac(mac, dest)); 235 + 236 + free_old_xmit_skbs(vi); 237 + 238 + /* Encode metadata header at front. */ 239 + hdr = skb_vnet_hdr(skb); 240 + if (skb->ip_summed == CHECKSUM_PARTIAL) { 241 + hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; 242 + hdr->csum_start = skb->csum_start - skb_headroom(skb); 243 + hdr->csum_offset = skb->csum_offset; 244 + } else { 245 + hdr->flags = 0; 246 + hdr->csum_offset = hdr->csum_start = 0; 247 + } 248 + 249 + if (skb_is_gso(skb)) { 250 + hdr->gso_size = skb_shinfo(skb)->gso_size; 251 + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) 252 + hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN; 253 + else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) 254 + hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; 255 + else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) 256 + hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; 257 + else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) 258 + hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; 259 + else 260 + BUG(); 261 + } else { 262 + hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; 263 + hdr->gso_size = 0; 264 + } 265 + 266 + vnet_hdr_to_sg(sg, skb); 267 + num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; 268 + __skb_queue_head(&vi->send, skb); 269 + err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb); 270 + if (err) { 271 + pr_debug("%s: virtio not prepared to send\n", dev->name); 272 + skb_unlink(skb, &vi->send); 273 + netif_stop_queue(dev); 274 + return NETDEV_TX_BUSY; 275 + } 276 + vi->svq->vq_ops->kick(vi->svq); 277 + 278 + return 0; 279 + } 280 + 281 + static int virtnet_open(struct net_device *dev) 282 + { 283 + struct virtnet_info *vi = netdev_priv(dev); 284 + 285 + try_fill_recv(vi); 286 + 287 + /* If we didn't even get one input buffer, we're useless. 
*/ 288 + if (vi->num == 0) 289 + return -ENOMEM; 290 + 291 + napi_enable(&vi->napi); 292 + return 0; 293 + } 294 + 295 + static int virtnet_close(struct net_device *dev) 296 + { 297 + struct virtnet_info *vi = netdev_priv(dev); 298 + struct sk_buff *skb; 299 + 300 + napi_disable(&vi->napi); 301 + 302 + /* networking core has neutered skb_xmit_done/skb_recv_done, so don't 303 + * worry about races vs. get(). */ 304 + vi->rvq->vq_ops->shutdown(vi->rvq); 305 + while ((skb = __skb_dequeue(&vi->recv)) != NULL) { 306 + kfree_skb(skb); 307 + vi->num--; 308 + } 309 + vi->svq->vq_ops->shutdown(vi->svq); 310 + while ((skb = __skb_dequeue(&vi->send)) != NULL) 311 + kfree_skb(skb); 312 + 313 + BUG_ON(vi->num != 0); 314 + return 0; 315 + } 316 + 317 + static int virtnet_probe(struct virtio_device *vdev) 318 + { 319 + int err; 320 + unsigned int len; 321 + struct net_device *dev; 322 + struct virtnet_info *vi; 323 + void *token; 324 + 325 + /* Allocate ourselves a network device with room for our info */ 326 + dev = alloc_etherdev(sizeof(struct virtnet_info)); 327 + if (!dev) 328 + return -ENOMEM; 329 + 330 + /* Set up network device as normal. */ 331 + ether_setup(dev); 332 + dev->open = virtnet_open; 333 + dev->stop = virtnet_close; 334 + dev->hard_start_xmit = start_xmit; 335 + dev->features = NETIF_F_HIGHDMA; 336 + SET_NETDEV_DEV(dev, &vdev->dev); 337 + 338 + /* Do we support "hardware" checksums? */ 339 + token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_F, &len); 340 + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_NO_CSUM)) { 341 + /* This opens up the world of extra features. */ 342 + dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; 343 + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4)) 344 + dev->features |= NETIF_F_TSO; 345 + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_UFO)) 346 + dev->features |= NETIF_F_UFO; 347 + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4_ECN)) 348 + dev->features |= NETIF_F_TSO_ECN; 349 + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO6)) 350 + dev->features |= NETIF_F_TSO6; 351 + } 352 + 353 + /* Configuration may specify what MAC to use. Otherwise random. */ 354 + token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_MAC_F, &len); 355 + if (token) { 356 + dev->addr_len = len; 357 + vdev->config->get(vdev, token, dev->dev_addr, len); 358 + } else 359 + random_ether_addr(dev->dev_addr); 360 + 361 + /* Set up our device-specific information */ 362 + vi = netdev_priv(dev); 363 + netif_napi_add(dev, &vi->napi, virtnet_poll, 16); 364 + vi->dev = dev; 365 + vi->vdev = vdev; 366 + 367 + /* We expect two virtqueues, receive then send. */ 368 + vi->rvq = vdev->config->find_vq(vdev, skb_recv_done); 369 + if (IS_ERR(vi->rvq)) { 370 + err = PTR_ERR(vi->rvq); 371 + goto free; 372 + } 373 + 374 + vi->svq = vdev->config->find_vq(vdev, skb_xmit_done); 375 + if (IS_ERR(vi->svq)) { 376 + err = PTR_ERR(vi->svq); 377 + goto free_recv; 378 + } 379 + 380 + /* Initialize our empty receive and send queues. 
*/ 381 + skb_queue_head_init(&vi->recv); 382 + skb_queue_head_init(&vi->send); 383 + 384 + err = register_netdev(dev); 385 + if (err) { 386 + pr_debug("virtio_net: registering device failed\n"); 387 + goto free_send; 388 + } 389 + pr_debug("virtnet: registered device %s\n", dev->name); 390 + vdev->priv = vi; 391 + return 0; 392 + 393 + free_send: 394 + vdev->config->del_vq(vi->svq); 395 + free_recv: 396 + vdev->config->del_vq(vi->rvq); 397 + free: 398 + free_netdev(dev); 399 + return err; 400 + } 401 + 402 + static void virtnet_remove(struct virtio_device *vdev) 403 + { 404 + unregister_netdev(vdev->priv); 405 + free_netdev(vdev->priv); 406 + } 407 + 408 + static struct virtio_device_id id_table[] = { 409 + { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, 410 + { 0 }, 411 + }; 412 + 413 + static struct virtio_driver virtio_net = { 414 + .driver.name = KBUILD_MODNAME, 415 + .driver.owner = THIS_MODULE, 416 + .id_table = id_table, 417 + .probe = virtnet_probe, 418 + .remove = __devexit_p(virtnet_remove), 419 + }; 420 + 421 + static int __init init(void) 422 + { 423 + return register_virtio_driver(&virtio_net); 424 + } 425 + 426 + static void __exit fini(void) 427 + { 428 + unregister_virtio_driver(&virtio_net); 429 + } 430 + module_init(init); 431 + module_exit(fini); 432 + 433 + MODULE_DEVICE_TABLE(virtio, id_table); 434 + MODULE_DESCRIPTION("Virtio network driver"); 435 + MODULE_LICENSE("GPL");
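The driver above is also the first user of the virtqueue API, and every operation appears in it: add_buf() exposes buffers, kick() prods the other side, get_buf() reaps used buffers, and restart() re-arms callbacks. Stripped of the skb bookkeeping, the transmit pattern reduces to a sketch like this (illustrative only; error and queue-full handling elided):

    /* Minimal sketch of the add_buf/kick pattern used by start_xmit() above;
     * "buf" doubles as the token which get_buf() hands back once the other
     * side has consumed it. */
    static void send_one(struct virtqueue *vq, void *buf, unsigned int len)
    {
            struct scatterlist sg;

            sg_init_one(&sg, buf, len);
            /* One sg readable by the other side, none writable. */
            if (vq->vq_ops->add_buf(vq, &sg, 1, 0, buf) == 0)
                    vq->vq_ops->kick(vq);
    }

The receive side is the mirror image: buffers are added with the in count set instead of the out count, and reaped in the NAPI poll loop driven by the callback.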
+8
drivers/virtio/Kconfig
··· 1 + # Virtio always gets selected by whoever wants it. 2 + config VIRTIO 3 + bool 4 + 5 + # Similarly the virtio ring implementation. 6 + config VIRTIO_RING 7 + bool 8 + depends on VIRTIO
+2
drivers/virtio/Makefile
··· 1 + obj-$(CONFIG_VIRTIO) += virtio.o 2 + obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
+13
drivers/virtio/config.c
··· 1 + /* Configuration space parsing helpers for virtio. 2 + * 3 + * The configuration is [type][len][... len bytes ...] fields. 4 + * 5 + * Copyright 2007 Rusty Russell, IBM Corporation. 6 + * GPL v2 or later. 7 + */ 8 + #include <linux/err.h> 9 + #include <linux/virtio.h> 10 + #include <linux/virtio_config.h> 11 + #include <linux/bug.h> 12 + #include <asm/system.h> 13 +
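Since config.c carries only the includes, it is worth spelling out the layout its comment describes. A config space advertising just a MAC address, i.e. the VIRTIO_CONFIG_NET_MAC_F field which virtnet_probe() looks up, would be this illustrative byte sequence:

    /* Illustrative only: a single [type][len][data] field holding a MAC.
     * find() on this space returns a token for the six data bytes and
     * sets *len to 6. */
    static const u8 example_config[] = {
            VIRTIO_CONFIG_NET_MAC_F,                /* type */
            6,                                      /* len */
            0x02, 0x00, 0x00, 0x00, 0x00, 0x01,    /* ... len bytes ... */
    };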
+189
drivers/virtio/virtio.c
··· 1 + #include <linux/virtio.h>
2 + #include <linux/spinlock.h>
3 + #include <linux/virtio_config.h>
4 +
5 + static ssize_t device_show(struct device *_d,
6 + struct device_attribute *attr, char *buf)
7 + {
8 + struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
9 + return sprintf(buf, "%u", dev->id.device);
10 + }
11 + static ssize_t vendor_show(struct device *_d,
12 + struct device_attribute *attr, char *buf)
13 + {
14 + struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
15 + return sprintf(buf, "%u", dev->id.vendor);
16 + }
17 + static ssize_t status_show(struct device *_d,
18 + struct device_attribute *attr, char *buf)
19 + {
20 + struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
21 + return sprintf(buf, "0x%08x", dev->config->get_status(dev));
22 + }
23 + static ssize_t modalias_show(struct device *_d,
24 + struct device_attribute *attr, char *buf)
25 + {
26 + struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
27 +
28 + return sprintf(buf, "virtio:d%08Xv%08X\n",
29 + dev->id.device, dev->id.vendor);
30 + }
31 + static struct device_attribute virtio_dev_attrs[] = {
32 + __ATTR_RO(device),
33 + __ATTR_RO(vendor),
34 + __ATTR_RO(status),
35 + __ATTR_RO(modalias),
36 + __ATTR_NULL
37 + };
38 +
39 + static inline int virtio_id_match(const struct virtio_device *dev,
40 + const struct virtio_device_id *id)
41 + {
42 + if (id->device != dev->id.device)
43 + return 0;
44 +
45 + return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor;
46 + }
47 +
48 + /* This looks through all the IDs a driver claims to support. If any of them
49 + * match, we return 1 and the kernel will call virtio_dev_probe(). */
50 + static int virtio_dev_match(struct device *_dv, struct device_driver *_dr)
51 + {
52 + unsigned int i;
53 + struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);
54 + const struct virtio_device_id *ids;
55 +
56 + ids = container_of(_dr, struct virtio_driver, driver)->id_table;
57 + for (i = 0; ids[i].device; i++)
58 + if (virtio_id_match(dev, &ids[i]))
59 + return 1;
60 + return 0;
61 + }
62 +
63 + static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env)
64 + {
65 + struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);
66 +
67 + return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X",
68 + dev->id.device, dev->id.vendor);
69 + }
70 +
71 + static struct bus_type virtio_bus = {
72 + .name = "virtio",
73 + .match = virtio_dev_match,
74 + .dev_attrs = virtio_dev_attrs,
75 + .uevent = virtio_uevent,
76 + };
77 +
78 + static void add_status(struct virtio_device *dev, unsigned status)
79 + {
80 + dev->config->set_status(dev, dev->config->get_status(dev) | status);
81 + }
82 +
83 + static int virtio_dev_probe(struct device *_d)
84 + {
85 + int err;
86 + struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
87 + struct virtio_driver *drv = container_of(dev->dev.driver,
88 + struct virtio_driver, driver);
89 +
90 + add_status(dev, VIRTIO_CONFIG_S_DRIVER);
91 + err = drv->probe(dev);
92 + if (err)
93 + add_status(dev, VIRTIO_CONFIG_S_FAILED);
94 + else
95 + add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
96 + return err;
97 + }
98 +
99 + int register_virtio_driver(struct virtio_driver *driver)
100 + {
101 + driver->driver.bus = &virtio_bus;
102 + driver->driver.probe = virtio_dev_probe;
103 + return driver_register(&driver->driver);
104 + }
105 + EXPORT_SYMBOL_GPL(register_virtio_driver);
106 +
107 + void unregister_virtio_driver(struct
virtio_driver *driver) 108 + {
109 + driver_unregister(&driver->driver);
110 + }
111 + EXPORT_SYMBOL_GPL(unregister_virtio_driver);
112 +
113 + int register_virtio_device(struct virtio_device *dev)
114 + {
115 + int err;
116 +
117 + dev->dev.bus = &virtio_bus;
118 + sprintf(dev->dev.bus_id, "%u", dev->index);
119 +
120 + /* Acknowledge that we've seen the device. */
121 + add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
122 +
123 + /* device_register() causes the bus infrastructure to look for a
124 + * matching driver. */
125 + err = device_register(&dev->dev);
126 + if (err)
127 + add_status(dev, VIRTIO_CONFIG_S_FAILED);
128 + return err;
129 + }
130 + EXPORT_SYMBOL_GPL(register_virtio_device);
131 +
132 + void unregister_virtio_device(struct virtio_device *dev)
133 + {
134 + device_unregister(&dev->dev);
135 + }
136 + EXPORT_SYMBOL_GPL(unregister_virtio_device);
137 +
138 + int __virtio_config_val(struct virtio_device *vdev,
139 + u8 type, void *val, size_t size)
140 + {
141 + void *token;
142 + unsigned int len;
143 +
144 + token = vdev->config->find(vdev, type, &len);
145 + if (!token)
146 + return -ENOENT;
147 +
148 + if (len != size)
149 + return -EIO;
150 +
151 + vdev->config->get(vdev, token, val, size);
152 + return 0;
153 + }
154 + EXPORT_SYMBOL_GPL(__virtio_config_val);
155 +
156 + int virtio_use_bit(struct virtio_device *vdev,
157 + void *token, unsigned int len, unsigned int bitnum)
158 + {
159 + unsigned long bits[16];
160 +
161 + /* This makes it convenient to pass-through find() results. */
162 + if (!token)
163 + return 0;
164 +
165 + /* bit not in range of this bitfield? */
166 + if (bitnum >= len * 8 / 2)
167 + return 0;
168 +
169 + /* Giant feature bitfields are silly. */
170 + BUG_ON(len > sizeof(bits));
171 + vdev->config->get(vdev, token, bits, len);
172 +
173 + if (!test_bit(bitnum, bits))
174 + return 0;
175 +
176 + /* Set acknowledge bit, and write it back. */
177 + set_bit(bitnum + len * 8 / 2, bits);
178 + vdev->config->set(vdev, token, bits, len);
179 + return 1;
180 + }
181 + EXPORT_SYMBOL_GPL(virtio_use_bit);
182 +
183 + static int virtio_init(void)
184 + {
185 + if (bus_register(&virtio_bus) != 0)
186 + panic("virtio bus registration failed");
187 + return 0;
188 + }
189 + core_initcall(virtio_init);
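__virtio_config_val() rolls the usual find/length-check/get sequence into one call, so a driver reading a fixed-size field needs only something like the following sketch (the field type here is made up purely for illustration):

    #define DEMO_CONFIG_QSIZE 0x7f  /* hypothetical field type */

    /* Sketch: returns -ENOENT if the field is absent, -EIO if its length
     * isn't sizeof(u32), else fills in *qsize. */
    static int demo_read_qsize(struct virtio_device *vdev, u32 *qsize)
    {
            return __virtio_config_val(vdev, DEMO_CONFIG_QSIZE,
                                       qsize, sizeof(*qsize));
    }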
+313
drivers/virtio/virtio_ring.c
··· 1 + /* Virtio ring implementation. 2 + * 3 + * Copyright 2007 Rusty Russell IBM Corporation 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms of the GNU General Public License as published by 7 + * the Free Software Foundation; either version 2 of the License, or 8 + * (at your option) any later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, 11 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 + * GNU General Public License for more details. 14 + * 15 + * You should have received a copy of the GNU General Public License 16 + * along with this program; if not, write to the Free Software 17 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 + */ 19 + #include <linux/virtio.h> 20 + #include <linux/virtio_ring.h> 21 + #include <linux/device.h> 22 + 23 + #ifdef DEBUG 24 + /* For development, we want to crash whenever the ring is screwed. */ 25 + #define BAD_RING(vq, fmt...) \ 26 + do { dev_err(&vq->vq.vdev->dev, fmt); BUG(); } while(0) 27 + #define START_USE(vq) \ 28 + do { if ((vq)->in_use) panic("in_use = %i\n", (vq)->in_use); (vq)->in_use = __LINE__; mb(); } while(0) 29 + #define END_USE(vq) \ 30 + do { BUG_ON(!(vq)->in_use); (vq)->in_use = 0; mb(); } while(0) 31 + #else 32 + #define BAD_RING(vq, fmt...) \ 33 + do { dev_err(&vq->vq.vdev->dev, fmt); (vq)->broken = true; } while(0) 34 + #define START_USE(vq) 35 + #define END_USE(vq) 36 + #endif 37 + 38 + struct vring_virtqueue 39 + { 40 + struct virtqueue vq; 41 + 42 + /* Actual memory layout for this queue */ 43 + struct vring vring; 44 + 45 + /* Other side has made a mess, don't try any more. */ 46 + bool broken; 47 + 48 + /* Number of free buffers */ 49 + unsigned int num_free; 50 + /* Head of free buffer list. */ 51 + unsigned int free_head; 52 + /* Number we've added since last sync. */ 53 + unsigned int num_added; 54 + 55 + /* Last used index we've seen. */ 56 + unsigned int last_used_idx; 57 + 58 + /* How to notify other side. FIXME: commonalize hcalls! */ 59 + void (*notify)(struct virtqueue *vq); 60 + 61 + #ifdef DEBUG 62 + /* They're supposed to lock for us. */ 63 + unsigned int in_use; 64 + #endif 65 + 66 + /* Tokens for callbacks. */ 67 + void *data[]; 68 + }; 69 + 70 + #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) 71 + 72 + static int vring_add_buf(struct virtqueue *_vq, 73 + struct scatterlist sg[], 74 + unsigned int out, 75 + unsigned int in, 76 + void *data) 77 + { 78 + struct vring_virtqueue *vq = to_vvq(_vq); 79 + unsigned int i, avail, head, uninitialized_var(prev); 80 + 81 + BUG_ON(data == NULL); 82 + BUG_ON(out + in > vq->vring.num); 83 + BUG_ON(out + in == 0); 84 + 85 + START_USE(vq); 86 + 87 + if (vq->num_free < out + in) { 88 + pr_debug("Can't add buf len %i - avail = %i\n", 89 + out + in, vq->num_free); 90 + END_USE(vq); 91 + return -ENOSPC; 92 + } 93 + 94 + /* We're about to use some buffers from the free list. 
*/ 95 + vq->num_free -= out + in; 96 + 97 + head = vq->free_head; 98 + for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { 99 + vq->vring.desc[i].flags = VRING_DESC_F_NEXT; 100 + vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT) 101 + + sg->offset; 102 + vq->vring.desc[i].len = sg->length; 103 + prev = i; 104 + sg++; 105 + } 106 + for (; in; i = vq->vring.desc[i].next, in--) { 107 + vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; 108 + vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT) 109 + + sg->offset; 110 + vq->vring.desc[i].len = sg->length; 111 + prev = i; 112 + sg++; 113 + } 114 + /* Last one doesn't continue. */ 115 + vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; 116 + 117 + /* Update free pointer */ 118 + vq->free_head = i; 119 + 120 + /* Set token. */ 121 + vq->data[head] = data; 122 + 123 + /* Put entry in available array (but don't update avail->idx until they 124 + * do sync). FIXME: avoid modulus here? */ 125 + avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num; 126 + vq->vring.avail->ring[avail] = head; 127 + 128 + pr_debug("Added buffer head %i to %p\n", head, vq); 129 + END_USE(vq); 130 + return 0; 131 + } 132 + 133 + static void vring_kick(struct virtqueue *_vq) 134 + { 135 + struct vring_virtqueue *vq = to_vvq(_vq); 136 + START_USE(vq); 137 + /* Descriptors and available array need to be set before we expose the 138 + * new available array entries. */ 139 + wmb(); 140 + 141 + vq->vring.avail->idx += vq->num_added; 142 + vq->num_added = 0; 143 + 144 + /* Need to update avail index before checking if we should notify */ 145 + mb(); 146 + 147 + if (!(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY)) 148 + /* Prod other side to tell it about changes. */ 149 + vq->notify(&vq->vq); 150 + 151 + END_USE(vq); 152 + } 153 + 154 + static void detach_buf(struct vring_virtqueue *vq, unsigned int head) 155 + { 156 + unsigned int i; 157 + 158 + /* Clear data ptr. */ 159 + vq->data[head] = NULL; 160 + 161 + /* Put back on free list: find end */ 162 + i = head; 163 + while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) { 164 + i = vq->vring.desc[i].next; 165 + vq->num_free++; 166 + } 167 + 168 + vq->vring.desc[i].next = vq->free_head; 169 + vq->free_head = head; 170 + /* Plus final descriptor */ 171 + vq->num_free++; 172 + } 173 + 174 + /* FIXME: We need to tell other side about removal, to synchronize. */ 175 + static void vring_shutdown(struct virtqueue *_vq) 176 + { 177 + struct vring_virtqueue *vq = to_vvq(_vq); 178 + unsigned int i; 179 + 180 + for (i = 0; i < vq->vring.num; i++) 181 + detach_buf(vq, i); 182 + } 183 + 184 + static inline bool more_used(const struct vring_virtqueue *vq) 185 + { 186 + return vq->last_used_idx != vq->vring.used->idx; 187 + } 188 + 189 + static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len) 190 + { 191 + struct vring_virtqueue *vq = to_vvq(_vq); 192 + void *ret; 193 + unsigned int i; 194 + 195 + START_USE(vq); 196 + 197 + if (!more_used(vq)) { 198 + pr_debug("No more buffers in queue\n"); 199 + END_USE(vq); 200 + return NULL; 201 + } 202 + 203 + i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id; 204 + *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len; 205 + 206 + if (unlikely(i >= vq->vring.num)) { 207 + BAD_RING(vq, "id %u out of range\n", i); 208 + return NULL; 209 + } 210 + if (unlikely(!vq->data[i])) { 211 + BAD_RING(vq, "id %u is not a head!\n", i); 212 + return NULL; 213 + } 214 + 215 + /* detach_buf clears data, so grab it now. 
*/ 216 + ret = vq->data[i]; 217 + detach_buf(vq, i); 218 + vq->last_used_idx++; 219 + END_USE(vq); 220 + return ret; 221 + } 222 + 223 + static bool vring_restart(struct virtqueue *_vq) 224 + { 225 + struct vring_virtqueue *vq = to_vvq(_vq); 226 + 227 + START_USE(vq); 228 + BUG_ON(!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)); 229 + 230 + /* We optimistically turn back on interrupts, then check if there was 231 + * more to do. */ 232 + vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; 233 + mb(); 234 + if (unlikely(more_used(vq))) { 235 + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; 236 + END_USE(vq); 237 + return false; 238 + } 239 + 240 + END_USE(vq); 241 + return true; 242 + } 243 + 244 + irqreturn_t vring_interrupt(int irq, void *_vq) 245 + { 246 + struct vring_virtqueue *vq = to_vvq(_vq); 247 + 248 + if (!more_used(vq)) { 249 + pr_debug("virtqueue interrupt with no work for %p\n", vq); 250 + return IRQ_NONE; 251 + } 252 + 253 + if (unlikely(vq->broken)) 254 + return IRQ_HANDLED; 255 + 256 + pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); 257 + if (vq->vq.callback && !vq->vq.callback(&vq->vq)) 258 + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; 259 + 260 + return IRQ_HANDLED; 261 + } 262 + 263 + static struct virtqueue_ops vring_vq_ops = { 264 + .add_buf = vring_add_buf, 265 + .get_buf = vring_get_buf, 266 + .kick = vring_kick, 267 + .restart = vring_restart, 268 + .shutdown = vring_shutdown, 269 + }; 270 + 271 + struct virtqueue *vring_new_virtqueue(unsigned int num, 272 + struct virtio_device *vdev, 273 + void *pages, 274 + void (*notify)(struct virtqueue *), 275 + bool (*callback)(struct virtqueue *)) 276 + { 277 + struct vring_virtqueue *vq; 278 + unsigned int i; 279 + 280 + vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL); 281 + if (!vq) 282 + return NULL; 283 + 284 + vring_init(&vq->vring, num, pages); 285 + vq->vq.callback = callback; 286 + vq->vq.vdev = vdev; 287 + vq->vq.vq_ops = &vring_vq_ops; 288 + vq->notify = notify; 289 + vq->broken = false; 290 + vq->last_used_idx = 0; 291 + vq->num_added = 0; 292 + #ifdef DEBUG 293 + vq->in_use = false; 294 + #endif 295 + 296 + /* No callback? Tell other side not to bother us. */ 297 + if (!callback) 298 + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; 299 + 300 + /* Put everything in free lists. */ 301 + vq->num_free = num; 302 + vq->free_head = 0; 303 + for (i = 0; i < num-1; i++) 304 + vq->vring.desc[i].next = i+1; 305 + 306 + return &vq->vq; 307 + } 308 + 309 + void vring_del_virtqueue(struct virtqueue *vq) 310 + { 311 + kfree(to_vvq(vq)); 312 + } 313 +
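vring_new_virtqueue() is meant to be called by a transport which has already shared zeroed, page-aligned ring memory with the other side. A hedged sketch of such a call site, assuming the vring_size() helper from virtio_ring.h (not shown in this hunk) and a transport-specific notify hook:

    /* Sketch: wrap freshly allocated shared pages in a virtqueue. */
    static void demo_notify(struct virtqueue *vq)
    {
            /* Transport-specific: a hypercall, an I/O port write, ... */
    }

    static struct virtqueue *demo_make_vq(struct virtio_device *vdev,
                                          unsigned int num,
                                          bool (*cb)(struct virtqueue *))
    {
            int order = get_order(vring_size(num));
            void *pages = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
            struct virtqueue *vq;

            if (!pages)
                    return NULL;
            vq = vring_new_virtqueue(num, vdev, pages, demo_notify, cb);
            if (!vq)
                    free_pages((unsigned long)pages, order);
            return vq;
    }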
+3
include/asm-x86/Kbuild
··· 1 1 include include/asm-generic/Kbuild.asm 2 2 3 3 header-y += boot.h 4 + header-y += bootparam.h 4 5 header-y += debugreg.h 5 6 header-y += ldt.h 6 7 header-y += msr-index.h ··· 15 14 unifdef-y += a.out_64.h 16 15 unifdef-y += byteorder_32.h 17 16 unifdef-y += byteorder_64.h 17 + unifdef-y += e820.h 18 18 unifdef-y += elf_32.h 19 19 unifdef-y += elf_64.h 20 + unifdef-y += ist.h 20 21 unifdef-y += mce.h 21 22 unifdef-y += msgbuf_32.h 22 23 unifdef-y += msgbuf_64.h
+54 -54
include/asm-x86/bootparam.h
··· 10 10 #include <video/edid.h> 11 11 12 12 struct setup_header { 13 - u8 setup_sects; 14 - u16 root_flags; 15 - u32 syssize; 16 - u16 ram_size; 13 + __u8 setup_sects; 14 + __u16 root_flags; 15 + __u32 syssize; 16 + __u16 ram_size; 17 17 #define RAMDISK_IMAGE_START_MASK 0x07FF 18 18 #define RAMDISK_PROMPT_FLAG 0x8000 19 19 #define RAMDISK_LOAD_FLAG 0x4000 20 - u16 vid_mode; 21 - u16 root_dev; 22 - u16 boot_flag; 23 - u16 jump; 24 - u32 header; 25 - u16 version; 26 - u32 realmode_swtch; 27 - u16 start_sys; 28 - u16 kernel_version; 29 - u8 type_of_loader; 30 - u8 loadflags; 20 + __u16 vid_mode; 21 + __u16 root_dev; 22 + __u16 boot_flag; 23 + __u16 jump; 24 + __u32 header; 25 + __u16 version; 26 + __u32 realmode_swtch; 27 + __u16 start_sys; 28 + __u16 kernel_version; 29 + __u8 type_of_loader; 30 + __u8 loadflags; 31 31 #define LOADED_HIGH (1<<0) 32 32 #define KEEP_SEGMENTS (1<<6) 33 33 #define CAN_USE_HEAP (1<<7) 34 - u16 setup_move_size; 35 - u32 code32_start; 36 - u32 ramdisk_image; 37 - u32 ramdisk_size; 38 - u32 bootsect_kludge; 39 - u16 heap_end_ptr; 40 - u16 _pad1; 41 - u32 cmd_line_ptr; 42 - u32 initrd_addr_max; 43 - u32 kernel_alignment; 44 - u8 relocatable_kernel; 45 - u8 _pad2[3]; 46 - u32 cmdline_size; 47 - u32 hardware_subarch; 48 - u64 hardware_subarch_data; 34 + __u16 setup_move_size; 35 + __u32 code32_start; 36 + __u32 ramdisk_image; 37 + __u32 ramdisk_size; 38 + __u32 bootsect_kludge; 39 + __u16 heap_end_ptr; 40 + __u16 _pad1; 41 + __u32 cmd_line_ptr; 42 + __u32 initrd_addr_max; 43 + __u32 kernel_alignment; 44 + __u8 relocatable_kernel; 45 + __u8 _pad2[3]; 46 + __u32 cmdline_size; 47 + __u32 hardware_subarch; 48 + __u64 hardware_subarch_data; 49 49 } __attribute__((packed)); 50 50 51 51 struct sys_desc_table { 52 - u16 length; 53 - u8 table[14]; 52 + __u16 length; 53 + __u8 table[14]; 54 54 }; 55 55 56 56 struct efi_info { 57 - u32 _pad1; 58 - u32 efi_systab; 59 - u32 efi_memdesc_size; 60 - u32 efi_memdesc_version; 61 - u32 efi_memmap; 62 - u32 efi_memmap_size; 63 - u32 _pad2[2]; 57 + __u32 _pad1; 58 + __u32 efi_systab; 59 + __u32 efi_memdesc_size; 60 + __u32 efi_memdesc_version; 61 + __u32 efi_memmap; 62 + __u32 efi_memmap_size; 63 + __u32 _pad2[2]; 64 64 }; 65 65 66 66 /* The so-called "zeropage" */ 67 67 struct boot_params { 68 68 struct screen_info screen_info; /* 0x000 */ 69 69 struct apm_bios_info apm_bios_info; /* 0x040 */ 70 - u8 _pad2[12]; /* 0x054 */ 70 + __u8 _pad2[12]; /* 0x054 */ 71 71 struct ist_info ist_info; /* 0x060 */ 72 - u8 _pad3[16]; /* 0x070 */ 73 - u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ 74 - u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ 72 + __u8 _pad3[16]; /* 0x070 */ 73 + __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ 74 + __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ 75 75 struct sys_desc_table sys_desc_table; /* 0x0a0 */ 76 - u8 _pad4[144]; /* 0x0b0 */ 76 + __u8 _pad4[144]; /* 0x0b0 */ 77 77 struct edid_info edid_info; /* 0x140 */ 78 78 struct efi_info efi_info; /* 0x1c0 */ 79 - u32 alt_mem_k; /* 0x1e0 */ 80 - u32 scratch; /* Scratch field! */ /* 0x1e4 */ 81 - u8 e820_entries; /* 0x1e8 */ 82 - u8 eddbuf_entries; /* 0x1e9 */ 83 - u8 edd_mbr_sig_buf_entries; /* 0x1ea */ 84 - u8 _pad6[6]; /* 0x1eb */ 79 + __u32 alt_mem_k; /* 0x1e0 */ 80 + __u32 scratch; /* Scratch field! 
*/ /* 0x1e4 */ 81 + __u8 e820_entries; /* 0x1e8 */ 82 + __u8 eddbuf_entries; /* 0x1e9 */ 83 + __u8 edd_mbr_sig_buf_entries; /* 0x1ea */ 84 + __u8 _pad6[6]; /* 0x1eb */ 85 85 struct setup_header hdr; /* setup header */ /* 0x1f1 */ 86 - u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; 87 - u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ 86 + __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; 87 + __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ 88 88 struct e820entry e820_map[E820MAX]; /* 0x2d0 */ 89 - u8 _pad8[48]; /* 0xcd0 */ 89 + __u8 _pad8[48]; /* 0xcd0 */ 90 90 struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */ 91 - u8 _pad9[276]; /* 0xeec */ 91 + __u8 _pad9[276]; /* 0xeec */ 92 92 } __attribute__((packed)); 93 93 94 94 #endif /* _ASM_BOOTPARAM_H */
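The wholesale switch from u8/u16/u32 to the exported __u8/__u16/__u32 types is what makes this header usable from userspace, so a loader (or launcher) can build the zeropage directly. A minimal sketch with illustrative values; 0xFF is the boot protocol's "undefined" loader ID:

    /* Sketch: after copying the image's own setup_header into "boot",
     * a loader records who loaded the kernel and where the command
     * line lives. */
    static void demo_patch_zeropage(struct boot_params *boot)
    {
            boot->hdr.type_of_loader = 0xFF;        /* unregistered loader */
            boot->hdr.cmd_line_ptr = 0x20000;       /* illustrative address */
            boot->hdr.loadflags |= KEEP_SEGMENTS;   /* paravirt-style entry */
    }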
+28
include/asm-x86/e820.h
··· 1 + #ifndef __ASM_E820_H 2 + #define __ASM_E820_H 3 + #define E820MAP 0x2d0 /* our map */ 4 + #define E820MAX 128 /* number of entries in E820MAP */ 5 + #define E820NR 0x1e8 /* # entries in E820MAP */ 6 + 7 + #define E820_RAM 1 8 + #define E820_RESERVED 2 9 + #define E820_ACPI 3 10 + #define E820_NVS 4 11 + 12 + #ifndef __ASSEMBLY__ 13 + struct e820entry { 14 + __u64 addr; /* start of memory segment */ 15 + __u64 size; /* size of memory segment */ 16 + __u32 type; /* type of memory segment */ 17 + } __attribute__((packed)); 18 + 19 + struct e820map { 20 + __u32 nr_map; 21 + struct e820entry map[E820MAX]; 22 + }; 23 + #endif /* __ASSEMBLY__ */ 24 + 25 + #ifdef __KERNEL__ 1 26 #ifdef CONFIG_X86_32 2 27 # include "e820_32.h" 3 28 #else 4 29 # include "e820_64.h" 5 30 #endif 31 + #endif /* __KERNEL__ */ 32 + 33 + #endif /* __ASM_E820_H */
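Moving e820entry and e820map above the #ifdef __KERNEL__ guard (with exported types) means the same walk now works in a bootloader as in the kernel. For example, totalling the usable RAM recorded in the zeropage:

    /* Sketch: sum all E820_RAM regions from a populated boot_params. */
    static __u64 demo_usable_ram(const struct boot_params *boot)
    {
            __u64 total = 0;
            int i;

            for (i = 0; i < boot->e820_entries; i++)
                    if (boot->e820_map[i].type == E820_RAM)
                            total += boot->e820_map[i].size;
            return total;
    }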
-21
include/asm-x86/e820_32.h
··· 12 12 #ifndef __E820_HEADER 13 13 #define __E820_HEADER 14 14 15 - #define E820MAP 0x2d0 /* our map */ 16 - #define E820MAX 128 /* number of entries in E820MAP */ 17 - #define E820NR 0x1e8 /* # entries in E820MAP */ 18 - 19 - #define E820_RAM 1 20 - #define E820_RESERVED 2 21 - #define E820_ACPI 3 22 - #define E820_NVS 4 23 - 24 15 #define HIGH_MEMORY (1024*1024) 25 16 26 17 #ifndef __ASSEMBLY__ 27 - 28 - struct e820entry { 29 - u64 addr; /* start of memory segment */ 30 - u64 size; /* size of memory segment */ 31 - u32 type; /* type of memory segment */ 32 - } __attribute__((packed)); 33 - 34 - struct e820map { 35 - u32 nr_map; 36 - struct e820entry map[E820MAX]; 37 - }; 38 18 39 19 extern struct e820map e820; 40 20 ··· 36 56 #endif 37 57 38 58 #endif/*!__ASSEMBLY__*/ 39 - 40 59 #endif/*__E820_HEADER*/
-20
include/asm-x86/e820_64.h
··· 11 11 #ifndef __E820_HEADER 12 12 #define __E820_HEADER 13 13 14 - #define E820MAP 0x2d0 /* our map */ 15 - #define E820MAX 128 /* number of entries in E820MAP */ 16 - #define E820NR 0x1e8 /* # entries in E820MAP */ 17 - 18 - #define E820_RAM 1 19 - #define E820_RESERVED 2 20 - #define E820_ACPI 3 21 - #define E820_NVS 4 22 - 23 14 #ifndef __ASSEMBLY__ 24 - struct e820entry { 25 - u64 addr; /* start of memory segment */ 26 - u64 size; /* size of memory segment */ 27 - u32 type; /* type of memory segment */ 28 - } __attribute__((packed)); 29 - 30 - struct e820map { 31 - u32 nr_map; 32 - struct e820entry map[E820MAX]; 33 - }; 34 - 35 15 extern unsigned long find_e820_area(unsigned long start, unsigned long end, 36 16 unsigned size); 37 17 extern void add_memory_region(unsigned long start, unsigned long size,
+6 -6
include/asm-x86/ist.h
··· 17 17 */ 18 18 19 19 20 - #ifdef __KERNEL__ 21 - 22 20 #include <linux/types.h> 23 21 24 22 struct ist_info { 25 - u32 signature; 26 - u32 command; 27 - u32 event; 28 - u32 perf_level; 23 + __u32 signature; 24 + __u32 command; 25 + __u32 event; 26 + __u32 perf_level; 29 27 }; 28 + 29 + #ifdef __KERNEL__ 30 30 31 31 extern struct ist_info ist_info; 32 32
+86
include/asm-x86/lguest.h
··· 1 + #ifndef _X86_LGUEST_H 2 + #define _X86_LGUEST_H 3 + 4 + #define GDT_ENTRY_LGUEST_CS 10 5 + #define GDT_ENTRY_LGUEST_DS 11 6 + #define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) 7 + #define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) 8 + 9 + #ifndef __ASSEMBLY__ 10 + #include <asm/desc.h> 11 + 12 + #define GUEST_PL 1 13 + 14 + /* Every guest maps the core switcher code. */ 15 + #define SHARED_SWITCHER_PAGES \ 16 + DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) 17 + /* Pages for switcher itself, then two pages per cpu */ 18 + #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) 19 + 20 + /* We map at -4M for ease of mapping into the guest (one PTE page). */ 21 + #define SWITCHER_ADDR 0xFFC00000 22 + 23 + /* Found in switcher.S */ 24 + extern unsigned long default_idt_entries[]; 25 + 26 + struct lguest_regs 27 + { 28 + /* Manually saved part. */ 29 + unsigned long eax, ebx, ecx, edx; 30 + unsigned long esi, edi, ebp; 31 + unsigned long gs; 32 + unsigned long fs, ds, es; 33 + unsigned long trapnum, errcode; 34 + /* Trap pushed part */ 35 + unsigned long eip; 36 + unsigned long cs; 37 + unsigned long eflags; 38 + unsigned long esp; 39 + unsigned long ss; 40 + }; 41 + 42 + /* This is a guest-specific page (mapped ro) into the guest. */ 43 + struct lguest_ro_state 44 + { 45 + /* Host information we need to restore when we switch back. */ 46 + u32 host_cr3; 47 + struct Xgt_desc_struct host_idt_desc; 48 + struct Xgt_desc_struct host_gdt_desc; 49 + u32 host_sp; 50 + 51 + /* Fields which are used when guest is running. */ 52 + struct Xgt_desc_struct guest_idt_desc; 53 + struct Xgt_desc_struct guest_gdt_desc; 54 + struct i386_hw_tss guest_tss; 55 + struct desc_struct guest_idt[IDT_ENTRIES]; 56 + struct desc_struct guest_gdt[GDT_ENTRIES]; 57 + }; 58 + 59 + struct lguest_arch 60 + { 61 + /* The GDT entries copied into lguest_ro_state when running. */ 62 + struct desc_struct gdt[GDT_ENTRIES]; 63 + 64 + /* The IDT entries: some copied into lguest_ro_state when running. */ 65 + struct desc_struct idt[IDT_ENTRIES]; 66 + 67 + /* The address of the last guest-visible pagefault (ie. cr2). */ 68 + unsigned long last_pagefault; 69 + }; 70 + 71 + static inline void lguest_set_ts(void) 72 + { 73 + u32 cr0; 74 + 75 + cr0 = read_cr0(); 76 + if (!(cr0 & 8)) 77 + write_cr0(cr0|8); 78 + } 79 + 80 + /* Full 4G segment descriptors, suitable for CS and DS. */ 81 + #define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) 82 + #define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) 83 + 84 + #endif /* __ASSEMBLY__ */ 85 + 86 + #endif
+71
include/asm-x86/lguest_hcall.h
··· 1 + /* Architecture specific portion of the lguest hypercalls */
2 + #ifndef _X86_LGUEST_HCALL_H
3 + #define _X86_LGUEST_HCALL_H
4 +
5 + #define LHCALL_FLUSH_ASYNC 0
6 + #define LHCALL_LGUEST_INIT 1
7 + #define LHCALL_CRASH 2
8 + #define LHCALL_LOAD_GDT 3
9 + #define LHCALL_NEW_PGTABLE 4
10 + #define LHCALL_FLUSH_TLB 5
11 + #define LHCALL_LOAD_IDT_ENTRY 6
12 + #define LHCALL_SET_STACK 7
13 + #define LHCALL_TS 8
14 + #define LHCALL_SET_CLOCKEVENT 9
15 + #define LHCALL_HALT 10
16 + #define LHCALL_SET_PTE 14
17 + #define LHCALL_SET_PMD 15
18 + #define LHCALL_LOAD_TLS 16
19 + #define LHCALL_NOTIFY 17
20 +
21 + /*G:031 First, how does our Guest contact the Host to ask for privileged
22 + * operations? There are two ways: the direct way is to make a "hypercall",
23 + * a direct request of the Host Itself.
24 + *
25 + * Our hypercall mechanism uses the highest unused trap code (traps 32 and
26 + * above are used by real hardware interrupts). Fifteen hypercalls are
27 + * available: the hypercall number is put in the %eax register, and the
28 + * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
29 + * value makes sense, it's returned in %eax.
30 + *
31 + * Grossly invalid calls result in Sudden Death at the hands of the vengeful
32 + * Host, rather than returning failure. This reflects Winston Churchill's
33 + * definition of a gentleman: "someone who is only rude intentionally". */
34 + #define LGUEST_TRAP_ENTRY 0x1F
35 +
36 + #ifndef __ASSEMBLY__
37 + #include <asm/hw_irq.h>
38 +
39 + static inline unsigned long
40 + hcall(unsigned long call,
41 + unsigned long arg1, unsigned long arg2, unsigned long arg3)
42 + {
43 + /* "int" is the Intel instruction to trigger a trap. */
44 + asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
45 + /* The call is in %eax (aka "a"), and can be replaced */
46 + : "=a"(call)
47 + /* The arguments are in %eax, %edx, %ebx & %ecx */
48 + : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
49 + /* "memory" means this might write somewhere in memory.
50 + * This isn't true for all calls, but it's safe to tell
51 + * gcc that it might happen so it doesn't get clever. */
52 + : "memory");
53 + return call;
54 + }
55 + /*:*/
56 +
57 + void async_hcall(unsigned long call,
58 + unsigned long arg1, unsigned long arg2, unsigned long arg3);
59 +
60 + /* Can't use our min() macro here: needs to be a constant */
61 + #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
62 +
63 + #define LHCALL_RING_SIZE 64
64 + struct hcall_args
65 + {
66 + /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
67 + unsigned long arg0, arg2, arg3, arg1;
68 + };
69 +
70 + #endif /* !__ASSEMBLY__ */
71 + #endif /* _X86_LGUEST_HCALL_H */
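In use, a direct hypercall is just the trap with unused arguments set to zero. For instance, a Guest can pull the emergency cord like this (hedged sketch; LHCALL_CRASH takes the guest-physical address of a message string, hence the __pa()):

    /* Sketch: die noisily. The Host prints the message and kills us. */
    static void demo_crash(const char *msg)
    {
            hcall(LHCALL_CRASH, __pa(msg), 0, 0);
    }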
+5
include/linux/Kbuild
··· 186 186 unifdef-y += dccp.h 187 187 unifdef-y += dirent.h 188 188 unifdef-y += dlm.h 189 + unifdef-y += edd.h 189 190 unifdef-y += elfcore.h 190 191 unifdef-y += errno.h 191 192 unifdef-y += errqueue.h ··· 307 306 unifdef-y += rtnetlink.h 308 307 unifdef-y += scc.h 309 308 unifdef-y += sched.h 309 + unifdef-y += screen_info.h 310 310 unifdef-y += sdla.h 311 311 unifdef-y += selinux_netlink.h 312 312 unifdef-y += sem.h ··· 343 341 unifdef-y += utsname.h 344 342 unifdef-y += videodev2.h 345 343 unifdef-y += videodev.h 344 + unifdef-y += virtio_config.h 345 + unifdef-y += virtio_blk.h 346 + unifdef-y += virtio_net.h 346 347 unifdef-y += wait.h 347 348 unifdef-y += wanrouter.h 348 349 unifdef-y += watchdog.h
+15 -15
include/linux/apm_bios.h
··· 16 16 * General Public License for more details. 17 17 */ 18 18 19 - typedef unsigned short apm_event_t; 20 - typedef unsigned short apm_eventinfo_t; 19 + #include <linux/types.h> 20 + 21 + struct apm_bios_info { 22 + __u16 version; 23 + __u16 cseg; 24 + __u32 offset; 25 + __u16 cseg_16; 26 + __u16 dseg; 27 + __u16 flags; 28 + __u16 cseg_len; 29 + __u16 cseg_16_len; 30 + __u16 dseg_len; 31 + }; 21 32 22 33 #ifdef __KERNEL__ 23 34 24 - #include <linux/types.h> 35 + typedef unsigned short apm_event_t; 36 + typedef unsigned short apm_eventinfo_t; 25 37 26 38 #define APM_CS (GDT_ENTRY_APMBIOS_BASE * 8) 27 39 #define APM_CS_16 (APM_CS + 8) 28 40 #define APM_DS (APM_CS_16 + 8) 29 - 30 - struct apm_bios_info { 31 - u16 version; 32 - u16 cseg; 33 - u32 offset; 34 - u16 cseg_16; 35 - u16 dseg; 36 - u16 flags; 37 - u16 cseg_len; 38 - u16 cseg_16_len; 39 - u16 dseg_len; 40 - }; 41 41 42 42 /* Results of APM Installation Check */ 43 43 #define APM_16_BIT_SUPPORT 0x0001
+69 -68
include/linux/edd.h
··· 67 67 #define EDD_INFO_USE_INT13_FN50 (1 << 7) 68 68 69 69 struct edd_device_params { 70 - u16 length; 71 - u16 info_flags; 72 - u32 num_default_cylinders; 73 - u32 num_default_heads; 74 - u32 sectors_per_track; 75 - u64 number_of_sectors; 76 - u16 bytes_per_sector; 77 - u32 dpte_ptr; /* 0xFFFFFFFF for our purposes */ 78 - u16 key; /* = 0xBEDD */ 79 - u8 device_path_info_length; /* = 44 */ 80 - u8 reserved2; 81 - u16 reserved3; 82 - u8 host_bus_type[4]; 83 - u8 interface_type[8]; 70 + __u16 length; 71 + __u16 info_flags; 72 + __u32 num_default_cylinders; 73 + __u32 num_default_heads; 74 + __u32 sectors_per_track; 75 + __u64 number_of_sectors; 76 + __u16 bytes_per_sector; 77 + __u32 dpte_ptr; /* 0xFFFFFFFF for our purposes */ 78 + __u16 key; /* = 0xBEDD */ 79 + __u8 device_path_info_length; /* = 44 */ 80 + __u8 reserved2; 81 + __u16 reserved3; 82 + __u8 host_bus_type[4]; 83 + __u8 interface_type[8]; 84 84 union { 85 85 struct { 86 - u16 base_address; 87 - u16 reserved1; 88 - u32 reserved2; 86 + __u16 base_address; 87 + __u16 reserved1; 88 + __u32 reserved2; 89 89 } __attribute__ ((packed)) isa; 90 90 struct { 91 - u8 bus; 92 - u8 slot; 93 - u8 function; 94 - u8 channel; 95 - u32 reserved; 91 + __u8 bus; 92 + __u8 slot; 93 + __u8 function; 94 + __u8 channel; 95 + __u32 reserved; 96 96 } __attribute__ ((packed)) pci; 97 97 /* pcix is same as pci */ 98 98 struct { 99 - u64 reserved; 99 + __u64 reserved; 100 100 } __attribute__ ((packed)) ibnd; 101 101 struct { 102 - u64 reserved; 102 + __u64 reserved; 103 103 } __attribute__ ((packed)) xprs; 104 104 struct { 105 - u64 reserved; 105 + __u64 reserved; 106 106 } __attribute__ ((packed)) htpt; 107 107 struct { 108 - u64 reserved; 108 + __u64 reserved; 109 109 } __attribute__ ((packed)) unknown; 110 110 } interface_path; 111 111 union { 112 112 struct { 113 - u8 device; 114 - u8 reserved1; 115 - u16 reserved2; 116 - u32 reserved3; 117 - u64 reserved4; 113 + __u8 device; 114 + __u8 reserved1; 115 + __u16 reserved2; 116 + __u32 reserved3; 117 + __u64 reserved4; 118 118 } __attribute__ ((packed)) ata; 119 119 struct { 120 - u8 device; 121 - u8 lun; 122 - u8 reserved1; 123 - u8 reserved2; 124 - u32 reserved3; 125 - u64 reserved4; 120 + __u8 device; 121 + __u8 lun; 122 + __u8 reserved1; 123 + __u8 reserved2; 124 + __u32 reserved3; 125 + __u64 reserved4; 126 126 } __attribute__ ((packed)) atapi; 127 127 struct { 128 - u16 id; 129 - u64 lun; 130 - u16 reserved1; 131 - u32 reserved2; 128 + __u16 id; 129 + __u64 lun; 130 + __u16 reserved1; 131 + __u32 reserved2; 132 132 } __attribute__ ((packed)) scsi; 133 133 struct { 134 - u64 serial_number; 135 - u64 reserved; 134 + __u64 serial_number; 135 + __u64 reserved; 136 136 } __attribute__ ((packed)) usb; 137 137 struct { 138 - u64 eui; 139 - u64 reserved; 138 + __u64 eui; 139 + __u64 reserved; 140 140 } __attribute__ ((packed)) i1394; 141 141 struct { 142 - u64 wwid; 143 - u64 lun; 142 + __u64 wwid; 143 + __u64 lun; 144 144 } __attribute__ ((packed)) fibre; 145 145 struct { 146 - u64 identity_tag; 147 - u64 reserved; 146 + __u64 identity_tag; 147 + __u64 reserved; 148 148 } __attribute__ ((packed)) i2o; 149 149 struct { 150 - u32 array_number; 151 - u32 reserved1; 152 - u64 reserved2; 150 + __u32 array_number; 151 + __u32 reserved1; 152 + __u64 reserved2; 153 153 } __attribute__ ((packed)) raid; 154 154 struct { 155 - u8 device; 156 - u8 reserved1; 157 - u16 reserved2; 158 - u32 reserved3; 159 - u64 reserved4; 155 + __u8 device; 156 + __u8 reserved1; 157 + __u16 reserved2; 158 + __u32 reserved3; 159 + __u64 
reserved4; 160 160 } __attribute__ ((packed)) sata; 161 161 struct { 162 - u64 reserved1; 163 - u64 reserved2; 162 + __u64 reserved1; 163 + __u64 reserved2; 164 164 } __attribute__ ((packed)) unknown; 165 165 } device_path; 166 - u8 reserved4; 167 - u8 checksum; 166 + __u8 reserved4; 167 + __u8 checksum; 168 168 } __attribute__ ((packed)); 169 169 170 170 struct edd_info { 171 - u8 device; 172 - u8 version; 173 - u16 interface_support; 174 - u16 legacy_max_cylinder; 175 - u8 legacy_max_head; 176 - u8 legacy_sectors_per_track; 171 + __u8 device; 172 + __u8 version; 173 + __u16 interface_support; 174 + __u16 legacy_max_cylinder; 175 + __u8 legacy_max_head; 176 + __u8 legacy_sectors_per_track; 177 177 struct edd_device_params params; 178 178 } __attribute__ ((packed)); 179 179 ··· 184 184 unsigned char edd_info_nr; 185 185 }; 186 186 187 + #ifdef __KERNEL__ 187 188 extern struct edd edd; 188 - 189 + #endif /* __KERNEL__ */ 189 190 #endif /*!__ASSEMBLY__ */ 190 191 191 192 #endif /* _LINUX_EDD_H */
+12 -68
include/linux/lguest.h
··· 1 1 /* Things the lguest guest needs to know. Note: like all lguest interfaces,
2 2 * this is subject to wild and random change between versions. */
3 - #ifndef _ASM_LGUEST_H
4 - #define _ASM_LGUEST_H
3 + #ifndef _LINUX_LGUEST_H
4 + #define _LINUX_LGUEST_H
5 5
6 6 #ifndef __ASSEMBLY__
7 + #include <linux/time.h>
7 8 #include <asm/irq.h>
8 -
9 - #define LHCALL_FLUSH_ASYNC 0
10 - #define LHCALL_LGUEST_INIT 1
11 - #define LHCALL_CRASH 2
12 - #define LHCALL_LOAD_GDT 3
13 - #define LHCALL_NEW_PGTABLE 4
14 - #define LHCALL_FLUSH_TLB 5
15 - #define LHCALL_LOAD_IDT_ENTRY 6
16 - #define LHCALL_SET_STACK 7
17 - #define LHCALL_TS 8
18 - #define LHCALL_SET_CLOCKEVENT 9
19 - #define LHCALL_HALT 10
20 - #define LHCALL_BIND_DMA 12
21 - #define LHCALL_SEND_DMA 13
22 - #define LHCALL_SET_PTE 14
23 - #define LHCALL_SET_PMD 15
24 - #define LHCALL_LOAD_TLS 16
9 + #include <asm/lguest_hcall.h>
25 10
26 11 #define LG_CLOCK_MIN_DELTA 100UL
27 12 #define LG_CLOCK_MAX_DELTA ULONG_MAX
28 -
29 - /*G:031 First, how does our Guest contact the Host to ask for privileged
30 - * operations? There are two ways: the direct way is to make a "hypercall",
31 - * to make requests of the Host Itself.
32 - *
33 - * Our hypercall mechanism uses the highest unused trap code (traps 32 and
34 - * above are used by real hardware interrupts). Seventeen hypercalls are
35 - * available: the hypercall number is put in the %eax register, and the
36 - * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
37 - * value makes sense, it's returned in %eax.
38 - *
39 - * Grossly invalid calls result in Sudden Death at the hands of the vengeful
40 - * Host, rather than returning failure. This reflects Winston Churchill's
41 - * definition of a gentleman: "someone who is only rude intentionally". */
42 - #define LGUEST_TRAP_ENTRY 0x1F
43 -
44 - static inline unsigned long
45 - hcall(unsigned long call,
46 - unsigned long arg1, unsigned long arg2, unsigned long arg3)
47 - {
48 - /* "int" is the Intel instruction to trigger a trap. */
49 - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
50 - /* The call is in %eax (aka "a"), and can be replaced */
51 - : "=a"(call)
52 - /* The other arguments are in %eax, %edx, %ebx & %ecx */
53 - : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
54 - /* "memory" means this might write somewhere in memory.
55 - * This isn't true for all calls, but it's safe to tell
56 - * gcc that it might happen so it doesn't get clever. */
57 - : "memory");
58 - return call;
59 - }
60 - /*:*/
61 -
62 - void async_hcall(unsigned long call,
63 - unsigned long arg1, unsigned long arg2, unsigned long arg3);
64 -
65 - /* Can't use our min() macro here: needs to be a constant */
66 - #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
67 -
68 - #define LHCALL_RING_SIZE 64
69 - struct hcall_ring
70 - {
71 - u32 eax, edx, ebx, ecx;
72 - };
73 -
74 14 /*G:032 The second method of communicating with the Host is via "struct
75 15 * lguest_data". The Guest's very first hypercall is to tell the Host where
··· 37 97 /* 0xFF == done (set by Host), 0 == pending (set by Guest). */
38 98 u8 hcall_status[LHCALL_RING_SIZE];
39 99 /* The actual registers for the hypercalls.
*/ 40 - struct hcall_ring hcalls[LHCALL_RING_SIZE]; 100 + struct hcall_args hcalls[LHCALL_RING_SIZE]; 41 101 42 102 /* Fields initialized by the Host at boot: */ 43 103 /* Memory not to try to access */ 44 104 unsigned long reserve_mem; 45 - /* ID of this Guest (used by network driver to set ethernet address) */ 46 - u16 guestid; 47 105 /* KHz for the TSC clock. */ 48 106 u32 tsc_khz; 107 + /* Page where the top-level pagetable is */ 108 + unsigned long pgdir; 49 109 50 110 /* Fields initialized by the Guest at boot: */ 51 111 /* Instruction range to suppress interrupts even if enabled */ 52 112 unsigned long noirq_start, noirq_end; 113 + /* Address above which page tables are all identical. */ 114 + unsigned long kernel_address; 115 + /* The vector to try to use for system calls (0x40 or 0x80). */ 116 + unsigned int syscall_vec; 53 117 }; 54 118 extern struct lguest_data lguest_data; 55 119 #endif /* __ASSEMBLY__ */ 56 - #endif /* _ASM_LGUEST_H */ 120 + #endif /* _LINUX_LGUEST_H */
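The hcall_status/hcalls ring is what gives LHCALL_FLUSH_ASYNC a purpose: the Guest queues calls and lets the Host replay them later. The Guest-side async_hcall() isn't in this hunk, but one implementation consistent with the declared layout looks roughly like this (assuming uniprocessor, which lguest Guests are; that is what makes the static index safe):

    /* Sketch: queue a hypercall; if the ring is full, fall back to a
     * synchronous trap, which also makes the Host flush the ring. */
    void async_hcall(unsigned long call,
                     unsigned long arg1, unsigned long arg2, unsigned long arg3)
    {
            static unsigned int next_call;
            unsigned long flags;

            local_irq_save(flags);
            if (lguest_data.hcall_status[next_call] != 0xFF) {
                    /* Ring full: do it the direct, slow way. */
                    hcall(call, arg1, arg2, arg3);
            } else {
                    lguest_data.hcalls[next_call].arg0 = call;
                    lguest_data.hcalls[next_call].arg1 = arg1;
                    lguest_data.hcalls[next_call].arg2 = arg2;
                    lguest_data.hcalls[next_call].arg3 = arg3;
                    /* Write the args before marking the slot pending. */
                    wmb();
                    lguest_data.hcall_status[next_call] = 0;
                    if (++next_call == LHCALL_RING_SIZE)
                            next_call = 0;
            }
            local_irq_restore(flags);
    }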
-51
include/linux/lguest_bus.h
··· 1 - #ifndef _ASM_LGUEST_DEVICE_H 2 - #define _ASM_LGUEST_DEVICE_H 3 - /* Everything you need to know about lguest devices. */ 4 - #include <linux/device.h> 5 - #include <linux/lguest.h> 6 - #include <linux/lguest_launcher.h> 7 - 8 - struct lguest_device { 9 - /* Unique busid, and index into lguest_page->devices[] */ 10 - unsigned int index; 11 - 12 - struct device dev; 13 - 14 - /* Driver can hang data off here. */ 15 - void *private; 16 - }; 17 - 18 - /*D:380 Since interrupt numbers are arbitrary, we use a convention: each device 19 - * can use the interrupt number corresponding to its index. The +1 is because 20 - * interrupt 0 is not usable (it's actually the timer interrupt). */ 21 - static inline int lgdev_irq(const struct lguest_device *dev) 22 - { 23 - return dev->index + 1; 24 - } 25 - /*:*/ 26 - 27 - /* dma args must not be vmalloced! */ 28 - void lguest_send_dma(unsigned long key, struct lguest_dma *dma); 29 - int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, 30 - unsigned int num, u8 irq); 31 - void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas); 32 - 33 - /* Map the virtual device space */ 34 - void *lguest_map(unsigned long phys_addr, unsigned long pages); 35 - void lguest_unmap(void *); 36 - 37 - struct lguest_driver { 38 - const char *name; 39 - struct module *owner; 40 - u16 device_type; 41 - int (*probe)(struct lguest_device *dev); 42 - void (*remove)(struct lguest_device *dev); 43 - 44 - struct device_driver drv; 45 - }; 46 - 47 - extern int register_lguest_driver(struct lguest_driver *drv); 48 - extern void unregister_lguest_driver(struct lguest_driver *drv); 49 - 50 - extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */ 51 - #endif /* _ASM_LGUEST_DEVICE_H */
+22 -90
include/linux/lguest_launcher.h
··· 1 1 #ifndef _ASM_LGUEST_USER 2 2 #define _ASM_LGUEST_USER 3 3 /* Everything the "lguest" userspace program needs to know. */ 4 + #include <linux/types.h> 4 5 /* They can register up to 32 arrays of lguest_dma. */ 5 6 #define LGUEST_MAX_DMA 32 6 7 /* At most we can dma 16 lguest_dma in one op. */ ··· 9 8 10 9 /* How many devices? Assume each one wants up to two dma arrays per device. */ 11 10 #define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2) 12 - 13 - /*D:200 14 - * Lguest I/O 15 - * 16 - * The lguest I/O mechanism is the only way Guests can talk to devices. There 17 - * are two hypercalls involved: SEND_DMA for output and BIND_DMA for input. In 18 - * each case, "struct lguest_dma" describes the buffer: this contains 16 19 - * addr/len pairs, and if there are fewer buffer elements the len array is 20 - * terminated with a 0. 21 - * 22 - * I/O is organized by keys: BIND_DMA attaches buffers to a particular key, and 23 - * SEND_DMA transfers to buffers bound to particular key. By convention, keys 24 - * correspond to a physical address within the device's page. This means that 25 - * devices will never accidentally end up with the same keys, and allows the 26 - * Host use The Futex Trick (as we'll see later in our journey). 27 - * 28 - * SEND_DMA simply indicates a key to send to, and the physical address of the 29 - * "struct lguest_dma" to send. The Host will write the number of bytes 30 - * transferred into the "struct lguest_dma"'s used_len member. 31 - * 32 - * BIND_DMA indicates a key to bind to, a pointer to an array of "struct 33 - * lguest_dma"s ready for receiving, the size of that array, and an interrupt 34 - * to trigger when data is received. The Host will only allow transfers into 35 - * buffers with a used_len of zero: it then sets used_len to the number of 36 - * bytes transferred and triggers the interrupt for the Guest to process the 37 - * new input. */ 38 - struct lguest_dma 39 - { 40 - /* 0 if free to be used, filled by the Host. */ 41 - u32 used_len; 42 - unsigned long addr[LGUEST_MAX_DMA_SECTIONS]; 43 - u16 len[LGUEST_MAX_DMA_SECTIONS]; 44 - }; 45 - /*:*/ 46 - 47 - /*D:460 This is the layout of a block device memory page. The Launcher sets up 48 - * the num_sectors initially to tell the Guest the size of the disk. The Guest 49 - * puts the type, sector and length of the request in the first three fields, 50 - * then DMAs to the Host. The Host processes the request, sets up the result, 51 - * then DMAs back to the Guest. */ 52 - struct lguest_block_page 53 - { 54 - /* 0 is a read, 1 is a write. */ 55 - int type; 56 - u32 sector; /* Offset in device = sector * 512. */ 57 - u32 bytes; /* Length expected to be read/written in bytes */ 58 - /* 0 = pending, 1 = done, 2 = done, error */ 59 - int result; 60 - u32 num_sectors; /* Disk length = num_sectors * 512 */ 61 - }; 62 - 63 - /*D:520 The network device is basically a memory page where all the Guests on 64 - * the network publish their MAC (ethernet) addresses: it's an array of "struct 65 - * lguest_net": */ 66 - struct lguest_net 67 - { 68 - /* Simply the mac address (with multicast bit meaning promisc). */ 69 - unsigned char mac[6]; 70 - }; 71 - /*:*/ 72 11 73 12 /* Where the Host expects the Guest to SEND_DMA console output to. */ 74 13 #define LGUEST_CONSOLE_DMA_KEY 0 ··· 22 81 * complex burden for the Host and suboptimal for the Guest, so we have our own 23 82 * "lguest" bus and simple drivers. 
24 83 * 25 - * Devices are described by an array of LGUEST_MAX_DEVICES of these structs, 26 - * placed by the Launcher just above the top of physical memory: 84 + * Devices are described by a simplified ID, a status byte, and some "config" 85 + * bytes which describe this device's configuration. This is placed by the 86 + * Launcher just above the top of physical memory: 27 87 */ 28 88 struct lguest_device_desc { 29 - /* The device type: console, network, disk etc. */ 30 - u16 type; 31 - #define LGUEST_DEVICE_T_CONSOLE 1 32 - #define LGUEST_DEVICE_T_NET 2 33 - #define LGUEST_DEVICE_T_BLOCK 3 89 + /* The device type: console, network, disk etc. Type 0 terminates. */ 90 + __u8 type; 91 + /* The number of bytes of the config array. */ 92 + __u8 config_len; 93 + /* A status byte, written by the Guest. */ 94 + __u8 status; 95 + __u8 config[0]; 96 + }; 34 97 35 - /* The specific features of this device: these depends on device type 36 - * except for LGUEST_DEVICE_F_RANDOMNESS. */ 37 - u16 features; 38 - #define LGUEST_NET_F_NOCSUM 0x4000 /* Don't bother checksumming */ 39 - #define LGUEST_DEVICE_F_RANDOMNESS 0x8000 /* IRQ is fairly random */ 40 - 41 - /* This is how the Guest reports status of the device: the Host can set 42 - * LGUEST_DEVICE_S_REMOVED to indicate removal, but the rest are only 43 - * ever manipulated by the Guest, and only ever set. */ 44 - u16 status; 45 - /* 256 and above are device specific. */ 46 - #define LGUEST_DEVICE_S_ACKNOWLEDGE 1 /* We have seen device. */ 47 - #define LGUEST_DEVICE_S_DRIVER 2 /* We have found a driver */ 48 - #define LGUEST_DEVICE_S_DRIVER_OK 4 /* Driver says OK! */ 49 - #define LGUEST_DEVICE_S_REMOVED 8 /* Device has gone away. */ 50 - #define LGUEST_DEVICE_S_REMOVED_ACK 16 /* Driver has been told. */ 51 - #define LGUEST_DEVICE_S_FAILED 128 /* Something actually failed */ 52 - 53 - /* Each device exists somewhere in Guest physical memory, over some 54 - * number of pages. */ 55 - u16 num_pages; 56 - u32 pfn; 98 + /*D:135 This is how we expect the device configuration field for a virtqueue 99 + * (type VIRTIO_CONFIG_F_VIRTQUEUE) to be laid out: */ 100 + struct lguest_vqconfig { 101 + /* The number of entries in the virtio_ring */ 102 + __u16 num; 103 + /* The interrupt we get when something happens. */ 104 + __u16 irq; 105 + /* The page number of the virtio ring for this device. */ 106 + __u32 pfn; 57 107 }; 58 108 /*:*/ 59 109 ··· 52 120 enum lguest_req 53 121 { 54 122 LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */ 55 - LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */ 123 + LHREQ_GETDMA, /* No longer used */ 56 124 LHREQ_IRQ, /* + irq */ 57 125 LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ 58 126 };
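Concretely, the Launcher lays one lguest_device_desc per device into the descriptor page, and each virtqueue becomes an ordinary [type][len][data] config field of type VIRTIO_CONFIG_F_VIRTQUEUE wrapping a struct lguest_vqconfig. A hedged sketch of that bookkeeping (bounds checks elided; the helper name is made up):

    /* Sketch: append one virtqueue field to a device's config array. */
    static void demo_add_vq_field(struct lguest_device_desc *desc,
                                  __u16 num, __u16 irq, __u32 pfn)
    {
            struct lguest_vqconfig vq = { .num = num, .irq = irq, .pfn = pfn };
            __u8 *p = desc->config + desc->config_len;

            p[0] = VIRTIO_CONFIG_F_VIRTQUEUE;       /* field type */
            p[1] = sizeof(vq);                      /* field len */
            memcpy(p + 2, &vq, sizeof(vq));
            desc->config_len += 2 + sizeof(vq);
    }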
+6
include/linux/mod_devicetable.h
··· 361 361 #define SSB_ANY_ID 0xFFFF 362 362 #define SSB_ANY_REV 0xFF 363 363 364 + struct virtio_device_id { 365 + __u32 device; 366 + __u32 vendor; 367 + }; 368 + #define VIRTIO_DEV_ANY_ID 0xffffffff 369 + 364 370 #endif /* LINUX_MOD_DEVICETABLE_H */
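A driver's id_table is scanned until a zero device id terminates it (see virtio_dev_match() earlier in this merge); virtio_net uses the VIRTIO_DEV_ANY_ID wildcard for the vendor. A vendor-locked table would instead look like this (both numbers are illustrative):

    /* Sketch: bind only to device type 42 from vendor 0x1234. */
    static struct virtio_device_id demo_id_table[] = {
            { .device = 42, .vendor = 0x1234 },
            { 0 },
    };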
+46 -35
include/linux/screen_info.h
··· 8 8 */ 9 9 10 10 struct screen_info { 11 - u8 orig_x; /* 0x00 */ 12 - u8 orig_y; /* 0x01 */ 13 - u16 ext_mem_k; /* 0x02 */ 14 - u16 orig_video_page; /* 0x04 */ 15 - u8 orig_video_mode; /* 0x06 */ 16 - u8 orig_video_cols; /* 0x07 */ 17 - u16 unused2; /* 0x08 */ 18 - u16 orig_video_ega_bx; /* 0x0a */ 19 - u16 unused3; /* 0x0c */ 20 - u8 orig_video_lines; /* 0x0e */ 21 - u8 orig_video_isVGA; /* 0x0f */ 22 - u16 orig_video_points; /* 0x10 */ 11 + __u8 orig_x; /* 0x00 */ 12 + __u8 orig_y; /* 0x01 */ 13 + __u16 ext_mem_k; /* 0x02 */ 14 + __u16 orig_video_page; /* 0x04 */ 15 + __u8 orig_video_mode; /* 0x06 */ 16 + __u8 orig_video_cols; /* 0x07 */ 17 + __u16 unused2; /* 0x08 */ 18 + __u16 orig_video_ega_bx;/* 0x0a */ 19 + __u16 unused3; /* 0x0c */ 20 + __u8 orig_video_lines; /* 0x0e */ 21 + __u8 orig_video_isVGA; /* 0x0f */ 22 + __u16 orig_video_points;/* 0x10 */ 23 23 24 24 /* VESA graphic mode -- linear frame buffer */ 25 - u16 lfb_width; /* 0x12 */ 26 - u16 lfb_height; /* 0x14 */ 27 - u16 lfb_depth; /* 0x16 */ 28 - u32 lfb_base; /* 0x18 */ 29 - u32 lfb_size; /* 0x1c */ 30 - u16 cl_magic, cl_offset; /* 0x20 */ 31 - u16 lfb_linelength; /* 0x24 */ 32 - u8 red_size; /* 0x26 */ 33 - u8 red_pos; /* 0x27 */ 34 - u8 green_size; /* 0x28 */ 35 - u8 green_pos; /* 0x29 */ 36 - u8 blue_size; /* 0x2a */ 37 - u8 blue_pos; /* 0x2b */ 38 - u8 rsvd_size; /* 0x2c */ 39 - u8 rsvd_pos; /* 0x2d */ 40 - u16 vesapm_seg; /* 0x2e */ 41 - u16 vesapm_off; /* 0x30 */ 42 - u16 pages; /* 0x32 */ 43 - u16 vesa_attributes; /* 0x34 */ 44 - u32 capabilities; /* 0x36 */ 45 - u8 _reserved[6]; /* 0x3a */ 25 + __u16 lfb_width; /* 0x12 */ 26 + __u16 lfb_height; /* 0x14 */ 27 + __u16 lfb_depth; /* 0x16 */ 28 + __u32 lfb_base; /* 0x18 */ 29 + __u32 lfb_size; /* 0x1c */ 30 + __u16 cl_magic, cl_offset; /* 0x20 */ 31 + __u16 lfb_linelength; /* 0x24 */ 32 + __u8 red_size; /* 0x26 */ 33 + __u8 red_pos; /* 0x27 */ 34 + __u8 green_size; /* 0x28 */ 35 + __u8 green_pos; /* 0x29 */ 36 + __u8 blue_size; /* 0x2a */ 37 + __u8 blue_pos; /* 0x2b */ 38 + __u8 rsvd_size; /* 0x2c */ 39 + __u8 rsvd_pos; /* 0x2d */ 40 + __u16 vesapm_seg; /* 0x2e */ 41 + __u16 vesapm_off; /* 0x30 */ 42 + __u16 pages; /* 0x32 */ 43 + __u16 vesa_attributes; /* 0x34 */ 44 + __u32 capabilities; /* 0x36 */ 45 + __u8 _reserved[6]; /* 0x3a */ 46 46 } __attribute__((packed)); 47 - 48 - extern struct screen_info screen_info; 49 47 50 48 #define VIDEO_TYPE_MDA 0x10 /* Monochrome Text Display */ 51 49 #define VIDEO_TYPE_CGA 0x11 /* CGA Display */ ··· 62 64 #define VIDEO_TYPE_SUNPCI 0x51 /* Sun PCI based frame buffer. */ 63 65 64 66 #define VIDEO_TYPE_PMAC 0x60 /* PowerMacintosh frame buffer. */ 67 + 68 + #ifdef __KERNEL__ 69 + extern struct screen_info screen_info; 70 + 71 + #define ORIG_X (screen_info.orig_x) 72 + #define ORIG_Y (screen_info.orig_y) 73 + #define ORIG_VIDEO_MODE (screen_info.orig_video_mode) 74 + #define ORIG_VIDEO_COLS (screen_info.orig_video_cols) 75 + #define ORIG_VIDEO_EGA_BX (screen_info.orig_video_ega_bx) 76 + #define ORIG_VIDEO_LINES (screen_info.orig_video_lines) 77 + #define ORIG_VIDEO_ISVGA (screen_info.orig_video_isVGA) 78 + #define ORIG_VIDEO_POINTS (screen_info.orig_video_points) 79 + #endif /* __KERNEL__ */ 65 80 66 81 #endif /* _SCREEN_INFO_H */
+110
include/linux/virtio.h
··· 1 + #ifndef _LINUX_VIRTIO_H 2 + #define _LINUX_VIRTIO_H 3 + /* Everything a virtio driver needs to work with any particular virtio 4 + * implementation. */ 5 + #include <linux/types.h> 6 + #include <linux/scatterlist.h> 7 + #include <linux/spinlock.h> 8 + #include <linux/device.h> 9 + #include <linux/mod_devicetable.h> 10 + 11 + /** 12 + * virtqueue - a queue to register buffers for sending or receiving. 13 + * @callback: the function to call when buffers are consumed (can be NULL). 14 + * If this returns false, callbacks are suppressed until vq_ops->restart 15 + * is called. 16 + * @vdev: the virtio device this queue was created for. 17 + * @vq_ops: the operations for this virtqueue (see below). 18 + * @priv: a pointer for the virtqueue implementation to use. 19 + */ 20 + struct virtqueue 21 + { 22 + bool (*callback)(struct virtqueue *vq); 23 + struct virtio_device *vdev; 24 + struct virtqueue_ops *vq_ops; 25 + void *priv; 26 + }; 27 + 28 + /** 29 + * virtqueue_ops - operations for virtqueue abstraction layer 30 + * @add_buf: expose buffer to other end 31 + * vq: the struct virtqueue we're talking about. 32 + * sg: the description of the buffer(s). 33 + * out_num: the number of sg readable by other side 34 + * in_num: the number of sg which are writable (after readable ones) 35 + * data: the token identifying the buffer. 36 + * Returns 0 or an error. 37 + * @kick: update after add_buf 38 + * vq: the struct virtqueue 39 + * After one or more add_buf calls, invoke this to kick the other side. 40 + * @get_buf: get the next used buffer 41 + * vq: the struct virtqueue we're talking about. 42 + * len: the length written into the buffer 43 + * Returns NULL or the "data" token handed to add_buf. 44 + * @restart: restart callbacks after callback returned false. 45 + * vq: the struct virtqueue we're talking about. 46 + * This returns "false" (and doesn't re-enable) if there are pending 47 + * buffers in the queue, to avoid a race. 48 + * @shutdown: "unadd" all buffers. 49 + * vq: the struct virtqueue we're talking about. 50 + * Remove everything from the queue. 51 + * 52 + * Locking rules are straightforward: the driver is responsible for 53 + * locking. No two operations may be invoked simultaneously. 54 + * 55 + * All operations can be called in any context. 56 + */ 57 + struct virtqueue_ops { 58 + int (*add_buf)(struct virtqueue *vq, 59 + struct scatterlist sg[], 60 + unsigned int out_num, 61 + unsigned int in_num, 62 + void *data); 63 + 64 + void (*kick)(struct virtqueue *vq); 65 + 66 + void *(*get_buf)(struct virtqueue *vq, unsigned int *len); 67 + 68 + bool (*restart)(struct virtqueue *vq); 69 + 70 + void (*shutdown)(struct virtqueue *vq); 71 + }; 72 + 73 + /** 74 + * virtio_device - representation of a device using virtio 75 + * @index: unique position on the virtio bus 76 + * @dev: underlying device. 77 + * @id: the device type identification (used to match it with a driver). 78 + * @config: the configuration ops for this device. 79 + * @priv: private pointer for the driver's use. 80 + */ 81 + struct virtio_device 82 + { 83 + int index; 84 + struct device dev; 85 + struct virtio_device_id id; 86 + struct virtio_config_ops *config; 87 + void *priv; 88 + }; 89 + 90 + int register_virtio_device(struct virtio_device *dev); 91 + void unregister_virtio_device(struct virtio_device *dev); 92 + 93 + /** 94 + * virtio_driver - operations for a virtio I/O driver 95 + * @driver: underlying device driver (populate name and owner). 96 + * @id_table: the ids serviced by this driver. 
97 + * @probe: the function to call when a device is found.  Returns 0 on
98 + *	success, or a negative errno.
99 + * @remove: the function to call when a device is removed.
100 + */
101 + struct virtio_driver {
102 + struct device_driver driver;
103 + const struct virtio_device_id *id_table;
104 + int (*probe)(struct virtio_device *dev);
105 + void (*remove)(struct virtio_device *dev);
106 + };
107 +
108 + int register_virtio_driver(struct virtio_driver *drv);
109 + void unregister_virtio_driver(struct virtio_driver *drv);
110 + #endif /* _LINUX_VIRTIO_H */
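Taken together, virtqueue_ops and virtio_driver are the whole driver-side contract, so a sketch may help. Everything below is illustrative: the device type 4, the "example" names, and the single-virtqueue probe are assumptions for demonstration, not code from any in-tree driver.

#include <linux/err.h>
#include <linux/module.h>
#include <linux/virtio.h>

/* Hypothetical device type; real IDs live in the virtio_*.h headers. */
static struct virtio_device_id id_table[] = {
	{ 4, VIRTIO_DEV_ANY_ID },
	{ 0 },
};
MODULE_DEVICE_TABLE(virtio, id_table);

static int example_probe(struct virtio_device *vdev)
{
	struct virtqueue *vq;

	/* Ask the transport for our first virtqueue; no callback needed. */
	vq = vdev->config->find_vq(vdev, NULL);
	if (IS_ERR(vq))
		return PTR_ERR(vq);
	vdev->priv = vq;
	return 0;
}

static void example_remove(struct virtio_device *vdev)
{
	vdev->config->del_vq(vdev->priv);
}

static struct virtio_driver example_driver = {
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.probe =	example_probe,
	.remove =	example_remove,
};

static int __init example_init(void)
{
	return register_virtio_driver(&example_driver);
}
module_init(example_init);

static void __exit example_fini(void)
{
	unregister_virtio_driver(&example_driver);
}
module_exit(example_fini);
MODULE_LICENSE("GPL");

Note how the driver never sees the transport: all device-specific access goes through vdev->config, exactly as the virtio_config_ops abstraction below intends.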
+51
include/linux/virtio_blk.h
··· 1 + #ifndef _LINUX_VIRTIO_BLK_H 2 + #define _LINUX_VIRTIO_BLK_H 3 + #include <linux/virtio_config.h> 4 + 5 + /* The ID for virtio_block */ 6 + #define VIRTIO_ID_BLOCK 2 7 + 8 + /* Feature bits */ 9 + #define VIRTIO_CONFIG_BLK_F 0x40 10 + #define VIRTIO_BLK_F_BARRIER 1 /* Does host support barriers? */ 11 + 12 + /* The capacity (in 512-byte sectors). */ 13 + #define VIRTIO_CONFIG_BLK_F_CAPACITY 0x41 14 + /* The maximum segment size. */ 15 + #define VIRTIO_CONFIG_BLK_F_SIZE_MAX 0x42 16 + /* The maximum number of segments. */ 17 + #define VIRTIO_CONFIG_BLK_F_SEG_MAX 0x43 18 + 19 + /* These two define direction. */ 20 + #define VIRTIO_BLK_T_IN 0 21 + #define VIRTIO_BLK_T_OUT 1 22 + 23 + /* This bit says it's a scsi command, not an actual read or write. */ 24 + #define VIRTIO_BLK_T_SCSI_CMD 2 25 + 26 + /* Barrier before this op. */ 27 + #define VIRTIO_BLK_T_BARRIER 0x80000000 28 + 29 + /* This is the first element of the read scatter-gather list. */ 30 + struct virtio_blk_outhdr 31 + { 32 + /* VIRTIO_BLK_T* */ 33 + __u32 type; 34 + /* io priority. */ 35 + __u32 ioprio; 36 + /* Sector (ie. 512 byte offset) */ 37 + __u64 sector; 38 + /* Where to put reply. */ 39 + __u64 id; 40 + }; 41 + 42 + #define VIRTIO_BLK_S_OK 0 43 + #define VIRTIO_BLK_S_IOERR 1 44 + #define VIRTIO_BLK_S_UNSUPP 2 45 + 46 + /* This is the first element of the write scatter-gather list */ 47 + struct virtio_blk_inhdr 48 + { 49 + unsigned char status; 50 + }; 51 + #endif /* _LINUX_VIRTIO_BLK_H */
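The outhdr/data/inhdr split maps straight onto add_buf()'s convention of readable entries followed by writable ones. A minimal sketch of queuing a one-sector read, assuming the caller owns the request structure and the device's virtqueue (all "example" names are hypothetical):

#include <linux/errno.h>
#include <linux/scatterlist.h>
#include <linux/types.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>

struct example_req {
	struct virtio_blk_outhdr out;
	struct virtio_blk_inhdr in;
	char data[512];
};

/* Queue a single-sector read; the reply turns up later via get_buf(). */
static int example_read_sector(struct virtqueue *vq, struct example_req *r,
			       u64 sector)
{
	struct scatterlist sg[3];
	int err;

	r->out.type = VIRTIO_BLK_T_IN;		/* T_IN == read */
	r->out.ioprio = 0;
	r->out.sector = sector;
	r->out.id = (unsigned long)r;		/* cookie for the reply */

	/* One readable entry (the header), then two writable ones
	 * (the data buffer and the status byte). */
	sg_set_buf(&sg[0], &r->out, sizeof(r->out));
	sg_set_buf(&sg[1], r->data, sizeof(r->data));
	sg_set_buf(&sg[2], &r->in, sizeof(r->in));

	err = vq->vq_ops->add_buf(vq, sg, 1, 2, r);
	if (err)
		return err;
	vq->vq_ops->kick(vq);
	return 0;
}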
+111
include/linux/virtio_config.h
··· 1 + #ifndef _LINUX_VIRTIO_CONFIG_H
2 + #define _LINUX_VIRTIO_CONFIG_H
3 + /* Virtio devices use a standardized configuration space to define their
4 + * features and pass configuration information, but each implementation can
5 + * store and access that space differently. */
6 + #include <linux/types.h>
7 +
8 + /* Status byte for guest to report progress, and synchronize config. */
9 + /* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */
10 + #define VIRTIO_CONFIG_S_ACKNOWLEDGE 1
11 + /* We have found a driver for the device. */
12 + #define VIRTIO_CONFIG_S_DRIVER 2
13 + /* Driver has used its parts of the config, and is happy */
14 + #define VIRTIO_CONFIG_S_DRIVER_OK 4
15 + /* We've given up on this device. */
16 + #define VIRTIO_CONFIG_S_FAILED 0x80
17 +
18 + /* Feature byte (actually 7 bits available): */
19 + /* Requirements/features of the virtio implementation. */
20 + #define VIRTIO_CONFIG_F_VIRTIO 1
21 + /* Requirements/features of the virtqueue (may have more than one). */
22 + #define VIRTIO_CONFIG_F_VIRTQUEUE 2
23 +
24 + #ifdef __KERNEL__
25 + struct virtio_device;
26 +
27 + /**
28 + * virtio_config_ops - operations for configuring a virtio device
29 + * @find: search for the next configuration field of the given type.
30 + * vdev: the virtio_device
31 + * type: the feature type
32 + * len: the (returned) length of the field if found.
33 + * Returns a token if found, or NULL. Never returns the same field twice
34 + * (ie. it's used up).
35 + * @get: read the value of a configuration field after find().
36 + * vdev: the virtio_device
37 + * token: the token returned from find().
38 + * buf: the buffer to write the field value into.
39 + * len: the length of the buffer (given by find()).
40 + * Note that contents are conventionally little-endian.
41 + * @set: write the value of a configuration field after find().
42 + * vdev: the virtio_device
43 + * token: the token returned from find().
44 + * buf: the buffer to read the field value from.
45 + * len: the length of the buffer (given by find()).
46 + * Note that contents are conventionally little-endian.
47 + * @get_status: read the status byte
48 + * vdev: the virtio_device
49 + * Returns the status byte
50 + * @set_status: write the status byte
51 + * vdev: the virtio_device
52 + * status: the new status byte
53 + * @find_vq: find the first VIRTIO_CONFIG_F_VIRTQUEUE and create a virtqueue.
54 + * vdev: the virtio_device
55 + * callback: the virtqueue callback
56 + * Returns the new virtqueue or ERR_PTR().
57 + * @del_vq: free a virtqueue found by find_vq().
58 + */
59 + struct virtio_config_ops
60 + {
61 + void *(*find)(struct virtio_device *vdev, u8 type, unsigned *len);
62 + void (*get)(struct virtio_device *vdev, void *token,
63 + void *buf, unsigned len);
64 + void (*set)(struct virtio_device *vdev, void *token,
65 + const void *buf, unsigned len);
66 + u8 (*get_status)(struct virtio_device *vdev);
67 + void (*set_status)(struct virtio_device *vdev, u8 status);
68 + struct virtqueue *(*find_vq)(struct virtio_device *vdev,
69 + bool (*callback)(struct virtqueue *));
70 + void (*del_vq)(struct virtqueue *vq);
71 + };
72 +
73 + /**
74 + * virtio_config_val - get a single virtio config and mark it used.
75 + * @vdev: the virtio device
76 + * @type: the type to search for.
77 + * @val: a pointer to the value to fill in.
78 +
79 + * Once used, the config type is marked with VIRTIO_CONFIG_F_USED so it can't
80 + * be found again. This version does endian conversion.
*/ 81 + #define virtio_config_val(vdev, type, v) ({ \ 82 + int _err = __virtio_config_val((vdev),(type),(v),sizeof(*(v))); \ 83 + \ 84 + BUILD_BUG_ON(sizeof(*(v)) != 1 && sizeof(*(v)) != 2 \ 85 + && sizeof(*(v)) != 4 && sizeof(*(v)) != 8); \ 86 + if (!_err) { \ 87 + switch (sizeof(*(v))) { \ 88 + case 2: le16_to_cpus((__u16 *) v); break; \ 89 + case 4: le32_to_cpus((__u32 *) v); break; \ 90 + case 8: le64_to_cpus((__u64 *) v); break; \ 91 + } \ 92 + } \ 93 + _err; \ 94 + }) 95 + 96 + int __virtio_config_val(struct virtio_device *dev, 97 + u8 type, void *val, size_t size); 98 + 99 + /** 100 + * virtio_use_bit - helper to use a feature bit in a bitfield value. 101 + * @dev: the virtio device 102 + * @token: the token as returned from vdev->config->find(). 103 + * @len: the length of the field. 104 + * @bitnum: the bit to test. 105 + * 106 + * If handed a NULL token, it returns false, otherwise returns bit status. 107 + * If it's one, it sets the mirroring acknowledgement bit. */ 108 + int virtio_use_bit(struct virtio_device *vdev, 109 + void *token, unsigned int len, unsigned int bitnum); 110 + #endif /* __KERNEL__ */ 111 + #endif /* _LINUX_VIRTIO_CONFIG_H */
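As a usage illustration, the macro makes fetching one typed, endian-converted field a one-liner. A sketch, assuming a block device and borrowing VIRTIO_CONFIG_BLK_F_CAPACITY from virtio_blk.h (the "example" helper is hypothetical):

#include <linux/types.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
#include <linux/virtio_config.h>

/* Fetch the 64-bit capacity field; find() marks it used afterwards,
 * and the macro converts it from little-endian for us. */
static int example_capacity(struct virtio_device *vdev, u64 *sectors)
{
	return virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, sectors);
}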
+12
include/linux/virtio_console.h
··· 1 + #ifndef _LINUX_VIRTIO_CONSOLE_H 2 + #define _LINUX_VIRTIO_CONSOLE_H 3 + #include <linux/virtio_config.h> 4 + 5 + /* The ID for virtio console */ 6 + #define VIRTIO_ID_CONSOLE 3 7 + 8 + #ifdef __KERNEL__ 9 + int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)); 10 + #endif /* __KERNEL__ */ 11 + 12 + #endif /* _LINUX_VIRTIO_CONSOLE_H */
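The early-init hook exists so a transport (lguest, for instance) can get console output flowing before the full driver is up. A sketch of a backend registering its output routine; the transmission path here is a stand-in, not a real implementation:

#include <linux/init.h>
#include <linux/types.h>
#include <linux/virtio_console.h>

/* Stand-in output routine: would push count bytes from buf out through
 * some host-specific channel for virtqueue vq.  Returns the number of
 * characters consumed. */
static int example_put_chars(u32 vq, const char *buf, int count)
{
	/* ... transmit buf[0..count) to the host here ... */
	return count;
}

static int __init example_console_init(void)
{
	return virtio_cons_early_init(example_put_chars);
}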
+36
include/linux/virtio_net.h
··· 1 + #ifndef _LINUX_VIRTIO_NET_H 2 + #define _LINUX_VIRTIO_NET_H 3 + #include <linux/virtio_config.h> 4 + 5 + /* The ID for virtio_net */ 6 + #define VIRTIO_ID_NET 1 7 + 8 + /* The bitmap of config for virtio net */ 9 + #define VIRTIO_CONFIG_NET_F 0x40 10 + #define VIRTIO_NET_F_NO_CSUM 0 11 + #define VIRTIO_NET_F_TSO4 1 12 + #define VIRTIO_NET_F_UFO 2 13 + #define VIRTIO_NET_F_TSO4_ECN 3 14 + #define VIRTIO_NET_F_TSO6 4 15 + 16 + /* The config defining mac address. */ 17 + #define VIRTIO_CONFIG_NET_MAC_F 0x41 18 + 19 + /* This is the first element of the scatter-gather list. If you don't 20 + * specify GSO or CSUM features, you can simply ignore the header. */ 21 + struct virtio_net_hdr 22 + { 23 + #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset 24 + __u8 flags; 25 + #define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame 26 + #define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO) 27 + /* FIXME: Do we need this? If they said they can handle ECN, do they care? */ 28 + #define VIRTIO_NET_HDR_GSO_TCPV4_ECN 2 // GSO frame, IPv4 TCP w/ ECN 29 + #define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO) 30 + #define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP 31 + __u8 gso_type; 32 + __u16 gso_size; 33 + __u16 csum_start; 34 + __u16 csum_offset; 35 + }; 36 + #endif /* _LINUX_VIRTIO_NET_H */
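Each frame is preceded by one of these headers, so a driver without offloads simply zeroes it, and a partially-checksummed frame records where the checksum lives. A sketch of both cases (the "example" helpers are hypothetical):

#include <linux/string.h>
#include <linux/types.h>
#include <linux/virtio_net.h>

/* No offloads: zeroed header, GSO_NONE. */
static void example_plain_hdr(struct virtio_net_hdr *hdr)
{
	memset(hdr, 0, sizeof(*hdr));
	hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
}

/* Partially-checksummed frame: the checksum region starts csum_start
 * bytes into the packet, and the result must be stored csum_offset
 * bytes beyond that. */
static void example_csum_hdr(struct virtio_net_hdr *hdr, u16 start, u16 off)
{
	memset(hdr, 0, sizeof(*hdr));
	hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
	hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
	hdr->csum_start = start;
	hdr->csum_offset = off;
}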
+119
include/linux/virtio_ring.h
··· 1 + #ifndef _LINUX_VIRTIO_RING_H
2 + #define _LINUX_VIRTIO_RING_H
3 + /* An interface for efficient virtio implementation, currently for use by KVM
4 + * and lguest, but hopefully others soon. Do NOT change this since it will
5 + * break existing servers and clients.
6 + *
7 + * This header is BSD licensed so anyone can use the definitions to implement
8 + * compatible drivers/servers.
9 + *
10 + * Copyright Rusty Russell IBM Corporation 2007. */
11 + #include <linux/types.h>
12 +
13 + /* This marks a buffer as continuing via the next field. */
14 + #define VRING_DESC_F_NEXT 1
15 + /* This marks a buffer as write-only (otherwise read-only). */
16 + #define VRING_DESC_F_WRITE 2
17 +
18 + /* This means don't notify other side when buffer added. */
19 + #define VRING_USED_F_NO_NOTIFY 1
20 + /* This means don't interrupt guest when buffer consumed. */
21 + #define VRING_AVAIL_F_NO_INTERRUPT 1
22 +
23 + /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
24 + struct vring_desc
25 + {
26 + /* Address (guest-physical). */
27 + __u64 addr;
28 + /* Length. */
29 + __u32 len;
30 + /* The flags as indicated above. */
31 + __u16 flags;
32 + /* We chain unused descriptors via this, too */
33 + __u16 next;
34 + };
35 +
36 + struct vring_avail
37 + {
38 + __u16 flags;
39 + __u16 idx;
40 + __u16 ring[];
41 + };
42 +
43 + /* u32 is used here for ids for padding reasons. */
44 + struct vring_used_elem
45 + {
46 + /* Index of start of used descriptor chain. */
47 + __u32 id;
48 + /* Total length of the descriptor chain which was used (written to) */
49 + __u32 len;
50 + };
51 +
52 + struct vring_used
53 + {
54 + __u16 flags;
55 + __u16 idx;
56 + struct vring_used_elem ring[];
57 + };
58 +
59 + struct vring {
60 + unsigned int num;
61 +
62 + struct vring_desc *desc;
63 +
64 + struct vring_avail *avail;
65 +
66 + struct vring_used *used;
67 + };
68 +
69 + /* The standard layout for the ring is a contiguous chunk of memory which looks
70 + * like this. The used fields will be aligned to a "num+1" boundary.
71 + *
72 + * struct vring
73 + * {
74 + * // The actual descriptors (16 bytes each)
75 + * struct vring_desc desc[num];
76 +
77 + * // A ring of available descriptor heads with free-running index.
78 + * __u16 avail_flags;
79 + * __u16 avail_idx;
80 + * __u16 available[num];
81 +
82 + * // Padding so a correctly-chosen num value will cache-align used_idx.
83 + * char pad[sizeof(struct vring_desc) - sizeof(avail_flags)];
84 +
85 + * // A ring of used descriptor heads with free-running index.
86 + * __u16 used_flags;
87 + * __u16 used_idx;
88 + * struct vring_used_elem used[num];
89 + * };
90 + */
91 + static inline void vring_init(struct vring *vr, unsigned int num, void *p)
92 + {
93 + vr->num = num;
94 + vr->desc = p;
95 + vr->avail = p + num*sizeof(struct vring_desc);
96 + vr->used = p + (num+1)*(sizeof(struct vring_desc) + sizeof(__u16));
97 + }
98 +
99 + static inline unsigned vring_size(unsigned int num)
100 + {
101 + return (num + 1) * (sizeof(struct vring_desc) + sizeof(__u16))
102 + + sizeof(__u32) + num * sizeof(struct vring_used_elem);
103 + }
104 +
105 + #ifdef __KERNEL__
106 + #include <linux/irqreturn.h>
107 + struct virtio_device;
108 + struct virtqueue;
109 +
110 + struct virtqueue *vring_new_virtqueue(unsigned int num,
111 + struct virtio_device *vdev,
112 + void *pages,
113 + void (*notify)(struct virtqueue *vq),
114 + bool (*callback)(struct virtqueue *vq));
115 + void vring_del_virtqueue(struct virtqueue *vq);
116 +
117 + irqreturn_t vring_interrupt(int irq, void *_vq);
118 + #endif /* __KERNEL__ */
119 + #endif /* _LINUX_VIRTIO_RING_H */
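Putting the helpers together: a transport allocates vring_size(num) bytes of zeroed, contiguous memory and lets vring_init() carve it into the three parts. A sketch assuming a 127-entry ring, which at 16 bytes per descriptor fits comfortably inside one 4096-byte page (the "example" name is hypothetical):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/virtio_ring.h>

/* Lay a 127-entry ring out over one zeroed page. */
static int example_alloc_ring(struct vring *vr)
{
	unsigned int num = 127;
	unsigned long page;

	if (vring_size(num) > PAGE_SIZE)
		return -EINVAL;

	page = get_zeroed_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* vr->desc, vr->avail and vr->used now all point into the page. */
	vring_init(vr, num, (void *)page);
	return 0;
}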
+1
include/video/Kbuild
··· 1 1 unifdef-y += sisfb.h uvesafb.h 2 + unifdef-y += edid.h
+4 -5
include/video/edid.h
··· 1 1 #ifndef __linux_video_edid_h__ 2 2 #define __linux_video_edid_h__ 3 3 4 - #ifdef __KERNEL__ 4 + #if !defined(__KERNEL__) || defined(CONFIG_X86) 5 5 6 - 7 - #ifdef CONFIG_X86 8 6 struct edid_info { 9 7 unsigned char dummy[128]; 10 8 }; 11 9 10 + #ifdef __KERNEL__ 12 11 extern struct edid_info edid_info; 13 - #endif /* CONFIG_X86 */ 14 - 15 12 #endif /* __KERNEL__ */ 13 + 14 + #endif 16 15 17 16 #endif /* __linux_video_edid_h__ */
+18
scripts/mod/file2alias.c
··· 525 525 return 1; 526 526 } 527 527 528 + /* Looks like: virtio:dNvN */ 529 + static int do_virtio_entry(const char *filename, struct virtio_device_id *id, 530 + char *alias) 531 + { 532 + id->device = TO_NATIVE(id->device); 533 + id->vendor = TO_NATIVE(id->vendor); 534 + 535 + strcpy(alias, "virtio:"); 536 + ADD(alias, "d", 1, id->device); 537 + ADD(alias, "v", id->vendor != VIRTIO_DEV_ANY_ID, id->vendor); 538 + 539 + return 1; 540 + } 541 + 528 542 /* Ignore any prefix, eg. v850 prepends _ */ 529 543 static inline int sym_is(const char *symbol, const char *name) 530 544 { ··· 665 651 do_table(symval, sym->st_size, 666 652 sizeof(struct ssb_device_id), "ssb", 667 653 do_ssb_entry, mod); 654 + else if (sym_is(symname, "__mod_virtio_device_table")) 655 + do_table(symval, sym->st_size, 656 + sizeof(struct virtio_device_id), "virtio", 657 + do_virtio_entry, mod); 668 658 free(zeros); 669 659 } 670 660
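The upshot: a module whose table claims, say, device type 1 from any vendor is stamped with a "virtio:d00000001v*" alias, which userspace can use to autoload it when the bus announces such a device. A sketch of the driver side that feeds this (the table contents are illustrative):

#include <linux/module.h>
#include <linux/virtio.h>

static struct virtio_device_id id_table[] = {
	{ 1, VIRTIO_DEV_ANY_ID },	/* device type 1, any vendor */
	{ 0 },
};
MODULE_DEVICE_TABLE(virtio, id_table);
/* modpost then emits: MODULE_ALIAS("virtio:d00000001v*") */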