Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus

* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
  lguest: Do not append space to guest's kernel command line
  lguest: Revert 1ce70c4fac3c3954bd48c035f448793867592bc0, fix real problem.
  lguest: Sanitize the lguest clock.
  lguest: fix __get_vm_area usage.
  lguest: make sure cpu is initialized before accessing it

5 files changed, 50 insertions(+), 46 deletions(-)
Documentation/lguest/lguest.c | +5 -2

@@ -486,9 +486,12 @@
 	unsigned int i, len = 0;
 
 	for (i = 0; args[i]; i++) {
+		if (i) {
+			strcat(dst+len, " ");
+			len++;
+		}
 		strcpy(dst+len, args[i]);
-		strcat(dst+len, " ");
-		len += strlen(args[i]) + 1;
+		len += strlen(args[i]);
 	}
 	/* In case it's empty. */
 	dst[len] = '\0';
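The command-line fix is easiest to see outside the tree. Below is a minimal standalone sketch of the repaired concat() logic with a hypothetical two-argument command line (the argument strings and buffer size are made up for illustration); the point is that the separator is now written only between arguments, so the guest no longer sees a trailing space.

/* Standalone userspace sketch of the fixed concat() above, for
 * illustration only.  Separators go between arguments, not after
 * each one, so no trailing space reaches the guest. */
#include <stdio.h>
#include <string.h>

static void concat(char *dst, char *args[])
{
	unsigned int i, len = 0;

	for (i = 0; args[i]; i++) {
		if (i) {
			strcat(dst+len, " ");
			len++;
		}
		strcpy(dst+len, args[i]);
		len += strlen(args[i]);
	}
	/* In case it's empty. */
	dst[len] = '\0';
}

int main(void)
{
	char buf[64];
	char *args[] = { "root=/dev/vda", "console=hvc0", NULL };	/* hypothetical args */

	concat(buf, args);
	printf("[%s]\n", buf);	/* prints "[root=/dev/vda console=hvc0]", no trailing space */
	return 0;
}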
arch/x86/lguest/boot.c | +22 -33

@@ -84,7 +84,6 @@
 	.blocked_interrupts = { 1 }, /* Block timer interrupts */
 	.syscall_vec = SYSCALL_VECTOR,
 };
-static cycle_t clock_base;
 
 /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
  * ring buffer of stored hypercalls which the Host will run though next time we
@@ -326,8 +327,8 @@
 	case 1: /* Basic feature request. */
 		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
 		*cx &= 0x00002201;
-		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
-		*dx &= 0x07808101;
+		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
+		*dx &= 0x07808111;
 		/* The Host can do a nice optimization if it knows that the
 		 * kernel mappings (addresses above 0xC0000000 or whatever
 		 * PAGE_OFFSET is set to) haven't changed. But Linux calls
@@ -480,7 +481,7 @@
 {
 	*pmdp = pmdval;
 	lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
-		   (__pa(pmdp)&(PAGE_SIZE-1)), 0);
+		   (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
 }
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
@@ -594,19 +595,25 @@
 	return lguest_data.time.tv_sec;
 }
 
+/* The TSC is a Time Stamp Counter. The Host tells us what speed it runs at,
+ * or 0 if it's unusable as a reliable clock source. This matches what we want
+ * here: if we return 0 from this function, the x86 TSC clock will not register
+ * itself. */
+static unsigned long lguest_cpu_khz(void)
+{
+	return lguest_data.tsc_khz;
+}
+
+/* If we can't use the TSC, the kernel falls back to our "lguest_clock", where
+ * we read the time value given to us by the Host. */
 static cycle_t lguest_clock_read(void)
 {
 	unsigned long sec, nsec;
 
-	/* If the Host tells the TSC speed, we can trust that. */
-	if (lguest_data.tsc_khz)
-		return native_read_tsc();
-
-	/* If we can't use the TSC, we read the time value written by the Host.
-	 * Since it's in two parts (seconds and nanoseconds), we risk reading
-	 * it just as it's changing from 99 & 0.999999999 to 100 and 0, and
-	 * getting 99 and 0. As Linux tends to come apart under the stress of
-	 * time travel, we must be careful: */
+	/* Since the time is in two parts (seconds and nanoseconds), we risk
+	 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
+	 * and getting 99 and 0. As Linux tends to come apart under the stress
+	 * of time travel, we must be careful: */
 	do {
 		/* First we read the seconds part. */
 		sec = lguest_data.time.tv_sec;
@@ -627,26 +622,20 @@
 		/* Now if the seconds part has changed, try again. */
 	} while (unlikely(lguest_data.time.tv_sec != sec));
 
-	/* Our non-TSC clock is in real nanoseconds. */
+	/* Our lguest clock is in real nanoseconds. */
	return sec*1000000000ULL + nsec;
 }
 
-/* This is what we tell the kernel is our clocksource. */
+/* This is the fallback clocksource: lower priority than the TSC clocksource. */
 static struct clocksource lguest_clock = {
 	.name		= "lguest",
-	.rating		= 400,
+	.rating		= 200,
 	.read		= lguest_clock_read,
 	.mask		= CLOCKSOURCE_MASK(64),
 	.mult		= 1 << 22,
 	.shift		= 22,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 };
-
-/* The "scheduler clock" is just our real clock, adjusted to start at zero */
-static unsigned long long lguest_sched_clock(void)
-{
-	return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base);
-}
 
 /* We also need a "struct clock_event_device": Linux asks us to set it to go
  * off some time in the future. Actually, James Morris figured all this out, I
@@ -711,18 +712,7 @@
 	/* Set up the timer interrupt (0) to go to our simple timer routine */
 	set_irq_handler(0, lguest_time_irq);
 
-	/* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can
-	 * use the TSC, otherwise it's a dumb nanosecond-resolution clock.
-	 * Either way, the "rating" is set so high that it's always chosen over
-	 * any other clocksource. */
-	if (lguest_data.tsc_khz)
-		lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
-							 lguest_clock.shift);
-	clock_base = lguest_clock_read();
 	clocksource_register(&lguest_clock);
-
-	/* Now we've set up our clock, we can use it as the scheduler clock */
-	pv_time_ops.sched_clock = lguest_sched_clock;
 
 	/* We can't set cpumask in the initializer: damn C limitations! Set it
 	 * here and register our timer device. */
@@ -983,6 +995,7 @@
 	/* time operations */
 	pv_time_ops.get_wallclock = lguest_get_wallclock;
 	pv_time_ops.time_init = lguest_time_init;
+	pv_time_ops.get_cpu_khz = lguest_cpu_khz;
 
 	/* Now is a good time to look at the implementations of these functions
 	 * before returning to the rest of lguest_init(). */
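The clock hunk above keeps the retry loop that reads a two-part timestamp without tearing. Here is a hedged userspace sketch of that pattern; the struct and function names are stand-ins, a single-threaded demo stands in for the real Host/Guest concurrency, and a real multi-writer version would also need memory barriers.

#include <stdint.h>

/* Stand-in for the Host-updated {seconds, nanoseconds} pair. */
struct host_time {
	volatile long tv_sec;
	volatile long tv_nsec;
};

/* Retry if the seconds part changed while we read the nanoseconds:
 * otherwise we could see 99 & 0.999999999 come apart into 99 & 0. */
static uint64_t read_time(const struct host_time *t)
{
	long sec, nsec;

	do {
		sec  = t->tv_sec;
		nsec = t->tv_nsec;
	} while (t->tv_sec != sec);

	return (uint64_t)sec * 1000000000ULL + (uint64_t)nsec;
}

int main(void)
{
	struct host_time t = { .tv_sec = 99, .tv_nsec = 999999999 };

	return read_time(&t) != 99999999999ULL;	/* exit 0 on success */
}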
drivers/lguest/core.c | +13 -2

@@ -69,11 +69,22 @@
 		switcher_page[i] = virt_to_page(addr);
 	}
 
+	/* First we check that the Switcher won't overlap the fixmap area at
+	 * the top of memory. It's currently nowhere near, but it could have
+	 * very strange effects if it ever happened. */
+	if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
+		err = -ENOMEM;
+		printk("lguest: mapping switcher would thwack fixmap\n");
+		goto free_pages;
+	}
+
 	/* Now we reserve the "virtual memory area" we want: 0xFFC00000
 	 * (SWITCHER_ADDR). We might not get it in theory, but in practice
-	 * it's worked so far. */
+	 * it's worked so far. The end address needs +1 because __get_vm_area
+	 * allocates an extra guard page, so we need space for that. */
 	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
-				     VM_ALLOC, SWITCHER_ADDR, VMALLOC_END);
+				     VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR
+				     + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
 	if (!switcher_vma) {
 		err = -ENOMEM;
 		printk("lguest: could not map switcher pages high\n");
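The end-address change is pure arithmetic: __get_vm_area() tacks a guard page onto every allocation, so the window handed to it must hold TOTAL_SWITCHER_PAGES + 1 pages. A hedged sketch with made-up constants (the real TOTAL_SWITCHER_PAGES is derived from the maximum guest count, not the value used here):

#include <assert.h>

#define PAGE_SIZE		4096UL
#define SWITCHER_ADDR		0xFFC00000UL
#define TOTAL_SWITCHER_PAGES	2UL	/* illustrative value only */

int main(void)
{
	unsigned long size = TOTAL_SWITCHER_PAGES * PAGE_SIZE;
	unsigned long end  = SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES + 1) * PAGE_SIZE;

	/* The mapping plus its guard page must fit inside [start, end). */
	assert(SWITCHER_ADDR + size + PAGE_SIZE <= end);
	return 0;
}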
drivers/lguest/lguest_user.c | +9 -8

@@ -241,15 +241,16 @@
 		cpu = &lg->cpus[cpu_id];
 		if (!cpu)
 			return -EINVAL;
+
+		/* Once the Guest is dead, you can only read() why it died. */
+		if (lg->dead)
+			return -ENOENT;
+
+		/* If you're not the task which owns the Guest, all you can do
+		 * is break the Launcher out of running the Guest. */
+		if (current != cpu->tsk && req != LHREQ_BREAK)
+			return -EPERM;
 	}
-
-	/* Once the Guest is dead, all you can do is read() why it died. */
-	if (lg && lg->dead)
-		return -ENOENT;
-
-	/* If you're not the task which owns the Guest, you can only break */
-	if (lg && current != cpu->tsk && req != LHREQ_BREAK)
-		return -EPERM;
 
 	switch (req) {
 	case LHREQ_INITIALIZE:
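Per the "make sure cpu is initialized" commit above, this hunk is an ordering fix as much as a cleanup: the old code could dereference cpu->tsk on a path where cpu had never been assigned. Moving the dead/ownership checks inside the block that assigns cpu makes that path unreachable, and the lg && guards become redundant. A hedged sketch with stand-in types and request codes, not the real lguest API:

#include <stddef.h>

struct cpu { int owner_task; };
struct guest { int dead; struct cpu cpus[1]; };

/* Stand-in request codes; only the break request is allowed for non-owners. */
enum { REQ_BREAK, REQ_RUN };

static int handle_request(struct guest *lg, int req, int current_task)
{
	if (lg) {
		struct cpu *cpu = &lg->cpus[0];

		/* Both checks live where lg and cpu are known valid. */
		if (lg->dead)
			return -1;
		if (current_task != cpu->owner_task && req != REQ_BREAK)
			return -2;
	}
	/* ...dispatch req; an initialize request reaches here with lg == NULL... */
	return 0;
}

int main(void)
{
	/* The lg == NULL path never touches cpu at all. */
	return handle_request(NULL, REQ_RUN, 42);
}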
drivers/lguest/page_tables.c | +1 -1

@@ -391,7 +391,7 @@
 {
 	unsigned int i;
 	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		if (lg->pgdirs[i].gpgdir == pgtable)
+		if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
 			break;
 	return i;
 }
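The one-liner matters because a slot whose shadow page table was never allocated can still hold a gpgdir value that accidentally equals the one being looked up, and a bare equality test would happily match it. A hedged sketch with stand-in types showing the guarded lookup:

#include <stddef.h>

#define NPGDIRS 4	/* stand-in for ARRAY_SIZE(lg->pgdirs) */

struct pgdir_entry {
	unsigned long gpgdir;	/* guest-physical toplevel address */
	unsigned long *pgdir;	/* shadow toplevel; NULL if slot unused */
};

static unsigned int find_pgdir(struct pgdir_entry *pgdirs, unsigned long gpgdir)
{
	unsigned int i;

	for (i = 0; i < NPGDIRS; i++)
		if (pgdirs[i].pgdir && pgdirs[i].gpgdir == gpgdir)
			break;
	return i;	/* NPGDIRS means "not found" */
}

int main(void)
{
	unsigned long shadow;
	struct pgdir_entry pgdirs[NPGDIRS] = {
		{ 0x1000, NULL },	/* stale slot: must not match */
		{ 0x1000, &shadow },	/* live slot: the real match  */
	};

	return find_pgdir(pgdirs, 0x1000) != 1;	/* exit 0 on success */
}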