Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Provide timespec to guests rather than jiffies clock.

A non-periodic clock_event_device and the "jiffies" clock don't mix well:
tick_handle_periodic() can go into an infinite loop.

Currently lguest guests use the jiffies clock when the TSC is
unusable. Instead, make the Host write the current time into the lguest
page on every interrupt. This doesn't cost much but is more precise
and at least as accurate as the jiffies clock. It also gets rid of
the GET_WALLCLOCK hypercall.

Also, delay setting sched_clock until our clock is set up, otherwise
the early printk timestamps can go backwards (not harmful, just ugly).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Rusty Russell and committed by Linus Torvalds.
6c8dca5d a8a11f06

+60 -25
+13 -8
drivers/lguest/hypercalls.c
··· 64 64 else 65 65 guest_pagetable_flush_user(lg); 66 66 break; 67 - case LHCALL_GET_WALLCLOCK: { 68 - /* The Guest wants to know the real time in seconds since 1970, 69 - * in good Unix tradition. */ 70 - struct timespec ts; 71 - ktime_get_real_ts(&ts); 72 - regs->eax = ts.tv_sec; 73 - break; 74 - } 75 67 case LHCALL_BIND_DMA: 76 68 /* BIND_DMA really wants four arguments, but it's the only call 77 69 * which does. So the Guest packs the number of buffers and ··· 227 235 || put_user(lg->guestid, &lg->lguest_data->guestid)) 228 236 kill_guest(lg, "bad guest page %p", lg->lguest_data); 229 237 238 + /* We write the current time into the Guest's data page once now. */ 239 + write_timestamp(lg); 240 + 230 241 /* This is the one case where the above accesses might have been the 231 242 * first write to a Guest page. This may have caused a copy-on-write 232 243 * fault, but the Guest might be referring to the old (read-only) ··· 287 292 /* The hypercall is done. */ 288 293 clear_hcall(lg); 289 294 } 295 + } 296 + 297 + /* This routine supplies the Guest with time: it's used for wallclock time at 298 + * initial boot and as a rough time source if the TSC isn't available. */ 299 + void write_timestamp(struct lguest *lg) 300 + { 301 + struct timespec now; 302 + ktime_get_real_ts(&now); 303 + if (put_user(now, &lg->lguest_data->time)) 304 + kill_guest(lg, "Writing timestamp"); 290 305 }
+7
drivers/lguest/interrupts_and_traps.c
··· 175 175 * the stack as well: virtual interrupts never do. */ 176 176 set_guest_interrupt(lg, idt->a, idt->b, 0); 177 177 } 178 + 179 + /* Every time we deliver an interrupt, we update the timestamp in the 180 + * Guest's lguest_data struct. It would be better for the Guest if we 181 + * did this more often, but it can actually be quite slow: doing it 182 + * here is a compromise which means at least it gets updated every 183 + * timer interrupt. */ 184 + write_timestamp(lg); 178 185 } 179 186 180 187 /*H:220 Now we've got the routines to deliver interrupts, delivering traps
+1
drivers/lguest/lg.h
··· 256 256 257 257 /* hypercalls.c: */ 258 258 void do_hypercalls(struct lguest *lg); 259 + void write_timestamp(struct lguest *lg); 259 260 260 261 /*L:035 261 262 * Let's step aside for the moment, to study one important routine that's used
+36 -16
drivers/lguest/lguest.c
··· 643 643 * Time. 644 644 * 645 645 * It would be far better for everyone if the Guest had its own clock, but 646 - * until then it must ask the Host for the time. 646 + * until then the Host gives us the time on every interrupt. 647 647 */ 648 648 static unsigned long lguest_get_wallclock(void) 649 649 { 650 - return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0); 650 + return lguest_data.time.tv_sec; 651 651 } 652 652 653 - /* If the Host tells us we can trust the TSC, we use that, otherwise we simply 654 - * use the imprecise but reliable "jiffies" counter. */ 655 653 static cycle_t lguest_clock_read(void) 656 654 { 655 + unsigned long sec, nsec; 656 + 657 + /* If the Host tells the TSC speed, we can trust that. */ 657 658 if (lguest_data.tsc_khz) 658 659 return native_read_tsc(); 659 - else 660 - return jiffies; 660 + 661 + /* If we can't use the TSC, we read the time value written by the Host. 662 + * Since it's in two parts (seconds and nanoseconds), we risk reading 663 + * it just as it's changing from 99 & 0.999999999 to 100 and 0, and 664 + * getting 99 and 0. As Linux tends to come apart under the stress of 665 + * time travel, we must be careful: */ 666 + do { 667 + /* First we read the seconds part. */ 668 + sec = lguest_data.time.tv_sec; 669 + /* This read memory barrier tells the compiler and the CPU that 670 + * this can't be reordered: we have to complete the above 671 + * before going on. */ 672 + rmb(); 673 + /* Now we read the nanoseconds part. */ 674 + nsec = lguest_data.time.tv_nsec; 675 + /* Make sure we've done that. */ 676 + rmb(); 677 + /* Now if the seconds part has changed, try again. */ 678 + } while (unlikely(lguest_data.time.tv_sec != sec)); 679 + 680 + /* Our non-TSC clock is in real nanoseconds. */ 681 + return sec*1000000000ULL + nsec; 661 682 } 662 683 663 684 /* This is what we tell the kernel is our clocksource. 
*/ ··· 686 665 .name = "lguest", 687 666 .rating = 400, 688 667 .read = lguest_clock_read, 668 + .mask = CLOCKSOURCE_MASK(64), 669 + .mult = 1, 689 670 }; 690 671 672 + /* The "scheduler clock" is just our real clock, adjusted to start at zero */ 691 673 static unsigned long long lguest_sched_clock(void) 692 674 { 693 675 return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base); ··· 766 742 set_irq_handler(0, lguest_time_irq); 767 743 768 744 /* Our clock structure look like arch/i386/kernel/tsc.c if we can use 769 - * the TSC, otherwise it looks like kernel/time/jiffies.c. Either way, 770 - * the "rating" is initialized so high that it's always chosen over any 771 - * other clocksource. */ 745 + * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either 746 + * way, the "rating" is initialized so high that it's always chosen 747 + * over any other clocksource. */ 772 748 if (lguest_data.tsc_khz) { 773 749 lguest_clock.shift = 22; 774 750 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, 775 751 lguest_clock.shift); 776 - lguest_clock.mask = CLOCKSOURCE_MASK(64); 777 752 lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS; 778 - } else { 779 - /* To understand this, start at kernel/time/jiffies.c... */ 780 - lguest_clock.shift = 8; 781 - lguest_clock.mult = (((u64)NSEC_PER_SEC<<8)/ACTHZ) << 8; 782 - lguest_clock.mask = CLOCKSOURCE_MASK(32); 783 753 } 784 754 clock_base = lguest_clock_read(); 785 755 clocksource_register(&lguest_clock); 756 + 757 + /* Now we've set up our clock, we can use it as the scheduler clock */ 758 + paravirt_ops.sched_clock = lguest_sched_clock; 786 759 787 760 /* We can't set cpumask in the initializer: damn C limitations! Set it 788 761 * here and register our timer device. 
*/ ··· 1017 996 paravirt_ops.time_init = lguest_time_init; 1018 997 paravirt_ops.set_lazy_mode = lguest_lazy_mode; 1019 998 paravirt_ops.wbinvd = lguest_wbinvd; 1020 - paravirt_ops.sched_clock = lguest_sched_clock; 1021 999 /* Now is a good time to look at the implementations of these functions 1022 1000 * before returning to the rest of lguest_init(). */ 1023 1001
+3 -1
include/linux/lguest.h
··· 17 17 #define LHCALL_TS 8 18 18 #define LHCALL_SET_CLOCKEVENT 9 19 19 #define LHCALL_HALT 10 20 - #define LHCALL_GET_WALLCLOCK 11 21 20 #define LHCALL_BIND_DMA 12 22 21 #define LHCALL_SEND_DMA 13 23 22 #define LHCALL_SET_PTE 14 ··· 86 87 * which saves the Guest a hypercall. CR2 is the native register where 87 88 * this address would normally be found. */ 88 89 unsigned long cr2; 90 + 91 + /* Wallclock time set by the Host. */ 92 + struct timespec time; 89 93 90 94 /* Async hypercall ring. Instead of directly making hypercalls, we can 91 95 * place them in here for processing the next time the Host wants.