Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
timers, init: Limit the number of per cpu calibration bootup messages
posix-cpu-timers: optimize and document timer_create callback
clockevents: Add missing include to pacify sparse
x86: vmiclock: Fix printk format
x86: Fix printk format due to variable type change
sparc: fix printk for change of variable type
clocksource/events: Fix fallout of generic code changes
nohz: Allow 32-bit machines to sleep for more than 2.15 seconds
nohz: Track last do_timer() cpu
nohz: Prevent clocksource wrapping during idle
nohz: Type cast printk argument
mips: Use generic mult/shift factor calculation for clocks
clocksource: Provide a generic mult/shift factor calculation
clockevents: Use u32 for mult and shift factors
nohz: Introduce arch_needs_cpu
nohz: Reuse ktime in sub-functions of tick_check_idle.
time: Remove xtime_cache
time: Implement logarithmic time accumulation

+353 -178
+11 -3
arch/mips/include/asm/time.h
··· 84 84 #endif 85 85 } 86 86 87 - extern void clocksource_set_clock(struct clocksource *cs, unsigned int clock); 88 - extern void clockevent_set_clock(struct clock_event_device *cd, 89 - unsigned int clock); 87 + static inline void clocksource_set_clock(struct clocksource *cs, 88 + unsigned int clock) 89 + { 90 + clocksource_calc_mult_shift(cs, clock, 4); 91 + } 92 + 93 + static inline void clockevent_set_clock(struct clock_event_device *cd, 94 + unsigned int clock) 95 + { 96 + clockevents_calc_mult_shift(cd, clock, 4); 97 + } 90 98 91 99 #endif /* _ASM_TIME_H */
-33
arch/mips/kernel/time.c
··· 71 71 72 72 unsigned int mips_hpt_frequency; 73 73 74 - void __init clocksource_set_clock(struct clocksource *cs, unsigned int clock) 75 - { 76 - u64 temp; 77 - u32 shift; 78 - 79 - /* Find a shift value */ 80 - for (shift = 32; shift > 0; shift--) { 81 - temp = (u64) NSEC_PER_SEC << shift; 82 - do_div(temp, clock); 83 - if ((temp >> 32) == 0) 84 - break; 85 - } 86 - cs->shift = shift; 87 - cs->mult = (u32) temp; 88 - } 89 - 90 - void __cpuinit clockevent_set_clock(struct clock_event_device *cd, 91 - unsigned int clock) 92 - { 93 - u64 temp; 94 - u32 shift; 95 - 96 - /* Find a shift value */ 97 - for (shift = 32; shift > 0; shift--) { 98 - temp = (u64) clock << shift; 99 - do_div(temp, NSEC_PER_SEC); 100 - if ((temp >> 32) == 0) 101 - break; 102 - } 103 - cd->shift = shift; 104 - cd->mult = (u32) temp; 105 - } 106 - 107 74 /* 108 75 * This function exists in order to cause an error due to a duplicate 109 76 * definition if platform code should have its own implementation. The hook
+1 -1
arch/powerpc/kernel/time.c
··· 924 924 *dec = decrementer_clockevent; 925 925 dec->cpumask = cpumask_of(cpu); 926 926 927 - printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n", 927 + printk(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n", 928 928 dec->name, dec->mult, dec->shift, cpu); 929 929 930 930 clockevents_register_device(dec);
+8
arch/s390/include/asm/cputime.h
··· 183 183 unsigned long long idle_count; 184 184 unsigned long long idle_enter; 185 185 unsigned long long idle_time; 186 + int nohz_delay; 186 187 }; 187 188 188 189 DECLARE_PER_CPU(struct s390_idle_data, s390_idle); ··· 198 197 if ((&__get_cpu_var(s390_idle))->idle_enter != 0ULL) 199 198 vtime_start_cpu(); 200 199 } 200 + 201 + static inline int s390_nohz_delay(int cpu) 202 + { 203 + return per_cpu(s390_idle, cpu).nohz_delay != 0; 204 + } 205 + 206 + #define arch_needs_cpu(cpu) s390_nohz_delay(cpu) 201 207 202 208 #endif /* _S390_CPUTIME_H */
+2
arch/s390/kernel/s390_ext.c
··· 126 126 /* Serve timer interrupts first. */ 127 127 clock_comparator_work(); 128 128 kstat_cpu(smp_processor_id()).irqs[EXTERNAL_INTERRUPT]++; 129 + if (code != 0x1004) 130 + __get_cpu_var(s390_idle).nohz_delay = 1; 129 131 index = ext_hash(code); 130 132 for (p = ext_int_hash[index]; p; p = p->next) { 131 133 if (likely(p->code == code))
+2
arch/s390/kernel/vtime.c
··· 167 167 /* Wait for external, I/O or machine check interrupt. */ 168 168 psw.mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT; 169 169 170 + idle->nohz_delay = 0; 171 + 170 172 /* Check if the CPU timer needs to be reprogrammed. */ 171 173 if (vq->do_spt) { 172 174 __u64 vmax = VTIMER_MAX_SLICE;
+1 -1
arch/sparc/kernel/time_64.c
··· 847 847 sparc64_clockevent.min_delta_ns = 848 848 clockevent_delta2ns(0xF, &sparc64_clockevent); 849 849 850 - printk("clockevent: mult[%lx] shift[%d]\n", 850 + printk("clockevent: mult[%ux] shift[%d]\n", 851 851 sparc64_clockevent.mult, sparc64_clockevent.shift); 852 852 853 853 setup_sparc64_timer();
+1 -1
arch/x86/kernel/apic/apic.c
··· 647 647 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; 648 648 649 649 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); 650 - apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); 650 + apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); 651 651 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", 652 652 calibration_result); 653 653
+1 -1
arch/x86/kernel/vmiclock_32.c
··· 226 226 evt->min_delta_ns = clockevent_delta2ns(1, evt); 227 227 evt->cpumask = cpumask_of(cpu); 228 228 229 - printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 229 + printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n", 230 230 evt->name, evt->mult, evt->shift); 231 231 clockevents_register_device(evt); 232 232 }
+1
drivers/s390/cio/cio.c
··· 618 618 old_regs = set_irq_regs(regs); 619 619 s390_idle_check(); 620 620 irq_enter(); 621 + __get_cpu_var(s390_idle).nohz_delay = 1; 621 622 if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator) 622 623 /* Serve timer interrupts first. */ 623 624 clock_comparator_work();
+13 -6
include/linux/clockchips.h
··· 77 77 struct clock_event_device { 78 78 const char *name; 79 79 unsigned int features; 80 - unsigned long max_delta_ns; 81 - unsigned long min_delta_ns; 82 - unsigned long mult; 83 - int shift; 80 + u64 max_delta_ns; 81 + u64 min_delta_ns; 82 + u32 mult; 83 + u32 shift; 84 84 int rating; 85 85 int irq; 86 86 const struct cpumask *cpumask; ··· 116 116 } 117 117 118 118 /* Clock event layer functions */ 119 - extern unsigned long clockevent_delta2ns(unsigned long latch, 120 - struct clock_event_device *evt); 119 + extern u64 clockevent_delta2ns(unsigned long latch, 120 + struct clock_event_device *evt); 121 121 extern void clockevents_register_device(struct clock_event_device *dev); 122 122 123 123 extern void clockevents_exchange_device(struct clock_event_device *old, ··· 129 129 ktime_t expires, ktime_t now); 130 130 131 131 extern void clockevents_handle_noop(struct clock_event_device *dev); 132 + 133 + static inline void 134 + clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec) 135 + { 136 + return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, 137 + freq, minsec); 138 + } 132 139 133 140 #ifdef CONFIG_GENERIC_CLOCKEVENTS 134 141 extern void clockevents_notify(unsigned long reason, void *arg);
+12
include/linux/clocksource.h
··· 151 151 * subtraction of non 64 bit counters 152 152 * @mult: cycle to nanosecond multiplier 153 153 * @shift: cycle to nanosecond divisor (power of two) 154 + * @max_idle_ns: max idle time permitted by the clocksource (nsecs) 154 155 * @flags: flags describing special properties 155 156 * @vread: vsyscall based read 156 157 * @resume: resume function for the clocksource, if necessary ··· 169 168 cycle_t mask; 170 169 u32 mult; 171 170 u32 shift; 171 + u64 max_idle_ns; 172 172 unsigned long flags; 173 173 cycle_t (*vread)(void); 174 174 void (*resume)(void); ··· 280 278 extern void clocksource_resume(void); 281 279 extern struct clocksource * __init __weak clocksource_default_clock(void); 282 280 extern void clocksource_mark_unstable(struct clocksource *cs); 281 + 282 + extern void 283 + clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec); 284 + 285 + static inline void 286 + clocksource_calc_mult_shift(struct clocksource *cs, u32 freq, u32 minsec) 287 + { 288 + return clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 289 + NSEC_PER_SEC, minsec); 290 + } 283 291 284 292 #ifdef CONFIG_GENERIC_TIME_VSYSCALL 285 293 extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
+5
include/linux/tick.h
··· 43 43 * @idle_exittime: Time when the idle state was left 44 44 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped 45 45 * @sleep_length: Duration of the current idle sleep 46 + * @do_timer_last: CPU was the last one doing do_timer before going idle 46 47 */ 47 48 struct tick_sched { 48 49 struct hrtimer sched_timer; ··· 65 64 unsigned long last_jiffies; 66 65 unsigned long next_jiffies; 67 66 ktime_t idle_expires; 67 + int do_timer_last; 68 68 }; 69 69 70 70 extern void __init tick_init(void); ··· 100 98 extern struct tick_sched *tick_get_tick_sched(int cpu); 101 99 extern void tick_check_idle(int cpu); 102 100 extern int tick_oneshot_mode_active(void); 101 + # ifndef arch_needs_cpu 102 + # define arch_needs_cpu(cpu) (0) 103 + # endif 103 104 # else 104 105 static inline void tick_clock_notify(void) { } 105 106 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+1
include/linux/time.h
··· 148 148 149 149 extern struct timespec timespec_trunc(struct timespec t, unsigned gran); 150 150 extern int timekeeping_valid_for_hres(void); 151 + extern u64 timekeeping_max_deferment(void); 151 152 extern void update_wall_time(void); 152 153 extern void update_xtime_cache(u64 nsec); 153 154 extern void timekeeping_leap_insert(int leapsecond);
-4
include/linux/timex.h
··· 261 261 262 262 #define NTP_SCALE_SHIFT 32 263 263 264 - #ifdef CONFIG_NO_HZ 265 - #define NTP_INTERVAL_FREQ (2) 266 - #else 267 264 #define NTP_INTERVAL_FREQ (HZ) 268 - #endif 269 265 #define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ) 270 266 271 267 /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
+15 -9
init/calibrate.c
··· 123 123 { 124 124 unsigned long ticks, loopbit; 125 125 int lps_precision = LPS_PREC; 126 + static bool printed; 126 127 127 128 if (preset_lpj) { 128 129 loops_per_jiffy = preset_lpj; 129 - printk(KERN_INFO 130 - "Calibrating delay loop (skipped) preset value.. "); 131 - } else if ((smp_processor_id() == 0) && lpj_fine) { 130 + if (!printed) 131 + pr_info("Calibrating delay loop (skipped) " 132 + "preset value.. "); 133 + } else if ((!printed) && lpj_fine) { 132 134 loops_per_jiffy = lpj_fine; 133 - printk(KERN_INFO 134 - "Calibrating delay loop (skipped), " 135 + pr_info("Calibrating delay loop (skipped), " 135 136 "value calculated using timer frequency.. "); 136 137 } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { 137 - printk(KERN_INFO 138 - "Calibrating delay using timer specific routine.. "); 138 + if (!printed) 139 + pr_info("Calibrating delay using timer " 140 + "specific routine.. "); 139 141 } else { 140 142 loops_per_jiffy = (1<<12); 141 143 142 - printk(KERN_INFO "Calibrating delay loop... "); 144 + if (!printed) 145 + pr_info("Calibrating delay loop... "); 143 146 while ((loops_per_jiffy <<= 1) != 0) { 144 147 /* wait for "start of" clock tick */ 145 148 ticks = jiffies; ··· 173 170 loops_per_jiffy &= ~loopbit; 174 171 } 175 172 } 176 - printk(KERN_CONT "%lu.%02lu BogoMIPS (lpj=%lu)\n", 173 + if (!printed) 174 + pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n", 177 175 loops_per_jiffy/(500000/HZ), 178 176 (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy); 177 + 178 + printed = true; 179 179 }
+2 -3
kernel/cpu.c
··· 392 392 if (cpu == first_cpu) 393 393 continue; 394 394 error = _cpu_down(cpu, 1); 395 - if (!error) { 395 + if (!error) 396 396 cpumask_set_cpu(cpu, frozen_cpus); 397 - printk("CPU%d is down\n", cpu); 398 - } else { 397 + else { 399 398 printk(KERN_ERR "Error taking CPU%d down: %d\n", 400 399 cpu, error); 401 400 break;
+2 -1
kernel/hrtimer.c
··· 1238 1238 force_clock_reprogram = 1; 1239 1239 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; 1240 1240 printk(KERN_WARNING "hrtimer: interrupt too slow, " 1241 - "forcing clock min delta to %lu ns\n", dev->min_delta_ns); 1241 + "forcing clock min delta to %llu ns\n", 1242 + (unsigned long long) dev->min_delta_ns); 1242 1243 } 1243 1244 /* 1244 1245 * High resolution timer interrupt
+2 -3
kernel/posix-cpu-timers.c
··· 384 384 385 385 /* 386 386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. 387 - * This is called from sys_timer_create with the new timer already locked. 387 + * This is called from sys_timer_create() and do_cpu_nanosleep() with the 388 + * new timer already all-zeros initialized. 388 389 */ 389 390 int posix_cpu_timer_create(struct k_itimer *new_timer) 390 391 { ··· 397 396 return -EINVAL; 398 397 399 398 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 400 - new_timer->it.cpu.incr.sched = 0; 401 - new_timer->it.cpu.expires.sched = 0; 402 399 403 400 read_lock(&tasklist_lock); 404 401 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
-1
kernel/time.c
··· 136 136 write_seqlock_irq(&xtime_lock); 137 137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 138 138 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 139 - update_xtime_cache(0); 140 139 write_sequnlock_irq(&xtime_lock); 141 140 clock_was_set(); 142 141 }
+7 -6
kernel/time/clockevents.c
··· 20 20 #include <linux/sysdev.h> 21 21 #include <linux/tick.h> 22 22 23 + #include "tick-internal.h" 24 + 23 25 /* The registered clock event devices */ 24 26 static LIST_HEAD(clockevent_devices); 25 27 static LIST_HEAD(clockevents_released); ··· 39 37 * 40 38 * Math helper, returns latch value converted to nanoseconds (bound checked) 41 39 */ 42 - unsigned long clockevent_delta2ns(unsigned long latch, 43 - struct clock_event_device *evt) 40 + u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) 44 41 { 45 - u64 clc = ((u64) latch << evt->shift); 42 + u64 clc = (u64) latch << evt->shift; 46 43 47 44 if (unlikely(!evt->mult)) { 48 45 evt->mult = 1; ··· 51 50 do_div(clc, evt->mult); 52 51 if (clc < 1000) 53 52 clc = 1000; 54 - if (clc > LONG_MAX) 55 - clc = LONG_MAX; 53 + if (clc > KTIME_MAX) 54 + clc = KTIME_MAX; 56 55 57 - return (unsigned long) clc; 56 + return clc; 58 57 } 59 58 EXPORT_SYMBOL_GPL(clockevent_delta2ns); 60 59
+97
kernel/time/clocksource.c
··· 107 107 } 108 108 EXPORT_SYMBOL_GPL(timecounter_cyc2time); 109 109 110 + /** 111 + * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks 112 + * @mult: pointer to mult variable 113 + * @shift: pointer to shift variable 114 + * @from: frequency to convert from 115 + * @to: frequency to convert to 116 + * @minsec: guaranteed runtime conversion range in seconds 117 + * 118 + * The function evaluates the shift/mult pair for the scaled math 119 + * operations of clocksources and clockevents. 120 + * 121 + * @to and @from are frequency values in HZ. For clock sources @to is 122 + * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock 123 + * event @to is the counter frequency and @from is NSEC_PER_SEC. 124 + * 125 + * The @minsec conversion range argument controls the time frame in 126 + * seconds which must be covered by the runtime conversion with the 127 + * calculated mult and shift factors. This guarantees that no 64bit 128 + * overflow happens when the input value of the conversion is 129 + * multiplied with the calculated mult factor. Larger ranges may 130 + * reduce the conversion accuracy by choosing smaller mult and shift 131 + * factors. 
132 + */ 133 + void 134 + clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) 135 + { 136 + u64 tmp; 137 + u32 sft, sftacc= 32; 138 + 139 + /* 140 + * Calculate the shift factor which is limiting the conversion 141 + * range: 142 + */ 143 + tmp = ((u64)minsec * from) >> 32; 144 + while (tmp) { 145 + tmp >>=1; 146 + sftacc--; 147 + } 148 + 149 + /* 150 + * Find the conversion shift/mult pair which has the best 151 + * accuracy and fits the maxsec conversion range: 152 + */ 153 + for (sft = 32; sft > 0; sft--) { 154 + tmp = (u64) to << sft; 155 + do_div(tmp, from); 156 + if ((tmp >> sftacc) == 0) 157 + break; 158 + } 159 + *mult = tmp; 160 + *shift = sft; 161 + } 162 + 110 163 /*[Clocksource internal variables]--------- 111 164 * curr_clocksource: 112 165 * currently selected clocksource. ··· 466 413 clocksource_resume_watchdog(); 467 414 } 468 415 416 + /** 417 + * clocksource_max_deferment - Returns max time the clocksource can be deferred 418 + * @cs: Pointer to clocksource 419 + * 420 + */ 421 + static u64 clocksource_max_deferment(struct clocksource *cs) 422 + { 423 + u64 max_nsecs, max_cycles; 424 + 425 + /* 426 + * Calculate the maximum number of cycles that we can pass to the 427 + * cyc2ns function without overflowing a 64-bit signed result. The 428 + * maximum number of cycles is equal to ULLONG_MAX/cs->mult which 429 + * is equivalent to the below. 430 + * max_cycles < (2^63)/cs->mult 431 + * max_cycles < 2^(log2((2^63)/cs->mult)) 432 + * max_cycles < 2^(log2(2^63) - log2(cs->mult)) 433 + * max_cycles < 2^(63 - log2(cs->mult)) 434 + * max_cycles < 1 << (63 - log2(cs->mult)) 435 + * Please note that we add 1 to the result of the log2 to account for 436 + * any rounding errors, ensure the above inequality is satisfied and 437 + * no overflow will occur. 
438 + */ 439 + max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); 440 + 441 + /* 442 + * The actual maximum number of cycles we can defer the clocksource is 443 + * determined by the minimum of max_cycles and cs->mask. 444 + */ 445 + max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 446 + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); 447 + 448 + /* 449 + * To ensure that the clocksource does not wrap whilst we are idle, 450 + * limit the time the clocksource can be deferred by 12.5%. Please 451 + * note a margin of 12.5% is used because this can be computed with 452 + * a shift, versus say 10% which would require division. 453 + */ 454 + return max_nsecs - (max_nsecs >> 5); 455 + } 456 + 469 457 #ifdef CONFIG_GENERIC_TIME 470 458 471 459 /** ··· 605 511 */ 606 512 int clocksource_register(struct clocksource *cs) 607 513 { 514 + /* calculate max idle time permitted for this clocksource */ 515 + cs->max_idle_ns = clocksource_max_deferment(cs); 516 + 608 517 mutex_lock(&clocksource_mutex); 609 518 clocksource_enqueue(cs); 610 519 clocksource_select();
+2 -2
kernel/time/tick-oneshot.c
··· 50 50 dev->min_delta_ns += dev->min_delta_ns >> 1; 51 51 52 52 printk(KERN_WARNING 53 - "CE: %s increasing min_delta_ns to %lu nsec\n", 53 + "CE: %s increasing min_delta_ns to %llu nsec\n", 54 54 dev->name ? dev->name : "?", 55 - dev->min_delta_ns << 1); 55 + (unsigned long long) dev->min_delta_ns << 1); 56 56 57 57 i = 0; 58 58 }
+88 -53
kernel/time/tick-sched.c
··· 134 134 * value. We do this unconditionally on any cpu, as we don't know whether the 135 135 * cpu, which has the update task assigned is in a long sleep. 136 136 */ 137 - static void tick_nohz_update_jiffies(void) 137 + static void tick_nohz_update_jiffies(ktime_t now) 138 138 { 139 139 int cpu = smp_processor_id(); 140 140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 141 141 unsigned long flags; 142 - ktime_t now; 143 - 144 - if (!ts->tick_stopped) 145 - return; 146 142 147 143 cpumask_clear_cpu(cpu, nohz_cpu_mask); 148 - now = ktime_get(); 149 144 ts->idle_waketime = now; 150 145 151 146 local_irq_save(flags); ··· 150 155 touch_softlockup_watchdog(); 151 156 } 152 157 153 - static void tick_nohz_stop_idle(int cpu) 158 + static void tick_nohz_stop_idle(int cpu, ktime_t now) 154 159 { 155 160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 161 + ktime_t delta; 156 162 157 - if (ts->idle_active) { 158 - ktime_t now, delta; 159 - now = ktime_get(); 160 - delta = ktime_sub(now, ts->idle_entrytime); 161 - ts->idle_lastupdate = now; 162 - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 163 - ts->idle_active = 0; 163 + delta = ktime_sub(now, ts->idle_entrytime); 164 + ts->idle_lastupdate = now; 165 + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 166 + ts->idle_active = 0; 164 167 165 - sched_clock_idle_wakeup_event(0); 166 - } 168 + sched_clock_idle_wakeup_event(0); 167 169 } 168 170 169 171 static ktime_t tick_nohz_start_idle(struct tick_sched *ts) ··· 208 216 struct tick_sched *ts; 209 217 ktime_t last_update, expires, now; 210 218 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 219 + u64 time_delta; 211 220 int cpu; 212 221 213 222 local_irq_save(flags); ··· 256 263 257 264 if (ratelimit < 10) { 258 265 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 259 - local_softirq_pending()); 266 + (unsigned int) local_softirq_pending()); 260 267 ratelimit++; 261 268 } 262 269 goto end; ··· 268 275 seq = 
read_seqbegin(&xtime_lock); 269 276 last_update = last_jiffies_update; 270 277 last_jiffies = jiffies; 278 + time_delta = timekeeping_max_deferment(); 271 279 } while (read_seqretry(&xtime_lock, seq)); 272 280 273 - /* Get the next timer wheel timer */ 274 - next_jiffies = get_next_timer_interrupt(last_jiffies); 275 - delta_jiffies = next_jiffies - last_jiffies; 276 - 277 - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) 281 + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 282 + arch_needs_cpu(cpu)) { 283 + next_jiffies = last_jiffies + 1; 278 284 delta_jiffies = 1; 285 + } else { 286 + /* Get the next timer wheel timer */ 287 + next_jiffies = get_next_timer_interrupt(last_jiffies); 288 + delta_jiffies = next_jiffies - last_jiffies; 289 + } 279 290 /* 280 291 * Do not stop the tick, if we are only one off 281 292 * or if the cpu is required for rcu ··· 291 294 if ((long)delta_jiffies >= 1) { 292 295 293 296 /* 294 - * calculate the expiry time for the next timer wheel 295 - * timer 296 - */ 297 - expires = ktime_add_ns(last_update, tick_period.tv64 * 298 - delta_jiffies); 299 - 300 - /* 301 297 * If this cpu is the one which updates jiffies, then 302 298 * give up the assignment and let it be taken by the 303 299 * cpu which runs the tick timer next, which might be 304 300 * this cpu as well. If we don't drop this here the 305 301 * jiffies might be stale and do_timer() never 306 - * invoked. 302 + * invoked. Keep track of the fact that it was the one 303 + * which had the do_timer() duty last. If this cpu is 304 + * the one which had the do_timer() duty last, we 305 + * limit the sleep time to the timekeeping 306 + * max_deferement value which we retrieved 307 + * above. Otherwise we can sleep as long as we want. 
307 308 */ 308 - if (cpu == tick_do_timer_cpu) 309 + if (cpu == tick_do_timer_cpu) { 309 310 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 311 + ts->do_timer_last = 1; 312 + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { 313 + time_delta = KTIME_MAX; 314 + ts->do_timer_last = 0; 315 + } else if (!ts->do_timer_last) { 316 + time_delta = KTIME_MAX; 317 + } 318 + 319 + /* 320 + * calculate the expiry time for the next timer wheel 321 + * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals 322 + * that there is no timer pending or at least extremely 323 + * far into the future (12 days for HZ=1000). In this 324 + * case we set the expiry to the end of time. 325 + */ 326 + if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { 327 + /* 328 + * Calculate the time delta for the next timer event. 329 + * If the time delta exceeds the maximum time delta 330 + * permitted by the current clocksource then adjust 331 + * the time delta accordingly to ensure the 332 + * clocksource does not wrap. 333 + */ 334 + time_delta = min_t(u64, time_delta, 335 + tick_period.tv64 * delta_jiffies); 336 + } 337 + 338 + if (time_delta < KTIME_MAX) 339 + expires = ktime_add_ns(last_update, time_delta); 340 + else 341 + expires.tv64 = KTIME_MAX; 310 342 311 343 if (delta_jiffies > 1) 312 344 cpumask_set_cpu(cpu, nohz_cpu_mask); ··· 368 342 369 343 ts->idle_sleeps++; 370 344 345 + /* Mark expires */ 346 + ts->idle_expires = expires; 347 + 371 348 /* 372 - * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that 373 - * there is no timer pending or at least extremly far 374 - * into the future (12 days for HZ=1000). In this case 375 - * we simply stop the tick timer: 349 + * If the expiration time == KTIME_MAX, then 350 + * in this case we simply stop the tick timer. 
376 351 */ 377 - if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { 378 - ts->idle_expires.tv64 = KTIME_MAX; 352 + if (unlikely(expires.tv64 == KTIME_MAX)) { 379 353 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 380 354 hrtimer_cancel(&ts->sched_timer); 381 355 goto out; 382 356 } 383 - 384 - /* Mark expiries */ 385 - ts->idle_expires = expires; 386 357 387 358 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 388 359 hrtimer_start(&ts->sched_timer, expires, ··· 459 436 ktime_t now; 460 437 461 438 local_irq_disable(); 462 - tick_nohz_stop_idle(cpu); 439 + if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 440 + now = ktime_get(); 441 + 442 + if (ts->idle_active) 443 + tick_nohz_stop_idle(cpu, now); 463 444 464 445 if (!ts->inidle || !ts->tick_stopped) { 465 446 ts->inidle = 0; ··· 477 450 478 451 /* Update jiffies first */ 479 452 select_nohz_load_balancer(0); 480 - now = ktime_get(); 481 453 tick_do_update_jiffies64(now); 482 454 cpumask_clear_cpu(cpu, nohz_cpu_mask); 483 455 ··· 610 584 * timer and do not touch the other magic bits which need to be done 611 585 * when idle is left. 612 586 */ 613 - static void tick_nohz_kick_tick(int cpu) 587 + static void tick_nohz_kick_tick(int cpu, ktime_t now) 614 588 { 615 589 #if 0 616 590 /* Switch back to 2.6.27 behaviour */ 617 591 618 592 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 619 - ktime_t delta, now; 620 - 621 - if (!ts->tick_stopped) 622 - return; 593 + ktime_t delta; 623 594 624 595 /* 625 596 * Do not touch the tick device, when the next expiry is either 626 597 * already reached or less/equal than the tick period. 
627 598 */ 628 - now = ktime_get(); 629 599 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); 630 600 if (delta.tv64 <= tick_period.tv64) 631 601 return; ··· 630 608 #endif 631 609 } 632 610 611 + static inline void tick_check_nohz(int cpu) 612 + { 613 + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 614 + ktime_t now; 615 + 616 + if (!ts->idle_active && !ts->tick_stopped) 617 + return; 618 + now = ktime_get(); 619 + if (ts->idle_active) 620 + tick_nohz_stop_idle(cpu, now); 621 + if (ts->tick_stopped) { 622 + tick_nohz_update_jiffies(now); 623 + tick_nohz_kick_tick(cpu, now); 624 + } 625 + } 626 + 633 627 #else 634 628 635 629 static inline void tick_nohz_switch_to_nohz(void) { } 630 + static inline void tick_check_nohz(int cpu) { } 636 631 637 632 #endif /* NO_HZ */ 638 633 ··· 659 620 void tick_check_idle(int cpu) 660 621 { 661 622 tick_check_oneshot_broadcast(cpu); 662 - #ifdef CONFIG_NO_HZ 663 - tick_nohz_stop_idle(cpu); 664 - tick_nohz_update_jiffies(); 665 - tick_nohz_kick_tick(cpu); 666 - #endif 623 + tick_check_nohz(cpu); 667 624 } 668 625 669 626 /*
+73 -46
kernel/time/timekeeping.c
··· 165 165 /* flag for if timekeeping is suspended */ 166 166 int __read_mostly timekeeping_suspended; 167 167 168 - static struct timespec xtime_cache __attribute__ ((aligned (16))); 169 - void update_xtime_cache(u64 nsec) 170 - { 171 - xtime_cache = xtime; 172 - timespec_add_ns(&xtime_cache, nsec); 173 - } 174 - 175 168 /* must hold xtime_lock */ 176 169 void timekeeping_leap_insert(int leapsecond) 177 170 { ··· 325 332 326 333 xtime = *tv; 327 334 328 - update_xtime_cache(0); 329 - 330 335 timekeeper.ntp_error = 0; 331 336 ntp_clear(); 332 337 ··· 479 488 } 480 489 481 490 /** 491 + * timekeeping_max_deferment - Returns max time the clocksource can be deferred 492 + * 493 + * Caller must observe xtime_lock via read_seqbegin/read_seqretry to 494 + * ensure that the clocksource does not change! 495 + */ 496 + u64 timekeeping_max_deferment(void) 497 + { 498 + return timekeeper.clock->max_idle_ns; 499 + } 500 + 501 + /** 482 502 * read_persistent_clock - Return time from the persistent clock. 483 503 * 484 504 * Weak dummy function for arches that do not yet support it. ··· 550 548 } 551 549 set_normalized_timespec(&wall_to_monotonic, 552 550 -boot.tv_sec, -boot.tv_nsec); 553 - update_xtime_cache(0); 554 551 total_sleep_time.tv_sec = 0; 555 552 total_sleep_time.tv_nsec = 0; 556 553 write_sequnlock_irqrestore(&xtime_lock, flags); ··· 583 582 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 584 583 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 585 584 } 586 - update_xtime_cache(0); 587 585 /* re-base the last cycle value */ 588 586 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 589 587 timekeeper.ntp_error = 0; ··· 723 723 } 724 724 725 725 /** 726 + * logarithmic_accumulation - shifted accumulation of cycles 727 + * 728 + * This functions accumulates a shifted interval of cycles into 729 + * into a shifted interval nanoseconds. Allows for O(log) accumulation 730 + * loop. 731 + * 732 + * Returns the unconsumed cycles. 
733 + */ 734 + static cycle_t logarithmic_accumulation(cycle_t offset, int shift) 735 + { 736 + u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 737 + 738 + /* If the offset is smaller then a shifted interval, do nothing */ 739 + if (offset < timekeeper.cycle_interval<<shift) 740 + return offset; 741 + 742 + /* Accumulate one shifted interval */ 743 + offset -= timekeeper.cycle_interval << shift; 744 + timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; 745 + 746 + timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; 747 + while (timekeeper.xtime_nsec >= nsecps) { 748 + timekeeper.xtime_nsec -= nsecps; 749 + xtime.tv_sec++; 750 + second_overflow(); 751 + } 752 + 753 + /* Accumulate into raw time */ 754 + raw_time.tv_nsec += timekeeper.raw_interval << shift;; 755 + while (raw_time.tv_nsec >= NSEC_PER_SEC) { 756 + raw_time.tv_nsec -= NSEC_PER_SEC; 757 + raw_time.tv_sec++; 758 + } 759 + 760 + /* Accumulate error between NTP and clock interval */ 761 + timekeeper.ntp_error += tick_length << shift; 762 + timekeeper.ntp_error -= timekeeper.xtime_interval << 763 + (timekeeper.ntp_error_shift + shift); 764 + 765 + return offset; 766 + } 767 + 768 + /** 726 769 * update_wall_time - Uses the current clocksource to increment the wall time 727 770 * 728 771 * Called from the timer interrupt, must hold a write on xtime_lock. ··· 774 731 { 775 732 struct clocksource *clock; 776 733 cycle_t offset; 777 - u64 nsecs; 734 + int shift = 0, maxshift; 778 735 779 736 /* Make sure we're fully resumed: */ 780 737 if (unlikely(timekeeping_suspended)) ··· 788 745 #endif 789 746 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 790 747 791 - /* normally this loop will run just once, however in the 792 - * case of lost or late ticks, it will accumulate correctly. 748 + /* 749 + * With NO_HZ we may have to accumulate many cycle_intervals 750 + * (think "ticks") worth of time at once. 
To do this efficiently, 751 + * we calculate the largest doubling multiple of cycle_intervals 752 + * that is smaller then the offset. We then accumulate that 753 + * chunk in one go, and then try to consume the next smaller 754 + * doubled multiple. 793 755 */ 756 + shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); 757 + shift = max(0, shift); 758 + /* Bound shift to one less then what overflows tick_length */ 759 + maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; 760 + shift = min(shift, maxshift); 794 761 while (offset >= timekeeper.cycle_interval) { 795 - u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 796 - 797 - /* accumulate one interval */ 798 - offset -= timekeeper.cycle_interval; 799 - clock->cycle_last += timekeeper.cycle_interval; 800 - 801 - timekeeper.xtime_nsec += timekeeper.xtime_interval; 802 - if (timekeeper.xtime_nsec >= nsecps) { 803 - timekeeper.xtime_nsec -= nsecps; 804 - xtime.tv_sec++; 805 - second_overflow(); 806 - } 807 - 808 - raw_time.tv_nsec += timekeeper.raw_interval; 809 - if (raw_time.tv_nsec >= NSEC_PER_SEC) { 810 - raw_time.tv_nsec -= NSEC_PER_SEC; 811 - raw_time.tv_sec++; 812 - } 813 - 814 - /* accumulate error between NTP and clock interval */ 815 - timekeeper.ntp_error += tick_length; 816 - timekeeper.ntp_error -= timekeeper.xtime_interval << 817 - timekeeper.ntp_error_shift; 762 + offset = logarithmic_accumulation(offset, shift); 763 + shift--; 818 764 } 819 765 820 766 /* correct the clock when NTP error is too big */ ··· 838 806 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; 839 807 timekeeper.ntp_error += timekeeper.xtime_nsec << 840 808 timekeeper.ntp_error_shift; 841 - 842 - nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); 843 - update_xtime_cache(nsecs); 844 809 845 810 /* check to see if there is a new clocksource to use */ 846 811 update_vsyscall(&xtime, timekeeper.clock); ··· 875 846 876 847 unsigned long get_seconds(void) 877 848 { 878 - return 
xtime_cache.tv_sec; 849 + return xtime.tv_sec; 879 850 } 880 851 EXPORT_SYMBOL(get_seconds); 881 852 882 853 struct timespec __current_kernel_time(void) 883 854 { 884 - return xtime_cache; 855 + return xtime; 885 856 } 886 857 887 858 struct timespec current_kernel_time(void) ··· 891 862 892 863 do { 893 864 seq = read_seqbegin(&xtime_lock); 894 - 895 - now = xtime_cache; 865 + now = xtime; 896 866 } while (read_seqretry(&xtime_lock, seq)); 897 867 898 868 return now; ··· 905 877 906 878 do { 907 879 seq = read_seqbegin(&xtime_lock); 908 - 909 - now = xtime_cache; 880 + now = xtime; 910 881 mono = wall_to_monotonic; 911 882 } while (read_seqretry(&xtime_lock, seq)); 912 883
+6 -4
kernel/time/timer_list.c
··· 204 204 return; 205 205 } 206 206 SEQ_printf(m, "%s\n", dev->name); 207 - SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); 208 - SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); 209 - SEQ_printf(m, " mult: %lu\n", dev->mult); 210 - SEQ_printf(m, " shift: %d\n", dev->shift); 207 + SEQ_printf(m, " max_delta_ns: %llu\n", 208 + (unsigned long long) dev->max_delta_ns); 209 + SEQ_printf(m, " min_delta_ns: %llu\n", 210 + (unsigned long long) dev->min_delta_ns); 211 + SEQ_printf(m, " mult: %u\n", dev->mult); 212 + SEQ_printf(m, " shift: %u\n", dev->shift); 211 213 SEQ_printf(m, " mode: %d\n", dev->mode); 212 214 SEQ_printf(m, " next_event: %Ld nsecs\n", 213 215 (unsigned long long) ktime_to_ns(dev->next_event));