/*
 *  linux/kernel/timer.c
 *
 *  Kernel internal timers, basic process system calls
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 *  2002-05-31  Move sys_sysinfo here and make its locking sane, Robert Love
 *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
 *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
 *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
 */

#include <linux/kernel_stat.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/notifier.h>
#include <linux/thread_info.h>
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>

#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>

u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;

EXPORT_SYMBOL(jiffies_64);

/*
 * per-CPU timer vector definitions:
 */
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)

typedef struct tvec_s {
	struct list_head vec[TVN_SIZE];
} tvec_t;

typedef struct tvec_root_s {
	struct list_head vec[TVR_SIZE];
} tvec_root_t;

struct tvec_t_base_s {
	spinlock_t lock;
	struct timer_list *running_timer;
	unsigned long timer_jiffies;
	tvec_root_t tv1;
	tvec_t tv2;
	tvec_t tv3;
	tvec_t tv4;
	tvec_t tv5;
} ____cacheline_aligned;

typedef struct tvec_t_base_s tvec_base_t;

tvec_base_t boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
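
/*
 * Illustrative note (editorial, not from the original source): with the
 * default !CONFIG_BASE_SMALL geometry, TVR_BITS == 8 and TVN_BITS == 6,
 * so the cascading wheel covers:
 *
 *	tv1: jiffies [0, 256)			(256 buckets, 1 jiffy each)
 *	tv2: jiffies [256, 256*64)		(64 buckets, 256 jiffies each)
 *	tv3: jiffies [256*64, 256*64^2)		(64 buckets)
 *	tv4: jiffies [256*64^2, 256*64^3)	(64 buckets)
 *	tv5: jiffies [256*64^3, 256*64^4)	(64 buckets)
 *
 * 256 * 64^4 == 2^32, which is why internal_add_timer() below clamps very
 * distant timeouts to 0xffffffff on 64-bit machines.
 */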

/*
 * Note that all tvec_bases are 2 byte aligned and the lower bit of
 * base in timer_list is guaranteed to be zero. Use the LSB for
 * the new flag to indicate whether the timer is deferrable.
 */
#define TBASE_DEFERRABLE_FLAG		(0x1)

/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
{
	return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
}

static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
{
	return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
}

static inline void timer_set_deferrable(struct timer_list *timer)
{
	timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
				       TBASE_DEFERRABLE_FLAG));
}

static inline void
timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
{
	timer->base = (tvec_base_t *)((unsigned long)(new_base) |
				      tbase_get_deferrable(timer->base));
}

/**
 * __round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long __round_jiffies(unsigned long j, int cpu)
{
	int rem;
	unsigned long original = j;

	/*
	 * We don't want all cpus firing their timers at once hitting the
	 * same lock or cachelines, so we skew each extra cpu with an extra
	 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
	 * already did this.
	 * The skew is done by adding 3*cpunr, then rounding, then subtracting
	 * this extra offset again.
	 */
	j += cpu * 3;

	rem = j % HZ;

	/*
	 * If the target jiffy is just after a whole second (which can happen
	 * due to delays of the timer irq, long irq off times etc etc) then
	 * we should round down to the whole second, not up. Use 1/4th second
	 * as cutoff for this rounding as an extreme upper bound for this.
	 */
	if (rem < HZ/4) /* round down */
		j = j - rem;
	else /* round up */
		j = j - rem + HZ;

	/* now that we have rounded, subtract the extra skew again */
	j -= cpu * 3;

	if (j <= jiffies) /* rounding ate our timeout entirely; */
		return original;
	return j;
}
EXPORT_SYMBOL_GPL(__round_jiffies);
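
/*
 * Worked example (editorial, assuming HZ == 250 and cpu == 0): if
 * jiffies == 1000 and j == 1010, then rem == 1010 % 250 == 10, which is
 * below HZ/4 == 62, so the value rounds *down* to 1000 == jiffies and the
 * original 1010 is returned, because the rounded value would not lie in
 * the future. For j == 1100, rem == 100 >= 62, so it rounds up to 1250,
 * i.e. the next full second boundary.
 */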

/**
 * __round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
	/*
	 * In theory the following code can skip a jiffy in case jiffies
	 * increments right between the addition and the later subtraction.
	 * However since the entire point of this function is to use
	 * approximate timeouts, it's entirely ok to not handle that.
	 */
	return __round_jiffies(j + jiffies, cpu) - jiffies;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);

/**
 * round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies(unsigned long j)
{
	return __round_jiffies(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies);

/**
 * round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies_relative(unsigned long j)
{
	return __round_jiffies_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);
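
/*
 * Editorial usage sketch (not part of the original file): a driver whose
 * periodic housekeeping only needs ~second granularity can round its
 * timeout so it fires together with other per-second timers. The
 * identifiers below (housekeeping_timer, housekeeping_fn, do_housekeeping)
 * are hypothetical.
 *
 *	static struct timer_list housekeeping_timer;
 *
 *	static void housekeeping_fn(unsigned long data)
 *	{
 *		do_housekeeping();
 *		mod_timer(&housekeeping_timer,
 *			  round_jiffies(jiffies + 5 * HZ));
 *	}
 */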

static inline void set_running_timer(tvec_base_t *base,
				     struct timer_list *timer)
{
#ifdef CONFIG_SMP
	base->running_timer = timer;
#endif
}

static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
{
	unsigned long expires = timer->expires;
	unsigned long idx = expires - base->timer_jiffies;
	struct list_head *vec;

	if (idx < TVR_SIZE) {
		int i = expires & TVR_MASK;
		vec = base->tv1.vec + i;
	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
		int i = (expires >> TVR_BITS) & TVN_MASK;
		vec = base->tv2.vec + i;
	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
		vec = base->tv3.vec + i;
	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
		vec = base->tv4.vec + i;
	} else if ((signed long) idx < 0) {
		/*
		 * Can happen if you add a timer with expires == jiffies,
		 * or you set a timer to go off in the past
		 */
		vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
	} else {
		int i;
		/* If the timeout is larger than 0xffffffff on 64-bit
		 * architectures then we use the maximum timeout:
		 */
		if (idx > 0xffffffffUL) {
			idx = 0xffffffffUL;
			expires = idx + base->timer_jiffies;
		}
		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
		vec = base->tv5.vec + i;
	}
	/*
	 * Timers are FIFO:
	 */
	list_add_tail(&timer->entry, vec);
}
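
/*
 * Worked example (editorial, for the default TVR_BITS == 8, TVN_BITS == 6):
 * with base->timer_jiffies == 1000 and timer->expires == 1100, idx == 100
 * is below TVR_SIZE (256), so the timer lands in tv1 bucket
 * 1100 & 255 == 76. With expires == 10000, idx == 9000 falls in
 * [256, 16384), so it lands in tv2 bucket (10000 >> 8) & 63 == 39, and it
 * will be cascaded back into tv1 once timer_jiffies catches up to that
 * range.
 */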

#ifdef CONFIG_TIMER_STATS
void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
{
	if (timer->start_site)
		return;

	timer->start_site = addr;
	memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
	timer->start_pid = current->pid;
}
#endif

/**
 * init_timer - initialize a timer.
 * @timer: the timer to be initialized
 *
 * init_timer() must be done to a timer prior to calling *any* of the
 * other timer functions.
 */
void fastcall init_timer(struct timer_list *timer)
{
	timer->entry.next = NULL;
	timer->base = __raw_get_cpu_var(tvec_bases);
#ifdef CONFIG_TIMER_STATS
	timer->start_site = NULL;
	timer->start_pid = -1;
	memset(timer->start_comm, 0, TASK_COMM_LEN);
#endif
}
EXPORT_SYMBOL(init_timer);

void fastcall init_timer_deferrable(struct timer_list *timer)
{
	init_timer(timer);
	timer_set_deferrable(timer);
}
EXPORT_SYMBOL(init_timer_deferrable);

static inline void detach_timer(struct timer_list *timer,
				int clear_pending)
{
	struct list_head *entry = &timer->entry;

	__list_del(entry->prev, entry->next);
	if (clear_pending)
		entry->next = NULL;
	entry->prev = LIST_POISON2;
}

/*
 * We are using hashed locking: holding per_cpu(tvec_bases).lock
 * means that all timers which are tied to this base via timer->base are
 * locked, and the base itself is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found on ->tvX lists.
 *
 * When the timer's base is locked, and the timer removed from list, it is
 * possible to set timer->base = NULL and drop the lock: the timer remains
 * locked.
 */
static tvec_base_t *lock_timer_base(struct timer_list *timer,
				    unsigned long *flags)
	__acquires(timer->base->lock)
{
	tvec_base_t *base;

	for (;;) {
		tvec_base_t *prelock_base = timer->base;
		base = tbase_get_base(prelock_base);
		if (likely(base != NULL)) {
			spin_lock_irqsave(&base->lock, *flags);
			if (likely(prelock_base == timer->base))
				return base;
			/* The timer has migrated to another CPU */
			spin_unlock_irqrestore(&base->lock, *flags);
		}
		cpu_relax();
	}
}

int __mod_timer(struct timer_list *timer, unsigned long expires)
{
	tvec_base_t *base, *new_base;
	unsigned long flags;
	int ret = 0;

	timer_stats_timer_set_start_info(timer);
	BUG_ON(!timer->function);

	base = lock_timer_base(timer, &flags);

	if (timer_pending(timer)) {
		detach_timer(timer, 0);
		ret = 1;
	}

	new_base = __get_cpu_var(tvec_bases);

	if (base != new_base) {
		/*
		 * We are trying to schedule the timer on the local CPU.
		 * However we can't change the timer's base while it is
		 * running, otherwise del_timer_sync() can't detect that the
		 * timer's handler has not finished yet. This also guarantees
		 * that the timer is serialized wrt itself.
		 */
		if (likely(base->running_timer != timer)) {
			/* See the comment in lock_timer_base() */
			timer_set_base(timer, NULL);
			spin_unlock(&base->lock);
			base = new_base;
			spin_lock(&base->lock);
			timer_set_base(timer, base);
		}
	}

	timer->expires = expires;
	internal_add_timer(base, timer);
	spin_unlock_irqrestore(&base->lock, flags);

	return ret;
}

EXPORT_SYMBOL(__mod_timer);

/**
 * add_timer_on - start a timer on a particular CPU
 * @timer: the timer to be added
 * @cpu: the CPU to start it on
 *
 * This is not very scalable on SMP. Double adds are not possible.
 */
void add_timer_on(struct timer_list *timer, int cpu)
{
	tvec_base_t *base = per_cpu(tvec_bases, cpu);
	unsigned long flags;

	timer_stats_timer_set_start_info(timer);
	BUG_ON(timer_pending(timer) || !timer->function);
	spin_lock_irqsave(&base->lock, flags);
	timer_set_base(timer, base);
	internal_add_timer(base, timer);
	spin_unlock_irqrestore(&base->lock, flags);
}


/**
 * mod_timer - modify a timer's timeout
 * @timer: the timer to be modified
 * @expires: new timeout in jiffies
 *
 * mod_timer() is a more efficient way to update the expire field of an
 * active timer (if the timer is inactive it will be activated)
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     del_timer(timer); timer->expires = expires; add_timer(timer);
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.
 *
 * The function returns whether it has modified a pending timer or not.
 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
 * active timer returns 1.)
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
	BUG_ON(!timer->function);

	timer_stats_timer_set_start_info(timer);
	/*
	 * This is a common optimization triggered by the
	 * networking code - if the timer is re-modified
	 * to be the same thing then just return:
	 */
	if (timer->expires == expires && timer_pending(timer))
		return 1;

	return __mod_timer(timer, expires);
}

EXPORT_SYMBOL(mod_timer);
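
/*
 * Editorial usage sketch (not part of the original file): the typical
 * lifecycle of a timer armed with mod_timer() and torn down with
 * del_timer_sync(). The identifiers (my_watchdog, my_watchdog_fn,
 * handle_watchdog_timeout) are hypothetical.
 *
 *	static struct timer_list my_watchdog;
 *
 *	static void my_watchdog_fn(unsigned long data)
 *	{
 *		handle_watchdog_timeout(data);
 *	}
 *
 *	// arm (or push back) the watchdog to one second from now:
 *	setup_timer(&my_watchdog, my_watchdog_fn, 0);
 *	mod_timer(&my_watchdog, jiffies + HZ);
 *
 *	// on teardown, make sure the handler is not running anywhere:
 *	del_timer_sync(&my_watchdog);
 */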

/**
 * del_timer - deactivate a timer.
 * @timer: the timer to be deactivated
 *
 * del_timer() deactivates a timer - this works on both active and inactive
 * timers.
 *
 * The function returns whether it has deactivated a pending timer or not.
 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
 * active timer returns 1.)
 */
int del_timer(struct timer_list *timer)
{
	tvec_base_t *base;
	unsigned long flags;
	int ret = 0;

	timer_stats_timer_clear_start_info(timer);
	if (timer_pending(timer)) {
		base = lock_timer_base(timer, &flags);
		if (timer_pending(timer)) {
			detach_timer(timer, 1);
			ret = 1;
		}
		spin_unlock_irqrestore(&base->lock, flags);
	}

	return ret;
}

EXPORT_SYMBOL(del_timer);

#ifdef CONFIG_SMP
/**
 * try_to_del_timer_sync - Try to deactivate a timer
 * @timer: the timer to deactivate
 *
 * This function tries to deactivate a timer. Upon successful (ret >= 0)
 * exit the timer is not queued and the handler is not running on any CPU.
 *
 * It must not be called from interrupt contexts.
 */
int try_to_del_timer_sync(struct timer_list *timer)
{
	tvec_base_t *base;
	unsigned long flags;
	int ret = -1;

	base = lock_timer_base(timer, &flags);

	if (base->running_timer == timer)
		goto out;

	ret = 0;
	if (timer_pending(timer)) {
		detach_timer(timer, 1);
		ret = 1;
	}
out:
	spin_unlock_irqrestore(&base->lock, flags);

	return ret;
}

EXPORT_SYMBOL(try_to_del_timer_sync);

/**
 * del_timer_sync - deactivate a timer and wait for the handler to finish.
 * @timer: the timer to be deactivated
 *
 * This function only differs from del_timer() on SMP: besides deactivating
 * the timer it also makes sure the handler has finished executing on other
 * CPUs.
 *
 * Synchronization rules: Callers must prevent restarting of the timer,
 * otherwise this function is meaningless. It must not be called from
 * interrupt contexts. The caller must not hold locks which would prevent
 * completion of the timer's handler. The timer's handler must not call
 * add_timer_on(). Upon exit the timer is not queued and the handler is
 * not running on any CPU.
 *
 * The function returns whether it has deactivated a pending timer or not.
 */
int del_timer_sync(struct timer_list *timer)
{
	for (;;) {
		int ret = try_to_del_timer_sync(timer);
		if (ret >= 0)
			return ret;
		cpu_relax();
	}
}

EXPORT_SYMBOL(del_timer_sync);
#endif

static int cascade(tvec_base_t *base, tvec_t *tv, int index)
{
	/* cascade all the timers from tv up one level */
	struct timer_list *timer, *tmp;
	struct list_head tv_list;

	list_replace_init(tv->vec + index, &tv_list);

	/*
	 * We are removing _all_ timers from the list, so we
	 * don't have to detach them individually.
	 */
	list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
		BUG_ON(tbase_get_base(timer->base) != base);
		internal_add_timer(base, timer);
	}

	return index;
}

#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
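
/*
 * Editorial note on the cascade cadence (default geometry): the tv1 index
 * in __run_timers() below wraps to 0 every TVR_SIZE == 256 jiffies, at
 * which point one tv2 bucket (INDEX(0)) is cascaded down into tv1. tv2's
 * own index in turn wraps every 256 * 64 == 16384 jiffies, triggering a
 * tv3 cascade, and so on for tv4 and tv5 - the higher levels are touched
 * exponentially less often.
 */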

/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
 *
 * This function cascades all vectors and executes all expired timer
 * vectors.
 */
static inline void __run_timers(tvec_base_t *base)
{
	struct timer_list *timer;

	spin_lock_irq(&base->lock);
	while (time_after_eq(jiffies, base->timer_jiffies)) {
		struct list_head work_list;
		struct list_head *head = &work_list;
		int index = base->timer_jiffies & TVR_MASK;

		/*
		 * Cascade timers:
		 */
		if (!index &&
			(!cascade(base, &base->tv2, INDEX(0))) &&
				(!cascade(base, &base->tv3, INDEX(1))) &&
					!cascade(base, &base->tv4, INDEX(2)))
			cascade(base, &base->tv5, INDEX(3));
		++base->timer_jiffies;
		list_replace_init(base->tv1.vec + index, &work_list);
		while (!list_empty(head)) {
			void (*fn)(unsigned long);
			unsigned long data;

			timer = list_first_entry(head, struct timer_list, entry);
			fn = timer->function;
			data = timer->data;

			timer_stats_account_timer(timer);

			set_running_timer(base, timer);
			detach_timer(timer, 1);
			spin_unlock_irq(&base->lock);
			{
				int preempt_count = preempt_count();
				fn(data);
				if (preempt_count != preempt_count()) {
					printk(KERN_WARNING "huh, entered %p "
					       "with preempt_count %08x, exited"
					       " with %08x?\n",
					       fn, preempt_count,
					       preempt_count());
					BUG();
				}
			}
			spin_lock_irq(&base->lock);
		}
	}
	set_running_timer(base, NULL);
	spin_unlock_irq(&base->lock);
}

#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
/*
 * Find out when the next timer event is due to happen. This
 * is used on S/390 to stop all activity when a CPU is idle.
 * This function needs to be called with interrupts disabled.
 */
static unsigned long __next_timer_interrupt(tvec_base_t *base)
{
	unsigned long timer_jiffies = base->timer_jiffies;
	unsigned long expires = timer_jiffies + (LONG_MAX >> 1);
	int index, slot, array, found = 0;
	struct timer_list *nte;
	tvec_t *varray[4];

	/* Look for timer events in tv1. */
	index = slot = timer_jiffies & TVR_MASK;
	do {
		list_for_each_entry(nte, base->tv1.vec + slot, entry) {
			if (tbase_get_deferrable(nte->base))
				continue;

			found = 1;
			expires = nte->expires;
			/* Look at the cascade bucket(s)? */
			if (!index || slot < index)
				goto cascade;
			return expires;
		}
		slot = (slot + 1) & TVR_MASK;
	} while (slot != index);

cascade:
	/* Calculate the next cascade event */
	if (index)
		timer_jiffies += TVR_SIZE - index;
	timer_jiffies >>= TVR_BITS;

	/* Check tv2-tv5. */
	varray[0] = &base->tv2;
	varray[1] = &base->tv3;
	varray[2] = &base->tv4;
	varray[3] = &base->tv5;

	for (array = 0; array < 4; array++) {
		tvec_t *varp = varray[array];

		index = slot = timer_jiffies & TVN_MASK;
		do {
			list_for_each_entry(nte, varp->vec + slot, entry) {
				found = 1;
				if (time_before(nte->expires, expires))
					expires = nte->expires;
			}
			/*
			 * Do we still search for the first timer or are
			 * we looking up the cascade buckets ?
			 */
			if (found) {
				/* Look at the cascade bucket(s)? */
				if (!index || slot < index)
					break;
				return expires;
			}
			slot = (slot + 1) & TVN_MASK;
		} while (slot != index);

		if (index)
			timer_jiffies += TVN_SIZE - index;
		timer_jiffies >>= TVN_BITS;
	}
	return expires;
}

/*
 * Check, if the next hrtimer event is before the next timer wheel
 * event:
 */
static unsigned long cmp_next_hrtimer_event(unsigned long now,
					    unsigned long expires)
{
	ktime_t hr_delta = hrtimer_get_next_event();
	struct timespec tsdelta;
	unsigned long delta;

	if (hr_delta.tv64 == KTIME_MAX)
		return expires;

	/*
	 * Expired timer available, let it expire in the next tick
	 */
	if (hr_delta.tv64 <= 0)
		return now + 1;

	tsdelta = ktime_to_timespec(hr_delta);
	delta = timespec_to_jiffies(&tsdelta);
	/*
	 * Take rounding errors into account and make sure that it
	 * expires in the next tick. Otherwise we go into an endless
	 * ping pong due to tick_nohz_stop_sched_tick() retriggering
	 * the timer softirq
	 */
	if (delta < 1)
		delta = 1;
	now += delta;
	if (time_before(now, expires))
		return now;
	return expires;
}

/**
 * get_next_timer_interrupt - return the jiffy of the next pending timer
 * @now: current time (in jiffies)
 */
unsigned long get_next_timer_interrupt(unsigned long now)
{
	tvec_base_t *base = __get_cpu_var(tvec_bases);
	unsigned long expires;

	spin_lock(&base->lock);
	expires = __next_timer_interrupt(base);
	spin_unlock(&base->lock);

	if (time_before_eq(expires, now))
		return now;

	return cmp_next_hrtimer_event(now, expires);
}

#ifdef CONFIG_NO_IDLE_HZ
unsigned long next_timer_interrupt(void)
{
	return get_next_timer_interrupt(jiffies);
}
#endif

#endif

/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process. user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;
	int cpu = smp_processor_id();

	/* Note: this timer irq context must be accounted for as well. */
	if (user_tick)
		account_user_time(p, jiffies_to_cputime(1));
	else
		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
	run_local_timers();
	if (rcu_pending(cpu))
		rcu_check_callbacks(cpu, user_tick);
	scheduler_tick();
	run_posix_cpu_timers(p);
}

/*
 * Nr of active tasks - counted in fixed-point numbers
 */
static unsigned long count_active_tasks(void)
{
	return nr_active() * FIXED_1;
}

/*
 * Hmm.. Changed this, as the GNU make sources (load.c) seem to
 * imply that avenrun[] is the standard name for this kind of thing.
 * Nothing else seems to be standardized: the fractional size etc
 * all seem to differ on different machines.
 *
 * Requires xtime_lock to access.
 */
unsigned long avenrun[3];

EXPORT_SYMBOL(avenrun);

/*
 * calc_load - given tick count, update the avenrun load estimates.
 * This is called while holding a write_lock on xtime_lock.
 */
static inline void calc_load(unsigned long ticks)
{
	unsigned long active_tasks; /* fixed-point */
	static int count = LOAD_FREQ;

	count -= ticks;
	if (unlikely(count < 0)) {
		active_tasks = count_active_tasks();
		do {
			CALC_LOAD(avenrun[0], EXP_1, active_tasks);
			CALC_LOAD(avenrun[1], EXP_5, active_tasks);
			CALC_LOAD(avenrun[2], EXP_15, active_tasks);
			count += LOAD_FREQ;
		} while (count < 0);
	}
}
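
/*
 * Editorial note on the fixed-point format: avenrun[] holds the load
 * averages scaled by FIXED_1 == 1 << FSHIFT (2048 with FSHIFT == 11).
 * CALC_LOAD() computes an exponentially weighted moving average,
 * load = load * exp + active * (1 - exp), in that fixed-point form.
 * For example, a stored value of 5120 corresponds to a load average of
 * 5120 / 2048 = 2.50, which is how LOAD_INT()/LOAD_FRAC() present it in
 * /proc/loadavg.
 */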

/*
 * This function runs timers and the timer-tq in bottom half context.
 */
static void run_timer_softirq(struct softirq_action *h)
{
	tvec_base_t *base = __get_cpu_var(tvec_bases);

	hrtimer_run_queues();

	if (time_after_eq(jiffies, base->timer_jiffies))
		__run_timers(base);
}

/*
 * Called by the local, per-CPU timer interrupt on SMP.
 */
void run_local_timers(void)
{
	raise_softirq(TIMER_SOFTIRQ);
	softlockup_tick();
}

/*
 * Called by the timer interrupt. xtime_lock must already be taken
 * by the timer IRQ!
 */
static inline void update_times(unsigned long ticks)
{
	update_wall_time();
	calc_load(ticks);
}

/*
 * The 64-bit jiffies value is not atomic - you MUST NOT read it
 * without sampling the sequence number in xtime_lock.
 * jiffies is defined in the linker script...
 */

void do_timer(unsigned long ticks)
{
	jiffies_64 += ticks;
	update_times(ticks);
}

#ifdef __ARCH_WANT_SYS_ALARM

/*
 * For backwards compatibility?  This can be done in libc so Alpha
 * and all newer ports shouldn't need it.
 */
asmlinkage unsigned long sys_alarm(unsigned int seconds)
{
	return alarm_setitimer(seconds);
}

#endif

#ifndef __alpha__

/*
 * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
 * should be moved into arch/i386 instead?
 */

/**
 * sys_getpid - return the thread group id of the current process
 *
 * Note, despite the name, this returns the tgid not the pid.  The tgid and
 * the pid are identical unless CLONE_THREAD was specified on clone() in
 * which case the tgid is the same in all threads of the same group.
 *
 * This is SMP safe as current->tgid does not change.
 */
asmlinkage long sys_getpid(void)
{
	return current->tgid;
}

/*
 * Accessing ->real_parent is not SMP-safe, it could
 * change from under us. However, we can use a stale
 * value of ->real_parent under rcu_read_lock(), see
 * release_task()->call_rcu(delayed_put_task_struct).
 */
asmlinkage long sys_getppid(void)
{
	int pid;

	rcu_read_lock();
	pid = rcu_dereference(current->real_parent)->tgid;
	rcu_read_unlock();

	return pid;
}

asmlinkage long sys_getuid(void)
{
	/* Only we change this so SMP safe */
	return current->uid;
}

asmlinkage long sys_geteuid(void)
{
	/* Only we change this so SMP safe */
	return current->euid;
}

asmlinkage long sys_getgid(void)
{
	/* Only we change this so SMP safe */
	return current->gid;
}

asmlinkage long sys_getegid(void)
{
	/* Only we change this so SMP safe */
	return current->egid;
}

#endif

static void process_timeout(unsigned long __data)
{
	wake_up_process((struct task_struct *)__data);
}

/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Make the current task sleep until @timeout jiffies have
 * elapsed. The routine will return immediately unless
 * the current task state has been set (see set_current_state()).
 *
 * You can set the task state as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 * pass before the routine returns. The routine will return 0.
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task. In this case the remaining time
 * in jiffies will be returned, or 0 if the timer expired in time.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 * the CPU away without a bound on the timeout. In this case the return
 * value will be %MAX_SCHEDULE_TIMEOUT.
 *
 * In all cases the return value is guaranteed to be non-negative.
 */
fastcall signed long __sched schedule_timeout(signed long timeout)
{
	struct timer_list timer;
	unsigned long expire;

	switch (timeout)
	{
	case MAX_SCHEDULE_TIMEOUT:
		/*
		 * These two special cases are useful to be comfortable
		 * in the caller. Nothing more. We could take
		 * MAX_SCHEDULE_TIMEOUT from one of the negative values
		 * but I'd like to return a valid offset (>=0) to allow
		 * the caller to do everything it wants with the retval.
		 */
		schedule();
		goto out;
	default:
		/*
		 * Another bit of PARANOID. Note that the retval will be
		 * 0 since no piece of kernel is supposed to do a check
		 * for a negative retval of schedule_timeout() (since it
		 * should never happen anyway). You just have the printk()
		 * that will tell you if something has gone wrong and where.
		 */
		if (timeout < 0) {
			printk(KERN_ERR "schedule_timeout: wrong timeout "
				"value %lx\n", timeout);
			dump_stack();
			current->state = TASK_RUNNING;
			goto out;
		}
	}

	expire = timeout + jiffies;

	setup_timer(&timer, process_timeout, (unsigned long)current);
	__mod_timer(&timer, expire);
	schedule();
	del_singleshot_timer_sync(&timer);

	timeout = expire - jiffies;

 out:
	return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);
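
/*
 * Editorial usage sketch (not part of the original file): callers must set
 * the task state themselves before calling schedule_timeout(), e.g. to
 * wait interruptibly for up to 100ms:
 *
 *	signed long remaining;
 *
 *	set_current_state(TASK_INTERRUPTIBLE);
 *	remaining = schedule_timeout(msecs_to_jiffies(100));
 *	if (remaining)
 *		// woken early, by a signal or an explicit wake_up
 *		handle_early_wakeup();	// hypothetical helper
 *
 * The two wrappers below fold the set_current_state() call in.
 */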

/*
 * We can use __set_current_state() here because schedule_timeout() calls
 * schedule() unconditionally.
 */
signed long __sched schedule_timeout_interruptible(signed long timeout)
{
	__set_current_state(TASK_INTERRUPTIBLE);
	return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_interruptible);

signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
	__set_current_state(TASK_UNINTERRUPTIBLE);
	return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);

/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid(void)
{
	return current->pid;
}

/**
 * do_sysinfo - fill in sysinfo struct
 * @info: pointer to buffer to fill
 */
int do_sysinfo(struct sysinfo *info)
{
	unsigned long mem_total, sav_total;
	unsigned int mem_unit, bitcount;
	unsigned long seq;

	memset(info, 0, sizeof(struct sysinfo));

	do {
		struct timespec tp;
		seq = read_seqbegin(&xtime_lock);

		/*
		 * This is annoying. The below is the same thing
		 * posix_get_clock_monotonic() does, but it wants to take
		 * the lock, while we want the same lock to also cover the
		 * load-average reads below.
		 */

		getnstimeofday(&tp);
		tp.tv_sec += wall_to_monotonic.tv_sec;
		tp.tv_nsec += wall_to_monotonic.tv_nsec;
		if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
			tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
			tp.tv_sec++;
		}
		info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);

		info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
		info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
		info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);

		info->procs = nr_threads;
	} while (read_seqretry(&xtime_lock, seq));

	si_meminfo(info);
	si_swapinfo(info);

	/*
	 * If the sum of all the available memory (i.e. ram + swap)
	 * is less than can be stored in a 32 bit unsigned long then
	 * we can be binary compatible with 2.2.x kernels.  If not,
	 * well, in that case 2.2.x was broken anyways...
	 *
	 *  -Erik Andersen <andersee@debian.org>
	 */

	mem_total = info->totalram + info->totalswap;
	if (mem_total < info->totalram || mem_total < info->totalswap)
		goto out;
	bitcount = 0;
	mem_unit = info->mem_unit;
	while (mem_unit > 1) {
		bitcount++;
		mem_unit >>= 1;
		sav_total = mem_total;
		mem_total <<= 1;
		if (mem_total < sav_total)
			goto out;
	}

	/*
	 * If mem_total did not overflow, multiply all memory values by
	 * info->mem_unit and set it to 1.  This leaves things compatible
	 * with 2.2.x, and also retains compatibility with earlier 2.4.x
	 * kernels...
	 */

	info->mem_unit = 1;
	info->totalram <<= bitcount;
	info->freeram <<= bitcount;
	info->sharedram <<= bitcount;
	info->bufferram <<= bitcount;
	info->totalswap <<= bitcount;
	info->freeswap <<= bitcount;
	info->totalhigh <<= bitcount;
	info->freehigh <<= bitcount;

out:
	return 0;
}

asmlinkage long sys_sysinfo(struct sysinfo __user *info)
{
	struct sysinfo val;

	do_sysinfo(&val);

	if (copy_to_user(info, &val, sizeof(struct sysinfo)))
		return -EFAULT;

	return 0;
}

/*
 * lockdep: we want to track each per-CPU base as a separate lock-class,
 * but timer-bases are kmalloc()-ed, so we need to attach separate
 * keys to them:
 */
static struct lock_class_key base_lock_keys[NR_CPUS];

static int __devinit init_timers_cpu(int cpu)
{
	int j;
	tvec_base_t *base;
	static char __devinitdata tvec_base_done[NR_CPUS];

	if (!tvec_base_done[cpu]) {
		static char boot_done;

		if (boot_done) {
			/*
			 * The APs use this path later in boot
			 */
			base = kmalloc_node(sizeof(*base), GFP_KERNEL,
					    cpu_to_node(cpu));
			if (!base)
				return -ENOMEM;

			/* Make sure that tvec_base is 2 byte aligned */
			if (tbase_get_deferrable(base)) {
				WARN_ON(1);
				kfree(base);
				return -ENOMEM;
			}
			memset(base, 0, sizeof(*base));
			per_cpu(tvec_bases, cpu) = base;
		} else {
			/*
			 * This is for the boot CPU - we use compile-time
			 * static initialisation because per-cpu memory isn't
			 * ready yet and because the memory allocators are not
			 * initialised either.
			 */
			boot_done = 1;
			base = &boot_tvec_bases;
		}
		tvec_base_done[cpu] = 1;
	} else {
		base = per_cpu(tvec_bases, cpu);
	}

	spin_lock_init(&base->lock);
	lockdep_set_class(&base->lock, base_lock_keys + cpu);

	for (j = 0; j < TVN_SIZE; j++) {
		INIT_LIST_HEAD(base->tv5.vec + j);
		INIT_LIST_HEAD(base->tv4.vec + j);
		INIT_LIST_HEAD(base->tv3.vec + j);
		INIT_LIST_HEAD(base->tv2.vec + j);
	}
	for (j = 0; j < TVR_SIZE; j++)
		INIT_LIST_HEAD(base->tv1.vec + j);

	base->timer_jiffies = jiffies;
	return 0;
}

#ifdef CONFIG_HOTPLUG_CPU
static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
{
	struct timer_list *timer;

	while (!list_empty(head)) {
		timer = list_first_entry(head, struct timer_list, entry);
		detach_timer(timer, 0);
		timer_set_base(timer, new_base);
		internal_add_timer(new_base, timer);
	}
}

static void __devinit migrate_timers(int cpu)
{
	tvec_base_t *old_base;
	tvec_base_t *new_base;
	int i;

	BUG_ON(cpu_online(cpu));
	old_base = per_cpu(tvec_bases, cpu);
	new_base = get_cpu_var(tvec_bases);

	local_irq_disable();
	double_spin_lock(&new_base->lock, &old_base->lock,
			 smp_processor_id() < cpu);

	BUG_ON(old_base->running_timer);

	for (i = 0; i < TVR_SIZE; i++)
		migrate_timer_list(new_base, old_base->tv1.vec + i);
	for (i = 0; i < TVN_SIZE; i++) {
		migrate_timer_list(new_base, old_base->tv2.vec + i);
		migrate_timer_list(new_base, old_base->tv3.vec + i);
		migrate_timer_list(new_base, old_base->tv4.vec + i);
		migrate_timer_list(new_base, old_base->tv5.vec + i);
	}

	double_spin_unlock(&new_base->lock, &old_base->lock,
			   smp_processor_id() < cpu);
	local_irq_enable();
	put_cpu_var(tvec_bases);
}
#endif /* CONFIG_HOTPLUG_CPU */

static int __cpuinit timer_cpu_notify(struct notifier_block *self,
				      unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;
	switch(action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		if (init_timers_cpu(cpu) < 0)
			return NOTIFY_BAD;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		migrate_timers(cpu);
		break;
#endif
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata timers_nb = {
	.notifier_call	= timer_cpu_notify,
};


void __init init_timers(void)
{
	int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
				   (void *)(long)smp_processor_id());

	init_timer_stats();

	BUG_ON(err == NOTIFY_BAD);
	register_cpu_notifier(&timers_nb);
	open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
}

#ifdef CONFIG_TIME_INTERPOLATION

struct time_interpolator *time_interpolator __read_mostly;
static struct time_interpolator *time_interpolator_list __read_mostly;
static DEFINE_SPINLOCK(time_interpolator_lock);

static inline cycles_t time_interpolator_get_cycles(unsigned int src)
{
	unsigned long (*x)(void);

	switch (src)
	{
	case TIME_SOURCE_FUNCTION:
		x = time_interpolator->addr;
		return x();

	case TIME_SOURCE_MMIO64:
		return readq_relaxed((void __iomem *)time_interpolator->addr);

	case TIME_SOURCE_MMIO32:
		return readl_relaxed((void __iomem *)time_interpolator->addr);

	default:
		return get_cycles();
	}
}

static inline u64 time_interpolator_get_counter(int writelock)
{
	unsigned int src = time_interpolator->source;

	if (time_interpolator->jitter)
	{
		cycles_t lcycle;
		cycles_t now;

		do {
			lcycle = time_interpolator->last_cycle;
			now = time_interpolator_get_cycles(src);
			if (lcycle && time_after(lcycle, now))
				return lcycle;

			/*
			 * When holding the xtime write lock, there's no need
			 * to add the overhead of the cmpxchg. Readers are
			 * forced to retry until the write lock is released.
			 */
			if (writelock) {
				time_interpolator->last_cycle = now;
				return now;
			}
			/*
			 * Keep track of the last timer value returned.
			 * The use of cmpxchg here will cause contention
			 * in an SMP environment.
			 */
		} while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
		return now;
	}
	else
		return time_interpolator_get_cycles(src);
}

void time_interpolator_reset(void)
{
	time_interpolator->offset = 0;
	time_interpolator->last_counter = time_interpolator_get_counter(1);
}

#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)

unsigned long time_interpolator_get_offset(void)
{
	/* If we do not have a time interpolator set up then just return zero */
	if (!time_interpolator)
		return 0;

	return time_interpolator->offset +
		GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
}

#define INTERPOLATOR_ADJUST 65536
#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST

void time_interpolator_update(long delta_nsec)
{
	u64 counter;
	unsigned long offset;

	/* If there is no time interpolator set up then do nothing */
	if (!time_interpolator)
		return;

	/*
	 * The interpolator compensates for late ticks by accumulating the late
	 * time in time_interpolator->offset. A tick earlier than expected will
	 * lead to a reset of the offset and a corresponding jump of the clock
	 * forward. Again this only works if the interpolator clock is running
	 * slightly slower than the regular clock and the tuning logic ensures
	 * that.
	 */

	counter = time_interpolator_get_counter(1);
	offset = time_interpolator->offset +
			GET_TI_NSECS(counter, time_interpolator);

	if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
		time_interpolator->offset = offset - delta_nsec;
	else {
		time_interpolator->skips++;
		time_interpolator->ns_skipped += delta_nsec - offset;
		time_interpolator->offset = 0;
	}
	time_interpolator->last_counter = counter;

	/*
	 * Tuning logic for time interpolator invoked every minute or so.
	 * Decrease interpolator clock speed if no skips occurred and an
	 * offset is carried. Increase interpolator clock speed if we skip
	 * too much time.
	 */
	if (jiffies % INTERPOLATOR_ADJUST == 0)
	{
		if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
			time_interpolator->nsec_per_cyc--;
		if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
			time_interpolator->nsec_per_cyc++;
		time_interpolator->skips = 0;
		time_interpolator->ns_skipped = 0;
	}
}

static inline int
is_better_time_interpolator(struct time_interpolator *new)
{
	if (!time_interpolator)
		return 1;
	return new->frequency > 2*time_interpolator->frequency ||
	    (unsigned long)new->drift < (unsigned long)time_interpolator->drift;
}

void
register_time_interpolator(struct time_interpolator *ti)
{
	unsigned long flags;

	/* Sanity check */
	BUG_ON(ti->frequency == 0 || ti->mask == 0);

	ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
	spin_lock(&time_interpolator_lock);
	write_seqlock_irqsave(&xtime_lock, flags);
	if (is_better_time_interpolator(ti)) {
		time_interpolator = ti;
		time_interpolator_reset();
	}
	write_sequnlock_irqrestore(&xtime_lock, flags);

	ti->next = time_interpolator_list;
	time_interpolator_list = ti;
	spin_unlock(&time_interpolator_lock);
}

void
unregister_time_interpolator(struct time_interpolator *ti)
{
	struct time_interpolator *curr, **prev;
	unsigned long flags;

	spin_lock(&time_interpolator_lock);
	prev = &time_interpolator_list;
	for (curr = *prev; curr; curr = curr->next) {
		if (curr == ti) {
			*prev = curr->next;
			break;
		}
		prev = &curr->next;
	}

	write_seqlock_irqsave(&xtime_lock, flags);
	if (ti == time_interpolator) {
		/* we lost the best time-interpolator: */
		time_interpolator = NULL;
		/* find the next-best interpolator */
		for (curr = time_interpolator_list; curr; curr = curr->next)
			if (is_better_time_interpolator(curr))
				time_interpolator = curr;
		time_interpolator_reset();
	}
	write_sequnlock_irqrestore(&xtime_lock, flags);
	spin_unlock(&time_interpolator_lock);
}
#endif /* CONFIG_TIME_INTERPOLATION */

/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs: Time in milliseconds to sleep for
 */
void msleep(unsigned int msecs)
{
	unsigned long timeout = msecs_to_jiffies(msecs) + 1;

	while (timeout)
		timeout = schedule_timeout_uninterruptible(timeout);
}

EXPORT_SYMBOL(msleep);

/**
 * msleep_interruptible - sleep waiting for signals
 * @msecs: Time in milliseconds to sleep for
 */
unsigned long msleep_interruptible(unsigned int msecs)
{
	unsigned long timeout = msecs_to_jiffies(msecs) + 1;

	while (timeout && !signal_pending(current))
		timeout = schedule_timeout_interruptible(timeout);
	return jiffies_to_msecs(timeout);
}

EXPORT_SYMBOL(msleep_interruptible);
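
/*
 * Editorial usage sketch (not part of the original file): unlike msleep(),
 * msleep_interruptible() can return early, reporting how much of the
 * requested delay was left when a signal arrived:
 *
 *	unsigned long left = msleep_interruptible(500);
 *
 *	if (left)
 *		// a signal cut the 500ms sleep short; 'left' ms remained
 *		return -ERESTARTSYS;	// one common way to handle it
 */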