Merge branch 'timers-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'timers-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
hrtimer: prevent migration of per CPU hrtimers
hrtimer: mark migration state
hrtimer: fix migration of CB_IRQSAFE_NO_SOFTIRQ hrtimers
hrtimer: migrate pending list on cpu offline

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

5 files changed, +103 -18
+14 -4
include/linux/hrtimer.h
···
  * HRTIMER_CB_IRQSAFE:            Callback may run in hardirq context
  * HRTIMER_CB_IRQSAFE_NO_RESTART: Callback may run in hardirq context and
  *                                does not restart the timer
- * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: Callback must run in hardirq context
- *                                Special mode for tick emultation
+ * HRTIMER_CB_IRQSAFE_PERCPU:     Callback must run in hardirq context
+ *                                Special mode for tick emulation and
+ *                                scheduler timer. Such timers are per
+ *                                cpu and not allowed to be migrated on
+ *                                cpu unplug.
+ * HRTIMER_CB_IRQSAFE_UNLOCKED:   Callback should run in hardirq context
+ *                                with timer->base lock unlocked
+ *                                used for timers which call wakeup to
+ *                                avoid lock order problems with rq->lock
  */
 enum hrtimer_cb_mode {
 	HRTIMER_CB_SOFTIRQ,
 	HRTIMER_CB_IRQSAFE,
 	HRTIMER_CB_IRQSAFE_NO_RESTART,
-	HRTIMER_CB_IRQSAFE_NO_SOFTIRQ,
+	HRTIMER_CB_IRQSAFE_PERCPU,
+	HRTIMER_CB_IRQSAFE_UNLOCKED,
 };
···
  * 0x02 callback function running
  * 0x04 callback pending (high resolution mode)
  *
- * Special case:
+ * Special cases:
  * 0x03 callback function running and enqueued
  *      (was requeued on another CPU)
+ * 0x09 timer was migrated on CPU hotunplug
  * The "callback function running and enqueued" status is only possible on
  * SMP. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
···
 #define HRTIMER_STATE_ENQUEUED 0x01
 #define HRTIMER_STATE_CALLBACK 0x02
 #define HRTIMER_STATE_PENDING  0x04
+#define HRTIMER_STATE_MIGRATE  0x08
 
 /**
  * struct hrtimer - the basic hrtimer structure
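
As a minimal sketch (not taken from this merge; the timer name, callback and
the 1 ms period below are made up for illustration), a caller picks one of the
two new callback modes at init time in the same way the users converted in the
diffs further down do:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_fn(struct hrtimer *t)
{
	/*
	 * Runs in hardirq context. With HRTIMER_CB_IRQSAFE_PERCPU the
	 * timer must be canceled before its CPU goes offline; it is
	 * never migrated by the hotplug code.
	 */
	return HRTIMER_NORESTART;
}

static void example_setup(void)
{
	hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	example_timer.function = example_fn;
	/*
	 * HRTIMER_CB_IRQSAFE_PERCPU: strictly per-cpu timers such as the
	 * tick emulation and the scheduler hrtick.
	 * HRTIMER_CB_IRQSAFE_UNLOCKED: timers whose callback does a wakeup
	 * and therefore must run with the timer base lock unlocked
	 * (hrtimer sleeper, rt_period_timer).
	 */
	example_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
	hrtimer_start(&example_timer, ns_to_ktime(1000000), HRTIMER_MODE_REL);
}
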
+85 -10
kernel/hrtimer.c
···
 		 */
 		BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
 		return 1;
-	case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
+	case HRTIMER_CB_IRQSAFE_PERCPU:
+	case HRTIMER_CB_IRQSAFE_UNLOCKED:
 		/*
 		 * This is solely for the sched tick emulation with
 		 * dynamic tick support to ensure that we do not
 		 * restart the tick right on the edge and end up with
 		 * the tick timer in the softirq ! The calling site
-		 * takes care of this.
+		 * takes care of this. Also used for hrtimer sleeper !
 		 */
 		debug_hrtimer_deactivate(timer);
 		return 1;
···
 	timer_stats_account_hrtimer(timer);
 
 	fn = timer->function;
-	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+	    timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
 		/*
 		 * Used for scheduler timers, avoid lock inversion with
 		 * rq->lock and tasklist_lock.
···
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 #endif
 }
···
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-				 struct hrtimer_clock_base *new_base)
+static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+				struct hrtimer_clock_base *new_base, int dcpu)
 {
 	struct hrtimer *timer;
 	struct rb_node *node;
+	int raise = 0;
 
 	while ((node = rb_first(&old_base->active))) {
 		timer = rb_entry(node, struct hrtimer, node);
 		BUG_ON(hrtimer_callback_running(timer));
 		debug_hrtimer_deactivate(timer);
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
+
+		/*
+		 * Should not happen. Per CPU timers should be
+		 * canceled _before_ the migration code is called
+		 */
+		if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
+			__remove_hrtimer(timer, old_base,
+					 HRTIMER_STATE_INACTIVE, 0);
+			WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
+			     timer, timer->function, dcpu);
+			continue;
+		}
+
+		/*
+		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * timer could be seen as !active and just vanish away
+		 * under us on another CPU
+		 */
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timer. Allow reprogramming of the event device
 		 */
 		enqueue_hrtimer(timer, new_base, 1);
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+		/*
+		 * Happens with high res enabled when the timer was
+		 * already expired and the callback mode is
+		 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
+		 * enqueue code does not move them to the soft irq
+		 * pending list for performance/latency reasons, but
+		 * in the migration state, we need to do that
+		 * otherwise we end up with a stale timer.
+		 */
+		if (timer->state == HRTIMER_STATE_MIGRATE) {
+			timer->state = HRTIMER_STATE_PENDING;
+			list_add_tail(&timer->cb_entry,
+				      &new_base->cpu_base->cb_pending);
+			raise = 1;
+		}
+#endif
+		/* Clear the migration state bit */
+		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
+	return raise;
 }
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+				   struct hrtimer_cpu_base *new_base)
+{
+	struct hrtimer *timer;
+	int raise = 0;
+
+	while (!list_empty(&old_base->cb_pending)) {
+		timer = list_entry(old_base->cb_pending.next,
+				   struct hrtimer, cb_entry);
+
+		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
+		timer->base = &new_base->clock_base[timer->base->index];
+		list_add_tail(&timer->cb_entry, &new_base->cb_pending);
+		raise = 1;
+	}
+	return raise;
+}
+#else
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+				   struct hrtimer_cpu_base *new_base)
+{
+	return 0;
+}
+#endif
 
 static void migrate_hrtimers(int cpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int i;
+	int i, raise = 0;
 
 	BUG_ON(cpu_online(cpu));
 	old_base = &per_cpu(hrtimer_bases, cpu);
···
 	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		migrate_hrtimer_list(&old_base->clock_base[i],
-				     &new_base->clock_base[i]);
+		if (migrate_hrtimer_list(&old_base->clock_base[i],
+					 &new_base->clock_base[i], cpu))
+			raise = 1;
 	}
+
+	if (migrate_hrtimer_pending(old_base, new_base))
+		raise = 1;
 
 	spin_unlock(&old_base->lock);
 	spin_unlock(&new_base->lock);
 	local_irq_enable();
 	put_cpu_var(hrtimer_bases);
+
+	if (raise)
+		hrtimer_raise_softirq();
 }
 #endif /* CONFIG_HOTPLUG_CPU */
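
The point of removing migrated timers with HRTIMER_STATE_MIGRATE instead of
HRTIMER_STATE_INACTIVE is that activity is tested purely on the state word.
A small sketch of that check, assuming hrtimer_active() still reads as below
in this tree (quoted from memory, illustrative only):

/*
 * A timer is reported inactive only when state == HRTIMER_STATE_INACTIVE
 * (0x00). While it is being moved off a dead CPU its state is
 * HRTIMER_STATE_MIGRATE (0x08) and, once re-enqueued, MIGRATE|ENQUEUED
 * (0x09, the new special case documented in the hrtimer.h hunk above),
 * so a concurrent hrtimer_active() on another CPU never sees it vanish.
 */
static inline int hrtimer_active(const struct hrtimer *timer)
{
	return timer->state != HRTIMER_STATE_INACTIVE;
}
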
+2 -2
kernel/sched.c
···
 	hrtimer_init(&rt_b->rt_period_timer,
 			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rt_b->rt_period_timer.function = sched_rt_period_timer;
-	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
···
 
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
 #else
 static inline void hrtick_clear(struct rq *rq)
+1 -1
kernel/time/tick-sched.c
···
 	 */
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	ts->sched_timer.function = tick_sched_timer;
-	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	/* Get the next period (per cpu) */
 	ts->sched_timer.expires = tick_init_jiffy_update();
+1 -1
kernel/trace/trace_sysprof.c
···
 
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = stack_trace_timer_fn;
-	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
 }