Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched, cpu hotplug: fix set_cpus_allowed() use in hotplug callbacks
  sched: fix mysql+oltp regression
  sched_clock: delay using sched_clock()
  sched clock: couple local and remote clocks
  sched clock: simplify __update_sched_clock()
  sched: eliminate scd->prev_raw
  sched clock: clean up sched_clock_cpu()
  sched clock: revert various sched_clock() changes
  sched: move sched_clock before first use
  sched: test runtime rather than period in global_rt_runtime()
  sched: fix SCHED_HRTICK dependency
  sched: fix warning in hrtick_start_fair()

+77 -164
+4 -27
include/linux/sched.h
···
 
 extern unsigned long long sched_clock(void);
 
+extern void sched_clock_init(void);
+extern u64 sched_clock_cpu(int cpu);
+
 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline void sched_clock_init(void)
-{
-}
-
-static inline u64 sched_clock_cpu(int cpu)
-{
-	return sched_clock();
-}
-
 static inline void sched_clock_tick(void)
 {
 }
···
 static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
 }
-
-#ifdef CONFIG_NO_HZ
-static inline void sched_clock_tick_stop(int cpu)
-{
-}
-
-static inline void sched_clock_tick_start(int cpu)
-{
-}
-#endif
-
-#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-extern void sched_clock_init(void);
-extern u64 sched_clock_cpu(int cpu);
+#else
 extern void sched_clock_tick(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
-#ifdef CONFIG_NO_HZ
-extern void sched_clock_tick_stop(int cpu);
-extern void sched_clock_tick_start(int cpu);
 #endif
-#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
+1 -1
kernel/Kconfig.hz
···
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS
+	def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
+2 -3
kernel/cpu.c
···
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
+	cpu_set(cpu, cpu_active_map);
+
 	/* Now call notifier in preparation. */
 	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
 
···
 	}
 
 	err = _cpu_up(cpu, 0);
-
-	if (cpu_online(cpu))
-		cpu_set(cpu, cpu_active_map);
 
 out:
 	cpu_maps_update_done();
+1 -1
kernel/sched.c
···
 
 static inline u64 global_rt_runtime(void)
 {
-	if (sysctl_sched_rt_period < 0)
+	if (sysctl_sched_rt_runtime < 0)
 		return RUNTIME_INF;
 
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
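
The one-liner above is the "sched: test runtime rather than period in global_rt_runtime()" fix from the list: a negative runtime is the sentinel for "unthrottled", so the RUNTIME_INF check has to test sysctl_sched_rt_runtime, not sysctl_sched_rt_period. Below is a minimal user-space sketch of the intended behaviour; the stand-in variables, values and main() are illustrative assumptions, not the kernel's definitions.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC	1000ULL
#define RUNTIME_INF	((uint64_t)~0ULL)

/* stand-ins for the sysctl knobs, in microseconds; runtime == -1 means unlimited */
static int sysctl_sched_rt_period  = 1000000;	/* 1s period   */
static int sysctl_sched_rt_runtime = 950000;	/* 950ms quota */

static uint64_t global_rt_period(void)
{
	return (uint64_t)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static uint64_t global_rt_runtime(void)
{
	/* the bug tested the period here instead of the runtime */
	if (sysctl_sched_rt_runtime < 0)
		return RUNTIME_INF;

	return (uint64_t)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

int main(void)
{
	printf("rt period : %llu ns\n", (unsigned long long)global_rt_period());
	printf("rt runtime: %llu ns\n", (unsigned long long)global_rt_runtime());

	sysctl_sched_rt_runtime = -1;	/* disable throttling */
	printf("rt runtime: %#llx (RUNTIME_INF)\n",
	       (unsigned long long)global_rt_runtime());
	return 0;
}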
+56 -122
kernel/sched_clock.c
···
 #include <linux/ktime.h>
 #include <linux/module.h>
 
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __attribute__((weak)) sched_clock(void)
+{
+	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+}
+
+static __read_mostly int sched_clock_running;
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-
-#define MULTI_SHIFT 15
-/* Max is double, Min is 1/2 */
-#define MAX_MULTI	(2LL << MULTI_SHIFT)
-#define MIN_MULTI	(1LL << (MULTI_SHIFT-1))
 
 struct sched_clock_data {
 	/*
···
 	raw_spinlock_t		lock;
 
 	unsigned long		tick_jiffies;
-	u64			prev_raw;
 	u64			tick_raw;
 	u64			tick_gtod;
 	u64			clock;
-	s64			multi;
-#ifdef CONFIG_NO_HZ
-	int			check_max;
-#endif
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
···
 	return &per_cpu(sched_clock_data, cpu);
 }
 
-static __read_mostly int sched_clock_running;
-
 void sched_clock_init(void)
 {
 	u64 ktime_now = ktime_to_ns(ktime_get());
···
 
 		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 		scd->tick_jiffies = now_jiffies;
-		scd->prev_raw = 0;
 		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
 		scd->clock = ktime_now;
-		scd->multi = 1 << MULTI_SHIFT;
-#ifdef CONFIG_NO_HZ
-		scd->check_max = 1;
-#endif
 	}
 
 	sched_clock_running = 1;
 }
-
-#ifdef CONFIG_NO_HZ
-/*
- * The dynamic ticks makes the delta jiffies inaccurate. This
- * prevents us from checking the maximum time update.
- * Disable the maximum check during stopped ticks.
- */
-void sched_clock_tick_stop(int cpu)
-{
-	struct sched_clock_data *scd = cpu_sdc(cpu);
-
-	scd->check_max = 0;
-}
-
-void sched_clock_tick_start(int cpu)
-{
-	struct sched_clock_data *scd = cpu_sdc(cpu);
-
-	scd->check_max = 1;
-}
-
-static int check_max(struct sched_clock_data *scd)
-{
-	return scd->check_max;
-}
-#else
-static int check_max(struct sched_clock_data *scd)
-{
-	return 1;
-}
-#endif /* CONFIG_NO_HZ */
 
 /*
  * update the percpu scd from the raw @now value
···
  *  - filter out backward motion
  *  - use jiffies to generate a min,max window to clip the raw values
  */
-static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time)
+static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
 {
 	unsigned long now_jiffies = jiffies;
 	long delta_jiffies = now_jiffies - scd->tick_jiffies;
 	u64 clock = scd->clock;
 	u64 min_clock, max_clock;
-	s64 delta = now - scd->prev_raw;
+	s64 delta = now - scd->tick_raw;
 
 	WARN_ON_ONCE(!irqs_disabled());
-
-	/*
-	 * At schedule tick the clock can be just under the gtod. We don't
-	 * want to push it too prematurely.
-	 */
-	min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
-	if (min_clock > TICK_NSEC)
-		min_clock -= TICK_NSEC / 2;
+	min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
 
 	if (unlikely(delta < 0)) {
 		clock++;
 		goto out;
 	}
 
-	/*
-	 * The clock must stay within a jiffie of the gtod.
-	 * But since we may be at the start of a jiffy or the end of one
-	 * we add another jiffy buffer.
-	 */
-	max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
+	max_clock = min_clock + TICK_NSEC;
 
-	delta *= scd->multi;
-	delta >>= MULTI_SHIFT;
-
-	if (unlikely(clock + delta > max_clock) && check_max(scd)) {
+	if (unlikely(clock + delta > max_clock)) {
 		if (clock < max_clock)
 			clock = max_clock;
 		else
···
 	if (unlikely(clock < min_clock))
 		clock = min_clock;
 
-	if (time)
-		*time = clock;
-	else {
-		scd->prev_raw = now;
-		scd->clock = clock;
-	}
+	scd->tick_jiffies = now_jiffies;
+	scd->clock = clock;
+
+	return clock;
 }
 
 static void lock_double_clock(struct sched_clock_data *data1,
···
 u64 sched_clock_cpu(int cpu)
 {
 	struct sched_clock_data *scd = cpu_sdc(cpu);
-	u64 now, clock;
+	u64 now, clock, this_clock, remote_clock;
 
 	if (unlikely(!sched_clock_running))
 		return 0ull;
···
 	now = sched_clock();
 
 	if (cpu != raw_smp_processor_id()) {
-		/*
-		 * in order to update a remote cpu's clock based on our
-		 * unstable raw time rebase it against:
-		 *	tick_raw	(offset between raw counters)
-		 *	tick_gotd	(tick offset between cpus)
-		 */
 		struct sched_clock_data *my_scd = this_scd();
 
 		lock_double_clock(scd, my_scd);
 
-		now -= my_scd->tick_raw;
-		now += scd->tick_raw;
+		this_clock = __update_sched_clock(my_scd, now);
+		remote_clock = scd->clock;
 
-		now += my_scd->tick_gtod;
-		now -= scd->tick_gtod;
+		/*
+		 * Use the opportunity that we have both locks
+		 * taken to couple the two clocks: we take the
+		 * larger time as the latest time for both
+		 * runqueues. (this creates monotonic movement)
+		 */
+		if (likely(remote_clock < this_clock)) {
+			clock = this_clock;
+			scd->clock = clock;
+		} else {
+			/*
+			 * Should be rare, but possible:
+			 */
+			clock = remote_clock;
+			my_scd->clock = remote_clock;
+		}
 
 		__raw_spin_unlock(&my_scd->lock);
-
-		__update_sched_clock(scd, now, &clock);
-
-		__raw_spin_unlock(&scd->lock);
-
 	} else {
 		__raw_spin_lock(&scd->lock);
-		__update_sched_clock(scd, now, NULL);
-		clock = scd->clock;
-		__raw_spin_unlock(&scd->lock);
+		clock = __update_sched_clock(scd, now);
 	}
+
+	__raw_spin_unlock(&scd->lock);
 
 	return clock;
 }
···
 void sched_clock_tick(void)
 {
 	struct sched_clock_data *scd = this_scd();
-	unsigned long now_jiffies = jiffies;
-	s64 mult, delta_gtod, delta_raw;
 	u64 now, now_gtod;
 
 	if (unlikely(!sched_clock_running))
···
 	now = sched_clock();
 
 	__raw_spin_lock(&scd->lock);
-	__update_sched_clock(scd, now, NULL);
+	__update_sched_clock(scd, now);
 	/*
 	 * update tick_gtod after __update_sched_clock() because that will
 	 * already observe 1 new jiffy; adding a new tick_gtod to that would
 	 * increase the clock 2 jiffies.
 	 */
-	delta_gtod = now_gtod - scd->tick_gtod;
-	delta_raw = now - scd->tick_raw;
-
-	if ((long)delta_raw > 0) {
-		mult = delta_gtod << MULTI_SHIFT;
-		do_div(mult, delta_raw);
-		scd->multi = mult;
-		if (scd->multi > MAX_MULTI)
-			scd->multi = MAX_MULTI;
-		else if (scd->multi < MIN_MULTI)
-			scd->multi = MIN_MULTI;
-	} else
-		scd->multi = 1 << MULTI_SHIFT;
-
 	scd->tick_raw = now;
 	scd->tick_gtod = now_gtod;
-	scd->tick_jiffies = now_jiffies;
 	__raw_spin_unlock(&scd->lock);
 }
 
···
 void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
 	struct sched_clock_data *scd = this_scd();
-	u64 now = sched_clock();
 
 	/*
 	 * Override the previous timestamp and ignore all
···
 	 * rq clock:
 	 */
 	__raw_spin_lock(&scd->lock);
-	scd->prev_raw = now;
 	scd->clock += delta_ns;
-	scd->multi = 1 << MULTI_SHIFT;
 	__raw_spin_unlock(&scd->lock);
 
 	touch_softlockup_watchdog();
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
-#endif
+#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
-/*
- * Scheduler clock - returns current time in nanosec units.
- * This is default implementation.
- * Architectures and sub-architectures can override this.
- */
-unsigned long long __attribute__((weak)) sched_clock(void)
+void sched_clock_init(void)
 {
-	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+	sched_clock_running = 1;
 }
+
+u64 sched_clock_cpu(int cpu)
+{
+	if (unlikely(!sched_clock_running))
+		return 0;
+
+	return sched_clock();
+}
+
+#endif
 
 unsigned long long cpu_clock(int cpu)
 {
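
The rewritten sched_clock_cpu() above is the "sched clock: couple local and remote clocks" change: when one CPU reads another CPU's clock, both per-CPU clocks are updated under both locks and the larger value wins, so cross-CPU readers never see time move backwards. The following is a stripped-down user-space sketch of just that coupling rule, with a plain struct standing in for struct sched_clock_data and no locking; it is an illustration, not the kernel code.

#include <stdio.h>
#include <stdint.h>

/* simplified per-CPU clock state; the kernel keeps this in
 * struct sched_clock_data, protected by a raw spinlock */
struct fake_scd {
	uint64_t clock;
};

/* couple two clocks: take the larger value and write it back to both,
 * so the observed time is monotonic across the two "CPUs" */
static uint64_t couple_clocks(struct fake_scd *local, struct fake_scd *remote)
{
	uint64_t clock;

	if (remote->clock < local->clock) {
		/* common case: our freshly updated local clock is ahead */
		clock = local->clock;
		remote->clock = clock;
	} else {
		/* rare case: the remote CPU is ahead, pull ourselves forward */
		clock = remote->clock;
		local->clock = clock;
	}
	return clock;
}

int main(void)
{
	struct fake_scd cpu0 = { .clock = 1000 };
	struct fake_scd cpu1 = { .clock = 900 };

	printf("coupled clock: %llu\n",
	       (unsigned long long)couple_clocks(&cpu0, &cpu1));
	printf("cpu1 clock   : %llu\n", (unsigned long long)cpu1.clock);
	return 0;
}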
+13 -8
kernel/sched_fair.c
···
 	 * doesn't make sense. Rely on vruntime for fairness.
 	 */
 	if (rq->curr != p)
-		delta = max(10000LL, delta);
+		delta = max_t(s64, 10000LL, delta);
 
 	hrtick_start(rq, delta);
 }
···
 	struct task_struct *p = NULL;
 	struct sched_entity *se;
 
-	while (next != &cfs_rq->tasks) {
+	if (next == &cfs_rq->tasks)
+		return NULL;
+
+	/* Skip over entities that are not tasks */
+	do {
 		se = list_entry(next, struct sched_entity, group_node);
 		next = next->next;
+	} while (next != &cfs_rq->tasks && !entity_is_task(se));
 
-		/* Skip over entities that are not tasks */
-		if (entity_is_task(se)) {
-			p = task_of(se);
-			break;
-		}
-	}
+	if (next == &cfs_rq->tasks)
+		return NULL;
 
 	cfs_rq->balance_iterator = next;
+
+	if (entity_is_task(se))
+		p = task_of(se);
+
 	return p;
 }
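
The max_t() change above is the "sched: fix warning in hrtick_start_fair()" fix: delta is s64 while 10000LL is long long, and the kernel's max() macro deliberately warns when its two operands have different types, which triggers on builds where s64 is a plain long. max_t() casts both operands to one type first. Here is a compact sketch using simplified versions of the two macros (the real ones live in include/linux/kernel.h); it is a model of the mechanism, not the kernel's exact definitions.

#include <stdio.h>

/* simplified type-checking max(), modelled on the kernel's version:
 * comparing &_max1 and &_max2 makes the compiler complain when the
 * operand types differ ("comparison of distinct pointer types") */
#define max(x, y) ({			\
	typeof(x) _max1 = (x);		\
	typeof(y) _max2 = (y);		\
	(void) (&_max1 == &_max2);	\
	_max1 > _max2 ? _max1 : _max2; })

/* max_t() sidesteps the check by casting both operands up front */
#define max_t(type, x, y) ({		\
	type _max1 = (type)(x);		\
	type _max2 = (type)(y);		\
	_max1 > _max2 ? _max1 : _max2; })

int main(void)
{
	long delta = -5;	/* stands in for an s64 that is 'long' */

	/* max(10000LL, delta) would warn here: long long vs. long */
	printf("%lld\n", max_t(long long, 10000LL, delta));
	return 0;
}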
-2
kernel/time/tick-sched.c
···
 			ts->tick_stopped = 1;
 			ts->idle_jiffies = last_jiffies;
 			rcu_enter_nohz();
-			sched_clock_tick_stop(cpu);
 		}
 
 		/*
···
 	select_nohz_load_balancer(0);
 	now = ktime_get();
 	tick_do_update_jiffies64(now);
-	sched_clock_tick_start(cpu);
 	cpu_clear(cpu, nohz_cpu_mask);
 
 	/*