sched: avoid large irq-latencies in smp-balancing

SMP balancing is done with IRQs disabled and can iterate the full
runqueue. On a large runqueue this causes large irq-latencies, so limit
the number of tasks iterated in a single balance run.

This fixes a scheduling latency regression reported by the -rt folks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Tested-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Authored by Peter Zijlstra, committed by Ingo Molnar (b82d9fdd, 3c90e6e9)

+19 -5 across 3 files

include/linux/sched.h  (+1)

@@ -1466,6 +1466,7 @@
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_nr_migrate;
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
kernel/sched.c  (+10 -5)

@@ -472,6 +472,12 @@
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 
 /*
+ * Number of tasks to iterate in a single balance run.
+ * Limited because this is done with IRQs disabled.
+ */
+const_debug unsigned int sysctl_sched_nr_migrate = 32;
+
+/*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * clock constructed from sched_clock():
  */
@@ -2235,7 +2241,7 @@
 		      enum cpu_idle_type idle, int *all_pinned,
 		      int *this_best_prio, struct rq_iterator *iterator)
 {
-	int pulled = 0, pinned = 0, skip_for_load;
+	int loops = 0, pulled = 0, pinned = 0, skip_for_load;
 	struct task_struct *p;
 	long rem_load_move = max_load_move;
 
@@ -2249,10 +2255,10 @@
 	 */
 	p = iterator->start(iterator->arg);
 next:
-	if (!p)
+	if (!p || loops++ > sysctl_sched_nr_migrate)
 		goto out;
 	/*
-	 * To help distribute high priority tasks accross CPUs we don't
+	 * To help distribute high priority tasks across CPUs we don't
 	 * skip a task if it will be the highest priority task (i.e. smallest
 	 * prio value) on its new queue regardless of its load weight
 	 */
@@ -2269,8 +2275,7 @@
 	rem_load_move -= p->se.load.weight;
 
 	/*
-	 * We only want to steal up to the prescribed number of tasks
-	 * and the prescribed amount of weighted load.
+	 * We only want to steal up to the prescribed amount of weighted load.
 	 */
 	if (rem_load_move > 0) {
 		if (p->prio < *this_best_prio)
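Taken together, the kernel/sched.c hunks give the balancing loop a per-call budget: once the new loops counter passes sysctl_sched_nr_migrate, the run bails out and leaves the rest of the runqueue for a later pass. The standalone sketch below is only a simplified model of that pattern, not the kernel code; the list type, function name and main() harness are hypothetical, and the real routine also honours pinned tasks, priorities and the weighted-load limit.

#include <stdio.h>

/* Mirrors the new knob: at most about this many tasks are scanned per run. */
static unsigned int sysctl_sched_nr_migrate = 32;

/* Hypothetical, simplified task representation; the kernel walks an rq iterator. */
struct task {
	struct task *next;
	long load;
};

/*
 * Pull up to max_load worth of load from the list, but never scan more than
 * roughly sysctl_sched_nr_migrate tasks in one call.  Bounding the scan is
 * what keeps the IRQs-disabled section short on a very long runqueue.
 */
static long balance_tasks_sketch(struct task *list, long max_load)
{
	unsigned int loops = 0;	/* same role as the new 'loops' counter */
	long moved = 0;
	struct task *p;

	for (p = list; p; p = p->next) {
		if (loops++ > sysctl_sched_nr_migrate)
			break;		/* budget for this run exhausted */
		if (moved + p->load > max_load)
			continue;	/* too heavy for the remaining load budget */
		moved += p->load;	/* "migrate" the task */
	}
	return moved;
}

int main(void)
{
	static struct task tasks[1000];	/* a long "runqueue" of unit-weight tasks */
	int i;

	for (i = 0; i < 999; i++) {
		tasks[i].next = &tasks[i + 1];
		tasks[i].load = 1;
	}
	tasks[999].load = 1;

	/* Far more load is requested than one run is allowed to scan. */
	printf("moved %ld load units in one bounded run over 1000 tasks\n",
	       balance_tasks_sketch(tasks, 500));
	return 0;
}

With the default of 32, the time spent with IRQs disabled in a single balance run no longer grows with runqueue length; whatever imbalance remains is simply handled by subsequent runs.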
kernel/sysctl.c  (+8)

@@ -301,6 +301,14 @@
 		.mode = 0644,
 		.proc_handler = &proc_dointvec,
 	},
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "sched_nr_migrate",
+		.data = &sysctl_sched_nr_migrate,
+		.maxlen = sizeof(unsigned int),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec,
+	},
 #endif
 	{
 		.ctl_name = CTL_UNNUMBERED,
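The new entry sits in the scheduler-debug block of the sysctl table (note the #endif closing it), so on kernels that expose those knobs it should show up as a writable proc file, presumably /proc/sys/kernel/sched_nr_migrate. A small userspace sketch for reading and optionally updating the budget could look like the following; the path, error handling and command-line interface are assumptions for illustration and not part of the patch.

#include <stdio.h>
#include <stdlib.h>

/* Assumed location of the knob; only present when the kernel exposes
 * the scheduler debug sysctls. */
#define NR_MIGRATE_PATH "/proc/sys/kernel/sched_nr_migrate"

int main(int argc, char **argv)
{
	FILE *f = fopen(NR_MIGRATE_PATH, "r");
	unsigned int val;

	if (!f) {
		perror(NR_MIGRATE_PATH);
		return 1;
	}
	if (fscanf(f, "%u", &val) != 1) {
		fprintf(stderr, "unexpected contents in %s\n", NR_MIGRATE_PATH);
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("sched_nr_migrate = %u\n", val);

	/* Optionally write a new per-run migration budget (needs root). */
	if (argc > 1) {
		f = fopen(NR_MIGRATE_PATH, "w");
		if (!f) {
			perror(NR_MIGRATE_PATH);
			return 1;
		}
		fprintf(f, "%u\n", (unsigned int)strtoul(argv[1], NULL, 0));
		fclose(f);
	}
	return 0;
}

Lowering the value tightens the worst-case IRQ-off latency of a single balance run at the cost of moving less load per run; raising it does the opposite, which is presumably why the limit is exposed as a tunable rather than hard-coded.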