sched: adaptive scheduler granularity

Instead of specifying the preemption granularity, specify the wanted
latency. By fixing the granularity to a constant, the wakeup latency
is a function of the number of running tasks on the rq.

Invert this relation.

sysctl_sched_granularity becomes a minimum for the dynamic granularity
computed from the new sysctl_sched_latency.

Then use this latency to make more intelligent granularity decisions: if
there are fewer tasks running then we can schedule more coarsely. This helps
performance while still always meeting the latency target.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by Peter Zijlstra and committed by Ingo Molnar 21805085 1fc84aaa

+86 -17
+1
include/linux/sched.h
··· 1399 1399 1400 1400 extern void sched_idle_next(void); 1401 1401 1402 + extern unsigned int sysctl_sched_latency; 1402 1403 extern unsigned int sysctl_sched_granularity; 1403 1404 extern unsigned int sysctl_sched_wakeup_granularity; 1404 1405 extern unsigned int sysctl_sched_batch_wakeup_granularity;
+9 -5
kernel/sched.c
··· 4911 4911 static inline void sched_init_granularity(void) 4912 4912 { 4913 4913 unsigned int factor = 1 + ilog2(num_online_cpus()); 4914 - const unsigned long gran_limit = 100000000; 4914 + const unsigned long limit = 100000000; 4915 4915 4916 4916 sysctl_sched_granularity *= factor; 4917 - if (sysctl_sched_granularity > gran_limit) 4918 - sysctl_sched_granularity = gran_limit; 4917 + if (sysctl_sched_granularity > limit) 4918 + sysctl_sched_granularity = limit; 4919 4919 4920 - sysctl_sched_runtime_limit = sysctl_sched_granularity * 5; 4921 - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; 4920 + sysctl_sched_latency *= factor; 4921 + if (sysctl_sched_latency > limit) 4922 + sysctl_sched_latency = limit; 4923 + 4924 + sysctl_sched_runtime_limit = sysctl_sched_latency * 5; 4925 + sysctl_sched_wakeup_granularity = sysctl_sched_latency / 2; 4922 4926 } 4923 4927 4924 4928 #ifdef CONFIG_SMP
+65 -12
kernel/sched_fair.c
··· 15 15 * 16 16 * Scaled math optimizations by Thomas Gleixner 17 17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> 18 + * 19 + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra 20 + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 18 21 */ 19 22 20 23 /* 21 - * Preemption granularity: 22 - * (default: 10 msec, units: nanoseconds) 24 + * Targeted preemption latency for CPU-bound tasks: 25 + * (default: 20ms, units: nanoseconds) 23 26 * 24 - * NOTE: this granularity value is not the same as the concept of 25 - * 'timeslice length' - timeslices in CFS will typically be somewhat 26 - * larger than this value. (to see the precise effective timeslice 27 - * length of your workload, run vmstat and monitor the context-switches 28 - * field) 27 + * NOTE: this latency value is not the same as the concept of 28 + * 'timeslice length' - timeslices in CFS are of variable length. 29 + * (to see the precise effective timeslice length of your workload, 30 + * run vmstat and monitor the context-switches field) 29 31 * 30 32 * On SMP systems the value of this is multiplied by the log2 of the 31 33 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way 32 34 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) 35 + * Targeted preemption latency for CPU-bound tasks: 33 36 */ 34 - unsigned int sysctl_sched_granularity __read_mostly = 10000000UL; 37 + unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; 38 + 39 + /* 40 + * Minimal preemption granularity for CPU-bound tasks: 41 + * (default: 2 msec, units: nanoseconds) 42 + */ 43 + unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; 35 44 36 45 /* 37 46 * SCHED_BATCH wake-up granularity. ··· 222 213 */ 223 214 224 215 /* 216 + * Calculate the preemption granularity needed to schedule every 217 + * runnable task once per sysctl_sched_latency amount of time. 
218 + * (down to a sensible low limit on granularity) 219 + * 220 + * For example, if there are 2 tasks running and latency is 10 msecs, 221 + * we switch tasks every 5 msecs. If we have 3 tasks running, we have 222 + * to switch tasks every 3.33 msecs to get a 10 msecs observed latency 223 + * for each task. We do finer and finer scheduling up to until we 224 + * reach the minimum granularity value. 225 + * 226 + * To achieve this we use the following dynamic-granularity rule: 227 + * 228 + * gran = lat/nr - lat/nr/nr 229 + * 230 + * This comes out of the following equations: 231 + * 232 + * kA1 + gran = kB1 233 + * kB2 + gran = kA2 234 + * kA2 = kA1 235 + * kB2 = kB1 - d + d/nr 236 + * lat = d * nr 237 + * 238 + * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), 239 + * '1' is start of time, '2' is end of time, 'd' is delay between 240 + * 1 and 2 (during which task B was running), 'nr' is number of tasks 241 + * running, 'lat' is the the period of each task. ('lat' is the 242 + * sched_latency that we aim for.) 
243 + */ 244 + static long 245 + sched_granularity(struct cfs_rq *cfs_rq) 246 + { 247 + unsigned int gran = sysctl_sched_latency; 248 + unsigned int nr = cfs_rq->nr_running; 249 + 250 + if (nr > 1) { 251 + gran = gran/nr - gran/nr/nr; 252 + gran = max(gran, sysctl_sched_granularity); 253 + } 254 + 255 + return gran; 256 + } 257 + 258 + /* 225 259 * We rescale the rescheduling granularity of tasks according to their 226 260 * nice level, but only linearly, not exponentially: 227 261 */ ··· 354 302 delta_fair = calc_delta_fair(delta_exec, lw); 355 303 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); 356 304 357 - if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { 305 + if (cfs_rq->sleeper_bonus > sysctl_sched_latency) { 358 306 delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); 359 307 delta = min(delta, (unsigned long)( 360 308 (long)sysctl_sched_runtime_limit - curr->wait_runtime)); ··· 741 689 if (next == curr) 742 690 return; 743 691 744 - __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); 692 + __check_preempt_curr_fair(cfs_rq, next, curr, 693 + sched_granularity(cfs_rq)); 745 694 } 746 695 747 696 /************************************************** ··· 1087 1034 * it will preempt the parent: 1088 1035 */ 1089 1036 p->se.fair_key = current->se.fair_key - 1090 - niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; 1037 + niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1; 1091 1038 /* 1092 1039 * The first wait is dominated by the child-runs-first logic, 1093 1040 * so do not credit it with that waiting time yet: ··· 1100 1047 * -granularity/2, so initialize the task with that: 1101 1048 */ 1102 1049 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) 1103 - p->se.wait_runtime = -((long)sysctl_sched_granularity / 2); 1050 + p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); 1104 1051 1105 1052 __enqueue_entity(cfs_rq, se); 1106 1053 }
+11
kernel/sysctl.c
··· 233 233 }, 234 234 { 235 235 .ctl_name = CTL_UNNUMBERED, 236 + .procname = "sched_latency_ns", 237 + .data = &sysctl_sched_latency, 238 + .maxlen = sizeof(unsigned int), 239 + .mode = 0644, 240 + .proc_handler = &proc_dointvec_minmax, 241 + .strategy = &sysctl_intvec, 242 + .extra1 = &min_sched_granularity_ns, 243 + .extra2 = &max_sched_granularity_ns, 244 + }, 245 + { 246 + .ctl_name = CTL_UNNUMBERED, 236 247 .procname = "sched_wakeup_granularity_ns", 237 248 .data = &sysctl_sched_wakeup_granularity, 238 249 .maxlen = sizeof(unsigned int),