Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] sched: remove SMT nice

Remove the SMT-nice feature which idles sibling cpus on SMT cpus to
facilitate nice working properly where cpu power is shared. The idling of
cpus in the presence of runnable tasks is considered too fragile, easy to
break with outside code, and the complexity of managing this system if an
architecture comes along with many logical cores sharing cpu power will be
unworkable.

Remove the associated per_cpu_gain variable in sched_domains used only by
this code.

Also:

The reason is that with dynticks enabled, this code breaks without yet
further tweaks so dynticks brought on the rapid demise of this code. So
either we tweak this code or kill it off entirely. It was Ingo's preference
to kill it off. Either way this needs to happen for 2.6.21 since dynticks
has gone in.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Con Kolivas and committed by Linus Torvalds
69f7c0a1 759b9775

+1 -165
-1
include/asm-i386/topology.h
··· 85 85 .idle_idx = 1, \ 86 86 .newidle_idx = 2, \ 87 87 .wake_idx = 1, \ 88 - .per_cpu_gain = 100, \ 89 88 .flags = SD_LOAD_BALANCE \ 90 89 | SD_BALANCE_EXEC \ 91 90 | SD_BALANCE_FORK \
-2
include/asm-ia64/topology.h
··· 65 65 .max_interval = 4, \ 66 66 .busy_factor = 64, \ 67 67 .imbalance_pct = 125, \ 68 - .per_cpu_gain = 100, \ 69 68 .cache_nice_tries = 2, \ 70 69 .busy_idx = 2, \ 71 70 .idle_idx = 1, \ ··· 96 97 .newidle_idx = 0, /* unused */ \ 97 98 .wake_idx = 1, \ 98 99 .forkexec_idx = 1, \ 99 - .per_cpu_gain = 100, \ 100 100 .flags = SD_LOAD_BALANCE \ 101 101 | SD_BALANCE_EXEC \ 102 102 | SD_BALANCE_FORK \
-1
include/asm-mips/mach-ip27/topology.h
··· 28 28 .busy_factor = 32, \ 29 29 .imbalance_pct = 125, \ 30 30 .cache_nice_tries = 1, \ 31 - .per_cpu_gain = 100, \ 32 31 .flags = SD_LOAD_BALANCE \ 33 32 | SD_BALANCE_EXEC \ 34 33 | SD_WAKE_BALANCE, \
-1
include/asm-powerpc/topology.h
··· 57 57 .busy_factor = 32, \ 58 58 .imbalance_pct = 125, \ 59 59 .cache_nice_tries = 1, \ 60 - .per_cpu_gain = 100, \ 61 60 .busy_idx = 3, \ 62 61 .idle_idx = 1, \ 63 62 .newidle_idx = 2, \
-1
include/asm-x86_64/topology.h
··· 43 43 .newidle_idx = 0, \ 44 44 .wake_idx = 1, \ 45 45 .forkexec_idx = 1, \ 46 - .per_cpu_gain = 100, \ 47 46 .flags = SD_LOAD_BALANCE \ 48 47 | SD_BALANCE_FORK \ 49 48 | SD_BALANCE_EXEC \
-1
include/linux/sched.h
··· 684 684 unsigned int imbalance_pct; /* No balance until over watermark */ 685 685 unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ 686 686 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ 687 - unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ 688 687 unsigned int busy_idx; 689 688 unsigned int idle_idx; 690 689 unsigned int newidle_idx;
-4
include/linux/topology.h
··· 96 96 .busy_factor = 64, \ 97 97 .imbalance_pct = 110, \ 98 98 .cache_nice_tries = 0, \ 99 - .per_cpu_gain = 25, \ 100 99 .busy_idx = 0, \ 101 100 .idle_idx = 0, \ 102 101 .newidle_idx = 1, \ ··· 127 128 .busy_factor = 64, \ 128 129 .imbalance_pct = 125, \ 129 130 .cache_nice_tries = 1, \ 130 - .per_cpu_gain = 100, \ 131 131 .busy_idx = 2, \ 132 132 .idle_idx = 1, \ 133 133 .newidle_idx = 2, \ ··· 157 159 .busy_factor = 64, \ 158 160 .imbalance_pct = 125, \ 159 161 .cache_nice_tries = 1, \ 160 - .per_cpu_gain = 100, \ 161 162 .busy_idx = 2, \ 162 163 .idle_idx = 1, \ 163 164 .newidle_idx = 2, \ ··· 190 193 .newidle_idx = 0, /* unused */ \ 191 194 .wake_idx = 0, /* unused */ \ 192 195 .forkexec_idx = 0, /* unused */ \ 193 - .per_cpu_gain = 100, \ 194 196 .flags = SD_LOAD_BALANCE \ 195 197 | SD_SERIALIZE, \ 196 198 .last_balance = jiffies, \
+1 -154
kernel/sched.c
··· 3006 3006 } 3007 3007 #endif 3008 3008 3009 - static inline void wake_priority_sleeper(struct rq *rq) 3010 - { 3011 - #ifdef CONFIG_SCHED_SMT 3012 - if (!rq->nr_running) 3013 - return; 3014 - 3015 - spin_lock(&rq->lock); 3016 - /* 3017 - * If an SMT sibling task has been put to sleep for priority 3018 - * reasons reschedule the idle task to see if it can now run. 3019 - */ 3020 - if (rq->nr_running) 3021 - resched_task(rq->idle); 3022 - spin_unlock(&rq->lock); 3023 - #endif 3024 - } 3025 - 3026 3009 DEFINE_PER_CPU(struct kernel_stat, kstat); 3027 3010 3028 3011 EXPORT_PER_CPU_SYMBOL(kstat); ··· 3222 3239 3223 3240 update_cpu_clock(p, rq, now); 3224 3241 3225 - if (p == rq->idle) 3226 - /* Task on the idle queue */ 3227 - wake_priority_sleeper(rq); 3228 - else 3242 + if (p != rq->idle) 3229 3243 task_running_tick(rq, p); 3230 3244 #ifdef CONFIG_SMP 3231 3245 update_load(rq); ··· 3230 3250 raise_softirq(SCHED_SOFTIRQ); 3231 3251 #endif 3232 3252 } 3233 - 3234 - #ifdef CONFIG_SCHED_SMT 3235 - static inline void wakeup_busy_runqueue(struct rq *rq) 3236 - { 3237 - /* If an SMT runqueue is sleeping due to priority reasons wake it up */ 3238 - if (rq->curr == rq->idle && rq->nr_running) 3239 - resched_task(rq->idle); 3240 - } 3241 - 3242 - /* 3243 - * Called with interrupt disabled and this_rq's runqueue locked. 
3244 - */ 3245 - static void wake_sleeping_dependent(int this_cpu) 3246 - { 3247 - struct sched_domain *tmp, *sd = NULL; 3248 - int i; 3249 - 3250 - for_each_domain(this_cpu, tmp) { 3251 - if (tmp->flags & SD_SHARE_CPUPOWER) { 3252 - sd = tmp; 3253 - break; 3254 - } 3255 - } 3256 - 3257 - if (!sd) 3258 - return; 3259 - 3260 - for_each_cpu_mask(i, sd->span) { 3261 - struct rq *smt_rq = cpu_rq(i); 3262 - 3263 - if (i == this_cpu) 3264 - continue; 3265 - if (unlikely(!spin_trylock(&smt_rq->lock))) 3266 - continue; 3267 - 3268 - wakeup_busy_runqueue(smt_rq); 3269 - spin_unlock(&smt_rq->lock); 3270 - } 3271 - } 3272 - 3273 - /* 3274 - * number of 'lost' timeslices this task wont be able to fully 3275 - * utilize, if another task runs on a sibling. This models the 3276 - * slowdown effect of other tasks running on siblings: 3277 - */ 3278 - static inline unsigned long 3279 - smt_slice(struct task_struct *p, struct sched_domain *sd) 3280 - { 3281 - return p->time_slice * (100 - sd->per_cpu_gain) / 100; 3282 - } 3283 - 3284 - /* 3285 - * To minimise lock contention and not have to drop this_rq's runlock we only 3286 - * trylock the sibling runqueues and bypass those runqueues if we fail to 3287 - * acquire their lock. As we only trylock the normal locking order does not 3288 - * need to be obeyed. 
3289 - */ 3290 - static int 3291 - dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) 3292 - { 3293 - struct sched_domain *tmp, *sd = NULL; 3294 - int ret = 0, i; 3295 - 3296 - /* kernel/rt threads do not participate in dependent sleeping */ 3297 - if (!p->mm || rt_task(p)) 3298 - return 0; 3299 - 3300 - for_each_domain(this_cpu, tmp) { 3301 - if (tmp->flags & SD_SHARE_CPUPOWER) { 3302 - sd = tmp; 3303 - break; 3304 - } 3305 - } 3306 - 3307 - if (!sd) 3308 - return 0; 3309 - 3310 - for_each_cpu_mask(i, sd->span) { 3311 - struct task_struct *smt_curr; 3312 - struct rq *smt_rq; 3313 - 3314 - if (i == this_cpu) 3315 - continue; 3316 - 3317 - smt_rq = cpu_rq(i); 3318 - if (unlikely(!spin_trylock(&smt_rq->lock))) 3319 - continue; 3320 - 3321 - smt_curr = smt_rq->curr; 3322 - 3323 - if (!smt_curr->mm) 3324 - goto unlock; 3325 - 3326 - /* 3327 - * If a user task with lower static priority than the 3328 - * running task on the SMT sibling is trying to schedule, 3329 - * delay it till there is proportionately less timeslice 3330 - * left of the sibling task to prevent a lower priority 3331 - * task from using an unfair proportion of the 3332 - * physical cpu's resources. -ck 3333 - */ 3334 - if (rt_task(smt_curr)) { 3335 - /* 3336 - * With real time tasks we run non-rt tasks only 3337 - * per_cpu_gain% of the time. 
3338 - */ 3339 - if ((jiffies % DEF_TIMESLICE) > 3340 - (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 3341 - ret = 1; 3342 - } else { 3343 - if (smt_curr->static_prio < p->static_prio && 3344 - !TASK_PREEMPTS_CURR(p, smt_rq) && 3345 - smt_slice(smt_curr, sd) > task_timeslice(p)) 3346 - ret = 1; 3347 - } 3348 - unlock: 3349 - spin_unlock(&smt_rq->lock); 3350 - } 3351 - return ret; 3352 - } 3353 - #else 3354 - static inline void wake_sleeping_dependent(int this_cpu) 3355 - { 3356 - } 3357 - static inline int 3358 - dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) 3359 - { 3360 - return 0; 3361 - } 3362 - #endif 3363 3253 3364 3254 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 3365 3255 ··· 3357 3507 if (!rq->nr_running) { 3358 3508 next = rq->idle; 3359 3509 rq->expired_timestamp = 0; 3360 - wake_sleeping_dependent(cpu); 3361 3510 goto switch_tasks; 3362 3511 } 3363 3512 } ··· 3396 3547 } 3397 3548 } 3398 3549 next->sleep_type = SLEEP_NORMAL; 3399 - if (rq->nr_running == 1 && dependent_sleeper(cpu, rq, next)) 3400 - next = rq->idle; 3401 3550 switch_tasks: 3402 3551 if (next == rq->idle) 3403 3552 schedstat_inc(rq, sched_goidle);