Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fix from Ingo Molnar:
"Fix an exec() related scalability/performance regression, which was
caused by incorrectly calculating load and migrating tasks on exec()
when they shouldn't be"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: Fix cpu_util_wake() for 'execl' type workloads
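
An "execl" type workload here is one that repeatedly replaces its own image with exec(), so the scheduler has to pick a CPU for the task on every iteration while the task's recent utilization is still charged to the CPU it is running on. A minimal sketch of such a workload (a hypothetical self-re-executing loop, not the benchmark from the regression report) could look like this:

/*
 * Hypothetical "execl" type micro-workload: the binary re-executes itself
 * in a loop, so sched_exec() / select_task_rq_fair() runs on every
 * iteration while the task is still current on its CPU.
 * Build: cc -O2 execl_loop.c -o execl_loop
 * Run:   ./execl_loop 10000
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        long left = (argc > 1) ? strtol(argv[1], NULL, 10) : 1000;
        char next[32];

        if (left <= 0)
                return 0;       /* all exec() iterations consumed */

        /* Burn a little CPU so the task has accrued utilization before exec(). */
        for (volatile long i = 0; i < 1000000; i++)
                ;

        snprintf(next, sizeof(next), "%ld", left - 1);

        /* Replace the current image; the scheduler must place the new one. */
        execl("/proc/self/exe", argv[0], next, (char *)NULL);
        perror("execl");
        return 1;
}

With a loop like this, each exec() could previously see the task's own utilization still counted against its CPU and migrate the task when it shouldn't, which is the regression the patch below addresses.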

Changed files (+48 -14):

kernel/sched/fair.c
···
         return target;
 }
 
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 {
-        return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+        return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 }
 
 /*
···
 
                         avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
-                        spare_cap = capacity_spare_wake(i, p);
+                        spare_cap = capacity_spare_without(i, p);
 
                         if (spare_cap > max_spare_cap)
                                 max_spare_cap = spare_cap;
···
                 return prev_cpu;
 
         /*
-         * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
-         * last_update_time.
+         * We need task's util for capacity_spare_without, sync it up to
+         * prev_cpu's last_update_time.
          */
         if (!(sd_flag & SD_BALANCE_FORK))
                 sync_entity_load_avg(&p->se);
···
 }
 
 /*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
  */
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
         struct cfs_rq *cfs_rq;
         unsigned int util;
···
         cfs_rq = &cpu_rq(cpu)->cfs;
         util = READ_ONCE(cfs_rq->avg.util_avg);
 
-        /* Discount task's blocked util from CPU's util */
+        /* Discount task's util from CPU's util */
         util -= min_t(unsigned int, util, task_util(p));
 
         /*
···
          * a) if *p is the only task sleeping on this CPU, then:
          *      cpu_util (== task_util) > util_est (== 0)
          *    and thus we return:
-         *      cpu_util_wake = (cpu_util - task_util) = 0
+         *      cpu_util_without = (cpu_util - task_util) = 0
          *
          * b) if other tasks are SLEEPING on this CPU, which is now exiting
          *    IDLE, then:
          *      cpu_util >= task_util
          *      cpu_util > util_est (== 0)
          *    and thus we discount *p's blocked utilization to return:
-         *      cpu_util_wake = (cpu_util - task_util) >= 0
+         *      cpu_util_without = (cpu_util - task_util) >= 0
          *
          * c) if other tasks are RUNNABLE on that CPU and
          *      util_est > cpu_util
···
          * covered by the following code when estimated utilization is
          * enabled.
          */
-        if (sched_feat(UTIL_EST))
-                util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+        if (sched_feat(UTIL_EST)) {
+                unsigned int estimated =
+                        READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+                /*
+                 * Despite the following checks we still have a small window
+                 * for a possible race, when an execl's select_task_rq_fair()
+                 * races with LB's detach_task():
+                 *
+                 *   detach_task()
+                 *     p->on_rq = TASK_ON_RQ_MIGRATING;
+                 *     ---------------------------------- A
+                 *     deactivate_task()                   \
+                 *       dequeue_task()                     + RaceTime
+                 *         util_est_dequeue()              /
+                 *     ---------------------------------- B
+                 *
+                 * The additional check on "current == p" it's required to
+                 * properly fix the execl regression and it helps in further
+                 * reducing the chances for the above race.
+                 */
+                if (unlikely(task_on_rq_queued(p) || current == p)) {
+                        estimated -= min_t(unsigned int, estimated,
+                                           (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+                }
+                util = max(util, estimated);
+        }
 
         /*
          * Utilization (estimated) can exceed the CPU capacity, thus let's
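
In plain terms, the patched cpu_util_without() now also discounts the exec'ing task's own contribution from the CPU's estimated utilization (when the task is still queued there or is current), before taking the max with the PELT average. A simplified standalone model of that arithmetic (hypothetical helper names and plain integers instead of the kernel's cfs_rq/util_est machinery, shown only to illustrate the effect) might be:

#include <stdio.h>

/* Clamp-to-zero subtraction, mirroring the min_t() pattern in the patch. */
static unsigned int sub_positive(unsigned int a, unsigned int b)
{
        return a - (b < a ? b : a);
}

/*
 * Hypothetical model of the discounting: remove the task's contribution
 * from the running average, and from the estimated utilization only when
 * the task is still counted there, then take the max of the two.
 */
static unsigned int util_without_task(unsigned int cpu_util_avg,
                                      unsigned int cpu_util_est,
                                      unsigned int task_util_avg,
                                      unsigned int task_util_est,
                                      int task_counted_in_est)
{
        unsigned int util = sub_positive(cpu_util_avg, task_util_avg);
        unsigned int est = cpu_util_est;

        if (task_counted_in_est)
                est = sub_positive(est, task_util_est);

        return util > est ? util : est;
}

int main(void)
{
        /* The exec'ing task is the only contributor on its CPU. */
        printf("old behaviour: %u\n", util_without_task(600, 600, 600, 600, 0));
        printf("new behaviour: %u\n", util_without_task(600, 600, 600, 600, 1));
        return 0;
}

With these example numbers the old behaviour still reports the CPU as busy during exec() even though the only contributor is the task being placed, while the new behaviour reports it as free, so capacity_spare_without() sees the real spare capacity on prev_cpu and the task is not migrated needlessly.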