sched: min_vruntime fix

Current min_vruntime tracking is incorrect and will cause serious
problems when we don't run the leftmost task for some reason.

min_vruntime does two things: 1) it's used to determine a forward
direction when the u64 vruntime wraps; 2) it's used to track the
leftmost vruntime, from which newly enqueued tasks are positioned.
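For context, the wrap-safe ordering in 1) comes from the min_vruntime()/
max_vruntime() helpers in kernel/sched_fair.c, which compare through a
signed 64-bit delta. A rough sketch (the typedefs stand in for the
kernel's linux/types.h definitions):

typedef unsigned long long u64;     /* stand-in for the kernel's u64 */
typedef long long s64;              /* stand-in for the kernel's s64 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
        /* the signed delta keeps the ordering meaningful across a u64 wrap */
        s64 delta = (s64)(vruntime - min_vruntime);

        if (delta > 0)
                min_vruntime = vruntime;

        return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
        s64 delta = (s64)(vruntime - min_vruntime);

        if (delta < 0)
                min_vruntime = vruntime;

        return min_vruntime;
}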

The current logic advances min_vruntime whenever the current task's
vruntime advances. Because the current task may pass the leftmost task
still waiting, we're failing the second goal. This causes new tasks to
be placed too far ahead, which penalizes their runtime.
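A rough illustration of the effect, with made-up numbers (not taken from
the patch):

/* hypothetical snapshot: the leftmost waiter has not run yet */
u64 leftmost_vruntime = 100;    /* smallest vruntime actually queued */
u64 curr_vruntime     = 150;    /* current task has run past it      */

/* behaviour described above: min_vruntime follows the current task */
u64 drifted_min = curr_vruntime;        /* 150 */

/* intended meaning: min_vruntime tracks the waiting tasks */
u64 tracked_min = leftmost_vruntime;    /* 100 */

/*
 * A task placed relative to drifted_min gets a vruntime ~50 larger than
 * one placed relative to tracked_min, so it sits further right in the
 * tree and waits correspondingly longer for CPU than fair placement
 * would dictate.
 */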

Fix this by making min_vruntime track the minimum vruntime of the
waiting tasks, updating it on enqueue/dequeue, and by comparing against
current's vruntime to obtain the absolute minimum when placing new
tasks.
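In outline, the patch below touches three places. A simplified sketch,
paraphrasing the hunks that follow rather than quoting literal kernel
code (was_leftmost is shorthand for the rb_leftmost check):

/* __enqueue_entity(): a task becoming leftmost only pushes min_vruntime forward */
if (leftmost)
        cfs_rq->min_vruntime =
                max_vruntime(cfs_rq->min_vruntime, se->vruntime);

/* __dequeue_entity(): when the leftmost leaves, fold in its successor */
if (was_leftmost && next_node)          /* shorthand, see the hunk below */
        cfs_rq->min_vruntime =
                max_vruntime(cfs_rq->min_vruntime, next->vruntime);

/* place_entity(): take the true minimum of the tracked value and the tree */
if (first_fair(cfs_rq))
        vruntime = min_vruntime(cfs_rq->min_vruntime,
                        __pick_next_entity(cfs_rq)->vruntime);
else
        vruntime = cfs_rq->min_vruntime;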

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Authored by Peter Zijlstra and committed by Ingo Molnar (3fe69747, 0e1f3483)

+28 -18
kernel/sched_fair.c
···
          * Maintain a cache of leftmost tree entries (it is frequently
          * used):
          */
-        if (leftmost)
+        if (leftmost) {
                 cfs_rq->rb_leftmost = &se->run_node;
+                /*
+                 * maintain cfs_rq->min_vruntime to be a monotonic increasing
+                 * value tracking the leftmost vruntime in the tree.
+                 */
+                cfs_rq->min_vruntime =
+                        max_vruntime(cfs_rq->min_vruntime, se->vruntime);
+        }
 
         rb_link_node(&se->run_node, parent, link);
         rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
···
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        if (cfs_rq->rb_leftmost == &se->run_node)
-                cfs_rq->rb_leftmost = rb_next(&se->run_node);
+        if (cfs_rq->rb_leftmost == &se->run_node) {
+                struct rb_node *next_node;
+                struct sched_entity *next;
+
+                next_node = rb_next(&se->run_node);
+                cfs_rq->rb_leftmost = next_node;
+
+                if (next_node) {
+                        next = rb_entry(next_node,
+                                        struct sched_entity, run_node);
+                        cfs_rq->min_vruntime =
+                                max_vruntime(cfs_rq->min_vruntime,
+                                             next->vruntime);
+                }
+        }
 
         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
···
               unsigned long delta_exec)
 {
         unsigned long delta_exec_weighted;
-        u64 vruntime;
 
         schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
 
···
                                         &curr->load);
         }
         curr->vruntime += delta_exec_weighted;
-
-        /*
-         * maintain cfs_rq->min_vruntime to be a monotonic increasing
-         * value tracking the leftmost vruntime in the tree.
-         */
-        if (first_fair(cfs_rq)) {
-                vruntime = min_vruntime(curr->vruntime,
-                                __pick_next_entity(cfs_rq)->vruntime);
-        } else
-                vruntime = curr->vruntime;
-
-        cfs_rq->min_vruntime =
-                max_vruntime(cfs_rq->min_vruntime, vruntime);
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
···
 {
         u64 vruntime;
 
-        vruntime = cfs_rq->min_vruntime;
+        if (first_fair(cfs_rq)) {
+                vruntime = min_vruntime(cfs_rq->min_vruntime,
+                                __pick_next_entity(cfs_rq)->vruntime);
+        } else
+                vruntime = cfs_rq->min_vruntime;
 
         if (sched_feat(TREE_AVG)) {
                 struct sched_entity *last = __pick_last_entity(cfs_rq);