Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:

- The hopefully final fix for the reported race problems in
kthread_parkme(). The previous attempt still left a hole and was
partially wrong.

- Plug a race in the remote tick mechanism which triggers a warning
about updates not being done correctly. That's a false positive if the
race condition is hit while the remote CPU is idle. Plug it by checking
the condition again while holding the run queue lock.

- Fix a bug in the utilization estimation of a run queue which causes
the estimate to be 0 when the run queue is throttled.

- Advance the global expiration of the period timer when the timer is
restarted after an idle period. Otherwise the expiry time is stale and
the timer fires prematurely.

- Cure the drift between the bandwidth timer and the runqueue
accounting, which leads to bogus throttling of runqueues.

- Place the call to cpufreq_update_util() correctly so the function
will observe the correct number of running RT tasks and not a stale
one.

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
kthread, sched/core: Fix kthread_parkme() (again...)
sched/util_est: Fix util_est_dequeue() for throttled cfs_rq
sched/fair: Advance global expiration when period timer is restarted
sched/fair: Fix bandwidth timer clock drift condition
sched/rt: Fix call to cpufreq_update_util()
sched/nohz: Skip remote tick on idle task entirely

Changed files: +99 -75
include/linux/kthread.h (-1)

···
 int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
-void kthread_park_complete(struct task_struct *k);
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;

include/linux/sched.h (+1 -1)

···
  * the comment with set_special_state().
  */
 #define is_special_task_state(state)				\
-	((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD))
+	((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
 
 #define __set_current_state(state_value)			\
 	do {							\

kernel/kthread.c (+24 -6)

···
 static void __kthread_parkme(struct kthread *self)
 {
 	for (;;) {
-		set_current_state(TASK_PARKED);
+		/*
+		 * TASK_PARKED is a special state; we must serialize against
+		 * possible pending wakeups to avoid store-store collisions on
+		 * task->state.
+		 *
+		 * Such a collision might possibly result in the task state
+		 * changing from TASK_PARKED and us failing the
+		 * wait_task_inactive() in kthread_park().
+		 */
+		set_special_state(TASK_PARKED);
 		if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
 			break;
+
+		complete_all(&self->parked);
 		schedule();
 	}
 	__set_current_state(TASK_RUNNING);
···
 	__kthread_parkme(to_kthread(current));
 }
 EXPORT_SYMBOL_GPL(kthread_parkme);
-
-void kthread_park_complete(struct task_struct *k)
-{
-	complete_all(&to_kthread(k)->parked);
-}
 
 static int kthread(void *_create)
 {
···
 
 	reinit_completion(&kthread->parked);
 	clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+	/*
+	 * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
+	 */
 	wake_up_state(k, TASK_PARKED);
 }
 EXPORT_SYMBOL_GPL(kthread_unpark);
···
 	set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
 	if (k != current) {
 		wake_up_process(k);
+		/*
+		 * Wait for __kthread_parkme() to complete(), this means we
+		 * _will_ have TASK_PARKED and are about to call schedule().
+		 */
 		wait_for_completion(&kthread->parked);
+		/*
+		 * Now wait for that schedule() to complete and the task to
+		 * get scheduled out.
+		 */
+		WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
 	}
 
 	return 0;
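
The handshake above is: kthread_park() sets SHOULD_PARK, wakes the thread, waits on the kthread's "parked" completion (which __kthread_parkme() signals only after entering TASK_PARKED), and then waits for the task to actually be scheduled out. The following is a minimal userspace sketch of the completion-based part of that handshake, using POSIX threads; the names (should_park, worker, and so on) are invented for illustration and this is not kernel code.

/*
 * Userspace model only, not kernel code. A "completion" built from a
 * mutex and a condition variable, plus a should_park flag, mimics the
 * kthread_park()/__kthread_parkme() handshake shown in the diff above.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	bool            done;
};

static void complete_all(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_broadcast(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static struct completion parked = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.cond = PTHREAD_COND_INITIALIZER,
	.done = false,
};
static atomic_bool should_park;

static void *worker(void *arg)
{
	(void)arg;
	while (!atomic_load(&should_park))
		usleep(1000);		/* pretend to do work */

	/* Signal "I am parked" only once we are committed to blocking. */
	complete_all(&parked);
	while (atomic_load(&should_park))
		usleep(1000);		/* stay parked until unparked */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	/* kthread_park() analogue: request the park, then wait for it. */
	atomic_store(&should_park, true);
	wait_for_completion(&parked);
	printf("worker reported parked\n");

	/* kthread_unpark() analogue. */
	atomic_store(&should_park, false);
	pthread_join(t, NULL);
	return 0;
}

The kernel version additionally calls wait_task_inactive() because signalling the completion only proves the kthread reached TASK_PARKED, not that it has already been scheduled out; the userspace model has no equivalent of that distinction.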

kernel/sched/core.c (+32 -35)

···
  */
 #include "sched.h"
 
-#include <linux/kthread.h>
 #include <linux/nospec.h>
 
 #include <linux/kcov.h>
···
 		membarrier_mm_sync_core_before_usermode(mm);
 		mmdrop(mm);
 	}
-	if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
-		switch (prev_state) {
-		case TASK_DEAD:
-			if (prev->sched_class->task_dead)
-				prev->sched_class->task_dead(prev);
+	if (unlikely(prev_state == TASK_DEAD)) {
+		if (prev->sched_class->task_dead)
+			prev->sched_class->task_dead(prev);
 
-			/*
-			 * Remove function-return probe instances associated with this
-			 * task and put them back on the free list.
-			 */
-			kprobe_flush_task(prev);
+		/*
+		 * Remove function-return probe instances associated with this
+		 * task and put them back on the free list.
+		 */
+		kprobe_flush_task(prev);
 
-			/* Task is done with its stack. */
-			put_task_stack(prev);
+		/* Task is done with its stack. */
+		put_task_stack(prev);
 
-			put_task_struct(prev);
-			break;
-
-		case TASK_PARKED:
-			kthread_park_complete(prev);
-			break;
-		}
+		put_task_struct(prev);
 	}
 
 	tick_nohz_task_switch();
···
 	struct tick_work *twork = container_of(dwork, struct tick_work, work);
 	int cpu = twork->cpu;
 	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *curr;
 	struct rq_flags rf;
+	u64 delta;
 
 	/*
 	 * Handle the tick only if it appears the remote CPU is running in full
···
 	 * statistics and checks timeslices in a time-independent way, regardless
 	 * of when exactly it is running.
 	 */
-	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
-		struct task_struct *curr;
-		u64 delta;
+	if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+		goto out_requeue;
 
-		rq_lock_irq(rq, &rf);
-		update_rq_clock(rq);
-		curr = rq->curr;
-		delta = rq_clock_task(rq) - curr->se.exec_start;
+	rq_lock_irq(rq, &rf);
+	curr = rq->curr;
+	if (is_idle_task(curr))
+		goto out_unlock;
 
-		/*
-		 * Make sure the next tick runs within a reasonable
-		 * amount of time.
-		 */
-		WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
-		curr->sched_class->task_tick(rq, curr, 0);
-		rq_unlock_irq(rq, &rf);
-	}
+	update_rq_clock(rq);
+	delta = rq_clock_task(rq) - curr->se.exec_start;
 
+	/*
+	 * Make sure the next tick runs within a reasonable
+	 * amount of time.
+	 */
+	WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+	curr->sched_class->task_tick(rq, curr, 0);
+
+out_unlock:
+	rq_unlock_irq(rq, &rf);
+
+out_requeue:
 	/*
 	 * Run the remote tick once per second (1Hz). This arbitrary
 	 * frequency is large enough to avoid overload but short enough
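
The sched_tick_remote() change above is the classic "check, lock, recheck" pattern: the idle_cpu() test taken without the run queue lock is only advisory, and the decision is repeated once rq->lock is held (is_idle_task(curr)). Below is a small userspace sketch of the same pattern with a pthread mutex; the names are invented and this is not kernel code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace sketch of "recheck under the lock"; names are invented. */
static pthread_mutex_t cpu_lock = PTHREAD_MUTEX_INITIALIZER;
static bool cpu_idle;			/* authoritative only while cpu_lock is held */

static void run_remote_tick(void)
{
	printf("ticking a busy CPU\n");
}

static void remote_tick(void)
{
	/* Lockless hint: skip the expensive path if the CPU looks idle. */
	if (cpu_idle)
		return;

	pthread_mutex_lock(&cpu_lock);
	/* The CPU may have gone idle between the hint and taking the lock. */
	if (!cpu_idle)
		run_remote_tick();
	pthread_mutex_unlock(&cpu_lock);
}

int main(void)
{
	remote_tick();		/* cpu_idle == false: the tick runs */
	cpu_idle = true;
	remote_tick();		/* skipped by the lockless hint */
	return 0;
}

Only the recheck under the lock is authoritative; the unlocked test merely avoids taking the lock in the common case, which is exactly why the warning in the old code could fire spuriously when the remote CPU went idle in between.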

kernel/sched/cpufreq_schedutil.c (+1 -1)

···
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
 
-	if (rq->rt.rt_nr_running)
+	if (rt_rq_is_runnable(&rq->rt))
 		return sg_cpu->max;
 
 	/*

kernel/sched/fair.c (+22 -23)

···
 	if (!sched_feat(UTIL_EST))
 		return;
 
-	/*
-	 * Update root cfs_rq's estimated utilization
-	 *
-	 * If *p is the last task then the root cfs_rq's estimated utilization
-	 * of a CPU is 0 by definition.
-	 */
-	ue.enqueued = 0;
-	if (cfs_rq->nr_running) {
-		ue.enqueued = cfs_rq->avg.util_est.enqueued;
-		ue.enqueued -= min_t(unsigned int, ue.enqueued,
-				     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
-	}
+	/* Update root cfs_rq's estimated utilization */
+	ue.enqueued = cfs_rq->avg.util_est.enqueued;
+	ue.enqueued -= min_t(unsigned int, ue.enqueued,
+			     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 
 	/*
···
 	now = sched_clock_cpu(smp_processor_id());
 	cfs_b->runtime = cfs_b->quota;
 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+	cfs_b->expires_seq++;
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
···
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
 	u64 amount = 0, min_amount, expires;
+	int expires_seq;
 
 	/* note: this is a positive sum as runtime_remaining <= 0 */
 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
···
 			cfs_b->idle = 0;
 		}
 	}
+	expires_seq = cfs_b->expires_seq;
 	expires = cfs_b->runtime_expires;
 	raw_spin_unlock(&cfs_b->lock);
···
 	 * spread between our sched_clock and the one on which runtime was
 	 * issued.
 	 */
-	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+	if (cfs_rq->expires_seq != expires_seq) {
+		cfs_rq->expires_seq = expires_seq;
 		cfs_rq->runtime_expires = expires;
+	}
 
 	return cfs_rq->runtime_remaining > 0;
 }
···
 	 * has not truly expired.
 	 *
 	 * Fortunately we can check determine whether this the case by checking
-	 * whether the global deadline has advanced. It is valid to compare
-	 * cfs_b->runtime_expires without any locks since we only care about
-	 * exact equality, so a partial write will still work.
+	 * whether the global deadline(cfs_b->expires_seq) has advanced.
 	 */
-
-	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+	if (cfs_rq->expires_seq == cfs_b->expires_seq) {
 		/* extend local deadline, drift is bounded above by 2 ticks */
 		cfs_rq->runtime_expires += TICK_NSEC;
 	} else {
···
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
+	u64 overrun;
+
 	lockdep_assert_held(&cfs_b->lock);
 
-	if (!cfs_b->period_active) {
-		cfs_b->period_active = 1;
-		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
-		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
-	}
+	if (cfs_b->period_active)
+		return;
+
+	cfs_b->period_active = 1;
+	overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+	cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
+	cfs_b->expires_seq++;
+	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
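
The expires_seq change replaces a comparison of two timestamps taken from per-CPU clocks, which can drift apart, with a comparison of a monotonically increasing sequence number: any change of cfs_b->expires_seq means a new global period has started. A minimal sketch of that idea in plain C follows; the struct and function names are invented and do not match the kernel data structures.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace sketch of the sequence-number expiry check; names invented. */
struct global_bw {
	int      expires_seq;		/* bumped on every runtime refill */
	uint64_t runtime_expires;	/* deadline of the current period */
};

struct local_bw {
	int      expires_seq;		/* last global sequence we adopted */
	uint64_t runtime_expires;
};

/* Returns true if the local view was refreshed to a new global period. */
static bool sync_local_expiry(struct local_bw *l, const struct global_bw *g)
{
	if (l->expires_seq == g->expires_seq)
		return false;	/* same period: clock drift is irrelevant */

	l->expires_seq = g->expires_seq;
	l->runtime_expires = g->runtime_expires;
	return true;
}

int main(void)
{
	struct global_bw g = { .expires_seq = 1, .runtime_expires = 1000 };
	struct local_bw  l = { 0 };

	printf("refreshed: %d\n", sync_local_expiry(&l, &g));	/* 1 */
	printf("refreshed: %d\n", sync_local_expiry(&l, &g));	/* 0 */
	return 0;
}

Because only the sequence number decides whether to adopt a new deadline, the result no longer depends on the two clocks agreeing, which is what led to the bogus throttling described in the merge message.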

kernel/sched/rt.c (+10 -6)

···
 
 	rt_se = rt_rq->tg->rt_se[cpu];
 
-	if (!rt_se)
+	if (!rt_se) {
 		dequeue_top_rt_rq(rt_rq);
+		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
+	}
 	else if (on_rt_rq(rt_se))
 		dequeue_rt_entity(rt_se, 0);
 }
···
 	sub_nr_running(rq, rt_rq->rt_nr_running);
 	rt_rq->rt_queued = 0;
 
-	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
-	cpufreq_update_util(rq, 0);
 }
 
 static void
···
 
 	if (rt_rq->rt_queued)
 		return;
-	if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+
+	if (rt_rq_throttled(rt_rq))
 		return;
 
-	add_nr_running(rq, rt_rq->rt_nr_running);
-	rt_rq->rt_queued = 1;
+	if (rt_rq->rt_nr_running) {
+		add_nr_running(rq, rt_rq->rt_nr_running);
+		rt_rq->rt_queued = 1;
+	}
 
 	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
 	cpufreq_update_util(rq, 0);

kernel/sched/sched.h (+9 -2)

···
 	u64			runtime;
 	s64			hierarchical_quota;
 	u64			runtime_expires;
+	int			expires_seq;
 
-	int			idle;
-	int			period_active;
+	short			idle;
+	short			period_active;
 	struct hrtimer		period_timer;
 	struct hrtimer		slack_timer;
 	struct list_head	throttled_cfs_rq;
···
 
 #ifdef CONFIG_CFS_BANDWIDTH
 	int			runtime_enabled;
+	int			expires_seq;
 	u64			runtime_expires;
 	s64			runtime_remaining;
 
···
 	struct task_group	*tg;
 #endif
 };
+
+static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
+{
+	return rt_rq->rt_queued && rt_rq->rt_nr_running;
+}
 
 /* Deadline class' related fields in a runqueue */
 struct dl_rq {
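
The new rt_rq_is_runnable() helper treats an RT runqueue as contributing load only when it is both accounted (rt_queued) and actually has tasks (rt_nr_running); the schedutil hunk earlier switches to it so the governor stops requesting the maximum frequency on a stale or dequeued RT count. A small illustrative model in plain C, with invented names rather than the kernel structures:

#include <stdbool.h>
#include <stdio.h>

/* Userspace model of the helper's semantics; names are invented. */
struct rt_rq_model {
	bool         rt_queued;		/* accounted into the runqueue?  */
	unsigned int rt_nr_running;	/* number of runnable RT tasks   */
};

static bool rt_rq_is_runnable(const struct rt_rq_model *rt_rq)
{
	return rt_rq->rt_queued && rt_rq->rt_nr_running;
}

/* Governor-style decision: request fmax only for real, runnable RT load. */
static unsigned long pick_freq(const struct rt_rq_model *rt,
			       unsigned long fmax, unsigned long util_freq)
{
	return rt_rq_is_runnable(rt) ? fmax : util_freq;
}

int main(void)
{
	struct rt_rq_model rt = { .rt_queued = false, .rt_nr_running = 2 };

	/* Not queued: fall back to the utilization-based estimate. */
	printf("%lu\n", pick_freq(&rt, 2000000, 800000));
	rt.rt_queued = true;
	printf("%lu\n", pick_freq(&rt, 2000000, 800000));
	return 0;
}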