Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'tip/sched/core' into for-6.12

To receive 863ccdbb918a ("sched: Allow sched_class::dequeue_task() to fail")
which makes sched_class.dequeue_task() return bool instead of void. This
leads to compile breakage and will be fixed by a follow-up patch.

Signed-off-by: Tejun Heo <tj@kernel.org>

+577 -194
+1 -1
fs/bcachefs/six.c
··· 335 335 */ 336 336 rcu_read_lock(); 337 337 struct task_struct *owner = READ_ONCE(lock->owner); 338 - bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); 338 + bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); 339 339 rcu_read_unlock(); 340 340 341 341 return ret;
+1 -1
fs/select.c
··· 82 82 * Realtime tasks get a slack of 0 for obvious reasons. 83 83 */ 84 84 85 - if (rt_task(current)) 85 + if (rt_or_dl_task(current)) 86 86 return 0; 87 87 88 88 ktime_get_ts64(&now);
+1 -1
include/linux/ioprio.h
··· 40 40 { 41 41 if (task->policy == SCHED_IDLE) 42 42 return IOPRIO_CLASS_IDLE; 43 - else if (task_is_realtime(task)) 43 + else if (rt_or_dl_task_policy(task)) 44 44 return IOPRIO_CLASS_RT; 45 45 else 46 46 return IOPRIO_CLASS_BE;
+9 -3
include/linux/sched.h
··· 151 151 * Special states are those that do not use the normal wait-loop pattern. See 152 152 * the comment with set_special_state(). 153 153 */ 154 - #define is_special_task_state(state) \ 155 - ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) 154 + #define is_special_task_state(state) \ 155 + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ 156 + TASK_DEAD | TASK_FROZEN)) 156 157 157 158 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 158 159 # define debug_normal_state_change(state_value) \ ··· 544 543 struct rb_node run_node; 545 544 u64 deadline; 546 545 u64 min_vruntime; 546 + u64 min_slice; 547 547 548 548 struct list_head group_node; 549 - unsigned int on_rq; 549 + unsigned char on_rq; 550 + unsigned char sched_delayed; 551 + unsigned char rel_deadline; 552 + unsigned char custom_slice; 553 + /* hole */ 550 554 551 555 u64 exec_start; 552 556 u64 sum_exec_runtime;
+7 -7
include/linux/sched/deadline.h
··· 10 10 11 11 #include <linux/sched.h> 12 12 13 - #define MAX_DL_PRIO 0 14 - 15 - static inline int dl_prio(int prio) 13 + static inline bool dl_prio(int prio) 16 14 { 17 - if (unlikely(prio < MAX_DL_PRIO)) 18 - return 1; 19 - return 0; 15 + return unlikely(prio < MAX_DL_PRIO); 20 16 } 21 17 22 - static inline int dl_task(struct task_struct *p) 18 + /* 19 + * Returns true if a task has a priority that belongs to DL class. PI-boosted 20 + * tasks will return true. Use dl_policy() to ignore PI-boosted tasks. 21 + */ 22 + static inline bool dl_task(struct task_struct *p) 23 23 { 24 24 return dl_prio(p->prio); 25 25 }
+1
include/linux/sched/prio.h
··· 14 14 */ 15 15 16 16 #define MAX_RT_PRIO 100 17 + #define MAX_DL_PRIO 0 17 18 18 19 #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) 19 20 #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
+27 -6
include/linux/sched/rt.h
··· 6 6 7 7 struct task_struct; 8 8 9 - static inline int rt_prio(int prio) 9 + static inline bool rt_prio(int prio) 10 10 { 11 - if (unlikely(prio < MAX_RT_PRIO)) 12 - return 1; 13 - return 0; 11 + return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); 14 12 } 15 13 16 - static inline int rt_task(struct task_struct *p) 14 + static inline bool rt_or_dl_prio(int prio) 15 + { 16 + return unlikely(prio < MAX_RT_PRIO); 17 + } 18 + 19 + /* 20 + * Returns true if a task has a priority that belongs to RT class. PI-boosted 21 + * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. 22 + */ 23 + static inline bool rt_task(struct task_struct *p) 17 24 { 18 25 return rt_prio(p->prio); 19 26 } 20 27 21 - static inline bool task_is_realtime(struct task_struct *tsk) 28 + /* 29 + * Returns true if a task has a priority that belongs to RT or DL classes. 30 + * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore 31 + * PI-boosted tasks. 32 + */ 33 + static inline bool rt_or_dl_task(struct task_struct *p) 34 + { 35 + return rt_or_dl_prio(p->prio); 36 + } 37 + 38 + /* 39 + * Returns true if a task has a policy that belongs to RT or DL classes. 40 + * PI-boosted tasks will return false. 41 + */ 42 + static inline bool rt_or_dl_task_policy(struct task_struct *tsk) 22 43 { 23 44 int policy = tsk->policy; 24 45
+1 -1
kernel/freezer.c
··· 72 72 bool freeze; 73 73 74 74 raw_spin_lock_irq(&current->pi_lock); 75 - set_current_state(TASK_FROZEN); 75 + WRITE_ONCE(current->__state, TASK_FROZEN); 76 76 /* unstale saved_state so that __thaw_task() will wake us up */ 77 77 current->saved_state = TASK_RUNNING; 78 78 raw_spin_unlock_irq(&current->pi_lock);
+2 -2
kernel/locking/rtmutex.c
··· 347 347 { 348 348 int prio = task->prio; 349 349 350 - if (!rt_prio(prio)) 350 + if (!rt_or_dl_prio(prio)) 351 351 return DEFAULT_PRIO; 352 352 353 353 return prio; ··· 435 435 * Note that RT tasks are excluded from same priority (lateral) 436 436 * steals to prevent the introduction of an unbounded latency. 437 437 */ 438 - if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) 438 + if (rt_or_dl_prio(waiter->tree.prio)) 439 439 return false; 440 440 441 441 return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
+2 -2
kernel/locking/rwsem.c
··· 631 631 * if it is an RT task or wait in the wait queue 632 632 * for too long. 633 633 */ 634 - if (has_handoff || (!rt_task(waiter->task) && 634 + if (has_handoff || (!rt_or_dl_task(waiter->task) && 635 635 !time_after(jiffies, waiter->timeout))) 636 636 return false; 637 637 ··· 914 914 if (owner_state != OWNER_WRITER) { 915 915 if (need_resched()) 916 916 break; 917 - if (rt_task(current) && 917 + if (rt_or_dl_task(current) && 918 918 (prev_owner_state != OWNER_WRITER)) 919 919 break; 920 920 }
+1 -1
kernel/locking/ww_mutex.h
··· 237 237 int a_prio = a->task->prio; 238 238 int b_prio = b->task->prio; 239 239 240 - if (rt_prio(a_prio) || rt_prio(b_prio)) { 240 + if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) { 241 241 242 242 if (a_prio > b_prio) 243 243 return true;
+55 -16
kernel/sched/core.c
··· 166 166 if (p->dl_server) 167 167 return -1; /* deadline */ 168 168 169 - if (rt_prio(p->prio)) /* includes deadline */ 169 + if (rt_or_dl_prio(p->prio)) 170 170 return p->prio; /* [-1, 99] */ 171 171 172 172 if (p->sched_class == &idle_sched_class) ··· 1702 1702 if (unlikely(!p->sched_class->uclamp_enabled)) 1703 1703 return; 1704 1704 1705 + if (p->se.sched_delayed) 1706 + return; 1707 + 1705 1708 for_each_clamp_id(clamp_id) 1706 1709 uclamp_rq_inc_id(rq, p, clamp_id); 1707 1710 ··· 1727 1724 return; 1728 1725 1729 1726 if (unlikely(!p->sched_class->uclamp_enabled)) 1727 + return; 1728 + 1729 + if (p->se.sched_delayed) 1730 1730 return; 1731 1731 1732 1732 for_each_clamp_id(clamp_id) ··· 2011 2005 psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); 2012 2006 } 2013 2007 2014 - uclamp_rq_inc(rq, p); 2015 2008 p->sched_class->enqueue_task(rq, p, flags); 2009 + /* 2010 + * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear 2011 + * ->sched_delayed. 2012 + */ 2013 + uclamp_rq_inc(rq, p); 2016 2014 2017 2015 if (sched_core_enabled(rq)) 2018 2016 sched_core_enqueue(rq, p); 2019 2017 } 2020 2018 2021 - void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 2019 + /* 2020 + * Must only return false when DEQUEUE_SLEEP. 2021 + */ 2022 + inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) 2022 2023 { 2023 2024 if (sched_core_enabled(rq)) 2024 2025 sched_core_dequeue(rq, p, flags); ··· 2038 2025 psi_dequeue(p, flags & DEQUEUE_SLEEP); 2039 2026 } 2040 2027 2028 + /* 2029 + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' 2030 + * and mark the task ->sched_delayed. 
2031 + */ 2041 2032 uclamp_rq_dec(rq, p); 2042 - p->sched_class->dequeue_task(rq, p, flags); 2033 + return p->sched_class->dequeue_task(rq, p, flags); 2043 2034 } 2044 2035 2045 2036 void activate_task(struct rq *rq, struct task_struct *p, int flags) ··· 2061 2044 2062 2045 void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 2063 2046 { 2064 - WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); 2047 + SCHED_WARN_ON(flags & DEQUEUE_SLEEP); 2048 + 2049 + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); 2065 2050 ASSERT_EXCLUSIVE_WRITER(p->on_rq); 2066 2051 2052 + /* 2053 + * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before* 2054 + * dequeue_task() and cleared *after* enqueue_task(). 2055 + */ 2056 + 2067 2057 dequeue_task(rq, p, flags); 2058 + } 2059 + 2060 + static void block_task(struct rq *rq, struct task_struct *p, int flags) 2061 + { 2062 + if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) 2063 + __block_task(rq, p); 2068 2064 } 2069 2065 2070 2066 /** ··· 3727 3697 3728 3698 rq = __task_rq_lock(p, &rf); 3729 3699 if (task_on_rq_queued(p)) { 3700 + update_rq_clock(rq); 3701 + if (p->se.sched_delayed) 3702 + enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); 3730 3703 if (!task_on_cpu(rq, p)) { 3731 3704 /* 3732 3705 * When on_rq && !on_cpu the task is preempted, see if 3733 3706 * it should preempt the task that is current now. 3734 3707 */ 3735 - update_rq_clock(rq); 3736 3708 wakeup_preempt(rq, p, wake_flags); 3737 3709 } 3738 3710 ttwu_do_wakeup(p); ··· 4123 4091 * case the whole 'p->on_rq && ttwu_runnable()' case below 4124 4092 * without taking any locks. 4125 4093 * 4094 + * Specifically, given current runs ttwu() we must be before 4095 + * schedule()'s block_task(), as such this must not observe 4096 + * sched_delayed. 
4097 + * 4126 4098 * In particular: 4127 4099 * - we rely on Program-Order guarantees for all the ordering, 4128 4100 * - we're serialized against set_special_state() by virtue of 4129 4101 * it disabling IRQs (this allows not taking ->pi_lock). 4130 4102 */ 4103 + SCHED_WARN_ON(p->se.sched_delayed); 4131 4104 if (!ttwu_state_match(p, state, &success)) 4132 4105 goto out; 4133 4106 ··· 4421 4384 p->se.nr_migrations = 0; 4422 4385 p->se.vruntime = 0; 4423 4386 p->se.vlag = 0; 4424 - p->se.slice = sysctl_sched_base_slice; 4425 4387 INIT_LIST_HEAD(&p->se.group_node); 4388 + 4389 + /* A delayed task cannot be in clone(). */ 4390 + SCHED_WARN_ON(p->se.sched_delayed); 4426 4391 4427 4392 #ifdef CONFIG_FAIR_GROUP_SCHED 4428 4393 p->se.cfs_rq = NULL; ··· 4677 4638 4678 4639 p->prio = p->normal_prio = p->static_prio; 4679 4640 set_load_weight(p, false); 4641 + p->se.custom_slice = 0; 4642 + p->se.slice = sysctl_sched_base_slice; 4680 4643 4681 4644 /* 4682 4645 * We don't need the reset flag anymore after the fork. It has ··· 6603 6562 if (signal_pending_state(prev_state, prev)) { 6604 6563 WRITE_ONCE(prev->__state, TASK_RUNNING); 6605 6564 } else { 6565 + int flags = DEQUEUE_NOCLOCK; 6566 + 6606 6567 prev->sched_contributes_to_load = 6607 6568 (prev_state & TASK_UNINTERRUPTIBLE) && 6608 6569 !(prev_state & TASK_NOLOAD) && 6609 6570 !(prev_state & TASK_FROZEN); 6610 6571 6611 - if (prev->sched_contributes_to_load) 6612 - rq->nr_uninterruptible++; 6572 + if (unlikely(is_special_task_state(prev_state))) 6573 + flags |= DEQUEUE_SPECIAL; 6613 6574 6614 6575 /* 6615 6576 * __schedule() ttwu() ··· 6624 6581 * 6625 6582 * After this, schedule() must not care about p->state any more. 
6626 6583 */ 6627 - deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); 6628 - 6629 - if (prev->in_iowait) { 6630 - atomic_inc(&rq->nr_iowait); 6631 - delayacct_blkio_start(); 6632 - } 6584 + block_task(rq, prev, flags); 6633 6585 } 6634 6586 switch_count = &prev->nvcsw; 6635 6587 } ··· 8499 8461 } 8500 8462 8501 8463 set_load_weight(&init_task, false); 8464 + init_task.se.slice = sysctl_sched_base_slice, 8502 8465 8503 8466 /* 8504 8467 * The boot idle thread does lazy MMU switching as well: ··· 8716 8677 schedstat_set(p->stats.sleep_start, 0); 8717 8678 schedstat_set(p->stats.block_start, 0); 8718 8679 8719 - if (!dl_task(p) && !rt_task(p)) { 8680 + if (!rt_or_dl_task(p)) { 8720 8681 /* 8721 8682 * Renice negative nice level userspace 8722 8683 * tasks back to 0:
+3 -2
kernel/sched/deadline.c
··· 2162 2162 enqueue_pushable_dl_task(rq, p); 2163 2163 } 2164 2164 2165 - static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) 2165 + static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) 2166 2166 { 2167 2167 update_curr_dl(rq); 2168 2168 ··· 2172 2172 dequeue_dl_entity(&p->dl, flags); 2173 2173 if (!p->dl.dl_throttled && !dl_server(&p->dl)) 2174 2174 dequeue_pushable_dl_task(rq, p); 2175 + 2176 + return true; 2175 2177 } 2176 2178 2177 2179 /* ··· 2428 2426 else 2429 2427 p = dl_se->server_pick_next(dl_se); 2430 2428 if (!p) { 2431 - WARN_ON_ONCE(1); 2432 2429 dl_se->dl_yielded = 1; 2433 2430 update_curr_dl_se(rq, dl_se, 0); 2434 2431 goto again;
+3 -2
kernel/sched/debug.c
··· 338 338 DL_PERIOD, 339 339 }; 340 340 341 - static unsigned long fair_server_period_max = (1 << 22) * NSEC_PER_USEC; /* ~4 seconds */ 341 + static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ 342 342 static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ 343 343 344 344 static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, ··· 739 739 else 740 740 SEQ_printf(m, " %c", task_state_to_char(p)); 741 741 742 - SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", 742 + SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", 743 743 p->comm, task_pid_nr(p), 744 744 SPLIT_NS(p->se.vruntime), 745 745 entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', 746 746 SPLIT_NS(p->se.deadline), 747 + p->se.custom_slice ? 'S' : ' ', 747 748 SPLIT_NS(p->se.slice), 748 749 SPLIT_NS(p->se.sum_exec_runtime), 749 750 (long long)(p->nvcsw + p->nivcsw),
+371 -125
kernel/sched/fair.c
··· 779 779 } 780 780 781 781 /* ensure we never gain time by being placed backwards. */ 782 - u64_u32_store(cfs_rq->min_vruntime, 783 - __update_min_vruntime(cfs_rq, vruntime)); 782 + cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); 783 + } 784 + 785 + static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) 786 + { 787 + struct sched_entity *root = __pick_root_entity(cfs_rq); 788 + struct sched_entity *curr = cfs_rq->curr; 789 + u64 min_slice = ~0ULL; 790 + 791 + if (curr && curr->on_rq) 792 + min_slice = curr->slice; 793 + 794 + if (root) 795 + min_slice = min(min_slice, root->min_slice); 796 + 797 + return min_slice; 784 798 } 785 799 786 800 static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) ··· 813 799 } 814 800 } 815 801 802 + static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) 803 + { 804 + if (node) { 805 + struct sched_entity *rse = __node_2_se(node); 806 + if (rse->min_slice < se->min_slice) 807 + se->min_slice = rse->min_slice; 808 + } 809 + } 810 + 816 811 /* 817 812 * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) 818 813 */ 819 814 static inline bool min_vruntime_update(struct sched_entity *se, bool exit) 820 815 { 821 816 u64 old_min_vruntime = se->min_vruntime; 817 + u64 old_min_slice = se->min_slice; 822 818 struct rb_node *node = &se->run_node; 823 819 824 820 se->min_vruntime = se->vruntime; 825 821 __min_vruntime_update(se, node->rb_right); 826 822 __min_vruntime_update(se, node->rb_left); 827 823 828 - return se->min_vruntime == old_min_vruntime; 824 + se->min_slice = se->slice; 825 + __min_slice_update(se, node->rb_right); 826 + __min_slice_update(se, node->rb_left); 827 + 828 + return se->min_vruntime == old_min_vruntime && 829 + se->min_slice == old_min_slice; 829 830 } 830 831 831 832 RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, ··· 853 824 { 854 825 avg_vruntime_add(cfs_rq, se); 855 826 se->min_vruntime = se->vruntime; 827 + 
se->min_slice = se->slice; 856 828 rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, 857 829 __entity_less, &min_vruntime_cb); 858 830 } ··· 1004 974 * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i 1005 975 * this is probably good enough. 1006 976 */ 1007 - static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) 977 + static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) 1008 978 { 1009 979 if ((s64)(se->vruntime - se->deadline) < 0) 1010 - return; 980 + return false; 1011 981 1012 982 /* 1013 983 * For EEVDF the virtual time slope is determined by w_i (iow. 1014 984 * nice) while the request time r_i is determined by 1015 985 * sysctl_sched_base_slice. 1016 986 */ 1017 - se->slice = sysctl_sched_base_slice; 987 + if (!se->custom_slice) 988 + se->slice = sysctl_sched_base_slice; 1018 989 1019 990 /* 1020 991 * EEVDF: vd_i = ve_i + r_i / w_i ··· 1025 994 /* 1026 995 * The task has consumed its request, reschedule. 1027 996 */ 1028 - if (cfs_rq->nr_running > 1) { 1029 - resched_curr(rq_of(cfs_rq)); 1030 - clear_buddies(cfs_rq, se); 1031 - } 997 + return true; 1032 998 } 1033 999 1034 1000 #include "pelt.h" ··· 1163 1135 dl_server_update(p->dl_server, delta_exec); 1164 1136 } 1165 1137 1138 + static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1139 + { 1140 + if (!sched_feat(PREEMPT_SHORT)) 1141 + return false; 1142 + 1143 + if (curr->vlag == curr->deadline) 1144 + return false; 1145 + 1146 + return !entity_eligible(cfs_rq, curr); 1147 + } 1148 + 1149 + static inline bool do_preempt_short(struct cfs_rq *cfs_rq, 1150 + struct sched_entity *pse, struct sched_entity *se) 1151 + { 1152 + if (!sched_feat(PREEMPT_SHORT)) 1153 + return false; 1154 + 1155 + if (pse->slice >= se->slice) 1156 + return false; 1157 + 1158 + if (!entity_eligible(cfs_rq, pse)) 1159 + return false; 1160 + 1161 + if (entity_before(pse, se)) 1162 + return true; 1163 + 1164 + if 
(!entity_eligible(cfs_rq, se)) 1165 + return true; 1166 + 1167 + return false; 1168 + } 1169 + 1166 1170 /* 1167 1171 * Used by other classes to account runtime. 1168 1172 */ ··· 1218 1158 struct sched_entity *curr = cfs_rq->curr; 1219 1159 struct rq *rq = rq_of(cfs_rq); 1220 1160 s64 delta_exec; 1161 + bool resched; 1221 1162 1222 1163 if (unlikely(!curr)) 1223 1164 return; ··· 1228 1167 return; 1229 1168 1230 1169 curr->vruntime += calc_delta_fair(delta_exec, curr); 1231 - update_deadline(cfs_rq, curr); 1170 + resched = update_deadline(cfs_rq, curr); 1232 1171 update_min_vruntime(cfs_rq); 1233 1172 1234 1173 if (entity_is_task(curr)) { ··· 1246 1185 } 1247 1186 1248 1187 account_cfs_rq_runtime(cfs_rq, delta_exec); 1188 + 1189 + if (rq->nr_running == 1) 1190 + return; 1191 + 1192 + if (resched || did_preempt_short(cfs_rq, curr)) { 1193 + resched_curr(rq); 1194 + clear_buddies(cfs_rq, curr); 1195 + } 1249 1196 } 1250 1197 1251 1198 static void update_curr_fair(struct rq *rq) ··· 5260 5191 u64 vslice, vruntime = avg_vruntime(cfs_rq); 5261 5192 s64 lag = 0; 5262 5193 5263 - se->slice = sysctl_sched_base_slice; 5194 + if (!se->custom_slice) 5195 + se->slice = sysctl_sched_base_slice; 5264 5196 vslice = calc_delta_fair(se->slice, se); 5265 5197 5266 5198 /* ··· 5342 5272 5343 5273 se->vruntime = vruntime - lag; 5344 5274 5275 + if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { 5276 + se->deadline += se->vruntime; 5277 + se->rel_deadline = 0; 5278 + return; 5279 + } 5280 + 5345 5281 /* 5346 5282 * When joining the competition; the existing tasks will be, 5347 5283 * on average, halfway through their slice, as such start tasks ··· 5366 5290 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); 5367 5291 5368 5292 static inline bool cfs_bandwidth_used(void); 5293 + 5294 + static void 5295 + requeue_delayed_entity(struct sched_entity *se); 5369 5296 5370 5297 static void 5371 5298 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ··· 
5457 5378 5458 5379 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); 5459 5380 5460 - static void 5381 + static bool 5461 5382 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 5462 5383 { 5463 - int action = UPDATE_TG; 5384 + bool sleep = flags & DEQUEUE_SLEEP; 5464 5385 5386 + update_curr(cfs_rq); 5387 + 5388 + if (flags & DEQUEUE_DELAYED) { 5389 + SCHED_WARN_ON(!se->sched_delayed); 5390 + } else { 5391 + bool delay = sleep; 5392 + /* 5393 + * DELAY_DEQUEUE relies on spurious wakeups, special task 5394 + * states must not suffer spurious wakeups, excempt them. 5395 + */ 5396 + if (flags & DEQUEUE_SPECIAL) 5397 + delay = false; 5398 + 5399 + SCHED_WARN_ON(delay && se->sched_delayed); 5400 + 5401 + if (sched_feat(DELAY_DEQUEUE) && delay && 5402 + !entity_eligible(cfs_rq, se)) { 5403 + if (cfs_rq->next == se) 5404 + cfs_rq->next = NULL; 5405 + update_load_avg(cfs_rq, se, 0); 5406 + se->sched_delayed = 1; 5407 + return false; 5408 + } 5409 + } 5410 + 5411 + int action = UPDATE_TG; 5465 5412 if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) 5466 5413 action |= DO_DETACH; 5467 - 5468 - /* 5469 - * Update run-time statistics of the 'current'. 
5470 - */ 5471 - update_curr(cfs_rq); 5472 5414 5473 5415 /* 5474 5416 * When dequeuing a sched_entity, we must: ··· 5508 5408 clear_buddies(cfs_rq, se); 5509 5409 5510 5410 update_entity_lag(cfs_rq, se); 5411 + if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { 5412 + se->deadline -= se->vruntime; 5413 + se->rel_deadline = 1; 5414 + } 5415 + 5511 5416 if (se != cfs_rq->curr) 5512 5417 __dequeue_entity(cfs_rq, se); 5513 5418 se->on_rq = 0; ··· 5532 5427 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) 5533 5428 update_min_vruntime(cfs_rq); 5534 5429 5430 + if (flags & DEQUEUE_DELAYED) { 5431 + se->sched_delayed = 0; 5432 + if (sched_feat(DELAY_ZERO) && se->vlag > 0) 5433 + se->vlag = 0; 5434 + } 5435 + 5535 5436 if (cfs_rq->nr_running == 0) 5536 5437 update_idle_cfs_rq_clock_pelt(cfs_rq); 5438 + 5439 + return true; 5537 5440 } 5538 5441 5539 5442 static void ··· 5567 5454 } 5568 5455 5569 5456 update_stats_curr_start(cfs_rq, se); 5457 + SCHED_WARN_ON(cfs_rq->curr); 5570 5458 cfs_rq->curr = se; 5571 5459 5572 5460 /* ··· 5588 5474 se->prev_sum_exec_runtime = se->sum_exec_runtime; 5589 5475 } 5590 5476 5477 + static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); 5478 + 5591 5479 /* 5592 5480 * Pick the next process, keeping these things in mind, in this order: 5593 5481 * 1) keep things fair between processes/task groups ··· 5598 5482 * 4) do not run the "skip" process, if something else is available 5599 5483 */ 5600 5484 static struct sched_entity * 5601 - pick_next_entity(struct cfs_rq *cfs_rq) 5485 + pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) 5602 5486 { 5603 5487 /* 5604 5488 * Enabling NEXT_BUDDY will affect latency but not fairness. 
5605 5489 */ 5606 5490 if (sched_feat(NEXT_BUDDY) && 5607 - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) 5491 + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { 5492 + /* ->next will never be delayed */ 5493 + SCHED_WARN_ON(cfs_rq->next->sched_delayed); 5608 5494 return cfs_rq->next; 5495 + } 5609 5496 5610 - return pick_eevdf(cfs_rq); 5497 + struct sched_entity *se = pick_eevdf(cfs_rq); 5498 + if (se->sched_delayed) { 5499 + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 5500 + SCHED_WARN_ON(se->sched_delayed); 5501 + SCHED_WARN_ON(se->on_rq); 5502 + return NULL; 5503 + } 5504 + return se; 5611 5505 } 5612 5506 5613 5507 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); ··· 5641 5515 /* in !on_rq case, update occurred at dequeue */ 5642 5516 update_load_avg(cfs_rq, prev, 0); 5643 5517 } 5518 + SCHED_WARN_ON(cfs_rq->curr != prev); 5644 5519 cfs_rq->curr = NULL; 5645 5520 } 5646 5521 ··· 5939 5812 idle_task_delta = cfs_rq->idle_h_nr_running; 5940 5813 for_each_sched_entity(se) { 5941 5814 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 5815 + int flags; 5816 + 5942 5817 /* throttled entity or throttle-on-deactivate */ 5943 5818 if (!se->on_rq) 5944 5819 goto done; 5945 5820 5946 - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); 5821 + /* 5822 + * Abuse SPECIAL to avoid delayed dequeue in this instance. 5823 + * This avoids teaching dequeue_entities() about throttled 5824 + * entities and keeps things relatively simple. 
5825 + */ 5826 + flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; 5827 + if (se->sched_delayed) 5828 + flags |= DEQUEUE_DELAYED; 5829 + dequeue_entity(qcfs_rq, se, flags); 5947 5830 5948 5831 if (cfs_rq_is_idle(group_cfs_rq(se))) 5949 5832 idle_task_delta = cfs_rq->h_nr_running; ··· 6046 5909 for_each_sched_entity(se) { 6047 5910 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6048 5911 6049 - if (se->on_rq) 5912 + if (se->on_rq) { 5913 + SCHED_WARN_ON(se->sched_delayed); 6050 5914 break; 5915 + } 6051 5916 enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); 6052 5917 6053 5918 if (cfs_rq_is_idle(group_cfs_rq(se))) ··· 6899 6760 } 6900 6761 #endif 6901 6762 6763 + static void 6764 + requeue_delayed_entity(struct sched_entity *se) 6765 + { 6766 + struct cfs_rq *cfs_rq = cfs_rq_of(se); 6767 + 6768 + /* 6769 + * se->sched_delayed should imply: se->on_rq == 1. 6770 + * Because a delayed entity is one that is still on 6771 + * the runqueue competing until elegibility. 6772 + */ 6773 + SCHED_WARN_ON(!se->sched_delayed); 6774 + SCHED_WARN_ON(!se->on_rq); 6775 + 6776 + if (sched_feat(DELAY_ZERO)) { 6777 + update_entity_lag(cfs_rq, se); 6778 + if (se->vlag > 0) { 6779 + cfs_rq->nr_running--; 6780 + if (se != cfs_rq->curr) 6781 + __dequeue_entity(cfs_rq, se); 6782 + se->vlag = 0; 6783 + place_entity(cfs_rq, se, 0); 6784 + if (se != cfs_rq->curr) 6785 + __enqueue_entity(cfs_rq, se); 6786 + cfs_rq->nr_running++; 6787 + } 6788 + } 6789 + 6790 + update_load_avg(cfs_rq, se, 0); 6791 + se->sched_delayed = 0; 6792 + } 6793 + 6902 6794 /* 6903 6795 * The enqueue_task method is called before nr_running is 6904 6796 * increased. 
Here we update the fair scheduling stats and ··· 6943 6773 int idle_h_nr_running = task_has_idle_policy(p); 6944 6774 int task_new = !(flags & ENQUEUE_WAKEUP); 6945 6775 int rq_h_nr_running = rq->cfs.h_nr_running; 6776 + u64 slice = 0; 6777 + 6778 + if (flags & ENQUEUE_DELAYED) { 6779 + requeue_delayed_entity(se); 6780 + return; 6781 + } 6946 6782 6947 6783 /* 6948 6784 * The code below (indirectly) updates schedutil which looks at ··· 6967 6791 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); 6968 6792 6969 6793 for_each_sched_entity(se) { 6970 - if (se->on_rq) 6794 + if (se->on_rq) { 6795 + if (se->sched_delayed) 6796 + requeue_delayed_entity(se); 6971 6797 break; 6798 + } 6972 6799 cfs_rq = cfs_rq_of(se); 6800 + 6801 + /* 6802 + * Basically set the slice of group entries to the min_slice of 6803 + * their respective cfs_rq. This ensures the group can service 6804 + * its entities in the desired time-frame. 6805 + */ 6806 + if (slice) { 6807 + se->slice = slice; 6808 + se->custom_slice = 1; 6809 + } 6973 6810 enqueue_entity(cfs_rq, se, flags); 6811 + slice = cfs_rq_min_slice(cfs_rq); 6974 6812 6975 6813 cfs_rq->h_nr_running++; 6976 6814 cfs_rq->idle_h_nr_running += idle_h_nr_running; ··· 7005 6815 update_load_avg(cfs_rq, se, UPDATE_TG); 7006 6816 se_update_runnable(se); 7007 6817 update_cfs_group(se); 6818 + 6819 + se->slice = slice; 6820 + slice = cfs_rq_min_slice(cfs_rq); 7008 6821 7009 6822 cfs_rq->h_nr_running++; 7010 6823 cfs_rq->idle_h_nr_running += idle_h_nr_running; ··· 7056 6863 static void set_next_buddy(struct sched_entity *se); 7057 6864 7058 6865 /* 7059 - * The dequeue_task method is called before nr_running is 7060 - * decreased. We remove the task from the rbtree and 7061 - * update the fair scheduling stats: 6866 + * Basically dequeue_task_fair(), except it can deal with dequeue_entity() 6867 + * failing half-way through and resume the dequeue later. 
6868 + * 6869 + * Returns: 6870 + * -1 - dequeue delayed 6871 + * 0 - dequeue throttled 6872 + * 1 - dequeue complete 7062 6873 */ 7063 - static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 6874 + static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) 7064 6875 { 7065 - struct cfs_rq *cfs_rq; 7066 - struct sched_entity *se = &p->se; 7067 - int task_sleep = flags & DEQUEUE_SLEEP; 7068 - int idle_h_nr_running = task_has_idle_policy(p); 7069 6876 bool was_sched_idle = sched_idle_rq(rq); 7070 6877 int rq_h_nr_running = rq->cfs.h_nr_running; 6878 + bool task_sleep = flags & DEQUEUE_SLEEP; 6879 + bool task_delayed = flags & DEQUEUE_DELAYED; 6880 + struct task_struct *p = NULL; 6881 + int idle_h_nr_running = 0; 6882 + int h_nr_running = 0; 6883 + struct cfs_rq *cfs_rq; 6884 + u64 slice = 0; 7071 6885 7072 - util_est_dequeue(&rq->cfs, p); 6886 + if (entity_is_task(se)) { 6887 + p = task_of(se); 6888 + h_nr_running = 1; 6889 + idle_h_nr_running = task_has_idle_policy(p); 6890 + } else { 6891 + cfs_rq = group_cfs_rq(se); 6892 + slice = cfs_rq_min_slice(cfs_rq); 6893 + } 7073 6894 7074 6895 for_each_sched_entity(se) { 7075 6896 cfs_rq = cfs_rq_of(se); 7076 - dequeue_entity(cfs_rq, se, flags); 7077 6897 7078 - cfs_rq->h_nr_running--; 6898 + if (!dequeue_entity(cfs_rq, se, flags)) { 6899 + if (p && &p->se == se) 6900 + return -1; 6901 + 6902 + break; 6903 + } 6904 + 6905 + cfs_rq->h_nr_running -= h_nr_running; 7079 6906 cfs_rq->idle_h_nr_running -= idle_h_nr_running; 7080 6907 7081 6908 if (cfs_rq_is_idle(cfs_rq)) 7082 - idle_h_nr_running = 1; 6909 + idle_h_nr_running = h_nr_running; 7083 6910 7084 6911 /* end evaluation on encountering a throttled cfs_rq */ 7085 6912 if (cfs_rq_throttled(cfs_rq)) 7086 - goto dequeue_throttle; 6913 + return 0; 7087 6914 7088 6915 /* Don't dequeue parent if it has other entities besides us */ 7089 6916 if (cfs_rq->load.weight) { 6917 + slice = cfs_rq_min_slice(cfs_rq); 6918 + 7090 6919 /* Avoid 
re-evaluating load for this entity: */ 7091 6920 se = parent_entity(se); 7092 6921 /* ··· 7120 6905 break; 7121 6906 } 7122 6907 flags |= DEQUEUE_SLEEP; 6908 + flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); 7123 6909 } 7124 6910 7125 6911 for_each_sched_entity(se) { ··· 7130 6914 se_update_runnable(se); 7131 6915 update_cfs_group(se); 7132 6916 7133 - cfs_rq->h_nr_running--; 6917 + se->slice = slice; 6918 + slice = cfs_rq_min_slice(cfs_rq); 6919 + 6920 + cfs_rq->h_nr_running -= h_nr_running; 7134 6921 cfs_rq->idle_h_nr_running -= idle_h_nr_running; 7135 6922 7136 6923 if (cfs_rq_is_idle(cfs_rq)) 7137 - idle_h_nr_running = 1; 6924 + idle_h_nr_running = h_nr_running; 7138 6925 7139 6926 /* end evaluation on encountering a throttled cfs_rq */ 7140 6927 if (cfs_rq_throttled(cfs_rq)) 7141 - goto dequeue_throttle; 7142 - 6928 + return 0; 7143 6929 } 7144 6930 7145 - /* At this point se is NULL and we are at root level*/ 7146 - sub_nr_running(rq, 1); 6931 + sub_nr_running(rq, h_nr_running); 7147 6932 7148 6933 if (rq_h_nr_running && !rq->cfs.h_nr_running) 7149 6934 dl_server_stop(&rq->fair_server); ··· 7153 6936 if (unlikely(!was_sched_idle && sched_idle_rq(rq))) 7154 6937 rq->next_balance = jiffies; 7155 6938 7156 - dequeue_throttle: 7157 - util_est_update(&rq->cfs, p, task_sleep); 6939 + if (p && task_delayed) { 6940 + SCHED_WARN_ON(!task_sleep); 6941 + SCHED_WARN_ON(p->on_rq != 1); 6942 + 6943 + /* Fix-up what dequeue_task_fair() skipped */ 6944 + hrtick_update(rq); 6945 + 6946 + /* Fix-up what block_task() skipped. */ 6947 + __block_task(rq, p); 6948 + } 6949 + 6950 + return 1; 6951 + } 6952 + 6953 + /* 6954 + * The dequeue_task method is called before nr_running is 6955 + * decreased. 
We remove the task from the rbtree and 6956 + * update the fair scheduling stats: 6957 + */ 6958 + static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 6959 + { 6960 + util_est_dequeue(&rq->cfs, p); 6961 + 6962 + if (dequeue_entities(rq, &p->se, flags) < 0) { 6963 + util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); 6964 + return false; 6965 + } 6966 + 6967 + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); 7158 6968 hrtick_update(rq); 6969 + return true; 7159 6970 } 7160 6971 7161 6972 #ifdef CONFIG_SMP ··· 8565 8320 8566 8321 static void task_dead_fair(struct task_struct *p) 8567 8322 { 8568 - remove_entity_load_avg(&p->se); 8323 + struct sched_entity *se = &p->se; 8324 + 8325 + if (se->sched_delayed) { 8326 + struct rq_flags rf; 8327 + struct rq *rq; 8328 + 8329 + rq = task_rq_lock(p, &rf); 8330 + if (se->sched_delayed) { 8331 + update_rq_clock(rq); 8332 + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 8333 + } 8334 + task_rq_unlock(rq, p, &rf); 8335 + } 8336 + 8337 + remove_entity_load_avg(se); 8569 8338 } 8570 8339 8571 8340 /* ··· 8615 8356 static int 8616 8357 balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 8617 8358 { 8618 - if (rq->nr_running) 8359 + if (sched_fair_runnable(rq)) 8619 8360 return 1; 8620 8361 8621 8362 return sched_balance_newidle(rq, rf) != 0; ··· 8701 8442 cfs_rq = cfs_rq_of(se); 8702 8443 update_curr(cfs_rq); 8703 8444 /* 8704 - * XXX pick_eevdf(cfs_rq) != se ? 8445 + * If @p has a shorter slice than current and @p is eligible, override 8446 + * current's slice protection in order to allow preemption. 8447 + * 8448 + * Note that even if @p does not turn out to be the most eligible 8449 + * task at this moment, current's slice protection will be lost. 8450 + */ 8451 + if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) 8452 + se->vlag = se->deadline + 1; 8453 + 8454 + /* 8455 + * If @p has become the most eligible task, force preemption. 
8705 8456 */ 8706 8457 if (pick_eevdf(cfs_rq) == pse) 8707 8458 goto preempt; ··· 8722 8453 resched_curr(rq); 8723 8454 } 8724 8455 8725 - #ifdef CONFIG_SMP 8726 8456 static struct task_struct *pick_task_fair(struct rq *rq) 8727 8457 { 8728 8458 struct sched_entity *se; ··· 8733 8465 return NULL; 8734 8466 8735 8467 do { 8736 - struct sched_entity *curr = cfs_rq->curr; 8468 + /* Might not have done put_prev_entity() */ 8469 + if (cfs_rq->curr && cfs_rq->curr->on_rq) 8470 + update_curr(cfs_rq); 8737 8471 8738 - /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ 8739 - if (curr) { 8740 - if (curr->on_rq) 8741 - update_curr(cfs_rq); 8742 - else 8743 - curr = NULL; 8472 + if (unlikely(check_cfs_rq_runtime(cfs_rq))) 8473 + goto again; 8744 8474 8745 - if (unlikely(check_cfs_rq_runtime(cfs_rq))) 8746 - goto again; 8747 - } 8748 - 8749 - se = pick_next_entity(cfs_rq); 8475 + se = pick_next_entity(rq, cfs_rq); 8476 + if (!se) 8477 + goto again; 8750 8478 cfs_rq = group_cfs_rq(se); 8751 8479 } while (cfs_rq); 8752 8480 ··· 8756 8492 8757 8493 return task_of(se); 8758 8494 } 8759 - #endif 8760 8495 8761 8496 struct task_struct * 8762 8497 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 8763 8498 { 8764 - struct cfs_rq *cfs_rq = &rq->cfs; 8765 8499 struct sched_entity *se; 8766 8500 struct task_struct *p; 8767 8501 int new_tasks; 8768 8502 8769 8503 again: 8770 - if (!sched_fair_runnable(rq)) 8504 + p = pick_task_fair(rq); 8505 + if (!p) 8771 8506 goto idle; 8507 + se = &p->se; 8772 8508 8773 8509 #ifdef CONFIG_FAIR_GROUP_SCHED 8774 8510 if (!prev || prev->sched_class != &fair_sched_class) ··· 8780 8516 * 8781 8517 * Therefore attempt to avoid putting and setting the entire cgroup 8782 8518 * hierarchy, only change the part that actually changes. 
8783 - */ 8784 - 8785 - do { 8786 - struct sched_entity *curr = cfs_rq->curr; 8787 - 8788 - /* 8789 - * Since we got here without doing put_prev_entity() we also 8790 - * have to consider cfs_rq->curr. If it is still a runnable 8791 - * entity, update_curr() will update its vruntime, otherwise 8792 - * forget we've ever seen it. 8793 - */ 8794 - if (curr) { 8795 - if (curr->on_rq) 8796 - update_curr(cfs_rq); 8797 - else 8798 - curr = NULL; 8799 - 8800 - /* 8801 - * This call to check_cfs_rq_runtime() will do the 8802 - * throttle and dequeue its entity in the parent(s). 8803 - * Therefore the nr_running test will indeed 8804 - * be correct. 8805 - */ 8806 - if (unlikely(check_cfs_rq_runtime(cfs_rq))) { 8807 - cfs_rq = &rq->cfs; 8808 - 8809 - if (!cfs_rq->nr_running) 8810 - goto idle; 8811 - 8812 - goto simple; 8813 - } 8814 - } 8815 - 8816 - se = pick_next_entity(cfs_rq); 8817 - cfs_rq = group_cfs_rq(se); 8818 - } while (cfs_rq); 8819 - 8820 - p = task_of(se); 8821 - 8822 - /* 8519 + * 8823 8520 * Since we haven't yet done put_prev_entity and if the selected task 8824 8521 * is a different task than we started out with, try and touch the 8825 8522 * least amount of cfs_rqs. 
8826 8523 */ 8827 8524 if (prev != p) { 8828 8525 struct sched_entity *pse = &prev->se; 8526 + struct cfs_rq *cfs_rq; 8829 8527 8830 8528 while (!(cfs_rq = is_same_group(se, pse))) { 8831 8529 int se_depth = se->depth; ··· 8813 8587 if (prev) 8814 8588 put_prev_task(rq, prev); 8815 8589 8816 - do { 8817 - se = pick_next_entity(cfs_rq); 8818 - set_next_entity(cfs_rq, se); 8819 - cfs_rq = group_cfs_rq(se); 8820 - } while (cfs_rq); 8821 - 8822 - p = task_of(se); 8590 + for_each_sched_entity(se) 8591 + set_next_entity(cfs_rq_of(se), se); 8823 8592 8824 8593 done: __maybe_unused; 8825 8594 #ifdef CONFIG_SMP ··· 13093 12872 static void switched_from_fair(struct rq *rq, struct task_struct *p) 13094 12873 { 13095 12874 detach_task_cfs_rq(p); 12875 + /* 12876 + * Since this is called after changing class, this is a little weird 12877 + * and we cannot use DEQUEUE_DELAYED. 12878 + */ 12879 + if (p->se.sched_delayed) { 12880 + dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); 12881 + p->se.sched_delayed = 0; 12882 + p->se.rel_deadline = 0; 12883 + if (sched_feat(DELAY_ZERO) && p->se.vlag > 0) 12884 + p->se.vlag = 0; 12885 + } 13096 12886 } 13097 12887 13098 12888 static void switched_to_fair(struct rq *rq, struct task_struct *p) 13099 12889 { 12890 + SCHED_WARN_ON(p->se.sched_delayed); 12891 + 13100 12892 attach_task_cfs_rq(p); 13101 12893 13102 12894 set_task_max_allowed_capacity(p); ··· 13153 12919 /* ensure bandwidth has been allocated on our new cfs_rq */ 13154 12920 account_cfs_rq_runtime(cfs_rq, 0); 13155 12921 } 12922 + 12923 + if (!first) 12924 + return; 12925 + 12926 + SCHED_WARN_ON(se->sched_delayed); 13156 12927 } 13157 12928 13158 12929 void init_cfs_rq(struct cfs_rq *cfs_rq) 13159 12930 { 13160 12931 cfs_rq->tasks_timeline = RB_ROOT_CACHED; 13161 - u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); 12932 + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 13162 12933 #ifdef CONFIG_SMP 13163 12934 raw_spin_lock_init(&cfs_rq->removed.lock); 13164 12935 #endif 
··· 13265 13026 13266 13027 void unregister_fair_sched_group(struct task_group *tg) 13267 13028 { 13268 - unsigned long flags; 13269 - struct rq *rq; 13270 13029 int cpu; 13271 13030 13272 13031 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); 13273 13032 13274 13033 for_each_possible_cpu(cpu) { 13275 - if (tg->se[cpu]) 13276 - remove_entity_load_avg(tg->se[cpu]); 13034 + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; 13035 + struct sched_entity *se = tg->se[cpu]; 13036 + struct rq *rq = cpu_rq(cpu); 13037 + 13038 + if (se) { 13039 + if (se->sched_delayed) { 13040 + guard(rq_lock_irqsave)(rq); 13041 + if (se->sched_delayed) { 13042 + update_rq_clock(rq); 13043 + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 13044 + } 13045 + list_del_leaf_cfs_rq(cfs_rq); 13046 + } 13047 + remove_entity_load_avg(se); 13048 + } 13277 13049 13278 13050 /* 13279 13051 * Only empty task groups can be destroyed; so we can speculatively 13280 13052 * check on_list without danger of it being re-added. 13281 13053 */ 13282 - if (!tg->cfs_rq[cpu]->on_list) 13283 - continue; 13284 - 13285 - rq = cpu_rq(cpu); 13286 - 13287 - raw_spin_rq_lock_irqsave(rq, flags); 13288 - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 13289 - raw_spin_rq_unlock_irqrestore(rq, flags); 13054 + if (cfs_rq->on_list) { 13055 + guard(rq_lock_irqsave)(rq); 13056 + list_del_leaf_cfs_rq(cfs_rq); 13057 + } 13290 13058 } 13291 13059 } 13292 13060
+28
kernel/sched/features.h
··· 5 5 * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. 6 6 */ 7 7 SCHED_FEAT(PLACE_LAG, true) 8 + /* 9 + * Give new tasks half a slice to ease into the competition. 10 + */ 8 11 SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) 12 + /* 13 + * Preserve relative virtual deadline on 'migration'. 14 + */ 15 + SCHED_FEAT(PLACE_REL_DEADLINE, true) 16 + /* 17 + * Inhibit (wakeup) preemption until the current task has either matched the 18 + * 0-lag point or until it has exhausted its slice. 19 + */ 9 20 SCHED_FEAT(RUN_TO_PARITY, true) 21 + /* 22 + * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for 23 + * current. 24 + */ 25 + SCHED_FEAT(PREEMPT_SHORT, true) 10 26 11 27 /* 12 28 * Prefer to schedule the task we woke last (assuming it failed ··· 36 20 * cache buddy being migrated away, increases cache locality. 37 21 */ 38 22 SCHED_FEAT(CACHE_HOT_BUDDY, true) 23 + 24 + /* 25 + * Delay dequeueing tasks until they get selected or woken. 26 + * 27 + * By delaying the dequeue for non-eligible tasks, they remain in the 28 + * competition and can burn off their negative lag. When they get selected 29 + * they'll have positive lag by definition. 30 + * 31 + * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0. 32 + */ 33 + SCHED_FEAT(DELAY_DEQUEUE, true) 34 + SCHED_FEAT(DELAY_ZERO, true) 39 35 40 36 /* 41 37 * Allow wakeup-time preemption of the current task:
+2 -1
kernel/sched/idle.c
··· 484 484 * It is not legal to sleep in the idle task - print a warning 485 485 * message if some code attempts to do it: 486 486 */ 487 - static void 487 + static bool 488 488 dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) 489 489 { 490 490 raw_spin_rq_unlock_irq(rq); 491 491 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 492 492 dump_stack(); 493 493 raw_spin_rq_lock_irq(rq); 494 + return true; 494 495 } 495 496 496 497 /*
+3 -1
kernel/sched/rt.c
··· 1483 1483 enqueue_pushable_task(rq, p); 1484 1484 } 1485 1485 1486 - static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1486 + static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1487 1487 { 1488 1488 struct sched_rt_entity *rt_se = &p->rt; 1489 1489 ··· 1491 1491 dequeue_rt_entity(rt_se, flags); 1492 1492 1493 1493 dequeue_pushable_task(rq, p); 1494 + 1495 + return true; 1494 1496 } 1495 1497 1496 1498 /*
+26 -7
kernel/sched/sched.h
··· 68 68 #include <linux/wait_api.h> 69 69 #include <linux/wait_bit.h> 70 70 #include <linux/workqueue_api.h> 71 + #include <linux/delayacct.h> 71 72 72 73 #include <trace/events/power.h> 73 74 #include <trace/events/sched.h> ··· 646 645 u64 min_vruntime_fi; 647 646 #endif 648 647 649 - #ifndef CONFIG_64BIT 650 - u64 min_vruntime_copy; 651 - #endif 652 - 653 648 struct rb_root_cached tasks_timeline; 654 649 655 650 /* ··· 888 891 889 892 static inline long se_runnable(struct sched_entity *se) 890 893 { 894 + if (se->sched_delayed) 895 + return false; 896 + 891 897 if (entity_is_task(se)) 892 898 return !!se->on_rq; 893 899 else ··· 905 905 906 906 static inline long se_runnable(struct sched_entity *se) 907 907 { 908 + if (se->sched_delayed) 909 + return false; 910 + 908 911 return !!se->on_rq; 909 912 } 910 913 ··· 2320 2317 * 2321 2318 */ 2322 2319 2323 - #define DEQUEUE_SLEEP 0x01 2320 + #define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ 2324 2321 #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ 2325 2322 #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ 2326 2323 #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ 2324 + #define DEQUEUE_SPECIAL 0x10 2327 2325 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ 2326 + #define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ 2328 2327 2329 2328 #define ENQUEUE_WAKEUP 0x01 2330 2329 #define ENQUEUE_RESTORE 0x02 ··· 2342 2337 #endif 2343 2338 #define ENQUEUE_INITIAL 0x80 2344 2339 #define ENQUEUE_MIGRATING 0x100 2340 + #define ENQUEUE_DELAYED 0x200 2345 2341 2346 2342 #define RETRY_TASK ((void *)-1UL) 2347 2343 ··· 2361 2355 #endif 2362 2356 2363 2357 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 2364 - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 2358 + bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 2365 2359 void (*yield_task) (struct rq *rq); 2366 2360 bool (*yield_to_task)(struct rq *rq, struct 
task_struct *p); 2367 2361 ··· 2715 2709 2716 2710 /* Check if we still need preemption */ 2717 2711 sched_update_tick_dependency(rq); 2712 + } 2713 + 2714 + static inline void __block_task(struct rq *rq, struct task_struct *p) 2715 + { 2716 + WRITE_ONCE(p->on_rq, 0); 2717 + ASSERT_EXCLUSIVE_WRITER(p->on_rq); 2718 + if (p->sched_contributes_to_load) 2719 + rq->nr_uninterruptible++; 2720 + 2721 + if (p->in_iowait) { 2722 + atomic_inc(&rq->nr_iowait); 2723 + delayacct_blkio_start(); 2724 + } 2718 2725 } 2719 2726 2720 2727 extern void activate_task(struct rq *rq, struct task_struct *p, int flags); ··· 3755 3736 extern void __setscheduler_prio(struct task_struct *p, int prio); 3756 3737 extern void set_load_weight(struct task_struct *p, bool update_load); 3757 3738 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); 3758 - extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags); 3739 + extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); 3759 3740 3760 3741 extern void check_class_changing(struct rq *rq, struct task_struct *p, 3761 3742 const struct sched_class *prev_class);
+2 -1
kernel/sched/stop_task.c
··· 57 57 add_nr_running(rq, 1); 58 58 } 59 59 60 - static void 60 + static bool 61 61 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 62 62 { 63 63 sub_nr_running(rq, 1); 64 + return true; 64 65 } 65 66 66 67 static void yield_task_stop(struct rq *rq)
+24 -7
kernel/sched/syscalls.c
··· 57 57 * keep the priority unchanged. Otherwise, update priority 58 58 * to the normal priority: 59 59 */ 60 - if (!rt_prio(p->prio)) 60 + if (!rt_or_dl_prio(p->prio)) 61 61 return p->normal_prio; 62 62 return p->prio; 63 63 } ··· 420 420 421 421 p->policy = policy; 422 422 423 - if (dl_policy(policy)) 423 + if (dl_policy(policy)) { 424 424 __setparam_dl(p, attr); 425 - else if (fair_policy(policy)) 425 + } else if (fair_policy(policy)) { 426 426 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 427 + if (attr->sched_runtime) { 428 + p->se.custom_slice = 1; 429 + p->se.slice = clamp_t(u64, attr->sched_runtime, 430 + NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ 431 + NSEC_PER_MSEC*100); /* HZ=100 / 10 */ 432 + } else { 433 + p->se.custom_slice = 0; 434 + p->se.slice = sysctl_sched_base_slice; 435 + } 436 + } 427 437 428 438 /* 429 439 * __sched_setscheduler() ensures attr->sched_priority == 0 when ··· 733 723 * but store a possible modification of reset_on_fork. 734 724 */ 735 725 if (unlikely(policy == p->policy)) { 736 - if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 726 + if (fair_policy(policy) && 727 + (attr->sched_nice != task_nice(p) || 728 + (attr->sched_runtime != p->se.slice))) 737 729 goto change; 738 730 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 739 731 goto change; ··· 881 869 .sched_priority = param->sched_priority, 882 870 .sched_nice = PRIO_TO_NICE(p->static_prio), 883 871 }; 872 + 873 + if (p->se.custom_slice) 874 + attr.sched_runtime = p->se.slice; 884 875 885 876 /* Fixup the legacy SCHED_RESET_ON_FORK hack. 
*/ 886 877 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ··· 1051 1036 1052 1037 static void get_params(struct task_struct *p, struct sched_attr *attr) 1053 1038 { 1054 - if (task_has_dl_policy(p)) 1039 + if (task_has_dl_policy(p)) { 1055 1040 __getparam_dl(p, attr); 1056 - else if (task_has_rt_policy(p)) 1041 + } else if (task_has_rt_policy(p)) { 1057 1042 attr->sched_priority = p->rt_priority; 1058 - else 1043 + } else { 1059 1044 attr->sched_nice = task_nice(p); 1045 + attr->sched_runtime = p->se.slice; 1046 + } 1060 1047 } 1061 1048 1062 1049 /**
+3 -3
kernel/time/hrtimer.c
··· 1975 1975 * expiry. 1976 1976 */ 1977 1977 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 1978 - if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) 1978 + if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) 1979 1979 mode |= HRTIMER_MODE_HARD; 1980 1980 } 1981 1981 ··· 2075 2075 u64 slack; 2076 2076 2077 2077 slack = current->timer_slack_ns; 2078 - if (rt_task(current)) 2078 + if (rt_or_dl_task(current)) 2079 2079 slack = 0; 2080 2080 2081 2081 hrtimer_init_sleeper_on_stack(&t, clockid, mode); ··· 2280 2280 * Override any slack passed by the user if under 2281 2281 * rt constraints. 2282 2282 */ 2283 - if (rt_task(current)) 2283 + if (rt_or_dl_task(current)) 2284 2284 delta = 0; 2285 2285 2286 2286 hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
+1 -1
kernel/trace/trace_sched_wakeup.c
··· 547 547 * - wakeup_dl handles tasks belonging to sched_dl class only. 548 548 */ 549 549 if (tracing_dl || (wakeup_dl && !dl_task(p)) || 550 - (wakeup_rt && !dl_task(p) && !rt_task(p)) || 550 + (wakeup_rt && !rt_or_dl_task(p)) || 551 551 (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) 552 552 return; 553 553
+2 -2
mm/page-writeback.c
··· 418 418 bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; 419 419 420 420 tsk = current; 421 - if (rt_task(tsk)) { 421 + if (rt_or_dl_task(tsk)) { 422 422 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; 423 423 thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; 424 424 } ··· 477 477 else 478 478 dirty = vm_dirty_ratio * node_memory / 100; 479 479 480 - if (rt_task(tsk)) 480 + if (rt_or_dl_task(tsk)) 481 481 dirty += dirty / 4; 482 482 483 483 /*
+1 -1
mm/page_alloc.c
··· 4002 4002 */ 4003 4003 if (alloc_flags & ALLOC_MIN_RESERVE) 4004 4004 alloc_flags &= ~ALLOC_CPUSET; 4005 - } else if (unlikely(rt_task(current)) && in_task()) 4005 + } else if (unlikely(rt_or_dl_task(current)) && in_task()) 4006 4006 alloc_flags |= ALLOC_MIN_RESERVE; 4007 4007 4008 4008 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);