Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'tip/sched/core' into sched_ext/for-6.12

Pull in tip/sched/core to resolve two merge conflicts:

- 96fd6c65efc6 ("sched: Factor out update_other_load_avgs() from __update_blocked_others()")
5d871a63997f ("sched/fair: Move effective_cpu_util() and sched_cpu_util() in fair.c")

A simple context conflict. The former added __update_blocked_others() in
the same #ifdef CONFIG_SMP block that effective_cpu_util() and
sched_cpu_util() are in and the latter moved those functions to fair.c.
This makes __update_blocked_others() more out of place. Will follow up
with a patch to relocate.

- 96fd6c65efc6 ("sched: Factor out update_other_load_avgs() from __update_blocked_others()")
84d265281d6c ("sched/pelt: Use rq_clock_task() for hw_pressure")

The former factored out the body of __update_blocked_others() into
update_other_load_avgs(). The latter changed how update_hw_load_avg() is
called in the body. Resolved by applying the change to
update_other_load_avgs() instead.

Signed-off-by: Tejun Heo <tj@kernel.org>

+189 -152
+6 -8
Documentation/scheduler/sched-deadline.rst
··· 749 749 of the command line options. Please refer to rt-app documentation for more 750 750 details (`<rt-app-sources>/doc/*.json`). 751 751 752 - The second testing application is a modification of schedtool, called 753 - schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a 754 - certain pid/application. schedtool-dl is available at: 755 - https://github.com/scheduler-tools/schedtool-dl.git. 752 + The second testing application is done using chrt which has support 753 + for SCHED_DEADLINE. 756 754 757 755 The usage is straightforward:: 758 756 759 - # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app 757 + # chrt -d -T 10000000 -D 100000000 0 ./my_cpuhog_app 760 758 761 759 With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation 762 - of 10ms every 100ms (note that parameters are expressed in microseconds). 763 - You can also use schedtool to create a reservation for an already running 760 + of 10ms every 100ms (note that parameters are expressed in nanoseconds). 761 + You can also use chrt to create a reservation for an already running 764 762 application, given that you know its pid:: 765 763 766 - # schedtool -E -t 10000000:100000000 my_app_pid 764 + # chrt -d -T 10000000 -D 100000000 -p 0 my_app_pid 767 765 768 766 Appendix B. Minimal main() 769 767 ==========================
+3 -3
drivers/cpufreq/cppc_cpufreq.c
··· 224 224 * Fake (unused) bandwidth; workaround to "fix" 225 225 * priority inheritance. 226 226 */ 227 - .sched_runtime = 1000000, 228 - .sched_deadline = 10000000, 229 - .sched_period = 10000000, 227 + .sched_runtime = NSEC_PER_MSEC, 228 + .sched_deadline = 10 * NSEC_PER_MSEC, 229 + .sched_period = 10 * NSEC_PER_MSEC, 230 230 }; 231 231 int ret; 232 232
+3 -3
include/uapi/linux/sched/types.h
··· 58 58 * 59 59 * This is reflected by the following fields of the sched_attr structure: 60 60 * 61 - * @sched_deadline representative of the task's deadline 62 - * @sched_runtime representative of the task's runtime 63 - * @sched_period representative of the task's period 61 + * @sched_deadline representative of the task's deadline in nanoseconds 62 + * @sched_runtime representative of the task's runtime in nanoseconds 63 + * @sched_period representative of the task's period in nanoseconds 64 64 * 65 65 * Given this task model, there are a multiplicity of scheduling algorithms 66 66 * and policies, that can be used to ensure all the tasks will make their
+9 -1
kernel/kthread.c
··· 845 845 * event only cares about the address. 846 846 */ 847 847 trace_sched_kthread_work_execute_end(work, func); 848 - } else if (!freezing(current)) 848 + } else if (!freezing(current)) { 849 849 schedule(); 850 + } else { 851 + /* 852 + * Handle the case where the current remains 853 + * TASK_INTERRUPTIBLE. try_to_freeze() expects 854 + * the current to be TASK_RUNNING. 855 + */ 856 + __set_current_state(TASK_RUNNING); 857 + } 850 858 851 859 try_to_freeze(); 852 860 cond_resched();
+32 -19
kernel/sched/core.c
··· 267 267 268 268 void sched_core_enqueue(struct rq *rq, struct task_struct *p) 269 269 { 270 + if (p->se.sched_delayed) 271 + return; 272 + 270 273 rq->core->core_task_seq++; 271 274 272 275 if (!p->core_cookie) ··· 280 277 281 278 void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) 282 279 { 280 + if (p->se.sched_delayed) 281 + return; 282 + 283 283 rq->core->core_task_seq++; 284 284 285 285 if (sched_core_enqueued(p)) { ··· 6483 6477 * Constants for the sched_mode argument of __schedule(). 6484 6478 * 6485 6479 * The mode argument allows RT enabled kernels to differentiate a 6486 - * preemption from blocking on an 'sleeping' spin/rwlock. Note that 6487 - * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to 6488 - * optimize the AND operation out and just check for zero. 6480 + * preemption from blocking on an 'sleeping' spin/rwlock. 6489 6481 */ 6490 - #define SM_NONE 0x0 6491 - #define SM_PREEMPT 0x1 6492 - #define SM_RTLOCK_WAIT 0x2 6493 - 6494 - #ifndef CONFIG_PREEMPT_RT 6495 - # define SM_MASK_PREEMPT (~0U) 6496 - #else 6497 - # define SM_MASK_PREEMPT SM_PREEMPT 6498 - #endif 6482 + #define SM_IDLE (-1) 6483 + #define SM_NONE 0 6484 + #define SM_PREEMPT 1 6485 + #define SM_RTLOCK_WAIT 2 6499 6486 6500 6487 /* 6501 6488 * __schedule() is the main scheduler function. ··· 6529 6530 * 6530 6531 * WARNING: must be called with preemption disabled! 6531 6532 */ 6532 - static void __sched notrace __schedule(unsigned int sched_mode) 6533 + static void __sched notrace __schedule(int sched_mode) 6533 6534 { 6534 6535 struct task_struct *prev, *next; 6536 + /* 6537 + * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted 6538 + * as a preemption by schedule_debug() and RCU. 
6539 + */ 6540 + bool preempt = sched_mode > SM_NONE; 6535 6541 unsigned long *switch_count; 6536 6542 unsigned long prev_state; 6537 6543 struct rq_flags rf; ··· 6547 6543 rq = cpu_rq(cpu); 6548 6544 prev = rq->curr; 6549 6545 6550 - schedule_debug(prev, !!sched_mode); 6546 + schedule_debug(prev, preempt); 6551 6547 6552 6548 if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) 6553 6549 hrtick_clear(rq); 6554 6550 6555 6551 local_irq_disable(); 6556 - rcu_note_context_switch(!!sched_mode); 6552 + rcu_note_context_switch(preempt); 6557 6553 6558 6554 /* 6559 6555 * Make sure that signal_pending_state()->signal_pending() below ··· 6582 6578 6583 6579 switch_count = &prev->nivcsw; 6584 6580 6581 + /* Task state changes only considers SM_PREEMPT as preemption */ 6582 + preempt = sched_mode == SM_PREEMPT; 6583 + 6585 6584 /* 6586 6585 * We must load prev->state once (task_struct::state is volatile), such 6587 6586 * that we form a control dependency vs deactivate_task() below. 6588 6587 */ 6589 6588 prev_state = READ_ONCE(prev->__state); 6590 - if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { 6589 + if (sched_mode == SM_IDLE) { 6590 + if (!rq->nr_running) { 6591 + next = prev; 6592 + goto picked; 6593 + } 6594 + } else if (!preempt && prev_state) { 6591 6595 if (signal_pending_state(prev_state, prev)) { 6592 6596 WRITE_ONCE(prev->__state, TASK_RUNNING); 6593 6597 } else { ··· 6626 6614 } 6627 6615 6628 6616 next = pick_next_task(rq, prev, &rf); 6617 + picked: 6629 6618 clear_tsk_need_resched(prev); 6630 6619 clear_preempt_need_resched(); 6631 6620 #ifdef CONFIG_SCHED_DEBUG ··· 6668 6655 psi_account_irqtime(rq, prev, next); 6669 6656 psi_sched_switch(prev, next, !task_on_rq_queued(prev)); 6670 6657 6671 - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); 6658 + trace_sched_switch(preempt, prev, next, prev_state); 6672 6659 6673 6660 /* Also unlocks the rq: */ 6674 6661 rq = context_switch(rq, prev, next, &rf); ··· 6744 6731 } 6745 6732 } 6746 
6733 6747 - static __always_inline void __schedule_loop(unsigned int sched_mode) 6734 + static __always_inline void __schedule_loop(int sched_mode) 6748 6735 { 6749 6736 do { 6750 6737 preempt_disable(); ··· 6789 6776 */ 6790 6777 WARN_ON_ONCE(current->__state); 6791 6778 do { 6792 - __schedule(SM_NONE); 6779 + __schedule(SM_IDLE); 6793 6780 } while (need_resched()); 6794 6781 } 6795 6782
+3 -3
kernel/sched/cpufreq_schedutil.c
··· 662 662 * Fake (unused) bandwidth; workaround to "fix" 663 663 * priority inheritance. 664 664 */ 665 - .sched_runtime = 1000000, 666 - .sched_deadline = 10000000, 667 - .sched_period = 10000000, 665 + .sched_runtime = NSEC_PER_MSEC, 666 + .sched_deadline = 10 * NSEC_PER_MSEC, 667 + .sched_period = 10 * NSEC_PER_MSEC, 668 668 }; 669 669 struct cpufreq_policy *policy = sg_policy->policy; 670 670 int ret;
+23 -8
kernel/sched/debug.c
··· 739 739 else 740 740 SEQ_printf(m, " %c", task_state_to_char(p)); 741 741 742 - SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", 742 + SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", 743 743 p->comm, task_pid_nr(p), 744 744 SPLIT_NS(p->se.vruntime), 745 745 entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', ··· 750 750 (long long)(p->nvcsw + p->nivcsw), 751 751 p->prio); 752 752 753 - SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", 753 + SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld", 754 754 SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), 755 - SPLIT_NS(p->se.sum_exec_runtime), 756 755 SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), 757 756 SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); 758 757 759 758 #ifdef CONFIG_NUMA_BALANCING 760 - SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); 759 + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); 761 760 #endif 762 761 #ifdef CONFIG_CGROUP_SCHED 763 - SEQ_printf_task_group_path(m, task_group(p), " %s") 762 + SEQ_printf_task_group_path(m, task_group(p), " %s") 764 763 #endif 765 764 766 765 SEQ_printf(m, "\n"); ··· 771 772 772 773 SEQ_printf(m, "\n"); 773 774 SEQ_printf(m, "runnable tasks:\n"); 774 - SEQ_printf(m, " S task PID tree-key switches prio" 775 - " wait-time sum-exec sum-sleep\n"); 775 + SEQ_printf(m, " S task PID vruntime eligible " 776 + "deadline slice sum-exec switches " 777 + "prio wait-time sum-sleep sum-block" 778 + #ifdef CONFIG_NUMA_BALANCING 779 + " node group-id" 780 + #endif 781 + #ifdef CONFIG_CGROUP_SCHED 782 + " group-path" 783 + #endif 784 + "\n"); 776 785 SEQ_printf(m, "-------------------------------------------------------" 777 - "------------------------------------------------------\n"); 786 + "------------------------------------------------------" 787 + "------------------------------------------------------" 788 + #ifdef 
CONFIG_NUMA_BALANCING 789 + "--------------" 790 + #endif 791 + #ifdef CONFIG_CGROUP_SCHED 792 + "--------------" 793 + #endif 794 + "\n"); 778 795 779 796 rcu_read_lock(); 780 797 for_each_process_thread(g, p) {
+108 -7
kernel/sched/fair.c
··· 6949 6949 int rq_h_nr_running = rq->cfs.h_nr_running; 6950 6950 u64 slice = 0; 6951 6951 6952 - if (flags & ENQUEUE_DELAYED) { 6953 - requeue_delayed_entity(se); 6954 - return; 6955 - } 6956 - 6957 6952 /* 6958 6953 * The code below (indirectly) updates schedutil which looks at 6959 6954 * the cfs_rq utilization to select a frequency. 6960 6955 * Let's add the task's estimated utilization to the cfs_rq's 6961 6956 * estimated utilization, before we update schedutil. 6962 6957 */ 6963 - util_est_enqueue(&rq->cfs, p); 6958 + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) 6959 + util_est_enqueue(&rq->cfs, p); 6960 + 6961 + if (flags & ENQUEUE_DELAYED) { 6962 + requeue_delayed_entity(se); 6963 + return; 6964 + } 6964 6965 6965 6966 /* 6966 6967 * If in_iowait is set, the code below may not trigger any cpufreq ··· 7179 7178 */ 7180 7179 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 7181 7180 { 7182 - util_est_dequeue(&rq->cfs, p); 7181 + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) 7182 + util_est_dequeue(&rq->cfs, p); 7183 7183 7184 7184 if (dequeue_entities(rq, &p->se, flags) < 0) { 7185 7185 util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); ··· 8085 8083 p = NULL; 8086 8084 8087 8085 return cpu_util(cpu, p, -1, 0); 8086 + } 8087 + 8088 + /* 8089 + * This function computes an effective utilization for the given CPU, to be 8090 + * used for frequency selection given the linear relation: f = u * f_max. 8091 + * 8092 + * The scheduler tracks the following metrics: 8093 + * 8094 + * cpu_util_{cfs,rt,dl,irq}() 8095 + * cpu_bw_dl() 8096 + * 8097 + * Where the cfs,rt and dl util numbers are tracked with the same metric and 8098 + * synchronized windows and are thus directly comparable. 8099 + * 8100 + * The cfs,rt,dl utilization are the running times measured with rq->clock_task 8101 + * which excludes things like IRQ and steal-time. 
These latter are then accrued 8102 + * in the IRQ utilization. 8103 + * 8104 + * The DL bandwidth number OTOH is not a measured metric but a value computed 8105 + * based on the task model parameters and gives the minimal utilization 8106 + * required to meet deadlines. 8107 + */ 8108 + unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, 8109 + unsigned long *min, 8110 + unsigned long *max) 8111 + { 8112 + unsigned long util, irq, scale; 8113 + struct rq *rq = cpu_rq(cpu); 8114 + 8115 + scale = arch_scale_cpu_capacity(cpu); 8116 + 8117 + /* 8118 + * Early check to see if IRQ/steal time saturates the CPU, can be 8119 + * because of inaccuracies in how we track these -- see 8120 + * update_irq_load_avg(). 8121 + */ 8122 + irq = cpu_util_irq(rq); 8123 + if (unlikely(irq >= scale)) { 8124 + if (min) 8125 + *min = scale; 8126 + if (max) 8127 + *max = scale; 8128 + return scale; 8129 + } 8130 + 8131 + if (min) { 8132 + /* 8133 + * The minimum utilization returns the highest level between: 8134 + * - the computed DL bandwidth needed with the IRQ pressure which 8135 + * steals time to the deadline task. 8136 + * - The minimum performance requirement for CFS and/or RT. 8137 + */ 8138 + *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); 8139 + 8140 + /* 8141 + * When an RT task is runnable and uclamp is not used, we must 8142 + * ensure that the task will run at maximum compute capacity. 8143 + */ 8144 + if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) 8145 + *min = max(*min, scale); 8146 + } 8147 + 8148 + /* 8149 + * Because the time spend on RT/DL tasks is visible as 'lost' time to 8150 + * CFS tasks and we use the same metric to track the effective 8151 + * utilization (PELT windows are synchronized) we can directly add them 8152 + * to obtain the CPU's actual utilization. 
8153 + */ 8154 + util = util_cfs + cpu_util_rt(rq); 8155 + util += cpu_util_dl(rq); 8156 + 8157 + /* 8158 + * The maximum hint is a soft bandwidth requirement, which can be lower 8159 + * than the actual utilization because of uclamp_max requirements. 8160 + */ 8161 + if (max) 8162 + *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); 8163 + 8164 + if (util >= scale) 8165 + return scale; 8166 + 8167 + /* 8168 + * There is still idle time; further improve the number by using the 8169 + * IRQ metric. Because IRQ/steal time is hidden from the task clock we 8170 + * need to scale the task numbers: 8171 + * 8172 + * max - irq 8173 + * U' = irq + --------- * U 8174 + * max 8175 + */ 8176 + util = scale_irq_capacity(util, irq, scale); 8177 + util += irq; 8178 + 8179 + return min(scale, util); 8180 + } 8181 + 8182 + unsigned long sched_cpu_util(int cpu) 8183 + { 8184 + return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); 8088 8185 } 8089 8186 8090 8187 /*
+2 -100
kernel/sched/syscalls.c
··· 272 272 273 273 lockdep_assert_rq_held(rq); 274 274 275 + /* hw_pressure doesn't care about invariance */ 275 276 return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | 276 277 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | 277 - update_hw_load_avg(now, rq, hw_pressure) | 278 + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) | 278 279 update_irq_load_avg(rq, 0); 279 - } 280 - 281 - /* 282 - * This function computes an effective utilization for the given CPU, to be 283 - * used for frequency selection given the linear relation: f = u * f_max. 284 - * 285 - * The scheduler tracks the following metrics: 286 - * 287 - * cpu_util_{cfs,rt,dl,irq}() 288 - * cpu_bw_dl() 289 - * 290 - * Where the cfs,rt and dl util numbers are tracked with the same metric and 291 - * synchronized windows and are thus directly comparable. 292 - * 293 - * The cfs,rt,dl utilization are the running times measured with rq->clock_task 294 - * which excludes things like IRQ and steal-time. These latter are then accrued 295 - * in the IRQ utilization. 296 - * 297 - * The DL bandwidth number OTOH is not a measured metric but a value computed 298 - * based on the task model parameters and gives the minimal utilization 299 - * required to meet deadlines. 300 - */ 301 - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, 302 - unsigned long *min, 303 - unsigned long *max) 304 - { 305 - unsigned long util, irq, scale; 306 - struct rq *rq = cpu_rq(cpu); 307 - 308 - scale = arch_scale_cpu_capacity(cpu); 309 - 310 - /* 311 - * Early check to see if IRQ/steal time saturates the CPU, can be 312 - * because of inaccuracies in how we track these -- see 313 - * update_irq_load_avg(). 
314 - */ 315 - irq = cpu_util_irq(rq); 316 - if (unlikely(irq >= scale)) { 317 - if (min) 318 - *min = scale; 319 - if (max) 320 - *max = scale; 321 - return scale; 322 - } 323 - 324 - if (min) { 325 - /* 326 - * The minimum utilization returns the highest level between: 327 - * - the computed DL bandwidth needed with the IRQ pressure which 328 - * steals time to the deadline task. 329 - * - The minimum performance requirement for CFS and/or RT. 330 - */ 331 - *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); 332 - 333 - /* 334 - * When an RT task is runnable and uclamp is not used, we must 335 - * ensure that the task will run at maximum compute capacity. 336 - */ 337 - if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) 338 - *min = max(*min, scale); 339 - } 340 - 341 - /* 342 - * Because the time spend on RT/DL tasks is visible as 'lost' time to 343 - * CFS tasks and we use the same metric to track the effective 344 - * utilization (PELT windows are synchronized) we can directly add them 345 - * to obtain the CPU's actual utilization. 346 - */ 347 - util = util_cfs + cpu_util_rt(rq); 348 - util += cpu_util_dl(rq); 349 - 350 - /* 351 - * The maximum hint is a soft bandwidth requirement, which can be lower 352 - * than the actual utilization because of uclamp_max requirements. 353 - */ 354 - if (max) 355 - *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); 356 - 357 - if (util >= scale) 358 - return scale; 359 - 360 - /* 361 - * There is still idle time; further improve the number by using the 362 - * IRQ metric. 
Because IRQ/steal time is hidden from the task clock we 363 - * need to scale the task numbers: 364 - * 365 - * max - irq 366 - * U' = irq + --------- * U 367 - * max 368 - */ 369 - util = scale_irq_capacity(util, irq, scale); 370 - util += irq; 371 - 372 - return min(scale, util); 373 - } 374 - 375 - unsigned long sched_cpu_util(int cpu) 376 - { 377 - return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); 378 280 } 379 281 #endif /* CONFIG_SMP */ 380 282