Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched/cpuset: Keep track of SCHED_DEADLINE task in cpusets

Qais reported that iterating over all tasks when rebuilding root domains
for finding out which ones are DEADLINE and need their bandwidth
correctly restored on such root domains can be a costly operation (10+
ms delays on suspend-resume).

To fix the problem, keep track of the number of DEADLINE tasks belonging
to each cpuset and then use this information (in a follow-up patch) to only
perform the above iteration if DEADLINE tasks are actually present in
the cpuset for which a corresponding root domain is being rebuilt.

Reported-by: Qais Yousef <qyousef@layalina.io>
Link: https://lore.kernel.org/lkml/20230206221428.2125324-1-qyousef@layalina.io/
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Reviewed-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

Authored by Juri Lelli; committed by Tejun Heo.
Commit hashes: 6c24849f, 111cd11b

+47 lines added in total, across the four files below:
+4
include/linux/cpuset.h
··· 71 71 extern void cpuset_force_rebuild(void); 72 72 extern void cpuset_update_active_cpus(void); 73 73 extern void cpuset_wait_for_hotplug(void); 74 + extern void inc_dl_tasks_cs(struct task_struct *task); 75 + extern void dec_dl_tasks_cs(struct task_struct *task); 74 76 extern void cpuset_lock(void); 75 77 extern void cpuset_unlock(void); 76 78 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); ··· 191 189 192 190 static inline void cpuset_wait_for_hotplug(void) { } 193 191 192 + static inline void inc_dl_tasks_cs(struct task_struct *task) { } 193 + static inline void dec_dl_tasks_cs(struct task_struct *task) { } 194 194 static inline void cpuset_lock(void) { } 195 195 static inline void cpuset_unlock(void) { } 196 196
+4
kernel/cgroup/cgroup.c
··· 57 57 #include <linux/file.h> 58 58 #include <linux/fs_parser.h> 59 59 #include <linux/sched/cputime.h> 60 + #include <linux/sched/deadline.h> 60 61 #include <linux/psi.h> 61 62 #include <net/sock.h> 62 63 ··· 6683 6682 css_set_move_task(tsk, cset, NULL, false); 6684 6683 list_add_tail(&tsk->cg_list, &cset->dying_tasks); 6685 6684 cset->nr_tasks--; 6685 + 6686 + if (dl_task(tsk)) 6687 + dec_dl_tasks_cs(tsk); 6686 6688 6687 6689 WARN_ON_ONCE(cgroup_task_frozen(tsk)); 6688 6690 if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+25
kernel/cgroup/cpuset.c
··· 193 193 int use_parent_ecpus; 194 194 int child_ecpus_count; 195 195 196 + /* 197 + * number of SCHED_DEADLINE tasks attached to this cpuset, so that we 198 + * know when to rebuild associated root domain bandwidth information. 199 + */ 200 + int nr_deadline_tasks; 201 + 196 202 /* Invalid partition error code, not lock protected */ 197 203 enum prs_errcode prs_err; 198 204 ··· 249 243 static inline struct cpuset *parent_cs(struct cpuset *cs) 250 244 { 251 245 return css_cs(cs->css.parent); 246 + } 247 + 248 + void inc_dl_tasks_cs(struct task_struct *p) 249 + { 250 + struct cpuset *cs = task_cs(p); 251 + 252 + cs->nr_deadline_tasks++; 253 + } 254 + 255 + void dec_dl_tasks_cs(struct task_struct *p) 256 + { 257 + struct cpuset *cs = task_cs(p); 258 + 259 + cs->nr_deadline_tasks--; 252 260 } 253 261 254 262 /* bits in struct cpuset flags field */ ··· 2519 2499 ret = security_task_setscheduler(task); 2520 2500 if (ret) 2521 2501 goto out_unlock; 2502 + 2503 + if (dl_task(task)) { 2504 + cs->nr_deadline_tasks++; 2505 + cpuset_attach_old_cs->nr_deadline_tasks--; 2506 + } 2522 2507 } 2523 2508 2524 2509 /*
+14
kernel/sched/deadline.c
··· 16 16 * Fabio Checconi <fchecconi@gmail.com> 17 17 */ 18 18 19 + #include <linux/cpuset.h> 20 + 19 21 /* 20 22 * Default limits for DL period; on the top end we guard against small util 21 23 * tasks still getting ridiculously long effective runtimes, on the bottom end we ··· 2598 2596 if (task_on_rq_queued(p) && p->dl.dl_runtime) 2599 2597 task_non_contending(p); 2600 2598 2599 + /* 2600 + * In case a task is setscheduled out from SCHED_DEADLINE we need to 2601 + * keep track of that on its cpuset (for correct bandwidth tracking). 2602 + */ 2603 + dec_dl_tasks_cs(p); 2604 + 2601 2605 if (!task_on_rq_queued(p)) { 2602 2606 /* 2603 2607 * Inactive timer is armed. However, p is leaving DEADLINE and ··· 2643 2635 { 2644 2636 if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) 2645 2637 put_task_struct(p); 2638 + 2639 + /* 2640 + * In case a task is setscheduled to SCHED_DEADLINE we need to keep 2641 + * track of that on its cpuset (for correct bandwidth tracking). 2642 + */ 2643 + inc_dl_tasks_cs(p); 2646 2644 2647 2645 /* If p is not queued we will update its parameters at next wakeup. */ 2648 2646 if (!task_on_rq_queued(p)) {