Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull PSI updates from Ingo Molnar:

- Various performance optimizations, resulting in a 4%-9% speedup in
the mmtests/config-scheduler-perfpipe micro-benchmark.

- New interface to turn PSI on/off on a per-cgroup level.

* tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/psi: Per-cgroup PSI accounting disable/re-enable interface
sched/psi: Cache parent psi_group to speed up group iteration
sched/psi: Consolidate cgroup_psi()
sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure
sched/psi: Remove NR_ONCPU task accounting
sched/psi: Optimize task switch inside shared cgroups again
sched/psi: Move private helpers to sched/stats.h
sched/psi: Save percpu memory when !psi_cgroups_enabled
sched/psi: Don't create cgroup PSI files when psi_disabled
sched/psi: Fix periodic aggregation shut off

+362 -103
+23
Documentation/admin-guide/cgroup-v2.rst
···
     killing cgroups is a process directed operation, i.e. it affects
     the whole thread-group.
 
+  cgroup.pressure
+    A read-write single value file. The allowed values are "0" and "1".
+    The default is "1".
+
+    Writing "0" to the file will disable the cgroup PSI accounting.
+    Writing "1" to the file will re-enable the cgroup PSI accounting.
+
+    This control attribute is not hierarchical, so disabling or enabling
+    PSI accounting in a cgroup does not affect PSI accounting in
+    descendants and does not require enablement to be passed down from
+    the root.
+
+    The reason this control attribute exists is that PSI accounts stalls
+    for each cgroup separately and aggregates them at each level of the
+    hierarchy. This may cause non-negligible overhead for some workloads
+    deep in the hierarchy, in which case this control attribute can be
+    used to disable PSI accounting in the non-leaf cgroups.
+
+  irq.pressure
+    A read-write nested-keyed file.
+
+    Shows pressure stall information for IRQ/SOFTIRQ. See
+    :ref:`Documentation/accounting/psi.rst <psi>` for details.
+
   Controllers
   ===========
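The two files documented above can be exercised from userspace with nothing more than open()/write()/read(). The following is a minimal sketch, not part of the patch set: the cgroup path /sys/fs/cgroup/example.slice is a hypothetical placeholder, the kernel must carry this series, and irq.pressure additionally requires CONFIG_IRQ_TIME_ACCOUNTING. Note that while cgroup.pressure is "0", the {cpu,memory,io,irq}.pressure files of that cgroup are hidden.

/* Sketch: toggle per-cgroup PSI accounting and read IRQ pressure.
 * The cgroup path is a placeholder; error handling is reduced to perror(). */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define CGROUP "/sys/fs/cgroup/example.slice"    /* hypothetical cgroup */

static int write_str(const char *path, const char *val)
{
    int fd = open(path, O_WRONLY);

    if (fd < 0 || write(fd, val, strlen(val)) < 0) {
        perror(path);
        if (fd >= 0)
            close(fd);
        return -1;
    }
    return close(fd);
}

int main(void)
{
    char buf[256];
    ssize_t n;
    int fd;

    /* Disable PSI accounting for this cgroup only; descendants keep theirs. */
    if (write_str(CGROUP "/cgroup.pressure", "0"))
        return 1;

    /* ... run the workload ... */

    /* Re-enable accounting, which also un-hides the pressure files. */
    write_str(CGROUP "/cgroup.pressure", "1");

    fd = open(CGROUP "/irq.pressure", O_RDONLY);
    if (fd >= 0) {
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
            buf[n] = '\0';
            /* Expected shape: "full avg10=... avg60=... avg300=... total=..." */
            printf("%s", buf);
        }
        close(fd);
    }
    return 0;
}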
+3
include/linux/cgroup-defs.h
···
     struct cgroup_file procs_file;    /* handle for "cgroup.procs" */
     struct cgroup_file events_file;    /* handle for "cgroup.events" */
 
+    /* handles for "{cpu,memory,io,irq}.pressure" */
+    struct cgroup_file psi_files[NR_PSI_RESOURCES];
+
     /*
      * The bitmask of subsystems enabled on the child cgroups.
      * ->subtree_control is the one configured through
-5
include/linux/cgroup.h
···
     pr_cont_kernfs_path(cgrp->kn);
 }
 
-static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
-{
-    return cgrp->psi;
-}
-
 bool cgroup_psi_enabled(void);
 
 static inline void cgroup_init_kthreadd(void)
+8 -4
include/linux/psi.h
···
 #include <linux/sched.h>
 #include <linux/poll.h>
 #include <linux/cgroup-defs.h>
+#include <linux/cgroup.h>
 
 struct seq_file;
 struct css_set;
···
 extern struct psi_group psi_system;
 
 void psi_init(void);
-
-void psi_task_change(struct task_struct *task, int clear, int set);
-void psi_task_switch(struct task_struct *prev, struct task_struct *next,
-             bool sleep);
 
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
···
             poll_table *wait);
 
 #ifdef CONFIG_CGROUPS
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+    return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+}
+
 int psi_cgroup_alloc(struct cgroup *cgrp);
 void psi_cgroup_free(struct cgroup *cgrp);
 void cgroup_move_task(struct task_struct *p, struct css_set *to);
+void psi_cgroup_restart(struct psi_group *group);
 #endif
 
 #else /* CONFIG_PSI */
···
 {
     rcu_assign_pointer(p->cgroups, to);
 }
+static inline void psi_cgroup_restart(struct psi_group *group) {}
 #endif
 
 #endif /* CONFIG_PSI */
+20 -11
include/linux/psi_types.h
···
     NR_MEMSTALL,
     NR_RUNNING,
     /*
-     * This can't have values other than 0 or 1 and could be
-     * implemented as a bit flag. But for now we still have room
-     * in the first cacheline of psi_group_cpu, and this way we
-     * don't have to special case any state tracking for it.
-     */
-    NR_ONCPU,
-    /*
      * For IO and CPU stalls the presence of running/oncpu tasks
      * in the domain means a partial rather than a full stall.
      * For memory it's not so simple because of page reclaimers:
···
      * threads and memstall ones.
      */
     NR_MEMSTALL_RUNNING,
-    NR_PSI_TASK_COUNTS = 5,
+    NR_PSI_TASK_COUNTS = 4,
 };
 
 /* Task state bitmasks */
 #define TSK_IOWAIT    (1 << NR_IOWAIT)
 #define TSK_MEMSTALL    (1 << NR_MEMSTALL)
 #define TSK_RUNNING    (1 << NR_RUNNING)
-#define TSK_ONCPU    (1 << NR_ONCPU)
 #define TSK_MEMSTALL_RUNNING    (1 << NR_MEMSTALL_RUNNING)
+
+/* Only one task can be scheduled, no corresponding task count */
+#define TSK_ONCPU    (1 << NR_PSI_TASK_COUNTS)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
     PSI_IO,
     PSI_MEM,
     PSI_CPU,
-    NR_PSI_RESOURCES = 3,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+    PSI_IRQ,
+#endif
+    NR_PSI_RESOURCES,
 };
 
 /*
···
     PSI_MEM_FULL,
     PSI_CPU_SOME,
     PSI_CPU_FULL,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+    PSI_IRQ_FULL,
+#endif
     /* Only per-CPU, to weigh the CPU in the global average: */
     PSI_NONIDLE,
-    NR_PSI_STATES = 7,
+    NR_PSI_STATES,
 };
+
+/* Use one bit in the state mask to track TSK_ONCPU */
+#define PSI_ONCPU    (1 << NR_PSI_STATES)
 
 enum psi_aggregators {
     PSI_AVGS = 0,
···
 };
 
 struct psi_group {
+    struct psi_group *parent;
+    bool enabled;
+
     /* Protects data used by the aggregator */
     struct mutex avgs_lock;
 
···
 };
 
 #else /* CONFIG_PSI */
+
+#define NR_PSI_RESOURCES    0
 
 struct psi_group { };
 
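One way to read the psi_types.h change: the on-CPU task no longer gets its own tasks[] counter; it is carried as a single PSI_ONCPU bit in the per-CPU state mask, and the CPU SOME/FULL conditions in the reworked test_state() compare the runnable count against that boolean. The standalone sketch below mirrors just those two conditions for illustration; the enum and helper names are re-declared here and are not kernel code.

/* Illustrative only: how the CPU pressure states derive from the task
 * counts plus an on-CPU flag, mirroring the new test_state() conditions. */
#include <stdbool.h>
#include <stdio.h>

enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_MEMSTALL_RUNNING, NR_PSI_TASK_COUNTS };

static bool cpu_some(const unsigned int *tasks, bool oncpu)
{
    /* Runnable tasks beyond the one currently on the CPU => SOME stall. */
    return tasks[NR_RUNNING] > oncpu;
}

static bool cpu_full(const unsigned int *tasks, bool oncpu)
{
    /* Runnable tasks but none of this group on the CPU => FULL stall. */
    return tasks[NR_RUNNING] && !oncpu;
}

int main(void)
{
    unsigned int tasks[NR_PSI_TASK_COUNTS] = { [NR_RUNNING] = 2 };
    bool oncpu = true;    /* one of the two runnable tasks is executing */

    printf("CPU some=%d full=%d\n", cpu_some(tasks, oncpu), cpu_full(tasks, oncpu));
    return 0;
}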
+95 -9
kernel/cgroup/cgroup.c
···
 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
 {
     struct cgroup *cgrp = seq_css(seq)->cgroup;
-    struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+    struct psi_group *psi = cgroup_psi(cgrp);
 
     return psi_show(seq, psi, PSI_IO);
 }
 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
 {
     struct cgroup *cgrp = seq_css(seq)->cgroup;
-    struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+    struct psi_group *psi = cgroup_psi(cgrp);
 
     return psi_show(seq, psi, PSI_MEM);
 }
 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 {
     struct cgroup *cgrp = seq_css(seq)->cgroup;
-    struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+    struct psi_group *psi = cgroup_psi(cgrp);
 
     return psi_show(seq, psi, PSI_CPU);
 }
 
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
-                                     size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+                              size_t nbytes, enum psi_res res)
 {
     struct cgroup_file_ctx *ctx = of->priv;
     struct psi_trigger *new;
···
         return -EBUSY;
     }
 
-    psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+    psi = cgroup_psi(cgrp);
     new = psi_trigger_create(psi, buf, res);
     if (IS_ERR(new)) {
         cgroup_put(cgrp);
···
                                      char *buf, size_t nbytes,
                                      loff_t off)
 {
-    return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+    return pressure_write(of, buf, nbytes, PSI_IO);
 }
 
 static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
                                      char *buf, size_t nbytes,
                                      loff_t off)
 {
-    return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+    return pressure_write(of, buf, nbytes, PSI_MEM);
 }
 
 static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
                                      char *buf, size_t nbytes,
                                      loff_t off)
 {
-    return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+    return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+    struct cgroup *cgrp = seq_css(seq)->cgroup;
+    struct psi_group *psi = cgroup_psi(cgrp);
+
+    return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+                                     char *buf, size_t nbytes,
+                                     loff_t off)
+{
+    return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+    struct cgroup *cgrp = seq_css(seq)->cgroup;
+    struct psi_group *psi = cgroup_psi(cgrp);
+
+    seq_printf(seq, "%d\n", psi->enabled);
+
+    return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+                                     char *buf, size_t nbytes,
+                                     loff_t off)
+{
+    ssize_t ret;
+    int enable;
+    struct cgroup *cgrp;
+    struct psi_group *psi;
+
+    ret = kstrtoint(strstrip(buf), 0, &enable);
+    if (ret)
+        return ret;
+
+    if (enable < 0 || enable > 1)
+        return -ERANGE;
+
+    cgrp = cgroup_kn_lock_live(of->kn, false);
+    if (!cgrp)
+        return -ENOENT;
+
+    psi = cgroup_psi(cgrp);
+    if (psi->enabled != enable) {
+        int i;
+
+        /* show or hide {cpu,memory,io,irq}.pressure files */
+        for (i = 0; i < NR_PSI_RESOURCES; i++)
+            cgroup_file_show(&cgrp->psi_files[i], enable);
+
+        psi->enabled = enable;
+        if (enable)
+            psi_cgroup_restart(psi);
+    }
+
+    cgroup_kn_unlock(of->kn);
+
+    return nbytes;
 }
 
 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
···
 
 bool cgroup_psi_enabled(void)
 {
+    if (static_branch_likely(&psi_disabled))
+        return false;
+
     return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
 }
 
···
 #ifdef CONFIG_PSI
     {
         .name = "io.pressure",
+        .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
         .seq_show = cgroup_io_pressure_show,
         .write = cgroup_io_pressure_write,
         .poll = cgroup_pressure_poll,
···
     },
     {
         .name = "memory.pressure",
+        .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
         .seq_show = cgroup_memory_pressure_show,
         .write = cgroup_memory_pressure_write,
         .poll = cgroup_pressure_poll,
···
     },
     {
         .name = "cpu.pressure",
+        .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
         .seq_show = cgroup_cpu_pressure_show,
         .write = cgroup_cpu_pressure_write,
         .poll = cgroup_pressure_poll,
         .release = cgroup_pressure_release,
+    },
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+    {
+        .name = "irq.pressure",
+        .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+        .seq_show = cgroup_irq_pressure_show,
+        .write = cgroup_irq_pressure_write,
+        .poll = cgroup_pressure_poll,
+        .release = cgroup_pressure_release,
+    },
+#endif
+    {
+        .name = "cgroup.pressure",
+        .seq_show = cgroup_pressure_show,
+        .write = cgroup_pressure_write,
     },
 #endif /* CONFIG_PSI */
     { }    /* terminate */
+1
kernel/sched/core.c
···
 
     rq->prev_irq_time += irq_delta;
     delta -= irq_delta;
+    psi_account_irqtime(rq->curr, irq_delta);
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
     if (static_key_false((&paravirt_steal_rq_enabled))) {
+206 -74
kernel/sched/psi.c
···
 {
     int cpu;
 
+    group->enabled = true;
     for_each_possible_cpu(cpu)
         seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
     group->avg_last_update = sched_clock();
···
 {
     if (!psi_enable) {
         static_branch_enable(&psi_disabled);
+        static_branch_disable(&psi_cgroups_enabled);
         return;
     }
 
···
     group_init(&psi_system);
 }
 
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
 {
     switch (state) {
     case PSI_IO_SOME:
···
         return unlikely(tasks[NR_MEMSTALL] &&
             tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
     case PSI_CPU_SOME:
-        return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+        return unlikely(tasks[NR_RUNNING] > oncpu);
     case PSI_CPU_FULL:
-        return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+        return unlikely(tasks[NR_RUNNING] && !oncpu);
     case PSI_NONIDLE:
         return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
             tasks[NR_RUNNING];
···
               bool wake_clock)
 {
     struct psi_group_cpu *groupc;
-    u32 state_mask = 0;
     unsigned int t, m;
     enum psi_states s;
+    u32 state_mask;
 
     groupc = per_cpu_ptr(group->pcpu, cpu);
 
     /*
-     * First we assess the aggregate resource states this CPU's
-     * tasks have been in since the last change, and account any
-     * SOME and FULL time these may have resulted in.
-     *
-     * Then we update the task counts according to the state
+     * First we update the task counts according to the state
      * change requested through the @clear and @set bits.
+     *
+     * Then, if cgroup PSI stats accounting is enabled, we
+     * assess the aggregate resource states this CPU's tasks
+     * have been in since the last change, and account any
+     * SOME and FULL time these may have resulted in.
      */
     write_seqcount_begin(&groupc->seq);
 
-    record_times(groupc, now);
+    /*
+     * Start with TSK_ONCPU, which doesn't have a corresponding
+     * task count - it's just a boolean flag directly encoded in
+     * the state mask. Clear, set, or carry the current state if
+     * no changes are requested.
+     */
+    if (unlikely(clear & TSK_ONCPU)) {
+        state_mask = 0;
+        clear &= ~TSK_ONCPU;
+    } else if (unlikely(set & TSK_ONCPU)) {
+        state_mask = PSI_ONCPU;
+        set &= ~TSK_ONCPU;
+    } else {
+        state_mask = groupc->state_mask & PSI_ONCPU;
+    }
 
+    /*
+     * The rest of the state mask is calculated based on the task
+     * counts. Update those first, then construct the mask.
+     */
     for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
         if (!(m & (1 << t)))
             continue;
         if (groupc->tasks[t]) {
             groupc->tasks[t]--;
         } else if (!psi_bug) {
-            printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+            printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
                     cpu, t, groupc->tasks[0],
                     groupc->tasks[1], groupc->tasks[2],
-                    groupc->tasks[3], groupc->tasks[4],
-                    clear, set);
+                    groupc->tasks[3], clear, set);
             psi_bug = 1;
         }
     }
···
         if (set & (1 << t))
             groupc->tasks[t]++;
 
-    /* Calculate state mask representing active states */
+    if (!group->enabled) {
+        /*
+         * On the first group change after disabling PSI, conclude
+         * the current state and flush its time. This is unlikely
+         * to matter to the user, but aggregation (get_recent_times)
+         * may have already incorporated the live state into times_prev;
+         * avoid a delta sample underflow when PSI is later re-enabled.
+         */
+        if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+            record_times(groupc, now);
+
+        groupc->state_mask = state_mask;
+
+        write_seqcount_end(&groupc->seq);
+        return;
+    }
+
     for (s = 0; s < NR_PSI_STATES; s++) {
-        if (test_state(groupc->tasks, s))
+        if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
             state_mask |= (1 << s);
     }
···
      * task in a cgroup is in_memstall, the corresponding groupc
      * on that cpu is in PSI_MEM_FULL state.
      */
-    if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+    if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
         state_mask |= (1 << PSI_MEM_FULL);
+
+    record_times(groupc, now);
 
     groupc->state_mask = state_mask;
 
···
         schedule_delayed_work(&group->avgs_work, PSI_FREQ);
 }
 
-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static inline struct psi_group *task_psi_group(struct task_struct *task)
 {
-    if (*iter == &psi_system)
-        return NULL;
-
 #ifdef CONFIG_CGROUPS
-    if (static_branch_likely(&psi_cgroups_enabled)) {
-        struct cgroup *cgroup = NULL;
-
-        if (!*iter)
-            cgroup = task->cgroups->dfl_cgrp;
-        else
-            cgroup = cgroup_parent(*iter);
-
-        if (cgroup && cgroup_parent(cgroup)) {
-            *iter = cgroup;
-            return cgroup_psi(cgroup);
-        }
-    }
+    if (static_branch_likely(&psi_cgroups_enabled))
+        return cgroup_psi(task_dfl_cgroup(task));
 #endif
-    *iter = &psi_system;
     return &psi_system;
 }
···
 {
     int cpu = task_cpu(task);
     struct psi_group *group;
-    bool wake_clock = true;
-    void *iter = NULL;
     u64 now;
 
     if (!task->pid)
···
     psi_flags_change(task, clear, set);
 
     now = cpu_clock(cpu);
-    /*
-     * Periodic aggregation shuts off if there is a period of no
-     * task changes, so we wake it back up if necessary. However,
-     * don't do this if the task change is the aggregation worker
-     * itself going to sleep, or we'll ping-pong forever.
-     */
-    if (unlikely((clear & TSK_RUNNING) &&
-             (task->flags & PF_WQ_WORKER) &&
-             wq_worker_last_func(task) == psi_avgs_work))
-        wake_clock = false;
 
-    while ((group = iterate_groups(task, &iter)))
-        psi_group_change(group, cpu, clear, set, now, wake_clock);
+    group = task_psi_group(task);
+    do {
+        psi_group_change(group, cpu, clear, set, now, true);
+    } while ((group = group->parent));
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
···
 {
     struct psi_group *group, *common = NULL;
     int cpu = task_cpu(prev);
-    void *iter;
     u64 now = cpu_clock(cpu);
 
     if (next->pid) {
-        bool identical_state;
-
         psi_flags_change(next, 0, TSK_ONCPU);
         /*
-         * When switching between tasks that have an identical
-         * runtime state, the cgroup that contains both tasks
-         * we reach the first common ancestor. Iterate @next's
-         * ancestors only until we encounter @prev's ONCPU.
+         * Set TSK_ONCPU on @next's cgroups. If @next shares any
+         * ancestors with @prev, those will already have @prev's
+         * TSK_ONCPU bit set, and we can stop the iteration there.
          */
-        identical_state = prev->psi_flags == next->psi_flags;
-        iter = NULL;
-        while ((group = iterate_groups(next, &iter))) {
-            if (identical_state &&
-                per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+        group = task_psi_group(next);
+        do {
+            if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+                PSI_ONCPU) {
                 common = group;
                 break;
             }
 
             psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
-        }
+        } while ((group = group->parent));
     }
 
     if (prev->pid) {
         int clear = TSK_ONCPU, set = 0;
+        bool wake_clock = true;
 
         /*
          * When we're going to sleep, psi_dequeue() lets us
···
                 clear |= TSK_MEMSTALL_RUNNING;
             if (prev->in_iowait)
                 set |= TSK_IOWAIT;
+
+            /*
+             * Periodic aggregation shuts off if there is a period of no
+             * task changes, so we wake it back up if necessary. However,
+             * don't do this if the task change is the aggregation worker
+             * itself going to sleep, or we'll ping-pong forever.
+             */
+            if (unlikely((prev->flags & PF_WQ_WORKER) &&
+                     wq_worker_last_func(prev) == psi_avgs_work))
+                wake_clock = false;
         }
 
         psi_flags_change(prev, clear, set);
 
-        iter = NULL;
-        while ((group = iterate_groups(prev, &iter)) && group != common)
-            psi_group_change(group, cpu, clear, set, now, true);
+        group = task_psi_group(prev);
+        do {
+            if (group == common)
+                break;
+            psi_group_change(group, cpu, clear, set, now, wake_clock);
+        } while ((group = group->parent));
 
         /*
-         * TSK_ONCPU is handled up to the common ancestor. If we're tasked
-         * with dequeuing too, finish that for the rest of the hierarchy.
+         * TSK_ONCPU is handled up to the common ancestor. If there are
+         * any other differences between the two tasks (e.g. prev goes
+         * to sleep, or only one task is memstall), finish propagating
+         * those differences all the way up to the root.
          */
-        if (sleep) {
+        if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
             clear &= ~TSK_ONCPU;
-            for (; group; group = iterate_groups(prev, &iter))
-                psi_group_change(group, cpu, clear, set, now, true);
+            for (; group; group = group->parent)
+                psi_group_change(group, cpu, clear, set, now, wake_clock);
         }
     }
 }
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+    int cpu = task_cpu(task);
+    struct psi_group *group;
+    struct psi_group_cpu *groupc;
+    u64 now;
+
+    if (!task->pid)
+        return;
+
+    now = cpu_clock(cpu);
+
+    group = task_psi_group(task);
+    do {
+        if (!group->enabled)
+            continue;
+
+        groupc = per_cpu_ptr(group->pcpu, cpu);
+
+        write_seqcount_begin(&groupc->seq);
+
+        record_times(groupc, now);
+        groupc->times[PSI_IRQ_FULL] += delta;
+
+        write_seqcount_end(&groupc->seq);
+
+        if (group->poll_states & (1 << PSI_IRQ_FULL))
+            psi_schedule_poll_work(group, 1);
+    } while ((group = group->parent));
+}
+#endif
 
 /**
  * psi_memstall_enter - mark the beginning of a memory stall section
···
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgroup)
 {
-    if (static_branch_likely(&psi_disabled))
+    if (!static_branch_likely(&psi_cgroups_enabled))
         return 0;
 
     cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
···
         return -ENOMEM;
     }
     group_init(cgroup->psi);
+    cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
     return 0;
 }
 
 void psi_cgroup_free(struct cgroup *cgroup)
 {
-    if (static_branch_likely(&psi_disabled))
+    if (!static_branch_likely(&psi_cgroups_enabled))
         return;
 
     cancel_delayed_work_sync(&cgroup->psi->avgs_work);
···
     struct rq_flags rf;
     struct rq *rq;
 
-    if (static_branch_likely(&psi_disabled)) {
+    if (!static_branch_likely(&psi_cgroups_enabled)) {
         /*
          * Lame to do this here, but the scheduler cannot be locked
          * from the outside, so we move cgroups from inside sched/.
···
 
     task_rq_unlock(rq, task, &rf);
 }
+
+void psi_cgroup_restart(struct psi_group *group)
+{
+    int cpu;
+
+    /*
+     * After we disable psi_group->enabled, we don't actually
+     * stop percpu task accounting in each psi_group_cpu;
+     * instead we only stop the test_state() loop, record_times()
+     * and the averaging worker, see psi_group_change() for details.
+     *
+     * When cgroup PSI is disabled, this function has nothing to sync
+     * since cgroup pressure files are hidden and percpu psi_group_cpu
+     * would see !psi_group->enabled and only do task accounting.
+     *
+     * When cgroup PSI is re-enabled, this function uses psi_group_change()
+     * to get the correct state mask from the test_state() loop on tasks[],
+     * and restarts groupc->state_start from now, using .clear = .set = 0
+     * here since no task status really changed.
+     */
+    if (!group->enabled)
+        return;
+
+    for_each_possible_cpu(cpu) {
+        struct rq *rq = cpu_rq(cpu);
+        struct rq_flags rf;
+        u64 now;
+
+        rq_lock_irq(rq, &rf);
+        now = cpu_clock(cpu);
+        psi_group_change(group, cpu, 0, 0, now, true);
+        rq_unlock_irq(rq, &rf);
+    }
+}
 #endif /* CONFIG_CGROUPS */
 
 int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 {
+    bool only_full = false;
     int full;
     u64 now;
 
···
     group->avg_next_update = update_averages(group, now);
     mutex_unlock(&group->avgs_lock);
 
-    for (full = 0; full < 2; full++) {
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+    only_full = res == PSI_IRQ;
+#endif
+
+    for (full = 0; full < 2 - only_full; full++) {
         unsigned long avg[3] = { 0, };
         u64 total = 0;
         int w;
···
     }
 
     seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
-           full ? "full" : "some",
+           full || only_full ? "full" : "some",
            LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
            LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
            LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
···
         state = PSI_IO_FULL + res * 2;
     else
         return ERR_PTR(-EINVAL);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+    if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+        return ERR_PTR(-EINVAL);
+#endif
 
     if (state >= PSI_NONIDLE)
         return ERR_PTR(-EINVAL);
···
     .proc_release = psi_fop_release,
 };
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+    return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+    return psi_open(file, psi_irq_show);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+                 size_t nbytes, loff_t *ppos)
+{
+    return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+    .proc_open = psi_irq_open,
+    .proc_read = seq_read,
+    .proc_lseek = seq_lseek,
+    .proc_write = psi_irq_write,
+    .proc_poll = psi_fop_poll,
+    .proc_release = psi_fop_release,
+};
+#endif
+
 static int __init psi_proc_init(void)
 {
     if (psi_enable) {
···
         proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
         proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
         proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+        proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
     }
     return 0;
 }
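On the system-wide side, the series also exposes /proc/pressure/irq (only when CONFIG_IRQ_TIME_ACCOUNTING is set), which psi_show() prints as a single "full" line in the usual PSI format, with total given in microseconds. A minimal userspace reader might look like the sketch below; it is illustrative only and assumes the standard PSI line layout.

/* Sketch: read the new system-wide IRQ pressure file. */
#include <stdio.h>

int main(void)
{
    double avg10, avg60, avg300;
    unsigned long long total;
    FILE *f = fopen("/proc/pressure/irq", "r");

    if (!f) {
        perror("/proc/pressure/irq");
        return 1;
    }
    /* IRQ pressure reports only a "full" line, unlike io/memory/cpu. */
    if (fscanf(f, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
           &avg10, &avg60, &avg300, &total) == 4)
        printf("irq full: avg10=%.2f%% total=%llu usec\n", avg10, total);
    fclose(f);
    return 0;
}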
+6
kernel/sched/stats.h
···
 }
 
 #ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+             bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
+
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
  * memory stalls. As a result, it has to distinguish between sleeps,
···
 static inline void psi_sched_switch(struct task_struct *prev,
                     struct task_struct *next,
                     bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO