Add initial patches for CPU Controller on Control Group v2

+1260 lines across 7 files


pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.4.patch (+407)

commit e7cae741f6d645ac68fe8823ca6ef45dbbf6891b
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

    sched: Misc preps for cgroup unified hierarchy interface

    Make the following changes in preparation for the cpu controller
    interface implementation for the unified hierarchy. This patch
    doesn't cause any functional differences.

    * s/cpu_stats_show()/cpu_cfs_stats_show()/

    * s/cpu_files/cpu_legacy_files/

    * Separate out cpuacct_stats_read() from cpuacct_stats_show(). While
      at it, remove pointless cpuacct_stat_desc[] array.

    Signed-off-by: Tejun Heo <tj@kernel.org>
    Cc: Ingo Molnar <mingo@redhat.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Li Zefan <lizefan@huawei.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 732e993..77f3ddd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8512,7 +8512,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 	return ret;
 }
 
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -8552,7 +8552,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
@@ -8573,7 +8573,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.seq_show = cpu_stats_show,
+		.seq_show = cpu_cfs_stats_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8599,7 +8599,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.fork = cpu_cgroup_fork,
 	.can_attach = cpu_cgroup_can_attach,
 	.attach = cpu_cgroup_attach,
-	.legacy_cftypes = cpu_files,
+	.legacy_cftypes = cpu_legacy_files,
 	.early_init = 1,
 };
 
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dd7cbb5..42b2dd5 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -177,36 +177,33 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
 	return 0;
 }
 
-static const char * const cpuacct_stat_desc[] = {
-	[CPUACCT_STAT_USER] = "user",
-	[CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct seq_file *sf, void *v)
+static void cpuacct_stats_read(struct cpuacct *ca, u64 *userp, u64 *sysp)
 {
-	struct cpuacct *ca = css_ca(seq_css(sf));
 	int cpu;
-	s64 val = 0;
 
+	*userp = 0;
 	for_each_online_cpu(cpu) {
 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_USER];
-		val += kcpustat->cpustat[CPUTIME_NICE];
+		*userp += kcpustat->cpustat[CPUTIME_USER];
+		*userp += kcpustat->cpustat[CPUTIME_NICE];
 	}
-	val = cputime64_to_clock_t(val);
-	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
 
-	val = 0;
+	*sysp = 0;
 	for_each_online_cpu(cpu) {
 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_SYSTEM];
-		val += kcpustat->cpustat[CPUTIME_IRQ];
-		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+		*sysp += kcpustat->cpustat[CPUTIME_SYSTEM];
+		*sysp += kcpustat->cpustat[CPUTIME_IRQ];
+		*sysp += kcpustat->cpustat[CPUTIME_SOFTIRQ];
 	}
+}
 
-	val = cputime64_to_clock_t(val);
-	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
+{
+	cputime64_t user, sys;
 
+	cpuacct_stats_read(css_ca(seq_css(sf)), &user, &sys);
+	seq_printf(sf, "user %lld\n", cputime64_to_clock_t(user));
+	seq_printf(sf, "system %lld\n", cputime64_to_clock_t(sys));
 	return 0;
 }
 

commit 1bb33e8a69f089f2d3f58a0e681d4ff352e11c97
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

    sched: Implement interface for cgroup unified hierarchy

    While the cpu controller doesn't have any functional problems, there
    are a couple interface issues which can be addressed in the v2
    interface.

    * cpuacct being a separate controller. This separation is artificial
      and rather pointless as demonstrated by most use cases co-mounting
      the two controllers. It also forces certain information to be
      accounted twice.

    * Use of different time units. Writable control knobs use
      microseconds, some stat fields use nanoseconds while other cpuacct
      stat fields use centiseconds.

    * Control knobs which can't be used in the root cgroup still show up
      in the root.

    * Control knob names and semantics aren't consistent with other
      controllers.

    This patchset implements cpu controller's interface on the unified
    hierarchy which adheres to the controller file conventions described
    in Documentation/cgroups/unified-hierarchy.txt. Overall, the
    following changes are made.

    * cpuacct is implictly enabled and disabled by cpu and its information
      is reported through "cpu.stat" which now uses microseconds for all
      time durations. All time duration fields now have "_usec" appended
      to them for clarity. While this doesn't solve the double accounting
      immediately, once majority of users switch to v2, cpu can directly
      account and report the relevant stats and cpuacct can be disabled on
      the unified hierarchy.

      Note that cpuacct.usage_percpu is currently not included in
      "cpu.stat". If this information is actually called for, it can be
      added later.

    * "cpu.shares" is replaced with "cpu.weight" and operates on the
      standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
      The weight is scaled to scheduler weight so that 100 maps to 1024
      and the ratio relationship is preserved - if weight is W and its
      scaled value is S, W / 100 == S / 1024. While the mapped range is a
      bit smaller than the orignal scheduler weight range, the dead zones
      on both sides are relatively small and covers wider range than the
      nice value mappings. This file doesn't make sense in the root
      cgroup and isn't create on root.

    * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
      which contains both quota and period.

    * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
      "cpu.rt.max" which contains both runtime and period.

    v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
        CFS bandwidth stats and also using raw division for u64. Use
        CONFIG_CFS_BANDWITH and do_div() instead.

        The semantics of "cpu.rt.max" is not fully decided yet. Dropped
        for now.

    Signed-off-by: Tejun Heo <tj@kernel.org>
    Cc: Ingo Molnar <mingo@redhat.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Li Zefan <lizefan@huawei.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 77f3ddd..7aafe63 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8591,6 +8591,139 @@ static struct cftype cpu_legacy_files[] = {
 	{ } /* terminate */
 };
 
+static int cpu_stats_show(struct seq_file *sf, void *v)
+{
+	cpuacct_cpu_stats_show(sf);
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		struct task_group *tg = css_tg(seq_css(sf));
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		u64 throttled_usec;
+
+		throttled_usec = cfs_b->throttled_time;
+		do_div(throttled_usec, NSEC_PER_USEC);
+
+		seq_printf(sf, "nr_periods %d\n"
+			   "nr_throttled %d\n"
+			   "throttled_usec %llu\n",
+			   cfs_b->nr_periods, cfs_b->nr_throttled,
+			   throttled_usec);
+	}
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+	u64 weight = scale_load_down(tg->shares);
+
+	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+				struct cftype *cftype, u64 weight)
+{
+	/*
+	 * cgroup weight knobs should use the common MIN, DFL and MAX
+	 * values which are 1, 100 and 10000 respectively. While it loses
+	 * a bit of range on both ends, it maps pretty well onto the shares
+	 * value used by scheduler and the round-trip conversions preserve
+	 * the original value over the entire range.
+	 */
+	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+		return -ERANGE;
+
+	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+						  long period, long quota)
+{
+	if (quota < 0)
+		seq_puts(sf, "max");
+	else
+		seq_printf(sf, "%ld", quota);
+
+	seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+						 u64 *periodp, u64 *quotap)
+{
+	char tok[21];	/* U64_MAX */
+
+	if (!sscanf(buf, "%s %llu", tok, periodp))
+		return -EINVAL;
+
+	*periodp *= NSEC_PER_USEC;
+
+	if (sscanf(tok, "%llu", quotap))
+		*quotap *= NSEC_PER_USEC;
+	else if (!strcmp(tok, "max"))
+		*quotap = RUNTIME_INF;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+	return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off)
+{
+	struct task_group *tg = css_tg(of_css(of));
+	u64 period = tg_get_cfs_period(tg);
+	u64 quota;
+	int ret;
+
+	ret = cpu_period_quota_parse(buf, &period, &quota);
+	if (!ret)
+		ret = tg_set_cfs_bandwidth(tg, period, quota);
+	return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+	{
+		.name = "stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stats_show,
+	},
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_weight_read_u64,
+		.write_u64 = cpu_weight_write_u64,
+	},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_max_show,
+		.write = cpu_max_write,
+	},
+#endif
+	{ } /* terminate */
+};
+
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc = cpu_cgroup_css_alloc,
 	.css_free = cpu_cgroup_css_free,
@@ -8600,7 +8733,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.can_attach = cpu_cgroup_can_attach,
 	.attach = cpu_cgroup_attach,
 	.legacy_cftypes = cpu_legacy_files,
+	.dfl_cftypes = cpu_files,
 	.early_init = 1,
+#ifdef CONFIG_CGROUP_CPUACCT
+	/*
+	 * cpuacct is enabled together with cpu on the unified hierarchy
+	 * and its stats are reported through "cpu.stat".
+	 */
+	.depends_on = 1 << cpuacct_cgrp_id,
+#endif
 };
 
 #endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 42b2dd5..b4d32a6 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -224,6 +224,30 @@ static struct cftype files[] = {
 	{ } /* terminate */
 };
 
+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
+void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+	struct cgroup_subsys_state *css;
+	u64 usage, user, sys;
+
+	css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
+
+	usage = cpuusage_read(css, seq_cft(sf));
+	cpuacct_stats_read(css_ca(css), &user, &sys);
+
+	user *= TICK_NSEC;
+	sys *= TICK_NSEC;
+	do_div(usage, NSEC_PER_USEC);
+	do_div(user, NSEC_PER_USEC);
+	do_div(sys, NSEC_PER_USEC);
+
+	seq_printf(sf, "usage_usec %llu\n"
+		   "user_usec %llu\n"
+		   "system_usec %llu\n", usage, user, sys);
+
+	css_put(css);
+}
+
 /*
  * charge this task's execution time to its accounting group.
  *
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ed60562..44eace9 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -2,6 +2,7 @@
 
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);
 
 #else
 
@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *p, int index, u64 val)
 {
 }
 
+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+}
+
 #endif
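
The weight mapping this patch introduces is easy to sanity-check outside the kernel: `cpu_weight_write_u64()` scales `weight * 1024 / 100` with round-closest division, and the kernel comment claims the round trip through `cpu_weight_read_u64()` preserves the original value over the whole range. A minimal standalone sketch of that arithmetic (not kernel code; `div_round_closest` re-implements the kernel's DIV_ROUND_CLOSEST_ULL, and the constants mirror CGROUP_WEIGHT_MIN/DFL/MAX):

```c
/* Verify the cpu.weight <-> shares round trip described above:
 * weight 100 maps to shares 1024 and converting back recovers the
 * original weight for every value in 1..10000. Userspace sketch only. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define WEIGHT_MIN 1ULL
#define WEIGHT_DFL 100ULL	/* maps to the default 1024 shares */
#define WEIGHT_MAX 10000ULL

static uint64_t div_round_closest(uint64_t x, uint64_t d)
{
	return (x + d / 2) / d;
}

int main(void)
{
	for (uint64_t w = WEIGHT_MIN; w <= WEIGHT_MAX; w++) {
		uint64_t shares = div_round_closest(w * 1024, WEIGHT_DFL);
		uint64_t back = div_round_closest(shares * WEIGHT_DFL, 1024);

		assert(back == w);	/* round trip is lossless */
	}
	printf("weight 100 -> %llu shares\n",
	       (unsigned long long)div_round_closest(100 * 1024, WEIGHT_DFL));
	return 0;
}
```

The assertion holds because each weight step is stretched by a factor of 10.24 on the shares side, so a rounding error of at most half a share maps back to far less than half a weight unit.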

pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.6.patch (+407)

commit 6426c5b02d4aab620219b08a5d97ad8851b56b0d
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

    sched: Misc preps for cgroup unified hierarchy interface

    Make the following changes in preparation for the cpu controller
    interface implementation for the unified hierarchy. This patch
    doesn't cause any functional differences.

    * s/cpu_stats_show()/cpu_cfs_stats_show()/

    * s/cpu_files/cpu_legacy_files/

    * Separate out cpuacct_stats_read() from cpuacct_stats_show(). While
      at it, remove pointless cpuacct_stat_desc[] array.

    Signed-off-by: Tejun Heo <tj@kernel.org>
    Cc: Ingo Molnar <mingo@redhat.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Li Zefan <lizefan@huawei.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d1f7149..0d34f35 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8371,7 +8371,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 	return ret;
 }
 
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -8411,7 +8411,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
@@ -8432,7 +8432,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.seq_show = cpu_stats_show,
+		.seq_show = cpu_cfs_stats_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8457,7 +8457,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.fork = cpu_cgroup_fork,
 	.can_attach = cpu_cgroup_can_attach,
 	.attach = cpu_cgroup_attach,
-	.legacy_cftypes = cpu_files,
+	.legacy_cftypes = cpu_legacy_files,
 	.early_init = true,
 };
 
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 4a81120..b99030a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -180,36 +180,33 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
 	return 0;
 }
 
-static const char * const cpuacct_stat_desc[] = {
-	[CPUACCT_STAT_USER] = "user",
-	[CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct seq_file *sf, void *v)
+static void cpuacct_stats_read(struct cpuacct *ca, u64 *userp, u64 *sysp)
 {
-	struct cpuacct *ca = css_ca(seq_css(sf));
 	int cpu;
-	s64 val = 0;
 
+	*userp = 0;
 	for_each_online_cpu(cpu) {
 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_USER];
-		val += kcpustat->cpustat[CPUTIME_NICE];
+		*userp += kcpustat->cpustat[CPUTIME_USER];
+		*userp += kcpustat->cpustat[CPUTIME_NICE];
 	}
-	val = cputime64_to_clock_t(val);
-	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
 
-	val = 0;
+	*sysp = 0;
 	for_each_online_cpu(cpu) {
 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_SYSTEM];
-		val += kcpustat->cpustat[CPUTIME_IRQ];
-		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+		*sysp += kcpustat->cpustat[CPUTIME_SYSTEM];
+		*sysp += kcpustat->cpustat[CPUTIME_IRQ];
+		*sysp += kcpustat->cpustat[CPUTIME_SOFTIRQ];
 	}
+}
 
-	val = cputime64_to_clock_t(val);
-	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
+{
+	cputime64_t user, sys;
 
+	cpuacct_stats_read(css_ca(seq_css(sf)), &user, &sys);
+	seq_printf(sf, "user %lld\n", cputime64_to_clock_t(user));
+	seq_printf(sf, "system %lld\n", cputime64_to_clock_t(sys));
 	return 0;
 }
 

commit d2a799f795a5d5a69c9dc365c34f926e0649f840
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

    sched: Implement interface for cgroup unified hierarchy

    While the cpu controller doesn't have any functional problems, there
    are a couple interface issues which can be addressed in the v2
    interface.

    * cpuacct being a separate controller. This separation is artificial
      and rather pointless as demonstrated by most use cases co-mounting
      the two controllers. It also forces certain information to be
      accounted twice.

    * Use of different time units. Writable control knobs use
      microseconds, some stat fields use nanoseconds while other cpuacct
      stat fields use centiseconds.

    * Control knobs which can't be used in the root cgroup still show up
      in the root.

    * Control knob names and semantics aren't consistent with other
      controllers.

    This patchset implements cpu controller's interface on the unified
    hierarchy which adheres to the controller file conventions described
    in Documentation/cgroups/unified-hierarchy.txt. Overall, the
    following changes are made.

    * cpuacct is implictly enabled and disabled by cpu and its information
      is reported through "cpu.stat" which now uses microseconds for all
      time durations. All time duration fields now have "_usec" appended
      to them for clarity. While this doesn't solve the double accounting
      immediately, once majority of users switch to v2, cpu can directly
      account and report the relevant stats and cpuacct can be disabled on
      the unified hierarchy.

      Note that cpuacct.usage_percpu is currently not included in
      "cpu.stat". If this information is actually called for, it can be
      added later.

    * "cpu.shares" is replaced with "cpu.weight" and operates on the
      standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
      The weight is scaled to scheduler weight so that 100 maps to 1024
      and the ratio relationship is preserved - if weight is W and its
      scaled value is S, W / 100 == S / 1024. While the mapped range is a
      bit smaller than the orignal scheduler weight range, the dead zones
      on both sides are relatively small and covers wider range than the
      nice value mappings. This file doesn't make sense in the root
      cgroup and isn't create on root.

    * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
      which contains both quota and period.

    * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
      "cpu.rt.max" which contains both runtime and period.

    v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
        CFS bandwidth stats and also using raw division for u64. Use
        CONFIG_CFS_BANDWITH and do_div() instead.

        The semantics of "cpu.rt.max" is not fully decided yet. Dropped
        for now.

    Signed-off-by: Tejun Heo <tj@kernel.org>
    Cc: Ingo Molnar <mingo@redhat.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Li Zefan <lizefan@huawei.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d34f35..5990efc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8450,6 +8450,139 @@ static struct cftype cpu_legacy_files[] = {
 	{ } /* terminate */
 };
 
+static int cpu_stats_show(struct seq_file *sf, void *v)
+{
+	cpuacct_cpu_stats_show(sf);
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		struct task_group *tg = css_tg(seq_css(sf));
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		u64 throttled_usec;
+
+		throttled_usec = cfs_b->throttled_time;
+		do_div(throttled_usec, NSEC_PER_USEC);
+
+		seq_printf(sf, "nr_periods %d\n"
+			   "nr_throttled %d\n"
+			   "throttled_usec %llu\n",
+			   cfs_b->nr_periods, cfs_b->nr_throttled,
+			   throttled_usec);
+	}
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+	u64 weight = scale_load_down(tg->shares);
+
+	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+				struct cftype *cftype, u64 weight)
+{
+	/*
+	 * cgroup weight knobs should use the common MIN, DFL and MAX
+	 * values which are 1, 100 and 10000 respectively. While it loses
+	 * a bit of range on both ends, it maps pretty well onto the shares
+	 * value used by scheduler and the round-trip conversions preserve
+	 * the original value over the entire range.
+	 */
+	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+		return -ERANGE;
+
+	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+						  long period, long quota)
+{
+	if (quota < 0)
+		seq_puts(sf, "max");
+	else
+		seq_printf(sf, "%ld", quota);
+
+	seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+						 u64 *periodp, u64 *quotap)
+{
+	char tok[21];	/* U64_MAX */
+
+	if (!sscanf(buf, "%s %llu", tok, periodp))
+		return -EINVAL;
+
+	*periodp *= NSEC_PER_USEC;
+
+	if (sscanf(tok, "%llu", quotap))
+		*quotap *= NSEC_PER_USEC;
+	else if (!strcmp(tok, "max"))
+		*quotap = RUNTIME_INF;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+	return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off)
+{
+	struct task_group *tg = css_tg(of_css(of));
+	u64 period = tg_get_cfs_period(tg);
+	u64 quota;
+	int ret;
+
+	ret = cpu_period_quota_parse(buf, &period, &quota);
+	if (!ret)
+		ret = tg_set_cfs_bandwidth(tg, period, quota);
+	return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+	{
+		.name = "stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stats_show,
+	},
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_weight_read_u64,
+		.write_u64 = cpu_weight_write_u64,
+	},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_max_show,
+		.write = cpu_max_write,
+	},
+#endif
+	{ } /* terminate */
+};
+
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc = cpu_cgroup_css_alloc,
 	.css_released = cpu_cgroup_css_released,
@@ -8458,7 +8591,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.can_attach = cpu_cgroup_can_attach,
 	.attach = cpu_cgroup_attach,
 	.legacy_cftypes = cpu_legacy_files,
+	.dfl_cftypes = cpu_files,
 	.early_init = true,
+#ifdef CONFIG_CGROUP_CPUACCT
+	/*
+	 * cpuacct is enabled together with cpu on the unified hierarchy
+	 * and its stats are reported through "cpu.stat".
+	 */
+	.depends_on = 1 << cpuacct_cgrp_id,
+#endif
 };
 
 #endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index b99030a..a1a5a4b 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -227,6 +227,30 @@ static struct cftype files[] = {
 	{ } /* terminate */
 };
 
+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
+void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+	struct cgroup_subsys_state *css;
+	u64 usage, user, sys;
+
+	css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
+
+	usage = cpuusage_read(css, seq_cft(sf));
+	cpuacct_stats_read(css_ca(css), &user, &sys);
+
+	user *= TICK_NSEC;
+	sys *= TICK_NSEC;
+	do_div(usage, NSEC_PER_USEC);
+	do_div(user, NSEC_PER_USEC);
+	do_div(sys, NSEC_PER_USEC);
+
+	seq_printf(sf, "usage_usec %llu\n"
+		   "user_usec %llu\n"
+		   "system_usec %llu\n", usage, user, sys);
+
+	css_put(css);
+}
+
 /*
  * charge this task's execution time to its accounting group.
  *
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ba72807..ddf7af4 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -2,6 +2,7 @@
 
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);
 
 #else
 
@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
 }
 
+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+}
+
 #endif
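
The "cpu.max" grammar accepted by `cpu_period_quota_parse()` above is "$QUOTA $PERIOD", where $QUOTA is either a microsecond count or the literal `max`, and the period may be omitted to keep the current value. A simplified userspace re-expression of that grammar (a sketch, not the kernel code; `QUOTA_INF` stands in for the kernel's RUNTIME_INF):

```c
/* Parse "cpu.max"-style input: "<quota_us> [period_us]" or "max [period_us]".
 * The caller preloads *periodp with the current period in ns, mirroring the
 * convention of cpu_period_quota_parse() in the patch. Userspace sketch. */
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

#define NSEC_PER_USEC 1000ULL
#define QUOTA_INF (~0ULL)	/* stand-in for RUNTIME_INF */

static int parse_cpu_max(const char *buf, uint64_t *periodp, uint64_t *quotap)
{
	char tok[21];	/* enough for U64_MAX */
	uint64_t period_us;

	if (sscanf(buf, "%20s %" SCNu64, tok, &period_us) >= 2)
		*periodp = period_us * NSEC_PER_USEC;	/* period given */
	else if (sscanf(buf, "%20s", tok) != 1)
		return -1;				/* empty input */

	if (sscanf(tok, "%" SCNu64, quotap) == 1)
		*quotap *= NSEC_PER_USEC;		/* finite quota */
	else if (!strcmp(tok, "max"))
		*quotap = QUOTA_INF;			/* no limit */
	else
		return -1;

	return 0;
}

int main(void)
{
	uint64_t period = 100000 * NSEC_PER_USEC;	/* current period */
	uint64_t quota;

	if (!parse_cpu_max("max 250000", &period, &quota))
		printf("period=%" PRIu64 "ns quota=%s\n", period,
		       quota == QUOTA_INF ? "inf" : "finite");

	if (!parse_cpu_max("50000", &period, &quota))
		printf("period=%" PRIu64 "ns quota=%" PRIu64 "ns\n",
		       period, quota);
	return 0;
}
```

Writing `50000 100000` to cpu.max therefore means 50ms of CPU per 100ms period, and `max` alone lifts the quota while keeping the existing period.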

pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.7.patch (+407)

commit 0d966df508ef4d6c0b1baae9e369f4fb0d3e10af
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

    sched: Misc preps for cgroup unified hierarchy interface

    Make the following changes in preparation for the cpu controller
    interface implementation for the unified hierarchy. This patch
    doesn't cause any functional differences.

    * s/cpu_stats_show()/cpu_cfs_stats_show()/

    * s/cpu_files/cpu_legacy_files/

    * Separate out cpuacct_stats_read() from cpuacct_stats_show(). While
      at it, remove pointless cpuacct_stat_desc[] array.

    Signed-off-by: Tejun Heo <tj@kernel.org>
    Cc: Ingo Molnar <mingo@redhat.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Li Zefan <lizefan@huawei.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 97ee9ac..c148dfe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8482,7 +8482,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 	return ret;
 }
 
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -8522,7 +8522,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
@@ -8543,7 +8543,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.seq_show = cpu_stats_show,
+		.seq_show = cpu_cfs_stats_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8568,7 +8568,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.fork = cpu_cgroup_fork,
 	.can_attach = cpu_cgroup_can_attach,
 	.attach = cpu_cgroup_attach,
-	.legacy_cftypes = cpu_files,
+	.legacy_cftypes = cpu_legacy_files,
 	.early_init = true,
 };
 
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 41f85c4..3eb9eda 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -242,36 +242,33 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
 	return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE);
 }
 
-static const char * const cpuacct_stat_desc[] = {
-	[CPUACCT_STAT_USER] = "user",
-	[CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct seq_file *sf, void *v)
+static void cpuacct_stats_read(struct cpuacct *ca, u64 *userp, u64 *sysp)
 {
-	struct cpuacct *ca = css_ca(seq_css(sf));
 	int cpu;
-	s64 val = 0;
 
+	*userp = 0;
 	for_each_possible_cpu(cpu) {
 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_USER];
-		val += kcpustat->cpustat[CPUTIME_NICE];
+		*userp += kcpustat->cpustat[CPUTIME_USER];
+		*userp += kcpustat->cpustat[CPUTIME_NICE];
 	}
-	val = cputime64_to_clock_t(val);
-	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
 
-	val = 0;
+	*sysp = 0;
 	for_each_possible_cpu(cpu) {
 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_SYSTEM];
-		val += kcpustat->cpustat[CPUTIME_IRQ];
-		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+		*sysp += kcpustat->cpustat[CPUTIME_SYSTEM];
+		*sysp += kcpustat->cpustat[CPUTIME_IRQ];
+		*sysp += kcpustat->cpustat[CPUTIME_SOFTIRQ];
 	}
+}
 
-	val = cputime64_to_clock_t(val);
-	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
+{
+	cputime64_t user, sys;
 
+	cpuacct_stats_read(css_ca(seq_css(sf)), &user, &sys);
+	seq_printf(sf, "user %lld\n", cputime64_to_clock_t(user));
+	seq_printf(sf, "system %lld\n", cputime64_to_clock_t(sys));
 	return 0;
 }
 

commit ed6d93036ec930cb774da10b7c87f67905ce71f1
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

    sched: Implement interface for cgroup unified hierarchy

    While the cpu controller doesn't have any functional problems, there
    are a couple interface issues which can be addressed in the v2
    interface.

    * cpuacct being a separate controller. This separation is artificial
      and rather pointless as demonstrated by most use cases co-mounting
      the two controllers. It also forces certain information to be
      accounted twice.

    * Use of different time units. Writable control knobs use
      microseconds, some stat fields use nanoseconds while other cpuacct
      stat fields use centiseconds.

    * Control knobs which can't be used in the root cgroup still show up
      in the root.

    * Control knob names and semantics aren't consistent with other
      controllers.

    This patchset implements cpu controller's interface on the unified
    hierarchy which adheres to the controller file conventions described
    in Documentation/cgroups/unified-hierarchy.txt. Overall, the
    following changes are made.

    * cpuacct is implictly enabled and disabled by cpu and its information
      is reported through "cpu.stat" which now uses microseconds for all
      time durations. All time duration fields now have "_usec" appended
      to them for clarity. While this doesn't solve the double accounting
      immediately, once majority of users switch to v2, cpu can directly
      account and report the relevant stats and cpuacct can be disabled on
      the unified hierarchy.

      Note that cpuacct.usage_percpu is currently not included in
      "cpu.stat". If this information is actually called for, it can be
      added later.

    * "cpu.shares" is replaced with "cpu.weight" and operates on the
      standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
      The weight is scaled to scheduler weight so that 100 maps to 1024
      and the ratio relationship is preserved - if weight is W and its
      scaled value is S, W / 100 == S / 1024. While the mapped range is a
      bit smaller than the orignal scheduler weight range, the dead zones
      on both sides are relatively small and covers wider range than the
      nice value mappings. This file doesn't make sense in the root
      cgroup and isn't create on root.

    * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
      which contains both quota and period.

    * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
      "cpu.rt.max" which contains both runtime and period.

    v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
        CFS bandwidth stats and also using raw division for u64. Use
        CONFIG_CFS_BANDWITH and do_div() instead.

        The semantics of "cpu.rt.max" is not fully decided yet. Dropped
        for now.

    Signed-off-by: Tejun Heo <tj@kernel.org>
    Cc: Ingo Molnar <mingo@redhat.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Li Zefan <lizefan@huawei.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c148dfe..7bba2c5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8561,6 +8561,139 @@ static struct cftype cpu_legacy_files[] = {
 	{ } /* terminate */
 };
 
+static int cpu_stats_show(struct seq_file *sf, void *v)
+{
+	cpuacct_cpu_stats_show(sf);
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		struct task_group *tg = css_tg(seq_css(sf));
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		u64 throttled_usec;
+
+		throttled_usec = cfs_b->throttled_time;
+		do_div(throttled_usec, NSEC_PER_USEC);
+
+		seq_printf(sf, "nr_periods %d\n"
+			   "nr_throttled %d\n"
+			   "throttled_usec %llu\n",
+			   cfs_b->nr_periods, cfs_b->nr_throttled,
+			   throttled_usec);
+	}
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+	u64 weight = scale_load_down(tg->shares);
+
+	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+				struct cftype *cftype, u64 weight)
+{
+	/*
+	 * cgroup weight knobs should use the common MIN, DFL and MAX
+	 * values which are 1, 100 and 10000 respectively. While it loses
+	 * a bit of range on both ends, it maps pretty well onto the shares
+	 * value used by scheduler and the round-trip conversions preserve
+	 * the original value over the entire range.
+	 */
+	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+		return -ERANGE;
+
+	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+						  long period, long quota)
+{
+	if (quota < 0)
+		seq_puts(sf, "max");
+	else
+		seq_printf(sf, "%ld", quota);
+
+	seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+						 u64 *periodp, u64 *quotap)
+{
+	char tok[21];	/* U64_MAX */
+
+	if (!sscanf(buf, "%s %llu", tok, periodp))
+		return -EINVAL;
+
+	*periodp *= NSEC_PER_USEC;
+
+	if (sscanf(tok, "%llu", quotap))
+		*quotap *= NSEC_PER_USEC;
+	else if (!strcmp(tok, "max"))
+		*quotap = RUNTIME_INF;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+	return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off)
+{
+	struct task_group *tg = css_tg(of_css(of));
+	u64 period = tg_get_cfs_period(tg);
+	u64 quota;
+	int ret;
+
+	ret = cpu_period_quota_parse(buf, &period, &quota);
+	if (!ret)
+		ret = tg_set_cfs_bandwidth(tg, period, quota);
+	return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+	{
+		.name = "stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stats_show,
+	},
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_weight_read_u64,
+		.write_u64 = cpu_weight_write_u64,
+	},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_max_show,
+		.write = cpu_max_write,
+	},
+#endif
+	{ } /* terminate */
+};
+
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc = cpu_cgroup_css_alloc,
 	.css_released = cpu_cgroup_css_released,
@@ -8569,7 +8702,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.can_attach = cpu_cgroup_can_attach,
 	.attach = cpu_cgroup_attach,
 	.legacy_cftypes = cpu_legacy_files,
+	.dfl_cftypes = cpu_files,
 	.early_init = true,
+#ifdef CONFIG_CGROUP_CPUACCT
+	/*
+	 * cpuacct is enabled together with cpu on the unified hierarchy
+	 * and its stats are reported through "cpu.stat".
+	 */
+	.depends_on = 1 << cpuacct_cgrp_id,
+#endif
 };
 
 #endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 3eb9eda..7a02d26 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -305,6 +305,30 @@ static struct cftype files[] = {
 	{ } /* terminate */
 };
 
+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
+void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+	struct cgroup_subsys_state *css;
+	u64 usage, user, sys;
+
+	css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
+
+	usage = cpuusage_read(css, seq_cft(sf));
+	cpuacct_stats_read(css_ca(css), &user, &sys);
+
+	user *= TICK_NSEC;
+	sys *= TICK_NSEC;
+	do_div(usage, NSEC_PER_USEC);
+	do_div(user, NSEC_PER_USEC);
+	do_div(sys, NSEC_PER_USEC);
+
+	seq_printf(sf, "usage_usec %llu\n"
+		   "user_usec %llu\n"
+		   "system_usec %llu\n", usage, user, sys);
+
+	css_put(css);
+}
+
 /*
  * charge this task's execution time to its accounting group.
  *
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ba72807..ddf7af4 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -2,6 +2,7 @@
 
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);
 
 #else
 
@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
 }
 
+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+}
+
 #endif
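
Once one of these patched kernels is booted with cgroup2 mounted, the new knobs are plain files under the cgroup directory. A small sketch of exercising them from C, assuming cgroup2 is mounted at /sys/fs/cgroup and a hypothetical group "demo" has already been created with the cpu controller enabled:

```c
/* Drive the v2 cpu interface from userspace: set cpu.weight and cpu.max,
 * then dump cpu.stat. Paths and the "demo" group are illustrative only. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char buf[256];
	FILE *f;

	/* double the default weight; maps to 2048 scheduler shares */
	if (write_file("/sys/fs/cgroup/demo/cpu.weight", "200") < 0)
		perror("cpu.weight");

	/* 50ms of CPU time per 100ms period */
	if (write_file("/sys/fs/cgroup/demo/cpu.max", "50000 100000") < 0)
		perror("cpu.max");

	/* cpu.stat reports usage_usec/user_usec/system_usec and, with
	 * CONFIG_CFS_BANDWIDTH, nr_periods/nr_throttled/throttled_usec */
	f = fopen("/sys/fs/cgroup/demo/cpu.stat", "r");
	if (f) {
		while (fgets(buf, sizeof(buf), f))
			fputs(buf, stdout);
		fclose(f);
	}
	return 0;
}
```

Error handling is deliberately minimal; per cpu_weight_write_u64() above, writes outside the 1..10000 weight range fail with ERANGE.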

pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/README.md (+21)

Patches for CPU Controller on Control Group v2
==============================================

See Tejun Heo's [explanation][1] for why these patches are currently
out-of-tree.

Generating the patches
----------------------

In a Linux kernel checkout, with a remote `tc-cgroup` pointing to
git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git and your
nixpkgs checkout in the same directory as the kernel checkout (or
adjust the output path accordingly), set `ver` to the appropriate
version and run:

```shell
$ ver=4.7
$ git log --reverse --patch v$ver..remotes/tc-cgroup/cgroup-v2-cpu-v$ver > ../nixpkgs/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/$ver.patch
```

[1]: https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/tree/Documentation/cgroup-v2-cpu.txt?h=cgroup-v2-cpu

pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/default.nix (+11)

# Build an attrset mapping each "<version>.patch" file in this directory to a
# kernelPatches-style entry, e.g. "4.7" -> { name = "cpu-cgroup-v2-4.7.patch";
# patch = ./4.7.patch; }. Non-patch files (README.md, this file) are skipped.
let
  ents = builtins.readDir ./.;
in builtins.listToAttrs (builtins.filter (x: x != null) (map (name: let
  match = builtins.match "(.*)\\.patch" name;
in if match == null then null else {
  # the attribute name is the kernel version, i.e. the filename minus ".patch"
  name = builtins.head match;
  value = {
    name = "cpu-cgroup-v2-${name}";
    patch = ./. + "/${name}";
  };
}) (builtins.attrNames ents)))

pkgs/os-specific/linux/kernel/patches.nix (+2)

@@ -144,4 +144,6 @@
       sha256 = "14rm1qr87p7a5prz8g5fwbpxzdp3ighj095x8rvhm8csm20wspyy";
     };
   };
+
+  cpu-cgroup-v2 = import ./cpu-cgroup-v2-patches;
 }

pkgs/top-level/all-packages.nix (+5)

@@ -11165,6 +11165,7 @@
   linux_4_4 = callPackage ../os-specific/linux/kernel/linux-4.4.nix {
     kernelPatches =
       [ kernelPatches.bridge_stp_helper
+        kernelPatches.cpu-cgroup-v2."4.4"
       ]
       ++ lib.optionals ((platform.kernelArch or null) == "mips")
       [ kernelPatches.mips_fpureg_emu
@@ -11176,6 +11177,7 @@
   linux_4_6 = callPackage ../os-specific/linux/kernel/linux-4.6.nix {
     kernelPatches =
       [ kernelPatches.bridge_stp_helper
+        kernelPatches.cpu-cgroup-v2."4.6"
       ]
       ++ lib.optionals ((platform.kernelArch or null) == "mips")
       [ kernelPatches.mips_fpureg_emu
@@ -11187,6 +11189,9 @@
   linux_4_7 = callPackage ../os-specific/linux/kernel/linux-4.7.nix {
     kernelPatches =
       [ kernelPatches.bridge_stp_helper
+        # See pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/README.md
+        # when adding a new linux version
+        kernelPatches.cpu-cgroup-v2."4.7"
       ]
       ++ lib.optionals ((platform.kernelArch or null) == "mips")
       [ kernelPatches.mips_fpureg_emu