Merge branch 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

+17 -25

Documentation/cgroup-v2.txt

··· 893 893 CPU 894 894 --- 895 895 896 - .. note:: 897 - 898 - The interface for the cpu controller hasn't been merged yet 899 - 900 896 The "cpu" controllers regulates distribution of CPU cycles. This 901 897 controller implements weight and absolute bandwidth limit models for 902 898 normal scheduling policy and absolute bandwidth allocation model for ··· 906 910 907 911 cpu.stat 908 912 A read-only flat-keyed file which exists on non-root cgroups. 913 + This file exists whether the controller is enabled or not. 909 914 910 - It reports the following six stats: 915 + It always reports the following three stats: 911 916 912 917 - usage_usec 913 918 - user_usec 914 919 - system_usec 920 + 921 + and the following three when the controller is enabled: 922 + 915 923 - nr_periods 916 924 - nr_throttled 917 925 - throttled_usec ··· 925 925 cgroups. The default is "100". 926 926 927 927 The weight in the range [1, 10000]. 928 + 929 + cpu.weight.nice 930 + A read-write single value file which exists on non-root 931 + cgroups. The default is "0". 932 + 933 + The nice value is in the range [-20, 19]. 934 + 935 + This interface file is an alternative interface for 936 + "cpu.weight" and allows reading and setting weight using the 937 + same values used by nice(2). Because the range is smaller and 938 + granularity is coarser for the nice values, the read value is 939 + the closest approximation of the current weight. 928 940 929 941 cpu.max 930 942 A read-write two value file which exists on non-root cgroups. ··· 949 937 which indicates that the group may consume upto $MAX in each 950 938 $PERIOD duration. "max" for $MAX indicates no limit. If only 951 939 one number is written, $MAX is updated. 952 - 953 - cpu.rt.max 954 - .. note:: 955 - 956 - The semantics of this file is still under discussion and the 957 - interface hasn't been merged yet 958 - 959 - A read-write two value file which exists on all cgroups. 960 - The default is "0 100000". 
961 - 962 - The maximum realtime runtime allocation. Over-committing 963 - configurations are disallowed and process migrations are 964 - rejected if not enough bandwidth is available. It's in the 965 - following format:: 966 - 967 - $MAX $PERIOD 968 - 969 - which indicates that the group may consume upto $MAX in each 970 - $PERIOD duration. If only one number is written, $MAX is 971 - updated. 972 940 973 941 974 942 Memory

+1 -1

MAINTAINERS

··· 3592 3592 S: Maintained 3593 3593 F: Documentation/cgroup-v1/cpusets.txt 3594 3594 F: include/linux/cpuset.h 3595 - F: kernel/cpuset.c 3595 + F: kernel/cgroup/cpuset.c 3596 3596 3597 3597 CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) 3598 3598 M: Johannes Weiner <hannes@cmpxchg.org>

+59

include/linux/cgroup-defs.h

··· 17 17 #include <linux/refcount.h> 18 18 #include <linux/percpu-refcount.h> 19 19 #include <linux/percpu-rwsem.h> 20 + #include <linux/u64_stats_sync.h> 20 21 #include <linux/workqueue.h> 21 22 #include <linux/bpf-cgroup.h> 22 23 ··· 256 255 struct rcu_head rcu_head; 257 256 }; 258 257 258 + /* 259 + * cgroup basic resource usage statistics. Accounting is done per-cpu in 260 + * cgroup_cpu_stat which is then lazily propagated up the hierarchy on 261 + * reads. 262 + * 263 + * When a stat gets updated, the cgroup_cpu_stat and its ancestors are 264 + * linked into the updated tree. On the following read, propagation only 265 + * considers and consumes the updated tree. This makes reading O(the 266 + * number of descendants which have been active since last read) instead of 267 + * O(the total number of descendants). 268 + * 269 + * This is important because there can be a lot of (draining) cgroups which 270 + * aren't active and stat may be read frequently. The combination can 271 + * become very expensive. By propagating selectively, increasing reading 272 + * frequency decreases the cost of each read. 273 + */ 274 + struct cgroup_cpu_stat { 275 + /* 276 + * ->sync protects all the current counters. These are the only 277 + * fields which get updated in the hot path. 278 + */ 279 + struct u64_stats_sync sync; 280 + struct task_cputime cputime; 281 + 282 + /* 283 + * Snapshots at the last reading. These are used to calculate the 284 + * deltas to propagate to the global counters. 285 + */ 286 + struct task_cputime last_cputime; 287 + 288 + /* 289 + * Child cgroups with stat updates on this cpu since the last read 290 + * are linked on the parent's ->updated_children through 291 + * ->updated_next. 292 + * 293 + * In addition to being more compact, singly-linked list pointing 294 + * to the cgroup makes it unnecessary for each per-cpu struct to 295 + * point back to the associated cgroup. 296 + * 297 + * Protected by per-cpu cgroup_cpu_stat_lock. 
298 + */ 299 + struct cgroup *updated_children; /* terminated by self cgroup */ 300 + struct cgroup *updated_next; /* NULL iff not on the list */ 301 + }; 302 + 303 + struct cgroup_stat { 304 + /* per-cpu statistics are collected into the following global counters */ 305 + struct task_cputime cputime; 306 + struct prev_cputime prev_cputime; 307 + }; 308 + 259 309 struct cgroup { 260 310 /* self css with NULL ->ss, points back to this cgroup */ 261 311 struct cgroup_subsys_state self; ··· 405 353 * specific task are charged to the dom_cgrp. 406 354 */ 407 355 struct cgroup *dom_cgrp; 356 + 357 + /* cgroup basic resource statistics */ 358 + struct cgroup_cpu_stat __percpu *cpu_stat; 359 + struct cgroup_stat pending_stat; /* pending from children */ 360 + struct cgroup_stat stat; 408 361 409 362 /* 410 363 * list of pidlists, up to two for each namespace (one for procs, one ··· 570 513 void (*css_released)(struct cgroup_subsys_state *css); 571 514 void (*css_free)(struct cgroup_subsys_state *css); 572 515 void (*css_reset)(struct cgroup_subsys_state *css); 516 + int (*css_extra_stat_show)(struct seq_file *seq, 517 + struct cgroup_subsys_state *css); 573 518 574 519 int (*can_attach)(struct cgroup_taskset *tset); 575 520 void (*cancel_attach)(struct cgroup_taskset *tset);

+58

include/linux/cgroup.h

··· 23 23 #include <linux/nsproxy.h> 24 24 #include <linux/user_namespace.h> 25 25 #include <linux/refcount.h> 26 + #include <linux/kernel_stat.h> 26 27 27 28 #include <linux/cgroup-defs.h> 28 29 ··· 689 688 static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, 690 689 char *buf, size_t buflen) {} 691 690 #endif /* !CONFIG_CGROUPS */ 691 + 692 + /* 693 + * Basic resource stats. 694 + */ 695 + #ifdef CONFIG_CGROUPS 696 + 697 + #ifdef CONFIG_CGROUP_CPUACCT 698 + void cpuacct_charge(struct task_struct *tsk, u64 cputime); 699 + void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); 700 + #else 701 + static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 702 + static inline void cpuacct_account_field(struct task_struct *tsk, int index, 703 + u64 val) {} 704 + #endif 705 + 706 + void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); 707 + void __cgroup_account_cputime_field(struct cgroup *cgrp, 708 + enum cpu_usage_stat index, u64 delta_exec); 709 + 710 + static inline void cgroup_account_cputime(struct task_struct *task, 711 + u64 delta_exec) 712 + { 713 + struct cgroup *cgrp; 714 + 715 + cpuacct_charge(task, delta_exec); 716 + 717 + rcu_read_lock(); 718 + cgrp = task_dfl_cgroup(task); 719 + if (cgroup_parent(cgrp)) 720 + __cgroup_account_cputime(cgrp, delta_exec); 721 + rcu_read_unlock(); 722 + } 723 + 724 + static inline void cgroup_account_cputime_field(struct task_struct *task, 725 + enum cpu_usage_stat index, 726 + u64 delta_exec) 727 + { 728 + struct cgroup *cgrp; 729 + 730 + cpuacct_account_field(task, index, delta_exec); 731 + 732 + rcu_read_lock(); 733 + cgrp = task_dfl_cgroup(task); 734 + if (cgroup_parent(cgrp)) 735 + __cgroup_account_cputime_field(cgrp, index, delta_exec); 736 + rcu_read_unlock(); 737 + } 738 + 739 + #else /* CONFIG_CGROUPS */ 740 + 741 + static inline void cgroup_account_cputime(struct task_struct *task, 742 + u64 delta_exec) {} 743 + static inline void 
cgroup_account_cputime_field(struct task_struct *task, 744 + enum cpu_usage_stat index, 745 + u64 delta_exec) {} 746 + 747 + #endif /* CONFIG_CGROUPS */ 692 748 693 749 /* 694 750 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data

+2 -1

include/linux/sched/cputime.h

··· 54 54 55 55 extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); 56 56 extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); 57 - 57 + extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, 58 + u64 *ut, u64 *st); 58 59 59 60 /* 60 61 * Thread group CPU time accounting.

+1 -1

kernel/cgroup/Makefile

··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 - obj-y := cgroup.o namespace.o cgroup-v1.o 2 + obj-y := cgroup.o stat.o namespace.o cgroup-v1.o 3 3 4 4 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o 5 5 obj-$(CONFIG_CGROUP_PIDS) += pids.o

+9

kernel/cgroup/cgroup-internal.h

··· 201 201 int cgroup_task_count(const struct cgroup *cgrp); 202 202 203 203 /* 204 + * stat.c 205 + */ 206 + void cgroup_stat_flush(struct cgroup *cgrp); 207 + int cgroup_stat_init(struct cgroup *cgrp); 208 + void cgroup_stat_exit(struct cgroup *cgrp); 209 + void cgroup_stat_show_cputime(struct seq_file *seq); 210 + void cgroup_stat_boot(void); 211 + 212 + /* 204 213 * namespace.c 205 214 */ 206 215 extern const struct proc_ns_operations cgroupns_operations;

+155 -2

kernel/cgroup/cgroup.c

··· 142 142 }; 143 143 #undef SUBSYS 144 144 145 + static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); 146 + 145 147 /* 146 148 * The default hierarchy, reserved for the subsystems that are otherwise 147 149 * unattached - it never has more than a single cgroup, and all tasks are 148 150 * part of that cgroup. 149 151 */ 150 - struct cgroup_root cgrp_dfl_root; 152 + struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; 151 153 EXPORT_SYMBOL_GPL(cgrp_dfl_root); 152 154 153 155 /* ··· 464 462 } 465 463 466 464 /** 465 + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem 466 + * @cgrp: the cgroup of interest 467 + * @ss: the subsystem of interest 468 + * 469 + * Find and get @cgrp's css associated with @ss. If the css doesn't exist 470 + * or is offline, %NULL is returned. 471 + */ 472 + static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, 473 + struct cgroup_subsys *ss) 474 + { 475 + struct cgroup_subsys_state *css; 476 + 477 + rcu_read_lock(); 478 + css = cgroup_css(cgrp, ss); 479 + if (!css || !css_tryget_online(css)) 480 + css = NULL; 481 + rcu_read_unlock(); 482 + 483 + return css; 484 + } 485 + 486 + /** 467 487 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem 468 488 * @cgrp: the cgroup of interest 469 489 * @ss: the subsystem of interest (%NULL returns @cgrp->self) ··· 671 647 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 672 648 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), 673 649 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), 650 + 651 + /* 652 + * The following field is re-initialized when this cset gets linked 653 + * in cgroup_init(). However, let's initialize the field 654 + * statically too so that the default cgroup can be accessed safely 655 + * early during boot. 
656 + */ 657 + .dfl_cgrp = &cgrp_dfl_root.cgrp, 674 658 }; 675 659 676 660 static int css_set_count = 1; /* 1 for init_css_set */ ··· 3347 3315 return 0; 3348 3316 } 3349 3317 3318 + static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, 3319 + struct cgroup *cgrp, int ssid) 3320 + { 3321 + struct cgroup_subsys *ss = cgroup_subsys[ssid]; 3322 + struct cgroup_subsys_state *css; 3323 + int ret; 3324 + 3325 + if (!ss->css_extra_stat_show) 3326 + return 0; 3327 + 3328 + css = cgroup_tryget_css(cgrp, ss); 3329 + if (!css) 3330 + return 0; 3331 + 3332 + ret = ss->css_extra_stat_show(seq, css); 3333 + css_put(css); 3334 + return ret; 3335 + } 3336 + 3337 + static int cpu_stat_show(struct seq_file *seq, void *v) 3338 + { 3339 + struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; 3340 + int ret = 0; 3341 + 3342 + cgroup_stat_show_cputime(seq); 3343 + #ifdef CONFIG_CGROUP_SCHED 3344 + ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); 3345 + #endif 3346 + return ret; 3347 + } 3348 + 3350 3349 static int cgroup_file_open(struct kernfs_open_file *of) 3351 3350 { 3352 3351 struct cftype *cft = of->kn->priv; ··· 4485 4422 .name = "cgroup.stat", 4486 4423 .seq_show = cgroup_stat_show, 4487 4424 }, 4425 + { 4426 + .name = "cpu.stat", 4427 + .flags = CFTYPE_NOT_ON_ROOT, 4428 + .seq_show = cpu_stat_show, 4429 + }, 4488 4430 { } /* terminate */ 4489 4431 }; 4490 4432 ··· 4550 4482 */ 4551 4483 cgroup_put(cgroup_parent(cgrp)); 4552 4484 kernfs_put(cgrp->kn); 4485 + if (cgroup_on_dfl(cgrp)) 4486 + cgroup_stat_exit(cgrp); 4553 4487 kfree(cgrp); 4554 4488 } else { 4555 4489 /* ··· 4595 4525 4596 4526 /* cgroup release path */ 4597 4527 trace_cgroup_release(cgrp); 4528 + 4529 + if (cgroup_on_dfl(cgrp)) 4530 + cgroup_stat_flush(cgrp); 4598 4531 4599 4532 for (tcgrp = cgroup_parent(cgrp); tcgrp; 4600 4533 tcgrp = cgroup_parent(tcgrp)) ··· 4782 4709 if (ret) 4783 4710 goto out_free_cgrp; 4784 4711 4712 + if (cgroup_on_dfl(parent)) { 4713 + ret = 
cgroup_stat_init(cgrp); 4714 + if (ret) 4715 + goto out_cancel_ref; 4716 + } 4717 + 4785 4718 /* 4786 4719 * Temporarily set the pointer to NULL, so idr_find() won't return 4787 4720 * a half-baked cgroup. ··· 4795 4716 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); 4796 4717 if (cgrp->id < 0) { 4797 4718 ret = -ENOMEM; 4798 - goto out_cancel_ref; 4719 + goto out_stat_exit; 4799 4720 } 4800 4721 4801 4722 init_cgroup_housekeeping(cgrp); ··· 4846 4767 4847 4768 out_idr_free: 4848 4769 cgroup_idr_remove(&root->cgroup_idr, cgrp->id); 4770 + out_stat_exit: 4771 + if (cgroup_on_dfl(parent)) 4772 + cgroup_stat_exit(cgrp); 4849 4773 out_cancel_ref: 4850 4774 percpu_ref_exit(&cgrp->self.refcnt); 4851 4775 out_free_cgrp: ··· 5242 5160 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); 5243 5161 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 5244 5162 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); 5163 + 5164 + cgroup_stat_boot(); 5245 5165 5246 5166 /* 5247 5167 * The latency of the synchronize_sched() is too high for cgroups, ··· 5864 5780 return ret; 5865 5781 } 5866 5782 #endif /* CONFIG_CGROUP_BPF */ 5783 + 5784 + #ifdef CONFIG_SYSFS 5785 + static ssize_t show_delegatable_files(struct cftype *files, char *buf, 5786 + ssize_t size, const char *prefix) 5787 + { 5788 + struct cftype *cft; 5789 + ssize_t ret = 0; 5790 + 5791 + for (cft = files; cft && cft->name[0] != '\0'; cft++) { 5792 + if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) 5793 + continue; 5794 + 5795 + if (prefix) 5796 + ret += snprintf(buf + ret, size - ret, "%s.", prefix); 5797 + 5798 + ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); 5799 + 5800 + if (unlikely(ret >= size)) { 5801 + WARN_ON(1); 5802 + break; 5803 + } 5804 + } 5805 + 5806 + return ret; 5807 + } 5808 + 5809 + static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, 5810 + char *buf) 5811 + { 5812 + struct cgroup_subsys *ss; 5813 + int ssid; 5814 + ssize_t ret = 0; 5815 + 
5816 + ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, 5817 + NULL); 5818 + 5819 + for_each_subsys(ss, ssid) 5820 + ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, 5821 + PAGE_SIZE - ret, 5822 + cgroup_subsys_name[ssid]); 5823 + 5824 + return ret; 5825 + } 5826 + static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); 5827 + 5828 + static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, 5829 + char *buf) 5830 + { 5831 + return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); 5832 + } 5833 + static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); 5834 + 5835 + static struct attribute *cgroup_sysfs_attrs[] = { 5836 + &cgroup_delegate_attr.attr, 5837 + &cgroup_features_attr.attr, 5838 + NULL, 5839 + }; 5840 + 5841 + static const struct attribute_group cgroup_sysfs_attr_group = { 5842 + .attrs = cgroup_sysfs_attrs, 5843 + .name = "cgroup", 5844 + }; 5845 + 5846 + static int __init cgroup_sysfs_init(void) 5847 + { 5848 + return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); 5849 + } 5850 + subsys_initcall(cgroup_sysfs_init); 5851 + #endif /* CONFIG_SYSFS */

+334

kernel/cgroup/stat.c

··· 1 + #include "cgroup-internal.h" 2 + 3 + #include <linux/sched/cputime.h> 4 + 5 + static DEFINE_MUTEX(cgroup_stat_mutex); 6 + static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); 7 + 8 + static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) 9 + { 10 + return per_cpu_ptr(cgrp->cpu_stat, cpu); 11 + } 12 + 13 + /** 14 + * cgroup_cpu_stat_updated - keep track of updated cpu_stat 15 + * @cgrp: target cgroup 16 + * @cpu: cpu on which cpu_stat was updated 17 + * 18 + * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching 19 + * cpu_stat->updated_children list. See the comment on top of 20 + * cgroup_cpu_stat definition for details. 21 + */ 22 + static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) 23 + { 24 + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); 25 + struct cgroup *parent; 26 + unsigned long flags; 27 + 28 + /* 29 + * Speculative already-on-list test. This may race leading to 30 + * temporary inaccuracies, which is fine. 31 + * 32 + * Because @parent's updated_children is terminated with @parent 33 + * instead of NULL, we can tell whether @cgrp is on the list by 34 + * testing the next pointer for NULL. 35 + */ 36 + if (cgroup_cpu_stat(cgrp, cpu)->updated_next) 37 + return; 38 + 39 + raw_spin_lock_irqsave(cpu_lock, flags); 40 + 41 + /* put @cgrp and all ancestors on the corresponding updated lists */ 42 + for (parent = cgroup_parent(cgrp); parent; 43 + cgrp = parent, parent = cgroup_parent(cgrp)) { 44 + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); 45 + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); 46 + 47 + /* 48 + * Both additions and removals are bottom-up. If a cgroup 49 + * is already in the tree, all ancestors are. 
50 + */ 51 + if (cstat->updated_next) 52 + break; 53 + 54 + cstat->updated_next = pcstat->updated_children; 55 + pcstat->updated_children = cgrp; 56 + } 57 + 58 + raw_spin_unlock_irqrestore(cpu_lock, flags); 59 + } 60 + 61 + /** 62 + * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree 63 + * @pos: current position 64 + * @root: root of the tree to traverse 65 + * @cpu: target cpu 66 + * 67 + * Walks the updated cpu_stat tree on @cpu from @root. %NULL @pos starts 68 + * the traversal and %NULL return indicates the end. During traversal, 69 + * each returned cgroup is unlinked from the tree. Must be called with the 70 + * matching cgroup_cpu_stat_lock held. 71 + * 72 + * The only ordering guarantee is that, for a parent and a child pair 73 + * covered by a given traversal, if a child is visited, its parent is 74 + * guaranteed to be visited afterwards. 75 + */ 76 + static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, 77 + struct cgroup *root, int cpu) 78 + { 79 + struct cgroup_cpu_stat *cstat; 80 + struct cgroup *parent; 81 + 82 + if (pos == root) 83 + return NULL; 84 + 85 + /* 86 + * We're gonna walk down to the first leaf and visit/remove it. We 87 + * can pick whatever unvisited node as the starting point. 88 + */ 89 + if (!pos) 90 + pos = root; 91 + else 92 + pos = cgroup_parent(pos); 93 + 94 + /* walk down to the first leaf */ 95 + while (true) { 96 + cstat = cgroup_cpu_stat(pos, cpu); 97 + if (cstat->updated_children == pos) 98 + break; 99 + pos = cstat->updated_children; 100 + } 101 + 102 + /* 103 + * Unlink @pos from the tree. As the updated_children list is 104 + * singly linked, we have to walk it to find the removal point. 105 + * However, due to the way we traverse, @pos will be the first 106 + * child in most cases. The only exception is @root. 
107 + */ 108 + parent = cgroup_parent(pos); 109 + if (parent && cstat->updated_next) { 110 + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); 111 + struct cgroup_cpu_stat *ncstat; 112 + struct cgroup **nextp; 113 + 114 + nextp = &pcstat->updated_children; 115 + while (true) { 116 + ncstat = cgroup_cpu_stat(*nextp, cpu); 117 + if (*nextp == pos) 118 + break; 119 + 120 + WARN_ON_ONCE(*nextp == parent); 121 + nextp = &ncstat->updated_next; 122 + } 123 + 124 + *nextp = cstat->updated_next; 125 + cstat->updated_next = NULL; 126 + } 127 + 128 + return pos; 129 + } 130 + 131 + static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, 132 + struct cgroup_stat *src_stat) 133 + { 134 + dst_stat->cputime.utime += src_stat->cputime.utime; 135 + dst_stat->cputime.stime += src_stat->cputime.stime; 136 + dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; 137 + } 138 + 139 + static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) 140 + { 141 + struct cgroup *parent = cgroup_parent(cgrp); 142 + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); 143 + struct task_cputime *last_cputime = &cstat->last_cputime; 144 + struct task_cputime cputime; 145 + struct cgroup_stat delta; 146 + unsigned seq; 147 + 148 + lockdep_assert_held(&cgroup_stat_mutex); 149 + 150 + /* fetch the current per-cpu values */ 151 + do { 152 + seq = __u64_stats_fetch_begin(&cstat->sync); 153 + cputime = cstat->cputime; 154 + } while (__u64_stats_fetch_retry(&cstat->sync, seq)); 155 + 156 + /* accumulate the deltas to propagate */ 157 + delta.cputime.utime = cputime.utime - last_cputime->utime; 158 + delta.cputime.stime = cputime.stime - last_cputime->stime; 159 + delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - 160 + last_cputime->sum_exec_runtime; 161 + *last_cputime = cputime; 162 + 163 + /* transfer the pending stat into delta */ 164 + cgroup_stat_accumulate(&delta, &cgrp->pending_stat); 165 + memset(&cgrp->pending_stat, 0, 
sizeof(cgrp->pending_stat)); 166 + 167 + /* propagate delta into the global stat and the parent's pending */ 168 + cgroup_stat_accumulate(&cgrp->stat, &delta); 169 + if (parent) 170 + cgroup_stat_accumulate(&parent->pending_stat, &delta); 171 + } 172 + 173 + /* see cgroup_stat_flush() */ 174 + static void cgroup_stat_flush_locked(struct cgroup *cgrp) 175 + { 176 + int cpu; 177 + 178 + lockdep_assert_held(&cgroup_stat_mutex); 179 + 180 + for_each_possible_cpu(cpu) { 181 + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); 182 + struct cgroup *pos = NULL; 183 + 184 + raw_spin_lock_irq(cpu_lock); 185 + while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) 186 + cgroup_cpu_stat_flush_one(pos, cpu); 187 + raw_spin_unlock_irq(cpu_lock); 188 + } 189 + } 190 + 191 + /** 192 + * cgroup_stat_flush - flush stats in @cgrp's subtree 193 + * @cgrp: target cgroup 194 + * 195 + * Collect all per-cpu stats in @cgrp's subtree into the global counters 196 + * and propagate them upwards. After this function returns, all cgroups in 197 + * the subtree have up-to-date ->stat. 198 + * 199 + * This also gets all cgroups in the subtree including @cgrp off the 200 + * ->updated_children lists. 
201 + */ 202 + void cgroup_stat_flush(struct cgroup *cgrp) 203 + { 204 + mutex_lock(&cgroup_stat_mutex); 205 + cgroup_stat_flush_locked(cgrp); 206 + mutex_unlock(&cgroup_stat_mutex); 207 + } 208 + 209 + static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) 210 + { 211 + struct cgroup_cpu_stat *cstat; 212 + 213 + cstat = get_cpu_ptr(cgrp->cpu_stat); 214 + u64_stats_update_begin(&cstat->sync); 215 + return cstat; 216 + } 217 + 218 + static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, 219 + struct cgroup_cpu_stat *cstat) 220 + { 221 + u64_stats_update_end(&cstat->sync); 222 + cgroup_cpu_stat_updated(cgrp, smp_processor_id()); 223 + put_cpu_ptr(cstat); 224 + } 225 + 226 + void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) 227 + { 228 + struct cgroup_cpu_stat *cstat; 229 + 230 + cstat = cgroup_cpu_stat_account_begin(cgrp); 231 + cstat->cputime.sum_exec_runtime += delta_exec; 232 + cgroup_cpu_stat_account_end(cgrp, cstat); 233 + } 234 + 235 + void __cgroup_account_cputime_field(struct cgroup *cgrp, 236 + enum cpu_usage_stat index, u64 delta_exec) 237 + { 238 + struct cgroup_cpu_stat *cstat; 239 + 240 + cstat = cgroup_cpu_stat_account_begin(cgrp); 241 + 242 + switch (index) { 243 + case CPUTIME_USER: 244 + case CPUTIME_NICE: 245 + cstat->cputime.utime += delta_exec; 246 + break; 247 + case CPUTIME_SYSTEM: 248 + case CPUTIME_IRQ: 249 + case CPUTIME_SOFTIRQ: 250 + cstat->cputime.stime += delta_exec; 251 + break; 252 + default: 253 + break; 254 + } 255 + 256 + cgroup_cpu_stat_account_end(cgrp, cstat); 257 + } 258 + 259 + void cgroup_stat_show_cputime(struct seq_file *seq) 260 + { 261 + struct cgroup *cgrp = seq_css(seq)->cgroup; 262 + u64 usage, utime, stime; 263 + 264 + if (!cgroup_parent(cgrp)) 265 + return; 266 + 267 + mutex_lock(&cgroup_stat_mutex); 268 + 269 + cgroup_stat_flush_locked(cgrp); 270 + 271 + usage = cgrp->stat.cputime.sum_exec_runtime; 272 + cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, 273 
+ &utime, &stime); 274 + 275 + mutex_unlock(&cgroup_stat_mutex); 276 + 277 + do_div(usage, NSEC_PER_USEC); 278 + do_div(utime, NSEC_PER_USEC); 279 + do_div(stime, NSEC_PER_USEC); 280 + 281 + seq_printf(seq, "usage_usec %llu\n" 282 + "user_usec %llu\n" 283 + "system_usec %llu\n", 284 + usage, utime, stime); 285 + } 286 + 287 + int cgroup_stat_init(struct cgroup *cgrp) 288 + { 289 + int cpu; 290 + 291 + /* the root cgrp has cpu_stat preallocated */ 292 + if (!cgrp->cpu_stat) { 293 + cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); 294 + if (!cgrp->cpu_stat) 295 + return -ENOMEM; 296 + } 297 + 298 + /* ->updated_children list is self terminated */ 299 + for_each_possible_cpu(cpu) 300 + cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; 301 + 302 + prev_cputime_init(&cgrp->stat.prev_cputime); 303 + 304 + return 0; 305 + } 306 + 307 + void cgroup_stat_exit(struct cgroup *cgrp) 308 + { 309 + int cpu; 310 + 311 + cgroup_stat_flush(cgrp); 312 + 313 + /* sanity check */ 314 + for_each_possible_cpu(cpu) { 315 + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); 316 + 317 + if (WARN_ON_ONCE(cstat->updated_children != cgrp) || 318 + WARN_ON_ONCE(cstat->updated_next)) 319 + return; 320 + } 321 + 322 + free_percpu(cgrp->cpu_stat); 323 + cgrp->cpu_stat = NULL; 324 + } 325 + 326 + void __init cgroup_stat_boot(void) 327 + { 328 + int cpu; 329 + 330 + for_each_possible_cpu(cpu) 331 + raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); 332 + 333 + BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); 334 + }

+170 -4

kernel/sched/core.c

··· 6620 6620 return ret; 6621 6621 } 6622 6622 6623 - static int cpu_stats_show(struct seq_file *sf, void *v) 6623 + static int cpu_cfs_stat_show(struct seq_file *sf, void *v) 6624 6624 { 6625 6625 struct task_group *tg = css_tg(seq_css(sf)); 6626 6626 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; ··· 6660 6660 } 6661 6661 #endif /* CONFIG_RT_GROUP_SCHED */ 6662 6662 6663 - static struct cftype cpu_files[] = { 6663 + static struct cftype cpu_legacy_files[] = { 6664 6664 #ifdef CONFIG_FAIR_GROUP_SCHED 6665 6665 { 6666 6666 .name = "shares", ··· 6681 6681 }, 6682 6682 { 6683 6683 .name = "stat", 6684 - .seq_show = cpu_stats_show, 6684 + .seq_show = cpu_cfs_stat_show, 6685 6685 }, 6686 6686 #endif 6687 6687 #ifdef CONFIG_RT_GROUP_SCHED ··· 6699 6699 { } /* Terminate */ 6700 6700 }; 6701 6701 6702 + static int cpu_extra_stat_show(struct seq_file *sf, 6703 + struct cgroup_subsys_state *css) 6704 + { 6705 + #ifdef CONFIG_CFS_BANDWIDTH 6706 + { 6707 + struct task_group *tg = css_tg(css); 6708 + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 6709 + u64 throttled_usec; 6710 + 6711 + throttled_usec = cfs_b->throttled_time; 6712 + do_div(throttled_usec, NSEC_PER_USEC); 6713 + 6714 + seq_printf(sf, "nr_periods %d\n" 6715 + "nr_throttled %d\n" 6716 + "throttled_usec %llu\n", 6717 + cfs_b->nr_periods, cfs_b->nr_throttled, 6718 + throttled_usec); 6719 + } 6720 + #endif 6721 + return 0; 6722 + } 6723 + 6724 + #ifdef CONFIG_FAIR_GROUP_SCHED 6725 + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, 6726 + struct cftype *cft) 6727 + { 6728 + struct task_group *tg = css_tg(css); 6729 + u64 weight = scale_load_down(tg->shares); 6730 + 6731 + return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); 6732 + } 6733 + 6734 + static int cpu_weight_write_u64(struct cgroup_subsys_state *css, 6735 + struct cftype *cft, u64 weight) 6736 + { 6737 + /* 6738 + * cgroup weight knobs should use the common MIN, DFL and MAX 6739 + * values which are 1, 100 and 10000 
respectively. While it loses 6740 + * a bit of range on both ends, it maps pretty well onto the shares 6741 + * value used by scheduler and the round-trip conversions preserve 6742 + * the original value over the entire range. 6743 + */ 6744 + if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) 6745 + return -ERANGE; 6746 + 6747 + weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); 6748 + 6749 + return sched_group_set_shares(css_tg(css), scale_load(weight)); 6750 + } 6751 + 6752 + static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, 6753 + struct cftype *cft) 6754 + { 6755 + unsigned long weight = scale_load_down(css_tg(css)->shares); 6756 + int last_delta = INT_MAX; 6757 + int prio, delta; 6758 + 6759 + /* find the closest nice value to the current weight */ 6760 + for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) { 6761 + delta = abs(sched_prio_to_weight[prio] - weight); 6762 + if (delta >= last_delta) 6763 + break; 6764 + last_delta = delta; 6765 + } 6766 + 6767 + return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO); 6768 + } 6769 + 6770 + static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, 6771 + struct cftype *cft, s64 nice) 6772 + { 6773 + unsigned long weight; 6774 + 6775 + if (nice < MIN_NICE || nice > MAX_NICE) 6776 + return -ERANGE; 6777 + 6778 + weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO]; 6779 + return sched_group_set_shares(css_tg(css), scale_load(weight)); 6780 + } 6781 + #endif 6782 + 6783 + static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, 6784 + long period, long quota) 6785 + { 6786 + if (quota < 0) 6787 + seq_puts(sf, "max"); 6788 + else 6789 + seq_printf(sf, "%ld", quota); 6790 + 6791 + seq_printf(sf, " %ld\n", period); 6792 + } 6793 + 6794 + /* caller should put the current value in *@periodp before calling; returns 0 on success or -EINVAL when no token or an unparsable quota token is given */ 6795 + static int __maybe_unused 
cpu_period_quota_parse(char *buf, 6796 + u64 *periodp, u64 *quotap) 6797 + { 6798 + char tok[21]; /* 
U64_MAX is 20 digits, plus NUL; the %20s width below must stay in sync */ 6799 + 6800 + if (sscanf(buf, "%20s %llu", tok, periodp) < 1) 6801 + return -EINVAL; 6802 + 6803 + *periodp *= NSEC_PER_USEC; 6804 + 6805 + if (sscanf(tok, "%llu", quotap)) 6806 + *quotap *= NSEC_PER_USEC; 6807 + else if (!strcmp(tok, "max")) 6808 + *quotap = RUNTIME_INF; 6809 + else 6810 + return -EINVAL; 6811 + 6812 + return 0; 6813 + } 6814 + 6815 + #ifdef CONFIG_CFS_BANDWIDTH 6816 + static int cpu_max_show(struct seq_file *sf, void *v) 6817 + { 6818 + struct task_group *tg = css_tg(seq_css(sf)); 6819 + 6820 + cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); 6821 + return 0; 6822 + } 6823 + 6824 + static ssize_t cpu_max_write(struct kernfs_open_file *of, 6825 + char *buf, size_t nbytes, loff_t off) 6826 + { 6827 + struct task_group *tg = css_tg(of_css(of)); 6828 + u64 period = tg_get_cfs_period(tg); 6829 + u64 quota; 6830 + int ret; 6831 + 6832 + ret = cpu_period_quota_parse(buf, &period, &quota); 6833 + if (!ret) 6834 + ret = tg_set_cfs_bandwidth(tg, period, quota); 6835 + return ret ?: nbytes; 6836 + } 6837 + #endif 6838 + 6839 + static struct cftype cpu_files[] = { 6840 + #ifdef CONFIG_FAIR_GROUP_SCHED 6841 + { 6842 + .name = "weight", 6843 + .flags = CFTYPE_NOT_ON_ROOT, 6844 + .read_u64 = cpu_weight_read_u64, 6845 + .write_u64 = cpu_weight_write_u64, 6846 + }, 6847 + { 6848 + .name = "weight.nice", 6849 + .flags = CFTYPE_NOT_ON_ROOT, 6850 + .read_s64 = cpu_weight_nice_read_s64, 6851 + .write_s64 = cpu_weight_nice_write_s64, 6852 + }, 6853 + #endif 6854 + #ifdef CONFIG_CFS_BANDWIDTH 6855 + { 6856 + .name = "max", 6857 + .flags = CFTYPE_NOT_ON_ROOT, 6858 + .seq_show = cpu_max_show, 6859 + .write = cpu_max_write, 6860 + }, 6861 + #endif 6862 + { } /* terminate */ 6863 + }; 6864 + 6702 6865 struct cgroup_subsys cpu_cgrp_subsys = { 6703 6866 .css_alloc = cpu_cgroup_css_alloc, 6704 6867 .css_online = cpu_cgroup_css_online, 6705 6868 .css_released = cpu_cgroup_css_released, 
6706 6869 .css_free = cpu_cgroup_css_free, 6870 + 
.css_extra_stat_show = cpu_extra_stat_show, 6707 6871 .fork = cpu_cgroup_fork, 6708 6872 .can_attach = cpu_cgroup_can_attach, 6709 6873 .attach = cpu_cgroup_attach, 6710 - .legacy_cftypes = cpu_files, 6874 + .legacy_cftypes = cpu_legacy_files, 6875 + .dfl_cftypes = cpu_files, 6711 6876 .early_init = true, 6877 + .threaded = true, 6712 6878 }; 6713 6879 6714 6880 #endif /* CONFIG_CGROUP_SCHED */

-18

kernel/sched/cpuacct.h

··· 1 - /* SPDX-License-Identifier: GPL-2.0 */ 2 - #ifdef CONFIG_CGROUP_CPUACCT 3 - 4 - extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); 5 - extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); 6 - 7 - #else 8 - 9 - static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) 10 - { 11 - } 12 - 13 - static inline void 14 - cpuacct_account_field(struct task_struct *tsk, int index, u64 val) 15 - { 16 - } 17 - 18 - #endif

+10 -4

kernel/sched/cputime.c

··· 109 109 */ 110 110 __this_cpu_add(kernel_cpustat.cpustat[index], tmp); 111 111 112 - cpuacct_account_field(p, index, tmp); 112 + cgroup_account_cputime_field(p, index, tmp); 113 113 } 114 114 115 115 /* ··· 446 446 EXPORT_SYMBOL_GPL(vtime_account_irq_enter); 447 447 #endif /* __ARCH_HAS_VTIME_ACCOUNT */ 448 448 449 + void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, 450 + u64 *ut, u64 *st) 451 + { 452 + *ut = curr->utime; 453 + *st = curr->stime; 454 + } 455 + 449 456 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 450 457 { 451 458 *ut = p->utime; ··· 591 584 * 592 585 * Assuming that rtime_i+1 >= rtime_i. 593 586 */ 594 - static void cputime_adjust(struct task_cputime *curr, 595 - struct prev_cputime *prev, 596 - u64 *ut, u64 *st) 587 + void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, 588 + u64 *ut, u64 *st) 597 589 { 598 590 u64 rtime, stime, utime; 599 591 unsigned long flags;

+1 -1

kernel/sched/deadline.c

··· 1144 1144 account_group_exec_runtime(curr, delta_exec); 1145 1145 1146 1146 curr->se.exec_start = rq_clock_task(rq); 1147 - cpuacct_charge(curr, delta_exec); 1147 + cgroup_account_cputime(curr, delta_exec); 1148 1148 1149 1149 sched_rt_avg_update(rq, delta_exec); 1150 1150

+1 -1

kernel/sched/fair.c

··· 844 844 struct task_struct *curtask = task_of(curr); 845 845 846 846 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); 847 - cpuacct_charge(curtask, delta_exec); 847 + cgroup_account_cputime(curtask, delta_exec); 848 848 account_group_exec_runtime(curtask, delta_exec); 849 849 } 850 850

+1 -1

kernel/sched/rt.c

··· 969 969 account_group_exec_runtime(curr, delta_exec); 970 970 971 971 curr->se.exec_start = rq_clock_task(rq); 972 - cpuacct_charge(curr, delta_exec); 972 + cgroup_account_cputime(curr, delta_exec); 973 973 974 974 sched_rt_avg_update(rq, delta_exec); 975 975

+1 -1

kernel/sched/sched.h

··· 30 30 #include <linux/irq_work.h> 31 31 #include <linux/tick.h> 32 32 #include <linux/slab.h> 33 + #include <linux/cgroup.h> 33 34 34 35 #ifdef CONFIG_PARAVIRT 35 36 #include <asm/paravirt.h> ··· 38 37 39 38 #include "cpupri.h" 40 39 #include "cpudeadline.h" 41 - #include "cpuacct.h" 42 40 43 41 #ifdef CONFIG_SCHED_DEBUG 44 42 # define SCHED_WARN_ON(x) WARN_ONCE(x, #x)

+1 -1

kernel/sched/stop_task.c

··· 72 72 account_group_exec_runtime(curr, delta_exec); 73 73 74 74 curr->se.exec_start = rq_clock_task(rq); 75 - cpuacct_charge(curr, delta_exec); 75 + cgroup_account_cputime(curr, delta_exec); 76 76 } 77 77 78 78 static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)