Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched_ext: Add support for cgroup bandwidth control interface

From 077814f57f8acce13f91dc34bbd2b7e4911fbf25 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Jun 2025 15:06:47 -1000

- Add CONFIG_GROUP_SCHED_BANDWIDTH which is selected by both
CONFIG_CFS_BANDWIDTH and CONFIG_EXT_GROUP_SCHED.

- Put bandwidth control interface files for both cgroup v1 and v2 under
CONFIG_GROUP_SCHED_BANDWIDTH.

- Update tg_bandwidth() to fetch configuration parameters from the fair
class if CONFIG_CFS_BANDWIDTH is enabled, and from SCX otherwise.

- Update tg_set_bandwidth() to update the parameters for both fair and SCX.

- Add bandwidth control parameters to struct scx_cgroup_init_args.

- Add sched_ext_ops.cgroup_set_bandwidth() which is invoked on bandwidth
control parameter updates.

- Update scx_qmap and maximal selftest to test the new feature.

Signed-off-by: Tejun Heo <tj@kernel.org>

Tejun Heo ddceadce 6e6558a6

+127 -10
+3
include/linux/sched/ext.h
··· 219 219 #ifdef CONFIG_EXT_GROUP_SCHED 220 220 u32 flags; /* SCX_TG_* */ 221 221 u32 weight; 222 + u64 bw_period_us; 223 + u64 bw_quota_us; 224 + u64 bw_burst_us; 222 225 #endif 223 226 }; 224 227
+5
init/Kconfig
··· 1065 1065 config GROUP_SCHED_WEIGHT 1066 1066 def_bool n 1067 1067 1068 + config GROUP_SCHED_BANDWIDTH 1069 + def_bool n 1070 + 1068 1071 config FAIR_GROUP_SCHED 1069 1072 bool "Group scheduling for SCHED_OTHER" 1070 1073 depends on CGROUP_SCHED ··· 1077 1074 config CFS_BANDWIDTH 1078 1075 bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" 1079 1076 depends on FAIR_GROUP_SCHED 1077 + select GROUP_SCHED_BANDWIDTH 1080 1078 default n 1081 1079 help 1082 1080 This option allows users to define CPU bandwidth rates (limits) for ··· 1112 1108 bool 1113 1109 depends on SCHED_CLASS_EXT && CGROUP_SCHED 1114 1110 select GROUP_SCHED_WEIGHT 1111 + select GROUP_SCHED_BANDWIDTH 1115 1112 default y 1116 1113 1117 1114 endif #CGROUP_SCHED
+24 -5
kernel/sched/core.c
··· 9545 9545 9546 9546 return 0; 9547 9547 } 9548 + #endif /* CONFIG_CFS_BANDWIDTH */ 9548 9549 9550 + #ifdef CONFIG_GROUP_SCHED_BANDWIDTH 9549 9551 const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ 9550 9552 static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */ 9551 9553 /* More than 203 days if BW_SHIFT equals 20. */ ··· 9556 9554 static void tg_bandwidth(struct task_group *tg, 9557 9555 u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) 9558 9556 { 9557 + #ifdef CONFIG_CFS_BANDWIDTH 9559 9558 if (period_us_p) 9560 9559 *period_us_p = tg_get_cfs_period(tg); 9561 9560 if (quota_us_p) 9562 9561 *quota_us_p = tg_get_cfs_quota(tg); 9563 9562 if (burst_us_p) 9564 9563 *burst_us_p = tg_get_cfs_burst(tg); 9564 + #else /* !CONFIG_CFS_BANDWIDTH */ 9565 + if (period_us_p) 9566 + *period_us_p = tg->scx.bw_period_us; 9567 + if (quota_us_p) 9568 + *quota_us_p = tg->scx.bw_quota_us; 9569 + if (burst_us_p) 9570 + *burst_us_p = tg->scx.bw_burst_us; 9571 + #endif /* CONFIG_CFS_BANDWIDTH */ 9565 9572 } 9566 9573 9567 9574 static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, ··· 9586 9575 u64 period_us, u64 quota_us, u64 burst_us) 9587 9576 { 9588 9577 const u64 max_usec = U64_MAX / NSEC_PER_USEC; 9578 + int ret = 0; 9589 9579 9590 9580 if (tg == &root_task_group) 9591 9581 return -EINVAL; ··· 9624 9612 burst_us + quota_us > max_bw_runtime_us)) 9625 9613 return -EINVAL; 9626 9614 9627 - return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); 9615 + #ifdef CONFIG_CFS_BANDWIDTH 9616 + ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); 9617 + #endif /* CONFIG_CFS_BANDWIDTH */ 9618 + if (!ret) 9619 + scx_group_set_bandwidth(tg, period_us, quota_us, burst_us); 9620 + return ret; 9628 9621 } 9629 9622 9630 9623 static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, ··· 9682 9665 tg_bandwidth(tg, &period_us, &quota_us, NULL); 9683 9666 return tg_set_bandwidth(tg, period_us, quota_us, burst_us); 9684 9667 } 9685 - #endif 
/* CONFIG_CFS_BANDWIDTH */ 9668 + #endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ 9686 9669 9687 9670 #ifdef CONFIG_RT_GROUP_SCHED 9688 9671 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, ··· 9742 9725 .write_s64 = cpu_idle_write_s64, 9743 9726 }, 9744 9727 #endif 9745 - #ifdef CONFIG_CFS_BANDWIDTH 9728 + #ifdef CONFIG_GROUP_SCHED_BANDWIDTH 9746 9729 { 9747 9730 .name = "cfs_period_us", 9748 9731 .read_u64 = cpu_period_read_u64, ··· 9758 9741 .read_u64 = cpu_burst_read_u64, 9759 9742 .write_u64 = cpu_burst_write_u64, 9760 9743 }, 9744 + #endif 9745 + #ifdef CONFIG_CFS_BANDWIDTH 9761 9746 { 9762 9747 .name = "stat", 9763 9748 .seq_show = cpu_cfs_stat_show, ··· 9973 9954 return 0; 9974 9955 } 9975 9956 9976 - #ifdef CONFIG_CFS_BANDWIDTH 9957 + #ifdef CONFIG_GROUP_SCHED_BANDWIDTH 9977 9958 static int cpu_max_show(struct seq_file *sf, void *v) 9978 9959 { 9979 9960 struct task_group *tg = css_tg(seq_css(sf)); ··· 10020 10001 .write_s64 = cpu_idle_write_s64, 10021 10002 }, 10022 10003 #endif 10023 - #ifdef CONFIG_CFS_BANDWIDTH 10004 + #ifdef CONFIG_GROUP_SCHED_BANDWIDTH 10024 10005 { 10025 10006 .name = "max", 10026 10007 .flags = CFTYPE_NOT_ON_ROOT,
+63 -3
kernel/sched/ext.c
··· 203 203 struct scx_cgroup_init_args { 204 204 /* the weight of the cgroup [1..10000] */ 205 205 u32 weight; 206 + 207 + /* bandwidth control parameters from cpu.max and cpu.max.burst */ 208 + u64 bw_period_us; 209 + u64 bw_quota_us; 210 + u64 bw_burst_us; 206 211 }; 207 212 208 213 enum scx_cpu_preempt_reason { ··· 669 664 * @cgrp: cgroup whose weight is being updated 670 665 * @weight: new weight [1..10000] 671 666 * 672 - * Update @tg's weight to @weight. 667 + * Update @cgrp's weight to @weight. 673 668 */ 674 669 void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); 670 + 671 + /** 672 + * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed 673 + * @cgrp: cgroup whose bandwidth is being updated 674 + * @period_us: bandwidth control period 675 + * @quota_us: bandwidth control quota 676 + * @burst_us: bandwidth control burst 677 + * 678 + * Update @cgrp's bandwidth control parameters. This is from the cpu.max 679 + * cgroup interface. 680 + * 681 + * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled 682 + * to. For example, if @period_us is 1_000_000 and @quota_us is 683 + * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be 684 + * interpreted in the same fashion and specifies how much @cgrp can 685 + * burst temporarily. The specific control mechanism and thus the 686 + * interpretation of @period_us and burstiness is up to the BPF 687 + * scheduler. 
688 + */ 689 + void (*cgroup_set_bandwidth)(struct cgroup *cgrp, 690 + u64 period_us, u64 quota_us, u64 burst_us); 691 + 675 692 #endif /* CONFIG_EXT_GROUP_SCHED */ 676 693 677 694 /* ··· 4086 4059 void scx_tg_init(struct task_group *tg) 4087 4060 { 4088 4061 tg->scx.weight = CGROUP_WEIGHT_DFL; 4062 + tg->scx.bw_period_us = default_bw_period_us(); 4063 + tg->scx.bw_quota_us = RUNTIME_INF; 4089 4064 } 4090 4065 4091 4066 int scx_tg_online(struct task_group *tg) ··· 4102 4073 if (scx_cgroup_enabled) { 4103 4074 if (SCX_HAS_OP(sch, cgroup_init)) { 4104 4075 struct scx_cgroup_init_args args = 4105 - { .weight = tg->scx.weight }; 4076 + { .weight = tg->scx.weight, 4077 + .bw_period_us = tg->scx.bw_period_us, 4078 + .bw_quota_us = tg->scx.bw_quota_us, 4079 + .bw_burst_us = tg->scx.bw_burst_us }; 4106 4080 4107 4081 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, 4108 4082 NULL, tg->css.cgroup, &args); ··· 4255 4223 void scx_group_set_idle(struct task_group *tg, bool idle) 4256 4224 { 4257 4225 /* TODO: Implement ops->cgroup_set_idle() */ 4226 + } 4227 + 4228 + void scx_group_set_bandwidth(struct task_group *tg, 4229 + u64 period_us, u64 quota_us, u64 burst_us) 4230 + { 4231 + struct scx_sched *sch = scx_root; 4232 + 4233 + percpu_down_read(&scx_cgroup_rwsem); 4234 + 4235 + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && 4236 + (tg->scx.bw_period_us != period_us || 4237 + tg->scx.bw_quota_us != quota_us || 4238 + tg->scx.bw_burst_us != burst_us)) 4239 + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL, 4240 + tg_cgrp(tg), period_us, quota_us, burst_us); 4241 + 4242 + tg->scx.bw_period_us = period_us; 4243 + tg->scx.bw_quota_us = quota_us; 4244 + tg->scx.bw_burst_us = burst_us; 4245 + 4246 + percpu_up_read(&scx_cgroup_rwsem); 4258 4247 } 4259 4248 4260 4249 static void scx_cgroup_lock(void) ··· 4453 4400 rcu_read_lock(); 4454 4401 css_for_each_descendant_pre(css, &root_task_group.css) { 4455 4402 struct task_group *tg = css_tg(css); 
4456 - struct scx_cgroup_init_args args = { .weight = tg->scx.weight }; 4403 + struct scx_cgroup_init_args args = { 4404 + .weight = tg->scx.weight, 4405 + .bw_period_us = tg->scx.bw_period_us, 4406 + .bw_quota_us = tg->scx.bw_quota_us, 4407 + .bw_burst_us = tg->scx.bw_burst_us, 4408 + }; 4457 4409 4458 4410 if ((tg->scx.flags & 4459 4411 (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) ··· 5960 5902 static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 5961 5903 static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 5962 5904 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} 5905 + static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} 5963 5906 #endif 5964 5907 static void sched_ext_ops__cpu_online(s32 cpu) {} 5965 5908 static void sched_ext_ops__cpu_offline(s32 cpu) {} ··· 5998 5939 .cgroup_move = sched_ext_ops__cgroup_move, 5999 5940 .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, 6000 5941 .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, 5942 + .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, 6001 5943 #endif 6002 5944 .cpu_online = sched_ext_ops__cpu_online, 6003 5945 .cpu_offline = sched_ext_ops__cpu_offline,
+2
kernel/sched/ext.h
··· 104 104 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); 105 105 void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); 106 106 void scx_group_set_idle(struct task_group *tg, bool idle); 107 + void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us); 107 108 #else /* CONFIG_EXT_GROUP_SCHED */ 108 109 static inline void scx_tg_init(struct task_group *tg) {} 109 110 static inline int scx_tg_online(struct task_group *tg) { return 0; } ··· 115 114 static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} 116 115 static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} 117 116 static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} 117 + static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {} 118 118 #endif /* CONFIG_EXT_GROUP_SCHED */ 119 119 #endif /* CONFIG_CGROUP_SCHED */
+2 -2
kernel/sched/sched.h
··· 402 402 403 403 extern struct list_head task_groups; 404 404 405 - #ifdef CONFIG_CFS_BANDWIDTH 405 + #ifdef CONFIG_GROUP_SCHED_BANDWIDTH 406 406 extern const u64 max_bw_quota_period_us; 407 407 408 408 /* ··· 413 413 { 414 414 return 100000ULL; 415 415 } 416 - #endif /* CONFIG_CFS_BANDWIDTH */ 416 + #endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ 417 417 418 418 struct cfs_bandwidth { 419 419 #ifdef CONFIG_CFS_BANDWIDTH
+23
tools/sched_ext/scx_qmap.bpf.c
··· 615 615 taskc->force_local, taskc->core_sched_seq); 616 616 } 617 617 618 + s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args) 619 + { 620 + bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", 621 + cgrp->kn->id, args->weight, args->bw_period_us, 622 + args->bw_quota_us, args->bw_burst_us); 623 + return 0; 624 + } 625 + 626 + void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight) 627 + { 628 + bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); 629 + } 630 + 631 + void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp, 632 + u64 period_us, u64 quota_us, u64 burst_us) 633 + { 634 + bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id, 635 + period_us, quota_us, burst_us); 636 + } 637 + 618 638 /* 619 639 * Print out the online and possible CPU map using bpf_printk() as a 620 640 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). ··· 860 840 .dump = (void *)qmap_dump, 861 841 .dump_cpu = (void *)qmap_dump_cpu, 862 842 .dump_task = (void *)qmap_dump_task, 843 + .cgroup_init = (void *)qmap_cgroup_init, 844 + .cgroup_set_weight = (void *)qmap_cgroup_set_weight, 845 + .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth, 863 846 .cpu_online = (void *)qmap_cpu_online, 864 847 .cpu_offline = (void *)qmap_cpu_offline, 865 848 .init = (void *)qmap_init,
+5
tools/testing/selftests/sched_ext/maximal.bpf.c
··· 123 123 void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) 124 124 {} 125 125 126 + void BPF_STRUCT_OPS(maximal_cgroup_set_bandwidth, struct cgroup *cgrp, 127 + u64 period_us, u64 quota_us, u64 burst_us) 128 + {} 129 + 126 130 s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) 127 131 { 128 132 return scx_bpf_create_dsq(DSQ_ID, -1); ··· 164 160 .cgroup_move = (void *) maximal_cgroup_move, 165 161 .cgroup_cancel_move = (void *) maximal_cgroup_cancel_move, 166 162 .cgroup_set_weight = (void *) maximal_cgroup_set_weight, 163 + .cgroup_set_bandwidth = (void *) maximal_cgroup_set_bandwidth, 167 164 .init = (void *) maximal_init, 168 165 .exit = (void *) maximal_exit, 169 166 .name = "maximal",