Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

scx_qmap: Implement highpri boosting

Implement a silly boosting mechanism for nice -20 tasks. Its only purpose is
demonstrating and testing scx_bpf_dispatch_from_dsq(). The boosting only
works within SHARED_DSQ and makes only a minor difference, which becomes
visible with an increased dispatch batch (-b).

This exercises moving tasks to a user DSQ and all local DSQs from
ops.dispatch() and BPF timerfn.

v2: - Updated to use scx_bpf_dispatch_from_dsq_set_{slice|vtime}().

- Drop the workaround for the iterated tasks not being trusted by the
verifier. The issue has been fixed on the BPF side.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Hodges <hodges.daniel.scott@gmail.com>
Cc: David Vernet <void@manifault.com>
Cc: Changwoo Min <multics69@gmail.com>
Cc: Andrea Righi <andrea.righi@linux.dev>
Cc: Dan Schatzberg <schatzberg.dan@gmail.com>

+130 -14
+120 -13
tools/sched_ext/scx_qmap.bpf.c
··· 27 27 enum consts { 28 28 ONE_SEC_IN_NS = 1000000000, 29 29 SHARED_DSQ = 0, 30 + HIGHPRI_DSQ = 1, 31 + HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */ 30 32 }; 31 33 32 34 char _license[] SEC("license") = "GPL"; ··· 38 36 const volatile u32 stall_kernel_nth; 39 37 const volatile u32 dsp_inf_loop_after; 40 38 const volatile u32 dsp_batch; 39 + const volatile bool highpri_boosting; 41 40 const volatile bool print_shared_dsq; 42 41 const volatile s32 disallow_tgid; 43 42 const volatile bool suppress_dump; 44 43 44 + u64 nr_highpri_queued; 45 45 u32 test_error_cnt; 46 46 47 47 UEI_DEFINE(uei); ··· 99 95 /* Per-task scheduling context */ 100 96 struct task_ctx { 101 97 bool force_local; /* Dispatch directly to local_dsq */ 98 + bool highpri; 102 99 u64 core_sched_seq; 103 100 }; 104 101 ··· 127 122 /* Statistics */ 128 123 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq; 129 124 u64 nr_core_sched_execed; 125 + u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer; 130 126 u32 cpuperf_min, cpuperf_avg, cpuperf_max; 131 127 u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; 132 128 ··· 146 140 return -1; 147 141 } 148 142 143 + static struct task_ctx *lookup_task_ctx(struct task_struct *p) 144 + { 145 + struct task_ctx *tctx; 146 + 147 + if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) { 148 + scx_bpf_error("task_ctx lookup failed"); 149 + return NULL; 150 + } 151 + return tctx; 152 + } 153 + 149 154 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, 150 155 s32 prev_cpu, u64 wake_flags) 151 156 { 152 157 struct task_ctx *tctx; 153 158 s32 cpu; 154 159 155 - tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); 156 - if (!tctx) { 157 - scx_bpf_error("task_ctx lookup failed"); 160 + if (!(tctx = lookup_task_ctx(p))) 158 161 return -ESRCH; 159 - } 160 162 161 163 cpu = pick_direct_dispatch_cpu(p, prev_cpu); 162 164 ··· 211 197 if (test_error_cnt && !--test_error_cnt) 
212 198 scx_bpf_error("test triggering error"); 213 199 214 - tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); 215 - if (!tctx) { 216 - scx_bpf_error("task_ctx lookup failed"); 200 + if (!(tctx = lookup_task_ctx(p))) 217 201 return; 218 - } 219 202 220 203 /* 221 204 * All enqueued tasks must have their core_sched_seq updated for correct ··· 266 255 return; 267 256 } 268 257 258 + if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) { 259 + tctx->highpri = true; 260 + __sync_fetch_and_add(&nr_highpri_queued, 1); 261 + } 269 262 __sync_fetch_and_add(&nr_enqueued, 1); 270 263 } 271 264 ··· 286 271 287 272 static void update_core_sched_head_seq(struct task_struct *p) 288 273 { 289 - struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); 290 274 int idx = weight_to_idx(p->scx.weight); 275 + struct task_ctx *tctx; 291 276 292 - if (tctx) 277 + if ((tctx = lookup_task_ctx(p))) 293 278 core_sched_head_seqs[idx] = tctx->core_sched_seq; 294 - else 295 - scx_bpf_error("task_ctx lookup failed"); 279 + } 280 + 281 + /* 282 + * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly 283 + * selective priority boosting mechanism by scanning SHARED_DSQ looking for 284 + * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This 285 + * makes minor difference only when dsp_batch is larger than 1. 286 + * 287 + * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and 288 + * non-rq-lock holding BPF programs. As demonstration, this function is called 289 + * from qmap_dispatch() and monitor_timerfn(). 
290 + */ 291 + static bool dispatch_highpri(bool from_timer) 292 + { 293 + struct task_struct *p; 294 + s32 this_cpu = bpf_get_smp_processor_id(); 295 + 296 + /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */ 297 + bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) { 298 + static u64 highpri_seq; 299 + struct task_ctx *tctx; 300 + 301 + if (!(tctx = lookup_task_ctx(p))) 302 + return false; 303 + 304 + if (tctx->highpri) { 305 + /* exercise the set_*() and vtime interface too */ 306 + scx_bpf_dispatch_from_dsq_set_slice( 307 + BPF_FOR_EACH_ITER, slice_ns * 2); 308 + scx_bpf_dispatch_from_dsq_set_vtime( 309 + BPF_FOR_EACH_ITER, highpri_seq++); 310 + scx_bpf_dispatch_vtime_from_dsq( 311 + BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); 312 + } 313 + } 314 + 315 + /* 316 + * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU 317 + * is found. 318 + */ 319 + bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) { 320 + bool dispatched = false; 321 + s32 cpu; 322 + 323 + if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr)) 324 + cpu = this_cpu; 325 + else 326 + cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); 327 + 328 + if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, 329 + SCX_DSQ_LOCAL_ON | cpu, 330 + SCX_ENQ_PREEMPT)) { 331 + if (cpu == this_cpu) { 332 + dispatched = true; 333 + __sync_fetch_and_add(&nr_expedited_local, 1); 334 + } else { 335 + __sync_fetch_and_add(&nr_expedited_remote, 1); 336 + } 337 + if (from_timer) 338 + __sync_fetch_and_add(&nr_expedited_from_timer, 1); 339 + } else { 340 + __sync_fetch_and_add(&nr_expedited_lost, 1); 341 + } 342 + 343 + if (dispatched) 344 + return true; 345 + } 346 + 347 + return false; 296 348 } 297 349 298 350 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) ··· 371 289 void *fifo; 372 290 s32 i, pid; 373 291 374 - if (scx_bpf_consume(SHARED_DSQ)) 292 + if (dispatch_highpri(false)) 293 + return; 294 + 295 + if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ)) 375 296 return; 376 297 377 298 if (dsp_inf_loop_after 
&& nr_dispatched > dsp_inf_loop_after) { ··· 411 326 412 327 /* Dispatch or advance. */ 413 328 bpf_repeat(BPF_MAX_LOOPS) { 329 + struct task_ctx *tctx; 330 + 414 331 if (bpf_map_pop_elem(fifo, &pid)) 415 332 break; 416 333 ··· 420 333 if (!p) 421 334 continue; 422 335 336 + if (!(tctx = lookup_task_ctx(p))) { 337 + bpf_task_release(p); 338 + return; 339 + } 340 + 341 + if (tctx->highpri) 342 + __sync_fetch_and_sub(&nr_highpri_queued, 1); 343 + 423 344 update_core_sched_head_seq(p); 424 345 __sync_fetch_and_add(&nr_dispatched, 1); 346 + 425 347 scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); 426 348 bpf_task_release(p); 349 + 427 350 batch--; 428 351 cpuc->dsp_cnt--; 429 352 if (!batch || !scx_bpf_dispatch_nr_slots()) { 353 + if (dispatch_highpri(false)) 354 + return; 430 355 scx_bpf_consume(SHARED_DSQ); 431 356 return; 432 357 } ··· 763 664 764 665 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) 765 666 { 667 + bpf_rcu_read_lock(); 668 + dispatch_highpri(true); 669 + bpf_rcu_read_unlock(); 670 + 766 671 monitor_cpuperf(); 767 672 768 673 if (print_shared_dsq) ··· 785 682 print_cpus(); 786 683 787 684 ret = scx_bpf_create_dsq(SHARED_DSQ, -1); 685 + if (ret) 686 + return ret; 687 + 688 + ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1); 788 689 if (ret) 789 690 return ret; 790 691
+10 -1
tools/sched_ext/scx_qmap.c
··· 29 29 " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" 30 30 " -b COUNT Dispatch upto COUNT tasks together\n" 31 31 " -P Print out DSQ content to trace_pipe every second, use with -b\n" 32 + " -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" 32 33 " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" 33 34 " -D LEN Set scx_exit_info.dump buffer length\n" 34 35 " -S Suppress qmap-specific debug dump\n" ··· 64 63 65 64 skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); 66 65 67 - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) { 66 + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { 68 67 switch (opt) { 69 68 case 's': 70 69 skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; ··· 86 85 break; 87 86 case 'P': 88 87 skel->rodata->print_shared_dsq = true; 88 + break; 89 + case 'H': 90 + skel->rodata->highpri_boosting = true; 89 91 break; 90 92 case 'd': 91 93 skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); ··· 125 121 skel->bss->nr_reenqueued, skel->bss->nr_dequeued, 126 122 skel->bss->nr_core_sched_execed, 127 123 skel->bss->nr_ddsp_from_enq); 124 + printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n", 125 + skel->bss->nr_expedited_local, 126 + skel->bss->nr_expedited_remote, 127 + skel->bss->nr_expedited_from_timer, 128 + skel->bss->nr_expedited_lost); 128 129 if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) 129 130 printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", 130 131 skel->bss->cpuperf_min,