Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched_ext: Add scx_simple and scx_example_qmap example schedulers

Add two simple example BPF schedulers - simple and qmap.

* simple: In terms of scheduling, it behaves identically to not having any
operation implemented at all. The two operations it implements are only to
improve visibility and exit handling. On certain homogeneous
configurations, this actually can perform pretty well.

* qmap: A fixed five level priority scheduler to demonstrate queueing PIDs
on BPF maps for scheduling. While not very practical, this is useful as a
simple example and will be used to demonstrate different features.

v7: - Compat helpers stripped out in preparation of upstreaming as the
upstreamed patchset will be the baseline. Utility macros that can be
used to implement compat features are kept.

- Explicitly disable map autoattach on struct_ops to avoid trying to
attach twice while maintaining compatibility with older libbpf.

v6: - Common header files reorganized and cleaned up. Compat helpers are
added to demonstrate how schedulers can maintain backward
compatibility with older kernels while making use of newly added
features.

- simple_select_cpu() added to keep track of the number of local
dispatches. This is needed because the default ops.select_cpu()
implementation is updated to dispatch directly and won't call
ops.enqueue().

- Updated to reflect the sched_ext API changes. Switching all tasks is
the default behavior now and scx_qmap supports partial switching when
`-p` is specified.

- tools/sched_ext/Kconfig dropped. This will be included in the doc
instead.

v5: - Improve Makefile. Build artifacts are now collected into a separate
dir which can be changed. Install and help targets are added and
clean actually cleans everything.

- MEMBER_VPTR() improved to ease access to structs. ARRAY_ELEM_PTR()
and RESIZABLE_ARRAY() are added to support resizable arrays in .bss.

- Add scx_common.h which provides common utilities to user code such as
SCX_BUG[_ON]() and RESIZE_ARRAY().

- Use SCX_BUG[_ON]() to simplify error handling.

v4: - Dropped _example prefix from scheduler names.

v3: - Rename scx_example_dummy to scx_example_simple and restructure a bit
to ease later additions. Comment updates.

- Added declarations for BPF inline iterators. In the future, hopefully,
these will be consolidated into a generic BPF header so that they
don't need to be replicated here.

v2: - Updated with the generic BPF cpumask helpers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>

+1499 -2
+7 -1
Makefile
··· 1355 1355 $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean 1356 1356 endif 1357 1357 1358 + tools-clean-targets := sched_ext 1359 + PHONY += $(tools-clean-targets) 1360 + $(tools-clean-targets): 1361 + $(Q)$(MAKE) -sC tools $@_clean 1362 + tools_clean: $(tools-clean-targets) 1363 + 1358 1364 # Clear a bunch of variables before executing the submake 1359 1365 ifeq ($(quiet),silent_) 1360 1366 tools_silent=s ··· 1533 1527 $(mrproper-dirs): 1534 1528 $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@) 1535 1529 1536 - mrproper: clean $(mrproper-dirs) 1530 + mrproper: clean $(mrproper-dirs) tools_clean 1537 1531 $(call cmd,rmfiles) 1538 1532 @find . $(RCS_FIND_IGNORE) \ 1539 1533 \( -name '*.rmeta' \) \
+9 -1
tools/Makefile
··· 28 28 @echo ' pci - PCI tools' 29 29 @echo ' perf - Linux performance measurement and analysis tool' 30 30 @echo ' selftests - various kernel selftests' 31 + @echo ' sched_ext - sched_ext example schedulers' 31 32 @echo ' bootconfig - boot config tool' 32 33 @echo ' spi - spi tools' 33 34 @echo ' tmon - thermal monitoring and tuning tool' ··· 91 90 perf: FORCE 92 91 $(Q)mkdir -p $(PERF_O) . 93 92 $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= 93 + 94 + sched_ext: FORCE 95 + $(call descend,sched_ext) 94 96 95 97 selftests: FORCE 96 98 $(call descend,testing/$@) ··· 188 184 $(Q)mkdir -p $(PERF_O) . 189 185 $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean 190 186 187 + sched_ext_clean: 188 + $(call descend,sched_ext,clean) 189 + 191 190 selftests_clean: 192 191 $(call descend,testing/$(@:_clean=),clean) 193 192 ··· 220 213 mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ 221 214 freefall_clean build_clean libbpf_clean libsubcmd_clean \ 222 215 gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ 223 - intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean 216 + intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ 217 + sched_ext_clean 224 218 225 219 .PHONY: FORCE
+2
tools/sched_ext/.gitignore
··· 1 + tools/ 2 + build/
+246
tools/sched_ext/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0 2 + # Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 3 + include ../build/Build.include 4 + include ../scripts/Makefile.arch 5 + include ../scripts/Makefile.include 6 + 7 + all: all_targets 8 + 9 + ifneq ($(LLVM),) 10 + ifneq ($(filter %/,$(LLVM)),) 11 + LLVM_PREFIX := $(LLVM) 12 + else ifneq ($(filter -%,$(LLVM)),) 13 + LLVM_SUFFIX := $(LLVM) 14 + endif 15 + 16 + CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi 17 + CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu 18 + CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl 19 + CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu 20 + CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu 21 + CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu 22 + CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu 23 + CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu 24 + CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu 25 + CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) 26 + 27 + ifeq ($(CROSS_COMPILE),) 28 + ifeq ($(CLANG_TARGET_FLAGS),) 29 + $(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) 30 + else 31 + CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) 32 + endif # CLANG_TARGET_FLAGS 33 + else 34 + CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) 35 + endif # CROSS_COMPILE 36 + 37 + CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as 38 + else 39 + CC := $(CROSS_COMPILE)gcc 40 + endif # LLVM 41 + 42 + CURDIR := $(abspath .) 43 + TOOLSDIR := $(abspath ..) 
44 + LIBDIR := $(TOOLSDIR)/lib 45 + BPFDIR := $(LIBDIR)/bpf 46 + TOOLSINCDIR := $(TOOLSDIR)/include 47 + BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool 48 + APIDIR := $(TOOLSINCDIR)/uapi 49 + GENDIR := $(abspath ../../include/generated) 50 + GENHDR := $(GENDIR)/autoconf.h 51 + 52 + ifeq ($(O),) 53 + OUTPUT_DIR := $(CURDIR)/build 54 + else 55 + OUTPUT_DIR := $(O)/build 56 + endif # O 57 + OBJ_DIR := $(OUTPUT_DIR)/obj 58 + INCLUDE_DIR := $(OUTPUT_DIR)/include 59 + BPFOBJ_DIR := $(OBJ_DIR)/libbpf 60 + SCXOBJ_DIR := $(OBJ_DIR)/sched_ext 61 + BINDIR := $(OUTPUT_DIR)/bin 62 + BPFOBJ := $(BPFOBJ_DIR)/libbpf.a 63 + ifneq ($(CROSS_COMPILE),) 64 + HOST_BUILD_DIR := $(OBJ_DIR)/host 65 + HOST_OUTPUT_DIR := host-tools 66 + HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include 67 + else 68 + HOST_BUILD_DIR := $(OBJ_DIR) 69 + HOST_OUTPUT_DIR := $(OUTPUT_DIR) 70 + HOST_INCLUDE_DIR := $(INCLUDE_DIR) 71 + endif 72 + HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a 73 + RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids 74 + DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool 75 + 76 + VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ 77 + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ 78 + ../../vmlinux \ 79 + /sys/kernel/btf/vmlinux \ 80 + /boot/vmlinux-$(shell uname -r) 81 + VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) 82 + ifeq ($(VMLINUX_BTF),) 83 + $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") 84 + endif 85 + 86 + BPFTOOL ?= $(DEFAULT_BPFTOOL) 87 + 88 + ifneq ($(wildcard $(GENHDR)),) 89 + GENFLAGS := -DHAVE_GENHDR 90 + endif 91 + 92 + CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ 93 + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ 94 + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include 95 + 96 + # Silence some warnings when compiled with clang 97 + ifneq ($(LLVM),) 98 + CFLAGS += -Wno-unused-command-line-argument 99 + endif 100 + 101 + LDFLAGS = -lelf -lz -lpthread 102 + 103 + IS_LITTLE_ENDIAN = $(shell 
$(CC) -dM -E - </dev/null | \ 104 + grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__') 105 + 106 + # Get Clang's default includes on this system, as opposed to those seen by 107 + # '-target bpf'. This fixes "missing" files on some architectures/distros, 108 + # such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. 109 + # 110 + # Use '-idirafter': Don't interfere with include mechanics except where the 111 + # build would have failed anyways. 112 + define get_sys_includes 113 + $(shell $(1) -v -E - </dev/null 2>&1 \ 114 + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ 115 + $(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}') 116 + endef 117 + 118 + BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ 119 + $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \ 120 + -I$(CURDIR)/include -I$(CURDIR)/include/bpf-compat \ 121 + -I$(INCLUDE_DIR) -I$(APIDIR) \ 122 + -I../../include \ 123 + $(call get_sys_includes,$(CLANG)) \ 124 + -Wall -Wno-compare-distinct-pointer-types \ 125 + -O2 -mcpu=v3 126 + 127 + # sort removes libbpf duplicates when not cross-building 128 + MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ 129 + $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids \ 130 + $(INCLUDE_DIR) $(SCXOBJ_DIR) $(BINDIR)) 131 + 132 + $(MAKE_DIRS): 133 + $(call msg,MKDIR,,$@) 134 + $(Q)mkdir -p $@ 135 + 136 + $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ 137 + $(APIDIR)/linux/bpf.h \ 138 + | $(OBJ_DIR)/libbpf 139 + $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(OBJ_DIR)/libbpf/ \ 140 + EXTRA_CFLAGS='-g -O0 -fPIC' \ 141 + DESTDIR=$(OUTPUT_DIR) prefix= all install_headers 142 + 143 + $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ 144 + $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool 145 + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ 146 + ARCH= CROSS_COMPILE= CC=$(HOSTCC) 
LD=$(HOSTLD) \ 147 + EXTRA_CFLAGS='-g -O0' \ 148 + OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ 149 + LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/ \ 150 + LIBBPF_DESTDIR=$(HOST_OUTPUT_DIR)/ \ 151 + prefix= DESTDIR=$(HOST_OUTPUT_DIR)/ install-bin 152 + 153 + $(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) 154 + ifeq ($(VMLINUX_H),) 155 + $(call msg,GEN,,$@) 156 + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ 157 + else 158 + $(call msg,CP,,$@) 159 + $(Q)cp "$(VMLINUX_H)" $@ 160 + endif 161 + 162 + $(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ 163 + | $(BPFOBJ) $(SCXOBJ_DIR) 164 + $(call msg,CLNG-BPF,,$(notdir $@)) 165 + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ 166 + 167 + $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) 168 + $(eval sched=$(notdir $@)) 169 + $(call msg,GEN-SKEL,,$(sched)) 170 + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< 171 + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) 172 + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) 173 + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) 174 + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ 175 + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) 176 + 177 + SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) 178 + 179 + c-sched-targets = scx_simple scx_qmap 180 + 181 + $(addprefix $(BINDIR)/,$(c-sched-targets)): \ 182 + $(BINDIR)/%: \ 183 + $(filter-out %.bpf.c,%.c) \ 184 + $(INCLUDE_DIR)/%.bpf.skel.h \ 185 + $(SCX_COMMON_DEPS) 186 + $(eval sched=$(notdir $@)) 187 + $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o 188 + $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) 189 + 190 + $(c-sched-targets): %: $(BINDIR)/% 191 + 192 + install: all 193 + $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ 194 + $(Q)cp $(BINDIR)/* 
$(DESTDIR)/usr/local/bin/ 195 + 196 + clean: 197 + rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) 198 + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h 199 + rm -f $(c-sched-targets) 200 + 201 + help: 202 + @echo 'Building targets' 203 + @echo '================' 204 + @echo '' 205 + @echo ' all - Compile all schedulers' 206 + @echo '' 207 + @echo 'Alternatively, you may compile individual schedulers:' 208 + @echo '' 209 + @printf ' %s\n' $(c-sched-targets) 210 + @echo '' 211 + @echo 'For any scheduler build target, you may specify an alternative' 212 + @echo 'build output path with the O= environment variable. For example:' 213 + @echo '' 214 + @echo ' O=/tmp/sched_ext make all' 215 + @echo '' 216 + @echo 'will compile all schedulers, and emit the build artifacts to' 217 + @echo '/tmp/sched_ext/build.' 218 + @echo '' 219 + @echo '' 220 + @echo 'Installing targets' 221 + @echo '==================' 222 + @echo '' 223 + @echo ' install - Compile and install all schedulers to /usr/bin.' 224 + @echo ' You may specify the DESTDIR= environment variable' 225 + @echo ' to indicate a prefix for /usr/bin. For example:' 226 + @echo '' 227 + @echo ' DESTDIR=/tmp/sched_ext make install' 228 + @echo '' 229 + @echo ' will build the schedulers in CWD/build, and' 230 + @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' 231 + @echo '' 232 + @echo '' 233 + @echo 'Cleaning targets' 234 + @echo '================' 235 + @echo '' 236 + @echo ' clean - Remove all generated files' 237 + 238 + all_targets: $(c-sched-targets) 239 + 240 + .PHONY: all all_targets $(c-sched-targets) clean help 241 + 242 + # delete failed targets 243 + .DELETE_ON_ERROR: 244 + 245 + # keep intermediate (.bpf.skel.h, .bpf.o, etc) targets 246 + .SECONDARY:
+11
tools/sched_ext/include/bpf-compat/gnu/stubs.h
··· 1 + /* 2 + * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when 3 + * compiling BPF files although its content doesn't play any role. The file in 4 + * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is 5 + * defined. When compiling a BPF source, __x86_64__ isn't set and thus 6 + * stubs-32.h is selected. However, the file is not there if the system doesn't 7 + * have 32bit glibc devel package installed leading to a build failure. 8 + * 9 + * The problem is worked around by making this file available in the include 10 + * search paths before the system one when building BPF. 11 + */
+379
tools/sched_ext/include/scx/common.bpf.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 4 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 5 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 6 + */ 7 + #ifndef __SCX_COMMON_BPF_H 8 + #define __SCX_COMMON_BPF_H 9 + 10 + #include "vmlinux.h" 11 + #include <bpf/bpf_helpers.h> 12 + #include <bpf/bpf_tracing.h> 13 + #include <asm-generic/errno.h> 14 + #include "user_exit_info.h" 15 + 16 + #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ 17 + #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 18 + #define PF_EXITING 0x00000004 19 + #define CLOCK_MONOTONIC 1 20 + 21 + /* 22 + * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can 23 + * lead to really confusing misbehaviors. Let's trigger a build failure. 24 + */ 25 + static inline void ___vmlinux_h_sanity_check___(void) 26 + { 27 + _Static_assert(SCX_DSQ_FLAG_BUILTIN, 28 + "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); 29 + } 30 + 31 + s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; 32 + s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; 33 + void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; 34 + u32 scx_bpf_dispatch_nr_slots(void) __ksym; 35 + void scx_bpf_dispatch_cancel(void) __ksym; 36 + bool scx_bpf_consume(u64 dsq_id) __ksym; 37 + s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; 38 + void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; 39 + void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; 40 + void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; 41 + u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; 42 + const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; 43 + const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; 44 + void scx_bpf_put_cpumask(const 
struct cpumask *cpumask) __ksym __weak; 45 + const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; 46 + const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; 47 + void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; 48 + bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; 49 + s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; 50 + s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; 51 + bool scx_bpf_task_running(const struct task_struct *p) __ksym; 52 + s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; 53 + 54 + static inline __attribute__((format(printf, 1, 2))) 55 + void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} 56 + 57 + /* 58 + * Helper macro for initializing the fmt and variadic argument inputs to both 59 + * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to 60 + * refer to the initialized list of inputs to the bstr kfunc. 61 + */ 62 + #define scx_bpf_bstr_preamble(fmt, args...) \ 63 + static char ___fmt[] = fmt; \ 64 + /* \ 65 + * Note that __param[] must have at least one \ 66 + * element to keep the verifier happy. \ 67 + */ \ 68 + unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ 69 + \ 70 + _Pragma("GCC diagnostic push") \ 71 + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ 72 + ___bpf_fill(___param, args); \ 73 + _Pragma("GCC diagnostic pop") \ 74 + 75 + /* 76 + * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments 77 + * instead of an array of u64. Using this macro will cause the scheduler to 78 + * exit cleanly with the specified exit code being passed to user space. 79 + */ 80 + #define scx_bpf_exit(code, fmt, args...) 
\ 81 + ({ \ 82 + scx_bpf_bstr_preamble(fmt, args) \ 83 + scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \ 84 + ___scx_bpf_bstr_format_checker(fmt, ##args); \ 85 + }) 86 + 87 + /* 88 + * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments 89 + * instead of an array of u64. Invoking this macro will cause the scheduler to 90 + * exit in an erroneous state, with diagnostic information being passed to the 91 + * user. 92 + */ 93 + #define scx_bpf_error(fmt, args...) \ 94 + ({ \ 95 + scx_bpf_bstr_preamble(fmt, args) \ 96 + scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ 97 + ___scx_bpf_bstr_format_checker(fmt, ##args); \ 98 + }) 99 + 100 + #define BPF_STRUCT_OPS(name, args...) \ 101 + SEC("struct_ops/"#name) \ 102 + BPF_PROG(name, ##args) 103 + 104 + #define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ 105 + SEC("struct_ops.s/"#name) \ 106 + BPF_PROG(name, ##args) 107 + 108 + /** 109 + * RESIZABLE_ARRAY - Generates annotations for an array that may be resized 110 + * @elfsec: the data section of the BPF program in which to place the array 111 + * @arr: the name of the array 112 + * 113 + * libbpf has an API for setting map value sizes. Since data sections (i.e. 114 + * bss, data, rodata) themselves are maps, a data section can be resized. If 115 + * a data section has an array as its last element, the BTF info for that 116 + * array will be adjusted so that length of the array is extended to meet the 117 + * new length of the data section. This macro annotates an array to have an 118 + * element count of one with the assumption that this array can be resized 119 + * within the userspace program. It also annotates the section specifier so 120 + * this array exists in a custom sub data section which can be resized 121 + * independently. 122 + * 123 + * See RESIZE_ARRAY() for the userspace convenience macro for resizing an 124 + * array declared with RESIZABLE_ARRAY(). 
 */
#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr)

/**
 * MEMBER_VPTR - Obtain the verified pointer to a struct or array member
 * @base: struct or array to index
 * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...)
 *
 * The verifier often gets confused by the instruction sequence the compiler
 * generates for indexing struct fields or arrays. This macro forces the
 * compiler to generate a code sequence which first calculates the byte offset,
 * checks it against the struct or array size and add that byte offset to
 * generate the pointer to the member to help the verifier.
 *
 * Ideally, we want to abort if the calculated offset is out-of-bounds. However,
 * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller
 * must check for %NULL and take appropriate action to appease the verifier. To
 * avoid confusing the verifier, it's best to check for %NULL and dereference
 * immediately.
 *
 *	vptr = MEMBER_VPTR(my_array, [i][j]);
 *	if (!vptr)
 *		return error;
 *	*vptr = new_value;
 *
 * sizeof(@base) should encompass the memory area to be accessed and thus can't
 * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
 * `MEMBER_VPTR(ptr, ->member)`.
 */
#define MEMBER_VPTR(base, member) (typeof((base) member) *)		\
({									\
	u64 __base = (u64)&(base);					\
	u64 __addr = (u64)&((base) member) - __base;			\
	_Static_assert(sizeof(base) >= sizeof((base) member),		\
		       "@base is smaller than @member, is @base a pointer?"); \
	asm volatile (							\
		"if %0 <= %[max] goto +2\n"				\
		"%0 = 0\n"						\
		"goto +1\n"						\
		"%0 += %1\n"						\
		: "+r"(__addr)						\
		: "r"(__base),						\
		  [max]"i"(sizeof(base) - sizeof((base) member)));	\
	__addr;								\
})

/**
 * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element
 * @arr: array to index into
 * @i: array index
 * @n: number of elements in array
 *
 * Similar to MEMBER_VPTR() but is intended for use with arrays where the
 * element count needs to be explicit.
 * It can be used in cases where a global array is defined with an initial
 * size but is intended to be resized before loading the BPF program.
 * Without this version of the macro, MEMBER_VPTR() will use the compile time
 * size of the array to compute the max, which will result in rejection by
 * the verifier.
 */
#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)			\
({									\
	u64 __base = (u64)arr;						\
	u64 __addr = (u64)&(arr[i]) - __base;				\
	asm volatile (							\
		"if %0 <= %[max] goto +2\n"				\
		"%0 = 0\n"						\
		"goto +1\n"						\
		"%0 += %1\n"						\
		: "+r"(__addr)						\
		: "r"(__base),						\
		  [max]"r"(sizeof(arr[0]) * ((n) - 1)));		\
	__addr;								\
})


/*
 * BPF declarations and helpers
 */

/* list and rbtree */
#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node)))
#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))

void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;

#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))
#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL)

void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym;
struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym;
struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
				      struct bpf_rb_node *node) __ksym;
int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
			bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
			void *meta, __u64 off) __ksym;
#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0)

struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;

void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL)

/* task */
struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
void bpf_task_release(struct task_struct *p) __ksym;

/* cgroup */
struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym;
void bpf_cgroup_release(struct cgroup *cgrp) __ksym;
struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;

/* css iteration */
struct bpf_iter_css;
struct cgroup_subsys_state;
extern int bpf_iter_css_new(struct bpf_iter_css *it,
			    struct cgroup_subsys_state *start,
			    unsigned int flags) __weak __ksym;
extern struct cgroup_subsys_state *
bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;

/* cpumask */
struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym;
void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym;
u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym;
void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;
bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym;
void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym;
bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1,
		     const struct cpumask *src2) __ksym;
void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1,
		    const struct cpumask *src2) __ksym;
void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1,
		     const struct cpumask *src2) __ksym;
bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym;
bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym;
bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym;
bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym;
bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym;
void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym;
u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym;
u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
				   const struct cpumask *src2) __ksym;

/* rcu */
void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;


/*
 * Other helpers
 */

/* useful compiler attributes */
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#define __maybe_unused __attribute__((__unused__))

/*
 * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They
 * prevent compiler from caching, redoing or reordering reads or writes.
 */
typedef __u8 __attribute__((__may_alias__)) __u8_alias_t;
typedef __u16 __attribute__((__may_alias__)) __u16_alias_t;
typedef __u32 __attribute__((__may_alias__)) __u32_alias_t;
typedef __u64 __attribute__((__may_alias__)) __u64_alias_t;

/* Copy @size bytes from @p to @res one whole word at a time where possible. */
static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
{
	switch (size) {
	case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break;
	case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break;
	case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break;
	case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break;
	default:
		barrier();
		__builtin_memcpy((void *)res, (const void *)p, size);
		barrier();
	}
}

/* Copy @size bytes from @res to @p one whole word at a time where possible. */
static __always_inline void __write_once_size(volatile void *p, void *res, int size)
{
	switch (size) {
	case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break;
	case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break;
	case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break;
	case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break;
	default:
		barrier();
		__builtin_memcpy((void *)p, (const void *)res, size);
		barrier();
	}
}

#define READ_ONCE(x)					\
({							\
	union { typeof(x) __val; char __c[1]; } __u =	\
		{ .__c = { 0 } };			\
	__read_once_size(&(x), __u.__c, sizeof(x));	\
	__u.__val;					\
})

#define WRITE_ONCE(x, val)				\
({							\
	union { typeof(x) __val; char __c[1]; } __u =	\
		{ .__val = (val) };			\
	__write_once_size(&(x), __u.__c, sizeof(x));	\
	__u.__val;					\
})

/*
 * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
 * @v: The value for which we're computing the base 2 logarithm.
 *
 * Branchless floor(log2(@v)) for @v > 0; evaluates to 0 for @v == 0.
 */
static inline u32 log2_u32(u32 v)
{
	u32 r;
	u32 shift;

	r = (v > 0xFFFF) << 4; v >>= r;
	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
	r |= (v >> 1);
	return r;
}

/*
 * log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value.
 * @v: The value for which we're computing the base 2 logarithm.
 *
 * NOTE(review): both branches add 1, so this returns one more than
 * log2_u32() would for the same value (i.e. floor(log2(v)) + 1) — confirm
 * callers expect this offset relative to log2_u32().
 */
static inline u32 log2_u64(u64 v)
{
	u32 hi = v >> 32;
	if (hi)
		return log2_u32(hi) + 32 + 1;
	else
		return log2_u32(v) + 1;
}

#include "compat.bpf.h"

#endif	/* __SCX_COMMON_BPF_H */
+75
tools/sched_ext/include/scx/common.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 4 + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> 5 + * Copyright (c) 2023 David Vernet <dvernet@meta.com> 6 + */ 7 + #ifndef __SCHED_EXT_COMMON_H 8 + #define __SCHED_EXT_COMMON_H 9 + 10 + #ifdef __KERNEL__ 11 + #error "Should not be included by BPF programs" 12 + #endif 13 + 14 + #include <stdarg.h> 15 + #include <stdio.h> 16 + #include <stdlib.h> 17 + #include <stdint.h> 18 + #include <errno.h> 19 + 20 + typedef uint8_t u8; 21 + typedef uint16_t u16; 22 + typedef uint32_t u32; 23 + typedef uint64_t u64; 24 + typedef int8_t s8; 25 + typedef int16_t s16; 26 + typedef int32_t s32; 27 + typedef int64_t s64; 28 + 29 + #define SCX_BUG(__fmt, ...) \ 30 + do { \ 31 + fprintf(stderr, "[SCX_BUG] %s:%d", __FILE__, __LINE__); \ 32 + if (errno) \ 33 + fprintf(stderr, " (%s)\n", strerror(errno)); \ 34 + else \ 35 + fprintf(stderr, "\n"); \ 36 + fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ 37 + fprintf(stderr, "\n"); \ 38 + \ 39 + exit(EXIT_FAILURE); \ 40 + } while (0) 41 + 42 + #define SCX_BUG_ON(__cond, __fmt, ...) \ 43 + do { \ 44 + if (__cond) \ 45 + SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ 46 + } while (0) 47 + 48 + /** 49 + * RESIZE_ARRAY - Convenience macro for resizing a BPF array 50 + * @__skel: the skeleton containing the array 51 + * @elfsec: the data section of the BPF program in which the array exists 52 + * @arr: the name of the array 53 + * @n: the desired array element count 54 + * 55 + * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two 56 + * operations. It resizes the map which corresponds to the custom data 57 + * section that contains the target array. As a side effect, the BTF info for 58 + * the array is adjusted so that the array length is sized to cover the new 59 + * data section size. 
The second operation is reassigning the skeleton pointer 60 + * for that custom data section so that it points to the newly memory mapped 61 + * region. 62 + */ 63 + #define RESIZE_ARRAY(__skel, elfsec, arr, n) \ 64 + do { \ 65 + size_t __sz; \ 66 + bpf_map__set_value_size((__skel)->maps.elfsec##_##arr, \ 67 + sizeof((__skel)->elfsec##_##arr->arr[0]) * (n)); \ 68 + (__skel)->elfsec##_##arr = \ 69 + bpf_map__initial_value((__skel)->maps.elfsec##_##arr, &__sz); \ 70 + } while (0) 71 + 72 + #include "user_exit_info.h" 73 + #include "compat.h" 74 + 75 + #endif /* __SCHED_EXT_COMMON_H */
+28
tools/sched_ext/include/scx/compat.bpf.h
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2024 David Vernet <dvernet@meta.com>
 */
#ifndef __SCX_COMPAT_BPF_H
#define __SCX_COMPAT_BPF_H

/*
 * __COMPAT_ENUM_OR_ZERO - Evaluate to the value of enum entry @__ent of
 * @__type if it exists in the kernel's BTF, or 0 otherwise. Lets BPF
 * schedulers reference newer enum entries while remaining loadable on older
 * kernels that lack them.
 */
#define __COMPAT_ENUM_OR_ZERO(__type, __ent)					\
({										\
	__type __ret = 0;							\
	if (bpf_core_enum_value_exists(__type, __ent))				\
		__ret = __ent;							\
	__ret;									\
})

/*
 * Define sched_ext_ops. This may be expanded to define multiple variants for
 * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
 */
#define SCX_OPS_DEFINE(__name, ...)						\
	SEC(".struct_ops.link")							\
	struct sched_ext_ops __name = {						\
		__VA_ARGS__,							\
	};

#endif	/* __SCX_COMPAT_BPF_H */
+153
tools/sched_ext/include/scx/compat.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 4 + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> 5 + * Copyright (c) 2024 David Vernet <dvernet@meta.com> 6 + */ 7 + #ifndef __SCX_COMPAT_H 8 + #define __SCX_COMPAT_H 9 + 10 + #include <bpf/btf.h> 11 + 12 + struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); 13 + 14 + static inline void __COMPAT_load_vmlinux_btf(void) 15 + { 16 + if (!__COMPAT_vmlinux_btf) { 17 + __COMPAT_vmlinux_btf = btf__load_vmlinux_btf(); 18 + SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()"); 19 + } 20 + } 21 + 22 + static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v) 23 + { 24 + const struct btf_type *t; 25 + const char *n; 26 + s32 tid; 27 + int i; 28 + 29 + __COMPAT_load_vmlinux_btf(); 30 + 31 + tid = btf__find_by_name(__COMPAT_vmlinux_btf, type); 32 + if (tid < 0) 33 + return false; 34 + 35 + t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); 36 + SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); 37 + 38 + if (btf_is_enum(t)) { 39 + struct btf_enum *e = btf_enum(t); 40 + 41 + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { 42 + n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); 43 + SCX_BUG_ON(!n, "btf__name_by_offset()"); 44 + if (!strcmp(n, name)) { 45 + *v = e[i].val; 46 + return true; 47 + } 48 + } 49 + } else if (btf_is_enum64(t)) { 50 + struct btf_enum64 *e = btf_enum64(t); 51 + 52 + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { 53 + n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); 54 + SCX_BUG_ON(!n, "btf__name_by_offset()"); 55 + if (!strcmp(n, name)) { 56 + *v = btf_enum64_value(&e[i]); 57 + return true; 58 + } 59 + } 60 + } 61 + 62 + return false; 63 + } 64 + 65 + #define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ 66 + ({ \ 67 + u64 __val = 0; \ 68 + __COMPAT_read_enum(__type, __ent, &__val); \ 69 + __val; \ 70 + }) 71 + 72 + static inline bool __COMPAT_has_ksym(const char *ksym) 73 + { 74 + 
__COMPAT_load_vmlinux_btf(); 75 + return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0; 76 + } 77 + 78 + static inline bool __COMPAT_struct_has_field(const char *type, const char *field) 79 + { 80 + const struct btf_type *t; 81 + const struct btf_member *m; 82 + const char *n; 83 + s32 tid; 84 + int i; 85 + 86 + __COMPAT_load_vmlinux_btf(); 87 + tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT); 88 + if (tid < 0) 89 + return false; 90 + 91 + t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); 92 + SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); 93 + 94 + m = btf_members(t); 95 + 96 + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { 97 + n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off); 98 + SCX_BUG_ON(!n, "btf__name_by_offset()"); 99 + if (!strcmp(n, field)) 100 + return true; 101 + } 102 + 103 + return false; 104 + } 105 + 106 + #define SCX_OPS_SWITCH_PARTIAL \ 107 + __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") 108 + 109 + /* 110 + * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE() 111 + * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load 112 + * and attach it, backward compatibility is automatically maintained where 113 + * reasonable. 114 + */ 115 + #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ 116 + struct __scx_name *__skel; \ 117 + \ 118 + __skel = __scx_name##__open(); \ 119 + SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ 120 + __skel; \ 121 + }) 122 + 123 + #define SCX_OPS_LOAD(__skel, __ops_name, __scx_name) ({ \ 124 + SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ 125 + }) 126 + 127 + /* 128 + * New versions of bpftool now emit additional link placeholders for BPF maps, 129 + * and set up BPF skeleton in such a way that libbpf will auto-attach BPF maps 130 + * automatically, assumming libbpf is recent enough (v1.5+). Old libbpf will do 131 + * nothing with those links and won't attempt to auto-attach maps. 
132 + * 133 + * To maintain compatibility with older libbpf while avoiding trying to attach 134 + * twice, disable the autoattach feature on newer libbpf. 135 + */ 136 + #if LIBBPF_MAJOR_VERSION > 1 || \ 137 + (LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5) 138 + #define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \ 139 + bpf_map__set_autoattach((__skel)->maps.__ops_name, false) 140 + #else 141 + #define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0) 142 + #endif 143 + 144 + #define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \ 145 + struct bpf_link *__link; \ 146 + __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name); \ 147 + SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel"); \ 148 + __link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \ 149 + SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \ 150 + __link; \ 151 + }) 152 + 153 + #endif /* __SCX_COMPAT_H */
+64
tools/sched_ext/include/scx/user_exit_info.h
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Define struct user_exit_info which is shared between BPF and userspace parts
 * to communicate exit status and other information.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#ifndef __USER_EXIT_INFO_H
#define __USER_EXIT_INFO_H

enum uei_sizes {
	UEI_REASON_LEN		= 128,
	UEI_MSG_LEN		= 1024,
};

struct user_exit_info {
	int		kind;		/* exit kind; written last, see UEI_RECORD() */
	s64		exit_code;
	char		reason[UEI_REASON_LEN];
	char		msg[UEI_MSG_LEN];
};

#ifdef __bpf__

#include "vmlinux.h"
#include <bpf/bpf_core_read.h>

/* Place the exit info in .data so userspace can read it through the skeleton. */
#define UEI_DEFINE(__name)							\
	struct user_exit_info __name SEC(".data")

/*
 * Copy the exit info from @__ei into @__uei_name. ->kind is written last with
 * a full barrier so that UEI_EXITED() observing a non-zero kind implies the
 * other fields are already populated.
 */
#define UEI_RECORD(__uei_name, __ei) ({						\
	bpf_probe_read_kernel_str(__uei_name.reason,				\
				  sizeof(__uei_name.reason), (__ei)->reason);	\
	bpf_probe_read_kernel_str(__uei_name.msg,				\
				  sizeof(__uei_name.msg), (__ei)->msg);		\
	if (bpf_core_field_exists((__ei)->exit_code))				\
		__uei_name.exit_code = (__ei)->exit_code;			\
	/* use __sync to force memory barrier */				\
	__sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind,		\
				    (__ei)->kind);				\
})

#else	/* !__bpf__ */

#include <stdio.h>
#include <stdbool.h>

/* Non-zero ->kind means the BPF side recorded an exit. */
#define UEI_EXITED(__skel, __uei_name) ({					\
	/* use __sync to force memory barrier */				\
	__sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1);	\
})

#define UEI_REPORT(__skel, __uei_name) ({					\
	struct user_exit_info *__uei = &(__skel)->data->__uei_name;		\
	fprintf(stderr, "EXIT: %s", __uei->reason);				\
	if (__uei->msg[0] != '\0')						\
		fprintf(stderr, " (%s)", __uei->msg);				\
	fputs("\n", stderr);							\
})

#endif	/* __bpf__ */
#endif	/* __USER_EXIT_INFO_H */
+264
tools/sched_ext/scx_qmap.bpf.c
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.prep_enable().
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS		= 1000000000,
	SHARED_DSQ		= 0,
};

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns = SCX_SLICE_DFL;	/* slice duration, settable via -s */
const volatile u32 dsp_batch;			/* max tasks dispatched together, -b */

u32 test_error_cnt;	/* when set via -e, trigger scx_bpf_error() after this many enqueues */

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

/* Per-CPU dispatch state: which FIFO is being drained and how much remains. */
struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_dequeued;

/*
 * Pick a CPU for @p. Prefer @prev_cpu when it's idle (or when @p can't
 * migrate) and remember in task_ctx that qmap_enqueue() should dispatch
 * directly to the local DSQ in that case.
 */
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return -ESRCH;
	}

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		tctx->force_local = true;
		return prev_cpu;
	}

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return prev_cpu;
}

/* Coarsely map the compound weight to one of the five FIFOs. */
static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/* Did select_cpu() tell us to enqueue locally? */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (scx_bpf_consume(SHARED_DSQ))
		return;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			__sync_fetch_and_add(&nr_dispatched, 1);
			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);
			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				scx_bpf_consume(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .init_task		= (void *)qmap_init_task,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .name			= "qmap");
+99
tools/sched_ext/scx_qmap.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 4 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 5 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 6 + */ 7 + #include <stdio.h> 8 + #include <stdlib.h> 9 + #include <unistd.h> 10 + #include <inttypes.h> 11 + #include <signal.h> 12 + #include <libgen.h> 13 + #include <bpf/bpf.h> 14 + #include <scx/common.h> 15 + #include "scx_qmap.bpf.skel.h" 16 + 17 + const char help_fmt[] = 18 + "A simple five-level FIFO queue sched_ext scheduler.\n" 19 + "\n" 20 + "See the top-level comment in .bpf.c for more details.\n" 21 + "\n" 22 + "Usage: %s [-s SLICE_US] [-e COUNT] [-b COUNT] [-p] [-v]\n" 23 + "\n" 24 + " -s SLICE_US Override slice duration\n" 25 + " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" 26 + " -b COUNT Dispatch upto COUNT tasks together\n" 27 + " -p Switch only tasks on SCHED_EXT policy intead of all\n" 28 + " -v Print libbpf debug messages\n" 29 + " -h Display this help and exit\n"; 30 + 31 + static bool verbose; 32 + static volatile int exit_req; 33 + 34 + static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) 35 + { 36 + if (level == LIBBPF_DEBUG && !verbose) 37 + return 0; 38 + return vfprintf(stderr, format, args); 39 + } 40 + 41 + static void sigint_handler(int dummy) 42 + { 43 + exit_req = 1; 44 + } 45 + 46 + int main(int argc, char **argv) 47 + { 48 + struct scx_qmap *skel; 49 + struct bpf_link *link; 50 + int opt; 51 + 52 + libbpf_set_print(libbpf_print_fn); 53 + signal(SIGINT, sigint_handler); 54 + signal(SIGTERM, sigint_handler); 55 + 56 + skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); 57 + 58 + while ((opt = getopt(argc, argv, "s:e:b:pvh")) != -1) { 59 + switch (opt) { 60 + case 's': 61 + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; 62 + break; 63 + case 'e': 64 + skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); 65 + break; 66 + case 'b': 67 + skel->rodata->dsp_batch = 
strtoul(optarg, NULL, 0); 68 + break; 69 + case 'p': 70 + skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL; 71 + break; 72 + case 'v': 73 + verbose = true; 74 + break; 75 + default: 76 + fprintf(stderr, help_fmt, basename(argv[0])); 77 + return opt != 'h'; 78 + } 79 + } 80 + 81 + SCX_OPS_LOAD(skel, qmap_ops, scx_qmap); 82 + link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); 83 + 84 + while (!exit_req && !UEI_EXITED(skel, uei)) { 85 + long nr_enqueued = skel->bss->nr_enqueued; 86 + long nr_dispatched = skel->bss->nr_dispatched; 87 + 88 + printf("stats : enq=%lu dsp=%lu delta=%ld deq=%"PRIu64"\n", 89 + nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, 90 + skel->bss->nr_dequeued); 91 + fflush(stdout); 92 + sleep(1); 93 + } 94 + 95 + bpf_link__destroy(link); 96 + UEI_REPORT(skel, uei); 97 + scx_qmap__destroy(skel); 98 + return 0; 99 + }
+63
tools/sched_ext/scx_simple.bpf.c
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple scheduler.
 *
 * A simple global FIFO scheduler. It also demonstrates the following niceties.
 *
 * - Statistics tracking how many tasks are queued to local and global dsq's.
 * - Termination notification for userspace.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

UEI_DEFINE(uei);

/* Per-CPU counters; userspace sums across CPUs (see scx_simple.c). */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(u64));
	__uint(max_entries, 2);			/* [local, global] */
} stats SEC(".maps");

/* Bump stats[@idx] on this CPU; silently skipped if the lookup fails. */
static void stat_inc(u32 idx)
{
	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
	if (cnt_p)
		(*cnt_p)++;
}

/*
 * Use the default CPU selection. When it finds an idle CPU, dispatch directly
 * to the local DSQ here — the default ops.select_cpu() would do the same, this
 * implementation exists only so the local dispatches can be counted.
 */
s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle) {
		stat_inc(0);	/* count local queueing */
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	}

	return cpu;
}

/* Everything not dispatched locally goes onto the global FIFO. */
void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
{
	stat_inc(1);	/* count global queueing */
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

/* Record exit info so userspace can report why the scheduler was unloaded. */
void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(simple_ops,
	       .select_cpu		= (void *)simple_select_cpu,
	       .enqueue			= (void *)simple_enqueue,
	       .exit			= (void *)simple_exit,
	       .name			= "simple");
+99
tools/sched_ext/scx_simple.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 4 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 5 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 6 + */ 7 + #include <stdio.h> 8 + #include <unistd.h> 9 + #include <signal.h> 10 + #include <libgen.h> 11 + #include <bpf/bpf.h> 12 + #include <scx/common.h> 13 + #include "scx_simple.bpf.skel.h" 14 + 15 + const char help_fmt[] = 16 + "A simple sched_ext scheduler.\n" 17 + "\n" 18 + "See the top-level comment in .bpf.c for more details.\n" 19 + "\n" 20 + "Usage: %s [-v]\n" 21 + "\n" 22 + " -v Print libbpf debug messages\n" 23 + " -h Display this help and exit\n"; 24 + 25 + static bool verbose; 26 + static volatile int exit_req; 27 + 28 + static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) 29 + { 30 + if (level == LIBBPF_DEBUG && !verbose) 31 + return 0; 32 + return vfprintf(stderr, format, args); 33 + } 34 + 35 + static void sigint_handler(int simple) 36 + { 37 + exit_req = 1; 38 + } 39 + 40 + static void read_stats(struct scx_simple *skel, __u64 *stats) 41 + { 42 + int nr_cpus = libbpf_num_possible_cpus(); 43 + __u64 cnts[2][nr_cpus]; 44 + __u32 idx; 45 + 46 + memset(stats, 0, sizeof(stats[0]) * 2); 47 + 48 + for (idx = 0; idx < 2; idx++) { 49 + int ret, cpu; 50 + 51 + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), 52 + &idx, cnts[idx]); 53 + if (ret < 0) 54 + continue; 55 + for (cpu = 0; cpu < nr_cpus; cpu++) 56 + stats[idx] += cnts[idx][cpu]; 57 + } 58 + } 59 + 60 + int main(int argc, char **argv) 61 + { 62 + struct scx_simple *skel; 63 + struct bpf_link *link; 64 + __u32 opt; 65 + 66 + libbpf_set_print(libbpf_print_fn); 67 + signal(SIGINT, sigint_handler); 68 + signal(SIGTERM, sigint_handler); 69 + 70 + skel = SCX_OPS_OPEN(simple_ops, scx_simple); 71 + 72 + while ((opt = getopt(argc, argv, "vh")) != -1) { 73 + switch (opt) { 74 + case 'v': 75 + verbose = true; 76 + break; 77 + default: 78 + 
fprintf(stderr, help_fmt, basename(argv[0])); 79 + return opt != 'h'; 80 + } 81 + } 82 + 83 + SCX_OPS_LOAD(skel, simple_ops, scx_simple); 84 + link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); 85 + 86 + while (!exit_req && !UEI_EXITED(skel, uei)) { 87 + __u64 stats[2]; 88 + 89 + read_stats(skel, stats); 90 + printf("local=%llu global=%llu\n", stats[0], stats[1]); 91 + fflush(stdout); 92 + sleep(1); 93 + } 94 + 95 + bpf_link__destroy(link); 96 + UEI_REPORT(skel, uei); 97 + scx_simple__destroy(skel); 98 + return 0; 99 + }