Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched: Add new scheduler syscalls to support an extended scheduling parameters ABI

Add the syscalls needed for supporting scheduling algorithms
with extended scheduling parameters (e.g., SCHED_DEADLINE).

In general, it makes it possible to specify a periodic/sporadic task
that executes for a given amount of runtime at each instance, and is
scheduled according to the urgency of its own timing constraints,
i.e.:

- a (maximum/typical) instance execution time,
- a minimum interval between consecutive instances,
- a time constraint by which each instance must be completed.

Thus, both the data structure that holds the scheduling parameters of
the tasks and the system calls dealing with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.

For these reasons, this patch:

- defines the new struct sched_attr, containing all the fields
that are necessary for specifying a task in the computational
model described above;

- defines and implements the new scheduling related syscalls that
manipulate it, i.e., sched_setattr() and sched_getattr().

Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a
proof of concept and for developing and testing purposes. Making them
available on other architectures is straightforward.

Since no "user" for these new parameters is introduced in this patch,
the implementation of the new system calls is just identical to that of
their already existing counterparts. Future patches that implement
scheduling policies able to exploit the new data structure must also
take care of modifying the sched_*attr() calls in accordance with their
own purposes.

Signed-off-by: Dario Faggioli <raistlin@linux.it>
[ Rewrote to use sched_attr. ]
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Removed sched_setscheduler2() for now. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

authored by

Dario Faggioli and committed by
Ingo Molnar
d50dde5a 56b48110

+326 -24
+1 -1
arch/arm/include/asm/unistd.h
··· 15 15 16 16 #include <uapi/asm/unistd.h> 17 17 18 - #define __NR_syscalls (380) 18 + #define __NR_syscalls (384) 19 19 #define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0) 20 20 21 21 #define __ARCH_WANT_STAT64
+2
arch/arm/include/uapi/asm/unistd.h
··· 406 406 #define __NR_process_vm_writev (__NR_SYSCALL_BASE+377) 407 407 #define __NR_kcmp (__NR_SYSCALL_BASE+378) 408 408 #define __NR_finit_module (__NR_SYSCALL_BASE+379) 409 + #define __NR_sched_setattr (__NR_SYSCALL_BASE+380) 410 + #define __NR_sched_getattr (__NR_SYSCALL_BASE+381) 409 411 410 412 /* 411 413 * This may need to be greater than __NR_last_syscall+1 in order to
+2
arch/arm/kernel/calls.S
··· 389 389 CALL(sys_process_vm_writev) 390 390 CALL(sys_kcmp) 391 391 CALL(sys_finit_module) 392 + /* 380 */ CALL(sys_sched_setattr) 393 + CALL(sys_sched_getattr) 392 394 #ifndef syscalls_counted 393 395 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls 394 396 #define syscalls_counted
+2
arch/x86/syscalls/syscall_32.tbl
··· 357 357 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 358 358 349 i386 kcmp sys_kcmp 359 359 350 i386 finit_module sys_finit_module 360 + 351 i386 sched_setattr sys_sched_setattr 361 + 352 i386 sched_getattr sys_sched_getattr
+2
arch/x86/syscalls/syscall_64.tbl
··· 320 320 311 64 process_vm_writev sys_process_vm_writev 321 321 312 common kcmp sys_kcmp 322 322 313 common finit_module sys_finit_module 323 + 314 common sched_setattr sys_sched_setattr 324 + 315 common sched_getattr sys_sched_getattr 323 325 324 326 # 325 327 # x32-specific system call numbers start at 512 to avoid cache impact
+62
include/linux/sched.h
··· 56 56 57 57 #include <asm/processor.h> 58 58 59 + #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ 60 + 61 + /* 62 + * Extended scheduling parameters data structure. 63 + * 64 + * This is needed because the original struct sched_param can not be 65 + * altered without introducing ABI issues with legacy applications 66 + * (e.g., in sched_getparam()). 67 + * 68 + * However, the possibility of specifying more than just a priority for 69 + * the tasks may be useful for a wide variety of application fields, e.g., 70 + * multimedia, streaming, automation and control, and many others. 71 + * 72 + * This variant (sched_attr) is meant at describing a so-called 73 + * sporadic time-constrained task. In such model a task is specified by: 74 + * - the activation period or minimum instance inter-arrival time; 75 + * - the maximum (or average, depending on the actual scheduling 76 + * discipline) computation time of all instances, a.k.a. runtime; 77 + * - the deadline (relative to the actual activation time) of each 78 + * instance. 79 + * Very briefly, a periodic (sporadic) task asks for the execution of 80 + * some specific computation --which is typically called an instance-- 81 + * (at most) every period. Moreover, each instance typically lasts no more 82 + * than the runtime and must be completed by time instant t equal to 83 + * the instance activation time + the deadline. 84 + * 85 + * This is reflected by the actual fields of the sched_attr structure: 86 + * 87 + * @size size of the structure, for fwd/bwd compat. 
88 + * 89 + * @sched_policy task's scheduling policy 90 + * @sched_flags for customizing the scheduler behaviour 91 + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) 92 + * @sched_priority task's static priority (SCHED_FIFO/RR) 93 + * @sched_deadline representative of the task's deadline 94 + * @sched_runtime representative of the task's runtime 95 + * @sched_period representative of the task's period 96 + * 97 + * Given this task model, there are a multiplicity of scheduling algorithms 98 + * and policies, that can be used to ensure all the tasks will make their 99 + * timing constraints. 100 + */ 101 + struct sched_attr { 102 + u32 size; 103 + 104 + u32 sched_policy; 105 + u64 sched_flags; 106 + 107 + /* SCHED_NORMAL, SCHED_BATCH */ 108 + s32 sched_nice; 109 + 110 + /* SCHED_FIFO, SCHED_RR */ 111 + u32 sched_priority; 112 + 113 + /* SCHED_DEADLINE */ 114 + u64 sched_runtime; 115 + u64 sched_deadline; 116 + u64 sched_period; 117 + }; 118 + 59 119 struct exec_domain; 60 120 struct futex_pi_state; 61 121 struct robust_list_head; ··· 2018 1958 const struct sched_param *); 2019 1959 extern int sched_setscheduler_nocheck(struct task_struct *, int, 2020 1960 const struct sched_param *); 1961 + extern int sched_setattr(struct task_struct *, 1962 + const struct sched_attr *); 2021 1963 extern struct task_struct *idle_task(int cpu); 2022 1964 /** 2023 1965 * is_idle_task - is the specified task an idle task?
+6
include/linux/syscalls.h
··· 38 38 struct rlimit64; 39 39 struct rusage; 40 40 struct sched_param; 41 + struct sched_attr; 41 42 struct sel_arg_struct; 42 43 struct semaphore; 43 44 struct sembuf; ··· 280 279 struct sched_param __user *param); 281 280 asmlinkage long sys_sched_setparam(pid_t pid, 282 281 struct sched_param __user *param); 282 + asmlinkage long sys_sched_setattr(pid_t pid, 283 + struct sched_attr __user *attr); 283 284 asmlinkage long sys_sched_getscheduler(pid_t pid); 284 285 asmlinkage long sys_sched_getparam(pid_t pid, 285 286 struct sched_param __user *param); 287 + asmlinkage long sys_sched_getattr(pid_t pid, 288 + struct sched_attr __user *attr, 289 + unsigned int size); 286 290 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 287 291 unsigned long __user *user_mask_ptr); 288 292 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
+243 -20
kernel/sched/core.c
··· 2817 2817 __task_rq_unlock(rq); 2818 2818 } 2819 2819 #endif 2820 + 2820 2821 void set_user_nice(struct task_struct *p, long nice) 2821 2822 { 2822 2823 int old_prio, delta, on_rq; ··· 2992 2991 return pid ? find_task_by_vpid(pid) : current; 2993 2992 } 2994 2993 2995 - /* Actually do priority change: must hold rq lock. */ 2996 - static void 2997 - __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 2994 + /* Actually do priority change: must hold pi & rq lock. */ 2995 + static void __setscheduler(struct rq *rq, struct task_struct *p, 2996 + const struct sched_attr *attr) 2998 2997 { 2998 + int policy = attr->sched_policy; 2999 + 2999 3000 p->policy = policy; 3000 - p->rt_priority = prio; 3001 + 3002 + if (rt_policy(policy)) 3003 + p->rt_priority = attr->sched_priority; 3004 + else 3005 + p->static_prio = NICE_TO_PRIO(attr->sched_nice); 3006 + 3001 3007 p->normal_prio = normal_prio(p); 3002 - /* we are holding p->pi_lock already */ 3003 3008 p->prio = rt_mutex_getprio(p); 3009 + 3004 3010 if (rt_prio(p->prio)) 3005 3011 p->sched_class = &rt_sched_class; 3006 3012 else 3007 3013 p->sched_class = &fair_sched_class; 3014 + 3008 3015 set_load_weight(p); 3009 3016 } 3010 - 3011 3017 /* 3012 3018 * check the target process has a UID that matches the current process's 3013 3019 */ ··· 3031 3023 return match; 3032 3024 } 3033 3025 3034 - static int __sched_setscheduler(struct task_struct *p, int policy, 3035 - const struct sched_param *param, bool user) 3026 + static int __sched_setscheduler(struct task_struct *p, 3027 + const struct sched_attr *attr, 3028 + bool user) 3036 3029 { 3037 3030 int retval, oldprio, oldpolicy = -1, on_rq, running; 3031 + int policy = attr->sched_policy; 3038 3032 unsigned long flags; 3039 3033 const struct sched_class *prev_class; 3040 3034 struct rq *rq; ··· 3064 3054 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3065 3055 * SCHED_BATCH and SCHED_IDLE is 0. 
3066 3056 */ 3067 - if (param->sched_priority < 0 || 3068 - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3069 - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3057 + if (attr->sched_priority < 0 || 3058 + (p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 3059 + (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 3070 3060 return -EINVAL; 3071 - if (rt_policy(policy) != (param->sched_priority != 0)) 3061 + if (rt_policy(policy) != (attr->sched_priority != 0)) 3072 3062 return -EINVAL; 3073 3063 3074 3064 /* 3075 3065 * Allow unprivileged RT tasks to decrease priority: 3076 3066 */ 3077 3067 if (user && !capable(CAP_SYS_NICE)) { 3068 + if (fair_policy(policy)) { 3069 + if (!can_nice(p, attr->sched_nice)) 3070 + return -EPERM; 3071 + } 3072 + 3078 3073 if (rt_policy(policy)) { 3079 3074 unsigned long rlim_rtprio = 3080 3075 task_rlimit(p, RLIMIT_RTPRIO); ··· 3089 3074 return -EPERM; 3090 3075 3091 3076 /* can't increase priority */ 3092 - if (param->sched_priority > p->rt_priority && 3093 - param->sched_priority > rlim_rtprio) 3077 + if (attr->sched_priority > p->rt_priority && 3078 + attr->sched_priority > rlim_rtprio) 3094 3079 return -EPERM; 3095 3080 } 3096 3081 ··· 3138 3123 /* 3139 3124 * If not changing anything there's no need to proceed further: 3140 3125 */ 3141 - if (unlikely(policy == p->policy && (!rt_policy(policy) || 3142 - param->sched_priority == p->rt_priority))) { 3126 + if (unlikely(policy == p->policy)) { 3127 + if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) 3128 + goto change; 3129 + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3130 + goto change; 3131 + 3143 3132 task_rq_unlock(rq, p, &flags); 3144 3133 return 0; 3145 3134 } 3135 + change: 3146 3136 3147 3137 #ifdef CONFIG_RT_GROUP_SCHED 3148 3138 if (user) { ··· 3181 3161 3182 3162 oldprio = p->prio; 3183 3163 prev_class = p->sched_class; 3184 - __setscheduler(rq, p, policy, param->sched_priority); 3164 + __setscheduler(rq, p, attr); 3185 
3165 3186 3166 if (running) 3187 3167 p->sched_class->set_curr_task(rq); ··· 3209 3189 int sched_setscheduler(struct task_struct *p, int policy, 3210 3190 const struct sched_param *param) 3211 3191 { 3212 - return __sched_setscheduler(p, policy, param, true); 3192 + struct sched_attr attr = { 3193 + .sched_policy = policy, 3194 + .sched_priority = param->sched_priority 3195 + }; 3196 + return __sched_setscheduler(p, &attr, true); 3213 3197 } 3214 3198 EXPORT_SYMBOL_GPL(sched_setscheduler); 3199 + 3200 + int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 3201 + { 3202 + return __sched_setscheduler(p, attr, true); 3203 + } 3204 + EXPORT_SYMBOL_GPL(sched_setattr); 3215 3205 3216 3206 /** 3217 3207 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ··· 3239 3209 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3240 3210 const struct sched_param *param) 3241 3211 { 3242 - return __sched_setscheduler(p, policy, param, false); 3212 + struct sched_attr attr = { 3213 + .sched_policy = policy, 3214 + .sched_priority = param->sched_priority 3215 + }; 3216 + return __sched_setscheduler(p, &attr, false); 3243 3217 } 3244 3218 3245 3219 static int ··· 3266 3232 rcu_read_unlock(); 3267 3233 3268 3234 return retval; 3235 + } 3236 + 3237 + /* 3238 + * Mimics kernel/events/core.c perf_copy_attr(). 3239 + */ 3240 + static int sched_copy_attr(struct sched_attr __user *uattr, 3241 + struct sched_attr *attr) 3242 + { 3243 + u32 size; 3244 + int ret; 3245 + 3246 + if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 3247 + return -EFAULT; 3248 + 3249 + /* 3250 + * zero the full structure, so that a short copy will be nice. 
3251 + */ 3252 + memset(attr, 0, sizeof(*attr)); 3253 + 3254 + ret = get_user(size, &uattr->size); 3255 + if (ret) 3256 + return ret; 3257 + 3258 + if (size > PAGE_SIZE) /* silly large */ 3259 + goto err_size; 3260 + 3261 + if (!size) /* abi compat */ 3262 + size = SCHED_ATTR_SIZE_VER0; 3263 + 3264 + if (size < SCHED_ATTR_SIZE_VER0) 3265 + goto err_size; 3266 + 3267 + /* 3268 + * If we're handed a bigger struct than we know of, 3269 + * ensure all the unknown bits are 0 - i.e. new 3270 + * user-space does not rely on any kernel feature 3271 + * extensions we dont know about yet. 3272 + */ 3273 + if (size > sizeof(*attr)) { 3274 + unsigned char __user *addr; 3275 + unsigned char __user *end; 3276 + unsigned char val; 3277 + 3278 + addr = (void __user *)uattr + sizeof(*attr); 3279 + end = (void __user *)uattr + size; 3280 + 3281 + for (; addr < end; addr++) { 3282 + ret = get_user(val, addr); 3283 + if (ret) 3284 + return ret; 3285 + if (val) 3286 + goto err_size; 3287 + } 3288 + size = sizeof(*attr); 3289 + } 3290 + 3291 + ret = copy_from_user(attr, uattr, size); 3292 + if (ret) 3293 + return -EFAULT; 3294 + 3295 + /* 3296 + * XXX: do we want to be lenient like existing syscalls; or do we want 3297 + * to be strict and return an error on out-of-bounds values? 3298 + */ 3299 + attr->sched_nice = clamp(attr->sched_nice, -20, 19); 3300 + 3301 + out: 3302 + return ret; 3303 + 3304 + err_size: 3305 + put_user(sizeof(*attr), &uattr->size); 3306 + ret = -E2BIG; 3307 + goto out; 3269 3308 } 3270 3309 3271 3310 /** ··· 3369 3262 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3370 3263 { 3371 3264 return do_sched_setscheduler(pid, -1, param); 3265 + } 3266 + 3267 + /** 3268 + * sys_sched_setattr - same as above, but with extended sched_attr 3269 + * @pid: the pid in question. 3270 + * @attr: structure containing the extended parameters. 
3271 + */ 3272 + SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) 3273 + { 3274 + struct sched_attr attr; 3275 + struct task_struct *p; 3276 + int retval; 3277 + 3278 + if (!uattr || pid < 0) 3279 + return -EINVAL; 3280 + 3281 + if (sched_copy_attr(uattr, &attr)) 3282 + return -EFAULT; 3283 + 3284 + rcu_read_lock(); 3285 + retval = -ESRCH; 3286 + p = find_process_by_pid(pid); 3287 + if (p != NULL) 3288 + retval = sched_setattr(p, &attr); 3289 + rcu_read_unlock(); 3290 + 3291 + return retval; 3372 3292 } 3373 3293 3374 3294 /** ··· 3461 3327 */ 3462 3328 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3463 3329 3330 + return retval; 3331 + 3332 + out_unlock: 3333 + rcu_read_unlock(); 3334 + return retval; 3335 + } 3336 + 3337 + static int sched_read_attr(struct sched_attr __user *uattr, 3338 + struct sched_attr *attr, 3339 + unsigned int usize) 3340 + { 3341 + int ret; 3342 + 3343 + if (!access_ok(VERIFY_WRITE, uattr, usize)) 3344 + return -EFAULT; 3345 + 3346 + /* 3347 + * If we're handed a smaller struct than we know of, 3348 + * ensure all the unknown bits are 0 - i.e. old 3349 + * user-space does not get uncomplete information. 3350 + */ 3351 + if (usize < sizeof(*attr)) { 3352 + unsigned char *addr; 3353 + unsigned char *end; 3354 + 3355 + addr = (void *)attr + usize; 3356 + end = (void *)attr + sizeof(*attr); 3357 + 3358 + for (; addr < end; addr++) { 3359 + if (*addr) 3360 + goto err_size; 3361 + } 3362 + 3363 + attr->size = usize; 3364 + } 3365 + 3366 + ret = copy_to_user(uattr, attr, usize); 3367 + if (ret) 3368 + return -EFAULT; 3369 + 3370 + out: 3371 + return ret; 3372 + 3373 + err_size: 3374 + ret = -E2BIG; 3375 + goto out; 3376 + } 3377 + 3378 + /** 3379 + * sys_sched_getattr - same as above, but with extended "sched_param" 3380 + * @pid: the pid in question. 3381 + * @attr: structure containing the extended parameters. 3382 + * @size: sizeof(attr) for fwd/bwd comp. 
3383 + */ 3384 + SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3385 + unsigned int, size) 3386 + { 3387 + struct sched_attr attr = { 3388 + .size = sizeof(struct sched_attr), 3389 + }; 3390 + struct task_struct *p; 3391 + int retval; 3392 + 3393 + if (!uattr || pid < 0 || size > PAGE_SIZE || 3394 + size < SCHED_ATTR_SIZE_VER0) 3395 + return -EINVAL; 3396 + 3397 + rcu_read_lock(); 3398 + p = find_process_by_pid(pid); 3399 + retval = -ESRCH; 3400 + if (!p) 3401 + goto out_unlock; 3402 + 3403 + retval = security_task_getscheduler(p); 3404 + if (retval) 3405 + goto out_unlock; 3406 + 3407 + attr.sched_policy = p->policy; 3408 + if (task_has_rt_policy(p)) 3409 + attr.sched_priority = p->rt_priority; 3410 + else 3411 + attr.sched_nice = TASK_NICE(p); 3412 + 3413 + rcu_read_unlock(); 3414 + 3415 + retval = sched_read_attr(uattr, &attr, size); 3464 3416 return retval; 3465 3417 3466 3418 out_unlock: ··· 6620 6400 static void normalize_task(struct rq *rq, struct task_struct *p) 6621 6401 { 6622 6402 const struct sched_class *prev_class = p->sched_class; 6403 + struct sched_attr attr = { 6404 + .sched_policy = SCHED_NORMAL, 6405 + }; 6623 6406 int old_prio = p->prio; 6624 6407 int on_rq; 6625 6408 6626 6409 on_rq = p->on_rq; 6627 6410 if (on_rq) 6628 6411 dequeue_task(rq, p, 0); 6629 - __setscheduler(rq, p, SCHED_NORMAL, 0); 6412 + __setscheduler(rq, p, &attr); 6630 6413 if (on_rq) { 6631 6414 enqueue_task(rq, p, 0); 6632 6415 resched_task(rq->curr);
+6 -3
kernel/sched/sched.h
··· 81 81 */ 82 82 #define RUNTIME_INF ((u64)~0ULL) 83 83 84 + static inline int fair_policy(int policy) 85 + { 86 + return policy == SCHED_NORMAL || policy == SCHED_BATCH; 87 + } 88 + 84 89 static inline int rt_policy(int policy) 85 90 { 86 - if (policy == SCHED_FIFO || policy == SCHED_RR) 87 - return 1; 88 - return 0; 91 + return policy == SCHED_FIFO || policy == SCHED_RR; 89 92 } 90 93 91 94 static inline int task_has_rt_policy(struct task_struct *p)