Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched: Add new scheduler syscalls to support an extended scheduling parameters ABI

Add the syscalls needed for supporting scheduling algorithms
with extended scheduling parameters (e.g., SCHED_DEADLINE).

In general, it makes it possible to specify a periodic/sporadic task
that executes for a given amount of runtime at each instance, and is
scheduled according to the urgency of its own timing constraints,
i.e.:

- a (maximum/typical) instance execution time,
- a minimum interval between consecutive instances,
- a time constraint by which each instance must be completed.

Thus, both the data structure that holds the scheduling parameters of
the tasks and the system calls dealing with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.

For these reasons, this patch:

- defines the new struct sched_attr, containing all the fields
that are necessary for specifying a task in the computational
model described above;

- defines and implements the new scheduling related syscalls that
manipulate it, i.e., sched_setattr() and sched_getattr().

Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a
proof of concept and for developing and testing purposes. Making them
available on other architectures is straightforward.

Since no "user" for these new parameters is introduced in this patch,
the implementation of the new system calls is just identical to that of
their already existing counterparts. Future patches that implement
scheduling policies able to exploit the new data structure must also
take care of modifying the sched_*attr() calls in accordance with their
own purposes.

Signed-off-by: Dario Faggioli <raistlin@linux.it>
[ Rewrote to use sched_attr. ]
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Removed sched_setscheduler2() for now. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

authored by

Dario Faggioli and committed by
Ingo Molnar
d50dde5a 56b48110

+326 -24
+1 -1
arch/arm/include/asm/unistd.h
··· 15 15 16 16 #include <uapi/asm/unistd.h> 17 17 18 - #define __NR_syscalls (380) 18 + #define __NR_syscalls (384) 19 19 #define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0) 20 20 21 21 #define __ARCH_WANT_STAT64
+2
arch/arm/include/uapi/asm/unistd.h
··· 406 406 #define __NR_process_vm_writev (__NR_SYSCALL_BASE+377) 407 407 #define __NR_kcmp (__NR_SYSCALL_BASE+378) 408 408 #define __NR_finit_module (__NR_SYSCALL_BASE+379) 409 + #define __NR_sched_setattr (__NR_SYSCALL_BASE+380) 410 + #define __NR_sched_getattr (__NR_SYSCALL_BASE+381) 409 411 410 412 /* 411 413 * This may need to be greater than __NR_last_syscall+1 in order to
+2
arch/arm/kernel/calls.S
··· 389 389 CALL(sys_process_vm_writev) 390 390 CALL(sys_kcmp) 391 391 CALL(sys_finit_module) 392 + /* 380 */ CALL(sys_sched_setattr) 393 + CALL(sys_sched_getattr) 392 394 #ifndef syscalls_counted 393 395 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls 394 396 #define syscalls_counted
+2
arch/x86/syscalls/syscall_32.tbl
··· 357 357 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 358 358 349 i386 kcmp sys_kcmp 359 359 350 i386 finit_module sys_finit_module 360 + 351 i386 sched_setattr sys_sched_setattr 361 + 352 i386 sched_getattr sys_sched_getattr
+2
arch/x86/syscalls/syscall_64.tbl
··· 320 320 311 64 process_vm_writev sys_process_vm_writev 321 321 312 common kcmp sys_kcmp 322 322 313 common finit_module sys_finit_module 323 + 314 common sched_setattr sys_sched_setattr 324 + 315 common sched_getattr sys_sched_getattr 323 325 324 326 # 325 327 # x32-specific system call numbers start at 512 to avoid cache impact
+62
include/linux/sched.h
··· 56 56 57 57 #include <asm/processor.h> 58 58 59 + #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ 60 + 61 + /* 62 + * Extended scheduling parameters data structure. 63 + * 64 + * This is needed because the original struct sched_param can not be 65 + * altered without introducing ABI issues with legacy applications 66 + * (e.g., in sched_getparam()). 67 + * 68 + * However, the possibility of specifying more than just a priority for 69 + * the tasks may be useful for a wide variety of application fields, e.g., 70 + * multimedia, streaming, automation and control, and many others. 71 + * 72 + * This variant (sched_attr) is meant at describing a so-called 73 + * sporadic time-constrained task. In such model a task is specified by: 74 + * - the activation period or minimum instance inter-arrival time; 75 + * - the maximum (or average, depending on the actual scheduling 76 + * discipline) computation time of all instances, a.k.a. runtime; 77 + * - the deadline (relative to the actual activation time) of each 78 + * instance. 79 + * Very briefly, a periodic (sporadic) task asks for the execution of 80 + * some specific computation --which is typically called an instance-- 81 + * (at most) every period. Moreover, each instance typically lasts no more 82 + * than the runtime and must be completed by time instant t equal to 83 + * the instance activation time + the deadline. 84 + * 85 + * This is reflected by the actual fields of the sched_attr structure: 86 + * 87 + * @size size of the structure, for fwd/bwd compat. 
88 + * 89 + * @sched_policy task's scheduling policy 90 + * @sched_flags for customizing the scheduler behaviour 91 + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) 92 + * @sched_priority task's static priority (SCHED_FIFO/RR) 93 + * @sched_deadline representative of the task's deadline 94 + * @sched_runtime representative of the task's runtime 95 + * @sched_period representative of the task's period 96 + * 97 + * Given this task model, there are a multiplicity of scheduling algorithms 98 + * and policies, that can be used to ensure all the tasks will make their 99 + * timing constraints. 100 + */ 101 + struct sched_attr { 102 + u32 size; 103 + 104 + u32 sched_policy; 105 + u64 sched_flags; 106 + 107 + /* SCHED_NORMAL, SCHED_BATCH */ 108 + s32 sched_nice; 109 + 110 + /* SCHED_FIFO, SCHED_RR */ 111 + u32 sched_priority; 112 + 113 + /* SCHED_DEADLINE */ 114 + u64 sched_runtime; 115 + u64 sched_deadline; 116 + u64 sched_period; 117 + }; 118 + 59 119 struct exec_domain; 60 120 struct futex_pi_state; 61 121 struct robust_list_head; ··· 2018 1958 const struct sched_param *); 2019 1959 extern int sched_setscheduler_nocheck(struct task_struct *, int, 2020 1960 const struct sched_param *); 1961 + extern int sched_setattr(struct task_struct *, 1962 + const struct sched_attr *); 2021 1963 extern struct task_struct *idle_task(int cpu); 2022 1964 /** 2023 1965 * is_idle_task - is the specified task an idle task?
+6
include/linux/syscalls.h
··· 38 38 struct rlimit64; 39 39 struct rusage; 40 40 struct sched_param; 41 + struct sched_attr; 41 42 struct sel_arg_struct; 42 43 struct semaphore; 43 44 struct sembuf; ··· 280 279 struct sched_param __user *param); 281 280 asmlinkage long sys_sched_setparam(pid_t pid, 282 281 struct sched_param __user *param); 282 + asmlinkage long sys_sched_setattr(pid_t pid, 283 + struct sched_attr __user *attr); 283 284 asmlinkage long sys_sched_getscheduler(pid_t pid); 284 285 asmlinkage long sys_sched_getparam(pid_t pid, 285 286 struct sched_param __user *param); 287 + asmlinkage long sys_sched_getattr(pid_t pid, 288 + struct sched_attr __user *attr, 289 + unsigned int size); 286 290 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 287 291 unsigned long __user *user_mask_ptr); 288 292 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
+243 -20
kernel/sched/core.c
··· 2817 2817 __task_rq_unlock(rq); 2818 2818 } 2819 2819 #endif 2820 + 2820 2821 void set_user_nice(struct task_struct *p, long nice) 2821 2822 { 2822 2823 int old_prio, delta, on_rq; ··· 2992 2991 return pid ? find_task_by_vpid(pid) : current; 2993 2992 } 2994 2993 2995 - /* Actually do priority change: must hold rq lock. */ 2996 - static void 2997 - __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 2994 + /* Actually do priority change: must hold pi & rq lock. */ 2995 + static void __setscheduler(struct rq *rq, struct task_struct *p, 2996 + const struct sched_attr *attr) 2998 2997 { 2998 + int policy = attr->sched_policy; 2999 + 2999 3000 p->policy = policy; 3000 - p->rt_priority = prio; 3001 + 3002 + if (rt_policy(policy)) 3003 + p->rt_priority = attr->sched_priority; 3004 + else 3005 + p->static_prio = NICE_TO_PRIO(attr->sched_nice); 3006 + 3001 3007 p->normal_prio = normal_prio(p); 3002 - /* we are holding p->pi_lock already */ 3003 3008 p->prio = rt_mutex_getprio(p); 3009 + 3004 3010 if (rt_prio(p->prio)) 3005 3011 p->sched_class = &rt_sched_class; 3006 3012 else 3007 3013 p->sched_class = &fair_sched_class; 3014 + 3008 3015 set_load_weight(p); 3009 3016 } 3010 - 3011 3017 /* 3012 3018 * check the target process has a UID that matches the current process's 3013 3019 */ ··· 3031 3023 return match; 3032 3024 } 3033 3025 3034 - static int __sched_setscheduler(struct task_struct *p, int policy, 3035 - const struct sched_param *param, bool user) 3026 + static int __sched_setscheduler(struct task_struct *p, 3027 + const struct sched_attr *attr, 3028 + bool user) 3036 3029 { 3037 3030 int retval, oldprio, oldpolicy = -1, on_rq, running; 3031 + int policy = attr->sched_policy; 3038 3032 unsigned long flags; 3039 3033 const struct sched_class *prev_class; 3040 3034 struct rq *rq; ··· 3064 3054 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3065 3055 * SCHED_BATCH and SCHED_IDLE is 0. 
3066 3056 */ 3067 - if (param->sched_priority < 0 || 3068 - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3069 - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3057 + if (attr->sched_priority < 0 || 3058 + (p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 3059 + (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 3070 3060 return -EINVAL; 3071 - if (rt_policy(policy) != (param->sched_priority != 0)) 3061 + if (rt_policy(policy) != (attr->sched_priority != 0)) 3072 3062 return -EINVAL; 3073 3063 3074 3064 /* 3075 3065 * Allow unprivileged RT tasks to decrease priority: 3076 3066 */ 3077 3067 if (user && !capable(CAP_SYS_NICE)) { 3068 + if (fair_policy(policy)) { 3069 + if (!can_nice(p, attr->sched_nice)) 3070 + return -EPERM; 3071 + } 3072 + 3078 3073 if (rt_policy(policy)) { 3079 3074 unsigned long rlim_rtprio = 3080 3075 task_rlimit(p, RLIMIT_RTPRIO); ··· 3089 3074 return -EPERM; 3090 3075 3091 3076 /* can't increase priority */ 3092 - if (param->sched_priority > p->rt_priority && 3093 - param->sched_priority > rlim_rtprio) 3077 + if (attr->sched_priority > p->rt_priority && 3078 + attr->sched_priority > rlim_rtprio) 3094 3079 return -EPERM; 3095 3080 } 3096 3081 ··· 3138 3123 /* 3139 3124 * If not changing anything there's no need to proceed further: 3140 3125 */ 3141 - if (unlikely(policy == p->policy && (!rt_policy(policy) || 3142 - param->sched_priority == p->rt_priority))) { 3126 + if (unlikely(policy == p->policy)) { 3127 + if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) 3128 + goto change; 3129 + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3130 + goto change; 3131 + 3143 3132 task_rq_unlock(rq, p, &flags); 3144 3133 return 0; 3145 3134 } 3135 + change: 3146 3136 3147 3137 #ifdef CONFIG_RT_GROUP_SCHED 3148 3138 if (user) { ··· 3181 3161 3182 3162 oldprio = p->prio; 3183 3163 prev_class = p->sched_class; 3184 - __setscheduler(rq, p, policy, param->sched_priority); 3164 + __setscheduler(rq, p, attr); 3185 
3165 3186 3166 if (running) 3187 3167 p->sched_class->set_curr_task(rq); ··· 3209 3189 int sched_setscheduler(struct task_struct *p, int policy, 3210 3190 const struct sched_param *param) 3211 3191 { 3212 - return __sched_setscheduler(p, policy, param, true); 3192 + struct sched_attr attr = { 3193 + .sched_policy = policy, 3194 + .sched_priority = param->sched_priority 3195 + }; 3196 + return __sched_setscheduler(p, &attr, true); 3213 3197 } 3214 3198 EXPORT_SYMBOL_GPL(sched_setscheduler); 3199 + 3200 + int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 3201 + { 3202 + return __sched_setscheduler(p, attr, true); 3203 + } 3204 + EXPORT_SYMBOL_GPL(sched_setattr); 3215 3205 3216 3206 /** 3217 3207 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ··· 3239 3209 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3240 3210 const struct sched_param *param) 3241 3211 { 3242 - return __sched_setscheduler(p, policy, param, false); 3212 + struct sched_attr attr = { 3213 + .sched_policy = policy, 3214 + .sched_priority = param->sched_priority 3215 + }; 3216 + return __sched_setscheduler(p, &attr, false); 3243 3217 } 3244 3218 3245 3219 static int ··· 3266 3232 rcu_read_unlock(); 3267 3233 3268 3234 return retval; 3235 + } 3236 + 3237 + /* 3238 + * Mimics kernel/events/core.c perf_copy_attr(). 3239 + */ 3240 + static int sched_copy_attr(struct sched_attr __user *uattr, 3241 + struct sched_attr *attr) 3242 + { 3243 + u32 size; 3244 + int ret; 3245 + 3246 + if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 3247 + return -EFAULT; 3248 + 3249 + /* 3250 + * zero the full structure, so that a short copy will be nice. 
3251 + */ 3252 + memset(attr, 0, sizeof(*attr)); 3253 + 3254 + ret = get_user(size, &uattr->size); 3255 + if (ret) 3256 + return ret; 3257 + 3258 + if (size > PAGE_SIZE) /* silly large */ 3259 + goto err_size; 3260 + 3261 + if (!size) /* abi compat */ 3262 + size = SCHED_ATTR_SIZE_VER0; 3263 + 3264 + if (size < SCHED_ATTR_SIZE_VER0) 3265 + goto err_size; 3266 + 3267 + /* 3268 + * If we're handed a bigger struct than we know of, 3269 + * ensure all the unknown bits are 0 - i.e. new 3270 + * user-space does not rely on any kernel feature 3271 + * extensions we dont know about yet. 3272 + */ 3273 + if (size > sizeof(*attr)) { 3274 + unsigned char __user *addr; 3275 + unsigned char __user *end; 3276 + unsigned char val; 3277 + 3278 + addr = (void __user *)uattr + sizeof(*attr); 3279 + end = (void __user *)uattr + size; 3280 + 3281 + for (; addr < end; addr++) { 3282 + ret = get_user(val, addr); 3283 + if (ret) 3284 + return ret; 3285 + if (val) 3286 + goto err_size; 3287 + } 3288 + size = sizeof(*attr); 3289 + } 3290 + 3291 + ret = copy_from_user(attr, uattr, size); 3292 + if (ret) 3293 + return -EFAULT; 3294 + 3295 + /* 3296 + * XXX: do we want to be lenient like existing syscalls; or do we want 3297 + * to be strict and return an error on out-of-bounds values? 3298 + */ 3299 + attr->sched_nice = clamp(attr->sched_nice, -20, 19); 3300 + 3301 + out: 3302 + return ret; 3303 + 3304 + err_size: 3305 + put_user(sizeof(*attr), &uattr->size); 3306 + ret = -E2BIG; 3307 + goto out; 3269 3308 } 3270 3309 3271 3310 /** ··· 3369 3262 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3370 3263 { 3371 3264 return do_sched_setscheduler(pid, -1, param); 3265 + } 3266 + 3267 + /** 3268 + * sys_sched_setattr - same as above, but with extended sched_attr 3269 + * @pid: the pid in question. 3270 + * @attr: structure containing the extended parameters. 
3271 + */ 3272 + SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) 3273 + { 3274 + struct sched_attr attr; 3275 + struct task_struct *p; 3276 + int retval; 3277 + 3278 + if (!uattr || pid < 0) 3279 + return -EINVAL; 3280 + 3281 + if (sched_copy_attr(uattr, &attr)) 3282 + return -EFAULT; 3283 + 3284 + rcu_read_lock(); 3285 + retval = -ESRCH; 3286 + p = find_process_by_pid(pid); 3287 + if (p != NULL) 3288 + retval = sched_setattr(p, &attr); 3289 + rcu_read_unlock(); 3290 + 3291 + return retval; 3372 3292 } 3373 3293 3374 3294 /** ··· 3461 3327 */ 3462 3328 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3463 3329 3330 + return retval; 3331 + 3332 + out_unlock: 3333 + rcu_read_unlock(); 3334 + return retval; 3335 + } 3336 + 3337 + static int sched_read_attr(struct sched_attr __user *uattr, 3338 + struct sched_attr *attr, 3339 + unsigned int usize) 3340 + { 3341 + int ret; 3342 + 3343 + if (!access_ok(VERIFY_WRITE, uattr, usize)) 3344 + return -EFAULT; 3345 + 3346 + /* 3347 + * If we're handed a smaller struct than we know of, 3348 + * ensure all the unknown bits are 0 - i.e. old 3349 + * user-space does not get uncomplete information. 3350 + */ 3351 + if (usize < sizeof(*attr)) { 3352 + unsigned char *addr; 3353 + unsigned char *end; 3354 + 3355 + addr = (void *)attr + usize; 3356 + end = (void *)attr + sizeof(*attr); 3357 + 3358 + for (; addr < end; addr++) { 3359 + if (*addr) 3360 + goto err_size; 3361 + } 3362 + 3363 + attr->size = usize; 3364 + } 3365 + 3366 + ret = copy_to_user(uattr, attr, usize); 3367 + if (ret) 3368 + return -EFAULT; 3369 + 3370 + out: 3371 + return ret; 3372 + 3373 + err_size: 3374 + ret = -E2BIG; 3375 + goto out; 3376 + } 3377 + 3378 + /** 3379 + * sys_sched_getattr - same as above, but with extended "sched_param" 3380 + * @pid: the pid in question. 3381 + * @attr: structure containing the extended parameters. 3382 + * @size: sizeof(attr) for fwd/bwd comp. 
3383 + */ 3384 + SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3385 + unsigned int, size) 3386 + { 3387 + struct sched_attr attr = { 3388 + .size = sizeof(struct sched_attr), 3389 + }; 3390 + struct task_struct *p; 3391 + int retval; 3392 + 3393 + if (!uattr || pid < 0 || size > PAGE_SIZE || 3394 + size < SCHED_ATTR_SIZE_VER0) 3395 + return -EINVAL; 3396 + 3397 + rcu_read_lock(); 3398 + p = find_process_by_pid(pid); 3399 + retval = -ESRCH; 3400 + if (!p) 3401 + goto out_unlock; 3402 + 3403 + retval = security_task_getscheduler(p); 3404 + if (retval) 3405 + goto out_unlock; 3406 + 3407 + attr.sched_policy = p->policy; 3408 + if (task_has_rt_policy(p)) 3409 + attr.sched_priority = p->rt_priority; 3410 + else 3411 + attr.sched_nice = TASK_NICE(p); 3412 + 3413 + rcu_read_unlock(); 3414 + 3415 + retval = sched_read_attr(uattr, &attr, size); 3464 3416 return retval; 3465 3417 3466 3418 out_unlock: ··· 6620 6400 static void normalize_task(struct rq *rq, struct task_struct *p) 6621 6401 { 6622 6402 const struct sched_class *prev_class = p->sched_class; 6403 + struct sched_attr attr = { 6404 + .sched_policy = SCHED_NORMAL, 6405 + }; 6623 6406 int old_prio = p->prio; 6624 6407 int on_rq; 6625 6408 6626 6409 on_rq = p->on_rq; 6627 6410 if (on_rq) 6628 6411 dequeue_task(rq, p, 0); 6629 - __setscheduler(rq, p, SCHED_NORMAL, 0); 6412 + __setscheduler(rq, p, &attr); 6630 6413 if (on_rq) { 6631 6414 enqueue_task(rq, p, 0); 6632 6415 resched_task(rq->curr);
+6 -3
kernel/sched/sched.h
··· 81 81 */ 82 82 #define RUNTIME_INF ((u64)~0ULL) 83 83 84 + static inline int fair_policy(int policy) 85 + { 86 + return policy == SCHED_NORMAL || policy == SCHED_BATCH; 87 + } 88 + 84 89 static inline int rt_policy(int policy) 85 90 { 86 - if (policy == SCHED_FIFO || policy == SCHED_RR) 87 - return 1; 88 - return 0; 91 + return policy == SCHED_FIFO || policy == SCHED_RR; 89 92 } 90 93 91 94 static inline int task_has_rt_policy(struct task_struct *p)