[PATCH] sched: add new SCHED_BATCH policy

Add a new SCHED_BATCH (3) scheduling policy: such tasks are presumed to be
CPU-intensive, and will acquire a constant +5 priority level penalty. This
policy is useful for workloads that are non-interactive, but which do not
want to give up their nice levels. The policy is also useful for workloads
that want a deterministic scheduling policy, without interactivity causing
extra preemptions (between that workload's tasks).

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by Ingo Molnar and committed by Linus Torvalds b0a9499c 2d0cfb52

+40 -19
+4 -3
include/linux/sched.h
··· 160 #define SCHED_NORMAL 0 161 #define SCHED_FIFO 1 162 #define SCHED_RR 2 163 164 struct sched_param { 165 int sched_priority; ··· 471 472 /* 473 * Priority of a process goes from 0..MAX_PRIO-1, valid RT 474 - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are 475 - * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values 476 - * are inverted: lower p->prio value means higher priority. 477 * 478 * The MAX_USER_RT_PRIO value allows the actual maximum 479 * RT priority to be separate from the value exported to
··· 160 #define SCHED_NORMAL 0 161 #define SCHED_FIFO 1 162 #define SCHED_RR 2 163 + #define SCHED_BATCH 3 164 165 struct sched_param { 166 int sched_priority; ··· 470 471 /* 472 * Priority of a process goes from 0..MAX_PRIO-1, valid RT 473 + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH 474 + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority 475 + * values are inverted: lower p->prio value means higher priority. 476 * 477 * The MAX_USER_RT_PRIO value allows the actual maximum 478 * RT priority to be separate from the value exported to
+3 -1
kernel/exit.c
··· 244 /* Set the exit signal to SIGCHLD so we signal init on exit */ 245 current->exit_signal = SIGCHLD; 246 247 - if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) 248 set_user_nice(current, 0); 249 /* cpus_allowed? */ 250 /* rt_priority? */
··· 244 /* Set the exit signal to SIGCHLD so we signal init on exit */ 245 current->exit_signal = SIGCHLD; 246 247 + if ((current->policy == SCHED_NORMAL || 248 + current->policy == SCHED_BATCH) 249 + && (task_nice(current) < 0)) 250 set_user_nice(current, 0); 251 /* cpus_allowed? */ 252 /* rt_priority? */
+33 -15
kernel/sched.c
··· 748 unsigned long long __sleep_time = now - p->timestamp; 749 unsigned long sleep_time; 750 751 - if (__sleep_time > NS_MAX_SLEEP_AVG) 752 - sleep_time = NS_MAX_SLEEP_AVG; 753 - else 754 - sleep_time = (unsigned long)__sleep_time; 755 756 if (likely(sleep_time > 0)) { 757 /* ··· 3564 * The RT priorities are set via sched_setscheduler(), but we still 3565 * allow the 'normal' nice value to be set - but as expected 3566 * it wont have any effect on scheduling until the task is 3567 - * not SCHED_NORMAL: 3568 */ 3569 if (rt_task(p)) { 3570 p->static_prio = NICE_TO_PRIO(nice); ··· 3710 BUG_ON(p->array); 3711 p->policy = policy; 3712 p->rt_priority = prio; 3713 - if (policy != SCHED_NORMAL) 3714 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 3715 - else 3716 p->prio = p->static_prio; 3717 } 3718 3719 /** ··· 3743 if (policy < 0) 3744 policy = oldpolicy = p->policy; 3745 else if (policy != SCHED_FIFO && policy != SCHED_RR && 3746 - policy != SCHED_NORMAL) 3747 - return -EINVAL; 3748 /* 3749 * Valid priorities for SCHED_FIFO and SCHED_RR are 3750 - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 
3751 */ 3752 if (param->sched_priority < 0 || 3753 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3754 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3755 return -EINVAL; 3756 - if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3757 return -EINVAL; 3758 3759 /* 3760 * Allow unprivileged RT tasks to decrease priority: 3761 */ 3762 if (!capable(CAP_SYS_NICE)) { 3763 - /* can't change policy */ 3764 - if (policy != p->policy && 3765 - !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3766 return -EPERM; 3767 /* can't increase priority */ 3768 - if (policy != SCHED_NORMAL && 3769 param->sched_priority > p->rt_priority && 3770 param->sched_priority > 3771 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) ··· 4249 ret = MAX_USER_RT_PRIO-1; 4250 break; 4251 case SCHED_NORMAL: 4252 ret = 0; 4253 break; 4254 } ··· 4273 ret = 1; 4274 break; 4275 case SCHED_NORMAL: 4276 ret = 0; 4277 } 4278 return ret;
··· 748 unsigned long long __sleep_time = now - p->timestamp; 749 unsigned long sleep_time; 750 751 + if (unlikely(p->policy == SCHED_BATCH)) 752 + sleep_time = 0; 753 + else { 754 + if (__sleep_time > NS_MAX_SLEEP_AVG) 755 + sleep_time = NS_MAX_SLEEP_AVG; 756 + else 757 + sleep_time = (unsigned long)__sleep_time; 758 + } 759 760 if (likely(sleep_time > 0)) { 761 /* ··· 3560 * The RT priorities are set via sched_setscheduler(), but we still 3561 * allow the 'normal' nice value to be set - but as expected 3562 * it wont have any effect on scheduling until the task is 3563 + * not SCHED_NORMAL/SCHED_BATCH: 3564 */ 3565 if (rt_task(p)) { 3566 p->static_prio = NICE_TO_PRIO(nice); ··· 3706 BUG_ON(p->array); 3707 p->policy = policy; 3708 p->rt_priority = prio; 3709 + if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { 3710 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 3711 + } else { 3712 p->prio = p->static_prio; 3713 + /* 3714 + * SCHED_BATCH tasks are treated as perpetual CPU hogs: 3715 + */ 3716 + if (policy == SCHED_BATCH) 3717 + p->sleep_avg = 0; 3718 + } 3719 } 3720 3721 /** ··· 3733 if (policy < 0) 3734 policy = oldpolicy = p->policy; 3735 else if (policy != SCHED_FIFO && policy != SCHED_RR && 3736 + policy != SCHED_NORMAL && policy != SCHED_BATCH) 3737 + return -EINVAL; 3738 /* 3739 * Valid priorities for SCHED_FIFO and SCHED_RR are 3740 + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 3741 + * SCHED_BATCH is 0. 
3742 */ 3743 if (param->sched_priority < 0 || 3744 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3745 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3746 return -EINVAL; 3747 + if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) 3748 + != (param->sched_priority == 0)) 3749 return -EINVAL; 3750 3751 /* 3752 * Allow unprivileged RT tasks to decrease priority: 3753 */ 3754 if (!capable(CAP_SYS_NICE)) { 3755 + /* 3756 + * can't change policy, except between SCHED_NORMAL 3757 + * and SCHED_BATCH: 3758 + */ 3759 + if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && 3760 + (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && 3761 + !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3762 return -EPERM; 3763 /* can't increase priority */ 3764 + if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && 3765 param->sched_priority > p->rt_priority && 3766 param->sched_priority > 3767 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) ··· 4233 ret = MAX_USER_RT_PRIO-1; 4234 break; 4235 case SCHED_NORMAL: 4236 + case SCHED_BATCH: 4237 ret = 0; 4238 break; 4239 } ··· 4256 ret = 1; 4257 break; 4258 case SCHED_NORMAL: 4259 + case SCHED_BATCH: 4260 ret = 0; 4261 } 4262 return ret;