[PATCH] sched: add new SCHED_BATCH policy

Add a new SCHED_BATCH (3) scheduling policy: such tasks are presumed
CPU-intensive, and will acquire a constant +5 priority level penalty. This
policy is well suited to workloads that are non-interactive, but which do
not want to give up their nice levels. The policy is also useful for
workloads that want a deterministic scheduling policy without interactivity
causing extra preemptions (between that workload's tasks).

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by Ingo Molnar and committed by Linus Torvalds b0a9499c 2d0cfb52

+40 -19
+4 -3
include/linux/sched.h
··· 160 160 #define SCHED_NORMAL 0 161 161 #define SCHED_FIFO 1 162 162 #define SCHED_RR 2 163 + #define SCHED_BATCH 3 163 164 164 165 struct sched_param { 165 166 int sched_priority; ··· 471 470 472 471 /* 473 472 * Priority of a process goes from 0..MAX_PRIO-1, valid RT 474 - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are 475 - * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values 476 - * are inverted: lower p->prio value means higher priority. 473 + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH 474 + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority 475 + * values are inverted: lower p->prio value means higher priority. 477 476 * 478 477 * The MAX_USER_RT_PRIO value allows the actual maximum 479 478 * RT priority to be separate from the value exported to
+3 -1
kernel/exit.c
··· 244 244 /* Set the exit signal to SIGCHLD so we signal init on exit */ 245 245 current->exit_signal = SIGCHLD; 246 246 247 - if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) 247 + if ((current->policy == SCHED_NORMAL || 248 + current->policy == SCHED_BATCH) 249 + && (task_nice(current) < 0)) 248 250 set_user_nice(current, 0); 249 251 /* cpus_allowed? */ 250 252 /* rt_priority? */
+33 -15
kernel/sched.c
··· 748 748 unsigned long long __sleep_time = now - p->timestamp; 749 749 unsigned long sleep_time; 750 750 751 - if (__sleep_time > NS_MAX_SLEEP_AVG) 752 - sleep_time = NS_MAX_SLEEP_AVG; 753 - else 754 - sleep_time = (unsigned long)__sleep_time; 751 + if (unlikely(p->policy == SCHED_BATCH)) 752 + sleep_time = 0; 753 + else { 754 + if (__sleep_time > NS_MAX_SLEEP_AVG) 755 + sleep_time = NS_MAX_SLEEP_AVG; 756 + else 757 + sleep_time = (unsigned long)__sleep_time; 758 + } 755 759 756 760 if (likely(sleep_time > 0)) { 757 761 /* ··· 3564 3560 * The RT priorities are set via sched_setscheduler(), but we still 3565 3561 * allow the 'normal' nice value to be set - but as expected 3566 3562 * it wont have any effect on scheduling until the task is 3567 - * not SCHED_NORMAL: 3563 + * not SCHED_NORMAL/SCHED_BATCH: 3568 3564 */ 3569 3565 if (rt_task(p)) { 3570 3566 p->static_prio = NICE_TO_PRIO(nice); ··· 3710 3706 BUG_ON(p->array); 3711 3707 p->policy = policy; 3712 3708 p->rt_priority = prio; 3713 - if (policy != SCHED_NORMAL) 3709 + if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { 3714 3710 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 3715 - else 3711 + } else { 3716 3712 p->prio = p->static_prio; 3713 + /* 3714 + * SCHED_BATCH tasks are treated as perpetual CPU hogs: 3715 + */ 3716 + if (policy == SCHED_BATCH) 3717 + p->sleep_avg = 0; 3718 + } 3717 3719 } 3718 3720 3719 3721 /** ··· 3743 3733 if (policy < 0) 3744 3734 policy = oldpolicy = p->policy; 3745 3735 else if (policy != SCHED_FIFO && policy != SCHED_RR && 3746 - policy != SCHED_NORMAL) 3747 - return -EINVAL; 3736 + policy != SCHED_NORMAL && policy != SCHED_BATCH) 3737 + return -EINVAL; 3748 3738 /* 3749 3739 * Valid priorities for SCHED_FIFO and SCHED_RR are 3750 - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 3740 + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 3741 + * SCHED_BATCH is 0. 
3751 3742 */ 3752 3743 if (param->sched_priority < 0 || 3753 3744 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3754 3745 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3755 3746 return -EINVAL; 3756 - if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3747 + if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) 3748 + != (param->sched_priority == 0)) 3757 3749 return -EINVAL; 3758 3750 3759 3751 /* 3760 3752 * Allow unprivileged RT tasks to decrease priority: 3761 3753 */ 3762 3754 if (!capable(CAP_SYS_NICE)) { 3763 - /* can't change policy */ 3764 - if (policy != p->policy && 3765 - !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3755 + /* 3756 + * can't change policy, except between SCHED_NORMAL 3757 + * and SCHED_BATCH: 3758 + */ 3759 + if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && 3760 + (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && 3761 + !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3766 3762 return -EPERM; 3767 3763 /* can't increase priority */ 3768 - if (policy != SCHED_NORMAL && 3764 + if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && 3769 3765 param->sched_priority > p->rt_priority && 3770 3766 param->sched_priority > 3771 3767 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) ··· 4249 4233 ret = MAX_USER_RT_PRIO-1; 4250 4234 break; 4251 4235 case SCHED_NORMAL: 4236 + case SCHED_BATCH: 4252 4237 ret = 0; 4253 4238 break; 4254 4239 } ··· 4273 4256 ret = 1; 4274 4257 break; 4275 4258 case SCHED_NORMAL: 4259 + case SCHED_BATCH: 4276 4260 ret = 0; 4277 4261 } 4278 4262 return ret;