Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into next

Pull scheduler updates from Ingo Molnar:
"The main scheduling related changes in this cycle were:

- various sched/numa updates, for better performance

- tree wide cleanup of open coded nice levels

- nohz fix related to rq->nr_running use

- cpuidle changes and continued consolidation to improve the
kernel/sched/idle.c high level idle scheduling logic. As part of
this effort I pulled cpuidle driver changes from Rafael as well.

- standardized idle polling amongst architectures

- continued work on preparing better power/energy aware scheduling

- sched/rt updates

- misc fixlets and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (49 commits)
sched/numa: Decay ->wakee_flips instead of zeroing
sched/numa: Update migrate_improves/degrades_locality()
sched/numa: Allow task switch if load imbalance improves
sched/rt: Fix 'struct sched_dl_entity' and dl_task_time() comments, to match the current upstream code
sched: Consolidate open coded implementations of nice level frobbing into nice_to_rlimit() and rlimit_to_nice()
sched: Initialize rq->age_stamp on processor start
sched, nohz: Change rq->nr_running to always use wrappers
sched: Fix the rq->next_balance logic in rebalance_domains() and idle_balance()
sched: Use clamp() and clamp_val() to make sys_nice() more readable
sched: Do not zero sg->cpumask and sg->sgp->power in build_sched_groups()
sched/numa: Fix initialization of sched_domain_topology for NUMA
sched: Call select_idle_sibling() when not affine_sd
sched: Simplify return logic in sched_read_attr()
sched: Simplify return logic in sched_copy_attr()
sched: Fix exec_start/task_hot on migrated tasks
arm64: Remove TIF_POLLING_NRFLAG
metag: Remove TIF_POLLING_NRFLAG
sched/idle: Make cpuidle_idle_call() void
sched/idle: Reflow cpuidle_idle_call()
sched/idle: Delay clearing the polling bit
...

+761 -664
+2 -2
arch/alpha/include/asm/thread_info.h
··· 73 73 #define TIF_SYSCALL_AUDIT 4 /* syscall audit active */ 74 74 #define TIF_DIE_IF_KERNEL 9 /* dik recursion lock */ 75 75 #define TIF_MEMDIE 13 /* is terminating due to OOM killer */ 76 + #define TIF_POLLING_NRFLAG 14 /* idle is polling for TIF_NEED_RESCHED */ 76 77 77 78 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) 78 79 #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) 79 80 #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) 80 81 #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) 81 82 #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) 83 + #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) 82 84 83 85 /* Work to do on interrupt/exception return. */ 84 86 #define _TIF_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ ··· 94 92 #define TS_UAC_NOFIX 0x0002 /* ! flags as they match */ 95 93 #define TS_UAC_SIGBUS 0x0004 /* ! userspace part of 'osf_sysinfo' */ 96 94 #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ 97 - #define TS_POLLING 0x0010 /* idle task polling need_resched, 98 - skip sending interrupt */ 99 95 100 96 #ifndef __ASSEMBLY__ 101 97 #define HAVE_SET_RESTORE_SIGMASK 1
+26
arch/arm/kernel/topology.c
··· 185 185 return &cpu_topology[cpu].core_sibling; 186 186 } 187 187 188 + /* 189 + * The current assumption is that we can power gate each core independently. 190 + * This will be superseded by DT binding once available. 191 + */ 192 + const struct cpumask *cpu_corepower_mask(int cpu) 193 + { 194 + return &cpu_topology[cpu].thread_sibling; 195 + } 196 + 188 197 static void update_siblings_masks(unsigned int cpuid) 189 198 { 190 199 struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; ··· 275 266 cpu_topology[cpuid].socket_id, mpidr); 276 267 } 277 268 269 + static inline const int cpu_corepower_flags(void) 270 + { 271 + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN; 272 + } 273 + 274 + static struct sched_domain_topology_level arm_topology[] = { 275 + #ifdef CONFIG_SCHED_MC 276 + { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) }, 277 + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 278 + #endif 279 + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 280 + { NULL, }, 281 + }; 282 + 278 283 /* 279 284 * init_cpu_topology is called at boot when only one cpu is running 280 285 * which prevent simultaneous write access to cpu_topology array ··· 312 289 smp_wmb(); 313 290 314 291 parse_dt_topology(); 292 + 293 + /* Set scheduler topology descriptor */ 294 + set_sched_topology(arm_topology); 315 295 }
-2
arch/arm64/include/asm/thread_info.h
··· 95 95 * TIF_NEED_RESCHED - rescheduling necessary 96 96 * TIF_NOTIFY_RESUME - callback before returning to user 97 97 * TIF_USEDFPU - FPU was used by this task this quantum (SMP) 98 - * TIF_POLLING_NRFLAG - true if poll_idle() is polling TIF_NEED_RESCHED 99 98 */ 100 99 #define TIF_SIGPENDING 0 101 100 #define TIF_NEED_RESCHED 1 102 101 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ 103 102 #define TIF_SYSCALL_TRACE 8 104 - #define TIF_POLLING_NRFLAG 16 105 103 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ 106 104 #define TIF_FREEZE 19 107 105 #define TIF_RESTORE_SIGMASK 20
+2 -1
arch/ia64/include/asm/thread_info.h
··· 107 107 #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ 108 108 #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */ 109 109 #define TIF_RESTORE_RSE 21 /* user RBS is newer than kernel RBS */ 110 + #define TIF_POLLING_NRFLAG 22 /* idle is polling for TIF_NEED_RESCHED */ 110 111 111 112 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 112 113 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) ··· 119 118 #define _TIF_MCA_INIT (1 << TIF_MCA_INIT) 120 119 #define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED) 121 120 #define _TIF_RESTORE_RSE (1 << TIF_RESTORE_RSE) 121 + #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) 122 122 123 123 /* "work to do on user-return" bits */ 124 124 #define TIF_ALLWORK_MASK (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\ ··· 127 125 /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ 128 126 #define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) 129 127 130 - #define TS_POLLING 1 /* true if in idle loop and not sleeping */ 131 128 #define TS_RESTORE_SIGMASK 2 /* restore signal mask in do_signal() */ 132 129 133 130 #ifndef __ASSEMBLY__
-24
arch/ia64/include/asm/topology.h
··· 46 46 47 47 void build_cpu_to_node_map(void); 48 48 49 - #define SD_CPU_INIT (struct sched_domain) { \ 50 - .parent = NULL, \ 51 - .child = NULL, \ 52 - .groups = NULL, \ 53 - .min_interval = 1, \ 54 - .max_interval = 4, \ 55 - .busy_factor = 64, \ 56 - .imbalance_pct = 125, \ 57 - .cache_nice_tries = 2, \ 58 - .busy_idx = 2, \ 59 - .idle_idx = 1, \ 60 - .newidle_idx = 0, \ 61 - .wake_idx = 0, \ 62 - .forkexec_idx = 0, \ 63 - .flags = SD_LOAD_BALANCE \ 64 - | SD_BALANCE_NEWIDLE \ 65 - | SD_BALANCE_EXEC \ 66 - | SD_BALANCE_FORK \ 67 - | SD_WAKE_AFFINE, \ 68 - .last_balance = jiffies, \ 69 - .balance_interval = 1, \ 70 - .nr_balance_failed = 0, \ 71 - } 72 - 73 49 #endif /* CONFIG_NUMA */ 74 50 75 51 #ifdef CONFIG_SMP
+2 -4
arch/metag/include/asm/thread_info.h
··· 117 117 #define TIF_SECCOMP 5 /* secure computing */ 118 118 #define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */ 119 119 #define TIF_NOTIFY_RESUME 7 /* callback before returning to user */ 120 - #define TIF_POLLING_NRFLAG 8 /* true if poll_idle() is polling 121 - TIF_NEED_RESCHED */ 122 - #define TIF_MEMDIE 9 /* is terminating due to OOM killer */ 123 - #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint instrumentation */ 120 + #define TIF_MEMDIE 8 /* is terminating due to OOM killer */ 121 + #define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */ 124 122 125 123 126 124 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
+23 -8
arch/powerpc/kernel/smp.c
··· 766 766 return 0; 767 767 } 768 768 769 + #ifdef CONFIG_SCHED_SMT 770 + /* cpumask of CPUs with asymetric SMT dependancy */ 771 + static const int powerpc_smt_flags(void) 772 + { 773 + int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; 774 + 775 + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { 776 + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); 777 + flags |= SD_ASYM_PACKING; 778 + } 779 + return flags; 780 + } 781 + #endif 782 + 783 + static struct sched_domain_topology_level powerpc_topology[] = { 784 + #ifdef CONFIG_SCHED_SMT 785 + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, 786 + #endif 787 + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 788 + { NULL, }, 789 + }; 790 + 769 791 void __init smp_cpus_done(unsigned int max_cpus) 770 792 { 771 793 cpumask_var_t old_mask; ··· 812 790 813 791 dump_numa_cpu_topology(); 814 792 815 - } 793 + set_sched_topology(powerpc_topology); 816 794 817 - int arch_sd_sibling_asym_packing(void) 818 - { 819 - if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { 820 - printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); 821 - return SD_ASYM_PACKING; 822 - } 823 - return 0; 824 795 } 825 796 826 797 #ifdef CONFIG_HOTPLUG_CPU
+1 -12
arch/s390/include/asm/topology.h
··· 26 26 27 27 #define mc_capable() 1 28 28 29 - static inline const struct cpumask *cpu_coregroup_mask(int cpu) 30 - { 31 - return &cpu_topology[cpu].core_mask; 32 - } 33 - 34 - static inline const struct cpumask *cpu_book_mask(int cpu) 35 - { 36 - return &cpu_topology[cpu].book_mask; 37 - } 38 - 39 29 int topology_cpu_init(struct cpu *); 40 30 int topology_set_cpu_management(int fc); 41 31 void topology_schedule_update(void); 42 32 void store_topology(struct sysinfo_15_1_x *info); 43 33 void topology_expect_change(void); 34 + const struct cpumask *cpu_coregroup_mask(int cpu); 44 35 45 36 #else /* CONFIG_SCHED_BOOK */ 46 37 ··· 54 63 { 55 64 }; 56 65 #endif 57 - 58 - #define SD_BOOK_INIT SD_CPU_INIT 59 66 60 67 #include <asm-generic/topology.h> 61 68
+20
arch/s390/kernel/topology.c
··· 445 445 return sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); 446 446 } 447 447 448 + const struct cpumask *cpu_coregroup_mask(int cpu) 449 + { 450 + return &cpu_topology[cpu].core_mask; 451 + } 452 + 453 + static const struct cpumask *cpu_book_mask(int cpu) 454 + { 455 + return &cpu_topology[cpu].book_mask; 456 + } 457 + 458 + static struct sched_domain_topology_level s390_topology[] = { 459 + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 460 + { cpu_book_mask, SD_INIT_NAME(BOOK) }, 461 + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 462 + { NULL, }, 463 + }; 464 + 448 465 static int __init topology_init(void) 449 466 { 450 467 if (!MACHINE_HAS_TOPOLOGY) { ··· 470 453 } 471 454 set_topology_timer(); 472 455 out: 456 + 457 + set_sched_topology(s390_topology); 458 + 473 459 return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); 474 460 } 475 461 device_initcall(topology_init);
+2 -1
arch/tile/include/asm/thread_info.h
··· 129 129 #define TIF_MEMDIE 7 /* OOM killer at work */ 130 130 #define TIF_NOTIFY_RESUME 8 /* callback before returning to user */ 131 131 #define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */ 132 + #define TIF_POLLING_NRFLAG 10 /* idle is polling for TIF_NEED_RESCHED */ 132 133 133 134 #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) 134 135 #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) ··· 141 140 #define _TIF_MEMDIE (1<<TIF_MEMDIE) 142 141 #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) 143 142 #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) 143 + #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) 144 144 145 145 /* Work to do on any return to user space. */ 146 146 #define _TIF_ALLWORK_MASK \ ··· 164 162 #ifdef __tilegx__ 165 163 #define TS_COMPAT 0x0001 /* 32-bit compatibility mode */ 166 164 #endif 167 - #define TS_POLLING 0x0004 /* in idle loop but not sleeping */ 168 165 #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal */ 169 166 170 167 #ifndef __ASSEMBLY__
-33
arch/tile/include/asm/topology.h
··· 44 44 /* For now, use numa node -1 for global allocation. */ 45 45 #define pcibus_to_node(bus) ((void)(bus), -1) 46 46 47 - /* 48 - * TILE architecture has many cores integrated in one processor, so we need 49 - * setup bigger balance_interval for both CPU/NODE scheduling domains to 50 - * reduce process scheduling costs. 51 - */ 52 - 53 - /* sched_domains SD_CPU_INIT for TILE architecture */ 54 - #define SD_CPU_INIT (struct sched_domain) { \ 55 - .min_interval = 4, \ 56 - .max_interval = 128, \ 57 - .busy_factor = 64, \ 58 - .imbalance_pct = 125, \ 59 - .cache_nice_tries = 1, \ 60 - .busy_idx = 2, \ 61 - .idle_idx = 1, \ 62 - .newidle_idx = 0, \ 63 - .wake_idx = 0, \ 64 - .forkexec_idx = 0, \ 65 - \ 66 - .flags = 1*SD_LOAD_BALANCE \ 67 - | 1*SD_BALANCE_NEWIDLE \ 68 - | 1*SD_BALANCE_EXEC \ 69 - | 1*SD_BALANCE_FORK \ 70 - | 0*SD_BALANCE_WAKE \ 71 - | 0*SD_WAKE_AFFINE \ 72 - | 0*SD_SHARE_CPUPOWER \ 73 - | 0*SD_SHARE_PKG_RESOURCES \ 74 - | 0*SD_SERIALIZE \ 75 - , \ 76 - .last_balance = jiffies, \ 77 - .balance_interval = 32, \ 78 - } 79 - 80 47 /* By definition, we create nodes based on online memory. */ 81 48 #define node_has_online_mem(nid) 1 82 49
+2 -2
arch/x86/include/asm/thread_info.h
··· 83 83 #define TIF_FORK 18 /* ret_from_fork */ 84 84 #define TIF_NOHZ 19 /* in adaptive nohz mode */ 85 85 #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ 86 + #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ 86 87 #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ 87 88 #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 88 89 #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ ··· 107 106 #define _TIF_IA32 (1 << TIF_IA32) 108 107 #define _TIF_FORK (1 << TIF_FORK) 109 108 #define _TIF_NOHZ (1 << TIF_NOHZ) 109 + #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) 110 110 #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) 111 111 #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 112 112 #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) ··· 193 191 * have to worry about atomic accesses. 194 192 */ 195 193 #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ 196 - #define TS_POLLING 0x0004 /* idle task polling need_resched, 197 - skip sending interrupt */ 198 194 #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ 199 195 200 196 #ifndef __ASSEMBLY__
-11
arch/x86/kernel/apm_32.c
··· 844 844 int polling; 845 845 int err = 0; 846 846 847 - polling = !!(current_thread_info()->status & TS_POLLING); 848 - if (polling) { 849 - current_thread_info()->status &= ~TS_POLLING; 850 - /* 851 - * TS_POLLING-cleared state must be visible before we 852 - * test NEED_RESCHED: 853 - */ 854 - smp_mb(); 855 - } 856 847 if (!need_resched()) { 857 848 idled = 1; 858 849 ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); 859 850 } 860 - if (polling) 861 - current_thread_info()->status |= TS_POLLING; 862 851 863 852 if (!idled) 864 853 return 0;
+1 -1
drivers/block/loop.c
··· 548 548 struct loop_device *lo = data; 549 549 struct bio *bio; 550 550 551 - set_user_nice(current, -20); 551 + set_user_nice(current, MIN_NICE); 552 552 553 553 while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { 554 554
+1 -1
drivers/block/nbd.c
··· 533 533 struct nbd_device *nbd = data; 534 534 struct request *req; 535 535 536 - set_user_nice(current, -20); 536 + set_user_nice(current, MIN_NICE); 537 537 while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { 538 538 /* wait for something to do */ 539 539 wait_event_interruptible(nbd->waiting_wq,
+1 -1
drivers/block/pktcdvd.c
··· 1463 1463 struct packet_data *pkt; 1464 1464 long min_sleep_time, residue; 1465 1465 1466 - set_user_nice(current, -20); 1466 + set_user_nice(current, MIN_NICE); 1467 1467 set_freezable(); 1468 1468 1469 1469 for (;;) {
+1 -1
drivers/char/ipmi/ipmi_si_intf.c
··· 1007 1007 struct timespec busy_until; 1008 1008 1009 1009 ipmi_si_set_not_busy(&busy_until); 1010 - set_user_nice(current, 19); 1010 + set_user_nice(current, MAX_NICE); 1011 1011 while (!kthread_should_stop()) { 1012 1012 int busy_wait; 1013 1013
+42 -13
drivers/cpuidle/cpuidle.c
··· 32 32 static int enabled_devices; 33 33 static int off __read_mostly; 34 34 static int initialized __read_mostly; 35 + static bool use_deepest_state __read_mostly; 35 36 36 37 int cpuidle_disabled(void) 37 38 { ··· 66 65 } 67 66 68 67 /** 69 - * cpuidle_enabled - check if the cpuidle framework is ready 70 - * @dev: cpuidle device for this cpu 71 - * @drv: cpuidle driver for this cpu 68 + * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode. 69 + * @enable: Whether enable or disable the feature. 72 70 * 73 - * Return 0 on success, otherwise: 74 - * -NODEV : the cpuidle framework is not available 75 - * -EBUSY : the cpuidle framework is not initialized 71 + * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and 72 + * always use the state with the greatest exit latency (out of the states that 73 + * are not disabled). 74 + * 75 + * This function can only be called after cpuidle_pause() to avoid races. 76 76 */ 77 - int cpuidle_enabled(struct cpuidle_driver *drv, struct cpuidle_device *dev) 77 + void cpuidle_use_deepest_state(bool enable) 78 78 { 79 - if (off || !initialized) 80 - return -ENODEV; 79 + use_deepest_state = enable; 80 + } 81 81 82 - if (!drv || !dev || !dev->enabled) 83 - return -EBUSY; 82 + /** 83 + * cpuidle_find_deepest_state - Find the state of the greatest exit latency. 84 + * @drv: cpuidle driver for a given CPU. 85 + * @dev: cpuidle device for a given CPU.
86 + */ 87 + static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, 88 + struct cpuidle_device *dev) 89 + { 90 + unsigned int latency_req = 0; 91 + int i, ret = CPUIDLE_DRIVER_STATE_START - 1; 84 92 85 - return 0; 93 + for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { 94 + struct cpuidle_state *s = &drv->states[i]; 95 + struct cpuidle_state_usage *su = &dev->states_usage[i]; 96 + 97 + if (s->disabled || su->disable || s->exit_latency <= latency_req) 98 + continue; 99 + 100 + latency_req = s->exit_latency; 101 + ret = i; 102 + } 103 + return ret; 86 104 } 87 105 88 106 /** ··· 158 138 */ 159 139 int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) 160 140 { 141 + if (off || !initialized) 142 + return -ENODEV; 143 + 144 + if (!drv || !dev || !dev->enabled) 145 + return -EBUSY; 146 + 147 + if (unlikely(use_deepest_state)) 148 + return cpuidle_find_deepest_state(drv, dev); 149 + 161 150 return cpuidle_curr_governor->select(drv, dev); 162 151 } 163 152 ··· 198 169 */ 199 170 void cpuidle_reflect(struct cpuidle_device *dev, int index) 200 171 { 201 - if (cpuidle_curr_governor->reflect) 172 + if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state)) 202 173 cpuidle_curr_governor->reflect(dev, index); 203 174 } 204 175
+9 -8
drivers/cpuidle/governors/menu.c
··· 296 296 data->needs_update = 0; 297 297 } 298 298 299 - data->last_state_idx = 0; 299 + data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1; 300 300 301 301 /* Special case when user has set very strict latency requirement */ 302 302 if (unlikely(latency_req == 0)) ··· 309 309 310 310 311 311 data->bucket = which_bucket(data->next_timer_us); 312 - 313 - /* 314 - * if the correction factor is 0 (eg first time init or cpu hotplug 315 - * etc), we actually want to start out with a unity factor. 316 - */ 317 - if (data->correction_factor[data->bucket] == 0) 318 - data->correction_factor[data->bucket] = RESOLUTION * DECAY; 319 312 320 313 /* 321 314 * Force the result of multiplication to be 64 bits even if both ··· 459 466 struct cpuidle_device *dev) 460 467 { 461 468 struct menu_device *data = &per_cpu(menu_devices, dev->cpu); 469 + int i; 462 470 463 471 memset(data, 0, sizeof(struct menu_device)); 472 + 473 + /* 474 + * if the correction factor is 0 (eg first time init or cpu hotplug 475 + * etc), we actually want to start out with a unity factor. 476 + */ 477 + for(i = 0; i < BUCKETS; i++) 478 + data->correction_factor[i] = RESOLUTION * DECAY; 464 479 465 480 return 0; 466 481 }
+1 -1
drivers/s390/crypto/ap_bus.c
··· 1803 1803 int requests; 1804 1804 struct ap_device *ap_dev; 1805 1805 1806 - set_user_nice(current, 19); 1806 + set_user_nice(current, MAX_NICE); 1807 1807 while (1) { 1808 1808 if (ap_suspend_flag) 1809 1809 return 0;
+2 -2
drivers/scsi/bnx2fc/bnx2fc_fcoe.c
··· 464 464 struct fcoe_percpu_s *bg = arg; 465 465 struct sk_buff *skb; 466 466 467 - set_user_nice(current, -20); 467 + set_user_nice(current, MIN_NICE); 468 468 set_current_state(TASK_INTERRUPTIBLE); 469 469 while (!kthread_should_stop()) { 470 470 schedule(); ··· 602 602 struct bnx2fc_work *work, *tmp; 603 603 LIST_HEAD(work_list); 604 604 605 - set_user_nice(current, -20); 605 + set_user_nice(current, MIN_NICE); 606 606 set_current_state(TASK_INTERRUPTIBLE); 607 607 while (!kthread_should_stop()) { 608 608 schedule();
+1 -1
drivers/scsi/bnx2i/bnx2i_hwi.c
··· 1870 1870 struct bnx2i_work *work, *tmp; 1871 1871 LIST_HEAD(work_list); 1872 1872 1873 - set_user_nice(current, -20); 1873 + set_user_nice(current, MIN_NICE); 1874 1874 1875 1875 while (!kthread_should_stop()) { 1876 1876 spin_lock_bh(&p->p_work_lock);
+1 -1
drivers/scsi/fcoe/fcoe.c
··· 1872 1872 1873 1873 skb_queue_head_init(&tmp); 1874 1874 1875 - set_user_nice(current, -20); 1875 + set_user_nice(current, MIN_NICE); 1876 1876 1877 1877 retry: 1878 1878 while (!kthread_should_stop()) {
+1 -1
drivers/scsi/ibmvscsi/ibmvfc.c
··· 4515 4515 struct ibmvfc_host *vhost = data; 4516 4516 int rc; 4517 4517 4518 - set_user_nice(current, -20); 4518 + set_user_nice(current, MIN_NICE); 4519 4519 4520 4520 while (1) { 4521 4521 rc = wait_event_interruptible(vhost->work_wait_q,
+1 -1
drivers/scsi/ibmvscsi/ibmvscsi.c
··· 2213 2213 struct ibmvscsi_host_data *hostdata = data; 2214 2214 int rc; 2215 2215 2216 - set_user_nice(current, -20); 2216 + set_user_nice(current, MIN_NICE); 2217 2217 2218 2218 while (1) { 2219 2219 rc = wait_event_interruptible(hostdata->work_wait_q,
+1 -1
drivers/scsi/lpfc/lpfc_hbadisc.c
··· 731 731 struct lpfc_hba *phba = p; 732 732 int rc; 733 733 734 - set_user_nice(current, -20); 734 + set_user_nice(current, MIN_NICE); 735 735 current->flags |= PF_NOFREEZE; 736 736 phba->data_flags = 0; 737 737
+1 -1
drivers/scsi/qla2xxx/qla_os.c
··· 4828 4828 ha = (struct qla_hw_data *)data; 4829 4829 base_vha = pci_get_drvdata(ha->pdev); 4830 4830 4831 - set_user_nice(current, -20); 4831 + set_user_nice(current, MIN_NICE); 4832 4832 4833 4833 set_current_state(TASK_INTERRUPTIBLE); 4834 4834 while (!kthread_should_stop()) {
+2 -2
drivers/staging/android/binder.c
··· 439 439 set_user_nice(current, nice); 440 440 return; 441 441 } 442 - min_nice = 20 - current->signal->rlim[RLIMIT_NICE].rlim_cur; 442 + min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur); 443 443 binder_debug(BINDER_DEBUG_PRIORITY_CAP, 444 444 "%d: nice value %ld not allowed use %ld instead\n", 445 445 current->pid, nice, min_nice); 446 446 set_user_nice(current, min_nice); 447 - if (min_nice < 20) 447 + if (min_nice <= MAX_NICE) 448 448 return; 449 449 binder_user_error("%d RLIMIT_NICE not set\n", current->pid); 450 450 }
+1 -1
drivers/staging/lustre/lustre/llite/lloop.c
··· 404 404 int refcheck; 405 405 int ret = 0; 406 406 407 - set_user_nice(current, -20); 407 + set_user_nice(current, MIN_NICE); 408 408 409 409 lo->lo_state = LLOOP_BOUND; 410 410
+1 -1
fs/ocfs2/cluster/heartbeat.c
··· 1107 1107 1108 1108 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); 1109 1109 1110 - set_user_nice(current, -20); 1110 + set_user_nice(current, MIN_NICE); 1111 1111 1112 1112 /* Pin node */ 1113 1113 o2nm_depend_this_node();
+2 -5
include/linux/cpuidle.h
··· 120 120 #ifdef CONFIG_CPU_IDLE 121 121 extern void disable_cpuidle(void); 122 122 123 - extern int cpuidle_enabled(struct cpuidle_driver *drv, 124 - struct cpuidle_device *dev); 125 123 extern int cpuidle_select(struct cpuidle_driver *drv, 126 124 struct cpuidle_device *dev); 127 125 extern int cpuidle_enter(struct cpuidle_driver *drv, ··· 143 145 extern int cpuidle_enable_device(struct cpuidle_device *dev); 144 146 extern void cpuidle_disable_device(struct cpuidle_device *dev); 145 147 extern int cpuidle_play_dead(void); 148 + extern void cpuidle_use_deepest_state(bool enable); 146 149 147 150 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); 148 151 #else 149 152 static inline void disable_cpuidle(void) { } 150 - static inline int cpuidle_enabled(struct cpuidle_driver *drv, 151 - struct cpuidle_device *dev) 152 - {return -ENODEV; } 153 153 static inline int cpuidle_select(struct cpuidle_driver *drv, 154 154 struct cpuidle_device *dev) 155 155 {return -ENODEV; } ··· 176 180 {return -ENODEV; } 177 181 static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } 178 182 static inline int cpuidle_play_dead(void) {return -ENODEV; } 183 + static inline void cpuidle_use_deepest_state(bool enable) {} 179 184 static inline struct cpuidle_driver *cpuidle_get_cpu_driver( 180 185 struct cpuidle_device *dev) {return NULL; } 181 186 #endif
+57 -47
include/linux/sched.h
··· 870 870 #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ 871 871 #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ 872 872 #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ 873 + #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ 873 874 #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ 874 875 #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ 875 876 #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ ··· 878 877 #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ 879 878 #define SD_NUMA 0x4000 /* cross-node balancing */ 880 879 881 - extern int __weak arch_sd_sibiling_asym_packing(void); 880 + #ifdef CONFIG_SCHED_SMT 881 + static inline const int cpu_smt_flags(void) 882 + { 883 + return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; 884 + } 885 + #endif 886 + 887 + #ifdef CONFIG_SCHED_MC 888 + static inline const int cpu_core_flags(void) 889 + { 890 + return SD_SHARE_PKG_RESOURCES; 891 + } 892 + #endif 893 + 894 + #ifdef CONFIG_NUMA 895 + static inline const int cpu_numa_flags(void) 896 + { 897 + return SD_NUMA; 898 + } 899 + #endif 882 900 883 901 struct sched_domain_attr { 884 902 int relax_domain_level; ··· 1004 984 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); 1005 985 1006 986 bool cpus_share_cache(int this_cpu, int that_cpu); 987 + 988 + typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 989 + typedef const int (*sched_domain_flags_f)(void); 990 + 991 + #define SDTL_OVERLAP 0x01 992 + 993 + struct sd_data { 994 + struct sched_domain **__percpu sd; 995 + struct sched_group **__percpu sg; 996 + struct sched_group_power **__percpu sgp; 997 + }; 998 + 999 + struct sched_domain_topology_level { 1000 + sched_domain_mask_f mask; 1001 + sched_domain_flags_f sd_flags; 1002 + int flags; 1003 + int numa_level; 1004 + struct sd_data data; 1005 + #ifdef CONFIG_SCHED_DEBUG 1006 + 
char *name; 1007 + #endif 1008 + }; 1009 + 1010 + extern struct sched_domain_topology_level *sched_domain_topology; 1011 + 1012 + extern void set_sched_topology(struct sched_domain_topology_level *tl); 1013 + 1014 + #ifdef CONFIG_SCHED_DEBUG 1015 + # define SD_INIT_NAME(type) .name = #type 1016 + #else 1017 + # define SD_INIT_NAME(type) 1018 + #endif 1007 1019 1008 1020 #else /* CONFIG_SMP */ 1009 1021 ··· 1175 1123 1176 1124 /* 1177 1125 * Original scheduling parameters. Copied here from sched_attr 1178 - * during sched_setscheduler2(), they will remain the same until 1179 - * the next sched_setscheduler2(). 1126 + * during sched_setattr(), they will remain the same until 1127 + * the next sched_setattr(). 1180 1128 */ 1181 1129 u64 dl_runtime; /* maximum runtime for each instance */ 1182 1130 u64 dl_deadline; /* relative deadline of each instance */ ··· 2775 2723 2776 2724 /* 2777 2725 * Idle thread specific functions to determine the need_resched 2778 - * polling state. We have two versions, one based on TS_POLLING in 2779 - * thread_info.status and one based on TIF_POLLING_NRFLAG in 2780 - * thread_info.flags 2726 + * polling state. 
2781 2727 */ 2782 - #ifdef TS_POLLING 2783 - static inline int tsk_is_polling(struct task_struct *p) 2784 - { 2785 - return task_thread_info(p)->status & TS_POLLING; 2786 - } 2787 - static inline void __current_set_polling(void) 2788 - { 2789 - current_thread_info()->status |= TS_POLLING; 2790 - } 2791 - 2792 - static inline bool __must_check current_set_polling_and_test(void) 2793 - { 2794 - __current_set_polling(); 2795 - 2796 - /* 2797 - * Polling state must be visible before we test NEED_RESCHED, 2798 - * paired by resched_task() 2799 - */ 2800 - smp_mb(); 2801 - 2802 - return unlikely(tif_need_resched()); 2803 - } 2804 - 2805 - static inline void __current_clr_polling(void) 2806 - { 2807 - current_thread_info()->status &= ~TS_POLLING; 2808 - } 2809 - 2810 - static inline bool __must_check current_clr_polling_and_test(void) 2811 - { 2812 - __current_clr_polling(); 2813 - 2814 - /* 2815 - * Polling state must be visible before we test NEED_RESCHED, 2816 - * paired by resched_task() 2817 - */ 2818 - smp_mb(); 2819 - 2820 - return unlikely(tif_need_resched()); 2821 - } 2822 - #elif defined(TIF_POLLING_NRFLAG) 2728 + #ifdef TIF_POLLING_NRFLAG 2823 2729 static inline int tsk_is_polling(struct task_struct *p) 2824 2730 { 2825 2731 return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
+16
include/linux/sched/prio.h
··· 41 41 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 42 42 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 43 43 44 + /* 45 + * Convert nice value [19,-20] to rlimit style value [1,40]. 46 + */ 47 + static inline long nice_to_rlimit(long nice) 48 + { 49 + return (MAX_NICE - nice + 1); 50 + } 51 + 52 + /* 53 + * Convert rlimit style value [1,40] to nice value [-20, 19]. 54 + */ 55 + static inline long rlimit_to_nice(long prio) 56 + { 57 + return (MAX_NICE - prio + 1); 58 + } 59 + 44 60 #endif /* _SCHED_PRIO_H */
-14
include/linux/thread_info.h
··· 104 104 #define test_thread_flag(flag) \ 105 105 test_ti_thread_flag(current_thread_info(), flag) 106 106 107 - static inline __deprecated void set_need_resched(void) 108 - { 109 - /* 110 - * Use of this function in deprecated. 111 - * 112 - * As of this writing there are only a few users in the DRM tree left 113 - * all of which are wrong and can be removed without causing too much 114 - * grief. 115 - * 116 - * The DRM people are aware and are working on removing the last few 117 - * instances. 118 - */ 119 - } 120 - 121 107 #define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) 122 108 123 109 #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
+13 -115
include/linux/topology.h
··· 66 66 #define PENALTY_FOR_NODE_WITH_CPUS (1) 67 67 #endif 68 68 69 - /* 70 - * Below are the 3 major initializers used in building sched_domains: 71 - * SD_SIBLING_INIT, for SMT domains 72 - * SD_CPU_INIT, for SMP domains 73 - * 74 - * Any architecture that cares to do any tuning to these values should do so 75 - * by defining their own arch-specific initializer in include/asm/topology.h. 76 - * A definition there will automagically override these default initializers 77 - * and allow arch-specific performance tuning of sched_domains. 78 - * (Only non-zero and non-null fields need be specified.) 79 - */ 80 - 81 - #ifdef CONFIG_SCHED_SMT 82 - /* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, 83 - * so can't we drop this in favor of CONFIG_SCHED_SMT? 84 - */ 85 - #define ARCH_HAS_SCHED_WAKE_IDLE 86 - /* Common values for SMT siblings */ 87 - #ifndef SD_SIBLING_INIT 88 - #define SD_SIBLING_INIT (struct sched_domain) { \ 89 - .min_interval = 1, \ 90 - .max_interval = 2, \ 91 - .busy_factor = 64, \ 92 - .imbalance_pct = 110, \ 93 - \ 94 - .flags = 1*SD_LOAD_BALANCE \ 95 - | 1*SD_BALANCE_NEWIDLE \ 96 - | 1*SD_BALANCE_EXEC \ 97 - | 1*SD_BALANCE_FORK \ 98 - | 0*SD_BALANCE_WAKE \ 99 - | 1*SD_WAKE_AFFINE \ 100 - | 1*SD_SHARE_CPUPOWER \ 101 - | 1*SD_SHARE_PKG_RESOURCES \ 102 - | 0*SD_SERIALIZE \ 103 - | 0*SD_PREFER_SIBLING \ 104 - | arch_sd_sibling_asym_packing() \ 105 - , \ 106 - .last_balance = jiffies, \ 107 - .balance_interval = 1, \ 108 - .smt_gain = 1178, /* 15% */ \ 109 - .max_newidle_lb_cost = 0, \ 110 - .next_decay_max_lb_cost = jiffies, \ 111 - } 112 - #endif 113 - #endif /* CONFIG_SCHED_SMT */ 114 - 115 - #ifdef CONFIG_SCHED_MC 116 - /* Common values for MC siblings. 
for now mostly derived from SD_CPU_INIT */ 117 - #ifndef SD_MC_INIT 118 - #define SD_MC_INIT (struct sched_domain) { \ 119 - .min_interval = 1, \ 120 - .max_interval = 4, \ 121 - .busy_factor = 64, \ 122 - .imbalance_pct = 125, \ 123 - .cache_nice_tries = 1, \ 124 - .busy_idx = 2, \ 125 - .wake_idx = 0, \ 126 - .forkexec_idx = 0, \ 127 - \ 128 - .flags = 1*SD_LOAD_BALANCE \ 129 - | 1*SD_BALANCE_NEWIDLE \ 130 - | 1*SD_BALANCE_EXEC \ 131 - | 1*SD_BALANCE_FORK \ 132 - | 0*SD_BALANCE_WAKE \ 133 - | 1*SD_WAKE_AFFINE \ 134 - | 0*SD_SHARE_CPUPOWER \ 135 - | 1*SD_SHARE_PKG_RESOURCES \ 136 - | 0*SD_SERIALIZE \ 137 - , \ 138 - .last_balance = jiffies, \ 139 - .balance_interval = 1, \ 140 - .max_newidle_lb_cost = 0, \ 141 - .next_decay_max_lb_cost = jiffies, \ 142 - } 143 - #endif 144 - #endif /* CONFIG_SCHED_MC */ 145 - 146 - /* Common values for CPUs */ 147 - #ifndef SD_CPU_INIT 148 - #define SD_CPU_INIT (struct sched_domain) { \ 149 - .min_interval = 1, \ 150 - .max_interval = 4, \ 151 - .busy_factor = 64, \ 152 - .imbalance_pct = 125, \ 153 - .cache_nice_tries = 1, \ 154 - .busy_idx = 2, \ 155 - .idle_idx = 1, \ 156 - .newidle_idx = 0, \ 157 - .wake_idx = 0, \ 158 - .forkexec_idx = 0, \ 159 - \ 160 - .flags = 1*SD_LOAD_BALANCE \ 161 - | 1*SD_BALANCE_NEWIDLE \ 162 - | 1*SD_BALANCE_EXEC \ 163 - | 1*SD_BALANCE_FORK \ 164 - | 0*SD_BALANCE_WAKE \ 165 - | 1*SD_WAKE_AFFINE \ 166 - | 0*SD_SHARE_CPUPOWER \ 167 - | 0*SD_SHARE_PKG_RESOURCES \ 168 - | 0*SD_SERIALIZE \ 169 - | 1*SD_PREFER_SIBLING \ 170 - , \ 171 - .last_balance = jiffies, \ 172 - .balance_interval = 1, \ 173 - .max_newidle_lb_cost = 0, \ 174 - .next_decay_max_lb_cost = jiffies, \ 175 - } 176 - #endif 177 - 178 - #ifdef CONFIG_SCHED_BOOK 179 - #ifndef SD_BOOK_INIT 180 - #error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! 
181 - #endif 182 - #endif /* CONFIG_SCHED_BOOK */ 183 - 184 69 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 185 70 DECLARE_PER_CPU(int, numa_node); 186 71 ··· 179 294 #ifndef topology_core_cpumask 180 295 #define topology_core_cpumask(cpu) cpumask_of(cpu) 181 296 #endif 297 + 298 + #ifdef CONFIG_SCHED_SMT 299 + static inline const struct cpumask *cpu_smt_mask(int cpu) 300 + { 301 + return topology_thread_cpumask(cpu); 302 + } 303 + #endif 304 + 305 + static inline const struct cpumask *cpu_cpu_mask(int cpu) 306 + { 307 + return cpumask_of_node(cpu_to_node(cpu)); 308 + } 309 + 182 310 183 311 #endif /* _LINUX_TOPOLOGY_H */
+1 -1
kernel/locking/locktorture.c
··· 216 216 static DEFINE_TORTURE_RANDOM(rand); 217 217 218 218 VERBOSE_TOROUT_STRING("lock_torture_writer task started"); 219 - set_user_nice(current, 19); 219 + set_user_nice(current, MAX_NICE); 220 220 221 221 do { 222 222 if ((torture_random(&rand) & 0xfffff) == 0)
+2
kernel/power/suspend.c
··· 54 54 55 55 static void freeze_enter(void) 56 56 { 57 + cpuidle_use_deepest_state(true); 57 58 cpuidle_resume(); 58 59 wait_event(suspend_freeze_wait_head, suspend_freeze_wake); 59 60 cpuidle_pause(); 61 + cpuidle_use_deepest_state(false); 60 62 } 61 63 62 64 void freeze_wake(void)
+176 -148
kernel/sched/core.c
··· 522 522 #endif /* CONFIG_SCHED_HRTICK */ 523 523 524 524 /* 525 + * cmpxchg based fetch_or, macro so it works for different integer types 526 + */ 527 + #define fetch_or(ptr, val) \ 528 + ({ typeof(*(ptr)) __old, __val = *(ptr); \ 529 + for (;;) { \ 530 + __old = cmpxchg((ptr), __val, __val | (val)); \ 531 + if (__old == __val) \ 532 + break; \ 533 + __val = __old; \ 534 + } \ 535 + __old; \ 536 + }) 537 + 538 + #ifdef TIF_POLLING_NRFLAG 539 + /* 540 + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, 541 + * this avoids any races wrt polling state changes and thereby avoids 542 + * spurious IPIs. 543 + */ 544 + static bool set_nr_and_not_polling(struct task_struct *p) 545 + { 546 + struct thread_info *ti = task_thread_info(p); 547 + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); 548 + } 549 + #else 550 + static bool set_nr_and_not_polling(struct task_struct *p) 551 + { 552 + set_tsk_need_resched(p); 553 + return true; 554 + } 555 + #endif 556 + 557 + /* 525 558 * resched_task - mark a task 'to be rescheduled now'. 
526 559 * 527 560 * On UP this means the setting of the need_resched flag, on SMP it ··· 570 537 if (test_tsk_need_resched(p)) 571 538 return; 572 539 573 - set_tsk_need_resched(p); 574 - 575 540 cpu = task_cpu(p); 541 + 576 542 if (cpu == smp_processor_id()) { 543 + set_tsk_need_resched(p); 577 544 set_preempt_need_resched(); 578 545 return; 579 546 } 580 547 581 - /* NEED_RESCHED must be visible before we test polling */ 582 - smp_mb(); 583 - if (!tsk_is_polling(p)) 548 + if (set_nr_and_not_polling(p)) 584 549 smp_send_reschedule(cpu); 585 550 } 586 551 ··· 3049 3018 int can_nice(const struct task_struct *p, const int nice) 3050 3019 { 3051 3020 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3052 - int nice_rlim = 20 - nice; 3021 + int nice_rlim = nice_to_rlimit(nice); 3053 3022 3054 3023 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3055 3024 capable(CAP_SYS_NICE)); ··· 3073 3042 * We don't have to worry. Conceptually one call occurs first 3074 3043 * and we have a single winner. 
3075 3044 */ 3076 - if (increment < -40) 3077 - increment = -40; 3078 - if (increment > 40) 3079 - increment = 40; 3080 - 3045 + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); 3081 3046 nice = task_nice(current) + increment; 3082 - if (nice < MIN_NICE) 3083 - nice = MIN_NICE; 3084 - if (nice > MAX_NICE) 3085 - nice = MAX_NICE; 3086 3047 3048 + nice = clamp_val(nice, MIN_NICE, MAX_NICE); 3087 3049 if (increment < 0 && !can_nice(current, nice)) 3088 3050 return -EPERM; 3089 3051 ··· 3666 3642 */ 3667 3643 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3668 3644 3669 - out: 3670 - return ret; 3645 + return 0; 3671 3646 3672 3647 err_size: 3673 3648 put_user(sizeof(*attr), &uattr->size); 3674 - ret = -E2BIG; 3675 - goto out; 3649 + return -E2BIG; 3676 3650 } 3677 3651 3678 3652 /** ··· 3830 3808 3831 3809 for (; addr < end; addr++) { 3832 3810 if (*addr) 3833 - goto err_size; 3811 + return -EFBIG; 3834 3812 } 3835 3813 3836 3814 attr->size = usize; ··· 3840 3818 if (ret) 3841 3819 return -EFAULT; 3842 3820 3843 - out: 3844 - return ret; 3845 - 3846 - err_size: 3847 - ret = -E2BIG; 3848 - goto out; 3821 + return 0; 3849 3822 } 3850 3823 3851 3824 /** ··· 5110 5093 .priority = CPU_PRI_MIGRATION, 5111 5094 }; 5112 5095 5096 + static void __cpuinit set_cpu_rq_start_time(void) 5097 + { 5098 + int cpu = smp_processor_id(); 5099 + struct rq *rq = cpu_rq(cpu); 5100 + rq->age_stamp = sched_clock_cpu(cpu); 5101 + } 5102 + 5113 5103 static int sched_cpu_active(struct notifier_block *nfb, 5114 5104 unsigned long action, void *hcpu) 5115 5105 { 5116 5106 switch (action & ~CPU_TASKS_FROZEN) { 5107 + case CPU_STARTING: 5108 + set_cpu_rq_start_time(); 5109 + return NOTIFY_OK; 5117 5110 case CPU_DOWN_FAILED: 5118 5111 set_cpu_active((long)hcpu, true); 5119 5112 return NOTIFY_OK; ··· 5332 5305 SD_BALANCE_FORK | 5333 5306 SD_BALANCE_EXEC | 5334 5307 SD_SHARE_CPUPOWER | 5335 - SD_SHARE_PKG_RESOURCES)) { 5308 + SD_SHARE_PKG_RESOURCES | 5309 + SD_SHARE_POWERDOMAIN)) 
{ 5336 5310 if (sd->groups != sd->groups->next) 5337 5311 return 0; 5338 5312 } ··· 5364 5336 SD_BALANCE_EXEC | 5365 5337 SD_SHARE_CPUPOWER | 5366 5338 SD_SHARE_PKG_RESOURCES | 5367 - SD_PREFER_SIBLING); 5339 + SD_PREFER_SIBLING | 5340 + SD_SHARE_POWERDOMAIN); 5368 5341 if (nr_node_ids == 1) 5369 5342 pflags &= ~SD_SERIALIZE; 5370 5343 } ··· 5639 5610 5640 5611 __setup("isolcpus=", isolated_cpu_setup); 5641 5612 5642 - static const struct cpumask *cpu_cpu_mask(int cpu) 5643 - { 5644 - return cpumask_of_node(cpu_to_node(cpu)); 5645 - } 5646 - 5647 - struct sd_data { 5648 - struct sched_domain **__percpu sd; 5649 - struct sched_group **__percpu sg; 5650 - struct sched_group_power **__percpu sgp; 5651 - }; 5652 - 5653 5613 struct s_data { 5654 5614 struct sched_domain ** __percpu sd; 5655 5615 struct root_domain *rd; ··· 5649 5631 sa_sd, 5650 5632 sa_sd_storage, 5651 5633 sa_none, 5652 - }; 5653 - 5654 - struct sched_domain_topology_level; 5655 - 5656 - typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 5657 - typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 5658 - 5659 - #define SDTL_OVERLAP 0x01 5660 - 5661 - struct sched_domain_topology_level { 5662 - sched_domain_init_f init; 5663 - sched_domain_mask_f mask; 5664 - int flags; 5665 - int numa_level; 5666 - struct sd_data data; 5667 5634 }; 5668 5635 5669 5636 /* ··· 5818 5815 continue; 5819 5816 5820 5817 group = get_group(i, sdd, &sg); 5821 - cpumask_clear(sched_group_cpus(sg)); 5822 - sg->sgp->power = 0; 5823 5818 cpumask_setall(sched_group_mask(sg)); 5824 5819 5825 5820 for_each_cpu(j, span) { ··· 5867 5866 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5868 5867 } 5869 5868 5870 - int __weak arch_sd_sibling_asym_packing(void) 5871 - { 5872 - return 0*SD_ASYM_PACKING; 5873 - } 5874 - 5875 5869 /* 5876 5870 * Initializers for schedule domains 5877 5871 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5878 5872 */ 
5879 - 5880 - #ifdef CONFIG_SCHED_DEBUG 5881 - # define SD_INIT_NAME(sd, type) sd->name = #type 5882 - #else 5883 - # define SD_INIT_NAME(sd, type) do { } while (0) 5884 - #endif 5885 - 5886 - #define SD_INIT_FUNC(type) \ 5887 - static noinline struct sched_domain * \ 5888 - sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 5889 - { \ 5890 - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 5891 - *sd = SD_##type##_INIT; \ 5892 - SD_INIT_NAME(sd, type); \ 5893 - sd->private = &tl->data; \ 5894 - return sd; \ 5895 - } 5896 - 5897 - SD_INIT_FUNC(CPU) 5898 - #ifdef CONFIG_SCHED_SMT 5899 - SD_INIT_FUNC(SIBLING) 5900 - #endif 5901 - #ifdef CONFIG_SCHED_MC 5902 - SD_INIT_FUNC(MC) 5903 - #endif 5904 - #ifdef CONFIG_SCHED_BOOK 5905 - SD_INIT_FUNC(BOOK) 5906 - #endif 5907 5873 5908 5874 static int default_relax_domain_level = -1; 5909 5875 int sched_domain_level_max; ··· 5959 5991 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5960 5992 } 5961 5993 5962 - #ifdef CONFIG_SCHED_SMT 5963 - static const struct cpumask *cpu_smt_mask(int cpu) 5964 - { 5965 - return topology_thread_cpumask(cpu); 5966 - } 5967 - #endif 5968 - 5969 - /* 5970 - * Topology list, bottom-up. 
5971 - */ 5972 - static struct sched_domain_topology_level default_topology[] = { 5973 - #ifdef CONFIG_SCHED_SMT 5974 - { sd_init_SIBLING, cpu_smt_mask, }, 5975 - #endif 5976 - #ifdef CONFIG_SCHED_MC 5977 - { sd_init_MC, cpu_coregroup_mask, }, 5978 - #endif 5979 - #ifdef CONFIG_SCHED_BOOK 5980 - { sd_init_BOOK, cpu_book_mask, }, 5981 - #endif 5982 - { sd_init_CPU, cpu_cpu_mask, }, 5983 - { NULL, }, 5984 - }; 5985 - 5986 - static struct sched_domain_topology_level *sched_domain_topology = default_topology; 5987 - 5988 - #define for_each_sd_topology(tl) \ 5989 - for (tl = sched_domain_topology; tl->init; tl++) 5990 - 5991 5994 #ifdef CONFIG_NUMA 5992 - 5993 5995 static int sched_domains_numa_levels; 5994 5996 static int *sched_domains_numa_distance; 5995 5997 static struct cpumask ***sched_domains_numa_masks; 5996 5998 static int sched_domains_curr_level; 5999 + #endif 5997 6000 5998 - static inline int sd_local_flags(int level) 5999 - { 6000 - if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 6001 - return 0; 6002 - 6003 - return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6004 - } 6001 + /* 6002 + * SD_flags allowed in topology descriptions. 
6003 + * 6004 + * SD_SHARE_CPUPOWER - describes SMT topologies 6005 + * SD_SHARE_PKG_RESOURCES - describes shared caches 6006 + * SD_NUMA - describes NUMA topologies 6007 + * SD_SHARE_POWERDOMAIN - describes shared power domain 6008 + * 6009 + * Odd one out: 6010 + * SD_ASYM_PACKING - describes SMT quirks 6011 + */ 6012 + #define TOPOLOGY_SD_FLAGS \ 6013 + (SD_SHARE_CPUPOWER | \ 6014 + SD_SHARE_PKG_RESOURCES | \ 6015 + SD_NUMA | \ 6016 + SD_ASYM_PACKING | \ 6017 + SD_SHARE_POWERDOMAIN) 6005 6018 6006 6019 static struct sched_domain * 6007 - sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 6020 + sd_init(struct sched_domain_topology_level *tl, int cpu) 6008 6021 { 6009 6022 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6010 - int level = tl->numa_level; 6011 - int sd_weight = cpumask_weight( 6012 - sched_domains_numa_masks[level][cpu_to_node(cpu)]); 6023 + int sd_weight, sd_flags = 0; 6024 + 6025 + #ifdef CONFIG_NUMA 6026 + /* 6027 + * Ugly hack to pass state to sd_numa_mask()... 
6028 + */ 6029 + sched_domains_curr_level = tl->numa_level; 6030 + #endif 6031 + 6032 + sd_weight = cpumask_weight(tl->mask(cpu)); 6033 + 6034 + if (tl->sd_flags) 6035 + sd_flags = (*tl->sd_flags)(); 6036 + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, 6037 + "wrong sd_flags in topology description\n")) 6038 + sd_flags &= ~TOPOLOGY_SD_FLAGS; 6013 6039 6014 6040 *sd = (struct sched_domain){ 6015 6041 .min_interval = sd_weight, 6016 6042 .max_interval = 2*sd_weight, 6017 6043 .busy_factor = 32, 6018 6044 .imbalance_pct = 125, 6019 - .cache_nice_tries = 2, 6020 - .busy_idx = 3, 6021 - .idle_idx = 2, 6045 + 6046 + .cache_nice_tries = 0, 6047 + .busy_idx = 0, 6048 + .idle_idx = 0, 6022 6049 .newidle_idx = 0, 6023 6050 .wake_idx = 0, 6024 6051 .forkexec_idx = 0, 6025 6052 6026 6053 .flags = 1*SD_LOAD_BALANCE 6027 6054 | 1*SD_BALANCE_NEWIDLE 6028 - | 0*SD_BALANCE_EXEC 6029 - | 0*SD_BALANCE_FORK 6055 + | 1*SD_BALANCE_EXEC 6056 + | 1*SD_BALANCE_FORK 6030 6057 | 0*SD_BALANCE_WAKE 6031 - | 0*SD_WAKE_AFFINE 6058 + | 1*SD_WAKE_AFFINE 6032 6059 | 0*SD_SHARE_CPUPOWER 6033 6060 | 0*SD_SHARE_PKG_RESOURCES 6034 - | 1*SD_SERIALIZE 6061 + | 0*SD_SERIALIZE 6035 6062 | 0*SD_PREFER_SIBLING 6036 - | 1*SD_NUMA 6037 - | sd_local_flags(level) 6063 + | 0*SD_NUMA 6064 + | sd_flags 6038 6065 , 6066 + 6039 6067 .last_balance = jiffies, 6040 6068 .balance_interval = sd_weight, 6069 + .smt_gain = 0, 6041 6070 .max_newidle_lb_cost = 0, 6042 6071 .next_decay_max_lb_cost = jiffies, 6072 + #ifdef CONFIG_SCHED_DEBUG 6073 + .name = tl->name, 6074 + #endif 6043 6075 }; 6044 - SD_INIT_NAME(sd, NUMA); 6045 - sd->private = &tl->data; 6046 6076 6047 6077 /* 6048 - * Ugly hack to pass state to sd_numa_mask()... 6078 + * Convert topological properties into behaviour. 
6049 6079 */ 6050 - sched_domains_curr_level = tl->numa_level; 6080 + 6081 + if (sd->flags & SD_SHARE_CPUPOWER) { 6082 + sd->imbalance_pct = 110; 6083 + sd->smt_gain = 1178; /* ~15% */ 6084 + 6085 + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 6086 + sd->imbalance_pct = 117; 6087 + sd->cache_nice_tries = 1; 6088 + sd->busy_idx = 2; 6089 + 6090 + #ifdef CONFIG_NUMA 6091 + } else if (sd->flags & SD_NUMA) { 6092 + sd->cache_nice_tries = 2; 6093 + sd->busy_idx = 3; 6094 + sd->idle_idx = 2; 6095 + 6096 + sd->flags |= SD_SERIALIZE; 6097 + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { 6098 + sd->flags &= ~(SD_BALANCE_EXEC | 6099 + SD_BALANCE_FORK | 6100 + SD_WAKE_AFFINE); 6101 + } 6102 + 6103 + #endif 6104 + } else { 6105 + sd->flags |= SD_PREFER_SIBLING; 6106 + sd->cache_nice_tries = 1; 6107 + sd->busy_idx = 2; 6108 + sd->idle_idx = 1; 6109 + } 6110 + 6111 + sd->private = &tl->data; 6051 6112 6052 6113 return sd; 6053 6114 } 6115 + 6116 + /* 6117 + * Topology list, bottom-up. 
6118 + */ 6119 + static struct sched_domain_topology_level default_topology[] = { 6120 + #ifdef CONFIG_SCHED_SMT 6121 + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, 6122 + #endif 6123 + #ifdef CONFIG_SCHED_MC 6124 + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 6125 + #endif 6126 + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 6127 + { NULL, }, 6128 + }; 6129 + 6130 + struct sched_domain_topology_level *sched_domain_topology = default_topology; 6131 + 6132 + #define for_each_sd_topology(tl) \ 6133 + for (tl = sched_domain_topology; tl->mask; tl++) 6134 + 6135 + void set_sched_topology(struct sched_domain_topology_level *tl) 6136 + { 6137 + sched_domain_topology = tl; 6138 + } 6139 + 6140 + #ifdef CONFIG_NUMA 6054 6141 6055 6142 static const struct cpumask *sd_numa_mask(int cpu) 6056 6143 { ··· 6250 6227 } 6251 6228 } 6252 6229 6253 - tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6230 + /* Compute default topology size */ 6231 + for (i = 0; sched_domain_topology[i].mask; i++); 6232 + 6233 + tl = kzalloc((i + level + 1) * 6254 6234 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6255 6235 if (!tl) 6256 6236 return; ··· 6261 6235 /* 6262 6236 * Copy the default topology bits.. 6263 6237 */ 6264 - for (i = 0; default_topology[i].init; i++) 6265 - tl[i] = default_topology[i]; 6238 + for (i = 0; sched_domain_topology[i].mask; i++) 6239 + tl[i] = sched_domain_topology[i]; 6266 6240 6267 6241 /* 6268 6242 * .. and append 'j' levels of NUMA goodness. 
6269 6243 */ 6270 6244 for (j = 0; j < level; i++, j++) { 6271 6245 tl[i] = (struct sched_domain_topology_level){ 6272 - .init = sd_numa_init, 6273 6246 .mask = sd_numa_mask, 6247 + .sd_flags = cpu_numa_flags, 6274 6248 .flags = SDTL_OVERLAP, 6275 6249 .numa_level = j, 6250 + SD_INIT_NAME(NUMA) 6276 6251 }; 6277 6252 } 6278 6253 ··· 6431 6404 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6432 6405 struct sched_domain *child, int cpu) 6433 6406 { 6434 - struct sched_domain *sd = tl->init(tl, cpu); 6407 + struct sched_domain *sd = sd_init(tl, cpu); 6435 6408 if (!sd) 6436 6409 return child; 6437 6410 ··· 7001 6974 if (cpu_isolated_map == NULL) 7002 6975 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7003 6976 idle_thread_set_boot_cpu(); 6977 + set_cpu_rq_start_time(); 7004 6978 #endif 7005 6979 init_sched_fair_class(); 7006 6980
+3 -3
kernel/sched/deadline.c
··· 520 520 * We need to take care of a possible races here. In fact, the 521 521 * task might have changed its scheduling policy to something 522 522 * different from SCHED_DEADLINE or changed its reservation 523 - * parameters (through sched_setscheduler()). 523 + * parameters (through sched_setattr()). 524 524 */ 525 525 if (!dl_task(p) || dl_se->dl_new) 526 526 goto unlock; ··· 741 741 742 742 WARN_ON(!dl_prio(prio)); 743 743 dl_rq->dl_nr_running++; 744 - inc_nr_running(rq_of_dl_rq(dl_rq)); 744 + add_nr_running(rq_of_dl_rq(dl_rq), 1); 745 745 746 746 inc_dl_deadline(dl_rq, deadline); 747 747 inc_dl_migration(dl_se, dl_rq); ··· 755 755 WARN_ON(!dl_prio(prio)); 756 756 WARN_ON(!dl_rq->dl_nr_running); 757 757 dl_rq->dl_nr_running--; 758 - dec_nr_running(rq_of_dl_rq(dl_rq)); 758 + sub_nr_running(rq_of_dl_rq(dl_rq), 1); 759 759 760 760 dec_dl_deadline(dl_rq, dl_se->deadline); 761 761 dec_dl_migration(dl_se, dl_rq);
+163 -65
kernel/sched/fair.c
··· 1095 1095 env->best_cpu = env->dst_cpu; 1096 1096 } 1097 1097 1098 + static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, 1099 + long src_load, long dst_load, 1100 + struct task_numa_env *env) 1101 + { 1102 + long imb, old_imb; 1103 + 1104 + /* We care about the slope of the imbalance, not the direction. */ 1105 + if (dst_load < src_load) 1106 + swap(dst_load, src_load); 1107 + 1108 + /* Is the difference below the threshold? */ 1109 + imb = dst_load * 100 - src_load * env->imbalance_pct; 1110 + if (imb <= 0) 1111 + return false; 1112 + 1113 + /* 1114 + * The imbalance is above the allowed threshold. 1115 + * Compare it with the old imbalance. 1116 + */ 1117 + if (orig_dst_load < orig_src_load) 1118 + swap(orig_dst_load, orig_src_load); 1119 + 1120 + old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; 1121 + 1122 + /* Would this change make things worse? */ 1123 + return (old_imb > imb); 1124 + } 1125 + 1098 1126 /* 1099 1127 * This checks if the overall compute and NUMA accesses of the system would 1100 1128 * be improved if the source tasks was migrated to the target dst_cpu taking ··· 1135 1107 struct rq *src_rq = cpu_rq(env->src_cpu); 1136 1108 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1137 1109 struct task_struct *cur; 1138 - long dst_load, src_load; 1110 + long orig_src_load, src_load; 1111 + long orig_dst_load, dst_load; 1139 1112 long load; 1140 1113 long imp = (groupimp > 0) ? groupimp : taskimp; 1141 1114 ··· 1210 1181 * In the overloaded case, try and keep the load balanced. 
1211 1182 */ 1212 1183 balance: 1213 - dst_load = env->dst_stats.load; 1214 - src_load = env->src_stats.load; 1184 + orig_dst_load = env->dst_stats.load; 1185 + orig_src_load = env->src_stats.load; 1215 1186 1216 1187 /* XXX missing power terms */ 1217 1188 load = task_h_load(env->p); 1218 - dst_load += load; 1219 - src_load -= load; 1189 + dst_load = orig_dst_load + load; 1190 + src_load = orig_src_load - load; 1220 1191 1221 1192 if (cur) { 1222 1193 load = task_h_load(cur); ··· 1224 1195 src_load += load; 1225 1196 } 1226 1197 1227 - /* make src_load the smaller */ 1228 - if (dst_load < src_load) 1229 - swap(dst_load, src_load); 1230 - 1231 - if (src_load * env->imbalance_pct < dst_load * 100) 1198 + if (load_too_imbalanced(orig_src_load, orig_dst_load, 1199 + src_load, dst_load, env)) 1232 1200 goto unlock; 1233 1201 1234 1202 assign: ··· 1327 1301 if (env.best_cpu == -1) 1328 1302 return -EAGAIN; 1329 1303 1330 - sched_setnuma(p, env.dst_nid); 1304 + /* 1305 + * If the task is part of a workload that spans multiple NUMA nodes, 1306 + * and is migrating into one of the workload's active nodes, remember 1307 + * this node as the task's preferred numa node, so the workload can 1308 + * settle down. 1309 + * A task that migrated to a second choice node will be better off 1310 + * trying for a better one later. Do not set the preferred node here. 1311 + */ 1312 + if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) 1313 + sched_setnuma(p, env.dst_nid); 1331 1314 1332 1315 /* 1333 1316 * Reset the scan period if the task is being rescheduled on an ··· 1361 1326 /* Attempt to migrate a task to a CPU on the preferred node. 
*/ 1362 1327 static void numa_migrate_preferred(struct task_struct *p) 1363 1328 { 1329 + unsigned long interval = HZ; 1330 + 1364 1331 /* This task has no NUMA fault statistics yet */ 1365 1332 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1366 1333 return; 1367 1334 1368 1335 /* Periodically retry migrating the task to the preferred node */ 1369 - p->numa_migrate_retry = jiffies + HZ; 1336 + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); 1337 + p->numa_migrate_retry = jiffies + interval; 1370 1338 1371 1339 /* Success if task is already running on preferred CPU */ 1372 1340 if (task_node(p) == p->numa_preferred_nid) ··· 1776 1738 struct task_struct *p = current; 1777 1739 bool migrated = flags & TNF_MIGRATED; 1778 1740 int cpu_node = task_node(current); 1741 + int local = !!(flags & TNF_FAULT_LOCAL); 1779 1742 int priv; 1780 1743 1781 1744 if (!numabalancing_enabled) ··· 1825 1786 task_numa_group(p, last_cpupid, flags, &priv); 1826 1787 } 1827 1788 1789 + /* 1790 + * If a workload spans multiple NUMA nodes, a shared fault that 1791 + * occurs wholly within the set of nodes that the workload is 1792 + * actively using should be counted as local. This allows the 1793 + * scan rate to slow down when a workload has settled down. 
1794 + */ 1795 + if (!priv && !local && p->numa_group && 1796 + node_isset(cpu_node, p->numa_group->active_nodes) && 1797 + node_isset(mem_node, p->numa_group->active_nodes)) 1798 + local = 1; 1799 + 1828 1800 task_numa_placement(p); 1829 1801 1830 1802 /* ··· 1850 1800 1851 1801 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 1852 1802 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 1853 - p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1803 + p->numa_faults_locality[local] += pages; 1854 1804 } 1855 1805 1856 1806 static void reset_ptenuma_scan(struct task_struct *p) ··· 3351 3301 } 3352 3302 3353 3303 if (!se) 3354 - rq->nr_running -= task_delta; 3304 + sub_nr_running(rq, task_delta); 3355 3305 3356 3306 cfs_rq->throttled = 1; 3357 3307 cfs_rq->throttled_clock = rq_clock(rq); ··· 3402 3352 } 3403 3353 3404 3354 if (!se) 3405 - rq->nr_running += task_delta; 3355 + add_nr_running(rq, task_delta); 3406 3356 3407 3357 /* determine whether we need to wake up potentially idle cpu */ 3408 3358 if (rq->curr == rq->idle && rq->cfs.nr_running) ··· 3934 3884 3935 3885 if (!se) { 3936 3886 update_rq_runnable_avg(rq, rq->nr_running); 3937 - inc_nr_running(rq); 3887 + add_nr_running(rq, 1); 3938 3888 } 3939 3889 hrtick_update(rq); 3940 3890 } ··· 3994 3944 } 3995 3945 3996 3946 if (!se) { 3997 - dec_nr_running(rq); 3947 + sub_nr_running(rq, 1); 3998 3948 update_rq_runnable_avg(rq, 1); 3999 3949 } 4000 3950 hrtick_update(rq); ··· 4065 4015 * about the loss. 
4066 4016 */ 4067 4017 if (jiffies > current->wakee_flip_decay_ts + HZ) { 4068 - current->wakee_flips = 0; 4018 + current->wakee_flips >>= 1; 4069 4019 current->wakee_flip_decay_ts = jiffies; 4070 4020 } 4071 4021 ··· 4499 4449 sd = tmp; 4500 4450 } 4501 4451 4502 - if (affine_sd) { 4503 - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 4504 - prev_cpu = cpu; 4452 + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 4453 + prev_cpu = cpu; 4505 4454 4455 + if (sd_flag & SD_BALANCE_WAKE) { 4506 4456 new_cpu = select_idle_sibling(p, prev_cpu); 4507 4457 goto unlock; 4508 4458 } ··· 4570 4520 atomic_long_add(se->avg.load_avg_contrib, 4571 4521 &cfs_rq->removed_load); 4572 4522 } 4523 + 4524 + /* We have migrated, no longer consider this task hot */ 4525 + se->exec_start = 0; 4573 4526 } 4574 4527 #endif /* CONFIG_SMP */ 4575 4528 ··· 5123 5070 /* Returns true if the destination node has incurred more faults */ 5124 5071 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5125 5072 { 5073 + struct numa_group *numa_group = rcu_dereference(p->numa_group); 5126 5074 int src_nid, dst_nid; 5127 5075 5128 5076 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || ··· 5137 5083 if (src_nid == dst_nid) 5138 5084 return false; 5139 5085 5140 - /* Always encourage migration to the preferred node. */ 5086 + if (numa_group) { 5087 + /* Task is already in the group's interleave set. */ 5088 + if (node_isset(src_nid, numa_group->active_nodes)) 5089 + return false; 5090 + 5091 + /* Task is moving into the group's interleave set. */ 5092 + if (node_isset(dst_nid, numa_group->active_nodes)) 5093 + return true; 5094 + 5095 + return group_faults(p, dst_nid) > group_faults(p, src_nid); 5096 + } 5097 + 5098 + /* Encourage migration to the preferred node. */ 5141 5099 if (dst_nid == p->numa_preferred_nid) 5142 5100 return true; 5143 5101 5144 - /* If both task and group weight improve, this move is a winner. 
*/ 5145 - if (task_weight(p, dst_nid) > task_weight(p, src_nid) && 5146 - group_weight(p, dst_nid) > group_weight(p, src_nid)) 5147 - return true; 5148 - 5149 - return false; 5102 + return task_faults(p, dst_nid) > task_faults(p, src_nid); 5150 5103 } 5151 5104 5152 5105 5153 5106 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5154 5107 { 5108 + struct numa_group *numa_group = rcu_dereference(p->numa_group); 5155 5109 int src_nid, dst_nid; 5156 5110 5157 5111 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) ··· 5174 5112 if (src_nid == dst_nid) 5175 5113 return false; 5176 5114 5115 + if (numa_group) { 5116 + /* Task is moving within/into the group's interleave set. */ 5117 + if (node_isset(dst_nid, numa_group->active_nodes)) 5118 + return false; 5119 + 5120 + /* Task is moving out of the group's interleave set. */ 5121 + if (node_isset(src_nid, numa_group->active_nodes)) 5122 + return true; 5123 + 5124 + return group_faults(p, dst_nid) < group_faults(p, src_nid); 5125 + } 5126 + 5177 5127 /* Migrating away from the preferred node is always bad. */ 5178 5128 if (src_nid == p->numa_preferred_nid) 5179 5129 return true; 5180 5130 5181 - /* If either task or group weight get worse, don't do it. 
*/ 5182 - if (task_weight(p, dst_nid) < task_weight(p, src_nid) || 5183 - group_weight(p, dst_nid) < group_weight(p, src_nid)) 5184 - return true; 5185 - 5186 - return false; 5131 + return task_faults(p, dst_nid) < task_faults(p, src_nid); 5187 5132 } 5188 5133 5189 5134 #else ··· 5633 5564 { 5634 5565 struct rq *rq = cpu_rq(cpu); 5635 5566 u64 total, available, age_stamp, avg; 5567 + s64 delta; 5636 5568 5637 5569 /* 5638 5570 * Since we're reading these variables without serialization make sure ··· 5642 5572 age_stamp = ACCESS_ONCE(rq->age_stamp); 5643 5573 avg = ACCESS_ONCE(rq->rt_avg); 5644 5574 5645 - total = sched_avg_period() + (rq_clock(rq) - age_stamp); 5575 + delta = rq_clock(rq) - age_stamp; 5576 + if (unlikely(delta < 0)) 5577 + delta = 0; 5578 + 5579 + total = sched_avg_period() + delta; 5646 5580 5647 5581 if (unlikely(total < avg)) { 5648 5582 /* Ensures that power won't end up being negative */ ··· 6714 6640 return ld_moved; 6715 6641 } 6716 6642 6643 + static inline unsigned long 6644 + get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) 6645 + { 6646 + unsigned long interval = sd->balance_interval; 6647 + 6648 + if (cpu_busy) 6649 + interval *= sd->busy_factor; 6650 + 6651 + /* scale ms to jiffies */ 6652 + interval = msecs_to_jiffies(interval); 6653 + interval = clamp(interval, 1UL, max_load_balance_interval); 6654 + 6655 + return interval; 6656 + } 6657 + 6658 + static inline void 6659 + update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) 6660 + { 6661 + unsigned long interval, next; 6662 + 6663 + interval = get_sd_balance_interval(sd, cpu_busy); 6664 + next = sd->last_balance + interval; 6665 + 6666 + if (time_after(*next_balance, next)) 6667 + *next_balance = next; 6668 + } 6669 + 6717 6670 /* 6718 6671 * idle_balance is called by schedule() if this_cpu is about to become 6719 6672 * idle. Attempts to pull tasks from other CPUs. 
6720 6673 */ 6721 6674 static int idle_balance(struct rq *this_rq) 6722 6675 { 6676 + unsigned long next_balance = jiffies + HZ; 6677 + int this_cpu = this_rq->cpu; 6723 6678 struct sched_domain *sd; 6724 6679 int pulled_task = 0; 6725 - unsigned long next_balance = jiffies + HZ; 6726 6680 u64 curr_cost = 0; 6727 - int this_cpu = this_rq->cpu; 6728 6681 6729 6682 idle_enter_fair(this_rq); 6730 6683 ··· 6761 6660 */ 6762 6661 this_rq->idle_stamp = rq_clock(this_rq); 6763 6662 6764 - if (this_rq->avg_idle < sysctl_sched_migration_cost) 6663 + if (this_rq->avg_idle < sysctl_sched_migration_cost) { 6664 + rcu_read_lock(); 6665 + sd = rcu_dereference_check_sched_domain(this_rq->sd); 6666 + if (sd) 6667 + update_next_balance(sd, 0, &next_balance); 6668 + rcu_read_unlock(); 6669 + 6765 6670 goto out; 6671 + } 6766 6672 6767 6673 /* 6768 6674 * Drop the rq->lock, but keep IRQ/preempt disabled. ··· 6779 6671 update_blocked_averages(this_cpu); 6780 6672 rcu_read_lock(); 6781 6673 for_each_domain(this_cpu, sd) { 6782 - unsigned long interval; 6783 6674 int continue_balancing = 1; 6784 6675 u64 t0, domain_cost; 6785 6676 6786 6677 if (!(sd->flags & SD_LOAD_BALANCE)) 6787 6678 continue; 6788 6679 6789 - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) 6680 + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { 6681 + update_next_balance(sd, 0, &next_balance); 6790 6682 break; 6683 + } 6791 6684 6792 6685 if (sd->flags & SD_BALANCE_NEWIDLE) { 6793 6686 t0 = sched_clock_cpu(this_cpu); 6794 6687 6795 - /* If we've pulled tasks over stop searching: */ 6796 6688 pulled_task = load_balance(this_cpu, this_rq, 6797 6689 sd, CPU_NEWLY_IDLE, 6798 6690 &continue_balancing); ··· 6804 6696 curr_cost += domain_cost; 6805 6697 } 6806 6698 6807 - interval = msecs_to_jiffies(sd->balance_interval); 6808 - if (time_after(next_balance, sd->last_balance + interval)) 6809 - next_balance = sd->last_balance + interval; 6810 - if (pulled_task) 6699 + update_next_balance(sd, 0, 
&next_balance); 6700 + 6701 + /* 6702 + * Stop searching for tasks to pull if there are 6703 + * now runnable tasks on this rq. 6704 + */ 6705 + if (pulled_task || this_rq->nr_running > 0) 6811 6706 break; 6812 6707 } 6813 6708 rcu_read_unlock(); ··· 6828 6717 if (this_rq->cfs.h_nr_running && !pulled_task) 6829 6718 pulled_task = 1; 6830 6719 6831 - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6832 - /* 6833 - * We are going idle. next_balance may be set based on 6834 - * a busy processor. So reset next_balance. 6835 - */ 6836 - this_rq->next_balance = next_balance; 6837 - } 6838 - 6839 6720 out: 6721 + /* Move the next balance forward */ 6722 + if (time_after(this_rq->next_balance, next_balance)) 6723 + this_rq->next_balance = next_balance; 6724 + 6840 6725 /* Is there a task of a high priority class? */ 6841 - if (this_rq->nr_running != this_rq->cfs.h_nr_running && 6842 - ((this_rq->stop && this_rq->stop->on_rq) || 6843 - this_rq->dl.dl_nr_running || 6844 - (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) 6726 + if (this_rq->nr_running != this_rq->cfs.h_nr_running) 6845 6727 pulled_task = -1; 6846 6728 6847 6729 if (pulled_task) { ··· 7115 7011 break; 7116 7012 } 7117 7013 7118 - interval = sd->balance_interval; 7119 - if (idle != CPU_IDLE) 7120 - interval *= sd->busy_factor; 7121 - 7122 - /* scale ms to jiffies */ 7123 - interval = msecs_to_jiffies(interval); 7124 - interval = clamp(interval, 1UL, max_load_balance_interval); 7014 + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); 7125 7015 7126 7016 need_serialize = sd->flags & SD_SERIALIZE; 7127 - 7128 7017 if (need_serialize) { 7129 7018 if (!spin_trylock(&balancing)) 7130 7019 goto out; ··· 7133 7036 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; 7134 7037 } 7135 7038 sd->last_balance = jiffies; 7039 + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); 7136 7040 } 7137 7041 if (need_serialize) 7138 7042 spin_unlock(&balancing);
+61 -81
kernel/sched/idle.c
··· 67 67 * cpuidle_idle_call - the main idle function 68 68 * 69 69 * NOTE: no locks or semaphores should be used here 70 - * return non-zero on failure 71 70 */ 72 - static int cpuidle_idle_call(void) 71 + static void cpuidle_idle_call(void) 73 72 { 74 73 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 75 74 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 76 - int next_state, entered_state, ret; 75 + int next_state, entered_state; 77 76 bool broadcast; 78 77 79 78 /* 80 79 * Check if the idle task must be rescheduled. If it is the 81 - * case, exit the function after re-enabling the local irq and 82 - * set again the polling flag 80 + * case, exit the function after re-enabling the local irq. 83 81 */ 84 - if (current_clr_polling_and_test()) { 82 + if (need_resched()) { 85 83 local_irq_enable(); 86 - __current_set_polling(); 87 - return 0; 84 + return; 88 85 } 89 86 90 87 /* ··· 98 101 rcu_idle_enter(); 99 102 100 103 /* 101 - * Check if the cpuidle framework is ready, otherwise fallback 102 - * to the default arch specific idle method 104 + * Ask the cpuidle framework to choose a convenient idle state. 105 + * Fall back to the default arch idle method on errors. 103 106 */ 104 - ret = cpuidle_enabled(drv, dev); 105 - 106 - if (!ret) { 107 + next_state = cpuidle_select(drv, dev); 108 + if (next_state < 0) { 109 + use_default: 107 110 /* 108 - * Ask the governor to choose an idle state it thinks 109 - * it is convenient to go to. There is *always* a 110 - * convenient idle state 111 + * We can't use the cpuidle framework, let's use the default 112 + * idle routine. 
111 113 */ 112 - next_state = cpuidle_select(drv, dev); 113 - 114 - /* 115 - * The idle task must be scheduled, it is pointless to 116 - * go to idle, just update no idle residency and get 117 - * out of this function 118 - */ 119 - if (current_clr_polling_and_test()) { 120 - dev->last_residency = 0; 121 - entered_state = next_state; 114 + if (current_clr_polling_and_test()) 122 115 local_irq_enable(); 123 - } else { 124 - broadcast = !!(drv->states[next_state].flags & 125 - CPUIDLE_FLAG_TIMER_STOP); 116 + else 117 + arch_cpu_idle(); 126 118 127 - if (broadcast) 128 - /* 129 - * Tell the time framework to switch 130 - * to a broadcast timer because our 131 - * local timer will be shutdown. If a 132 - * local timer is used from another 133 - * cpu as a broadcast timer, this call 134 - * may fail if it is not available 135 - */ 136 - ret = clockevents_notify( 137 - CLOCK_EVT_NOTIFY_BROADCAST_ENTER, 138 - &dev->cpu); 139 - 140 - if (!ret) { 141 - trace_cpu_idle_rcuidle(next_state, dev->cpu); 142 - 143 - /* 144 - * Enter the idle state previously 145 - * returned by the governor 146 - * decision. 
This function will block 147 - * until an interrupt occurs and will 148 - * take care of re-enabling the local 149 - * interrupts 150 - */ 151 - entered_state = cpuidle_enter(drv, dev, 152 - next_state); 153 - 154 - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, 155 - dev->cpu); 156 - 157 - if (broadcast) 158 - clockevents_notify( 159 - CLOCK_EVT_NOTIFY_BROADCAST_EXIT, 160 - &dev->cpu); 161 - 162 - /* 163 - * Give the governor an opportunity to reflect on the 164 - * outcome 165 - */ 166 - cpuidle_reflect(dev, entered_state); 167 - } 168 - } 119 + goto exit_idle; 169 120 } 170 121 171 - /* 172 - * We can't use the cpuidle framework, let's use the default 173 - * idle routine 174 - */ 175 - if (ret) 176 - arch_cpu_idle(); 177 122 123 + /* 124 + * The idle task must be scheduled, it is pointless to 125 + * go to idle, just update no idle residency and get 126 + * out of this function 127 + */ 128 + if (current_clr_polling_and_test()) { 129 + dev->last_residency = 0; 130 + entered_state = next_state; 131 + local_irq_enable(); 132 + goto exit_idle; 133 + } 134 + 135 + broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); 136 + 137 + /* 138 + * Tell the time framework to switch to a broadcast timer 139 + * because our local timer will be shutdown. If a local timer 140 + * is used from another cpu as a broadcast timer, this call may 141 + * fail if it is not available 142 + */ 143 + if (broadcast && 144 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) 145 + goto use_default; 146 + 147 + trace_cpu_idle_rcuidle(next_state, dev->cpu); 148 + 149 + /* 150 + * Enter the idle state previously returned by the governor decision. 
151 + * This function will block until an interrupt occurs and will take 152 + * care of re-enabling the local interrupts 153 + */ 154 + entered_state = cpuidle_enter(drv, dev, next_state); 155 + 156 + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); 157 + 158 + if (broadcast) 159 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 160 + 161 + /* 162 + * Give the governor an opportunity to reflect on the outcome 163 + */ 164 + cpuidle_reflect(dev, entered_state); 165 + 166 + exit_idle: 178 167 __current_set_polling(); 179 168 180 169 /* 181 - * It is up to the idle functions to enable back the local 182 - * interrupt 170 + * It is up to the idle functions to reenable local interrupts 183 171 */ 184 172 if (WARN_ON_ONCE(irqs_disabled())) 185 173 local_irq_enable(); 186 174 187 175 rcu_idle_exit(); 188 176 start_critical_timings(); 189 - 190 - return 0; 191 177 } 192 178 193 179 /*
+101 -18
kernel/sched/rt.c
··· 79 79 rt_rq->overloaded = 0; 80 80 plist_head_init(&rt_rq->pushable_tasks); 81 81 #endif 82 + /* We start is dequeued state, because no RT tasks are queued */ 83 + rt_rq->rt_queued = 0; 82 84 83 85 rt_rq->rt_time = 0; 84 86 rt_rq->rt_throttled = 0; ··· 112 110 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 113 111 { 114 112 return rt_se->rt_rq; 113 + } 114 + 115 + static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) 116 + { 117 + struct rt_rq *rt_rq = rt_se->rt_rq; 118 + 119 + return rt_rq->rq; 115 120 } 116 121 117 122 void free_rt_sched_group(struct task_group *tg) ··· 220 211 return container_of(rt_rq, struct rq, rt); 221 212 } 222 213 223 - static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 214 + static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) 224 215 { 225 216 struct task_struct *p = rt_task_of(rt_se); 226 - struct rq *rq = task_rq(p); 217 + 218 + return task_rq(p); 219 + } 220 + 221 + static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 222 + { 223 + struct rq *rq = rq_of_rt_se(rt_se); 227 224 228 225 return &rq->rt; 229 226 } ··· 406 391 } 407 392 #endif /* CONFIG_SMP */ 408 393 394 + static void enqueue_top_rt_rq(struct rt_rq *rt_rq); 395 + static void dequeue_top_rt_rq(struct rt_rq *rt_rq); 396 + 409 397 static inline int on_rt_rq(struct sched_rt_entity *rt_se) 410 398 { 411 399 return !list_empty(&rt_se->run_list); ··· 470 452 rt_se = rt_rq->tg->rt_se[cpu]; 471 453 472 454 if (rt_rq->rt_nr_running) { 473 - if (rt_se && !on_rt_rq(rt_se)) 455 + if (!rt_se) 456 + enqueue_top_rt_rq(rt_rq); 457 + else if (!on_rt_rq(rt_se)) 474 458 enqueue_rt_entity(rt_se, false); 459 + 475 460 if (rt_rq->highest_prio.curr < curr->prio) 476 461 resched_task(curr); 477 462 } ··· 487 466 488 467 rt_se = rt_rq->tg->rt_se[cpu]; 489 468 490 - if (rt_se && on_rt_rq(rt_se)) 469 + if (!rt_se) 470 + dequeue_top_rt_rq(rt_rq); 471 + else if (on_rt_rq(rt_se)) 491 472 
dequeue_rt_entity(rt_se); 473 + } 474 + 475 + static inline int rt_rq_throttled(struct rt_rq *rt_rq) 476 + { 477 + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; 492 478 } 493 479 494 480 static int rt_se_boosted(struct sched_rt_entity *rt_se) ··· 560 532 561 533 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 562 534 { 563 - if (rt_rq->rt_nr_running) 564 - resched_task(rq_of_rt_rq(rt_rq)->curr); 535 + struct rq *rq = rq_of_rt_rq(rt_rq); 536 + 537 + if (!rt_rq->rt_nr_running) 538 + return; 539 + 540 + enqueue_top_rt_rq(rt_rq); 541 + resched_task(rq->curr); 565 542 } 566 543 567 544 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 568 545 { 546 + dequeue_top_rt_rq(rt_rq); 547 + } 548 + 549 + static inline int rt_rq_throttled(struct rt_rq *rt_rq) 550 + { 551 + return rt_rq->rt_throttled; 569 552 } 570 553 571 554 static inline const struct cpumask *sched_rt_period_mask(void) ··· 961 922 } 962 923 } 963 924 925 + static void 926 + dequeue_top_rt_rq(struct rt_rq *rt_rq) 927 + { 928 + struct rq *rq = rq_of_rt_rq(rt_rq); 929 + 930 + BUG_ON(&rq->rt != rt_rq); 931 + 932 + if (!rt_rq->rt_queued) 933 + return; 934 + 935 + BUG_ON(!rq->nr_running); 936 + 937 + sub_nr_running(rq, rt_rq->rt_nr_running); 938 + rt_rq->rt_queued = 0; 939 + } 940 + 941 + static void 942 + enqueue_top_rt_rq(struct rt_rq *rt_rq) 943 + { 944 + struct rq *rq = rq_of_rt_rq(rt_rq); 945 + 946 + BUG_ON(&rq->rt != rt_rq); 947 + 948 + if (rt_rq->rt_queued) 949 + return; 950 + if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) 951 + return; 952 + 953 + add_nr_running(rq, rt_rq->rt_nr_running); 954 + rt_rq->rt_queued = 1; 955 + } 956 + 964 957 #if defined CONFIG_SMP 965 958 966 959 static void ··· 1116 1045 #endif /* CONFIG_RT_GROUP_SCHED */ 1117 1046 1118 1047 static inline 1048 + unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) 1049 + { 1050 + struct rt_rq *group_rq = group_rt_rq(rt_se); 1051 + 1052 + if (group_rq) 1053 + return group_rq->rt_nr_running; 1054 + else 
1055 + return 1; 1056 + } 1057 + 1058 + static inline 1119 1059 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1120 1060 { 1121 1061 int prio = rt_se_prio(rt_se); 1122 1062 1123 1063 WARN_ON(!rt_prio(prio)); 1124 - rt_rq->rt_nr_running++; 1064 + rt_rq->rt_nr_running += rt_se_nr_running(rt_se); 1125 1065 1126 1066 inc_rt_prio(rt_rq, prio); 1127 1067 inc_rt_migration(rt_se, rt_rq); ··· 1144 1062 { 1145 1063 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 1146 1064 WARN_ON(!rt_rq->rt_nr_running); 1147 - rt_rq->rt_nr_running--; 1065 + rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); 1148 1066 1149 1067 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 1150 1068 dec_rt_migration(rt_se, rt_rq); ··· 1201 1119 back = rt_se; 1202 1120 } 1203 1121 1122 + dequeue_top_rt_rq(rt_rq_of_se(back)); 1123 + 1204 1124 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1205 1125 if (on_rt_rq(rt_se)) 1206 1126 __dequeue_rt_entity(rt_se); ··· 1211 1127 1212 1128 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) 1213 1129 { 1130 + struct rq *rq = rq_of_rt_se(rt_se); 1131 + 1214 1132 dequeue_rt_stack(rt_se); 1215 1133 for_each_sched_rt_entity(rt_se) 1216 1134 __enqueue_rt_entity(rt_se, head); 1135 + enqueue_top_rt_rq(&rq->rt); 1217 1136 } 1218 1137 1219 1138 static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 1220 1139 { 1140 + struct rq *rq = rq_of_rt_se(rt_se); 1141 + 1221 1142 dequeue_rt_stack(rt_se); 1222 1143 1223 1144 for_each_sched_rt_entity(rt_se) { ··· 1231 1142 if (rt_rq && rt_rq->rt_nr_running) 1232 1143 __enqueue_rt_entity(rt_se, false); 1233 1144 } 1145 + enqueue_top_rt_rq(&rq->rt); 1234 1146 } 1235 1147 1236 1148 /* ··· 1249 1159 1250 1160 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1251 1161 enqueue_pushable_task(rq, p); 1252 - 1253 - inc_nr_running(rq); 1254 1162 } 1255 1163 1256 1164 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) ··· 1259 1171 dequeue_rt_entity(rt_se); 1260 1172 1261 1173 
dequeue_pushable_task(rq, p); 1262 - 1263 - dec_nr_running(rq); 1264 1174 } 1265 1175 1266 1176 /* ··· 1463 1377 if (prev->sched_class == &rt_sched_class) 1464 1378 update_curr_rt(rq); 1465 1379 1466 - if (!rt_rq->rt_nr_running) 1467 - return NULL; 1468 - 1469 - if (rt_rq_throttled(rt_rq)) 1380 + if (!rt_rq->rt_queued) 1470 1381 return NULL; 1471 1382 1472 1383 put_prev_task(rq, prev); ··· 1975 1892 */ 1976 1893 if (p->on_rq && rq->curr != p) { 1977 1894 #ifdef CONFIG_SMP 1978 - if (rq->rt.overloaded && push_rt_task(rq) && 1895 + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && 1979 1896 /* Don't resched if we changed runqueues */ 1980 - rq != task_rq(p)) 1897 + push_rt_task(rq) && rq != task_rq(p)) 1981 1898 check_resched = 0; 1982 1899 #endif /* CONFIG_SMP */ 1983 1900 if (check_resched && p->prio < rq->curr->prio)
+9 -17
kernel/sched/sched.h
··· 409 409 int overloaded; 410 410 struct plist_head pushable_tasks; 411 411 #endif 412 + int rt_queued; 413 + 412 414 int rt_throttled; 413 415 u64 rt_time; 414 416 u64 rt_runtime; ··· 424 422 struct task_group *tg; 425 423 #endif 426 424 }; 427 - 428 - #ifdef CONFIG_RT_GROUP_SCHED 429 - static inline int rt_rq_throttled(struct rt_rq *rt_rq) 430 - { 431 - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; 432 - } 433 - #else 434 - static inline int rt_rq_throttled(struct rt_rq *rt_rq) 435 - { 436 - return rt_rq->rt_throttled; 437 - } 438 - #endif 439 425 440 426 /* Deadline class' related fields in a runqueue */ 441 427 struct dl_rq { ··· 1206 1216 1207 1217 extern void init_task_runnable_average(struct task_struct *p); 1208 1218 1209 - static inline void inc_nr_running(struct rq *rq) 1219 + static inline void add_nr_running(struct rq *rq, unsigned count) 1210 1220 { 1211 - rq->nr_running++; 1221 + unsigned prev_nr = rq->nr_running; 1222 + 1223 + rq->nr_running = prev_nr + count; 1212 1224 1213 1225 #ifdef CONFIG_NO_HZ_FULL 1214 - if (rq->nr_running == 2) { 1226 + if (prev_nr < 2 && rq->nr_running >= 2) { 1215 1227 if (tick_nohz_full_cpu(rq->cpu)) { 1216 1228 /* Order rq->nr_running write against the IPI */ 1217 1229 smp_wmb(); ··· 1223 1231 #endif 1224 1232 } 1225 1233 1226 - static inline void dec_nr_running(struct rq *rq) 1234 + static inline void sub_nr_running(struct rq *rq, unsigned count) 1227 1235 { 1228 - rq->nr_running--; 1236 + rq->nr_running -= count; 1229 1237 } 1230 1238 1231 1239 static inline void rq_last_tick_reset(struct rq *rq)
+2 -2
kernel/sched/stop_task.c
··· 41 41 static void 42 42 enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 43 43 { 44 - inc_nr_running(rq); 44 + add_nr_running(rq, 1); 45 45 } 46 46 47 47 static void 48 48 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 49 49 { 50 - dec_nr_running(rq); 50 + sub_nr_running(rq, 1); 51 51 } 52 52 53 53 static void yield_task_stop(struct rq *rq)
+3 -3
kernel/sys.c
··· 250 250 else 251 251 p = current; 252 252 if (p) { 253 - niceval = 20 - task_nice(p); 253 + niceval = nice_to_rlimit(task_nice(p)); 254 254 if (niceval > retval) 255 255 retval = niceval; 256 256 } ··· 261 261 else 262 262 pgrp = task_pgrp(current); 263 263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 264 - niceval = 20 - task_nice(p); 264 + niceval = nice_to_rlimit(task_nice(p)); 265 265 if (niceval > retval) 266 266 retval = niceval; 267 267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); ··· 277 277 278 278 do_each_thread(g, p) { 279 279 if (uid_eq(task_uid(p), uid)) { 280 - niceval = 20 - task_nice(p); 280 + niceval = nice_to_rlimit(task_nice(p)); 281 281 if (niceval > retval) 282 282 retval = niceval; 283 283 }
+3 -3
kernel/workqueue.c
··· 100 100 101 101 /* 102 102 * Rescue workers are used only on emergencies and shared by 103 - * all cpus. Give -20. 103 + * all cpus. Give MIN_NICE. 104 104 */ 105 - RESCUER_NICE_LEVEL = -20, 106 - HIGHPRI_NICE_LEVEL = -20, 105 + RESCUER_NICE_LEVEL = MIN_NICE, 106 + HIGHPRI_NICE_LEVEL = MIN_NICE, 107 107 108 108 WQ_NAME_LEN = 24, 109 109 };
+1 -1
mm/huge_memory.c
··· 2740 2740 struct mm_slot *mm_slot; 2741 2741 2742 2742 set_freezable(); 2743 - set_user_nice(current, 19); 2743 + set_user_nice(current, MAX_NICE); 2744 2744 2745 2745 while (!kthread_should_stop()) { 2746 2746 khugepaged_do_scan();
-3
mm/memory.c
··· 3920 3920 } 3921 3921 } 3922 3922 3923 - /* THP should already have been handled */ 3924 - BUG_ON(pmd_numa(*pmd)); 3925 - 3926 3923 /* 3927 3924 * Use __pte_alloc instead of pte_alloc_map, because we can't 3928 3925 * run pte_offset_map on the pmd, if an huge pmd could