Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:

- Add the initial implementation of SCHED_DEADLINE support: a real-time
scheduling policy where tasks that meet their deadlines and
periodically execute their instances in less than their runtime quota
see real-time scheduling and won't miss any of their deadlines.
Tasks that go over their quota get delayed (Available to privileged
users for now)

- Clean up and fix preempt_enable_no_resched() abuse all around the
tree

- Do sched_clock() performance optimizations on x86 and elsewhere

- Fix and improve auto-NUMA balancing

- Fix and clean up the idle loop

- Apply various cleanups and fixes

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
sched: Fix __sched_setscheduler() nice test
sched: Move SCHED_RESET_ON_FORK into attr::sched_flags
sched: Fix up attr::sched_priority warning
sched: Fix up scheduler syscall LTP fails
sched: Preserve the nice level over sched_setscheduler() and sched_setparam() calls
sched/core: Fix htmldocs warnings
sched/deadline: No need to check p if dl_se is valid
sched/deadline: Remove unused variables
sched/deadline: Fix sparse static warnings
m68k: Fix build warning in mac_via.h
sched, thermal: Clean up preempt_enable_no_resched() abuse
sched, net: Fixup busy_loop_us_clock()
sched, net: Clean up preempt_enable_no_resched() abuse
sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding
sched/preempt, locking: Rework local_bh_{dis,en}able()
sched/clock, x86: Avoid a runtime condition in native_sched_clock()
sched/clock: Fix up clear_sched_clock_stable()
sched/clock, x86: Use a static_key for sched_clock_stable
sched/clock: Remove local_irq_disable() from the clocks
sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs
...

+3782 -633
-5
Documentation/sysctl/kernel.txt
··· 428 428 numa_balancing_scan_size_mb is how many megabytes worth of pages are 429 429 scanned for a given scan. 430 430 431 - numa_balancing_settle_count is how many scan periods must complete before 432 - the schedule balancer stops pushing the task towards a preferred node. This 433 - gives the scheduler a chance to place the task on an alternative node if the 434 - preferred node is overloaded. 435 - 436 431 numa_balancing_migrate_deferred is how many page migrations get skipped 437 432 unconditionally, after a page migration is skipped because a page is shared 438 433 with other tasks. This reduces page migration overhead, and determines
+1 -1
arch/arm/include/asm/unistd.h
··· 15 15 16 16 #include <uapi/asm/unistd.h> 17 17 18 - #define __NR_syscalls (380) 18 + #define __NR_syscalls (384) 19 19 #define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0) 20 20 21 21 #define __ARCH_WANT_STAT64
+2
arch/arm/include/uapi/asm/unistd.h
··· 406 406 #define __NR_process_vm_writev (__NR_SYSCALL_BASE+377) 407 407 #define __NR_kcmp (__NR_SYSCALL_BASE+378) 408 408 #define __NR_finit_module (__NR_SYSCALL_BASE+379) 409 + #define __NR_sched_setattr (__NR_SYSCALL_BASE+380) 410 + #define __NR_sched_getattr (__NR_SYSCALL_BASE+381) 409 411 410 412 /* 411 413 * This may need to be greater than __NR_last_syscall+1 in order to
+2
arch/arm/kernel/calls.S
··· 389 389 CALL(sys_process_vm_writev) 390 390 CALL(sys_kcmp) 391 391 CALL(sys_finit_module) 392 + /* 380 */ CALL(sys_sched_setattr) 393 + CALL(sys_sched_getattr) 392 394 #ifndef syscalls_counted 393 395 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls 394 396 #define syscalls_counted
+2
arch/m68k/include/asm/mac_via.h
··· 254 254 extern volatile __u8 *via1,*via2; 255 255 extern int rbv_present,via_alt_mapping; 256 256 257 + struct irq_desc; 258 + 257 259 extern void via_register_interrupts(void); 258 260 extern void via_irq_enable(int); 259 261 extern void via_irq_disable(int);
+43
arch/x86/include/asm/mwait.h
··· 1 1 #ifndef _ASM_X86_MWAIT_H 2 2 #define _ASM_X86_MWAIT_H 3 3 4 + #include <linux/sched.h> 5 + 4 6 #define MWAIT_SUBSTATE_MASK 0xf 5 7 #define MWAIT_CSTATE_MASK 0xf 6 8 #define MWAIT_SUBSTATE_SIZE 4 ··· 14 12 #define CPUID5_ECX_INTERRUPT_BREAK 0x2 15 13 16 14 #define MWAIT_ECX_INTERRUPT_BREAK 0x1 15 + 16 + static inline void __monitor(const void *eax, unsigned long ecx, 17 + unsigned long edx) 18 + { 19 + /* "monitor %eax, %ecx, %edx;" */ 20 + asm volatile(".byte 0x0f, 0x01, 0xc8;" 21 + :: "a" (eax), "c" (ecx), "d"(edx)); 22 + } 23 + 24 + static inline void __mwait(unsigned long eax, unsigned long ecx) 25 + { 26 + /* "mwait %eax, %ecx;" */ 27 + asm volatile(".byte 0x0f, 0x01, 0xc9;" 28 + :: "a" (eax), "c" (ecx)); 29 + } 30 + 31 + /* 32 + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, 33 + * which can obviate IPI to trigger checking of need_resched. 34 + * We execute MONITOR against need_resched and enter optimized wait state 35 + * through MWAIT. Whenever someone changes need_resched, we would be woken 36 + * up from MWAIT (without an IPI). 37 + * 38 + * New with Core Duo processors, MWAIT can take some hints based on CPU 39 + * capability. 40 + */ 41 + static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) 42 + { 43 + if (!current_set_polling_and_test()) { 44 + if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) { 45 + mb(); 46 + clflush((void *)&current_thread_info()->flags); 47 + mb(); 48 + } 49 + 50 + __monitor((void *)&current_thread_info()->flags, 0, 0); 51 + if (!need_resched()) 52 + __mwait(eax, ecx); 53 + } 54 + current_clr_polling(); 55 + } 17 56 18 57 #endif /* _ASM_X86_MWAIT_H */
-23
arch/x86/include/asm/processor.h
··· 700 700 #endif 701 701 } 702 702 703 - static inline void __monitor(const void *eax, unsigned long ecx, 704 - unsigned long edx) 705 - { 706 - /* "monitor %eax, %ecx, %edx;" */ 707 - asm volatile(".byte 0x0f, 0x01, 0xc8;" 708 - :: "a" (eax), "c" (ecx), "d"(edx)); 709 - } 710 - 711 - static inline void __mwait(unsigned long eax, unsigned long ecx) 712 - { 713 - /* "mwait %eax, %ecx;" */ 714 - asm volatile(".byte 0x0f, 0x01, 0xc9;" 715 - :: "a" (eax), "c" (ecx)); 716 - } 717 - 718 - static inline void __sti_mwait(unsigned long eax, unsigned long ecx) 719 - { 720 - trace_hardirqs_on(); 721 - /* "mwait %eax, %ecx;" */ 722 - asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" 723 - :: "a" (eax), "c" (ecx)); 724 - } 725 - 726 703 extern void select_idle_routine(const struct cpuinfo_x86 *c); 727 704 extern void init_amd_e400_c1e_mask(void); 728 705
+17 -58
arch/x86/include/asm/timer.h
··· 4 4 #include <linux/pm.h> 5 5 #include <linux/percpu.h> 6 6 #include <linux/interrupt.h> 7 + #include <linux/math64.h> 7 8 8 9 #define TICK_SIZE (tick_nsec / 1000) 9 10 ··· 13 12 14 13 extern int no_timer_check; 15 14 16 - /* Accelerators for sched_clock() 17 - * convert from cycles(64bits) => nanoseconds (64bits) 18 - * basic equation: 19 - * ns = cycles / (freq / ns_per_sec) 20 - * ns = cycles * (ns_per_sec / freq) 21 - * ns = cycles * (10^9 / (cpu_khz * 10^3)) 22 - * ns = cycles * (10^6 / cpu_khz) 15 + /* 16 + * We use the full linear equation: f(x) = a + b*x, in order to allow 17 + * a continuous function in the face of dynamic freq changes. 23 18 * 24 - * Then we use scaling math (suggested by george@mvista.com) to get: 25 - * ns = cycles * (10^6 * SC / cpu_khz) / SC 26 - * ns = cycles * cyc2ns_scale / SC 19 + * Continuity means that when our frequency changes our slope (b); we want to 20 + * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t. 27 21 * 28 - * And since SC is a constant power of two, we can convert the div 29 - * into a shift. 22 + * Without an offset (a) the above would not be possible. 30 23 * 31 - * We can use khz divisor instead of mhz to keep a better precision, since 32 - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. 33 - * (mathieu.desnoyers@polymtl.ca) 34 - * 35 - * -johnstul@us.ibm.com "math is hard, lets go shopping!" 36 - * 37 - * In: 38 - * 39 - * ns = cycles * cyc2ns_scale / SC 40 - * 41 - * Although we may still have enough bits to store the value of ns, 42 - * in some cases, we may not have enough bits to store cycles * cyc2ns_scale, 43 - * leading to an incorrect result. 44 - * 45 - * To avoid this, we can decompose 'cycles' into quotient and remainder 46 - * of division by SC. 
Then, 47 - * 48 - * ns = (quot * SC + rem) * cyc2ns_scale / SC 49 - * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC 50 - * 51 - * - sqazi@google.com 24 + * See the comment near cycles_2_ns() for details on how we compute (b). 52 25 */ 26 + struct cyc2ns_data { 27 + u32 cyc2ns_mul; 28 + u32 cyc2ns_shift; 29 + u64 cyc2ns_offset; 30 + u32 __count; 31 + /* u32 hole */ 32 + }; /* 24 bytes -- do not grow */ 53 33 54 - DECLARE_PER_CPU(unsigned long, cyc2ns); 55 - DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); 56 - 57 - #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 58 - 59 - static inline unsigned long long __cycles_2_ns(unsigned long long cyc) 60 - { 61 - int cpu = smp_processor_id(); 62 - unsigned long long ns = per_cpu(cyc2ns_offset, cpu); 63 - ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), 64 - (1UL << CYC2NS_SCALE_FACTOR)); 65 - return ns; 66 - } 67 - 68 - static inline unsigned long long cycles_2_ns(unsigned long long cyc) 69 - { 70 - unsigned long long ns; 71 - unsigned long flags; 72 - 73 - local_irq_save(flags); 74 - ns = __cycles_2_ns(cyc); 75 - local_irq_restore(flags); 76 - 77 - return ns; 78 - } 34 + extern struct cyc2ns_data *cyc2ns_read_begin(void); 35 + extern void cyc2ns_read_end(struct cyc2ns_data *); 79 36 80 37 #endif /* _ASM_X86_TIMER_H */
-23
arch/x86/kernel/acpi/cstate.c
··· 150 150 } 151 151 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); 152 152 153 - /* 154 - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, 155 - * which can obviate IPI to trigger checking of need_resched. 156 - * We execute MONITOR against need_resched and enter optimized wait state 157 - * through MWAIT. Whenever someone changes need_resched, we would be woken 158 - * up from MWAIT (without an IPI). 159 - * 160 - * New with Core Duo processors, MWAIT can take some hints based on CPU 161 - * capability. 162 - */ 163 - void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 164 - { 165 - if (!need_resched()) { 166 - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) 167 - clflush((void *)&current_thread_info()->flags); 168 - 169 - __monitor((void *)&current_thread_info()->flags, 0, 0); 170 - smp_mb(); 171 - if (!need_resched()) 172 - __mwait(ax, cx); 173 - } 174 - } 175 - 176 153 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) 177 154 { 178 155 unsigned int cpu = smp_processor_id();
+1 -1
arch/x86/kernel/cpu/amd.c
··· 487 487 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 488 488 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 489 489 if (!check_tsc_unstable()) 490 - sched_clock_stable = 1; 490 + set_sched_clock_stable(); 491 491 } 492 492 493 493 #ifdef CONFIG_X86_64
+1 -1
arch/x86/kernel/cpu/intel.c
··· 93 93 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 94 94 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 95 95 if (!check_tsc_unstable()) 96 - sched_clock_stable = 1; 96 + set_sched_clock_stable(); 97 97 } 98 98 99 99 /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
+11 -5
arch/x86/kernel/cpu/perf_event.c
··· 1883 1883 1884 1884 void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 1885 1885 { 1886 + struct cyc2ns_data *data; 1887 + 1886 1888 userpg->cap_user_time = 0; 1887 1889 userpg->cap_user_time_zero = 0; 1888 1890 userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; 1889 1891 userpg->pmc_width = x86_pmu.cntval_bits; 1890 1892 1891 - if (!sched_clock_stable) 1893 + if (!sched_clock_stable()) 1892 1894 return; 1893 1895 1896 + data = cyc2ns_read_begin(); 1897 + 1894 1898 userpg->cap_user_time = 1; 1895 - userpg->time_mult = this_cpu_read(cyc2ns); 1896 - userpg->time_shift = CYC2NS_SCALE_FACTOR; 1897 - userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; 1899 + userpg->time_mult = data->cyc2ns_mul; 1900 + userpg->time_shift = data->cyc2ns_shift; 1901 + userpg->time_offset = data->cyc2ns_offset - now; 1898 1902 1899 1903 userpg->cap_user_time_zero = 1; 1900 - userpg->time_zero = this_cpu_read(cyc2ns_offset); 1904 + userpg->time_zero = data->cyc2ns_offset; 1905 + 1906 + cyc2ns_read_end(data); 1901 1907 } 1902 1908 1903 1909 /*
+2
arch/x86/kernel/smpboot.c
··· 1417 1417 * The WBINVD is insufficient due to the spurious-wakeup 1418 1418 * case where we return around the loop. 1419 1419 */ 1420 + mb(); 1420 1421 clflush(mwait_ptr); 1422 + mb(); 1421 1423 __monitor(mwait_ptr, 0, 0); 1422 1424 mb(); 1423 1425 __mwait(eax, 0);
+257 -61
arch/x86/kernel/tsc.c
··· 11 11 #include <linux/clocksource.h> 12 12 #include <linux/percpu.h> 13 13 #include <linux/timex.h> 14 + #include <linux/static_key.h> 14 15 15 16 #include <asm/hpet.h> 16 17 #include <asm/timer.h> ··· 38 37 erroneous rdtsc usage on !cpu_has_tsc processors */ 39 38 static int __read_mostly tsc_disabled = -1; 40 39 40 + static struct static_key __use_tsc = STATIC_KEY_INIT; 41 + 41 42 int tsc_clocksource_reliable; 43 + 44 + /* 45 + * Use a ring-buffer like data structure, where a writer advances the head by 46 + * writing a new data entry and a reader advances the tail when it observes a 47 + * new entry. 48 + * 49 + * Writers are made to wait on readers until there's space to write a new 50 + * entry. 51 + * 52 + * This means that we can always use an {offset, mul} pair to compute a ns 53 + * value that is 'roughly' in the right direction, even if we're writing a new 54 + * {offset, mul} pair during the clock read. 55 + * 56 + * The down-side is that we can no longer guarantee strict monotonicity anymore 57 + * (assuming the TSC was that to begin with), because while we compute the 58 + * intersection point of the two clock slopes and make sure the time is 59 + * continuous at the point of switching; we can no longer guarantee a reader is 60 + * strictly before or after the switch point. 61 + * 62 + * It does mean a reader no longer needs to disable IRQs in order to avoid 63 + * CPU-Freq updates messing with his times, and similarly an NMI reader will 64 + * no longer run the risk of hitting half-written state. 
65 + */ 66 + 67 + struct cyc2ns { 68 + struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ 69 + struct cyc2ns_data *head; /* 48 + 8 = 56 */ 70 + struct cyc2ns_data *tail; /* 56 + 8 = 64 */ 71 + }; /* exactly fits one cacheline */ 72 + 73 + static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); 74 + 75 + struct cyc2ns_data *cyc2ns_read_begin(void) 76 + { 77 + struct cyc2ns_data *head; 78 + 79 + preempt_disable(); 80 + 81 + head = this_cpu_read(cyc2ns.head); 82 + /* 83 + * Ensure we observe the entry when we observe the pointer to it. 84 + * matches the wmb from cyc2ns_write_end(). 85 + */ 86 + smp_read_barrier_depends(); 87 + head->__count++; 88 + barrier(); 89 + 90 + return head; 91 + } 92 + 93 + void cyc2ns_read_end(struct cyc2ns_data *head) 94 + { 95 + barrier(); 96 + /* 97 + * If we're the outer most nested read; update the tail pointer 98 + * when we're done. This notifies possible pending writers 99 + * that we've observed the head pointer and that the other 100 + * entry is now free. 101 + */ 102 + if (!--head->__count) { 103 + /* 104 + * x86-TSO does not reorder writes with older reads; 105 + * therefore once this write becomes visible to another 106 + * cpu, we must be finished reading the cyc2ns_data. 107 + * 108 + * matches with cyc2ns_write_begin(). 109 + */ 110 + this_cpu_write(cyc2ns.tail, head); 111 + } 112 + preempt_enable(); 113 + } 114 + 115 + /* 116 + * Begin writing a new @data entry for @cpu. 117 + * 118 + * Assumes some sort of write side lock; currently 'provided' by the assumption 119 + * that cpufreq will call its notifiers sequentially. 120 + */ 121 + static struct cyc2ns_data *cyc2ns_write_begin(int cpu) 122 + { 123 + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); 124 + struct cyc2ns_data *data = c2n->data; 125 + 126 + if (data == c2n->head) 127 + data++; 128 + 129 + /* XXX send an IPI to @cpu in order to guarantee a read? 
*/ 130 + 131 + /* 132 + * When we observe the tail write from cyc2ns_read_end(), 133 + * the cpu must be done with that entry and its safe 134 + * to start writing to it. 135 + */ 136 + while (c2n->tail == data) 137 + cpu_relax(); 138 + 139 + return data; 140 + } 141 + 142 + static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) 143 + { 144 + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); 145 + 146 + /* 147 + * Ensure the @data writes are visible before we publish the 148 + * entry. Matches the data-depencency in cyc2ns_read_begin(). 149 + */ 150 + smp_wmb(); 151 + 152 + ACCESS_ONCE(c2n->head) = data; 153 + } 154 + 155 + /* 156 + * Accelerators for sched_clock() 157 + * convert from cycles(64bits) => nanoseconds (64bits) 158 + * basic equation: 159 + * ns = cycles / (freq / ns_per_sec) 160 + * ns = cycles * (ns_per_sec / freq) 161 + * ns = cycles * (10^9 / (cpu_khz * 10^3)) 162 + * ns = cycles * (10^6 / cpu_khz) 163 + * 164 + * Then we use scaling math (suggested by george@mvista.com) to get: 165 + * ns = cycles * (10^6 * SC / cpu_khz) / SC 166 + * ns = cycles * cyc2ns_scale / SC 167 + * 168 + * And since SC is a constant power of two, we can convert the div 169 + * into a shift. 170 + * 171 + * We can use khz divisor instead of mhz to keep a better precision, since 172 + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. 173 + * (mathieu.desnoyers@polymtl.ca) 174 + * 175 + * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
176 + */ 177 + 178 + #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 179 + 180 + static void cyc2ns_data_init(struct cyc2ns_data *data) 181 + { 182 + data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR; 183 + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; 184 + data->cyc2ns_offset = 0; 185 + data->__count = 0; 186 + } 187 + 188 + static void cyc2ns_init(int cpu) 189 + { 190 + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); 191 + 192 + cyc2ns_data_init(&c2n->data[0]); 193 + cyc2ns_data_init(&c2n->data[1]); 194 + 195 + c2n->head = c2n->data; 196 + c2n->tail = c2n->data; 197 + } 198 + 199 + static inline unsigned long long cycles_2_ns(unsigned long long cyc) 200 + { 201 + struct cyc2ns_data *data, *tail; 202 + unsigned long long ns; 203 + 204 + /* 205 + * See cyc2ns_read_*() for details; replicated in order to avoid 206 + * an extra few instructions that came with the abstraction. 207 + * Notable, it allows us to only do the __count and tail update 208 + * dance when its actually needed. 209 + */ 210 + 211 + preempt_disable(); 212 + data = this_cpu_read(cyc2ns.head); 213 + tail = this_cpu_read(cyc2ns.tail); 214 + 215 + if (likely(data == tail)) { 216 + ns = data->cyc2ns_offset; 217 + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); 218 + } else { 219 + data->__count++; 220 + 221 + barrier(); 222 + 223 + ns = data->cyc2ns_offset; 224 + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); 225 + 226 + barrier(); 227 + 228 + if (!--data->__count) 229 + this_cpu_write(cyc2ns.tail, data); 230 + } 231 + preempt_enable(); 232 + 233 + return ns; 234 + } 235 + 236 + /* XXX surely we already have this someplace in the kernel?! 
*/ 237 + #define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) 238 + 239 + static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) 240 + { 241 + unsigned long long tsc_now, ns_now; 242 + struct cyc2ns_data *data; 243 + unsigned long flags; 244 + 245 + local_irq_save(flags); 246 + sched_clock_idle_sleep_event(); 247 + 248 + if (!cpu_khz) 249 + goto done; 250 + 251 + data = cyc2ns_write_begin(cpu); 252 + 253 + rdtscll(tsc_now); 254 + ns_now = cycles_2_ns(tsc_now); 255 + 256 + /* 257 + * Compute a new multiplier as per the above comment and ensure our 258 + * time function is continuous; see the comment near struct 259 + * cyc2ns_data. 260 + */ 261 + data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); 262 + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; 263 + data->cyc2ns_offset = ns_now - 264 + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); 265 + 266 + cyc2ns_write_end(cpu, data); 267 + 268 + done: 269 + sched_clock_idle_wakeup_event(0); 270 + local_irq_restore(flags); 271 + } 42 272 /* 43 273 * Scheduler clock - returns current time in nanosec units. 44 274 */ 45 275 u64 native_sched_clock(void) 46 276 { 47 - u64 this_offset; 277 + u64 tsc_now; 48 278 49 279 /* 50 280 * Fall back to jiffies if there's no TSC available: ··· 285 53 * very important for it to be as fast as the platform 286 54 * can achieve it. 
) 287 55 */ 288 - if (unlikely(tsc_disabled)) { 56 + if (!static_key_false(&__use_tsc)) { 289 57 /* No locking but a rare wrong value is not a big deal: */ 290 58 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); 291 59 } 292 60 293 61 /* read the Time Stamp Counter: */ 294 - rdtscll(this_offset); 62 + rdtscll(tsc_now); 295 63 296 64 /* return the value in ns */ 297 - return __cycles_2_ns(this_offset); 65 + return cycles_2_ns(tsc_now); 298 66 } 299 67 300 68 /* We need to define a real function for sched_clock, to override the ··· 821 589 EXPORT_SYMBOL(recalibrate_cpu_khz); 822 590 823 591 824 - /* Accelerators for sched_clock() 825 - * convert from cycles(64bits) => nanoseconds (64bits) 826 - * basic equation: 827 - * ns = cycles / (freq / ns_per_sec) 828 - * ns = cycles * (ns_per_sec / freq) 829 - * ns = cycles * (10^9 / (cpu_khz * 10^3)) 830 - * ns = cycles * (10^6 / cpu_khz) 831 - * 832 - * Then we use scaling math (suggested by george@mvista.com) to get: 833 - * ns = cycles * (10^6 * SC / cpu_khz) / SC 834 - * ns = cycles * cyc2ns_scale / SC 835 - * 836 - * And since SC is a constant power of two, we can convert the div 837 - * into a shift. 838 - * 839 - * We can use khz divisor instead of mhz to keep a better precision, since 840 - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. 841 - * (mathieu.desnoyers@polymtl.ca) 842 - * 843 - * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
844 - */ 845 - 846 - DEFINE_PER_CPU(unsigned long, cyc2ns); 847 - DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); 848 - 849 - static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) 850 - { 851 - unsigned long long tsc_now, ns_now, *offset; 852 - unsigned long flags, *scale; 853 - 854 - local_irq_save(flags); 855 - sched_clock_idle_sleep_event(); 856 - 857 - scale = &per_cpu(cyc2ns, cpu); 858 - offset = &per_cpu(cyc2ns_offset, cpu); 859 - 860 - rdtscll(tsc_now); 861 - ns_now = __cycles_2_ns(tsc_now); 862 - 863 - if (cpu_khz) { 864 - *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + 865 - cpu_khz / 2) / cpu_khz; 866 - *offset = ns_now - mult_frac(tsc_now, *scale, 867 - (1UL << CYC2NS_SCALE_FACTOR)); 868 - } 869 - 870 - sched_clock_idle_wakeup_event(0); 871 - local_irq_restore(flags); 872 - } 873 - 874 592 static unsigned long long cyc2ns_suspend; 875 593 876 594 void tsc_save_sched_clock_state(void) 877 595 { 878 - if (!sched_clock_stable) 596 + if (!sched_clock_stable()) 879 597 return; 880 598 881 599 cyc2ns_suspend = sched_clock(); ··· 845 663 unsigned long flags; 846 664 int cpu; 847 665 848 - if (!sched_clock_stable) 666 + if (!sched_clock_stable()) 849 667 return; 850 668 851 669 local_irq_save(flags); 852 670 853 - __this_cpu_write(cyc2ns_offset, 0); 671 + /* 672 + * We're comming out of suspend, there's no concurrency yet; don't 673 + * bother being nice about the RCU stuff, just write to both 674 + * data fields. 
675 + */ 676 + 677 + this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); 678 + this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); 679 + 854 680 offset = cyc2ns_suspend - sched_clock(); 855 681 856 - for_each_possible_cpu(cpu) 857 - per_cpu(cyc2ns_offset, cpu) = offset; 682 + for_each_possible_cpu(cpu) { 683 + per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; 684 + per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; 685 + } 858 686 859 687 local_irq_restore(flags); 860 688 } ··· 987 795 { 988 796 if (!tsc_unstable) { 989 797 tsc_unstable = 1; 990 - sched_clock_stable = 0; 798 + clear_sched_clock_stable(); 991 799 disable_sched_clock_irqtime(); 992 800 pr_info("Marking TSC unstable due to %s\n", reason); 993 801 /* Change only the rating, when not registered */ ··· 1187 995 * speed as the bootup CPU. (cpufreq notifiers will fix this 1188 996 * up if their speed diverges) 1189 997 */ 1190 - for_each_possible_cpu(cpu) 998 + for_each_possible_cpu(cpu) { 999 + cyc2ns_init(cpu); 1191 1000 set_cyc2ns_scale(cpu_khz, cpu); 1001 + } 1192 1002 1193 1003 if (tsc_disabled > 0) 1194 1004 return; 1195 1005 1196 1006 /* now allow native_sched_clock() to use rdtsc */ 1007 + 1197 1008 tsc_disabled = 0; 1009 + static_key_slow_inc(&__use_tsc); 1198 1010 1199 1011 if (!no_sched_irq_time) 1200 1012 enable_sched_clock_irqtime();
+40 -26
arch/x86/platform/uv/tlb_uv.c
··· 433 433 return; 434 434 } 435 435 436 + /* 437 + * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative 438 + * number, not an absolute. It converts a duration in cycles to a duration in 439 + * ns. 440 + */ 441 + static inline unsigned long long cycles_2_ns(unsigned long long cyc) 442 + { 443 + struct cyc2ns_data *data = cyc2ns_read_begin(); 444 + unsigned long long ns; 445 + 446 + ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); 447 + 448 + cyc2ns_read_end(data); 449 + return ns; 450 + } 451 + 452 + /* 453 + * The reverse of the above; converts a duration in ns to a duration in cycles. 454 + */ 455 + static inline unsigned long long ns_2_cycles(unsigned long long ns) 456 + { 457 + struct cyc2ns_data *data = cyc2ns_read_begin(); 458 + unsigned long long cyc; 459 + 460 + cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; 461 + 462 + cyc2ns_read_end(data); 463 + return cyc; 464 + } 465 + 436 466 static inline unsigned long cycles_2_us(unsigned long long cyc) 437 467 { 438 - unsigned long long ns; 439 - unsigned long us; 440 - int cpu = smp_processor_id(); 468 + return cycles_2_ns(cyc) / NSEC_PER_USEC; 469 + } 441 470 442 - ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; 443 - us = ns / 1000; 444 - return us; 471 + static inline cycles_t sec_2_cycles(unsigned long sec) 472 + { 473 + return ns_2_cycles(sec * NSEC_PER_SEC); 474 + } 475 + 476 + static inline unsigned long long usec_2_cycles(unsigned long usec) 477 + { 478 + return ns_2_cycles(usec * NSEC_PER_USEC); 445 479 } 446 480 447 481 /* ··· 700 666 else 701 667 return uv2_wait_completion(bau_desc, mmr_offset, right_shift, 702 668 bcp, try); 703 - } 704 - 705 - static inline cycles_t sec_2_cycles(unsigned long sec) 706 - { 707 - unsigned long ns; 708 - cycles_t cyc; 709 - 710 - ns = sec * 1000000000; 711 - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); 712 - return cyc; 713 669 } 714 670 715 671 /* ··· 1349 1325 1350 1326 static void 
ptc_seq_stop(struct seq_file *file, void *data) 1351 1327 { 1352 - } 1353 - 1354 - static inline unsigned long long usec_2_cycles(unsigned long microsec) 1355 - { 1356 - unsigned long ns; 1357 - unsigned long long cyc; 1358 - 1359 - ns = microsec * 1000; 1360 - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); 1361 - return cyc; 1362 1328 } 1363 1329 1364 1330 /*
+2
arch/x86/syscalls/syscall_32.tbl
··· 357 357 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 358 358 349 i386 kcmp sys_kcmp 359 359 350 i386 finit_module sys_finit_module 360 + 351 i386 sched_setattr sys_sched_setattr 361 + 352 i386 sched_getattr sys_sched_getattr
+2
arch/x86/syscalls/syscall_64.tbl
··· 320 320 311 64 process_vm_writev sys_process_vm_writev 321 321 312 common kcmp sys_kcmp 322 322 313 common finit_module sys_finit_module 323 + 314 common sched_setattr sys_sched_setattr 324 + 315 common sched_getattr sys_sched_getattr 323 325 324 326 # 325 327 # x32-specific system call numbers start at 512 to avoid cache impact
+1 -4
drivers/acpi/acpi_pad.c
··· 193 193 CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 194 194 stop_critical_timings(); 195 195 196 - __monitor((void *)&current_thread_info()->flags, 0, 0); 197 - smp_mb(); 198 - if (!need_resched()) 199 - __mwait(power_saving_mwait_eax, 1); 196 + mwait_idle_with_hints(power_saving_mwait_eax, 1); 200 197 201 198 start_critical_timings(); 202 199 if (lapic_marked_unstable)
-15
drivers/acpi/processor_idle.c
··· 727 727 if (unlikely(!pr)) 728 728 return -EINVAL; 729 729 730 - if (cx->entry_method == ACPI_CSTATE_FFH) { 731 - if (current_set_polling_and_test()) 732 - return -EINVAL; 733 - } 734 - 735 730 lapic_timer_state_broadcast(pr, cx, 1); 736 731 acpi_idle_do_entry(cx); 737 732 ··· 780 785 if (unlikely(!pr)) 781 786 return -EINVAL; 782 787 783 - if (cx->entry_method == ACPI_CSTATE_FFH) { 784 - if (current_set_polling_and_test()) 785 - return -EINVAL; 786 - } 787 - 788 788 /* 789 789 * Must be done before busmaster disable as we might need to 790 790 * access HPET ! ··· 829 839 acpi_safe_halt(); 830 840 return -EBUSY; 831 841 } 832 - } 833 - 834 - if (cx->entry_method == ACPI_CSTATE_FFH) { 835 - if (current_set_polling_and_test()) 836 - return -EINVAL; 837 842 } 838 843 839 844 acpi_unlazy_tlb(smp_processor_id());
+1 -10
drivers/idle/intel_idle.c
··· 377 377 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 378 378 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 379 379 380 - if (!current_set_polling_and_test()) { 381 - 382 - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) 383 - clflush((void *)&current_thread_info()->flags); 384 - 385 - __monitor((void *)&current_thread_info()->flags, 0, 0); 386 - smp_mb(); 387 - if (!need_resched()) 388 - __mwait(eax, ecx); 389 - } 380 + mwait_idle_with_hints(eax, ecx); 390 381 391 382 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 392 383 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+2 -4
drivers/thermal/intel_powerclamp.c
··· 438 438 */ 439 439 local_touch_nmi(); 440 440 stop_critical_timings(); 441 - __monitor((void *)&current_thread_info()->flags, 0, 0); 442 - cpu_relax(); /* allow HT sibling to run */ 443 - __mwait(eax, ecx); 441 + mwait_idle_with_hints(eax, ecx); 444 442 start_critical_timings(); 445 443 atomic_inc(&idle_wakeup_counter); 446 444 } 447 445 tick_nohz_idle_exit(); 448 - preempt_enable_no_resched(); 446 + preempt_enable(); 449 447 } 450 448 del_timer_sync(&wakeup_timer); 451 449 clear_bit(cpunr, cpu_clamping_mask);
+29 -3
include/linux/bottom_half.h
··· 1 1 #ifndef _LINUX_BH_H 2 2 #define _LINUX_BH_H 3 3 4 - extern void local_bh_disable(void); 4 + #include <linux/preempt.h> 5 + #include <linux/preempt_mask.h> 6 + 7 + #ifdef CONFIG_TRACE_IRQFLAGS 8 + extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); 9 + #else 10 + static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) 11 + { 12 + preempt_count_add(cnt); 13 + barrier(); 14 + } 15 + #endif 16 + 17 + static inline void local_bh_disable(void) 18 + { 19 + __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); 20 + } 21 + 5 22 extern void _local_bh_enable(void); 6 - extern void local_bh_enable(void); 7 - extern void local_bh_enable_ip(unsigned long ip); 23 + extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); 24 + 25 + static inline void local_bh_enable_ip(unsigned long ip) 26 + { 27 + __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET); 28 + } 29 + 30 + static inline void local_bh_enable(void) 31 + { 32 + __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); 33 + } 8 34 9 35 #endif /* _LINUX_BH_H */
+1
include/linux/hardirq.h
··· 5 5 #include <linux/lockdep.h> 6 6 #include <linux/ftrace_irq.h> 7 7 #include <linux/vtime.h> 8 + #include <asm/hardirq.h> 8 9 9 10 10 11 extern void synchronize_irq(unsigned int irq);
+10
include/linux/init_task.h
··· 11 11 #include <linux/user_namespace.h> 12 12 #include <linux/securebits.h> 13 13 #include <linux/seqlock.h> 14 + #include <linux/rbtree.h> 14 15 #include <net/net_namespace.h> 15 16 #include <linux/sched/rt.h> 16 17 ··· 155 154 156 155 #define INIT_TASK_COMM "swapper" 157 156 157 + #ifdef CONFIG_RT_MUTEXES 158 + # define INIT_RT_MUTEXES(tsk) \ 159 + .pi_waiters = RB_ROOT, \ 160 + .pi_waiters_leftmost = NULL, 161 + #else 162 + # define INIT_RT_MUTEXES(tsk) 163 + #endif 164 + 158 165 /* 159 166 * INIT_TASK is used to set up the first task table, touch at 160 167 * your own risk!. Base=0, limit=0x1fffff (=2MB) ··· 230 221 INIT_TRACE_RECURSION \ 231 222 INIT_TASK_RCU_PREEMPT(tsk) \ 232 223 INIT_CPUSET_SEQ(tsk) \ 224 + INIT_RT_MUTEXES(tsk) \ 233 225 INIT_VTIME(tsk) \ 234 226 } 235 227
+35 -2
include/linux/preempt.h
··· 64 64 } while (0) 65 65 66 66 #else 67 - #define preempt_enable() preempt_enable_no_resched() 67 + #define preempt_enable() \ 68 + do { \ 69 + barrier(); \ 70 + preempt_count_dec(); \ 71 + } while (0) 68 72 #define preempt_check_resched() do { } while (0) 69 73 #endif 70 74 ··· 97 93 __preempt_schedule_context(); \ 98 94 } while (0) 99 95 #else 100 - #define preempt_enable_notrace() preempt_enable_no_resched_notrace() 96 + #define preempt_enable_notrace() \ 97 + do { \ 98 + barrier(); \ 99 + __preempt_count_dec(); \ 100 + } while (0) 101 101 #endif 102 102 103 103 #else /* !CONFIG_PREEMPT_COUNT */ ··· 123 115 #define preempt_enable_notrace() barrier() 124 116 125 117 #endif /* CONFIG_PREEMPT_COUNT */ 118 + 119 + #ifdef MODULE 120 + /* 121 + * Modules have no business playing preemption tricks. 122 + */ 123 + #undef sched_preempt_enable_no_resched 124 + #undef preempt_enable_no_resched 125 + #undef preempt_enable_no_resched_notrace 126 + #undef preempt_check_resched 127 + #endif 128 + 129 + #ifdef CONFIG_PREEMPT 130 + #define preempt_set_need_resched() \ 131 + do { \ 132 + set_preempt_need_resched(); \ 133 + } while (0) 134 + #define preempt_fold_need_resched() \ 135 + do { \ 136 + if (tif_need_resched()) \ 137 + set_preempt_need_resched(); \ 138 + } while (0) 139 + #else 140 + #define preempt_set_need_resched() do { } while (0) 141 + #define preempt_fold_need_resched() do { } while (0) 142 + #endif 126 143 127 144 #ifdef CONFIG_PREEMPT_NOTIFIERS 128 145
+15 -1
include/linux/preempt_mask.h
··· 2 2 #define LINUX_PREEMPT_MASK_H 3 3 4 4 #include <linux/preempt.h> 5 - #include <asm/hardirq.h> 6 5 7 6 /* 8 7 * We put the hardirq and softirq counter into the preemption ··· 76 77 #else 77 78 # define PREEMPT_CHECK_OFFSET 0 78 79 #endif 80 + 81 + /* 82 + * The preempt_count offset needed for things like: 83 + * 84 + * spin_lock_bh() 85 + * 86 + * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and 87 + * softirqs, such that unlock sequences of: 88 + * 89 + * spin_unlock(); 90 + * local_bh_enable(); 91 + * 92 + * Work as expected. 93 + */ 94 + #define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) 79 95 80 96 /* 81 97 * Are we running in atomic context? WARNING: this macro cannot
+6 -12
include/linux/rtmutex.h
··· 13 13 #define __LINUX_RT_MUTEX_H 14 14 15 15 #include <linux/linkage.h> 16 - #include <linux/plist.h> 16 + #include <linux/rbtree.h> 17 17 #include <linux/spinlock_types.h> 18 18 19 19 extern int max_lock_depth; /* for sysctl */ ··· 22 22 * The rt_mutex structure 23 23 * 24 24 * @wait_lock: spinlock to protect the structure 25 - * @wait_list: pilist head to enqueue waiters in priority order 25 + * @waiters: rbtree root to enqueue waiters in priority order 26 + * @waiters_leftmost: top waiter 26 27 * @owner: the mutex owner 27 28 */ 28 29 struct rt_mutex { 29 30 raw_spinlock_t wait_lock; 30 - struct plist_head wait_list; 31 + struct rb_root waiters; 32 + struct rb_node *waiters_leftmost; 31 33 struct task_struct *owner; 32 34 #ifdef CONFIG_DEBUG_RT_MUTEXES 33 35 int save_state; ··· 68 66 69 67 #define __RT_MUTEX_INITIALIZER(mutexname) \ 70 68 { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ 71 - , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \ 69 + , .waiters = RB_ROOT \ 72 70 , .owner = NULL \ 73 71 __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} 74 72 ··· 99 97 extern int rt_mutex_trylock(struct rt_mutex *lock); 100 98 101 99 extern void rt_mutex_unlock(struct rt_mutex *lock); 102 - 103 - #ifdef CONFIG_RT_MUTEXES 104 - # define INIT_RT_MUTEXES(tsk) \ 105 - .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \ 106 - INIT_RT_MUTEX_DEBUG(tsk) 107 - #else 108 - # define INIT_RT_MUTEXES(tsk) 109 - #endif 110 100 111 101 #endif
+4 -8
include/linux/rwlock_api_smp.h
··· 172 172 173 173 static inline void __raw_read_lock_bh(rwlock_t *lock) 174 174 { 175 - local_bh_disable(); 176 - preempt_disable(); 175 + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); 177 176 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 178 177 LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock); 179 178 } ··· 199 200 200 201 static inline void __raw_write_lock_bh(rwlock_t *lock) 201 202 { 202 - local_bh_disable(); 203 - preempt_disable(); 203 + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); 204 204 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 205 205 LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); 206 206 } ··· 248 250 { 249 251 rwlock_release(&lock->dep_map, 1, _RET_IP_); 250 252 do_raw_read_unlock(lock); 251 - preempt_enable_no_resched(); 252 - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 253 + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); 253 254 } 254 255 255 256 static inline void __raw_write_unlock_irqrestore(rwlock_t *lock, ··· 272 275 { 273 276 rwlock_release(&lock->dep_map, 1, _RET_IP_); 274 277 do_raw_write_unlock(lock); 275 - preempt_enable_no_resched(); 276 - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 278 + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); 277 279 } 278 280 279 281 #endif /* __LINUX_RWLOCK_API_SMP_H */
+137 -4
include/linux/sched.h
··· 16 16 #include <linux/types.h> 17 17 #include <linux/timex.h> 18 18 #include <linux/jiffies.h> 19 + #include <linux/plist.h> 19 20 #include <linux/rbtree.h> 20 21 #include <linux/thread_info.h> 21 22 #include <linux/cpumask.h> ··· 56 55 #include <linux/gfp.h> 57 56 58 57 #include <asm/processor.h> 58 + 59 + #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ 60 + 61 + /* 62 + * Extended scheduling parameters data structure. 63 + * 64 + * This is needed because the original struct sched_param can not be 65 + * altered without introducing ABI issues with legacy applications 66 + * (e.g., in sched_getparam()). 67 + * 68 + * However, the possibility of specifying more than just a priority for 69 + * the tasks may be useful for a wide variety of application fields, e.g., 70 + * multimedia, streaming, automation and control, and many others. 71 + * 72 + * This variant (sched_attr) is meant at describing a so-called 73 + * sporadic time-constrained task. In such model a task is specified by: 74 + * - the activation period or minimum instance inter-arrival time; 75 + * - the maximum (or average, depending on the actual scheduling 76 + * discipline) computation time of all instances, a.k.a. runtime; 77 + * - the deadline (relative to the actual activation time) of each 78 + * instance. 79 + * Very briefly, a periodic (sporadic) task asks for the execution of 80 + * some specific computation --which is typically called an instance-- 81 + * (at most) every period. Moreover, each instance typically lasts no more 82 + * than the runtime and must be completed by time instant t equal to 83 + * the instance activation time + the deadline. 84 + * 85 + * This is reflected by the actual fields of the sched_attr structure: 86 + * 87 + * @size size of the structure, for fwd/bwd compat. 
88 + * 89 + * @sched_policy task's scheduling policy 90 + * @sched_flags for customizing the scheduler behaviour 91 + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) 92 + * @sched_priority task's static priority (SCHED_FIFO/RR) 93 + * @sched_deadline representative of the task's deadline 94 + * @sched_runtime representative of the task's runtime 95 + * @sched_period representative of the task's period 96 + * 97 + * Given this task model, there are a multiplicity of scheduling algorithms 98 + * and policies, that can be used to ensure all the tasks will make their 99 + * timing constraints. 100 + * 101 + * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the 102 + * only user of this new interface. More information about the algorithm 103 + * available in the scheduling class file or in Documentation/. 104 + */ 105 + struct sched_attr { 106 + u32 size; 107 + 108 + u32 sched_policy; 109 + u64 sched_flags; 110 + 111 + /* SCHED_NORMAL, SCHED_BATCH */ 112 + s32 sched_nice; 113 + 114 + /* SCHED_FIFO, SCHED_RR */ 115 + u32 sched_priority; 116 + 117 + /* SCHED_DEADLINE */ 118 + u64 sched_runtime; 119 + u64 sched_deadline; 120 + u64 sched_period; 121 + }; 59 122 60 123 struct exec_domain; 61 124 struct futex_pi_state; ··· 233 168 234 169 #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) 235 170 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) 236 - #define task_is_dead(task) ((task)->exit_state != 0) 237 171 #define task_is_stopped_or_traced(task) \ 238 172 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) 239 173 #define task_contributes_to_load(task) \ ··· 1093 1029 #endif 1094 1030 }; 1095 1031 1032 + struct sched_dl_entity { 1033 + struct rb_node rb_node; 1034 + 1035 + /* 1036 + * Original scheduling parameters. Copied here from sched_attr 1037 + * during sched_setscheduler2(), they will remain the same until 1038 + * the next sched_setscheduler2(). 
1039 + */ 1040 + u64 dl_runtime; /* maximum runtime for each instance */ 1041 + u64 dl_deadline; /* relative deadline of each instance */ 1042 + u64 dl_period; /* separation of two instances (period) */ 1043 + u64 dl_bw; /* dl_runtime / dl_deadline */ 1044 + 1045 + /* 1046 + * Actual scheduling parameters. Initialized with the values above, 1047 + * they are continously updated during task execution. Note that 1048 + * the remaining runtime could be < 0 in case we are in overrun. 1049 + */ 1050 + s64 runtime; /* remaining runtime for this instance */ 1051 + u64 deadline; /* absolute deadline for this instance */ 1052 + unsigned int flags; /* specifying the scheduler behaviour */ 1053 + 1054 + /* 1055 + * Some bool flags: 1056 + * 1057 + * @dl_throttled tells if we exhausted the runtime. If so, the 1058 + * task has to wait for a replenishment to be performed at the 1059 + * next firing of dl_timer. 1060 + * 1061 + * @dl_new tells if a new instance arrived. If so we must 1062 + * start executing it with full runtime and reset its absolute 1063 + * deadline; 1064 + * 1065 + * @dl_boosted tells if we are boosted due to DI. If so we are 1066 + * outside bandwidth enforcement mechanism (but only until we 1067 + * exit the critical section). 1068 + */ 1069 + int dl_throttled, dl_new, dl_boosted; 1070 + 1071 + /* 1072 + * Bandwidth enforcement timer. Each -deadline task has its 1073 + * own bandwidth to be enforced, thus we need one timer per task. 
1074 + */ 1075 + struct hrtimer dl_timer; 1076 + }; 1096 1077 1097 1078 struct rcu_node; 1098 1079 ··· 1174 1065 #ifdef CONFIG_CGROUP_SCHED 1175 1066 struct task_group *sched_task_group; 1176 1067 #endif 1068 + struct sched_dl_entity dl; 1177 1069 1178 1070 #ifdef CONFIG_PREEMPT_NOTIFIERS 1179 1071 /* list of struct preempt_notifier: */ ··· 1208 1098 struct list_head tasks; 1209 1099 #ifdef CONFIG_SMP 1210 1100 struct plist_node pushable_tasks; 1101 + struct rb_node pushable_dl_tasks; 1211 1102 #endif 1212 1103 1213 1104 struct mm_struct *mm, *active_mm; ··· 1360 1249 1361 1250 #ifdef CONFIG_RT_MUTEXES 1362 1251 /* PI waiters blocked on a rt_mutex held by this task */ 1363 - struct plist_head pi_waiters; 1252 + struct rb_root pi_waiters; 1253 + struct rb_node *pi_waiters_leftmost; 1364 1254 /* Deadlock detection and priority inheritance handling */ 1365 1255 struct rt_mutex_waiter *pi_blocked_on; 1256 + /* Top pi_waiters task */ 1257 + struct task_struct *pi_top_task; 1366 1258 #endif 1367 1259 1368 1260 #ifdef CONFIG_DEBUG_MUTEXES ··· 1994 1880 * but then during bootup it turns out that sched_clock() 1995 1881 * is reliable after all: 1996 1882 */ 1997 - extern int sched_clock_stable; 1883 + extern int sched_clock_stable(void); 1884 + extern void set_sched_clock_stable(void); 1885 + extern void clear_sched_clock_stable(void); 1998 1886 1999 1887 extern void sched_clock_tick(void); 2000 1888 extern void sched_clock_idle_sleep_event(void); ··· 2075 1959 const struct sched_param *); 2076 1960 extern int sched_setscheduler_nocheck(struct task_struct *, int, 2077 1961 const struct sched_param *); 1962 + extern int sched_setattr(struct task_struct *, 1963 + const struct sched_attr *); 2078 1964 extern struct task_struct *idle_task(int cpu); 2079 1965 /** 2080 1966 * is_idle_task - is the specified task an idle task? 
··· 2156 2038 #else 2157 2039 static inline void kick_process(struct task_struct *tsk) { } 2158 2040 #endif 2159 - extern void sched_fork(unsigned long clone_flags, struct task_struct *p); 2041 + extern int sched_fork(unsigned long clone_flags, struct task_struct *p); 2160 2042 extern void sched_dead(struct task_struct *p); 2161 2043 2162 2044 extern void proc_caches_init(void); ··· 2744 2626 return unlikely(tif_need_resched()); 2745 2627 } 2746 2628 #endif 2629 + 2630 + static inline void current_clr_polling(void) 2631 + { 2632 + __current_clr_polling(); 2633 + 2634 + /* 2635 + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. 2636 + * Once the bit is cleared, we'll get IPIs with every new 2637 + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also 2638 + * fold. 2639 + */ 2640 + smp_mb(); /* paired with resched_task() */ 2641 + 2642 + preempt_fold_need_resched(); 2643 + } 2747 2644 2748 2645 static __always_inline bool need_resched(void) 2749 2646 {
+24
include/linux/sched/deadline.h
··· 1 + #ifndef _SCHED_DEADLINE_H 2 + #define _SCHED_DEADLINE_H 3 + 4 + /* 5 + * SCHED_DEADLINE tasks has negative priorities, reflecting 6 + * the fact that any of them has higher prio than RT and 7 + * NORMAL/BATCH tasks. 8 + */ 9 + 10 + #define MAX_DL_PRIO 0 11 + 12 + static inline int dl_prio(int prio) 13 + { 14 + if (unlikely(prio < MAX_DL_PRIO)) 15 + return 1; 16 + return 0; 17 + } 18 + 19 + static inline int dl_task(struct task_struct *p) 20 + { 21 + return dl_prio(p->prio); 22 + } 23 + 24 + #endif /* _SCHED_DEADLINE_H */
+5
include/linux/sched/rt.h
··· 35 35 #ifdef CONFIG_RT_MUTEXES 36 36 extern int rt_mutex_getprio(struct task_struct *p); 37 37 extern void rt_mutex_setprio(struct task_struct *p, int prio); 38 + extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); 38 39 extern void rt_mutex_adjust_pi(struct task_struct *p); 39 40 static inline bool tsk_is_pi_blocked(struct task_struct *tsk) 40 41 { ··· 45 44 static inline int rt_mutex_getprio(struct task_struct *p) 46 45 { 47 46 return p->normal_prio; 47 + } 48 + static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) 49 + { 50 + return NULL; 48 51 } 49 52 # define rt_mutex_adjust_pi(p) do { } while (0) 50 53 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
-1
include/linux/sched/sysctl.h
··· 48 48 extern unsigned int sysctl_numa_balancing_scan_period_min; 49 49 extern unsigned int sysctl_numa_balancing_scan_period_max; 50 50 extern unsigned int sysctl_numa_balancing_scan_size; 51 - extern unsigned int sysctl_numa_balancing_settle_count; 52 51 53 52 #ifdef CONFIG_SCHED_DEBUG 54 53 extern unsigned int sysctl_sched_migration_cost;
+4 -8
include/linux/spinlock_api_smp.h
··· 131 131 132 132 static inline void __raw_spin_lock_bh(raw_spinlock_t *lock) 133 133 { 134 - local_bh_disable(); 135 - preempt_disable(); 134 + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); 136 135 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 137 136 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); 138 137 } ··· 173 174 { 174 175 spin_release(&lock->dep_map, 1, _RET_IP_); 175 176 do_raw_spin_unlock(lock); 176 - preempt_enable_no_resched(); 177 - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 177 + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); 178 178 } 179 179 180 180 static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) 181 181 { 182 - local_bh_disable(); 183 - preempt_disable(); 182 + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); 184 183 if (do_raw_spin_trylock(lock)) { 185 184 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); 186 185 return 1; 187 186 } 188 - preempt_enable_no_resched(); 189 - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 187 + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); 190 188 return 0; 191 189 } 192 190
+11 -5
include/linux/spinlock_api_up.h
··· 24 24 * flags straight, to suppress compiler warnings of unused lock 25 25 * variables, and to add the proper checker annotations: 26 26 */ 27 + #define ___LOCK(lock) \ 28 + do { __acquire(lock); (void)(lock); } while (0) 29 + 27 30 #define __LOCK(lock) \ 28 - do { preempt_disable(); __acquire(lock); (void)(lock); } while (0) 31 + do { preempt_disable(); ___LOCK(lock); } while (0) 29 32 30 33 #define __LOCK_BH(lock) \ 31 - do { local_bh_disable(); __LOCK(lock); } while (0) 34 + do { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); ___LOCK(lock); } while (0) 32 35 33 36 #define __LOCK_IRQ(lock) \ 34 37 do { local_irq_disable(); __LOCK(lock); } while (0) ··· 39 36 #define __LOCK_IRQSAVE(lock, flags) \ 40 37 do { local_irq_save(flags); __LOCK(lock); } while (0) 41 38 39 + #define ___UNLOCK(lock) \ 40 + do { __release(lock); (void)(lock); } while (0) 41 + 42 42 #define __UNLOCK(lock) \ 43 - do { preempt_enable(); __release(lock); (void)(lock); } while (0) 43 + do { preempt_enable(); ___UNLOCK(lock); } while (0) 44 44 45 45 #define __UNLOCK_BH(lock) \ 46 - do { preempt_enable_no_resched(); local_bh_enable(); \ 47 - __release(lock); (void)(lock); } while (0) 46 + do { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); \ 47 + ___UNLOCK(lock); } while (0) 48 48 49 49 #define __UNLOCK_IRQ(lock) \ 50 50 do { local_irq_enable(); __UNLOCK(lock); } while (0)
+6
include/linux/syscalls.h
··· 38 38 struct rlimit64; 39 39 struct rusage; 40 40 struct sched_param; 41 + struct sched_attr; 41 42 struct sel_arg_struct; 42 43 struct semaphore; 43 44 struct sembuf; ··· 280 279 struct sched_param __user *param); 281 280 asmlinkage long sys_sched_setparam(pid_t pid, 282 281 struct sched_param __user *param); 282 + asmlinkage long sys_sched_setattr(pid_t pid, 283 + struct sched_attr __user *attr); 283 284 asmlinkage long sys_sched_getscheduler(pid_t pid); 284 285 asmlinkage long sys_sched_getparam(pid_t pid, 285 286 struct sched_param __user *param); 287 + asmlinkage long sys_sched_getattr(pid_t pid, 288 + struct sched_attr __user *attr, 289 + unsigned int size); 286 290 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 287 291 unsigned long __user *user_mask_ptr); 288 292 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
+4 -1
include/linux/uaccess.h
··· 25 25 26 26 static inline void pagefault_enable(void) 27 27 { 28 + #ifndef CONFIG_PREEMPT 28 29 /* 29 30 * make sure to issue those last loads/stores before enabling 30 31 * the pagefault handler again. 31 32 */ 32 33 barrier(); 33 34 preempt_count_dec(); 34 - preempt_check_resched(); 35 + #else 36 + preempt_enable(); 37 + #endif 35 38 } 36 39 37 40 #ifndef ARCH_HAS_NOCACHE_UACCESS
+1 -18
include/net/busy_poll.h
··· 42 42 return sysctl_net_busy_poll; 43 43 } 44 44 45 - /* a wrapper to make debug_smp_processor_id() happy 46 - * we can use sched_clock() because we don't care much about precision 47 - * we only care that the average is bounded 48 - */ 49 - #ifdef CONFIG_DEBUG_PREEMPT 50 45 static inline u64 busy_loop_us_clock(void) 51 46 { 52 - u64 rc; 53 - 54 - preempt_disable_notrace(); 55 - rc = sched_clock(); 56 - preempt_enable_no_resched_notrace(); 57 - 58 - return rc >> 10; 47 + return local_clock() >> 10; 59 48 } 60 - #else /* CONFIG_DEBUG_PREEMPT */ 61 - static inline u64 busy_loop_us_clock(void) 62 - { 63 - return sched_clock() >> 10; 64 - } 65 - #endif /* CONFIG_DEBUG_PREEMPT */ 66 49 67 50 static inline unsigned long sk_busy_loop_end_time(struct sock *sk) 68 51 {
+6
include/uapi/linux/sched.h
··· 39 39 #define SCHED_BATCH 3 40 40 /* SCHED_ISO: reserved but not implemented yet */ 41 41 #define SCHED_IDLE 5 42 + #define SCHED_DEADLINE 6 43 + 42 44 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ 43 45 #define SCHED_RESET_ON_FORK 0x40000000 44 46 47 + /* 48 + * For the sched_{set,get}attr() calls 49 + */ 50 + #define SCHED_FLAG_RESET_ON_FORK 0x01 45 51 46 52 #endif /* _UAPI_LINUX_SCHED_H */
+10 -7
kernel/cpu/idle.c
··· 105 105 __current_set_polling(); 106 106 } 107 107 arch_cpu_idle_exit(); 108 - /* 109 - * We need to test and propagate the TIF_NEED_RESCHED 110 - * bit here because we might not have send the 111 - * reschedule IPI to idle tasks. 112 - */ 113 - if (tif_need_resched()) 114 - set_preempt_need_resched(); 115 108 } 109 + 110 + /* 111 + * Since we fell out of the loop above, we know 112 + * TIF_NEED_RESCHED must be set, propagate it into 113 + * PREEMPT_NEED_RESCHED. 114 + * 115 + * This is required because for polling idle loops we will 116 + * not have had an IPI to fold the state for us. 117 + */ 118 + preempt_set_need_resched(); 116 119 tick_nohz_idle_exit(); 117 120 schedule_preempt_disabled(); 118 121 }
+7 -5
kernel/fork.c
··· 1087 1087 { 1088 1088 raw_spin_lock_init(&p->pi_lock); 1089 1089 #ifdef CONFIG_RT_MUTEXES 1090 - plist_head_init(&p->pi_waiters); 1090 + p->pi_waiters = RB_ROOT; 1091 + p->pi_waiters_leftmost = NULL; 1091 1092 p->pi_blocked_on = NULL; 1093 + p->pi_top_task = NULL; 1092 1094 #endif 1093 1095 } 1094 1096 ··· 1313 1311 #endif 1314 1312 1315 1313 /* Perform scheduler related setup. Assign this task to a CPU. */ 1316 - sched_fork(clone_flags, p); 1314 + retval = sched_fork(clone_flags, p); 1315 + if (retval) 1316 + goto bad_fork_cleanup_policy; 1317 1317 1318 1318 retval = perf_event_init_task(p); 1319 1319 if (retval) ··· 1407 1403 p->tgid = p->pid; 1408 1404 } 1409 1405 1410 - p->pdeath_signal = 0; 1411 - p->exit_state = 0; 1412 - 1413 1406 p->nr_dirtied = 0; 1414 1407 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1415 1408 p->dirty_paused_when = 0; 1416 1409 1410 + p->pdeath_signal = 0; 1417 1411 INIT_LIST_HEAD(&p->thread_group); 1418 1412 p->task_works = NULL; 1419 1413
+2
kernel/futex.c
··· 2426 2426 * code while we sleep on uaddr. 2427 2427 */ 2428 2428 debug_rt_mutex_init_waiter(&rt_waiter); 2429 + RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); 2430 + RB_CLEAR_NODE(&rt_waiter.tree_entry); 2429 2431 rt_waiter.task = NULL; 2430 2432 2431 2433 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+2 -1
kernel/hrtimer.c
··· 46 46 #include <linux/sched.h> 47 47 #include <linux/sched/sysctl.h> 48 48 #include <linux/sched/rt.h> 49 + #include <linux/sched/deadline.h> 49 50 #include <linux/timer.h> 50 51 #include <linux/freezer.h> 51 52 ··· 1611 1610 unsigned long slack; 1612 1611 1613 1612 slack = current->timer_slack_ns; 1614 - if (rt_task(current)) 1613 + if (dl_task(current) || rt_task(current)) 1615 1614 slack = 0; 1616 1615 1617 1616 hrtimer_init_on_stack(&t.timer, clockid, mode);
+2 -6
kernel/locking/rtmutex-debug.c
··· 24 24 #include <linux/kallsyms.h> 25 25 #include <linux/syscalls.h> 26 26 #include <linux/interrupt.h> 27 - #include <linux/plist.h> 27 + #include <linux/rbtree.h> 28 28 #include <linux/fs.h> 29 29 #include <linux/debug_locks.h> 30 30 ··· 57 57 58 58 void rt_mutex_debug_task_free(struct task_struct *task) 59 59 { 60 - DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); 60 + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); 61 61 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); 62 62 } 63 63 ··· 154 154 void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 155 155 { 156 156 memset(waiter, 0x11, sizeof(*waiter)); 157 - plist_node_init(&waiter->list_entry, MAX_PRIO); 158 - plist_node_init(&waiter->pi_list_entry, MAX_PRIO); 159 157 waiter->deadlock_task_pid = NULL; 160 158 } 161 159 162 160 void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 163 161 { 164 162 put_pid(waiter->deadlock_task_pid); 165 - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); 166 - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 167 163 memset(waiter, 0x22, sizeof(*waiter)); 168 164 } 169 165
+135 -31
kernel/locking/rtmutex.c
··· 14 14 #include <linux/export.h> 15 15 #include <linux/sched.h> 16 16 #include <linux/sched/rt.h> 17 + #include <linux/sched/deadline.h> 17 18 #include <linux/timer.h> 18 19 19 20 #include "rtmutex_common.h" ··· 92 91 } 93 92 #endif 94 93 94 + static inline int 95 + rt_mutex_waiter_less(struct rt_mutex_waiter *left, 96 + struct rt_mutex_waiter *right) 97 + { 98 + if (left->prio < right->prio) 99 + return 1; 100 + 101 + /* 102 + * If both waiters have dl_prio(), we check the deadlines of the 103 + * associated tasks. 104 + * If left waiter has a dl_prio(), and we didn't return 1 above, 105 + * then right waiter has a dl_prio() too. 106 + */ 107 + if (dl_prio(left->prio)) 108 + return (left->task->dl.deadline < right->task->dl.deadline); 109 + 110 + return 0; 111 + } 112 + 113 + static void 114 + rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) 115 + { 116 + struct rb_node **link = &lock->waiters.rb_node; 117 + struct rb_node *parent = NULL; 118 + struct rt_mutex_waiter *entry; 119 + int leftmost = 1; 120 + 121 + while (*link) { 122 + parent = *link; 123 + entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); 124 + if (rt_mutex_waiter_less(waiter, entry)) { 125 + link = &parent->rb_left; 126 + } else { 127 + link = &parent->rb_right; 128 + leftmost = 0; 129 + } 130 + } 131 + 132 + if (leftmost) 133 + lock->waiters_leftmost = &waiter->tree_entry; 134 + 135 + rb_link_node(&waiter->tree_entry, parent, link); 136 + rb_insert_color(&waiter->tree_entry, &lock->waiters); 137 + } 138 + 139 + static void 140 + rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) 141 + { 142 + if (RB_EMPTY_NODE(&waiter->tree_entry)) 143 + return; 144 + 145 + if (lock->waiters_leftmost == &waiter->tree_entry) 146 + lock->waiters_leftmost = rb_next(&waiter->tree_entry); 147 + 148 + rb_erase(&waiter->tree_entry, &lock->waiters); 149 + RB_CLEAR_NODE(&waiter->tree_entry); 150 + } 151 + 152 + static void 153 + rt_mutex_enqueue_pi(struct 
task_struct *task, struct rt_mutex_waiter *waiter) 154 + { 155 + struct rb_node **link = &task->pi_waiters.rb_node; 156 + struct rb_node *parent = NULL; 157 + struct rt_mutex_waiter *entry; 158 + int leftmost = 1; 159 + 160 + while (*link) { 161 + parent = *link; 162 + entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); 163 + if (rt_mutex_waiter_less(waiter, entry)) { 164 + link = &parent->rb_left; 165 + } else { 166 + link = &parent->rb_right; 167 + leftmost = 0; 168 + } 169 + } 170 + 171 + if (leftmost) 172 + task->pi_waiters_leftmost = &waiter->pi_tree_entry; 173 + 174 + rb_link_node(&waiter->pi_tree_entry, parent, link); 175 + rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); 176 + } 177 + 178 + static void 179 + rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) 180 + { 181 + if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) 182 + return; 183 + 184 + if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) 185 + task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); 186 + 187 + rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); 188 + RB_CLEAR_NODE(&waiter->pi_tree_entry); 189 + } 190 + 95 191 /* 96 - * Calculate task priority from the waiter list priority 192 + * Calculate task priority from the waiter tree priority 97 193 * 98 - * Return task->normal_prio when the waiter list is empty or when 194 + * Return task->normal_prio when the waiter tree is empty or when 99 195 * the waiter is not allowed to do priority boosting 100 196 */ 101 197 int rt_mutex_getprio(struct task_struct *task) ··· 200 102 if (likely(!task_has_pi_waiters(task))) 201 103 return task->normal_prio; 202 104 203 - return min(task_top_pi_waiter(task)->pi_list_entry.prio, 105 + return min(task_top_pi_waiter(task)->prio, 204 106 task->normal_prio); 107 + } 108 + 109 + struct task_struct *rt_mutex_get_top_task(struct task_struct *task) 110 + { 111 + if (likely(!task_has_pi_waiters(task))) 112 + return NULL; 113 + 114 + return 
task_top_pi_waiter(task)->task; 205 115 } 206 116 207 117 /* ··· 221 115 { 222 116 int prio = rt_mutex_getprio(task); 223 117 224 - if (task->prio != prio) 118 + if (task->prio != prio || dl_prio(prio)) 225 119 rt_mutex_setprio(task, prio); 226 120 } 227 121 ··· 339 233 * When deadlock detection is off then we check, if further 340 234 * priority adjustment is necessary. 341 235 */ 342 - if (!detect_deadlock && waiter->list_entry.prio == task->prio) 236 + if (!detect_deadlock && waiter->prio == task->prio) 343 237 goto out_unlock_pi; 344 238 345 239 lock = waiter->lock; ··· 360 254 top_waiter = rt_mutex_top_waiter(lock); 361 255 362 256 /* Requeue the waiter */ 363 - plist_del(&waiter->list_entry, &lock->wait_list); 364 - waiter->list_entry.prio = task->prio; 365 - plist_add(&waiter->list_entry, &lock->wait_list); 257 + rt_mutex_dequeue(lock, waiter); 258 + waiter->prio = task->prio; 259 + rt_mutex_enqueue(lock, waiter); 366 260 367 261 /* Release the task */ 368 262 raw_spin_unlock_irqrestore(&task->pi_lock, flags); ··· 386 280 387 281 if (waiter == rt_mutex_top_waiter(lock)) { 388 282 /* Boost the owner */ 389 - plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); 390 - waiter->pi_list_entry.prio = waiter->list_entry.prio; 391 - plist_add(&waiter->pi_list_entry, &task->pi_waiters); 283 + rt_mutex_dequeue_pi(task, top_waiter); 284 + rt_mutex_enqueue_pi(task, waiter); 392 285 __rt_mutex_adjust_prio(task); 393 286 394 287 } else if (top_waiter == waiter) { 395 288 /* Deboost the owner */ 396 - plist_del(&waiter->pi_list_entry, &task->pi_waiters); 289 + rt_mutex_dequeue_pi(task, waiter); 397 290 waiter = rt_mutex_top_waiter(lock); 398 - waiter->pi_list_entry.prio = waiter->list_entry.prio; 399 - plist_add(&waiter->pi_list_entry, &task->pi_waiters); 291 + rt_mutex_enqueue_pi(task, waiter); 400 292 __rt_mutex_adjust_prio(task); 401 293 } 402 294 ··· 459 355 * 3) it is top waiter 460 356 */ 461 357 if (rt_mutex_has_waiters(lock)) { 462 - if (task->prio >= 
rt_mutex_top_waiter(lock)->list_entry.prio) { 358 + if (task->prio >= rt_mutex_top_waiter(lock)->prio) { 463 359 if (!waiter || waiter != rt_mutex_top_waiter(lock)) 464 360 return 0; 465 361 } ··· 473 369 474 370 /* remove the queued waiter. */ 475 371 if (waiter) { 476 - plist_del(&waiter->list_entry, &lock->wait_list); 372 + rt_mutex_dequeue(lock, waiter); 477 373 task->pi_blocked_on = NULL; 478 374 } 479 375 ··· 483 379 */ 484 380 if (rt_mutex_has_waiters(lock)) { 485 381 top = rt_mutex_top_waiter(lock); 486 - top->pi_list_entry.prio = top->list_entry.prio; 487 - plist_add(&top->pi_list_entry, &task->pi_waiters); 382 + rt_mutex_enqueue_pi(task, top); 488 383 } 489 384 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 490 385 } ··· 519 416 __rt_mutex_adjust_prio(task); 520 417 waiter->task = task; 521 418 waiter->lock = lock; 522 - plist_node_init(&waiter->list_entry, task->prio); 523 - plist_node_init(&waiter->pi_list_entry, task->prio); 419 + waiter->prio = task->prio; 524 420 525 421 /* Get the top priority waiter on the lock */ 526 422 if (rt_mutex_has_waiters(lock)) 527 423 top_waiter = rt_mutex_top_waiter(lock); 528 - plist_add(&waiter->list_entry, &lock->wait_list); 424 + rt_mutex_enqueue(lock, waiter); 529 425 530 426 task->pi_blocked_on = waiter; 531 427 ··· 535 433 536 434 if (waiter == rt_mutex_top_waiter(lock)) { 537 435 raw_spin_lock_irqsave(&owner->pi_lock, flags); 538 - plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 539 - plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 436 + rt_mutex_dequeue_pi(owner, top_waiter); 437 + rt_mutex_enqueue_pi(owner, waiter); 540 438 541 439 __rt_mutex_adjust_prio(owner); 542 440 if (owner->pi_blocked_on) ··· 588 486 * boosted mode and go back to normal after releasing 589 487 * lock->wait_lock. 
590 488 */ 591 - plist_del(&waiter->pi_list_entry, &current->pi_waiters); 489 + rt_mutex_dequeue_pi(current, waiter); 592 490 593 491 rt_mutex_set_owner(lock, NULL); 594 492 ··· 612 510 int chain_walk = 0; 613 511 614 512 raw_spin_lock_irqsave(&current->pi_lock, flags); 615 - plist_del(&waiter->list_entry, &lock->wait_list); 513 + rt_mutex_dequeue(lock, waiter); 616 514 current->pi_blocked_on = NULL; 617 515 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 618 516 ··· 623 521 624 522 raw_spin_lock_irqsave(&owner->pi_lock, flags); 625 523 626 - plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 524 + rt_mutex_dequeue_pi(owner, waiter); 627 525 628 526 if (rt_mutex_has_waiters(lock)) { 629 527 struct rt_mutex_waiter *next; 630 528 631 529 next = rt_mutex_top_waiter(lock); 632 - plist_add(&next->pi_list_entry, &owner->pi_waiters); 530 + rt_mutex_enqueue_pi(owner, next); 633 531 } 634 532 __rt_mutex_adjust_prio(owner); 635 533 ··· 638 536 639 537 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 640 538 } 641 - 642 - WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 643 539 644 540 if (!chain_walk) 645 541 return; ··· 665 565 raw_spin_lock_irqsave(&task->pi_lock, flags); 666 566 667 567 waiter = task->pi_blocked_on; 668 - if (!waiter || waiter->list_entry.prio == task->prio) { 568 + if (!waiter || (waiter->prio == task->prio && 569 + !dl_prio(task->prio))) { 669 570 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 670 571 return; 671 572 } ··· 739 638 int ret = 0; 740 639 741 640 debug_rt_mutex_init_waiter(&waiter); 641 + RB_CLEAR_NODE(&waiter.pi_tree_entry); 642 + RB_CLEAR_NODE(&waiter.tree_entry); 742 643 743 644 raw_spin_lock(&lock->wait_lock); 744 645 ··· 1007 904 { 1008 905 lock->owner = NULL; 1009 906 raw_spin_lock_init(&lock->wait_lock); 1010 - plist_head_init(&lock->wait_list); 907 + lock->waiters = RB_ROOT; 908 + lock->waiters_leftmost = NULL; 1011 909 1012 910 debug_rt_mutex_init(lock, name); 1013 911 }
+12 -11
kernel/locking/rtmutex_common.h
··· 40 40 * This is the control structure for tasks blocked on a rt_mutex, 41 41 * which is allocated on the kernel stack on of the blocked task. 42 42 * 43 - * @list_entry: pi node to enqueue into the mutex waiters list 44 - * @pi_list_entry: pi node to enqueue into the mutex owner waiters list 43 + * @tree_entry: pi node to enqueue into the mutex waiters tree 44 + * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree 45 45 * @task: task reference to the blocked task 46 46 */ 47 47 struct rt_mutex_waiter { 48 - struct plist_node list_entry; 49 - struct plist_node pi_list_entry; 48 + struct rb_node tree_entry; 49 + struct rb_node pi_tree_entry; 50 50 struct task_struct *task; 51 51 struct rt_mutex *lock; 52 52 #ifdef CONFIG_DEBUG_RT_MUTEXES ··· 54 54 struct pid *deadlock_task_pid; 55 55 struct rt_mutex *deadlock_lock; 56 56 #endif 57 + int prio; 57 58 }; 58 59 59 60 /* 60 - * Various helpers to access the waiters-plist: 61 + * Various helpers to access the waiters-tree: 61 62 */ 62 63 static inline int rt_mutex_has_waiters(struct rt_mutex *lock) 63 64 { 64 - return !plist_head_empty(&lock->wait_list); 65 + return !RB_EMPTY_ROOT(&lock->waiters); 65 66 } 66 67 67 68 static inline struct rt_mutex_waiter * ··· 70 69 { 71 70 struct rt_mutex_waiter *w; 72 71 73 - w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, 74 - list_entry); 72 + w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, 73 + tree_entry); 75 74 BUG_ON(w->lock != lock); 76 75 77 76 return w; ··· 79 78 80 79 static inline int task_has_pi_waiters(struct task_struct *p) 81 80 { 82 - return !plist_head_empty(&p->pi_waiters); 81 + return !RB_EMPTY_ROOT(&p->pi_waiters); 83 82 } 84 83 85 84 static inline struct rt_mutex_waiter * 86 85 task_top_pi_waiter(struct task_struct *p) 87 86 { 88 - return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, 89 - pi_list_entry); 87 + return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, 88 + pi_tree_entry); 90 89 } 91 90 
92 91 /*
+3 -2
kernel/sched/Makefile
··· 11 11 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 12 12 endif 13 13 14 - obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14 + obj-y += core.o proc.o clock.o cputime.o 15 + obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 15 16 obj-y += wait.o completion.o 16 - obj-$(CONFIG_SMP) += cpupri.o 17 + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 17 18 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 18 19 obj-$(CONFIG_SCHEDSTATS) += stats.o 19 20 obj-$(CONFIG_SCHED_DEBUG) += debug.o
+47 -31
kernel/sched/clock.c
··· 26 26 * at 0 on boot (but people really shouldn't rely on that). 27 27 * 28 28 * cpu_clock(i) -- can be used from any context, including NMI. 29 - * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) 30 29 * local_clock() -- is cpu_clock() on the current cpu. 30 + * 31 + * sched_clock_cpu(i) 31 32 * 32 33 * How: 33 34 * ··· 51 50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 52 51 * that is otherwise invisible (TSC gets stopped). 53 52 * 54 - * 55 - * Notes: 56 - * 57 - * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things 58 - * like cpufreq interrupts that can change the base clock (TSC) multiplier 59 - * and cause funny jumps in time -- although the filtering provided by 60 - * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it 61 - * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on 62 - * sched_clock(). 63 53 */ 64 54 #include <linux/spinlock.h> 65 55 #include <linux/hardirq.h> ··· 58 66 #include <linux/percpu.h> 59 67 #include <linux/ktime.h> 60 68 #include <linux/sched.h> 69 + #include <linux/static_key.h> 70 + #include <linux/workqueue.h> 61 71 62 72 /* 63 73 * Scheduler clock - returns current time in nanosec units. 
··· 76 82 __read_mostly int sched_clock_running; 77 83 78 84 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 79 - __read_mostly int sched_clock_stable; 85 + static struct static_key __sched_clock_stable = STATIC_KEY_INIT; 86 + 87 + int sched_clock_stable(void) 88 + { 89 + if (static_key_false(&__sched_clock_stable)) 90 + return false; 91 + return true; 92 + } 93 + 94 + void set_sched_clock_stable(void) 95 + { 96 + if (!sched_clock_stable()) 97 + static_key_slow_dec(&__sched_clock_stable); 98 + } 99 + 100 + static void __clear_sched_clock_stable(struct work_struct *work) 101 + { 102 + /* XXX worry about clock continuity */ 103 + if (sched_clock_stable()) 104 + static_key_slow_inc(&__sched_clock_stable); 105 + } 106 + 107 + static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); 108 + 109 + void clear_sched_clock_stable(void) 110 + { 111 + if (keventd_up()) 112 + schedule_work(&sched_clock_work); 113 + else 114 + __clear_sched_clock_stable(&sched_clock_work); 115 + } 80 116 81 117 struct sched_clock_data { 82 118 u64 tick_raw; ··· 266 242 struct sched_clock_data *scd; 267 243 u64 clock; 268 244 269 - WARN_ON_ONCE(!irqs_disabled()); 270 - 271 - if (sched_clock_stable) 245 + if (sched_clock_stable()) 272 246 return sched_clock(); 273 247 274 248 if (unlikely(!sched_clock_running)) 275 249 return 0ull; 276 250 251 + preempt_disable(); 277 252 scd = cpu_sdc(cpu); 278 253 279 254 if (cpu != smp_processor_id()) 280 255 clock = sched_clock_remote(scd); 281 256 else 282 257 clock = sched_clock_local(scd); 258 + preempt_enable(); 283 259 284 260 return clock; 285 261 } ··· 289 265 struct sched_clock_data *scd; 290 266 u64 now, now_gtod; 291 267 292 - if (sched_clock_stable) 268 + if (sched_clock_stable()) 293 269 return; 294 270 295 271 if (unlikely(!sched_clock_running)) ··· 340 316 */ 341 317 u64 cpu_clock(int cpu) 342 318 { 343 - u64 clock; 344 - unsigned long flags; 319 + if (static_key_false(&__sched_clock_stable)) 320 + return sched_clock_cpu(cpu); 345 321 346 - 
local_irq_save(flags); 347 - clock = sched_clock_cpu(cpu); 348 - local_irq_restore(flags); 349 - 350 - return clock; 322 + return sched_clock(); 351 323 } 352 324 353 325 /* ··· 355 335 */ 356 336 u64 local_clock(void) 357 337 { 358 - u64 clock; 359 - unsigned long flags; 338 + if (static_key_false(&__sched_clock_stable)) 339 + return sched_clock_cpu(raw_smp_processor_id()); 360 340 361 - local_irq_save(flags); 362 - clock = sched_clock_cpu(smp_processor_id()); 363 - local_irq_restore(flags); 364 - 365 - return clock; 341 + return sched_clock(); 366 342 } 367 343 368 344 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ ··· 378 362 379 363 u64 cpu_clock(int cpu) 380 364 { 381 - return sched_clock_cpu(cpu); 365 + return sched_clock(); 382 366 } 383 367 384 368 u64 local_clock(void) 385 369 { 386 - return sched_clock_cpu(0); 370 + return sched_clock(); 387 371 } 388 372 389 373 #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+716 -122
kernel/sched/core.c
··· 296 296 */ 297 297 int sysctl_sched_rt_runtime = 950000; 298 298 299 - 300 - 301 299 /* 302 300 * __task_rq_lock - lock the rq @p resides on. 303 301 */ ··· 897 899 { 898 900 int prio; 899 901 900 - if (task_has_rt_policy(p)) 902 + if (task_has_dl_policy(p)) 903 + prio = MAX_DL_PRIO-1; 904 + else if (task_has_rt_policy(p)) 901 905 prio = MAX_RT_PRIO-1 - p->rt_priority; 902 906 else 903 907 prio = __normal_prio(p); ··· 945 945 if (prev_class->switched_from) 946 946 prev_class->switched_from(rq, p); 947 947 p->sched_class->switched_to(rq, p); 948 - } else if (oldprio != p->prio) 948 + } else if (oldprio != p->prio || dl_task(p)) 949 949 p->sched_class->prio_changed(rq, p, oldprio); 950 950 } 951 951 ··· 1499 1499 * TIF_NEED_RESCHED remotely (for the first time) will also send 1500 1500 * this IPI. 1501 1501 */ 1502 - if (tif_need_resched()) 1503 - set_preempt_need_resched(); 1502 + preempt_fold_need_resched(); 1504 1503 1505 1504 if (llist_empty(&this_rq()->wake_list) 1506 1505 && !tick_nohz_full_cpu(smp_processor_id()) ··· 1716 1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1717 1718 #endif 1718 1719 1720 + RB_CLEAR_NODE(&p->dl.rb_node); 1721 + hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1722 + p->dl.dl_runtime = p->dl.runtime = 0; 1723 + p->dl.dl_deadline = p->dl.deadline = 0; 1724 + p->dl.dl_period = 0; 1725 + p->dl.flags = 0; 1726 + 1719 1727 INIT_LIST_HEAD(&p->rt.run_list); 1720 1728 1721 1729 #ifdef CONFIG_PREEMPT_NOTIFIERS ··· 1774 1768 /* 1775 1769 * fork()/clone()-time setup: 1776 1770 */ 1777 - void sched_fork(unsigned long clone_flags, struct task_struct *p) 1771 + int sched_fork(unsigned long clone_flags, struct task_struct *p) 1778 1772 { 1779 1773 unsigned long flags; 1780 1774 int cpu = get_cpu(); ··· 1796 1790 * Revert to default priority/policy on fork if requested. 
1797 1791 */ 1798 1792 if (unlikely(p->sched_reset_on_fork)) { 1799 - if (task_has_rt_policy(p)) { 1793 + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 1800 1794 p->policy = SCHED_NORMAL; 1801 1795 p->static_prio = NICE_TO_PRIO(0); 1802 1796 p->rt_priority = 0; ··· 1813 1807 p->sched_reset_on_fork = 0; 1814 1808 } 1815 1809 1816 - if (!rt_prio(p->prio)) 1810 + if (dl_prio(p->prio)) { 1811 + put_cpu(); 1812 + return -EAGAIN; 1813 + } else if (rt_prio(p->prio)) { 1814 + p->sched_class = &rt_sched_class; 1815 + } else { 1817 1816 p->sched_class = &fair_sched_class; 1817 + } 1818 1818 1819 1819 if (p->sched_class->task_fork) 1820 1820 p->sched_class->task_fork(p); ··· 1846 1834 init_task_preempt_count(p); 1847 1835 #ifdef CONFIG_SMP 1848 1836 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1837 + RB_CLEAR_NODE(&p->pushable_dl_tasks); 1849 1838 #endif 1850 1839 1851 1840 put_cpu(); 1841 + return 0; 1852 1842 } 1843 + 1844 + unsigned long to_ratio(u64 period, u64 runtime) 1845 + { 1846 + if (runtime == RUNTIME_INF) 1847 + return 1ULL << 20; 1848 + 1849 + /* 1850 + * Doing this here saves a lot of checks in all 1851 + * the calling paths, and returning zero seems 1852 + * safe for them anyway. 
1853 + */ 1854 + if (period == 0) 1855 + return 0; 1856 + 1857 + return div64_u64(runtime << 20, period); 1858 + } 1859 + 1860 + #ifdef CONFIG_SMP 1861 + inline struct dl_bw *dl_bw_of(int i) 1862 + { 1863 + return &cpu_rq(i)->rd->dl_bw; 1864 + } 1865 + 1866 + static inline int dl_bw_cpus(int i) 1867 + { 1868 + struct root_domain *rd = cpu_rq(i)->rd; 1869 + int cpus = 0; 1870 + 1871 + for_each_cpu_and(i, rd->span, cpu_active_mask) 1872 + cpus++; 1873 + 1874 + return cpus; 1875 + } 1876 + #else 1877 + inline struct dl_bw *dl_bw_of(int i) 1878 + { 1879 + return &cpu_rq(i)->dl.dl_bw; 1880 + } 1881 + 1882 + static inline int dl_bw_cpus(int i) 1883 + { 1884 + return 1; 1885 + } 1886 + #endif 1887 + 1888 + static inline 1889 + void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) 1890 + { 1891 + dl_b->total_bw -= tsk_bw; 1892 + } 1893 + 1894 + static inline 1895 + void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) 1896 + { 1897 + dl_b->total_bw += tsk_bw; 1898 + } 1899 + 1900 + static inline 1901 + bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) 1902 + { 1903 + return dl_b->bw != -1 && 1904 + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 1905 + } 1906 + 1907 + /* 1908 + * We must be sure that accepting a new task (or allowing changing the 1909 + * parameters of an existing one) is consistent with the bandwidth 1910 + * constraints. If yes, this function also accordingly updates the currently 1911 + * allocated bandwidth to reflect the new situation. 1912 + * 1913 + * This function is called while holding p's rq->lock. 1914 + */ 1915 + static int dl_overflow(struct task_struct *p, int policy, 1916 + const struct sched_attr *attr) 1917 + { 1918 + 1919 + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 1920 + u64 period = attr->sched_period; 1921 + u64 runtime = attr->sched_runtime; 1922 + u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; 1923 + int cpus, err = -1; 1924 + 1925 + if (new_bw == p->dl.dl_bw) 1926 + return 0; 1927 + 1928 + /* 1929 + * Either if a task, enters, leave, or stays -deadline but changes 1930 + * its parameters, we may need to update accordingly the total 1931 + * allocated bandwidth of the container. 1932 + */ 1933 + raw_spin_lock(&dl_b->lock); 1934 + cpus = dl_bw_cpus(task_cpu(p)); 1935 + if (dl_policy(policy) && !task_has_dl_policy(p) && 1936 + !__dl_overflow(dl_b, cpus, 0, new_bw)) { 1937 + __dl_add(dl_b, new_bw); 1938 + err = 0; 1939 + } else if (dl_policy(policy) && task_has_dl_policy(p) && 1940 + !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { 1941 + __dl_clear(dl_b, p->dl.dl_bw); 1942 + __dl_add(dl_b, new_bw); 1943 + err = 0; 1944 + } else if (!dl_policy(policy) && task_has_dl_policy(p)) { 1945 + __dl_clear(dl_b, p->dl.dl_bw); 1946 + err = 0; 1947 + } 1948 + raw_spin_unlock(&dl_b->lock); 1949 + 1950 + return err; 1951 + } 1952 + 1953 + extern void init_dl_bw(struct dl_bw *dl_b); 1853 1954 1854 1955 /* 1855 1956 * wake_up_new_task - wake up a newly created task for the first time. ··· 2127 2002 mmdrop(mm); 2128 2003 if (unlikely(prev_state == TASK_DEAD)) { 2129 2004 task_numa_free(prev); 2005 + 2006 + if (prev->sched_class->task_dead) 2007 + prev->sched_class->task_dead(prev); 2130 2008 2131 2009 /* 2132 2010 * Remove function-return probe instances associated with this ··· 2424 2296 2425 2297 #ifdef CONFIG_SMP 2426 2298 rq->idle_balance = idle_cpu(cpu); 2427 - trigger_load_balance(rq, cpu); 2299 + trigger_load_balance(rq); 2428 2300 #endif 2429 2301 rq_last_tick_reset(rq); 2430 2302 } ··· 2542 2414 { 2543 2415 /* 2544 2416 * Test if we are atomic. Since do_exit() needs to call into 2545 - * schedule() atomically, we ignore that path for now. 2546 - * Otherwise, whine if we are scheduling when we should not be. 2417 + * schedule() atomically, we ignore that path. Otherwise whine 2418 + * if we are scheduling when we should not. 
2547 2419 */ 2548 - if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 2420 + if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) 2549 2421 __schedule_bug(prev); 2550 2422 rcu_sleep_check(); 2551 2423 ··· 2889 2761 */ 2890 2762 void rt_mutex_setprio(struct task_struct *p, int prio) 2891 2763 { 2892 - int oldprio, on_rq, running; 2764 + int oldprio, on_rq, running, enqueue_flag = 0; 2893 2765 struct rq *rq; 2894 2766 const struct sched_class *prev_class; 2895 2767 2896 - BUG_ON(prio < 0 || prio > MAX_PRIO); 2768 + BUG_ON(prio > MAX_PRIO); 2897 2769 2898 2770 rq = __task_rq_lock(p); 2899 2771 ··· 2916 2788 } 2917 2789 2918 2790 trace_sched_pi_setprio(p, prio); 2791 + p->pi_top_task = rt_mutex_get_top_task(p); 2919 2792 oldprio = p->prio; 2920 2793 prev_class = p->sched_class; 2921 2794 on_rq = p->on_rq; ··· 2926 2797 if (running) 2927 2798 p->sched_class->put_prev_task(rq, p); 2928 2799 2929 - if (rt_prio(prio)) 2800 + /* 2801 + * Boosting condition are: 2802 + * 1. -rt task is running and holds mutex A 2803 + * --> -dl task blocks on mutex A 2804 + * 2805 + * 2. 
-dl task is running and holds mutex A 2806 + * --> -dl task blocks on mutex A and could preempt the 2807 + * running task 2808 + */ 2809 + if (dl_prio(prio)) { 2810 + if (!dl_prio(p->normal_prio) || (p->pi_top_task && 2811 + dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { 2812 + p->dl.dl_boosted = 1; 2813 + p->dl.dl_throttled = 0; 2814 + enqueue_flag = ENQUEUE_REPLENISH; 2815 + } else 2816 + p->dl.dl_boosted = 0; 2817 + p->sched_class = &dl_sched_class; 2818 + } else if (rt_prio(prio)) { 2819 + if (dl_prio(oldprio)) 2820 + p->dl.dl_boosted = 0; 2821 + if (oldprio < prio) 2822 + enqueue_flag = ENQUEUE_HEAD; 2930 2823 p->sched_class = &rt_sched_class; 2931 - else 2824 + } else { 2825 + if (dl_prio(oldprio)) 2826 + p->dl.dl_boosted = 0; 2932 2827 p->sched_class = &fair_sched_class; 2828 + } 2933 2829 2934 2830 p->prio = prio; 2935 2831 2936 2832 if (running) 2937 2833 p->sched_class->set_curr_task(rq); 2938 2834 if (on_rq) 2939 - enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 2835 + enqueue_task(rq, p, enqueue_flag); 2940 2836 2941 2837 check_class_changed(rq, p, prev_class, oldprio); 2942 2838 out_unlock: 2943 2839 __task_rq_unlock(rq); 2944 2840 } 2945 2841 #endif 2842 + 2946 2843 void set_user_nice(struct task_struct *p, long nice) 2947 2844 { 2948 2845 int old_prio, delta, on_rq; ··· 2986 2831 * The RT priorities are set via sched_setscheduler(), but we still 2987 2832 * allow the 'normal' nice value to be set - but as expected 2988 2833 * it wont have any effect on scheduling until the task is 2989 - * SCHED_FIFO/SCHED_RR: 2834 + * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 2990 2835 */ 2991 - if (task_has_rt_policy(p)) { 2836 + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 2992 2837 p->static_prio = NICE_TO_PRIO(nice); 2993 2838 goto out_unlock; 2994 2839 } ··· 3143 2988 return pid ? find_task_by_vpid(pid) : current; 3144 2989 } 3145 2990 3146 - /* Actually do priority change: must hold rq lock. 
*/ 2991 + /* 2992 + * This function initializes the sched_dl_entity of a newly becoming 2993 + * SCHED_DEADLINE task. 2994 + * 2995 + * Only the static values are considered here, the actual runtime and the 2996 + * absolute deadline will be properly calculated when the task is enqueued 2997 + * for the first time with its new policy. 2998 + */ 3147 2999 static void 3148 - __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3000 + __setparam_dl(struct task_struct *p, const struct sched_attr *attr) 3149 3001 { 3002 + struct sched_dl_entity *dl_se = &p->dl; 3003 + 3004 + init_dl_task_timer(dl_se); 3005 + dl_se->dl_runtime = attr->sched_runtime; 3006 + dl_se->dl_deadline = attr->sched_deadline; 3007 + dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3008 + dl_se->flags = attr->sched_flags; 3009 + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3010 + dl_se->dl_throttled = 0; 3011 + dl_se->dl_new = 1; 3012 + } 3013 + 3014 + /* Actually do priority change: must hold pi & rq lock. */ 3015 + static void __setscheduler(struct rq *rq, struct task_struct *p, 3016 + const struct sched_attr *attr) 3017 + { 3018 + int policy = attr->sched_policy; 3019 + 3020 + if (policy == -1) /* setparam */ 3021 + policy = p->policy; 3022 + 3150 3023 p->policy = policy; 3151 - p->rt_priority = prio; 3024 + 3025 + if (dl_policy(policy)) 3026 + __setparam_dl(p, attr); 3027 + else if (fair_policy(policy)) 3028 + p->static_prio = NICE_TO_PRIO(attr->sched_nice); 3029 + 3030 + /* 3031 + * __sched_setscheduler() ensures attr->sched_priority == 0 when 3032 + * !rt_policy. Always setting this ensures that things like 3033 + * getparam()/getattr() don't report silly values for !rt tasks. 
3034 + */ 3035 + p->rt_priority = attr->sched_priority; 3036 + 3152 3037 p->normal_prio = normal_prio(p); 3153 - /* we are holding p->pi_lock already */ 3154 3038 p->prio = rt_mutex_getprio(p); 3155 - if (rt_prio(p->prio)) 3039 + 3040 + if (dl_prio(p->prio)) 3041 + p->sched_class = &dl_sched_class; 3042 + else if (rt_prio(p->prio)) 3156 3043 p->sched_class = &rt_sched_class; 3157 3044 else 3158 3045 p->sched_class = &fair_sched_class; 3046 + 3159 3047 set_load_weight(p); 3048 + } 3049 + 3050 + static void 3051 + __getparam_dl(struct task_struct *p, struct sched_attr *attr) 3052 + { 3053 + struct sched_dl_entity *dl_se = &p->dl; 3054 + 3055 + attr->sched_priority = p->rt_priority; 3056 + attr->sched_runtime = dl_se->dl_runtime; 3057 + attr->sched_deadline = dl_se->dl_deadline; 3058 + attr->sched_period = dl_se->dl_period; 3059 + attr->sched_flags = dl_se->flags; 3060 + } 3061 + 3062 + /* 3063 + * This function validates the new parameters of a -deadline task. 3064 + * We ask for the deadline not being zero, and greater or equal 3065 + * than the runtime, as well as the period of being zero or 3066 + * greater than deadline. Furthermore, we have to be sure that 3067 + * user parameters are above the internal resolution (1us); we 3068 + * check sched_runtime only since it is always the smaller one. 
3069 + */ 3070 + static bool 3071 + __checkparam_dl(const struct sched_attr *attr) 3072 + { 3073 + return attr && attr->sched_deadline != 0 && 3074 + (attr->sched_period == 0 || 3075 + (s64)(attr->sched_period - attr->sched_deadline) >= 0) && 3076 + (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && 3077 + attr->sched_runtime >= (2 << (DL_SCALE - 1)); 3160 3078 } 3161 3079 3162 3080 /* ··· 3248 3020 return match; 3249 3021 } 3250 3022 3251 - static int __sched_setscheduler(struct task_struct *p, int policy, 3252 - const struct sched_param *param, bool user) 3023 + static int __sched_setscheduler(struct task_struct *p, 3024 + const struct sched_attr *attr, 3025 + bool user) 3253 3026 { 3254 3027 int retval, oldprio, oldpolicy = -1, on_rq, running; 3028 + int policy = attr->sched_policy; 3255 3029 unsigned long flags; 3256 3030 const struct sched_class *prev_class; 3257 3031 struct rq *rq; ··· 3267 3037 reset_on_fork = p->sched_reset_on_fork; 3268 3038 policy = oldpolicy = p->policy; 3269 3039 } else { 3270 - reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 3271 - policy &= ~SCHED_RESET_ON_FORK; 3040 + reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 3272 3041 3273 - if (policy != SCHED_FIFO && policy != SCHED_RR && 3042 + if (policy != SCHED_DEADLINE && 3043 + policy != SCHED_FIFO && policy != SCHED_RR && 3274 3044 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3275 3045 policy != SCHED_IDLE) 3276 3046 return -EINVAL; 3277 3047 } 3048 + 3049 + if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) 3050 + return -EINVAL; 3278 3051 3279 3052 /* 3280 3053 * Valid priorities for SCHED_FIFO and SCHED_RR are 3281 3054 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3282 3055 * SCHED_BATCH and SCHED_IDLE is 0. 
3283 3056 */ 3284 - if (param->sched_priority < 0 || 3285 - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3286 - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3057 + if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 3058 + (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 3287 3059 return -EINVAL; 3288 - if (rt_policy(policy) != (param->sched_priority != 0)) 3060 + if ((dl_policy(policy) && !__checkparam_dl(attr)) || 3061 + (rt_policy(policy) != (attr->sched_priority != 0))) 3289 3062 return -EINVAL; 3290 3063 3291 3064 /* 3292 3065 * Allow unprivileged RT tasks to decrease priority: 3293 3066 */ 3294 3067 if (user && !capable(CAP_SYS_NICE)) { 3068 + if (fair_policy(policy)) { 3069 + if (attr->sched_nice < TASK_NICE(p) && 3070 + !can_nice(p, attr->sched_nice)) 3071 + return -EPERM; 3072 + } 3073 + 3295 3074 if (rt_policy(policy)) { 3296 3075 unsigned long rlim_rtprio = 3297 3076 task_rlimit(p, RLIMIT_RTPRIO); ··· 3310 3071 return -EPERM; 3311 3072 3312 3073 /* can't increase priority */ 3313 - if (param->sched_priority > p->rt_priority && 3314 - param->sched_priority > rlim_rtprio) 3074 + if (attr->sched_priority > p->rt_priority && 3075 + attr->sched_priority > rlim_rtprio) 3315 3076 return -EPERM; 3316 3077 } 3317 3078 ··· 3359 3120 /* 3360 3121 * If not changing anything there's no need to proceed further: 3361 3122 */ 3362 - if (unlikely(policy == p->policy && (!rt_policy(policy) || 3363 - param->sched_priority == p->rt_priority))) { 3123 + if (unlikely(policy == p->policy)) { 3124 + if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) 3125 + goto change; 3126 + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3127 + goto change; 3128 + if (dl_policy(policy)) 3129 + goto change; 3130 + 3364 3131 task_rq_unlock(rq, p, &flags); 3365 3132 return 0; 3366 3133 } 3134 + change: 3367 3135 3368 - #ifdef CONFIG_RT_GROUP_SCHED 3369 3136 if (user) { 3137 + #ifdef CONFIG_RT_GROUP_SCHED 3370 3138 /* 3371 3139 * Do not allow 
realtime tasks into groups that have no runtime 3372 3140 * assigned. ··· 3384 3138 task_rq_unlock(rq, p, &flags); 3385 3139 return -EPERM; 3386 3140 } 3387 - } 3388 3141 #endif 3142 + #ifdef CONFIG_SMP 3143 + if (dl_bandwidth_enabled() && dl_policy(policy)) { 3144 + cpumask_t *span = rq->rd->span; 3145 + 3146 + /* 3147 + * Don't allow tasks with an affinity mask smaller than 3148 + * the entire root_domain to become SCHED_DEADLINE. We 3149 + * will also fail if there's no bandwidth available. 3150 + */ 3151 + if (!cpumask_subset(span, &p->cpus_allowed) || 3152 + rq->rd->dl_bw.bw == 0) { 3153 + task_rq_unlock(rq, p, &flags); 3154 + return -EPERM; 3155 + } 3156 + } 3157 + #endif 3158 + } 3389 3159 3390 3160 /* recheck policy now with rq lock held */ 3391 3161 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ··· 3409 3147 task_rq_unlock(rq, p, &flags); 3410 3148 goto recheck; 3411 3149 } 3150 + 3151 + /* 3152 + * If setscheduling to SCHED_DEADLINE (or changing the parameters 3153 + * of a SCHED_DEADLINE task) we need to check if enough bandwidth 3154 + * is available. 
3155 + */ 3156 + if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { 3157 + task_rq_unlock(rq, p, &flags); 3158 + return -EBUSY; 3159 + } 3160 + 3412 3161 on_rq = p->on_rq; 3413 3162 running = task_current(rq, p); 3414 3163 if (on_rq) ··· 3431 3158 3432 3159 oldprio = p->prio; 3433 3160 prev_class = p->sched_class; 3434 - __setscheduler(rq, p, policy, param->sched_priority); 3161 + __setscheduler(rq, p, attr); 3435 3162 3436 3163 if (running) 3437 3164 p->sched_class->set_curr_task(rq); ··· 3446 3173 return 0; 3447 3174 } 3448 3175 3176 + static int _sched_setscheduler(struct task_struct *p, int policy, 3177 + const struct sched_param *param, bool check) 3178 + { 3179 + struct sched_attr attr = { 3180 + .sched_policy = policy, 3181 + .sched_priority = param->sched_priority, 3182 + .sched_nice = PRIO_TO_NICE(p->static_prio), 3183 + }; 3184 + 3185 + /* 3186 + * Fixup the legacy SCHED_RESET_ON_FORK hack 3187 + */ 3188 + if (policy & SCHED_RESET_ON_FORK) { 3189 + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3190 + policy &= ~SCHED_RESET_ON_FORK; 3191 + attr.sched_policy = policy; 3192 + } 3193 + 3194 + return __sched_setscheduler(p, &attr, check); 3195 + } 3449 3196 /** 3450 3197 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3451 3198 * @p: the task in question. ··· 3479 3186 int sched_setscheduler(struct task_struct *p, int policy, 3480 3187 const struct sched_param *param) 3481 3188 { 3482 - return __sched_setscheduler(p, policy, param, true); 3189 + return _sched_setscheduler(p, policy, param, true); 3483 3190 } 3484 3191 EXPORT_SYMBOL_GPL(sched_setscheduler); 3192 + 3193 + int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 3194 + { 3195 + return __sched_setscheduler(p, attr, true); 3196 + } 3197 + EXPORT_SYMBOL_GPL(sched_setattr); 3485 3198 3486 3199 /** 3487 3200 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 
··· 3505 3206 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3506 3207 const struct sched_param *param) 3507 3208 { 3508 - return __sched_setscheduler(p, policy, param, false); 3209 + return _sched_setscheduler(p, policy, param, false); 3509 3210 } 3510 3211 3511 3212 static int ··· 3528 3229 rcu_read_unlock(); 3529 3230 3530 3231 return retval; 3232 + } 3233 + 3234 + /* 3235 + * Mimics kernel/events/core.c perf_copy_attr(). 3236 + */ 3237 + static int sched_copy_attr(struct sched_attr __user *uattr, 3238 + struct sched_attr *attr) 3239 + { 3240 + u32 size; 3241 + int ret; 3242 + 3243 + if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 3244 + return -EFAULT; 3245 + 3246 + /* 3247 + * zero the full structure, so that a short copy will be nice. 3248 + */ 3249 + memset(attr, 0, sizeof(*attr)); 3250 + 3251 + ret = get_user(size, &uattr->size); 3252 + if (ret) 3253 + return ret; 3254 + 3255 + if (size > PAGE_SIZE) /* silly large */ 3256 + goto err_size; 3257 + 3258 + if (!size) /* abi compat */ 3259 + size = SCHED_ATTR_SIZE_VER0; 3260 + 3261 + if (size < SCHED_ATTR_SIZE_VER0) 3262 + goto err_size; 3263 + 3264 + /* 3265 + * If we're handed a bigger struct than we know of, 3266 + * ensure all the unknown bits are 0 - i.e. new 3267 + * user-space does not rely on any kernel feature 3268 + * extensions we dont know about yet. 
3269 + */ 3270 + if (size > sizeof(*attr)) { 3271 + unsigned char __user *addr; 3272 + unsigned char __user *end; 3273 + unsigned char val; 3274 + 3275 + addr = (void __user *)uattr + sizeof(*attr); 3276 + end = (void __user *)uattr + size; 3277 + 3278 + for (; addr < end; addr++) { 3279 + ret = get_user(val, addr); 3280 + if (ret) 3281 + return ret; 3282 + if (val) 3283 + goto err_size; 3284 + } 3285 + size = sizeof(*attr); 3286 + } 3287 + 3288 + ret = copy_from_user(attr, uattr, size); 3289 + if (ret) 3290 + return -EFAULT; 3291 + 3292 + /* 3293 + * XXX: do we want to be lenient like existing syscalls; or do we want 3294 + * to be strict and return an error on out-of-bounds values? 3295 + */ 3296 + attr->sched_nice = clamp(attr->sched_nice, -20, 19); 3297 + 3298 + out: 3299 + return ret; 3300 + 3301 + err_size: 3302 + put_user(sizeof(*attr), &uattr->size); 3303 + ret = -E2BIG; 3304 + goto out; 3531 3305 } 3532 3306 3533 3307 /** ··· 3631 3259 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3632 3260 { 3633 3261 return do_sched_setscheduler(pid, -1, param); 3262 + } 3263 + 3264 + /** 3265 + * sys_sched_setattr - same as above, but with extended sched_attr 3266 + * @pid: the pid in question. 3267 + * @uattr: structure containing the extended parameters. 
3268 + */ 3269 + SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) 3270 + { 3271 + struct sched_attr attr; 3272 + struct task_struct *p; 3273 + int retval; 3274 + 3275 + if (!uattr || pid < 0) 3276 + return -EINVAL; 3277 + 3278 + if (sched_copy_attr(uattr, &attr)) 3279 + return -EFAULT; 3280 + 3281 + rcu_read_lock(); 3282 + retval = -ESRCH; 3283 + p = find_process_by_pid(pid); 3284 + if (p != NULL) 3285 + retval = sched_setattr(p, &attr); 3286 + rcu_read_unlock(); 3287 + 3288 + return retval; 3634 3289 } 3635 3290 3636 3291 /** ··· 3715 3316 if (retval) 3716 3317 goto out_unlock; 3717 3318 3319 + if (task_has_dl_policy(p)) { 3320 + retval = -EINVAL; 3321 + goto out_unlock; 3322 + } 3718 3323 lp.sched_priority = p->rt_priority; 3719 3324 rcu_read_unlock(); 3720 3325 ··· 3727 3324 */ 3728 3325 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3729 3326 3327 + return retval; 3328 + 3329 + out_unlock: 3330 + rcu_read_unlock(); 3331 + return retval; 3332 + } 3333 + 3334 + static int sched_read_attr(struct sched_attr __user *uattr, 3335 + struct sched_attr *attr, 3336 + unsigned int usize) 3337 + { 3338 + int ret; 3339 + 3340 + if (!access_ok(VERIFY_WRITE, uattr, usize)) 3341 + return -EFAULT; 3342 + 3343 + /* 3344 + * If we're handed a smaller struct than we know of, 3345 + * ensure all the unknown bits are 0 - i.e. old 3346 + * user-space does not get uncomplete information. 
3347 + */ 3348 + if (usize < sizeof(*attr)) { 3349 + unsigned char *addr; 3350 + unsigned char *end; 3351 + 3352 + addr = (void *)attr + usize; 3353 + end = (void *)attr + sizeof(*attr); 3354 + 3355 + for (; addr < end; addr++) { 3356 + if (*addr) 3357 + goto err_size; 3358 + } 3359 + 3360 + attr->size = usize; 3361 + } 3362 + 3363 + ret = copy_to_user(uattr, attr, usize); 3364 + if (ret) 3365 + return -EFAULT; 3366 + 3367 + out: 3368 + return ret; 3369 + 3370 + err_size: 3371 + ret = -E2BIG; 3372 + goto out; 3373 + } 3374 + 3375 + /** 3376 + * sys_sched_getattr - similar to sched_getparam, but with sched_attr 3377 + * @pid: the pid in question. 3378 + * @uattr: structure containing the extended parameters. 3379 + * @size: sizeof(attr) for fwd/bwd comp. 3380 + */ 3381 + SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3382 + unsigned int, size) 3383 + { 3384 + struct sched_attr attr = { 3385 + .size = sizeof(struct sched_attr), 3386 + }; 3387 + struct task_struct *p; 3388 + int retval; 3389 + 3390 + if (!uattr || pid < 0 || size > PAGE_SIZE || 3391 + size < SCHED_ATTR_SIZE_VER0) 3392 + return -EINVAL; 3393 + 3394 + rcu_read_lock(); 3395 + p = find_process_by_pid(pid); 3396 + retval = -ESRCH; 3397 + if (!p) 3398 + goto out_unlock; 3399 + 3400 + retval = security_task_getscheduler(p); 3401 + if (retval) 3402 + goto out_unlock; 3403 + 3404 + attr.sched_policy = p->policy; 3405 + if (p->sched_reset_on_fork) 3406 + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3407 + if (task_has_dl_policy(p)) 3408 + __getparam_dl(p, &attr); 3409 + else if (task_has_rt_policy(p)) 3410 + attr.sched_priority = p->rt_priority; 3411 + else 3412 + attr.sched_nice = TASK_NICE(p); 3413 + 3414 + rcu_read_unlock(); 3415 + 3416 + retval = sched_read_attr(uattr, &attr, size); 3730 3417 return retval; 3731 3418 3732 3419 out_unlock: ··· 3868 3375 if (retval) 3869 3376 goto out_unlock; 3870 3377 3378 + 3871 3379 cpuset_cpus_allowed(p, cpus_allowed); 3872 3380 
cpumask_and(new_mask, in_mask, cpus_allowed); 3381 + 3382 + /* 3383 + * Since bandwidth control happens on root_domain basis, 3384 + * if admission test is enabled, we only admit -deadline 3385 + * tasks allowed to run on all the CPUs in the task's 3386 + * root_domain. 3387 + */ 3388 + #ifdef CONFIG_SMP 3389 + if (task_has_dl_policy(p)) { 3390 + const struct cpumask *span = task_rq(p)->rd->span; 3391 + 3392 + if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { 3393 + retval = -EBUSY; 3394 + goto out_unlock; 3395 + } 3396 + } 3397 + #endif 3873 3398 again: 3874 3399 retval = set_cpus_allowed_ptr(p, new_mask); 3875 3400 ··· 4164 3653 } 4165 3654 4166 3655 double_rq_lock(rq, p_rq); 4167 - while (task_rq(p) != p_rq) { 3656 + if (task_rq(p) != p_rq) { 4168 3657 double_rq_unlock(rq, p_rq); 4169 3658 goto again; 4170 3659 } ··· 4253 3742 case SCHED_RR: 4254 3743 ret = MAX_USER_RT_PRIO-1; 4255 3744 break; 3745 + case SCHED_DEADLINE: 4256 3746 case SCHED_NORMAL: 4257 3747 case SCHED_BATCH: 4258 3748 case SCHED_IDLE: ··· 4280 3768 case SCHED_RR: 4281 3769 ret = 1; 4282 3770 break; 3771 + case SCHED_DEADLINE: 4283 3772 case SCHED_NORMAL: 4284 3773 case SCHED_BATCH: 4285 3774 case SCHED_IDLE: ··· 5027 4514 static int sched_cpu_inactive(struct notifier_block *nfb, 5028 4515 unsigned long action, void *hcpu) 5029 4516 { 4517 + unsigned long flags; 4518 + long cpu = (long)hcpu; 4519 + 5030 4520 switch (action & ~CPU_TASKS_FROZEN) { 5031 4521 case CPU_DOWN_PREPARE: 5032 - set_cpu_active((long)hcpu, false); 4522 + set_cpu_active(cpu, false); 4523 + 4524 + /* explicitly allow suspend */ 4525 + if (!(action & CPU_TASKS_FROZEN)) { 4526 + struct dl_bw *dl_b = dl_bw_of(cpu); 4527 + bool overflow; 4528 + int cpus; 4529 + 4530 + raw_spin_lock_irqsave(&dl_b->lock, flags); 4531 + cpus = dl_bw_cpus(cpu); 4532 + overflow = __dl_overflow(dl_b, cpus, 0, 0); 4533 + raw_spin_unlock_irqrestore(&dl_b->lock, flags); 4534 + 4535 + if (overflow) 4536 + return notifier_from_errno(-EBUSY); 
4537 + } 5033 4538 return NOTIFY_OK; 5034 - default: 5035 - return NOTIFY_DONE; 5036 4539 } 4540 + 4541 + return NOTIFY_DONE; 5037 4542 } 5038 4543 5039 4544 static int __init migration_init(void) ··· 5270 4739 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5271 4740 5272 4741 cpupri_cleanup(&rd->cpupri); 4742 + cpudl_cleanup(&rd->cpudl); 4743 + free_cpumask_var(rd->dlo_mask); 5273 4744 free_cpumask_var(rd->rto_mask); 5274 4745 free_cpumask_var(rd->online); 5275 4746 free_cpumask_var(rd->span); ··· 5323 4790 goto out; 5324 4791 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5325 4792 goto free_span; 5326 - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 4793 + if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) 5327 4794 goto free_online; 4795 + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 4796 + goto free_dlo_mask; 4797 + 4798 + init_dl_bw(&rd->dl_bw); 4799 + if (cpudl_init(&rd->cpudl) != 0) 4800 + goto free_dlo_mask; 5328 4801 5329 4802 if (cpupri_init(&rd->cpupri) != 0) 5330 4803 goto free_rto_mask; ··· 5338 4799 5339 4800 free_rto_mask: 5340 4801 free_cpumask_var(rd->rto_mask); 4802 + free_dlo_mask: 4803 + free_cpumask_var(rd->dlo_mask); 5341 4804 free_online: 5342 4805 free_cpumask_var(rd->online); 5343 4806 free_span: ··· 6691 6150 free_cpumask_var(non_isolated_cpus); 6692 6151 6693 6152 init_sched_rt_class(); 6153 + init_sched_dl_class(); 6694 6154 } 6695 6155 #else 6696 6156 void __init sched_init_smp(void) ··· 6761 6219 #endif /* CONFIG_CPUMASK_OFFSTACK */ 6762 6220 } 6763 6221 6222 + init_rt_bandwidth(&def_rt_bandwidth, 6223 + global_rt_period(), global_rt_runtime()); 6224 + init_dl_bandwidth(&def_dl_bandwidth, 6225 + global_rt_period(), global_rt_runtime()); 6226 + 6764 6227 #ifdef CONFIG_SMP 6765 6228 init_defrootdomain(); 6766 6229 #endif 6767 - 6768 - init_rt_bandwidth(&def_rt_bandwidth, 6769 - global_rt_period(), global_rt_runtime()); 6770 6230 6771 6231 #ifdef CONFIG_RT_GROUP_SCHED 6772 6232 
init_rt_bandwidth(&root_task_group.rt_bandwidth, ··· 6793 6249 rq->calc_load_update = jiffies + LOAD_FREQ; 6794 6250 init_cfs_rq(&rq->cfs); 6795 6251 init_rt_rq(&rq->rt, rq); 6252 + init_dl_rq(&rq->dl, rq); 6796 6253 #ifdef CONFIG_FAIR_GROUP_SCHED 6797 6254 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6798 6255 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); ··· 6863 6318 6864 6319 #ifdef CONFIG_PREEMPT_NOTIFIERS 6865 6320 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6866 - #endif 6867 - 6868 - #ifdef CONFIG_RT_MUTEXES 6869 - plist_head_init(&init_task.pi_waiters); 6870 6321 #endif 6871 6322 6872 6323 /* ··· 6938 6397 static void normalize_task(struct rq *rq, struct task_struct *p) 6939 6398 { 6940 6399 const struct sched_class *prev_class = p->sched_class; 6400 + struct sched_attr attr = { 6401 + .sched_policy = SCHED_NORMAL, 6402 + }; 6941 6403 int old_prio = p->prio; 6942 6404 int on_rq; 6943 6405 6944 6406 on_rq = p->on_rq; 6945 6407 if (on_rq) 6946 6408 dequeue_task(rq, p, 0); 6947 - __setscheduler(rq, p, SCHED_NORMAL, 0); 6409 + __setscheduler(rq, p, &attr); 6948 6410 if (on_rq) { 6949 6411 enqueue_task(rq, p, 0); 6950 6412 resched_task(rq->curr); ··· 6977 6433 p->se.statistics.block_start = 0; 6978 6434 #endif 6979 6435 6980 - if (!rt_task(p)) { 6436 + if (!dl_task(p) && !rt_task(p)) { 6981 6437 /* 6982 6438 * Renice negative nice level userspace 6983 6439 * tasks back to 0: ··· 7172 6628 } 7173 6629 #endif /* CONFIG_CGROUP_SCHED */ 7174 6630 7175 - #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7176 - static unsigned long to_ratio(u64 period, u64 runtime) 7177 - { 7178 - if (runtime == RUNTIME_INF) 7179 - return 1ULL << 20; 7180 - 7181 - return div64_u64(runtime << 20, period); 7182 - } 7183 - #endif 7184 - 7185 6631 #ifdef CONFIG_RT_GROUP_SCHED 7186 6632 /* 7187 6633 * Ensure that the real time constraints are schedulable. 
··· 7345 6811 do_div(rt_period_us, NSEC_PER_USEC); 7346 6812 return rt_period_us; 7347 6813 } 6814 + #endif /* CONFIG_RT_GROUP_SCHED */ 7348 6815 6816 + #ifdef CONFIG_RT_GROUP_SCHED 7349 6817 static int sched_rt_global_constraints(void) 7350 6818 { 7351 - u64 runtime, period; 7352 6819 int ret = 0; 7353 - 7354 - if (sysctl_sched_rt_period <= 0) 7355 - return -EINVAL; 7356 - 7357 - runtime = global_rt_runtime(); 7358 - period = global_rt_period(); 7359 - 7360 - /* 7361 - * Sanity check on the sysctl variables. 7362 - */ 7363 - if (runtime > period && runtime != RUNTIME_INF) 7364 - return -EINVAL; 7365 6820 7366 6821 mutex_lock(&rt_constraints_mutex); 7367 6822 read_lock(&tasklist_lock); ··· 7374 6851 static int sched_rt_global_constraints(void) 7375 6852 { 7376 6853 unsigned long flags; 7377 - int i; 7378 - 7379 - if (sysctl_sched_rt_period <= 0) 7380 - return -EINVAL; 7381 - 7382 - /* 7383 - * There's always some RT tasks in the root group 7384 - * -- migration, kstopmachine etc.. 7385 - */ 7386 - if (sysctl_sched_rt_runtime == 0) 7387 - return -EBUSY; 6854 + int i, ret = 0; 7388 6855 7389 6856 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7390 6857 for_each_possible_cpu(i) { ··· 7386 6873 } 7387 6874 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7388 6875 7389 - return 0; 6876 + return ret; 7390 6877 } 7391 6878 #endif /* CONFIG_RT_GROUP_SCHED */ 6879 + 6880 + static int sched_dl_global_constraints(void) 6881 + { 6882 + u64 runtime = global_rt_runtime(); 6883 + u64 period = global_rt_period(); 6884 + u64 new_bw = to_ratio(period, runtime); 6885 + int cpu, ret = 0; 6886 + 6887 + /* 6888 + * Here we want to check the bandwidth not being set to some 6889 + * value smaller than the currently allocated bandwidth in 6890 + * any of the root_domains. 6891 + * 6892 + * FIXME: Cycling on all the CPUs is overdoing, but simpler than 6893 + * cycling on root_domains... Discussion on different/better 6894 + * solutions is welcome! 
6895 + */ 6896 + for_each_possible_cpu(cpu) { 6897 + struct dl_bw *dl_b = dl_bw_of(cpu); 6898 + 6899 + raw_spin_lock(&dl_b->lock); 6900 + if (new_bw < dl_b->total_bw) 6901 + ret = -EBUSY; 6902 + raw_spin_unlock(&dl_b->lock); 6903 + 6904 + if (ret) 6905 + break; 6906 + } 6907 + 6908 + return ret; 6909 + } 6910 + 6911 + static void sched_dl_do_global(void) 6912 + { 6913 + u64 new_bw = -1; 6914 + int cpu; 6915 + 6916 + def_dl_bandwidth.dl_period = global_rt_period(); 6917 + def_dl_bandwidth.dl_runtime = global_rt_runtime(); 6918 + 6919 + if (global_rt_runtime() != RUNTIME_INF) 6920 + new_bw = to_ratio(global_rt_period(), global_rt_runtime()); 6921 + 6922 + /* 6923 + * FIXME: As above... 6924 + */ 6925 + for_each_possible_cpu(cpu) { 6926 + struct dl_bw *dl_b = dl_bw_of(cpu); 6927 + 6928 + raw_spin_lock(&dl_b->lock); 6929 + dl_b->bw = new_bw; 6930 + raw_spin_unlock(&dl_b->lock); 6931 + } 6932 + } 6933 + 6934 + static int sched_rt_global_validate(void) 6935 + { 6936 + if (sysctl_sched_rt_period <= 0) 6937 + return -EINVAL; 6938 + 6939 + if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) 6940 + return -EINVAL; 6941 + 6942 + return 0; 6943 + } 6944 + 6945 + static void sched_rt_do_global(void) 6946 + { 6947 + def_rt_bandwidth.rt_runtime = global_rt_runtime(); 6948 + def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 6949 + } 6950 + 6951 + int sched_rt_handler(struct ctl_table *table, int write, 6952 + void __user *buffer, size_t *lenp, 6953 + loff_t *ppos) 6954 + { 6955 + int old_period, old_runtime; 6956 + static DEFINE_MUTEX(mutex); 6957 + int ret; 6958 + 6959 + mutex_lock(&mutex); 6960 + old_period = sysctl_sched_rt_period; 6961 + old_runtime = sysctl_sched_rt_runtime; 6962 + 6963 + ret = proc_dointvec(table, write, buffer, lenp, ppos); 6964 + 6965 + if (!ret && write) { 6966 + ret = sched_rt_global_validate(); 6967 + if (ret) 6968 + goto undo; 6969 + 6970 + ret = sched_rt_global_constraints(); 6971 + if (ret) 6972 + goto undo; 6973 + 6974 + ret = 
sched_dl_global_constraints(); 6975 + if (ret) 6976 + goto undo; 6977 + 6978 + sched_rt_do_global(); 6979 + sched_dl_do_global(); 6980 + } 6981 + if (0) { 6982 + undo: 6983 + sysctl_sched_rt_period = old_period; 6984 + sysctl_sched_rt_runtime = old_runtime; 6985 + } 6986 + mutex_unlock(&mutex); 6987 + 6988 + return ret; 6989 + } 7392 6990 7393 6991 int sched_rr_handler(struct ctl_table *table, int write, 7394 6992 void __user *buffer, size_t *lenp, ··· 7517 6893 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7518 6894 } 7519 6895 mutex_unlock(&mutex); 7520 - return ret; 7521 - } 7522 - 7523 - int sched_rt_handler(struct ctl_table *table, int write, 7524 - void __user *buffer, size_t *lenp, 7525 - loff_t *ppos) 7526 - { 7527 - int ret; 7528 - int old_period, old_runtime; 7529 - static DEFINE_MUTEX(mutex); 7530 - 7531 - mutex_lock(&mutex); 7532 - old_period = sysctl_sched_rt_period; 7533 - old_runtime = sysctl_sched_rt_runtime; 7534 - 7535 - ret = proc_dointvec(table, write, buffer, lenp, ppos); 7536 - 7537 - if (!ret && write) { 7538 - ret = sched_rt_global_constraints(); 7539 - if (ret) { 7540 - sysctl_sched_rt_period = old_period; 7541 - sysctl_sched_rt_runtime = old_runtime; 7542 - } else { 7543 - def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7544 - def_rt_bandwidth.rt_period = 7545 - ns_to_ktime(global_rt_period()); 7546 - } 7547 - } 7548 - mutex_unlock(&mutex); 7549 - 7550 6896 return ret; 7551 6897 } 7552 6898
+216
kernel/sched/cpudeadline.c
··· 1 + /* 2 + * kernel/sched/cpudl.c 3 + * 4 + * Global CPU deadline management 5 + * 6 + * Author: Juri Lelli <j.lelli@sssup.it> 7 + * 8 + * This program is free software; you can redistribute it and/or 9 + * modify it under the terms of the GNU General Public License 10 + * as published by the Free Software Foundation; version 2 11 + * of the License. 12 + */ 13 + 14 + #include <linux/gfp.h> 15 + #include <linux/kernel.h> 16 + #include "cpudeadline.h" 17 + 18 + static inline int parent(int i) 19 + { 20 + return (i - 1) >> 1; 21 + } 22 + 23 + static inline int left_child(int i) 24 + { 25 + return (i << 1) + 1; 26 + } 27 + 28 + static inline int right_child(int i) 29 + { 30 + return (i << 1) + 2; 31 + } 32 + 33 + static inline int dl_time_before(u64 a, u64 b) 34 + { 35 + return (s64)(a - b) < 0; 36 + } 37 + 38 + static void cpudl_exchange(struct cpudl *cp, int a, int b) 39 + { 40 + int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 41 + 42 + swap(cp->elements[a], cp->elements[b]); 43 + swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); 44 + } 45 + 46 + static void cpudl_heapify(struct cpudl *cp, int idx) 47 + { 48 + int l, r, largest; 49 + 50 + /* adapted from lib/prio_heap.c */ 51 + while(1) { 52 + l = left_child(idx); 53 + r = right_child(idx); 54 + largest = idx; 55 + 56 + if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, 57 + cp->elements[l].dl)) 58 + largest = l; 59 + if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, 60 + cp->elements[r].dl)) 61 + largest = r; 62 + if (largest == idx) 63 + break; 64 + 65 + /* Push idx down the heap one level and bump one up */ 66 + cpudl_exchange(cp, largest, idx); 67 + idx = largest; 68 + } 69 + } 70 + 71 + static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) 72 + { 73 + WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); 74 + 75 + if (dl_time_before(new_dl, cp->elements[idx].dl)) { 76 + cp->elements[idx].dl = new_dl; 77 + cpudl_heapify(cp, idx); 78 + } else { 79 + 
cp->elements[idx].dl = new_dl; 80 + while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, 81 + cp->elements[idx].dl)) { 82 + cpudl_exchange(cp, idx, parent(idx)); 83 + idx = parent(idx); 84 + } 85 + } 86 + } 87 + 88 + static inline int cpudl_maximum(struct cpudl *cp) 89 + { 90 + return cp->elements[0].cpu; 91 + } 92 + 93 + /* 94 + * cpudl_find - find the best (later-dl) CPU in the system 95 + * @cp: the cpudl max-heap context 96 + * @p: the task 97 + * @later_mask: a mask to fill in with the selected CPUs (or NULL) 98 + * 99 + * Returns: int - best CPU (heap maximum if suitable) 100 + */ 101 + int cpudl_find(struct cpudl *cp, struct task_struct *p, 102 + struct cpumask *later_mask) 103 + { 104 + int best_cpu = -1; 105 + const struct sched_dl_entity *dl_se = &p->dl; 106 + 107 + if (later_mask && cpumask_and(later_mask, cp->free_cpus, 108 + &p->cpus_allowed) && cpumask_and(later_mask, 109 + later_mask, cpu_active_mask)) { 110 + best_cpu = cpumask_any(later_mask); 111 + goto out; 112 + } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 113 + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 114 + best_cpu = cpudl_maximum(cp); 115 + if (later_mask) 116 + cpumask_set_cpu(best_cpu, later_mask); 117 + } 118 + 119 + out: 120 + WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); 121 + 122 + return best_cpu; 123 + } 124 + 125 + /* 126 + * cpudl_set - update the cpudl max-heap 127 + * @cp: the cpudl max-heap context 128 + * @cpu: the target cpu 129 + * @dl: the new earliest deadline for this cpu 130 + * 131 + * Notes: assumes cpu_rq(cpu)->lock is locked 132 + * 133 + * Returns: (void) 134 + */ 135 + void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) 136 + { 137 + int old_idx, new_cpu; 138 + unsigned long flags; 139 + 140 + WARN_ON(cpu > num_present_cpus()); 141 + 142 + raw_spin_lock_irqsave(&cp->lock, flags); 143 + old_idx = cp->cpu_to_idx[cpu]; 144 + if (!is_valid) { 145 + /* remove item */ 146 + if (old_idx == IDX_INVALID) { 
147 + /* 148 + * Nothing to remove if old_idx was invalid. 149 + * This could happen if a rq_offline_dl is 150 + * called for a CPU without -dl tasks running. 151 + */ 152 + goto out; 153 + } 154 + new_cpu = cp->elements[cp->size - 1].cpu; 155 + cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; 156 + cp->elements[old_idx].cpu = new_cpu; 157 + cp->size--; 158 + cp->cpu_to_idx[new_cpu] = old_idx; 159 + cp->cpu_to_idx[cpu] = IDX_INVALID; 160 + while (old_idx > 0 && dl_time_before( 161 + cp->elements[parent(old_idx)].dl, 162 + cp->elements[old_idx].dl)) { 163 + cpudl_exchange(cp, old_idx, parent(old_idx)); 164 + old_idx = parent(old_idx); 165 + } 166 + cpumask_set_cpu(cpu, cp->free_cpus); 167 + cpudl_heapify(cp, old_idx); 168 + 169 + goto out; 170 + } 171 + 172 + if (old_idx == IDX_INVALID) { 173 + cp->size++; 174 + cp->elements[cp->size - 1].dl = 0; 175 + cp->elements[cp->size - 1].cpu = cpu; 176 + cp->cpu_to_idx[cpu] = cp->size - 1; 177 + cpudl_change_key(cp, cp->size - 1, dl); 178 + cpumask_clear_cpu(cpu, cp->free_cpus); 179 + } else { 180 + cpudl_change_key(cp, old_idx, dl); 181 + } 182 + 183 + out: 184 + raw_spin_unlock_irqrestore(&cp->lock, flags); 185 + } 186 + 187 + /* 188 + * cpudl_init - initialize the cpudl structure 189 + * @cp: the cpudl max-heap context 190 + */ 191 + int cpudl_init(struct cpudl *cp) 192 + { 193 + int i; 194 + 195 + memset(cp, 0, sizeof(*cp)); 196 + raw_spin_lock_init(&cp->lock); 197 + cp->size = 0; 198 + for (i = 0; i < NR_CPUS; i++) 199 + cp->cpu_to_idx[i] = IDX_INVALID; 200 + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) 201 + return -ENOMEM; 202 + cpumask_setall(cp->free_cpus); 203 + 204 + return 0; 205 + } 206 + 207 + /* 208 + * cpudl_cleanup - clean up the cpudl structure 209 + * @cp: the cpudl max-heap context 210 + */ 211 + void cpudl_cleanup(struct cpudl *cp) 212 + { 213 + /* 214 + * nothing to do for the moment 215 + */ 216 + }
+33
kernel/sched/cpudeadline.h
··· 1 + #ifndef _LINUX_CPUDL_H 2 + #define _LINUX_CPUDL_H 3 + 4 + #include <linux/sched.h> 5 + 6 + #define IDX_INVALID -1 7 + 8 + struct array_item { 9 + u64 dl; 10 + int cpu; 11 + }; 12 + 13 + struct cpudl { 14 + raw_spinlock_t lock; 15 + int size; 16 + int cpu_to_idx[NR_CPUS]; 17 + struct array_item elements[NR_CPUS]; 18 + cpumask_var_t free_cpus; 19 + }; 20 + 21 + 22 + #ifdef CONFIG_SMP 23 + int cpudl_find(struct cpudl *cp, struct task_struct *p, 24 + struct cpumask *later_mask); 25 + void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 26 + int cpudl_init(struct cpudl *cp); 27 + void cpudl_cleanup(struct cpudl *cp); 28 + #else 29 + #define cpudl_set(cp, cpu, dl) do { } while (0) 30 + #define cpudl_init() do { } while (0) 31 + #endif /* CONFIG_SMP */ 32 + 33 + #endif /* _LINUX_CPUDL_H */
+1640
kernel/sched/deadline.c
··· 1 + /* 2 + * Deadline Scheduling Class (SCHED_DEADLINE) 3 + * 4 + * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS). 5 + * 6 + * Tasks that periodically executes their instances for less than their 7 + * runtime won't miss any of their deadlines. 8 + * Tasks that are not periodic or sporadic or that tries to execute more 9 + * than their reserved bandwidth will be slowed down (and may potentially 10 + * miss some of their deadlines), and won't affect any other task. 11 + * 12 + * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>, 13 + * Juri Lelli <juri.lelli@gmail.com>, 14 + * Michael Trimarchi <michael@amarulasolutions.com>, 15 + * Fabio Checconi <fchecconi@gmail.com> 16 + */ 17 + #include "sched.h" 18 + 19 + #include <linux/slab.h> 20 + 21 + struct dl_bandwidth def_dl_bandwidth; 22 + 23 + static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) 24 + { 25 + return container_of(dl_se, struct task_struct, dl); 26 + } 27 + 28 + static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) 29 + { 30 + return container_of(dl_rq, struct rq, dl); 31 + } 32 + 33 + static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) 34 + { 35 + struct task_struct *p = dl_task_of(dl_se); 36 + struct rq *rq = task_rq(p); 37 + 38 + return &rq->dl; 39 + } 40 + 41 + static inline int on_dl_rq(struct sched_dl_entity *dl_se) 42 + { 43 + return !RB_EMPTY_NODE(&dl_se->rb_node); 44 + } 45 + 46 + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) 47 + { 48 + struct sched_dl_entity *dl_se = &p->dl; 49 + 50 + return dl_rq->rb_leftmost == &dl_se->rb_node; 51 + } 52 + 53 + void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) 54 + { 55 + raw_spin_lock_init(&dl_b->dl_runtime_lock); 56 + dl_b->dl_period = period; 57 + dl_b->dl_runtime = runtime; 58 + } 59 + 60 + extern unsigned long to_ratio(u64 period, u64 runtime); 61 + 62 + void init_dl_bw(struct dl_bw *dl_b) 63 + { 64 + 
raw_spin_lock_init(&dl_b->lock); 65 + raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); 66 + if (global_rt_runtime() == RUNTIME_INF) 67 + dl_b->bw = -1; 68 + else 69 + dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); 70 + raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); 71 + dl_b->total_bw = 0; 72 + } 73 + 74 + void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) 75 + { 76 + dl_rq->rb_root = RB_ROOT; 77 + 78 + #ifdef CONFIG_SMP 79 + /* zero means no -deadline tasks */ 80 + dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; 81 + 82 + dl_rq->dl_nr_migratory = 0; 83 + dl_rq->overloaded = 0; 84 + dl_rq->pushable_dl_tasks_root = RB_ROOT; 85 + #else 86 + init_dl_bw(&dl_rq->dl_bw); 87 + #endif 88 + } 89 + 90 + #ifdef CONFIG_SMP 91 + 92 + static inline int dl_overloaded(struct rq *rq) 93 + { 94 + return atomic_read(&rq->rd->dlo_count); 95 + } 96 + 97 + static inline void dl_set_overload(struct rq *rq) 98 + { 99 + if (!rq->online) 100 + return; 101 + 102 + cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask); 103 + /* 104 + * Must be visible before the overload count is 105 + * set (as in sched_rt.c). 106 + * 107 + * Matched by the barrier in pull_dl_task(). 
108 + */ 109 + smp_wmb(); 110 + atomic_inc(&rq->rd->dlo_count); 111 + } 112 + 113 + static inline void dl_clear_overload(struct rq *rq) 114 + { 115 + if (!rq->online) 116 + return; 117 + 118 + atomic_dec(&rq->rd->dlo_count); 119 + cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); 120 + } 121 + 122 + static void update_dl_migration(struct dl_rq *dl_rq) 123 + { 124 + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { 125 + if (!dl_rq->overloaded) { 126 + dl_set_overload(rq_of_dl_rq(dl_rq)); 127 + dl_rq->overloaded = 1; 128 + } 129 + } else if (dl_rq->overloaded) { 130 + dl_clear_overload(rq_of_dl_rq(dl_rq)); 131 + dl_rq->overloaded = 0; 132 + } 133 + } 134 + 135 + static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) 136 + { 137 + struct task_struct *p = dl_task_of(dl_se); 138 + dl_rq = &rq_of_dl_rq(dl_rq)->dl; 139 + 140 + dl_rq->dl_nr_total++; 141 + if (p->nr_cpus_allowed > 1) 142 + dl_rq->dl_nr_migratory++; 143 + 144 + update_dl_migration(dl_rq); 145 + } 146 + 147 + static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) 148 + { 149 + struct task_struct *p = dl_task_of(dl_se); 150 + dl_rq = &rq_of_dl_rq(dl_rq)->dl; 151 + 152 + dl_rq->dl_nr_total--; 153 + if (p->nr_cpus_allowed > 1) 154 + dl_rq->dl_nr_migratory--; 155 + 156 + update_dl_migration(dl_rq); 157 + } 158 + 159 + /* 160 + * The list of pushable -deadline task is not a plist, like in 161 + * sched_rt.c, it is an rb-tree with tasks ordered by deadline. 
162 + */ 163 + static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) 164 + { 165 + struct dl_rq *dl_rq = &rq->dl; 166 + struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; 167 + struct rb_node *parent = NULL; 168 + struct task_struct *entry; 169 + int leftmost = 1; 170 + 171 + BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); 172 + 173 + while (*link) { 174 + parent = *link; 175 + entry = rb_entry(parent, struct task_struct, 176 + pushable_dl_tasks); 177 + if (dl_entity_preempt(&p->dl, &entry->dl)) 178 + link = &parent->rb_left; 179 + else { 180 + link = &parent->rb_right; 181 + leftmost = 0; 182 + } 183 + } 184 + 185 + if (leftmost) 186 + dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; 187 + 188 + rb_link_node(&p->pushable_dl_tasks, parent, link); 189 + rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); 190 + } 191 + 192 + static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) 193 + { 194 + struct dl_rq *dl_rq = &rq->dl; 195 + 196 + if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) 197 + return; 198 + 199 + if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { 200 + struct rb_node *next_node; 201 + 202 + next_node = rb_next(&p->pushable_dl_tasks); 203 + dl_rq->pushable_dl_tasks_leftmost = next_node; 204 + } 205 + 206 + rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); 207 + RB_CLEAR_NODE(&p->pushable_dl_tasks); 208 + } 209 + 210 + static inline int has_pushable_dl_tasks(struct rq *rq) 211 + { 212 + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); 213 + } 214 + 215 + static int push_dl_task(struct rq *rq); 216 + 217 + #else 218 + 219 + static inline 220 + void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) 221 + { 222 + } 223 + 224 + static inline 225 + void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) 226 + { 227 + } 228 + 229 + static inline 230 + void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) 231 
+ { 232 + } 233 + 234 + static inline 235 + void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) 236 + { 237 + } 238 + 239 + #endif /* CONFIG_SMP */ 240 + 241 + static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 242 + static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); 243 + static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, 244 + int flags); 245 + 246 + /* 247 + * We are being explicitly informed that a new instance is starting, 248 + * and this means that: 249 + * - the absolute deadline of the entity has to be placed at 250 + * current time + relative deadline; 251 + * - the runtime of the entity has to be set to the maximum value. 252 + * 253 + * The capability of specifying such event is useful whenever a -deadline 254 + * entity wants to (try to!) synchronize its behaviour with the scheduler's 255 + * one, and to (try to!) reconcile itself with its own scheduling 256 + * parameters. 257 + */ 258 + static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, 259 + struct sched_dl_entity *pi_se) 260 + { 261 + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 262 + struct rq *rq = rq_of_dl_rq(dl_rq); 263 + 264 + WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); 265 + 266 + /* 267 + * We use the regular wall clock time to set deadlines in the 268 + * future; in fact, we must consider execution overheads (time 269 + * spent on hardirq context, etc.). 270 + */ 271 + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 272 + dl_se->runtime = pi_se->dl_runtime; 273 + dl_se->dl_new = 0; 274 + } 275 + 276 + /* 277 + * Pure Earliest Deadline First (EDF) scheduling does not deal with the 278 + * possibility of a entity lasting more than what it declared, and thus 279 + * exhausting its runtime. 
280 + * 281 + * Here we are interested in making runtime overrun possible, but we do 282 + * not want a entity which is misbehaving to affect the scheduling of all 283 + * other entities. 284 + * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS) 285 + * is used, in order to confine each entity within its own bandwidth. 286 + * 287 + * This function deals exactly with that, and ensures that when the runtime 288 + * of a entity is replenished, its deadline is also postponed. That ensures 289 + * the overrunning entity can't interfere with other entity in the system and 290 + * can't make them miss their deadlines. Reasons why this kind of overruns 291 + * could happen are, typically, a entity voluntarily trying to overcome its 292 + * runtime, or it just underestimated it during sched_setscheduler_ex(). 293 + */ 294 + static void replenish_dl_entity(struct sched_dl_entity *dl_se, 295 + struct sched_dl_entity *pi_se) 296 + { 297 + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 298 + struct rq *rq = rq_of_dl_rq(dl_rq); 299 + 300 + BUG_ON(pi_se->dl_runtime <= 0); 301 + 302 + /* 303 + * This could be the case for a !-dl task that is boosted. 304 + * Just go with full inherited parameters. 305 + */ 306 + if (dl_se->dl_deadline == 0) { 307 + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 308 + dl_se->runtime = pi_se->dl_runtime; 309 + } 310 + 311 + /* 312 + * We keep moving the deadline away until we get some 313 + * available runtime for the entity. This ensures correct 314 + * handling of situations where the runtime overrun is 315 + * arbitrary large. 316 + */ 317 + while (dl_se->runtime <= 0) { 318 + dl_se->deadline += pi_se->dl_period; 319 + dl_se->runtime += pi_se->dl_runtime; 320 + } 321 + 322 + /* 323 + * At this point, the deadline really should be "in 324 + * the future" with respect to rq->clock. If it's 325 + * not, we are, for some reason, lagging too much! 
326 + * Anyway, after having warn userspace abut that, 327 + * we still try to keep the things running by 328 + * resetting the deadline and the budget of the 329 + * entity. 330 + */ 331 + if (dl_time_before(dl_se->deadline, rq_clock(rq))) { 332 + static bool lag_once = false; 333 + 334 + if (!lag_once) { 335 + lag_once = true; 336 + printk_sched("sched: DL replenish lagged to much\n"); 337 + } 338 + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 339 + dl_se->runtime = pi_se->dl_runtime; 340 + } 341 + } 342 + 343 + /* 344 + * Here we check if --at time t-- an entity (which is probably being 345 + * [re]activated or, in general, enqueued) can use its remaining runtime 346 + * and its current deadline _without_ exceeding the bandwidth it is 347 + * assigned (function returns true if it can't). We are in fact applying 348 + * one of the CBS rules: when a task wakes up, if the residual runtime 349 + * over residual deadline fits within the allocated bandwidth, then we 350 + * can keep the current (absolute) deadline and residual budget without 351 + * disrupting the schedulability of the system. Otherwise, we should 352 + * refill the runtime and set the deadline a period in the future, 353 + * because keeping the current (absolute) deadline of the task would 354 + * result in breaking guarantees promised to other tasks. 355 + * 356 + * This function returns true if: 357 + * 358 + * runtime / (deadline - t) > dl_runtime / dl_period , 359 + * 360 + * IOW we can't recycle current parameters. 361 + * 362 + * Notice that the bandwidth check is done against the period. For 363 + * task with deadline equal to period this is the same of using 364 + * dl_deadline instead of dl_period in the equation above. 
 */
static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
			       struct sched_dl_entity *pi_se, u64 t)
{
	u64 left, right;

	/*
	 * left and right are the two sides of the equation above,
	 * after a bit of shuffling to use multiplications instead
	 * of divisions.
	 *
	 * Note that none of the time values involved in the two
	 * multiplications are absolute: dl_deadline and dl_runtime
	 * are the relative deadline and the maximum runtime of each
	 * instance, runtime is the runtime left for the last instance
	 * and (deadline - t), since t is rq->clock, is the time left
	 * to the (absolute) deadline. Even if overflowing the u64 type
	 * is very unlikely to occur in both cases, here we scale down
	 * as we want to avoid that risk at all. Scaling down by 10
	 * means that we reduce granularity to 1us. We are fine with it,
	 * since this is only a true/false check and, anyway, thinking
	 * of anything below microseconds resolution is actually fiction
	 * (but still we want to give the user that illusion >;).
	 */
	left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
	right = ((dl_se->deadline - t) >> DL_SCALE) *
		(pi_se->dl_runtime >> DL_SCALE);

	return dl_time_before(right, left);
}

/*
 * When a -deadline entity is queued back on the runqueue, its runtime and
 * deadline might need updating.
 *
 * The policy here is that we update the deadline of the entity only if:
 *  - the current deadline is in the past,
 *  - using the remaining runtime with the current deadline would make
 *    the entity exceed its bandwidth.
 */
static void update_dl_entity(struct sched_dl_entity *dl_se,
			     struct sched_dl_entity *pi_se)
{
	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
	struct rq *rq = rq_of_dl_rq(dl_rq);

	/*
	 * The arrival of a new instance needs special treatment, i.e.,
	 * the actual scheduling parameters have to be "renewed".
	 */
	if (dl_se->dl_new) {
		setup_new_dl_entity(dl_se, pi_se);
		return;
	}

	if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
	    dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
		dl_se->runtime = pi_se->dl_runtime;
	}
}

/*
 * If the entity depleted all its runtime, and if we want it to sleep
 * while waiting for some new execution time to become available, we
 * set the bandwidth enforcement timer to the replenishment instant
 * and try to activate it.
 *
 * Notice that it is important for the caller to know if the timer
 * actually started or not (i.e., the replenishment instant is in
 * the future or in the past).
 */
static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
{
	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
	struct rq *rq = rq_of_dl_rq(dl_rq);
	ktime_t now, act;
	ktime_t soft, hard;
	unsigned long range;
	s64 delta;

	/* Boosted (deadline-inherited) entities are never throttled. */
	if (boosted)
		return 0;
	/*
	 * We want the timer to fire at the deadline, but considering
	 * that it is actually coming from rq->clock and not from
	 * hrtimer's time base reading.
	 */
	act = ns_to_ktime(dl_se->deadline);
	now = hrtimer_cb_get_time(&dl_se->dl_timer);
	delta = ktime_to_ns(now) - rq_clock(rq);
	act = ktime_add_ns(act, delta);

	/*
	 * If the expiry time already passed, e.g., because the value
	 * chosen as the deadline is too small, don't even try to
	 * start the timer in the past!
	 */
	if (ktime_us_delta(act, now) < 0)
		return 0;

	hrtimer_set_expires(&dl_se->dl_timer, act);

	soft = hrtimer_get_softexpires(&dl_se->dl_timer);
	hard = hrtimer_get_expires(&dl_se->dl_timer);
	range = ktime_to_ns(ktime_sub(hard, soft));
	__hrtimer_start_range_ns(&dl_se->dl_timer, soft,
				 range, HRTIMER_MODE_ABS, 0);

	return hrtimer_active(&dl_se->dl_timer);
}

/*
 * This is the bandwidth enforcement timer callback. If here, we know
 * a task is not on its dl_rq, since the fact that the timer was running
 * means the task is throttled and needs a runtime replenishment.
 *
 * However, what we actually do depends on the fact the task is active,
 * (it is on its rq) or has been removed from there by a call to
 * dequeue_task_dl(). In the former case we must issue the runtime
 * replenishment and add the task back to the dl_rq; in the latter, we just
 * do nothing but clearing dl_throttled, so that runtime and deadline
 * updating (and the queueing back to dl_rq) will be done by the
 * next call to enqueue_task_dl().
 */
static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
{
	struct sched_dl_entity *dl_se = container_of(timer,
						     struct sched_dl_entity,
						     dl_timer);
	struct task_struct *p = dl_task_of(dl_se);
	struct rq *rq = task_rq(p);
	raw_spin_lock(&rq->lock);

	/*
	 * We need to take care of possible races here. In fact, the
	 * task might have changed its scheduling policy to something
	 * different from SCHED_DEADLINE or changed its reservation
	 * parameters (through sched_setscheduler()).
	 */
	if (!dl_task(p) || dl_se->dl_new)
		goto unlock;

	sched_clock_tick();
	update_rq_clock(rq);
	dl_se->dl_throttled = 0;
	if (p->on_rq) {
		enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
		if (task_has_dl_policy(rq->curr))
			check_preempt_curr_dl(rq, p, 0);
		else
			resched_task(rq->curr);
#ifdef CONFIG_SMP
		/*
		 * Queueing this task back might have overloaded rq,
		 * check if we need to kick someone away.
		 */
		if (has_pushable_dl_tasks(rq))
			push_dl_task(rq);
#endif
	}
unlock:
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

/*
 * Initialize (or, if already armed, try to cancel) the bandwidth
 * enforcement timer of a -deadline entity.
 */
void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
	struct hrtimer *timer = &dl_se->dl_timer;

	if (hrtimer_active(timer)) {
		hrtimer_try_to_cancel(timer);
		return;
	}

	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	timer->function = dl_task_timer;
}

/*
 * Returns 1 if the entity exhausted its runtime (rorun) and/or missed
 * its deadline (dmiss), adjusting the remaining runtime accordingly;
 * 0 if the entity may keep running.
 */
static
int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
{
	int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
	int rorun = dl_se->runtime <= 0;

	if (!rorun && !dmiss)
		return 0;

	/*
	 * If we are beyond our current deadline and we are still
	 * executing, then we have already used some of the runtime of
	 * the next instance. Thus, if we do not account that, we are
	 * stealing bandwidth from the system at each deadline miss!
	 */
	if (dmiss) {
		dl_se->runtime = rorun ? dl_se->runtime : 0;
		dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
	}

	return 1;
}

/*
 * Update the current task's runtime statistics (provided it is still
 * a -deadline task and has not been removed from the dl_rq).
 */
static void update_curr_dl(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_dl_entity *dl_se = &curr->dl;
	u64 delta_exec;

	if (!dl_task(curr) || !on_dl_rq(dl_se))
		return;

	/*
	 * Consumed budget is computed considering the time as
	 * observed by schedulable tasks (excluding time spent
	 * in hardirq context, etc.). Deadlines are instead
	 * computed using hard walltime. This seems to be the more
	 * natural solution, but the full ramifications of this
	 * approach need further study.
	 */
	delta_exec = rq_clock_task(rq) - curr->se.exec_start;
	if (unlikely((s64)delta_exec < 0))
		delta_exec = 0;

	schedstat_set(curr->se.statistics.exec_max,
		      max(curr->se.statistics.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);

	curr->se.exec_start = rq_clock_task(rq);
	cpuacct_charge(curr, delta_exec);

	sched_rt_avg_update(rq, delta_exec);

	dl_se->runtime -= delta_exec;
	if (dl_runtime_exceeded(rq, dl_se)) {
		__dequeue_task_dl(rq, curr, 0);
		if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
			dl_se->dl_throttled = 1;
		else
			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);

		if (!is_leftmost(curr, &rq->dl))
			resched_task(curr);
	}

	/*
	 * Because -- for now -- we share the rt bandwidth, we need to
	 * account our runtime there too, otherwise actual rt tasks
	 * would be able to exceed the shared quota.
	 *
	 * Account to the root rt group for now.
	 *
	 * The solution we're working towards is having the RT groups scheduled
	 * using deadline servers -- however there's a few nasties to figure
	 * out before that can happen.
	 */
	if (rt_bandwidth_enabled()) {
		struct rt_rq *rt_rq = &rq->rt;

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_time += delta_exec;
		/*
		 * We'll let actual RT tasks worry about the overflow here, we
		 * have our own CBS to keep us inline -- see above.
		 */
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
	}
}

#ifdef CONFIG_SMP

static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);

/*
 * Deadline of the second-earliest -deadline task on this rq,
 * or 0 if there is none.
 */
static inline u64 next_deadline(struct rq *rq)
{
	struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);

	if (next && dl_prio(next->prio))
		return next->dl.deadline;
	else
		return 0;
}

/* Account a newly enqueued deadline in the rq's earliest-deadline cache. */
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
{
	struct rq *rq = rq_of_dl_rq(dl_rq);

	if (dl_rq->earliest_dl.curr == 0 ||
	    dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
		/*
		 * If the dl_rq had no -deadline tasks, or if the new task
		 * has shorter deadline than the current one on dl_rq, we
		 * know that the previous earliest becomes our next earliest,
		 * as the new task becomes the earliest itself.
		 */
		dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
		dl_rq->earliest_dl.curr = deadline;
		cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
	} else if (dl_rq->earliest_dl.next == 0 ||
		   dl_time_before(deadline, dl_rq->earliest_dl.next)) {
		/*
		 * On the other hand, if the new -deadline task has
		 * a later deadline than the earliest one on dl_rq, but
		 * it is earlier than the next (if any), we must
		 * recompute the next-earliest.
		 */
		dl_rq->earliest_dl.next = next_deadline(rq);
	}
}

/* Account a removed deadline in the rq's earliest-deadline cache. */
static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
{
	struct rq *rq = rq_of_dl_rq(dl_rq);

	/*
	 * Since we may have removed our earliest (and/or next earliest)
	 * task we must recompute them.
	 */
	if (!dl_rq->dl_nr_running) {
		dl_rq->earliest_dl.curr = 0;
		dl_rq->earliest_dl.next = 0;
		cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
	} else {
		struct rb_node *leftmost = dl_rq->rb_leftmost;
		struct sched_dl_entity *entry;

		entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
		dl_rq->earliest_dl.curr = entry->deadline;
		dl_rq->earliest_dl.next = next_deadline(rq);
		cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
	}
}

#else

static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}

#endif /* CONFIG_SMP */

/* Bookkeeping on enqueue of a -deadline entity. */
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
	int prio = dl_task_of(dl_se)->prio;
	u64 deadline = dl_se->deadline;

	WARN_ON(!dl_prio(prio));
	dl_rq->dl_nr_running++;

	inc_dl_deadline(dl_rq, deadline);
	inc_dl_migration(dl_se, dl_rq);
}

/* Bookkeeping on dequeue of a -deadline entity. */
static inline
void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
	int prio = dl_task_of(dl_se)->prio;

	WARN_ON(!dl_prio(prio));
	WARN_ON(!dl_rq->dl_nr_running);
	dl_rq->dl_nr_running--;

	dec_dl_deadline(dl_rq, dl_se->deadline);
	dec_dl_migration(dl_se, dl_rq);
}

/* Insert the entity in the dl_rq's deadline-ordered rbtree. */
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
	struct rb_node **link = &dl_rq->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct sched_dl_entity *entry;
	int leftmost = 1;

	BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));

	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_dl_entity, rb_node);
		if (dl_time_before(dl_se->deadline, entry->deadline))
			link = &parent->rb_left;
		else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	if (leftmost)
		dl_rq->rb_leftmost = &dl_se->rb_node;

	rb_link_node(&dl_se->rb_node, parent, link);
	rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);

	inc_dl_tasks(dl_se, dl_rq);
}

/* Remove the entity from the dl_rq's rbtree (no-op if not queued). */
static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
{
	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);

	if (RB_EMPTY_NODE(&dl_se->rb_node))
		return;

	if (dl_rq->rb_leftmost == &dl_se->rb_node) {
		struct rb_node *next_node;

		next_node = rb_next(&dl_se->rb_node);
		dl_rq->rb_leftmost = next_node;
	}

	rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
	RB_CLEAR_NODE(&dl_se->rb_node);

	dec_dl_tasks(dl_se, dl_rq);
}

static void
enqueue_dl_entity(struct sched_dl_entity *dl_se,
		  struct sched_dl_entity *pi_se, int flags)
{
	BUG_ON(on_dl_rq(dl_se));

	/*
	 * If this is a wakeup or a new instance, the scheduling
	 * parameters of the task might need updating. Otherwise,
	 * we want a replenishment of its runtime.
	 */
	if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
		replenish_dl_entity(dl_se, pi_se);
	else
		update_dl_entity(dl_se, pi_se);

	__enqueue_dl_entity(dl_se);
}

static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
{
	__dequeue_dl_entity(dl_se);
}

static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
	struct task_struct *pi_task = rt_mutex_get_top_task(p);
	struct sched_dl_entity *pi_se = &p->dl;

	/*
	 * Use the scheduling parameters of the top pi-waiter
	 * task if we have one and its (relative) deadline is
	 * smaller than our one... OTW we keep our runtime and
	 * deadline.
	 */
	if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
		pi_se = &pi_task->dl;

	/*
	 * If p is throttled, we do nothing. In fact, if it exhausted
	 * its budget it needs a replenishment and, since it now is on
	 * its rq, the bandwidth timer callback (which clearly has not
	 * run yet) will take care of this.
	 */
	if (p->dl.dl_throttled)
		return;

	enqueue_dl_entity(&p->dl, pi_se, flags);

	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
		enqueue_pushable_dl_task(rq, p);

	inc_nr_running(rq);
}

static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
	dequeue_dl_entity(&p->dl);
	dequeue_pushable_dl_task(rq, p);
}

static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
	update_curr_dl(rq);
	__dequeue_task_dl(rq, p, flags);

	dec_nr_running(rq);
}

/*
 * Yield task semantic for -deadline tasks is:
 *
 *   get off from the CPU until our next instance, with
 *   a new runtime. This is of little use now, since we
 *   don't have a bandwidth reclaiming mechanism. Anyway,
 *   bandwidth reclaiming is planned for the future, and
 *   yield_task_dl will indicate that some spare budget
 *   is available for other task instances to use it.
 */
static void yield_task_dl(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	/*
	 * We make the task go to sleep until its current deadline by
	 * forcing its runtime to zero. This way, update_curr_dl() stops
	 * it and the bandwidth timer will wake it up and will give it
	 * new scheduling parameters (thanks to dl_new=1).
	 */
	if (p->dl.runtime > 0) {
		rq->curr->dl.dl_new = 1;
		p->dl.runtime = 0;
	}
	update_curr_dl(rq);
}

#ifdef CONFIG_SMP

static int find_later_rq(struct task_struct *task);

static int
select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	struct task_struct *curr;
	struct rq *rq;

	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
		goto out;

	rq = cpu_rq(cpu);

	rcu_read_lock();
	curr = ACCESS_ONCE(rq->curr); /* unlocked access */

	/*
	 * If we are dealing with a -deadline task, we must
	 * decide where to wake it up.
	 * If it has a later deadline and the current task
	 * on this rq can't move (provided the waking task
	 * can!) we prefer to send it somewhere else. On the
	 * other hand, if it has a shorter deadline, we
	 * try to make it stay here, it might be important.
	 */
	if (unlikely(dl_task(curr)) &&
	    (curr->nr_cpus_allowed < 2 ||
	     !dl_entity_preempt(&p->dl, &curr->dl)) &&
	    (p->nr_cpus_allowed > 1)) {
		int target = find_later_rq(p);

		if (target != -1)
			cpu = target;
	}
	rcu_read_unlock();

out:
	return cpu;
}

static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
{
	/*
	 * Current can't be migrated, useless to reschedule,
	 * let's hope p can move out.
	 */
	if (rq->curr->nr_cpus_allowed == 1 ||
	    cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
		return;

	/*
	 * p is migratable, so let's not schedule it and
	 * see if it is pushed or pulled somewhere else.
	 */
	if (p->nr_cpus_allowed != 1 &&
	    cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
		return;

	resched_task(rq->curr);
}

#endif /* CONFIG_SMP */

/*
 * Only called when both the current and waking task are -deadline
 * tasks.
 */
static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
				  int flags)
{
	if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
		resched_task(rq->curr);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * In the unlikely case current and p have the same deadline
	 * let us try to decide what's the best thing to do...
	 */
	if ((p->dl.deadline == rq->curr->dl.deadline) &&
	    !test_tsk_need_resched(rq->curr))
		check_preempt_equal_dl(rq, p);
#endif /* CONFIG_SMP */
}

#ifdef CONFIG_SCHED_HRTICK
/* Arm the hrtick to fire when p's remaining runtime is consumed. */
static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
{
	s64 delta = p->dl.dl_runtime - p->dl.runtime;

	/* Don't bother arming for less than 10us of already-used budget. */
	if (delta > 10000)
		hrtick_start(rq, p->dl.runtime);
}
#endif

/* Earliest-deadline entity on the dl_rq (leftmost in the rbtree). */
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
						   struct dl_rq *dl_rq)
{
	struct rb_node *left = dl_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_dl_entity, rb_node);
}

struct task_struct *pick_next_task_dl(struct rq *rq)
{
	struct sched_dl_entity *dl_se;
	struct task_struct *p;
	struct dl_rq *dl_rq;

	dl_rq = &rq->dl;

	if (unlikely(!dl_rq->dl_nr_running))
		return NULL;

	dl_se = pick_next_dl_entity(rq, dl_rq);
	BUG_ON(!dl_se);

	p = dl_task_of(dl_se);
	p->se.exec_start = rq_clock_task(rq);

	/* Running task will never be pushed. */
	dequeue_pushable_dl_task(rq, p);

#ifdef CONFIG_SCHED_HRTICK
	if (hrtick_enabled(rq))
		start_hrtick_dl(rq, p);
#endif

#ifdef CONFIG_SMP
	rq->post_schedule = has_pushable_dl_tasks(rq);
#endif /* CONFIG_SMP */

	return p;
}

static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
	update_curr_dl(rq);

	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
		enqueue_pushable_dl_task(rq, p);
}

static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
	update_curr_dl(rq);

#ifdef CONFIG_SCHED_HRTICK
	if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
		start_hrtick_dl(rq, p);
#endif
}

static void task_fork_dl(struct task_struct *p)
{
	/*
	 * SCHED_DEADLINE tasks cannot fork and this is achieved through
	 * sched_fork()
	 */
}

static void task_dead_dl(struct task_struct *p)
{
	struct hrtimer *timer = &p->dl.dl_timer;
	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));

	/*
	 * Since we are TASK_DEAD we won't slip out of the domain!
	 */
	raw_spin_lock_irq(&dl_b->lock);
	dl_b->total_bw -= p->dl.dl_bw;
	raw_spin_unlock_irq(&dl_b->lock);

	hrtimer_cancel(timer);
}

static void set_curr_task_dl(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	p->se.exec_start = rq_clock_task(rq);

	/* You can't push away the running task */
	dequeue_pushable_dl_task(rq, p);
}

#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define DL_MAX_TRIES 3

/* Is p a candidate for migration to @cpu (or anywhere if cpu < 0)? */
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
	if (!task_running(rq, p) &&
	    (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
	    (p->nr_cpus_allowed > 1))
		return 1;

	return 0;
}

/* Returns the second earliest -deadline task, NULL otherwise */
static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
{
	struct rb_node *next_node = rq->dl.rb_leftmost;
	struct sched_dl_entity *dl_se;
	struct task_struct *p = NULL;

next_node:
	next_node = rb_next(next_node);
	if (next_node) {
		dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
		p = dl_task_of(dl_se);

		if (pick_dl_task(rq, p, cpu))
			return p;

		goto next_node;
	}

	return NULL;
}

static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);

static int find_later_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl);
	int this_cpu = smp_processor_id();
	int best_cpu, cpu = task_cpu(task);

	/* Make sure the mask is initialized first */
	if (unlikely(!later_mask))
		return -1;

	if (task->nr_cpus_allowed == 1)
		return -1;

	best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
			task, later_mask);
	if (best_cpu == -1)
		return -1;

	/*
	 * If we are here, some target has been found,
	 * the most suitable of which is cached in best_cpu.
	 * This is, among the runqueues where the current tasks
	 * have later deadlines than the task's one, the rq
	 * with the latest possible one.
	 *
	 * Now we check how well this matches with task's
	 * affinity and system topology.
	 *
	 * The last cpu where the task ran is our first
	 * guess, since it is most likely cache-hot there.
	 */
	if (cpumask_test_cpu(cpu, later_mask))
		return cpu;
	/*
	 * Check if this_cpu is to be skipped (i.e., it is
	 * not in the mask) or not.
	 */
	if (!cpumask_test_cpu(this_cpu, later_mask))
		this_cpu = -1;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_AFFINE) {

			/*
			 * If possible, preempting this_cpu is
			 * cheaper than migrating.
			 */
			if (this_cpu != -1 &&
			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return this_cpu;
			}

			/*
			 * Last chance: if best_cpu is valid and is
			 * in the mask, that becomes our choice.
			 */
			if (best_cpu < nr_cpu_ids &&
			    cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return best_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * At this point, all our guesses failed, we just return
	 * 'something', and let the caller sort the things out.
	 */
	if (this_cpu != -1)
		return this_cpu;

	cpu = cpumask_any(later_mask);
	if (cpu < nr_cpu_ids)
		return cpu;

	return -1;
}

/* Locks the rq it finds */
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *later_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < DL_MAX_TRIES; tries++) {
		cpu = find_later_rq(task);

		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		later_rq = cpu_rq(cpu);

		/* Retry if something changed. */
		if (double_lock_balance(rq, later_rq)) {
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(later_rq->cpu,
						       &task->cpus_allowed) ||
				     task_running(rq, task) || !task->on_rq)) {
				double_unlock_balance(rq, later_rq);
				later_rq = NULL;
				break;
			}
		}

		/*
		 * If the rq we found has no -deadline task, or
		 * its earliest one has a later deadline than our
		 * task, the rq is a good one.
		 */
		if (!later_rq->dl.dl_nr_running ||
		    dl_time_before(task->dl.deadline,
				   later_rq->dl.earliest_dl.curr))
			break;

		/* Otherwise we try again. */
		double_unlock_balance(rq, later_rq);
		later_rq = NULL;
	}

	return later_rq;
}

static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
{
	struct task_struct *p;

	if (!has_pushable_dl_tasks(rq))
		return NULL;

	p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
		     struct task_struct, pushable_dl_tasks);

	BUG_ON(rq->cpu != task_cpu(p));
	BUG_ON(task_current(rq, p));
	BUG_ON(p->nr_cpus_allowed <= 1);

	BUG_ON(!p->on_rq);
	BUG_ON(!dl_task(p));

	return p;
}

/*
 * See if the non running -deadline tasks on this rq
 * can be sent to some other CPU where they can preempt
 * and start executing.
 */
static int push_dl_task(struct rq *rq)
{
	struct task_struct *next_task;
	struct rq *later_rq;

	if (!rq->dl.overloaded)
		return 0;

	next_task = pick_next_pushable_dl_task(rq);
	if (!next_task)
		return 0;

retry:
	if (unlikely(next_task == rq->curr)) {
		WARN_ON(1);
		return 0;
	}

	/*
	 * If next_task preempts rq->curr, and rq->curr
	 * can move away, it makes sense to just reschedule
	 * without going further in pushing next_task.
	 */
	if (dl_task(rq->curr) &&
	    dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
	    rq->curr->nr_cpus_allowed > 1) {
		resched_task(rq->curr);
		return 0;
	}

	/* We might release rq lock */
	get_task_struct(next_task);

	/* Will lock the rq it'll find */
	later_rq = find_lock_later_rq(next_task, rq);
	if (!later_rq) {
		struct task_struct *task;

		/*
		 * We must check all this again, since
		 * find_lock_later_rq releases rq->lock and it is
		 * then possible that next_task has migrated.
		 */
		task = pick_next_pushable_dl_task(rq);
		if (task_cpu(next_task) == rq->cpu && task == next_task) {
			/*
			 * The task is still there. We don't try
			 * again, some other cpu will pull it when ready.
			 */
			dequeue_pushable_dl_task(rq, next_task);
			goto out;
		}

		if (!task)
			/* No more tasks */
			goto out;

		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	deactivate_task(rq, next_task, 0);
	set_task_cpu(next_task, later_rq->cpu);
	activate_task(later_rq, next_task, 0);

	resched_task(later_rq->curr);

	double_unlock_balance(rq, later_rq);

out:
	put_task_struct(next_task);

	return 1;
}

static void push_dl_tasks(struct rq *rq)
{
	/* Terminates as it moves a -deadline task */
	while (push_dl_task(rq))
		;
}

static int pull_dl_task(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu, ret = 0, cpu;
	struct task_struct *p;
	struct rq *src_rq;
	u64 dmin = LONG_MAX;

	if (likely(!dl_overloaded(this_rq)))
		return 0;

	/*
	 * Match the barrier from dl_set_overloaded; this guarantees that if we
	 * see overloaded we must also see the dlo_mask bit.
	 */
	smp_rmb();

	for_each_cpu(cpu, this_rq->rd->dlo_mask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);

		/*
		 * It looks racy, and it is! However, as in sched_rt.c,
		 * we are fine with this.
		 */
		if (this_rq->dl.dl_nr_running &&
		    dl_time_before(this_rq->dl.earliest_dl.curr,
				   src_rq->dl.earliest_dl.next))
			continue;

		/* Might drop this_rq->lock */
		double_lock_balance(this_rq, src_rq);

		/*
		 * If there are no more pullable tasks on the
		 * rq, we're done with it.
		 */
		if (src_rq->dl.dl_nr_running <= 1)
			goto skip;

		p = pick_next_earliest_dl_task(src_rq, this_cpu);

		/*
		 * We found a task to be pulled if:
		 *  - it preempts our current (if there's one),
		 *  - it will preempt the last one we pulled (if any).
		 */
		if (p && dl_time_before(p->dl.deadline, dmin) &&
		    (!this_rq->dl.dl_nr_running ||
		     dl_time_before(p->dl.deadline,
				    this_rq->dl.earliest_dl.curr))) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!p->on_rq);

			/*
			 * Then we pull iff p has actually an earlier
			 * deadline than the current task of its runqueue.
			 */
			if (dl_time_before(p->dl.deadline,
					   src_rq->curr->dl.deadline))
				goto skip;

			ret = 1;

			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_cpu);
			activate_task(this_rq, p, 0);
			dmin = p->dl.deadline;

			/* Is there any other task even earlier? */
		}
skip:
		double_unlock_balance(this_rq, src_rq);
	}

	return ret;
}

static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
{
	/* Try to pull other tasks here */
	if (dl_task(prev))
		pull_dl_task(rq);
}

static void post_schedule_dl(struct rq *rq)
{
	push_dl_tasks(rq);
}

/*
 * Since the task is not running and a reschedule is not going to happen
 * anytime soon on its runqueue, we try pushing it away now.
 */
static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
	if (!task_running(rq, p) &&
	    !test_tsk_need_resched(rq->curr) &&
	    has_pushable_dl_tasks(rq) &&
	    p->nr_cpus_allowed > 1 &&
	    dl_task(rq->curr) &&
	    (rq->curr->nr_cpus_allowed < 2 ||
	     dl_entity_preempt(&rq->curr->dl, &p->dl))) {
		push_dl_tasks(rq);
	}
}

static void set_cpus_allowed_dl(struct task_struct *p,
				const struct cpumask *new_mask)
{
	struct rq *rq;
	int weight;

	BUG_ON(!dl_task(p));

	/*
	 * Update only if the task is actually running (i.e.,
	 * it is on the rq AND it is not throttled).
	 */
	if (!on_dl_rq(&p->dl))
		return;

	weight = cpumask_weight(new_mask);

	/*
	 * Only update if the process changes its state from whether it
	 * can migrate or not.
	 */
	if ((p->nr_cpus_allowed > 1) == (weight > 1))
		return;

	rq = task_rq(p);

	/*
	 * The process used to be able to migrate OR it can now migrate
	 */
	if (weight <= 1) {
		if (!task_current(rq, p))
			dequeue_pushable_dl_task(rq, p);
		BUG_ON(!rq->dl.dl_nr_migratory);
		rq->dl.dl_nr_migratory--;
	} else {
		if (!task_current(rq, p))
			enqueue_pushable_dl_task(rq, p);
		rq->dl.dl_nr_migratory++;
	}

	update_dl_migration(&rq->dl);
}

/* Assumes rq->lock is held */
static void rq_online_dl(struct rq *rq)
{
	if (rq->dl.overloaded)
		dl_set_overload(rq);

	if (rq->dl.dl_nr_running > 0)
		cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
}

/* Assumes rq->lock is held */
static void rq_offline_dl(struct rq *rq)
{
	if (rq->dl.overloaded)
		dl_clear_overload(rq);

	cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
}

void init_sched_dl_class(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
					GFP_KERNEL, cpu_to_node(i));
}

#endif /* CONFIG_SMP */

static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
	if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
		hrtimer_try_to_cancel(&p->dl.dl_timer);

#ifdef CONFIG_SMP
	/*
	 * Since this might be the only -deadline task on the rq,
	 * this is the right place to try to pull some other one
	 * from an overloaded cpu, if any.
	 */
	if (!rq->dl.dl_nr_running)
		pull_dl_task(rq);
#endif
}

/*
 * When switching to -deadline, we may overload the rq, then
 * we try to push someone off, if possible.
 */
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
	int check_resched = 1;

	/*
	 * If p is throttled, don't consider the possibility
	 * of preempting rq->curr, the check will be done right
	 * after its runtime will get replenished.
	 */
	if (unlikely(p->dl.dl_throttled))
		return;

	if (p->on_rq || rq->curr != p) {
#ifdef CONFIG_SMP
		if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
			/* Only reschedule if pushing failed */
			check_resched = 0;
#endif /* CONFIG_SMP */
		if (check_resched && task_has_dl_policy(rq->curr))
			check_preempt_curr_dl(rq, p, 0);
	}
}

/*
 * If the scheduling parameters of a -deadline task changed,
 * a push or pull operation might be needed.
 */
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
			    int oldprio)
{
	if (p->on_rq || rq->curr == p) {
#ifdef CONFIG_SMP
		/*
		 * This might be too much, but unfortunately
		 * we don't have the old deadline value, and
		 * we can't argue if the task is increasing
		 * or lowering its prio, so...
		 */
		if (!rq->dl.overloaded)
			pull_dl_task(rq);

		/*
		 * If we now have an earlier deadline task than p,
		 * then reschedule, provided p is still on this
		 * runqueue.
		 */
		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
		    rq->curr == p)
			resched_task(p);
#else
		/*
		 * Again, we don't know if p has an earlier
		 * or later deadline, so let's blindly set a
		 * (maybe not needed) rescheduling point.
		 */
		resched_task(p);
#endif /* CONFIG_SMP */
	} else
		switched_to_dl(rq, p);
}

const struct sched_class dl_sched_class = {
	.next			= &rt_sched_class,
	.enqueue_task		= enqueue_task_dl,
	.dequeue_task		= dequeue_task_dl,
	.yield_task		= yield_task_dl,

	.check_preempt_curr	= check_preempt_curr_dl,

	.pick_next_task		= pick_next_task_dl,
	.put_prev_task		= put_prev_task_dl,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_dl,
	.set_cpus_allowed	= set_cpus_allowed_dl,
	.rq_online		= rq_online_dl,
	.rq_offline		= rq_offline_dl,
	.pre_schedule		= pre_schedule_dl,
	.post_schedule		= post_schedule_dl,
	.task_woken		= task_woken_dl,
#endif

	.set_curr_task		= set_curr_task_dl,
	.task_tick		= task_tick_dl,
	.task_fork		= task_fork_dl,
	.task_dead		= task_dead_dl,

	.prio_changed		= prio_changed_dl,
	.switched_from		= switched_from_dl,
	.switched_to		= switched_to_dl,
};
+2 -2
kernel/sched/debug.c
··· 139 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 140 140 #endif 141 141 #ifdef CONFIG_NUMA_BALANCING 142 - SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); 142 + SEQ_printf(m, " %d", task_node(p)); 143 143 #endif 144 144 #ifdef CONFIG_CGROUP_SCHED 145 145 SEQ_printf(m, " %s", task_group_path(task_group(p))); ··· 371 371 PN(cpu_clk); 372 372 P(jiffies); 373 373 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 374 - P(sched_clock_stable); 374 + P(sched_clock_stable()); 375 375 #endif 376 376 #undef PN 377 377 #undef P
+36 -47
kernel/sched/fair.c
··· 872 872 return max(smin, smax); 873 873 } 874 874 875 - /* 876 - * Once a preferred node is selected the scheduler balancer will prefer moving 877 - * a task to that node for sysctl_numa_balancing_settle_count number of PTE 878 - * scans. This will give the process the chance to accumulate more faults on 879 - * the preferred node but still allow the scheduler to move the task again if 880 - * the nodes CPUs are overloaded. 881 - */ 882 - unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; 883 - 884 875 static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 885 876 { 886 877 rq->nr_numa_running += (p->numa_preferred_nid != -1); ··· 921 930 if (!p->numa_group) 922 931 return 0; 923 932 924 - return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; 933 + return p->numa_group->faults[task_faults_idx(nid, 0)] + 934 + p->numa_group->faults[task_faults_idx(nid, 1)]; 925 935 } 926 936 927 937 /* ··· 1015 1023 1016 1024 struct numa_stats src_stats, dst_stats; 1017 1025 1018 - int imbalance_pct, idx; 1026 + int imbalance_pct; 1019 1027 1020 1028 struct task_struct *best_task; 1021 1029 long best_imp; ··· 1203 1211 * elsewhere, so there is no point in (re)trying. 
1204 1212 */ 1205 1213 if (unlikely(!sd)) { 1206 - p->numa_preferred_nid = cpu_to_node(task_cpu(p)); 1214 + p->numa_preferred_nid = task_node(p); 1207 1215 return -EINVAL; 1208 1216 } 1209 1217 ··· 1270 1278 p->numa_migrate_retry = jiffies + HZ; 1271 1279 1272 1280 /* Success if task is already running on preferred CPU */ 1273 - if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) 1281 + if (task_node(p) == p->numa_preferred_nid) 1274 1282 return; 1275 1283 1276 1284 /* Otherwise, try migrate to a CPU on the preferred node */ ··· 1342 1350 * scanning faster if shared accesses dominate as it may 1343 1351 * simply bounce migrations uselessly 1344 1352 */ 1345 - period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); 1346 1353 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1347 1354 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1348 1355 } ··· 4092 4101 */ 4093 4102 static struct sched_group * 4094 4103 find_idlest_group(struct sched_domain *sd, struct task_struct *p, 4095 - int this_cpu, int load_idx) 4104 + int this_cpu, int sd_flag) 4096 4105 { 4097 4106 struct sched_group *idlest = NULL, *group = sd->groups; 4098 4107 unsigned long min_load = ULONG_MAX, this_load = 0; 4108 + int load_idx = sd->forkexec_idx; 4099 4109 int imbalance = 100 + (sd->imbalance_pct-100)/2; 4110 + 4111 + if (sd_flag & SD_BALANCE_WAKE) 4112 + load_idx = sd->wake_idx; 4100 4113 4101 4114 do { 4102 4115 unsigned long load, avg_load; ··· 4269 4274 } 4270 4275 4271 4276 while (sd) { 4272 - int load_idx = sd->forkexec_idx; 4273 4277 struct sched_group *group; 4274 4278 int weight; 4275 4279 ··· 4277 4283 continue; 4278 4284 } 4279 4285 4280 - if (sd_flag & SD_BALANCE_WAKE) 4281 - load_idx = sd->wake_idx; 4282 - 4283 - group = find_idlest_group(sd, p, cpu, load_idx); 4286 + group = find_idlest_group(sd, p, cpu, sd_flag); 4284 4287 if (!group) { 4285 4288 sd = sd->child; 4286 4289 continue; ··· 5503 5512 struct sched_group *group, int load_idx, 5504 5513 int local_group, 
struct sg_lb_stats *sgs) 5505 5514 { 5506 - unsigned long nr_running; 5507 5515 unsigned long load; 5508 5516 int i; 5509 5517 ··· 5511 5521 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5512 5522 struct rq *rq = cpu_rq(i); 5513 5523 5514 - nr_running = rq->nr_running; 5515 - 5516 5524 /* Bias balancing toward cpus of our domain */ 5517 5525 if (local_group) 5518 5526 load = target_load(i, load_idx); ··· 5518 5530 load = source_load(i, load_idx); 5519 5531 5520 5532 sgs->group_load += load; 5521 - sgs->sum_nr_running += nr_running; 5533 + sgs->sum_nr_running += rq->nr_running; 5522 5534 #ifdef CONFIG_NUMA_BALANCING 5523 5535 sgs->nr_numa_running += rq->nr_numa_running; 5524 5536 sgs->nr_preferred_running += rq->nr_preferred_running; ··· 6509 6521 unsigned long next_balance; /* in jiffy units */ 6510 6522 } nohz ____cacheline_aligned; 6511 6523 6512 - static inline int find_new_ilb(int call_cpu) 6524 + static inline int find_new_ilb(void) 6513 6525 { 6514 6526 int ilb = cpumask_first(nohz.idle_cpus_mask); 6515 6527 ··· 6524 6536 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle 6525 6537 * CPU (if there is one). 6526 6538 */ 6527 - static void nohz_balancer_kick(int cpu) 6539 + static void nohz_balancer_kick(void) 6528 6540 { 6529 6541 int ilb_cpu; 6530 6542 6531 6543 nohz.next_balance++; 6532 6544 6533 - ilb_cpu = find_new_ilb(cpu); 6545 + ilb_cpu = find_new_ilb(); 6534 6546 6535 6547 if (ilb_cpu >= nr_cpu_ids) 6536 6548 return; ··· 6640 6652 * 6641 6653 * Balancing parameters are set up in init_sched_domains. 
6642 6654 */ 6643 - static void rebalance_domains(int cpu, enum cpu_idle_type idle) 6655 + static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) 6644 6656 { 6645 6657 int continue_balancing = 1; 6646 - struct rq *rq = cpu_rq(cpu); 6658 + int cpu = rq->cpu; 6647 6659 unsigned long interval; 6648 6660 struct sched_domain *sd; 6649 6661 /* Earliest time when we have to do rebalance again */ ··· 6740 6752 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 6741 6753 * rebalancing for all the cpus for whom scheduler ticks are stopped. 6742 6754 */ 6743 - static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) 6755 + static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) 6744 6756 { 6745 - struct rq *this_rq = cpu_rq(this_cpu); 6757 + int this_cpu = this_rq->cpu; 6746 6758 struct rq *rq; 6747 6759 int balance_cpu; 6748 6760 ··· 6769 6781 update_idle_cpu_load(rq); 6770 6782 raw_spin_unlock_irq(&rq->lock); 6771 6783 6772 - rebalance_domains(balance_cpu, CPU_IDLE); 6784 + rebalance_domains(rq, CPU_IDLE); 6773 6785 6774 6786 if (time_after(this_rq->next_balance, rq->next_balance)) 6775 6787 this_rq->next_balance = rq->next_balance; ··· 6788 6800 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 6789 6801 * domain span are idle. 
6790 6802 */ 6791 - static inline int nohz_kick_needed(struct rq *rq, int cpu) 6803 + static inline int nohz_kick_needed(struct rq *rq) 6792 6804 { 6793 6805 unsigned long now = jiffies; 6794 6806 struct sched_domain *sd; 6795 6807 struct sched_group_power *sgp; 6796 - int nr_busy; 6808 + int nr_busy, cpu = rq->cpu; 6797 6809 6798 - if (unlikely(idle_cpu(cpu))) 6810 + if (unlikely(rq->idle_balance)) 6799 6811 return 0; 6800 6812 6801 6813 /* ··· 6844 6856 return 1; 6845 6857 } 6846 6858 #else 6847 - static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 6859 + static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } 6848 6860 #endif 6849 6861 6850 6862 /* ··· 6853 6865 */ 6854 6866 static void run_rebalance_domains(struct softirq_action *h) 6855 6867 { 6856 - int this_cpu = smp_processor_id(); 6857 - struct rq *this_rq = cpu_rq(this_cpu); 6868 + struct rq *this_rq = this_rq(); 6858 6869 enum cpu_idle_type idle = this_rq->idle_balance ? 6859 6870 CPU_IDLE : CPU_NOT_IDLE; 6860 6871 6861 - rebalance_domains(this_cpu, idle); 6872 + rebalance_domains(this_rq, idle); 6862 6873 6863 6874 /* 6864 6875 * If this cpu has a pending nohz_balance_kick, then do the 6865 6876 * balancing on behalf of the other idle cpus whose ticks are 6866 6877 * stopped. 6867 6878 */ 6868 - nohz_idle_balance(this_cpu, idle); 6879 + nohz_idle_balance(this_rq, idle); 6869 6880 } 6870 6881 6871 - static inline int on_null_domain(int cpu) 6882 + static inline int on_null_domain(struct rq *rq) 6872 6883 { 6873 - return !rcu_dereference_sched(cpu_rq(cpu)->sd); 6884 + return !rcu_dereference_sched(rq->sd); 6874 6885 } 6875 6886 6876 6887 /* 6877 6888 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 
6878 6889 */ 6879 - void trigger_load_balance(struct rq *rq, int cpu) 6890 + void trigger_load_balance(struct rq *rq) 6880 6891 { 6881 6892 /* Don't need to rebalance while attached to NULL domain */ 6882 - if (time_after_eq(jiffies, rq->next_balance) && 6883 - likely(!on_null_domain(cpu))) 6893 + if (unlikely(on_null_domain(rq))) 6894 + return; 6895 + 6896 + if (time_after_eq(jiffies, rq->next_balance)) 6884 6897 raise_softirq(SCHED_SOFTIRQ); 6885 6898 #ifdef CONFIG_NO_HZ_COMMON 6886 - if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 6887 - nohz_balancer_kick(cpu); 6899 + if (nohz_kick_needed(rq)) 6900 + nohz_balancer_kick(); 6888 6901 #endif 6889 6902 } 6890 6903
+1 -1
kernel/sched/rt.c
··· 1738 1738 !test_tsk_need_resched(rq->curr) && 1739 1739 has_pushable_tasks(rq) && 1740 1740 p->nr_cpus_allowed > 1 && 1741 - rt_task(rq->curr) && 1741 + (dl_task(rq->curr) || rt_task(rq->curr)) && 1742 1742 (rq->curr->nr_cpus_allowed < 2 || 1743 1743 rq->curr->prio <= p->prio)) 1744 1744 push_rt_tasks(rq);
+140 -6
kernel/sched/sched.h
··· 2 2 #include <linux/sched.h> 3 3 #include <linux/sched/sysctl.h> 4 4 #include <linux/sched/rt.h> 5 + #include <linux/sched/deadline.h> 5 6 #include <linux/mutex.h> 6 7 #include <linux/spinlock.h> 7 8 #include <linux/stop_machine.h> ··· 10 9 #include <linux/slab.h> 11 10 12 11 #include "cpupri.h" 12 + #include "cpudeadline.h" 13 13 #include "cpuacct.h" 14 14 15 15 struct rq; ··· 75 73 #define NICE_0_SHIFT SCHED_LOAD_SHIFT 76 74 77 75 /* 76 + * Single value that decides SCHED_DEADLINE internal math precision. 77 + * 10 -> just above 1us 78 + * 9 -> just above 0.5us 79 + */ 80 + #define DL_SCALE (10) 81 + 82 + /* 78 83 * These are the 'tuning knobs' of the scheduler: 79 84 */ 80 85 ··· 90 81 */ 91 82 #define RUNTIME_INF ((u64)~0ULL) 92 83 84 + static inline int fair_policy(int policy) 85 + { 86 + return policy == SCHED_NORMAL || policy == SCHED_BATCH; 87 + } 88 + 93 89 static inline int rt_policy(int policy) 94 90 { 95 - if (policy == SCHED_FIFO || policy == SCHED_RR) 96 - return 1; 97 - return 0; 91 + return policy == SCHED_FIFO || policy == SCHED_RR; 92 + } 93 + 94 + static inline int dl_policy(int policy) 95 + { 96 + return policy == SCHED_DEADLINE; 98 97 } 99 98 100 99 static inline int task_has_rt_policy(struct task_struct *p) 101 100 { 102 101 return rt_policy(p->policy); 102 + } 103 + 104 + static inline int task_has_dl_policy(struct task_struct *p) 105 + { 106 + return dl_policy(p->policy); 107 + } 108 + 109 + static inline bool dl_time_before(u64 a, u64 b) 110 + { 111 + return (s64)(a - b) < 0; 112 + } 113 + 114 + /* 115 + * Tells if entity @a should preempt entity @b. 
116 + */ 117 + static inline bool 118 + dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) 119 + { 120 + return dl_time_before(a->deadline, b->deadline); 103 121 } 104 122 105 123 /* ··· 143 107 ktime_t rt_period; 144 108 u64 rt_runtime; 145 109 struct hrtimer rt_period_timer; 110 + }; 111 + /* 112 + * To keep the bandwidth of -deadline tasks and groups under control 113 + * we need some place where: 114 + * - store the maximum -deadline bandwidth of the system (the group); 115 + * - cache the fraction of that bandwidth that is currently allocated. 116 + * 117 + * This is all done in the data structure below. It is similar to the 118 + * one used for RT-throttling (rt_bandwidth), with the main difference 119 + * that, since here we are only interested in admission control, we 120 + * do not decrease any runtime while the group "executes", neither we 121 + * need a timer to replenish it. 122 + * 123 + * With respect to SMP, the bandwidth is given on a per-CPU basis, 124 + * meaning that: 125 + * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; 126 + * - dl_total_bw array contains, in the i-eth element, the currently 127 + * allocated bandwidth on the i-eth CPU. 128 + * Moreover, groups consume bandwidth on each CPU, while tasks only 129 + * consume bandwidth on the CPU they're running on. 130 + * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw 131 + * that will be shown the next time the proc or cgroup controls will 132 + * be red. It on its turn can be changed by writing on its own 133 + * control. 
134 + */ 135 + struct dl_bandwidth { 136 + raw_spinlock_t dl_runtime_lock; 137 + u64 dl_runtime; 138 + u64 dl_period; 139 + }; 140 + 141 + static inline int dl_bandwidth_enabled(void) 142 + { 143 + return sysctl_sched_rt_runtime >= 0; 144 + } 145 + 146 + extern struct dl_bw *dl_bw_of(int i); 147 + 148 + struct dl_bw { 149 + raw_spinlock_t lock; 150 + u64 bw, total_bw; 146 151 }; 147 152 148 153 extern struct mutex sched_domains_mutex; ··· 441 364 #endif 442 365 }; 443 366 367 + /* Deadline class' related fields in a runqueue */ 368 + struct dl_rq { 369 + /* runqueue is an rbtree, ordered by deadline */ 370 + struct rb_root rb_root; 371 + struct rb_node *rb_leftmost; 372 + 373 + unsigned long dl_nr_running; 374 + 375 + #ifdef CONFIG_SMP 376 + /* 377 + * Deadline values of the currently executing and the 378 + * earliest ready task on this rq. Caching these facilitates 379 + * the decision wether or not a ready but not running task 380 + * should migrate somewhere else. 381 + */ 382 + struct { 383 + u64 curr; 384 + u64 next; 385 + } earliest_dl; 386 + 387 + unsigned long dl_nr_migratory; 388 + unsigned long dl_nr_total; 389 + int overloaded; 390 + 391 + /* 392 + * Tasks on this rq that can be pushed away. They are kept in 393 + * an rb-tree, ordered by tasks' deadlines, with caching 394 + * of the leftmost (earliest deadline) element. 395 + */ 396 + struct rb_root pushable_dl_tasks_root; 397 + struct rb_node *pushable_dl_tasks_leftmost; 398 + #else 399 + struct dl_bw dl_bw; 400 + #endif 401 + }; 402 + 444 403 #ifdef CONFIG_SMP 445 404 446 405 /* ··· 493 380 struct rcu_head rcu; 494 381 cpumask_var_t span; 495 382 cpumask_var_t online; 383 + 384 + /* 385 + * The bit corresponding to a CPU gets set here if such CPU has more 386 + * than one runnable -deadline task (as it is below for RT tasks). 
387 + */ 388 + cpumask_var_t dlo_mask; 389 + atomic_t dlo_count; 390 + struct dl_bw dl_bw; 391 + struct cpudl cpudl; 496 392 497 393 /* 498 394 * The "RT overload" flag: it gets set if a CPU has more than ··· 554 432 555 433 struct cfs_rq cfs; 556 434 struct rt_rq rt; 435 + struct dl_rq dl; 557 436 558 437 #ifdef CONFIG_FAIR_GROUP_SCHED 559 438 /* list of leaf cfs_rq on this cpu: */ ··· 950 827 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 951 828 } 952 829 953 - 954 - 955 830 static inline int task_current(struct rq *rq, struct task_struct *p) 956 831 { 957 832 return rq->curr == p; ··· 1109 988 #else 1110 989 #define ENQUEUE_WAKING 0 1111 990 #endif 991 + #define ENQUEUE_REPLENISH 8 1112 992 1113 993 #define DEQUEUE_SLEEP 1 1114 994 ··· 1145 1023 void (*set_curr_task) (struct rq *rq); 1146 1024 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1147 1025 void (*task_fork) (struct task_struct *p); 1026 + void (*task_dead) (struct task_struct *p); 1148 1027 1149 1028 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1150 1029 void (*switched_to) (struct rq *this_rq, struct task_struct *task); ··· 1165 1042 for (class = sched_class_highest; class; class = class->next) 1166 1043 1167 1044 extern const struct sched_class stop_sched_class; 1045 + extern const struct sched_class dl_sched_class; 1168 1046 extern const struct sched_class rt_sched_class; 1169 1047 extern const struct sched_class fair_sched_class; 1170 1048 extern const struct sched_class idle_sched_class; ··· 1175 1051 1176 1052 extern void update_group_power(struct sched_domain *sd, int cpu); 1177 1053 1178 - extern void trigger_load_balance(struct rq *rq, int cpu); 1054 + extern void trigger_load_balance(struct rq *rq); 1179 1055 extern void idle_balance(int this_cpu, struct rq *this_rq); 1180 1056 1181 1057 extern void idle_enter_fair(struct rq *this_rq); ··· 1192 1068 extern void sysrq_sched_debug_show(void); 1193 1069 extern void 
sched_init_granularity(void); 1194 1070 extern void update_max_interval(void); 1071 + 1072 + extern void init_sched_dl_class(void); 1195 1073 extern void init_sched_rt_class(void); 1196 1074 extern void init_sched_fair_class(void); 1075 + extern void init_sched_dl_class(void); 1197 1076 1198 1077 extern void resched_task(struct task_struct *p); 1199 1078 extern void resched_cpu(int cpu); 1200 1079 1201 1080 extern struct rt_bandwidth def_rt_bandwidth; 1202 1081 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 1082 + 1083 + extern struct dl_bandwidth def_dl_bandwidth; 1084 + extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); 1085 + extern void init_dl_task_timer(struct sched_dl_entity *dl_se); 1086 + 1087 + unsigned long to_ratio(u64 period, u64 runtime); 1203 1088 1204 1089 extern void update_idle_cpu_load(struct rq *this_rq); 1205 1090 ··· 1486 1353 1487 1354 extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1488 1355 extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1356 + extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); 1489 1357 1490 1358 extern void cfs_bandwidth_usage_inc(void); 1491 1359 extern void cfs_bandwidth_usage_dec(void);
+1 -1
kernel/sched/stop_task.c
··· 103 103 * Simple, special scheduling class for the per-CPU stop tasks: 104 104 */ 105 105 const struct sched_class stop_sched_class = { 106 - .next = &rt_sched_class, 106 + .next = &dl_sched_class, 107 107 108 108 .enqueue_task = enqueue_task_stop, 109 109 .dequeue_task = dequeue_task_stop,
+8 -31
kernel/softirq.c
··· 89 89 * where hardirqs are disabled legitimately: 90 90 */ 91 91 #ifdef CONFIG_TRACE_IRQFLAGS 92 - static void __local_bh_disable(unsigned long ip, unsigned int cnt) 92 + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) 93 93 { 94 94 unsigned long flags; 95 95 ··· 107 107 /* 108 108 * Were softirqs turned off above: 109 109 */ 110 - if (softirq_count() == cnt) 110 + if (softirq_count() == (cnt & SOFTIRQ_MASK)) 111 111 trace_softirqs_off(ip); 112 112 raw_local_irq_restore(flags); 113 113 114 114 if (preempt_count() == cnt) 115 115 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 116 116 } 117 - #else /* !CONFIG_TRACE_IRQFLAGS */ 118 - static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 119 - { 120 - preempt_count_add(cnt); 121 - barrier(); 122 - } 117 + EXPORT_SYMBOL(__local_bh_disable_ip); 123 118 #endif /* CONFIG_TRACE_IRQFLAGS */ 124 - 125 - void local_bh_disable(void) 126 - { 127 - __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); 128 - } 129 - 130 - EXPORT_SYMBOL(local_bh_disable); 131 119 132 120 static void __local_bh_enable(unsigned int cnt) 133 121 { 134 122 WARN_ON_ONCE(!irqs_disabled()); 135 123 136 - if (softirq_count() == cnt) 124 + if (softirq_count() == (cnt & SOFTIRQ_MASK)) 137 125 trace_softirqs_on(_RET_IP_); 138 126 preempt_count_sub(cnt); 139 127 } ··· 139 151 140 152 EXPORT_SYMBOL(_local_bh_enable); 141 153 142 - static inline void _local_bh_enable_ip(unsigned long ip) 154 + void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) 143 155 { 144 156 WARN_ON_ONCE(in_irq() || irqs_disabled()); 145 157 #ifdef CONFIG_TRACE_IRQFLAGS ··· 154 166 * Keep preemption disabled until we are done with 155 167 * softirq processing: 156 168 */ 157 - preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); 169 + preempt_count_sub(cnt - 1); 158 170 159 171 if (unlikely(!in_interrupt() && local_softirq_pending())) { 160 172 /* ··· 170 182 #endif 171 183 preempt_check_resched(); 172 184 } 173 - 174 - void 
local_bh_enable(void) 175 - { 176 - _local_bh_enable_ip(_RET_IP_); 177 - } 178 - EXPORT_SYMBOL(local_bh_enable); 179 - 180 - void local_bh_enable_ip(unsigned long ip) 181 - { 182 - _local_bh_enable_ip(ip); 183 - } 184 - EXPORT_SYMBOL(local_bh_enable_ip); 185 + EXPORT_SYMBOL(__local_bh_enable_ip); 185 186 186 187 /* 187 188 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, ··· 241 264 pending = local_softirq_pending(); 242 265 account_irq_enter_time(current); 243 266 244 - __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); 267 + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); 245 268 in_hardirq = lockdep_softirq_start(); 246 269 247 270 cpu = smp_processor_id();
-7
kernel/sysctl.c
··· 385 385 .proc_handler = proc_dointvec, 386 386 }, 387 387 { 388 - .procname = "numa_balancing_settle_count", 389 - .data = &sysctl_numa_balancing_settle_count, 390 - .maxlen = sizeof(unsigned int), 391 - .mode = 0644, 392 - .proc_handler = proc_dointvec, 393 - }, 394 - { 395 388 .procname = "numa_balancing_migrate_deferred", 396 389 .data = &sysctl_numa_balancing_migrate_deferred, 397 390 .maxlen = sizeof(unsigned int),
+1 -1
kernel/time/tick-sched.c
··· 177 177 * TODO: kick full dynticks CPUs when 178 178 * sched_clock_stable is set. 179 179 */ 180 - if (!sched_clock_stable) { 180 + if (!sched_clock_stable()) { 181 181 trace_tick_stop(0, "unstable sched clock\n"); 182 182 /* 183 183 * Don't allow the user to think they can get
+1 -1
kernel/trace/ring_buffer.c
··· 2558 2558 if (unlikely(test_time_stamp(delta))) { 2559 2559 int local_clock_stable = 1; 2560 2560 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 2561 - local_clock_stable = sched_clock_stable; 2561 + local_clock_stable = sched_clock_stable(); 2562 2562 #endif 2563 2563 WARN_ONCE(delta > (1ULL << 59), 2564 2564 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
+61 -4
kernel/trace/trace_sched_wakeup.c
··· 16 16 #include <linux/uaccess.h> 17 17 #include <linux/ftrace.h> 18 18 #include <linux/sched/rt.h> 19 + #include <linux/sched/deadline.h> 19 20 #include <trace/events/sched.h> 20 21 #include "trace.h" 21 22 ··· 28 27 static int wakeup_current_cpu; 29 28 static unsigned wakeup_prio = -1; 30 29 static int wakeup_rt; 30 + static int wakeup_dl; 31 + static int tracing_dl = 0; 31 32 32 33 static arch_spinlock_t wakeup_lock = 33 34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; ··· 440 437 { 441 438 wakeup_cpu = -1; 442 439 wakeup_prio = -1; 440 + tracing_dl = 0; 443 441 444 442 if (wakeup_task) 445 443 put_task_struct(wakeup_task); ··· 476 472 tracing_record_cmdline(p); 477 473 tracing_record_cmdline(current); 478 474 479 - if ((wakeup_rt && !rt_task(p)) || 480 - p->prio >= wakeup_prio || 481 - p->prio >= current->prio) 475 + /* 476 + * Semantic is like this: 477 + * - wakeup tracer handles all tasks in the system, independently 478 + * from their scheduling class; 479 + * - wakeup_rt tracer handles tasks belonging to sched_dl and 480 + * sched_rt class; 481 + * - wakeup_dl handles tasks belonging to sched_dl class only. 482 + */ 483 + if (tracing_dl || (wakeup_dl && !dl_task(p)) || 484 + (wakeup_rt && !dl_task(p) && !rt_task(p)) || 485 + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) 482 486 return; 483 487 484 488 pc = preempt_count(); ··· 498 486 arch_spin_lock(&wakeup_lock); 499 487 500 488 /* check for races. */ 501 - if (!tracer_enabled || p->prio >= wakeup_prio) 489 + if (!tracer_enabled || tracing_dl || 490 + (!dl_task(p) && p->prio >= wakeup_prio)) 502 491 goto out_locked; 503 492 504 493 /* reset the trace */ ··· 508 495 wakeup_cpu = task_cpu(p); 509 496 wakeup_current_cpu = wakeup_cpu; 510 497 wakeup_prio = p->prio; 498 + 499 + /* 500 + * Once you start tracing a -deadline task, don't bother tracing 501 + * another task until the first one wakes up. 
502 + */ 503 + if (dl_task(p)) 504 + tracing_dl = 1; 505 + else 506 + tracing_dl = 0; 511 507 512 508 wakeup_task = p; 513 509 get_task_struct(wakeup_task); ··· 619 597 620 598 static int wakeup_tracer_init(struct trace_array *tr) 621 599 { 600 + wakeup_dl = 0; 622 601 wakeup_rt = 0; 623 602 return __wakeup_tracer_init(tr); 624 603 } 625 604 626 605 static int wakeup_rt_tracer_init(struct trace_array *tr) 627 606 { 607 + wakeup_dl = 0; 628 608 wakeup_rt = 1; 609 + return __wakeup_tracer_init(tr); 610 + } 611 + 612 + static int wakeup_dl_tracer_init(struct trace_array *tr) 613 + { 614 + wakeup_dl = 1; 615 + wakeup_rt = 0; 629 616 return __wakeup_tracer_init(tr); 630 617 } 631 618 ··· 705 674 .use_max_tr = true, 706 675 }; 707 676 677 + static struct tracer wakeup_dl_tracer __read_mostly = 678 + { 679 + .name = "wakeup_dl", 680 + .init = wakeup_dl_tracer_init, 681 + .reset = wakeup_tracer_reset, 682 + .start = wakeup_tracer_start, 683 + .stop = wakeup_tracer_stop, 684 + .wait_pipe = poll_wait_pipe, 685 + .print_max = true, 686 + .print_header = wakeup_print_header, 687 + .print_line = wakeup_print_line, 688 + .flags = &tracer_flags, 689 + .set_flag = wakeup_set_flag, 690 + .flag_changed = wakeup_flag_changed, 691 + #ifdef CONFIG_FTRACE_SELFTEST 692 + .selftest = trace_selftest_startup_wakeup, 693 + #endif 694 + .open = wakeup_trace_open, 695 + .close = wakeup_trace_close, 696 + .use_max_tr = true, 697 + }; 698 + 708 699 __init static int init_wakeup_tracer(void) 709 700 { 710 701 int ret; ··· 736 683 return ret; 737 684 738 685 ret = register_tracer(&wakeup_rt_tracer); 686 + if (ret) 687 + return ret; 688 + 689 + ret = register_tracer(&wakeup_dl_tracer); 739 690 if (ret) 740 691 return ret; 741 692
+19 -14
kernel/trace/trace_selftest.c
··· 1022 1022 #ifdef CONFIG_SCHED_TRACER 1023 1023 static int trace_wakeup_test_thread(void *data) 1024 1024 { 1025 - /* Make this a RT thread, doesn't need to be too high */ 1026 - static const struct sched_param param = { .sched_priority = 5 }; 1025 + /* Make this a -deadline thread */ 1026 + static const struct sched_attr attr = { 1027 + .sched_policy = SCHED_DEADLINE, 1028 + .sched_runtime = 100000ULL, 1029 + .sched_deadline = 10000000ULL, 1030 + .sched_period = 10000000ULL 1031 + }; 1027 1032 struct completion *x = data; 1028 1033 1029 - sched_setscheduler(current, SCHED_FIFO, &param); 1034 + sched_setattr(current, &attr); 1030 1035 1031 1036 /* Make it know we have a new prio */ 1032 1037 complete(x); ··· 1045 1040 /* we are awake, now wait to disappear */ 1046 1041 while (!kthread_should_stop()) { 1047 1042 /* 1048 - * This is an RT task, do short sleeps to let 1049 - * others run. 1043 + * This will likely be the system top priority 1044 + * task, do short sleeps to let others run. 
1050 1045 */ 1051 1046 msleep(100); 1052 1047 } ··· 1059 1054 { 1060 1055 unsigned long save_max = tracing_max_latency; 1061 1056 struct task_struct *p; 1062 - struct completion isrt; 1057 + struct completion is_ready; 1063 1058 unsigned long count; 1064 1059 int ret; 1065 1060 1066 - init_completion(&isrt); 1061 + init_completion(&is_ready); 1067 1062 1068 - /* create a high prio thread */ 1069 - p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); 1063 + /* create a -deadline thread */ 1064 + p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); 1070 1065 if (IS_ERR(p)) { 1071 1066 printk(KERN_CONT "Failed to create ftrace wakeup test thread "); 1072 1067 return -1; 1073 1068 } 1074 1069 1075 - /* make sure the thread is running at an RT prio */ 1076 - wait_for_completion(&isrt); 1070 + /* make sure the thread is running at -deadline policy */ 1071 + wait_for_completion(&is_ready); 1077 1072 1078 1073 /* start the tracing */ 1079 1074 ret = tracer_init(trace, tr); ··· 1087 1082 1088 1083 while (p->on_rq) { 1089 1084 /* 1090 - * Sleep to make sure the RT thread is asleep too. 1085 + * Sleep to make sure the -deadline thread is asleep too. 1091 1086 * On virtual machines we can't rely on timings, 1092 1087 * but we want to make sure this test still works. 1093 1088 */ 1094 1089 msleep(100); 1095 1090 } 1096 1091 1097 - init_completion(&isrt); 1092 + init_completion(&is_ready); 1098 1093 1099 1094 wake_up_process(p); 1100 1095 1101 1096 /* Wait for the task to wake up */ 1102 - wait_for_completion(&isrt); 1097 + wait_for_completion(&is_ready); 1103 1098 1104 1099 /* stop the tracing. */ 1105 1100 tracing_stop();
+2 -2
net/ipv4/tcp.c
··· 1623 1623 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 1624 1624 !sysctl_tcp_low_latency && 1625 1625 net_dma_find_channel()) { 1626 - preempt_enable_no_resched(); 1626 + preempt_enable(); 1627 1627 tp->ucopy.pinned_list = 1628 1628 dma_pin_iovec_pages(msg->msg_iov, len); 1629 1629 } else { 1630 - preempt_enable_no_resched(); 1630 + preempt_enable(); 1631 1631 } 1632 1632 } 1633 1633 #endif