Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched: Make separate sched*.c translation units

Since one needs to do something at conferences and fixing compile
warnings doesn't actually require much if any attention I decided
to break up the sched.c #include "*.c" fest.

This further modularizes the scheduler code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-x0fcd3mnp8f9c99grcpewmhi@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by

Peter Zijlstra and committed by
Ingo Molnar
029632fb 60686317

+2034 -1954
+2 -1
include/linux/latencytop.h
··· 10 10 #define _INCLUDE_GUARD_LATENCYTOP_H_ 11 11 12 12 #include <linux/compiler.h> 13 + struct task_struct; 14 + 13 15 #ifdef CONFIG_LATENCYTOP 14 16 15 17 #define LT_SAVECOUNT 32 ··· 25 23 }; 26 24 27 25 28 - struct task_struct; 29 26 30 27 extern int latencytop_enabled; 31 28 void __account_scheduler_latency(struct task_struct *task, int usecs, int inter);
+9
include/linux/sched.h
··· 925 925 return to_cpumask(sg->cpumask); 926 926 } 927 927 928 + /** 929 + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 930 + * @group: The group whose first cpu is to be returned. 931 + */ 932 + static inline unsigned int group_first_cpu(struct sched_group *group) 933 + { 934 + return cpumask_first(sched_group_cpus(group)); 935 + } 936 + 928 937 struct sched_domain_attr { 929 938 int relax_domain_level; 930 939 };
+7 -3
kernel/Makefile
··· 2 2 # Makefile for the linux kernel. 3 3 # 4 4 5 - obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5 + obj-y = fork.o exec_domain.o panic.o printk.o \ 6 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 7 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 8 8 signal.o sys.o kmod.o workqueue.o pid.o \ ··· 10 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 11 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 12 12 notifier.o ksysfs.o sched_clock.o cred.o \ 13 - async.o range.o 14 - obj-y += groups.o 13 + async.o range.o groups.o 14 + 15 + obj-y += sched.o sched_idletask.o sched_fair.o sched_rt.o sched_stoptask.o 16 + obj-$(CONFIG_SCHED_AUTOGROUP) += sched_autogroup.o 17 + obj-$(CONFIG_SCHEDSTATS) += sched_stats.o 18 + obj-$(CONFIG_SCHED_DEBUG) += sched_debug.o 15 19 16 20 ifdef CONFIG_FUNCTION_TRACER 17 21 # Do not trace debug files and internal ftrace files
+55 -1773
kernel/sched.c
··· 56 56 #include <linux/percpu.h> 57 57 #include <linux/proc_fs.h> 58 58 #include <linux/seq_file.h> 59 - #include <linux/stop_machine.h> 60 59 #include <linux/sysctl.h> 61 60 #include <linux/syscalls.h> 62 61 #include <linux/times.h> ··· 71 72 #include <linux/ftrace.h> 72 73 #include <linux/slab.h> 73 74 #include <linux/init_task.h> 74 - #include <linux/jump_label.h> 75 75 76 76 #include <asm/tlb.h> 77 77 #include <asm/irq_regs.h> 78 - #include <asm/mutex.h> 79 78 #ifdef CONFIG_PARAVIRT 80 79 #include <asm/paravirt.h> 81 80 #endif 82 81 83 - #include "sched_cpupri.h" 82 + #include "sched.h" 84 83 #include "workqueue_sched.h" 85 - #include "sched_autogroup.h" 86 84 87 85 #define CREATE_TRACE_POINTS 88 86 #include <trace/events/sched.h> 89 87 90 - /* 91 - * Convert user-nice values [ -20 ... 0 ... 19 ] 92 - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 93 - * and back. 94 - */ 95 - #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 96 - #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 97 - #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 98 - 99 - /* 100 - * 'User priority' is the nice value converted to something we 101 - * can work with better when scaling various scheduler parameters, 102 - * it's a [ 0 ... 39 ] range. 103 - */ 104 - #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 105 - #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 106 - #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 107 - 108 - /* 109 - * Helpers for converting nanosecond timing to jiffy resolution 110 - */ 111 - #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 112 - 113 - #define NICE_0_LOAD SCHED_LOAD_SCALE 114 - #define NICE_0_SHIFT SCHED_LOAD_SHIFT 115 - 116 - /* 117 - * These are the 'tuning knobs' of the scheduler: 118 - * 119 - * default timeslice is 100 msecs (used only for SCHED_RR tasks). 120 - * Timeslices get refilled after they expire. 
121 - */ 122 - #define DEF_TIMESLICE (100 * HZ / 1000) 123 - 124 - /* 125 - * single value that denotes runtime == period, ie unlimited time. 126 - */ 127 - #define RUNTIME_INF ((u64)~0ULL) 128 - 129 - static inline int rt_policy(int policy) 130 - { 131 - if (policy == SCHED_FIFO || policy == SCHED_RR) 132 - return 1; 133 - return 0; 134 - } 135 - 136 - static inline int task_has_rt_policy(struct task_struct *p) 137 - { 138 - return rt_policy(p->policy); 139 - } 140 - 141 - /* 142 - * This is the priority-queue data structure of the RT scheduling class: 143 - */ 144 - struct rt_prio_array { 145 - DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 146 - struct list_head queue[MAX_RT_PRIO]; 147 - }; 148 - 149 - struct rt_bandwidth { 150 - /* nests inside the rq lock: */ 151 - raw_spinlock_t rt_runtime_lock; 152 - ktime_t rt_period; 153 - u64 rt_runtime; 154 - struct hrtimer rt_period_timer; 155 - }; 156 - 157 - static struct rt_bandwidth def_rt_bandwidth; 158 - 159 - static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 160 - 161 - static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) 162 - { 163 - struct rt_bandwidth *rt_b = 164 - container_of(timer, struct rt_bandwidth, rt_period_timer); 165 - ktime_t now; 166 - int overrun; 167 - int idle = 0; 168 - 169 - for (;;) { 170 - now = hrtimer_cb_get_time(timer); 171 - overrun = hrtimer_forward(timer, now, rt_b->rt_period); 172 - 173 - if (!overrun) 174 - break; 175 - 176 - idle = do_sched_rt_period_timer(rt_b, overrun); 177 - } 178 - 179 - return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; 180 - } 181 - 182 - static 183 - void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) 184 - { 185 - rt_b->rt_period = ns_to_ktime(period); 186 - rt_b->rt_runtime = runtime; 187 - 188 - raw_spin_lock_init(&rt_b->rt_runtime_lock); 189 - 190 - hrtimer_init(&rt_b->rt_period_timer, 191 - CLOCK_MONOTONIC, HRTIMER_MODE_REL); 192 - rt_b->rt_period_timer.function = sched_rt_period_timer; 193 - } 194 - 195 - static inline int rt_bandwidth_enabled(void) 196 - { 197 - return sysctl_sched_rt_runtime >= 0; 198 - } 199 - 200 - static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 88 + void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 201 89 { 202 90 unsigned long delta; 203 91 ktime_t soft, hard, now; ··· 104 218 } 105 219 } 106 220 107 - static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 108 - { 109 - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 110 - return; 111 - 112 - if (hrtimer_active(&rt_b->rt_period_timer)) 113 - return; 114 - 115 - raw_spin_lock(&rt_b->rt_runtime_lock); 116 - start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); 117 - raw_spin_unlock(&rt_b->rt_runtime_lock); 118 - } 119 - 120 - #ifdef CONFIG_RT_GROUP_SCHED 121 - static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 122 - { 123 - hrtimer_cancel(&rt_b->rt_period_timer); 124 - } 125 - #endif 126 - 127 - /* 128 - * sched_domains_mutex serializes calls to init_sched_domains, 129 - * detach_destroy_domains and partition_sched_domains. 
130 - */ 131 - static DEFINE_MUTEX(sched_domains_mutex); 132 - 133 - #ifdef CONFIG_CGROUP_SCHED 134 - 135 - #include <linux/cgroup.h> 136 - 137 - struct cfs_rq; 138 - 139 - static LIST_HEAD(task_groups); 140 - 141 - struct cfs_bandwidth { 142 - #ifdef CONFIG_CFS_BANDWIDTH 143 - raw_spinlock_t lock; 144 - ktime_t period; 145 - u64 quota, runtime; 146 - s64 hierarchal_quota; 147 - u64 runtime_expires; 148 - 149 - int idle, timer_active; 150 - struct hrtimer period_timer, slack_timer; 151 - struct list_head throttled_cfs_rq; 152 - 153 - /* statistics */ 154 - int nr_periods, nr_throttled; 155 - u64 throttled_time; 156 - #endif 157 - }; 158 - 159 - /* task group related information */ 160 - struct task_group { 161 - struct cgroup_subsys_state css; 162 - 163 - #ifdef CONFIG_FAIR_GROUP_SCHED 164 - /* schedulable entities of this group on each cpu */ 165 - struct sched_entity **se; 166 - /* runqueue "owned" by this group on each cpu */ 167 - struct cfs_rq **cfs_rq; 168 - unsigned long shares; 169 - 170 - atomic_t load_weight; 171 - #endif 172 - 173 - #ifdef CONFIG_RT_GROUP_SCHED 174 - struct sched_rt_entity **rt_se; 175 - struct rt_rq **rt_rq; 176 - 177 - struct rt_bandwidth rt_bandwidth; 178 - #endif 179 - 180 - struct rcu_head rcu; 181 - struct list_head list; 182 - 183 - struct task_group *parent; 184 - struct list_head siblings; 185 - struct list_head children; 186 - 187 - #ifdef CONFIG_SCHED_AUTOGROUP 188 - struct autogroup *autogroup; 189 - #endif 190 - 191 - struct cfs_bandwidth cfs_bandwidth; 192 - }; 193 - 194 - /* task_group_lock serializes the addition/removal of task groups */ 195 - static DEFINE_SPINLOCK(task_group_lock); 196 - 197 - #ifdef CONFIG_FAIR_GROUP_SCHED 198 - 199 - # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD 200 - 201 - /* 202 - * A weight of 0 or 1 can cause arithmetics problems. 
203 - * A weight of a cfs_rq is the sum of weights of which entities 204 - * are queued on this cfs_rq, so a weight of a entity should not be 205 - * too large, so as the shares value of a task group. 206 - * (The default weight is 1024 - so there's no practical 207 - * limitation from this.) 208 - */ 209 - #define MIN_SHARES (1UL << 1) 210 - #define MAX_SHARES (1UL << 18) 211 - 212 - static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 213 - #endif 214 - 215 - /* Default task group. 216 - * Every task in system belong to this group at bootup. 217 - */ 218 - struct task_group root_task_group; 219 - 220 - #endif /* CONFIG_CGROUP_SCHED */ 221 - 222 - /* CFS-related fields in a runqueue */ 223 - struct cfs_rq { 224 - struct load_weight load; 225 - unsigned long nr_running, h_nr_running; 226 - 227 - u64 exec_clock; 228 - u64 min_vruntime; 229 - #ifndef CONFIG_64BIT 230 - u64 min_vruntime_copy; 231 - #endif 232 - 233 - struct rb_root tasks_timeline; 234 - struct rb_node *rb_leftmost; 235 - 236 - struct list_head tasks; 237 - struct list_head *balance_iterator; 238 - 239 - /* 240 - * 'curr' points to currently running entity on this cfs_rq. 241 - * It is set to NULL otherwise (i.e when none are currently running). 242 - */ 243 - struct sched_entity *curr, *next, *last, *skip; 244 - 245 - #ifdef CONFIG_SCHED_DEBUG 246 - unsigned int nr_spread_over; 247 - #endif 248 - 249 - #ifdef CONFIG_FAIR_GROUP_SCHED 250 - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 251 - 252 - /* 253 - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 254 - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 255 - * (like users, containers etc.) 256 - * 257 - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 258 - * list is used during load balance. 
259 - */ 260 - int on_list; 261 - struct list_head leaf_cfs_rq_list; 262 - struct task_group *tg; /* group that "owns" this runqueue */ 263 - 264 - #ifdef CONFIG_SMP 265 - /* 266 - * the part of load.weight contributed by tasks 267 - */ 268 - unsigned long task_weight; 269 - 270 - /* 271 - * h_load = weight * f(tg) 272 - * 273 - * Where f(tg) is the recursive weight fraction assigned to 274 - * this group. 275 - */ 276 - unsigned long h_load; 277 - 278 - /* 279 - * Maintaining per-cpu shares distribution for group scheduling 280 - * 281 - * load_stamp is the last time we updated the load average 282 - * load_last is the last time we updated the load average and saw load 283 - * load_unacc_exec_time is currently unaccounted execution time 284 - */ 285 - u64 load_avg; 286 - u64 load_period; 287 - u64 load_stamp, load_last, load_unacc_exec_time; 288 - 289 - unsigned long load_contribution; 290 - #endif 291 - #ifdef CONFIG_CFS_BANDWIDTH 292 - int runtime_enabled; 293 - u64 runtime_expires; 294 - s64 runtime_remaining; 295 - 296 - u64 throttled_timestamp; 297 - int throttled, throttle_count; 298 - struct list_head throttled_list; 299 - #endif 300 - #endif 301 - }; 302 - 303 - #ifdef CONFIG_FAIR_GROUP_SCHED 304 - #ifdef CONFIG_CFS_BANDWIDTH 305 - static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) 306 - { 307 - return &tg->cfs_bandwidth; 308 - } 309 - 310 - static inline u64 default_cfs_period(void); 311 - static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); 312 - static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); 313 - 314 - static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 315 - { 316 - struct cfs_bandwidth *cfs_b = 317 - container_of(timer, struct cfs_bandwidth, slack_timer); 318 - do_sched_cfs_slack_timer(cfs_b); 319 - 320 - return HRTIMER_NORESTART; 321 - } 322 - 323 - static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) 324 - { 325 - struct cfs_bandwidth 
*cfs_b = 326 - container_of(timer, struct cfs_bandwidth, period_timer); 327 - ktime_t now; 328 - int overrun; 329 - int idle = 0; 330 - 331 - for (;;) { 332 - now = hrtimer_cb_get_time(timer); 333 - overrun = hrtimer_forward(timer, now, cfs_b->period); 334 - 335 - if (!overrun) 336 - break; 337 - 338 - idle = do_sched_cfs_period_timer(cfs_b, overrun); 339 - } 340 - 341 - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 342 - } 343 - 344 - static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 345 - { 346 - raw_spin_lock_init(&cfs_b->lock); 347 - cfs_b->runtime = 0; 348 - cfs_b->quota = RUNTIME_INF; 349 - cfs_b->period = ns_to_ktime(default_cfs_period()); 350 - 351 - INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); 352 - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 353 - cfs_b->period_timer.function = sched_cfs_period_timer; 354 - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 355 - cfs_b->slack_timer.function = sched_cfs_slack_timer; 356 - } 357 - 358 - static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) 359 - { 360 - cfs_rq->runtime_enabled = 0; 361 - INIT_LIST_HEAD(&cfs_rq->throttled_list); 362 - } 363 - 364 - /* requires cfs_b->lock, may release to reprogram timer */ 365 - static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 366 - { 367 - /* 368 - * The timer may be active because we're trying to set a new bandwidth 369 - * period or because we're racing with the tear-down path 370 - * (timer_active==0 becomes visible before the hrtimer call-back 371 - * terminates). 
In either case we ensure that it's re-programmed 372 - */ 373 - while (unlikely(hrtimer_active(&cfs_b->period_timer))) { 374 - raw_spin_unlock(&cfs_b->lock); 375 - /* ensure cfs_b->lock is available while we wait */ 376 - hrtimer_cancel(&cfs_b->period_timer); 377 - 378 - raw_spin_lock(&cfs_b->lock); 379 - /* if someone else restarted the timer then we're done */ 380 - if (cfs_b->timer_active) 381 - return; 382 - } 383 - 384 - cfs_b->timer_active = 1; 385 - start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); 386 - } 387 - 388 - static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 389 - { 390 - hrtimer_cancel(&cfs_b->period_timer); 391 - hrtimer_cancel(&cfs_b->slack_timer); 392 - } 393 - 394 - #ifdef HAVE_JUMP_LABEL 395 - static struct jump_label_key __cfs_bandwidth_used; 396 - 397 - static inline bool cfs_bandwidth_used(void) 398 - { 399 - return static_branch(&__cfs_bandwidth_used); 400 - } 401 - 402 - static void account_cfs_bandwidth_used(int enabled, int was_enabled) 403 - { 404 - /* only need to count groups transitioning between enabled/!enabled */ 405 - if (enabled && !was_enabled) 406 - jump_label_inc(&__cfs_bandwidth_used); 407 - else if (!enabled && was_enabled) 408 - jump_label_dec(&__cfs_bandwidth_used); 409 - } 410 - #else /* !HAVE_JUMP_LABEL */ 411 - /* static_branch doesn't help unless supported */ 412 - static int cfs_bandwidth_used(void) 413 - { 414 - return 1; 415 - } 416 - static void account_cfs_bandwidth_used(int enabled, int was_enabled) {} 417 - #endif /* HAVE_JUMP_LABEL */ 418 - #else /* !CONFIG_CFS_BANDWIDTH */ 419 - static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 420 - static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 421 - static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 422 - 423 - static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) 424 - { 425 - return NULL; 426 - } 427 - #endif /* CONFIG_CFS_BANDWIDTH */ 428 - #endif /* CONFIG_FAIR_GROUP_SCHED */ 429 - 430 
- /* Real-Time classes' related field in a runqueue: */ 431 - struct rt_rq { 432 - struct rt_prio_array active; 433 - unsigned long rt_nr_running; 434 - #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 435 - struct { 436 - int curr; /* highest queued rt task prio */ 437 - #ifdef CONFIG_SMP 438 - int next; /* next highest */ 439 - #endif 440 - } highest_prio; 441 - #endif 442 - #ifdef CONFIG_SMP 443 - unsigned long rt_nr_migratory; 444 - unsigned long rt_nr_total; 445 - int overloaded; 446 - struct plist_head pushable_tasks; 447 - #endif 448 - int rt_throttled; 449 - u64 rt_time; 450 - u64 rt_runtime; 451 - /* Nests inside the rq lock: */ 452 - raw_spinlock_t rt_runtime_lock; 453 - 454 - #ifdef CONFIG_RT_GROUP_SCHED 455 - unsigned long rt_nr_boosted; 456 - 457 - struct rq *rq; 458 - struct list_head leaf_rt_rq_list; 459 - struct task_group *tg; 460 - #endif 461 - }; 462 - 463 - #ifdef CONFIG_SMP 464 - 465 - /* 466 - * We add the notion of a root-domain which will be used to define per-domain 467 - * variables. Each exclusive cpuset essentially defines an island domain by 468 - * fully partitioning the member cpus from any other cpuset. Whenever a new 469 - * exclusive cpuset is created, we also create and attach a new root-domain 470 - * object. 471 - * 472 - */ 473 - struct root_domain { 474 - atomic_t refcount; 475 - atomic_t rto_count; 476 - struct rcu_head rcu; 477 - cpumask_var_t span; 478 - cpumask_var_t online; 479 - 480 - /* 481 - * The "RT overload" flag: it gets set if a CPU has more than 482 - * one runnable RT task. 483 - */ 484 - cpumask_var_t rto_mask; 485 - struct cpupri cpupri; 486 - }; 487 - 488 - /* 489 - * By default the system creates a single root-domain with all cpus as 490 - * members (mimicking the global state we have today). 491 - */ 492 - static struct root_domain def_root_domain; 493 - 494 - #endif /* CONFIG_SMP */ 495 - 496 - /* 497 - * This is the main, per-CPU runqueue data structure. 
498 - * 499 - * Locking rule: those places that want to lock multiple runqueues 500 - * (such as the load balancing or the thread migration code), lock 501 - * acquire operations must be ordered by ascending &runqueue. 502 - */ 503 - struct rq { 504 - /* runqueue lock: */ 505 - raw_spinlock_t lock; 506 - 507 - /* 508 - * nr_running and cpu_load should be in the same cacheline because 509 - * remote CPUs use both these fields when doing load calculation. 510 - */ 511 - unsigned long nr_running; 512 - #define CPU_LOAD_IDX_MAX 5 513 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 514 - unsigned long last_load_update_tick; 515 - #ifdef CONFIG_NO_HZ 516 - u64 nohz_stamp; 517 - unsigned char nohz_balance_kick; 518 - #endif 519 - int skip_clock_update; 520 - 521 - /* capture load from *all* tasks on this cpu: */ 522 - struct load_weight load; 523 - unsigned long nr_load_updates; 524 - u64 nr_switches; 525 - 526 - struct cfs_rq cfs; 527 - struct rt_rq rt; 528 - 529 - #ifdef CONFIG_FAIR_GROUP_SCHED 530 - /* list of leaf cfs_rq on this cpu: */ 531 - struct list_head leaf_cfs_rq_list; 532 - #endif 533 - #ifdef CONFIG_RT_GROUP_SCHED 534 - struct list_head leaf_rt_rq_list; 535 - #endif 536 - 537 - /* 538 - * This is part of a global counter where only the total sum 539 - * over all CPUs matters. A task can increase this counter on 540 - * one CPU and if it got migrated afterwards it may decrease 541 - * it on another CPU. 
Always updated under the runqueue lock: 542 - */ 543 - unsigned long nr_uninterruptible; 544 - 545 - struct task_struct *curr, *idle, *stop; 546 - unsigned long next_balance; 547 - struct mm_struct *prev_mm; 548 - 549 - u64 clock; 550 - u64 clock_task; 551 - 552 - atomic_t nr_iowait; 553 - 554 - #ifdef CONFIG_SMP 555 - struct root_domain *rd; 556 - struct sched_domain *sd; 557 - 558 - unsigned long cpu_power; 559 - 560 - unsigned char idle_balance; 561 - /* For active balancing */ 562 - int post_schedule; 563 - int active_balance; 564 - int push_cpu; 565 - struct cpu_stop_work active_balance_work; 566 - /* cpu of this runqueue: */ 567 - int cpu; 568 - int online; 569 - 570 - u64 rt_avg; 571 - u64 age_stamp; 572 - u64 idle_stamp; 573 - u64 avg_idle; 574 - #endif 575 - 576 - #ifdef CONFIG_IRQ_TIME_ACCOUNTING 577 - u64 prev_irq_time; 578 - #endif 579 - #ifdef CONFIG_PARAVIRT 580 - u64 prev_steal_time; 581 - #endif 582 - #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 583 - u64 prev_steal_time_rq; 584 - #endif 585 - 586 - /* calc_load related fields */ 587 - unsigned long calc_load_update; 588 - long calc_load_active; 589 - 590 - #ifdef CONFIG_SCHED_HRTICK 591 - #ifdef CONFIG_SMP 592 - int hrtick_csd_pending; 593 - struct call_single_data hrtick_csd; 594 - #endif 595 - struct hrtimer hrtick_timer; 596 - #endif 597 - 598 - #ifdef CONFIG_SCHEDSTATS 599 - /* latency stats */ 600 - struct sched_info rq_sched_info; 601 - unsigned long long rq_cpu_time; 602 - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ 603 - 604 - /* sys_sched_yield() stats */ 605 - unsigned int yld_count; 606 - 607 - /* schedule() stats */ 608 - unsigned int sched_switch; 609 - unsigned int sched_count; 610 - unsigned int sched_goidle; 611 - 612 - /* try_to_wake_up() stats */ 613 - unsigned int ttwu_count; 614 - unsigned int ttwu_local; 615 - #endif 616 - 617 - #ifdef CONFIG_SMP 618 - struct llist_head wake_list; 619 - #endif 620 - }; 621 - 622 - static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 623 - 624 - 625 - static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); 626 - 627 - static inline int cpu_of(struct rq *rq) 628 - { 629 - #ifdef CONFIG_SMP 630 - return rq->cpu; 631 - #else 632 - return 0; 633 - #endif 634 - } 635 - 636 - #define rcu_dereference_check_sched_domain(p) \ 637 - rcu_dereference_check((p), \ 638 - lockdep_is_held(&sched_domains_mutex)) 639 - 640 - /* 641 - * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 642 - * See detach_destroy_domains: synchronize_sched for details. 643 - * 644 - * The domain tree of any CPU may only be accessed from within 645 - * preempt-disabled sections. 646 - */ 647 - #define for_each_domain(cpu, __sd) \ 648 - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 649 - 650 - #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 651 - #define this_rq() (&__get_cpu_var(runqueues)) 652 - #define task_rq(p) cpu_rq(task_cpu(p)) 653 - #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 654 - #define raw_rq() (&__raw_get_cpu_var(runqueues)) 655 - 656 - #ifdef CONFIG_CGROUP_SCHED 657 - 658 - /* 659 - * Return the group to which this tasks belongs. 660 - * 661 - * We use task_subsys_state_check() and extend the RCU verification with 662 - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each 663 - * task it moves into the cgroup. Therefore by holding either of those locks, 664 - * we pin the task to the current cgroup. 
665 - */ 666 - static inline struct task_group *task_group(struct task_struct *p) 667 - { 668 - struct task_group *tg; 669 - struct cgroup_subsys_state *css; 670 - 671 - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 672 - lockdep_is_held(&p->pi_lock) || 673 - lockdep_is_held(&task_rq(p)->lock)); 674 - tg = container_of(css, struct task_group, css); 675 - 676 - return autogroup_task_group(p, tg); 677 - } 678 - 679 - /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 680 - static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 681 - { 682 - #if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) 683 - struct task_group *tg = task_group(p); 684 - #endif 685 - 686 - #ifdef CONFIG_FAIR_GROUP_SCHED 687 - p->se.cfs_rq = tg->cfs_rq[cpu]; 688 - p->se.parent = tg->se[cpu]; 689 - #endif 690 - 691 - #ifdef CONFIG_RT_GROUP_SCHED 692 - p->rt.rt_rq = tg->rt_rq[cpu]; 693 - p->rt.parent = tg->rt_se[cpu]; 694 - #endif 695 - } 696 - 697 - #else /* CONFIG_CGROUP_SCHED */ 698 - 699 - static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 700 - static inline struct task_group *task_group(struct task_struct *p) 701 - { 702 - return NULL; 703 - } 704 - 705 - #endif /* CONFIG_CGROUP_SCHED */ 221 + DEFINE_MUTEX(sched_domains_mutex); 222 + DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 706 223 707 224 static void update_rq_clock_task(struct rq *rq, s64 delta); 708 225 709 - static void update_rq_clock(struct rq *rq) 226 + void update_rq_clock(struct rq *rq) 710 227 { 711 228 s64 delta; 712 229 ··· 122 833 } 123 834 124 835 /* 125 - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 126 - */ 127 - #ifdef CONFIG_SCHED_DEBUG 128 - # define const_debug __read_mostly 129 - #else 130 - # define const_debug static const 131 - #endif 132 - 133 - /** 134 - * runqueue_is_locked - Returns true if the current cpu runqueue is locked 135 - * @cpu: the processor in question. 
136 - * 137 - * This interface allows printk to be called with the runqueue lock 138 - * held and know whether or not it is OK to wake up the klogd. 139 - */ 140 - int runqueue_is_locked(int cpu) 141 - { 142 - return raw_spin_is_locked(&cpu_rq(cpu)->lock); 143 - } 144 - 145 - /* 146 836 * Debugging: various feature bits 147 837 */ 148 - 149 - #define SCHED_FEAT(name, enabled) \ 150 - __SCHED_FEAT_##name , 151 - 152 - enum { 153 - #include "sched_features.h" 154 - }; 155 - 156 - #undef SCHED_FEAT 157 838 158 839 #define SCHED_FEAT(name, enabled) \ 159 840 (1UL << __SCHED_FEAT_##name) * enabled | ··· 224 965 225 966 #endif 226 967 227 - #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 228 - 229 968 /* 230 969 * Number of tasks to iterate in a single balance run. 231 970 * Limited because this is done with IRQs disabled. ··· 244 987 */ 245 988 unsigned int sysctl_sched_rt_period = 1000000; 246 989 247 - static __read_mostly int scheduler_running; 990 + __read_mostly int scheduler_running; 248 991 249 992 /* 250 993 * part of the period that we allow rt tasks to run in us. 
··· 252 995 */ 253 996 int sysctl_sched_rt_runtime = 950000; 254 997 255 - static inline u64 global_rt_period(void) 256 - { 257 - return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 258 - } 259 998 260 - static inline u64 global_rt_runtime(void) 261 - { 262 - if (sysctl_sched_rt_runtime < 0) 263 - return RUNTIME_INF; 264 - 265 - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 266 - } 267 - 268 - #ifndef prepare_arch_switch 269 - # define prepare_arch_switch(next) do { } while (0) 270 - #endif 271 - #ifndef finish_arch_switch 272 - # define finish_arch_switch(prev) do { } while (0) 273 - #endif 274 - 275 - static inline int task_current(struct rq *rq, struct task_struct *p) 276 - { 277 - return rq->curr == p; 278 - } 279 - 280 - static inline int task_running(struct rq *rq, struct task_struct *p) 281 - { 282 - #ifdef CONFIG_SMP 283 - return p->on_cpu; 284 - #else 285 - return task_current(rq, p); 286 - #endif 287 - } 288 - 289 - #ifndef __ARCH_WANT_UNLOCKED_CTXSW 290 - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 291 - { 292 - #ifdef CONFIG_SMP 293 - /* 294 - * We can optimise this out completely for !SMP, because the 295 - * SMP rebalancing from interrupt is the only thing that cares 296 - * here. 297 - */ 298 - next->on_cpu = 1; 299 - #endif 300 - } 301 - 302 - static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 303 - { 304 - #ifdef CONFIG_SMP 305 - /* 306 - * After ->on_cpu is cleared, the task can be moved to a different CPU. 307 - * We must ensure this doesn't happen until the switch is completely 308 - * finished. 
309 - */ 310 - smp_wmb(); 311 - prev->on_cpu = 0; 312 - #endif 313 - #ifdef CONFIG_DEBUG_SPINLOCK 314 - /* this is a valid case when another task releases the spinlock */ 315 - rq->lock.owner = current; 316 - #endif 317 - /* 318 - * If we are tracking spinlock dependencies then we have to 319 - * fix up the runqueue lock - which gets 'carried over' from 320 - * prev into current: 321 - */ 322 - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 323 - 324 - raw_spin_unlock_irq(&rq->lock); 325 - } 326 - 327 - #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 328 - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 329 - { 330 - #ifdef CONFIG_SMP 331 - /* 332 - * We can optimise this out completely for !SMP, because the 333 - * SMP rebalancing from interrupt is the only thing that cares 334 - * here. 335 - */ 336 - next->on_cpu = 1; 337 - #endif 338 - #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 339 - raw_spin_unlock_irq(&rq->lock); 340 - #else 341 - raw_spin_unlock(&rq->lock); 342 - #endif 343 - } 344 - 345 - static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 346 - { 347 - #ifdef CONFIG_SMP 348 - /* 349 - * After ->on_cpu is cleared, the task can be moved to a different CPU. 350 - * We must ensure this doesn't happen until the switch is completely 351 - * finished. 352 - */ 353 - smp_wmb(); 354 - prev->on_cpu = 0; 355 - #endif 356 - #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 357 - local_irq_enable(); 358 - #endif 359 - } 360 - #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 361 999 362 1000 /* 363 1001 * __task_rq_lock - lock the rq @p resides on. ··· 335 1183 * rq->lock. 
336 1184 */ 337 1185 338 - /* 339 - * Use hrtick when: 340 - * - enabled by features 341 - * - hrtimer is actually high res 342 - */ 343 - static inline int hrtick_enabled(struct rq *rq) 344 - { 345 - if (!sched_feat(HRTICK)) 346 - return 0; 347 - if (!cpu_active(cpu_of(rq))) 348 - return 0; 349 - return hrtimer_is_hres_active(&rq->hrtick_timer); 350 - } 351 - 352 1186 static void hrtick_clear(struct rq *rq) 353 1187 { 354 1188 if (hrtimer_active(&rq->hrtick_timer)) ··· 378 1240 * 379 1241 * called with rq->lock held and irqs disabled 380 1242 */ 381 - static void hrtick_start(struct rq *rq, u64 delay) 1243 + void hrtick_start(struct rq *rq, u64 delay) 382 1244 { 383 1245 struct hrtimer *timer = &rq->hrtick_timer; 384 1246 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); ··· 422 1284 * 423 1285 * called with rq->lock held and irqs disabled 424 1286 */ 425 - static void hrtick_start(struct rq *rq, u64 delay) 1287 + void hrtick_start(struct rq *rq, u64 delay) 426 1288 { 427 1289 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 428 1290 HRTIMER_MODE_REL_PINNED, 0); ··· 473 1335 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 474 1336 #endif 475 1337 476 - static void resched_task(struct task_struct *p) 1338 + void resched_task(struct task_struct *p) 477 1339 { 478 1340 int cpu; 479 1341 ··· 494 1356 smp_send_reschedule(cpu); 495 1357 } 496 1358 497 - static void resched_cpu(int cpu) 1359 + void resched_cpu(int cpu) 498 1360 { 499 1361 struct rq *rq = cpu_rq(cpu); 500 1362 unsigned long flags; ··· 587 1449 588 1450 #endif /* CONFIG_NO_HZ */ 589 1451 590 - static u64 sched_avg_period(void) 591 - { 592 - return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 593 - } 594 - 595 - static void sched_avg_update(struct rq *rq) 1452 + void sched_avg_update(struct rq *rq) 596 1453 { 597 1454 s64 period = sched_avg_period(); 598 1455 ··· 603 1470 } 604 1471 } 605 1472 606 - static void sched_rt_avg_update(struct rq *rq, u64 
rt_delta) 607 - { 608 - rq->rt_avg += rt_delta; 609 - sched_avg_update(rq); 610 - } 611 - 612 1473 #else /* !CONFIG_SMP */ 613 - static void resched_task(struct task_struct *p) 1474 + void resched_task(struct task_struct *p) 614 1475 { 615 1476 assert_raw_spin_locked(&task_rq(p)->lock); 616 1477 set_tsk_need_resched(p); 617 1478 } 618 - 619 - static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 620 - { 621 - } 622 - 623 - static void sched_avg_update(struct rq *rq) 624 - { 625 - } 626 1479 #endif /* CONFIG_SMP */ 627 - 628 - #if BITS_PER_LONG == 32 629 - # define WMULT_CONST (~0UL) 630 - #else 631 - # define WMULT_CONST (1UL << 32) 632 - #endif 633 - 634 - #define WMULT_SHIFT 32 635 - 636 - /* 637 - * Shift right and round: 638 - */ 639 - #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 640 - 641 - /* 642 - * delta *= weight / lw 643 - */ 644 - static unsigned long 645 - calc_delta_mine(unsigned long delta_exec, unsigned long weight, 646 - struct load_weight *lw) 647 - { 648 - u64 tmp; 649 - 650 - /* 651 - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched 652 - * entities since MIN_SHARES = 2. Treat weight as 1 if less than 653 - * 2^SCHED_LOAD_RESOLUTION. 
654 - */ 655 - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) 656 - tmp = (u64)delta_exec * scale_load_down(weight); 657 - else 658 - tmp = (u64)delta_exec; 659 - 660 - if (!lw->inv_weight) { 661 - unsigned long w = scale_load_down(lw->weight); 662 - 663 - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 664 - lw->inv_weight = 1; 665 - else if (unlikely(!w)) 666 - lw->inv_weight = WMULT_CONST; 667 - else 668 - lw->inv_weight = WMULT_CONST / w; 669 - } 670 - 671 - /* 672 - * Check whether we'd overflow the 64-bit multiplication: 673 - */ 674 - if (unlikely(tmp > WMULT_CONST)) 675 - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 676 - WMULT_SHIFT/2); 677 - else 678 - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 679 - 680 - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 681 - } 682 - 683 - static inline void update_load_add(struct load_weight *lw, unsigned long inc) 684 - { 685 - lw->weight += inc; 686 - lw->inv_weight = 0; 687 - } 688 - 689 - static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 690 - { 691 - lw->weight -= dec; 692 - lw->inv_weight = 0; 693 - } 694 - 695 - static inline void update_load_set(struct load_weight *lw, unsigned long w) 696 - { 697 - lw->weight = w; 698 - lw->inv_weight = 0; 699 - } 700 - 701 - /* 702 - * To aid in avoiding the subversion of "niceness" due to uneven distribution 703 - * of tasks with abnormal "nice" values across CPUs the contribution that 704 - * each task makes to its run queue's load is weighted according to its 705 - * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 706 - * scaled version of the new time slice allocation that they receive on time 707 - * slice expiry etc. 708 - */ 709 - 710 - #define WEIGHT_IDLEPRIO 3 711 - #define WMULT_IDLEPRIO 1431655765 712 - 713 - /* 714 - * Nice levels are multiplicative, with a gentle 10% change for every 715 - * nice level changed. I.e. 
when a CPU-bound task goes from nice 0 to 716 - * nice 1, it will get ~10% less CPU time than another CPU-bound task 717 - * that remained on nice 0. 718 - * 719 - * The "10% effect" is relative and cumulative: from _any_ nice level, 720 - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 721 - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 722 - * If a task goes up by ~10% and another task goes down by ~10% then 723 - * the relative distance between them is ~25%.) 724 - */ 725 - static const int prio_to_weight[40] = { 726 - /* -20 */ 88761, 71755, 56483, 46273, 36291, 727 - /* -15 */ 29154, 23254, 18705, 14949, 11916, 728 - /* -10 */ 9548, 7620, 6100, 4904, 3906, 729 - /* -5 */ 3121, 2501, 1991, 1586, 1277, 730 - /* 0 */ 1024, 820, 655, 526, 423, 731 - /* 5 */ 335, 272, 215, 172, 137, 732 - /* 10 */ 110, 87, 70, 56, 45, 733 - /* 15 */ 36, 29, 23, 18, 15, 734 - }; 735 - 736 - /* 737 - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 738 - * 739 - * In cases where the weight does not change often, we can use the 740 - * precalculated inverse to speed up arithmetics by turning divisions 741 - * into multiplications: 742 - */ 743 - static const u32 prio_to_wmult[40] = { 744 - /* -20 */ 48388, 59856, 76040, 92818, 118348, 745 - /* -15 */ 147320, 184698, 229616, 287308, 360437, 746 - /* -10 */ 449829, 563644, 704093, 875809, 1099582, 747 - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 748 - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 749 - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 750 - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 751 - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 752 - }; 753 - 754 - /* Time spent by the tasks of the cpu accounting group executing in ... */ 755 - enum cpuacct_stat_index { 756 - CPUACCT_STAT_USER, /* ... user mode */ 757 - CPUACCT_STAT_SYSTEM, /* ... 
kernel mode */ 758 - 759 - CPUACCT_STAT_NSTATS, 760 - }; 761 - 762 - #ifdef CONFIG_CGROUP_CPUACCT 763 - static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 764 - static void cpuacct_update_stats(struct task_struct *tsk, 765 - enum cpuacct_stat_index idx, cputime_t val); 766 - #else 767 - static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 768 - static inline void cpuacct_update_stats(struct task_struct *tsk, 769 - enum cpuacct_stat_index idx, cputime_t val) {} 770 - #endif 771 - 772 - static inline void inc_cpu_load(struct rq *rq, unsigned long load) 773 - { 774 - update_load_add(&rq->load, load); 775 - } 776 - 777 - static inline void dec_cpu_load(struct rq *rq, unsigned long load) 778 - { 779 - update_load_sub(&rq->load, load); 780 - } 781 1480 782 1481 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 783 1482 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 784 - typedef int (*tg_visitor)(struct task_group *, void *); 785 - 786 1483 /* 787 1484 * Iterate task_group tree rooted at *from, calling @down when first entering a 788 1485 * node and @up when leaving it for the final time. 789 1486 * 790 1487 * Caller must hold rcu_lock or sufficient equivalent. 791 1488 */ 792 - static int walk_tg_tree_from(struct task_group *from, 1489 + int walk_tg_tree_from(struct task_group *from, 793 1490 tg_visitor down, tg_visitor up, void *data) 794 1491 { 795 1492 struct task_group *parent, *child; ··· 650 1687 return ret; 651 1688 } 652 1689 653 - /* 654 - * Iterate the full tree, calling @down when first entering a node and @up when 655 - * leaving it for the final time. 656 - * 657 - * Caller must hold rcu_lock or sufficient equivalent. 
658 - */ 659 - 660 - static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 661 - { 662 - return walk_tg_tree_from(&root_task_group, down, up, data); 663 - } 664 - 665 - static int tg_nop(struct task_group *tg, void *data) 1690 + int tg_nop(struct task_group *tg, void *data) 666 1691 { 667 1692 return 0; 668 1693 } 669 1694 #endif 670 1695 671 - #ifdef CONFIG_SMP 672 - /* Used instead of source_load when we know the type == 0 */ 673 - static unsigned long weighted_cpuload(const int cpu) 674 - { 675 - return cpu_rq(cpu)->load.weight; 676 - } 677 - 678 - /* 679 - * Return a low guess at the load of a migration-source cpu weighted 680 - * according to the scheduling class and "nice" value. 681 - * 682 - * We want to under-estimate the load of migration sources, to 683 - * balance conservatively. 684 - */ 685 - static unsigned long source_load(int cpu, int type) 686 - { 687 - struct rq *rq = cpu_rq(cpu); 688 - unsigned long total = weighted_cpuload(cpu); 689 - 690 - if (type == 0 || !sched_feat(LB_BIAS)) 691 - return total; 692 - 693 - return min(rq->cpu_load[type-1], total); 694 - } 695 - 696 - /* 697 - * Return a high guess at the load of a migration-target cpu weighted 698 - * according to the scheduling class and "nice" value. 
699 - */ 700 - static unsigned long target_load(int cpu, int type) 701 - { 702 - struct rq *rq = cpu_rq(cpu); 703 - unsigned long total = weighted_cpuload(cpu); 704 - 705 - if (type == 0 || !sched_feat(LB_BIAS)) 706 - return total; 707 - 708 - return max(rq->cpu_load[type-1], total); 709 - } 710 - 711 - static unsigned long power_of(int cpu) 712 - { 713 - return cpu_rq(cpu)->cpu_power; 714 - } 715 - 716 - static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 717 - 718 - static unsigned long cpu_avg_load_per_task(int cpu) 719 - { 720 - struct rq *rq = cpu_rq(cpu); 721 - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 722 - 723 - if (nr_running) 724 - return rq->load.weight / nr_running; 725 - 726 - return 0; 727 - } 728 - 729 - #ifdef CONFIG_PREEMPT 730 - 731 - static void double_rq_lock(struct rq *rq1, struct rq *rq2); 732 - 733 - /* 734 - * fair double_lock_balance: Safely acquires both rq->locks in a fair 735 - * way at the expense of forcing extra atomic operations in all 736 - * invocations. This assures that the double_lock is acquired using the 737 - * same underlying policy as the spinlock_t on this architecture, which 738 - * reduces latency compared to the unfair variant below. However, it 739 - * also adds more overhead and therefore may reduce throughput. 740 - */ 741 - static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 742 - __releases(this_rq->lock) 743 - __acquires(busiest->lock) 744 - __acquires(this_rq->lock) 745 - { 746 - raw_spin_unlock(&this_rq->lock); 747 - double_rq_lock(this_rq, busiest); 748 - 749 - return 1; 750 - } 751 - 752 - #else 753 - /* 754 - * Unfair double_lock_balance: Optimizes throughput at the expense of 755 - * latency by eliminating extra atomic operations when the locks are 756 - * already in proper order on entry. 
This favors lower cpu-ids and will 757 - * grant the double lock to lower cpus over higher ids under contention, 758 - * regardless of entry order into the function. 759 - */ 760 - static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 761 - __releases(this_rq->lock) 762 - __acquires(busiest->lock) 763 - __acquires(this_rq->lock) 764 - { 765 - int ret = 0; 766 - 767 - if (unlikely(!raw_spin_trylock(&busiest->lock))) { 768 - if (busiest < this_rq) { 769 - raw_spin_unlock(&this_rq->lock); 770 - raw_spin_lock(&busiest->lock); 771 - raw_spin_lock_nested(&this_rq->lock, 772 - SINGLE_DEPTH_NESTING); 773 - ret = 1; 774 - } else 775 - raw_spin_lock_nested(&busiest->lock, 776 - SINGLE_DEPTH_NESTING); 777 - } 778 - return ret; 779 - } 780 - 781 - #endif /* CONFIG_PREEMPT */ 782 - 783 - /* 784 - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 785 - */ 786 - static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 787 - { 788 - if (unlikely(!irqs_disabled())) { 789 - /* printk() doesn't work good under rq->lock */ 790 - raw_spin_unlock(&this_rq->lock); 791 - BUG_ON(1); 792 - } 793 - 794 - return _double_lock_balance(this_rq, busiest); 795 - } 796 - 797 - static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 798 - __releases(busiest->lock) 799 - { 800 - raw_spin_unlock(&busiest->lock); 801 - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 802 - } 803 - 804 - /* 805 - * double_rq_lock - safely lock two runqueues 806 - * 807 - * Note this does not disable interrupts like task_rq_lock, 808 - * you need to do so manually before calling. 
809 - */ 810 - static void double_rq_lock(struct rq *rq1, struct rq *rq2) 811 - __acquires(rq1->lock) 812 - __acquires(rq2->lock) 813 - { 814 - BUG_ON(!irqs_disabled()); 815 - if (rq1 == rq2) { 816 - raw_spin_lock(&rq1->lock); 817 - __acquire(rq2->lock); /* Fake it out ;) */ 818 - } else { 819 - if (rq1 < rq2) { 820 - raw_spin_lock(&rq1->lock); 821 - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 822 - } else { 823 - raw_spin_lock(&rq2->lock); 824 - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 825 - } 826 - } 827 - } 828 - 829 - /* 830 - * double_rq_unlock - safely unlock two runqueues 831 - * 832 - * Note this does not restore interrupts like task_rq_unlock, 833 - * you need to do so manually after calling. 834 - */ 835 - static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 836 - __releases(rq1->lock) 837 - __releases(rq2->lock) 838 - { 839 - raw_spin_unlock(&rq1->lock); 840 - if (rq1 != rq2) 841 - raw_spin_unlock(&rq2->lock); 842 - else 843 - __release(rq2->lock); 844 - } 845 - 846 - #else /* CONFIG_SMP */ 847 - 848 - /* 849 - * double_rq_lock - safely lock two runqueues 850 - * 851 - * Note this does not disable interrupts like task_rq_lock, 852 - * you need to do so manually before calling. 853 - */ 854 - static void double_rq_lock(struct rq *rq1, struct rq *rq2) 855 - __acquires(rq1->lock) 856 - __acquires(rq2->lock) 857 - { 858 - BUG_ON(!irqs_disabled()); 859 - BUG_ON(rq1 != rq2); 860 - raw_spin_lock(&rq1->lock); 861 - __acquire(rq2->lock); /* Fake it out ;) */ 862 - } 863 - 864 - /* 865 - * double_rq_unlock - safely unlock two runqueues 866 - * 867 - * Note this does not restore interrupts like task_rq_unlock, 868 - * you need to do so manually after calling. 
869 - */ 870 - static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 871 - __releases(rq1->lock) 872 - __releases(rq2->lock) 873 - { 874 - BUG_ON(rq1 != rq2); 875 - raw_spin_unlock(&rq1->lock); 876 - __release(rq2->lock); 877 - } 878 - 879 - #endif 880 - 881 - static void calc_load_account_idle(struct rq *this_rq); 882 - static void update_sysctl(void); 883 - static int get_update_sysctl_factor(void); 884 - static void update_cpu_load(struct rq *this_rq); 885 - 886 - static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 887 - { 888 - set_task_rq(p, cpu); 889 - #ifdef CONFIG_SMP 890 - /* 891 - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 892 - * successfully executed on another CPU. We must ensure that updates of 893 - * per-task data have been completed by this moment. 894 - */ 895 - smp_wmb(); 896 - task_thread_info(p)->cpu = cpu; 897 - #endif 898 - } 899 - 900 - static const struct sched_class rt_sched_class; 901 - 902 - #define sched_class_highest (&stop_sched_class) 903 - #define for_each_class(class) \ 904 - for (class = sched_class_highest; class; class = class->next) 905 - 906 - #include "sched_stats.h" 907 - 908 - static void inc_nr_running(struct rq *rq) 909 - { 910 - rq->nr_running++; 911 - } 912 - 913 - static void dec_nr_running(struct rq *rq) 914 - { 915 - rq->nr_running--; 916 - } 1696 + void update_cpu_load(struct rq *this_rq); 917 1697 918 1698 static void set_load_weight(struct task_struct *p) 919 1699 { ··· 693 1987 /* 694 1988 * activate_task - move a task to the runqueue. 695 1989 */ 696 - static void activate_task(struct rq *rq, struct task_struct *p, int flags) 1990 + void activate_task(struct rq *rq, struct task_struct *p, int flags) 697 1991 { 698 1992 if (task_contributes_to_load(p)) 699 1993 rq->nr_uninterruptible--; ··· 704 1998 /* 705 1999 * deactivate_task - remove a task from the runqueue. 
706 2000 */ 707 - static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 2001 + void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 708 2002 { 709 2003 if (task_contributes_to_load(p)) 710 2004 rq->nr_uninterruptible++; ··· 929 2223 930 2224 #endif 931 2225 932 - #include "sched_idletask.c" 933 - #include "sched_fair.c" 934 - #include "sched_rt.c" 935 - #include "sched_autogroup.c" 936 - #include "sched_stoptask.c" 937 - #ifdef CONFIG_SCHED_DEBUG 938 - # include "sched_debug.c" 939 - #endif 940 - 941 2226 void sched_set_stop_task(int cpu, struct task_struct *stop) 942 2227 { 943 2228 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; ··· 1026 2329 p->sched_class->prio_changed(rq, p, oldprio); 1027 2330 } 1028 2331 1029 - static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2332 + void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1030 2333 { 1031 2334 const struct sched_class *class; 1032 2335 ··· 1052 2355 } 1053 2356 1054 2357 #ifdef CONFIG_SMP 1055 - /* 1056 - * Is this task likely cache-hot: 1057 - */ 1058 - static int 1059 - task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1060 - { 1061 - s64 delta; 1062 - 1063 - if (p->sched_class != &fair_sched_class) 1064 - return 0; 1065 - 1066 - if (unlikely(p->policy == SCHED_IDLE)) 1067 - return 0; 1068 - 1069 - /* 1070 - * Buddy candidates are cache hot: 1071 - */ 1072 - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 1073 - (&p->se == cfs_rq_of(&p->se)->next || 1074 - &p->se == cfs_rq_of(&p->se)->last)) 1075 - return 1; 1076 - 1077 - if (sysctl_sched_migration_cost == -1) 1078 - return 1; 1079 - if (sysctl_sched_migration_cost == 0) 1080 - return 0; 1081 - 1082 - delta = now - p->se.exec_start; 1083 - 1084 - return delta < (s64)sysctl_sched_migration_cost; 1085 - } 1086 - 1087 2358 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1088 2359 { 1089 2360 #ifdef 
CONFIG_SCHED_DEBUG ··· 2134 3469 */ 2135 3470 static atomic_long_t calc_load_tasks_idle; 2136 3471 2137 - static void calc_load_account_idle(struct rq *this_rq) 3472 + void calc_load_account_idle(struct rq *this_rq) 2138 3473 { 2139 3474 long delta; 2140 3475 ··· 2278 3613 */ 2279 3614 } 2280 3615 #else 2281 - static void calc_load_account_idle(struct rq *this_rq) 3616 + void calc_load_account_idle(struct rq *this_rq) 2282 3617 { 2283 3618 } 2284 3619 ··· 2421 3756 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2422 3757 * every tick. We fix it up based on jiffies. 2423 3758 */ 2424 - static void update_cpu_load(struct rq *this_rq) 3759 + void update_cpu_load(struct rq *this_rq) 2425 3760 { 2426 3761 unsigned long this_load = this_rq->load.weight; 2427 3762 unsigned long curr_jiffies = jiffies; ··· 4813 6148 #endif 4814 6149 } 4815 6150 4816 - /* 4817 - * Increase the granularity value when there are more CPUs, 4818 - * because with more CPUs the 'effective latency' as visible 4819 - * to users decreases. But the relationship is not linear, 4820 - * so pick a second-best guess by going with the log2 of the 4821 - * number of CPUs. 
4822 - * 4823 - * This idea comes from the SD scheduler of Con Kolivas: 4824 - */ 4825 - static int get_update_sysctl_factor(void) 4826 - { 4827 - unsigned int cpus = min_t(int, num_online_cpus(), 8); 4828 - unsigned int factor; 4829 - 4830 - switch (sysctl_sched_tunable_scaling) { 4831 - case SCHED_TUNABLESCALING_NONE: 4832 - factor = 1; 4833 - break; 4834 - case SCHED_TUNABLESCALING_LINEAR: 4835 - factor = cpus; 4836 - break; 4837 - case SCHED_TUNABLESCALING_LOG: 4838 - default: 4839 - factor = 1 + ilog2(cpus); 4840 - break; 4841 - } 4842 - 4843 - return factor; 4844 - } 4845 - 4846 - static void update_sysctl(void) 4847 - { 4848 - unsigned int factor = get_update_sysctl_factor(); 4849 - 4850 - #define SET_SYSCTL(name) \ 4851 - (sysctl_##name = (factor) * normalized_sysctl_##name) 4852 - SET_SYSCTL(sched_min_granularity); 4853 - SET_SYSCTL(sched_latency); 4854 - SET_SYSCTL(sched_wakeup_granularity); 4855 - #undef SET_SYSCTL 4856 - } 4857 - 4858 - static inline void sched_init_granularity(void) 4859 - { 4860 - update_sysctl(); 4861 - } 4862 - 4863 6151 #ifdef CONFIG_SMP 4864 6152 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4865 6153 { ··· 4998 6380 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 4999 6381 rq->calc_load_active = 0; 5000 6382 } 5001 - 5002 - #ifdef CONFIG_CFS_BANDWIDTH 5003 - static void unthrottle_offline_cfs_rqs(struct rq *rq) 5004 - { 5005 - struct cfs_rq *cfs_rq; 5006 - 5007 - for_each_leaf_cfs_rq(rq, cfs_rq) { 5008 - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 5009 - 5010 - if (!cfs_rq->runtime_enabled) 5011 - continue; 5012 - 5013 - /* 5014 - * clock_task is not advancing so we just need to make sure 5015 - * there's some valid quota amount 5016 - */ 5017 - cfs_rq->runtime_remaining = cfs_b->quota; 5018 - if (cfs_rq_throttled(cfs_rq)) 5019 - unthrottle_cfs_rq(cfs_rq); 5020 - } 5021 - } 5022 - #else 5023 - static void unthrottle_offline_cfs_rqs(struct rq *rq) {} 5024 - #endif 5025 
6383 5026 6384 /* 5027 6385 * Migrate all tasks from the rq, sleeping tasks will be migrated by ··· 5604 7010 return -ENOMEM; 5605 7011 } 5606 7012 7013 + /* 7014 + * By default the system creates a single root-domain with all cpus as 7015 + * members (mimicking the global state we have today). 7016 + */ 7017 + struct root_domain def_root_domain; 7018 + 5607 7019 static void init_defrootdomain(void) 5608 7020 { 5609 7021 init_rootdomain(&def_root_domain); ··· 6016 7416 return; 6017 7417 6018 7418 update_group_power(sd, cpu); 7419 + } 7420 + 7421 + int __weak arch_sd_sibling_asym_packing(void) 7422 + { 7423 + return 0*SD_ASYM_PACKING; 6019 7424 } 6020 7425 6021 7426 /* ··· 6658 8053 } 6659 8054 } 6660 8055 6661 - static int update_runtime(struct notifier_block *nfb, 6662 - unsigned long action, void *hcpu) 6663 - { 6664 - int cpu = (int)(long)hcpu; 6665 - 6666 - switch (action) { 6667 - case CPU_DOWN_PREPARE: 6668 - case CPU_DOWN_PREPARE_FROZEN: 6669 - disable_runtime(cpu_rq(cpu)); 6670 - return NOTIFY_OK; 6671 - 6672 - case CPU_DOWN_FAILED: 6673 - case CPU_DOWN_FAILED_FROZEN: 6674 - case CPU_ONLINE: 6675 - case CPU_ONLINE_FROZEN: 6676 - enable_runtime(cpu_rq(cpu)); 6677 - return NOTIFY_OK; 6678 - 6679 - default: 6680 - return NOTIFY_DONE; 6681 - } 6682 - } 6683 - 6684 8056 void __init sched_init_smp(void) 6685 8057 { 6686 8058 cpumask_var_t non_isolated_cpus; ··· 6706 8124 && addr < (unsigned long)__sched_text_end); 6707 8125 } 6708 8126 6709 - static void init_cfs_rq(struct cfs_rq *cfs_rq) 6710 - { 6711 - cfs_rq->tasks_timeline = RB_ROOT; 6712 - INIT_LIST_HEAD(&cfs_rq->tasks); 6713 - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 6714 - #ifndef CONFIG_64BIT 6715 - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 6716 - #endif 6717 - } 6718 - 6719 - static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 6720 - { 6721 - struct rt_prio_array *array; 6722 - int i; 6723 - 6724 - array = &rt_rq->active; 6725 - for (i = 0; i < MAX_RT_PRIO; i++) { 6726 - 
INIT_LIST_HEAD(array->queue + i); 6727 - __clear_bit(i, array->bitmap); 6728 - } 6729 - /* delimiter for bitsearch: */ 6730 - __set_bit(MAX_RT_PRIO, array->bitmap); 6731 - 6732 - #if defined CONFIG_SMP 6733 - rt_rq->highest_prio.curr = MAX_RT_PRIO; 6734 - rt_rq->highest_prio.next = MAX_RT_PRIO; 6735 - rt_rq->rt_nr_migratory = 0; 6736 - rt_rq->overloaded = 0; 6737 - plist_head_init(&rt_rq->pushable_tasks); 8127 + #ifdef CONFIG_CGROUP_SCHED 8128 + struct task_group root_task_group; 6738 8129 #endif 6739 8130 6740 - rt_rq->rt_time = 0; 6741 - rt_rq->rt_throttled = 0; 6742 - rt_rq->rt_runtime = 0; 6743 - raw_spin_lock_init(&rt_rq->rt_runtime_lock); 6744 - } 6745 - 6746 - #ifdef CONFIG_FAIR_GROUP_SCHED 6747 - static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 6748 - struct sched_entity *se, int cpu, 6749 - struct sched_entity *parent) 6750 - { 6751 - struct rq *rq = cpu_rq(cpu); 6752 - 6753 - cfs_rq->tg = tg; 6754 - cfs_rq->rq = rq; 6755 - #ifdef CONFIG_SMP 6756 - /* allow initial update_cfs_load() to truncate */ 6757 - cfs_rq->load_stamp = 1; 6758 - #endif 6759 - init_cfs_rq_runtime(cfs_rq); 6760 - 6761 - tg->cfs_rq[cpu] = cfs_rq; 6762 - tg->se[cpu] = se; 6763 - 6764 - /* se could be NULL for root_task_group */ 6765 - if (!se) 6766 - return; 6767 - 6768 - if (!parent) 6769 - se->cfs_rq = &rq->cfs; 6770 - else 6771 - se->cfs_rq = parent->my_q; 6772 - 6773 - se->my_q = cfs_rq; 6774 - update_load_set(&se->load, 0); 6775 - se->parent = parent; 6776 - } 6777 - #endif 6778 - 6779 - #ifdef CONFIG_RT_GROUP_SCHED 6780 - static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 6781 - struct sched_rt_entity *rt_se, int cpu, 6782 - struct sched_rt_entity *parent) 6783 - { 6784 - struct rq *rq = cpu_rq(cpu); 6785 - 6786 - rt_rq->highest_prio.curr = MAX_RT_PRIO; 6787 - rt_rq->rt_nr_boosted = 0; 6788 - rt_rq->rq = rq; 6789 - rt_rq->tg = tg; 6790 - 6791 - tg->rt_rq[cpu] = rt_rq; 6792 - tg->rt_se[cpu] = rt_se; 6793 - 6794 - if (!rt_se) 6795 - 
return; 6796 - 6797 - if (!parent) 6798 - rt_se->rt_rq = &rq->rt; 6799 - else 6800 - rt_se->rt_rq = parent->my_q; 6801 - 6802 - rt_se->my_q = rt_rq; 6803 - rt_se->parent = parent; 6804 - INIT_LIST_HEAD(&rt_se->run_list); 6805 - } 6806 - #endif 8131 + DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6807 8132 6808 8133 void __init sched_init(void) 6809 8134 { ··· 6783 8294 init_cfs_rq(&rq->cfs); 6784 8295 init_rt_rq(&rq->rt, rq); 6785 8296 #ifdef CONFIG_FAIR_GROUP_SCHED 6786 - root_task_group.shares = root_task_group_load; 8297 + root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6787 8298 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6788 8299 /* 6789 8300 * How much cpu bandwidth does root_task_group get? ··· 6846 8357 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6847 8358 #endif 6848 8359 6849 - #ifdef CONFIG_SMP 6850 - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 6851 - #endif 6852 - 6853 8360 #ifdef CONFIG_RT_MUTEXES 6854 8361 plist_head_init(&init_task.pi_waiters); 6855 8362 #endif ··· 6873 8388 6874 8389 #ifdef CONFIG_SMP 6875 8390 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6876 - #ifdef CONFIG_NO_HZ 6877 - zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 6878 - alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 6879 - atomic_set(&nohz.load_balancer, nr_cpu_ids); 6880 - atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); 6881 - atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); 6882 - #endif 6883 8391 /* May be allocated at isolcpus cmdline parse time */ 6884 8392 if (cpu_isolated_map == NULL) 6885 8393 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6886 - #endif /* SMP */ 8394 + #endif 8395 + init_sched_fair_class(); 6887 8396 6888 8397 scheduler_running = 1; 6889 8398 } ··· 7029 8550 7030 8551 #endif 7031 8552 7032 - #ifdef CONFIG_FAIR_GROUP_SCHED 7033 - static void free_fair_sched_group(struct task_group *tg) 7034 - { 7035 - int i; 7036 - 7037 - destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); 7038 - 7039 - for_each_possible_cpu(i) { 7040 - if 
(tg->cfs_rq) 7041 - kfree(tg->cfs_rq[i]); 7042 - if (tg->se) 7043 - kfree(tg->se[i]); 7044 - } 7045 - 7046 - kfree(tg->cfs_rq); 7047 - kfree(tg->se); 7048 - } 7049 - 7050 - static 7051 - int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 7052 - { 7053 - struct cfs_rq *cfs_rq; 7054 - struct sched_entity *se; 7055 - int i; 7056 - 7057 - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 7058 - if (!tg->cfs_rq) 7059 - goto err; 7060 - tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 7061 - if (!tg->se) 7062 - goto err; 7063 - 7064 - tg->shares = NICE_0_LOAD; 7065 - 7066 - init_cfs_bandwidth(tg_cfs_bandwidth(tg)); 7067 - 7068 - for_each_possible_cpu(i) { 7069 - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 7070 - GFP_KERNEL, cpu_to_node(i)); 7071 - if (!cfs_rq) 7072 - goto err; 7073 - 7074 - se = kzalloc_node(sizeof(struct sched_entity), 7075 - GFP_KERNEL, cpu_to_node(i)); 7076 - if (!se) 7077 - goto err_free_rq; 7078 - 7079 - init_cfs_rq(cfs_rq); 7080 - init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 7081 - } 7082 - 7083 - return 1; 7084 - 7085 - err_free_rq: 7086 - kfree(cfs_rq); 7087 - err: 7088 - return 0; 7089 - } 7090 - 7091 - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 7092 - { 7093 - struct rq *rq = cpu_rq(cpu); 7094 - unsigned long flags; 7095 - 7096 - /* 7097 - * Only empty task groups can be destroyed; so we can speculatively 7098 - * check on_list without danger of it being re-added. 
7099 - */ 7100 - if (!tg->cfs_rq[cpu]->on_list) 7101 - return; 7102 - 7103 - raw_spin_lock_irqsave(&rq->lock, flags); 7104 - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 7105 - raw_spin_unlock_irqrestore(&rq->lock, flags); 7106 - } 7107 - #else /* !CONFIG_FAIR_GROUP_SCHED */ 7108 - static inline void free_fair_sched_group(struct task_group *tg) 7109 - { 7110 - } 7111 - 7112 - static inline 7113 - int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 7114 - { 7115 - return 1; 7116 - } 7117 - 7118 - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 7119 - { 7120 - } 7121 - #endif /* CONFIG_FAIR_GROUP_SCHED */ 7122 - 7123 8553 #ifdef CONFIG_RT_GROUP_SCHED 7124 - static void free_rt_sched_group(struct task_group *tg) 7125 - { 7126 - int i; 7127 - 7128 - if (tg->rt_se) 7129 - destroy_rt_bandwidth(&tg->rt_bandwidth); 7130 - 7131 - for_each_possible_cpu(i) { 7132 - if (tg->rt_rq) 7133 - kfree(tg->rt_rq[i]); 7134 - if (tg->rt_se) 7135 - kfree(tg->rt_se[i]); 7136 - } 7137 - 7138 - kfree(tg->rt_rq); 7139 - kfree(tg->rt_se); 7140 - } 7141 - 7142 - static 7143 - int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 7144 - { 7145 - struct rt_rq *rt_rq; 7146 - struct sched_rt_entity *rt_se; 7147 - int i; 7148 - 7149 - tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 7150 - if (!tg->rt_rq) 7151 - goto err; 7152 - tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 7153 - if (!tg->rt_se) 7154 - goto err; 7155 - 7156 - init_rt_bandwidth(&tg->rt_bandwidth, 7157 - ktime_to_ns(def_rt_bandwidth.rt_period), 0); 7158 - 7159 - for_each_possible_cpu(i) { 7160 - rt_rq = kzalloc_node(sizeof(struct rt_rq), 7161 - GFP_KERNEL, cpu_to_node(i)); 7162 - if (!rt_rq) 7163 - goto err; 7164 - 7165 - rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 7166 - GFP_KERNEL, cpu_to_node(i)); 7167 - if (!rt_se) 7168 - goto err_free_rq; 7169 - 7170 - init_rt_rq(rt_rq, cpu_rq(i)); 7171 - rt_rq->rt_runtime = 
tg->rt_bandwidth.rt_runtime; 7172 - init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 7173 - } 7174 - 7175 - return 1; 7176 - 7177 - err_free_rq: 7178 - kfree(rt_rq); 7179 - err: 7180 - return 0; 7181 - } 7182 8554 #else /* !CONFIG_RT_GROUP_SCHED */ 7183 - static inline void free_rt_sched_group(struct task_group *tg) 7184 - { 7185 - } 7186 - 7187 - static inline 7188 - int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 7189 - { 7190 - return 1; 7191 - } 7192 8555 #endif /* CONFIG_RT_GROUP_SCHED */ 7193 8556 7194 8557 #ifdef CONFIG_CGROUP_SCHED 8558 + /* task_group_lock serializes the addition/removal of task groups */ 8559 + static DEFINE_SPINLOCK(task_group_lock); 8560 + 7195 8561 static void free_sched_group(struct task_group *tg) 7196 8562 { 7197 8563 free_fair_sched_group(tg); ··· 7142 8818 #endif /* CONFIG_CGROUP_SCHED */ 7143 8819 7144 8820 #ifdef CONFIG_FAIR_GROUP_SCHED 7145 - static DEFINE_MUTEX(shares_mutex); 7146 - 7147 - int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7148 - { 7149 - int i; 7150 - unsigned long flags; 7151 - 7152 - /* 7153 - * We can't change the weight of the root cgroup. 
7154 - */ 7155 - if (!tg->se[0]) 7156 - return -EINVAL; 7157 - 7158 - shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); 7159 - 7160 - mutex_lock(&shares_mutex); 7161 - if (tg->shares == shares) 7162 - goto done; 7163 - 7164 - tg->shares = shares; 7165 - for_each_possible_cpu(i) { 7166 - struct rq *rq = cpu_rq(i); 7167 - struct sched_entity *se; 7168 - 7169 - se = tg->se[i]; 7170 - /* Propagate contribution to hierarchy */ 7171 - raw_spin_lock_irqsave(&rq->lock, flags); 7172 - for_each_sched_entity(se) 7173 - update_cfs_shares(group_cfs_rq(se)); 7174 - raw_spin_unlock_irqrestore(&rq->lock, flags); 7175 - } 7176 - 7177 - done: 7178 - mutex_unlock(&shares_mutex); 7179 - return 0; 7180 - } 7181 - 7182 - unsigned long sched_group_shares(struct task_group *tg) 7183 - { 7184 - return tg->shares; 7185 - } 7186 8821 #endif 7187 8822 7188 8823 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) ··· 7166 8883 struct task_struct *g, *p; 7167 8884 7168 8885 do_each_thread(g, p) { 7169 - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8886 + if (rt_task(p) && task_rq(p)->rt.tg == tg) 7170 8887 return 1; 7171 8888 } while_each_thread(g, p); 7172 8889 ··· 7518 9235 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7519 9236 { 7520 9237 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7521 - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 9238 + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7522 9239 7523 9240 if (tg == &root_task_group) 7524 9241 return -EINVAL; ··· 7547 9264 runtime_enabled = quota != RUNTIME_INF; 7548 9265 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7549 9266 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7550 - 7551 9267 raw_spin_lock_irq(&cfs_b->lock); 7552 9268 cfs_b->period = ns_to_ktime(period); 7553 9269 cfs_b->quota = quota; ··· 7562 9280 7563 9281 for_each_possible_cpu(i) { 7564 9282 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7565 - struct rq *rq = 
rq_of(cfs_rq); 9283 + struct rq *rq = cfs_rq->rq; 7566 9284 7567 9285 raw_spin_lock_irq(&rq->lock); 7568 9286 cfs_rq->runtime_enabled = runtime_enabled; 7569 9287 cfs_rq->runtime_remaining = 0; 7570 9288 7571 - if (cfs_rq_throttled(cfs_rq)) 9289 + if (cfs_rq->throttled) 7572 9290 unthrottle_cfs_rq(cfs_rq); 7573 9291 raw_spin_unlock_irq(&rq->lock); 7574 9292 } ··· 7582 9300 { 7583 9301 u64 quota, period; 7584 9302 7585 - period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 9303 + period = ktime_to_ns(tg->cfs_bandwidth.period); 7586 9304 if (cfs_quota_us < 0) 7587 9305 quota = RUNTIME_INF; 7588 9306 else ··· 7595 9313 { 7596 9314 u64 quota_us; 7597 9315 7598 - if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) 9316 + if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7599 9317 return -1; 7600 9318 7601 - quota_us = tg_cfs_bandwidth(tg)->quota; 9319 + quota_us = tg->cfs_bandwidth.quota; 7602 9320 do_div(quota_us, NSEC_PER_USEC); 7603 9321 7604 9322 return quota_us; ··· 7609 9327 u64 quota, period; 7610 9328 7611 9329 period = (u64)cfs_period_us * NSEC_PER_USEC; 7612 - quota = tg_cfs_bandwidth(tg)->quota; 9330 + quota = tg->cfs_bandwidth.quota; 7613 9331 7614 9332 if (period <= 0) 7615 9333 return -EINVAL; ··· 7621 9339 { 7622 9340 u64 cfs_period_us; 7623 9341 7624 - cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 9342 + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7625 9343 do_div(cfs_period_us, NSEC_PER_USEC); 7626 9344 7627 9345 return cfs_period_us; ··· 7681 9399 static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7682 9400 { 7683 9401 struct cfs_schedulable_data *d = data; 7684 - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 9402 + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7685 9403 s64 quota = 0, parent_quota = -1; 7686 9404 7687 9405 if (!tg->parent) { 7688 9406 quota = RUNTIME_INF; 7689 9407 } else { 7690 - struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); 9408 + struct cfs_bandwidth *parent_b = 
&tg->parent->cfs_bandwidth; 7691 9409 7692 9410 quota = normalize_cfs_quota(tg, d); 7693 9411 parent_quota = parent_b->hierarchal_quota; ··· 7731 9449 struct cgroup_map_cb *cb) 7732 9450 { 7733 9451 struct task_group *tg = cgroup_tg(cgrp); 7734 - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 9452 + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7735 9453 7736 9454 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7737 9455 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); ··· 8030 9748 * 8031 9749 * called with rq->lock held. 8032 9750 */ 8033 - static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9751 + void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8034 9752 { 8035 9753 struct cpuacct *ca; 8036 9754 int cpu; ··· 8072 9790 /* 8073 9791 * Charge the system/user time to the task's accounting group. 8074 9792 */ 8075 - static void cpuacct_update_stats(struct task_struct *tsk, 9793 + void cpuacct_update_stats(struct task_struct *tsk, 8076 9794 enum cpuacct_stat_index idx, cputime_t val) 8077 9795 { 8078 9796 struct cpuacct *ca;
+1064
kernel/sched.h
··· 1 + 2 + #include <linux/sched.h> 3 + #include <linux/mutex.h> 4 + #include <linux/spinlock.h> 5 + #include <linux/stop_machine.h> 6 + 7 + #include "sched_cpupri.h" 8 + 9 + extern __read_mostly int scheduler_running; 10 + 11 + /* 12 + * Convert user-nice values [ -20 ... 0 ... 19 ] 13 + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 14 + * and back. 15 + */ 16 + #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 17 + #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 18 + #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 19 + 20 + /* 21 + * 'User priority' is the nice value converted to something we 22 + * can work with better when scaling various scheduler parameters, 23 + * it's a [ 0 ... 39 ] range. 24 + */ 25 + #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 26 + #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 27 + #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 28 + 29 + /* 30 + * Helpers for converting nanosecond timing to jiffy resolution 31 + */ 32 + #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 33 + 34 + #define NICE_0_LOAD SCHED_LOAD_SCALE 35 + #define NICE_0_SHIFT SCHED_LOAD_SHIFT 36 + 37 + /* 38 + * These are the 'tuning knobs' of the scheduler: 39 + * 40 + * default timeslice is 100 msecs (used only for SCHED_RR tasks). 41 + * Timeslices get refilled after they expire. 42 + */ 43 + #define DEF_TIMESLICE (100 * HZ / 1000) 44 + 45 + /* 46 + * single value that denotes runtime == period, ie unlimited time. 
47 + */ 48 + #define RUNTIME_INF ((u64)~0ULL) 49 + 50 + static inline int rt_policy(int policy) 51 + { 52 + if (policy == SCHED_FIFO || policy == SCHED_RR) 53 + return 1; 54 + return 0; 55 + } 56 + 57 + static inline int task_has_rt_policy(struct task_struct *p) 58 + { 59 + return rt_policy(p->policy); 60 + } 61 + 62 + /* 63 + * This is the priority-queue data structure of the RT scheduling class: 64 + */ 65 + struct rt_prio_array { 66 + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 67 + struct list_head queue[MAX_RT_PRIO]; 68 + }; 69 + 70 + struct rt_bandwidth { 71 + /* nests inside the rq lock: */ 72 + raw_spinlock_t rt_runtime_lock; 73 + ktime_t rt_period; 74 + u64 rt_runtime; 75 + struct hrtimer rt_period_timer; 76 + }; 77 + 78 + extern struct mutex sched_domains_mutex; 79 + 80 + #ifdef CONFIG_CGROUP_SCHED 81 + 82 + #include <linux/cgroup.h> 83 + 84 + struct cfs_rq; 85 + struct rt_rq; 86 + 87 + static LIST_HEAD(task_groups); 88 + 89 + struct cfs_bandwidth { 90 + #ifdef CONFIG_CFS_BANDWIDTH 91 + raw_spinlock_t lock; 92 + ktime_t period; 93 + u64 quota, runtime; 94 + s64 hierarchal_quota; 95 + u64 runtime_expires; 96 + 97 + int idle, timer_active; 98 + struct hrtimer period_timer, slack_timer; 99 + struct list_head throttled_cfs_rq; 100 + 101 + /* statistics */ 102 + int nr_periods, nr_throttled; 103 + u64 throttled_time; 104 + #endif 105 + }; 106 + 107 + /* task group related information */ 108 + struct task_group { 109 + struct cgroup_subsys_state css; 110 + 111 + #ifdef CONFIG_FAIR_GROUP_SCHED 112 + /* schedulable entities of this group on each cpu */ 113 + struct sched_entity **se; 114 + /* runqueue "owned" by this group on each cpu */ 115 + struct cfs_rq **cfs_rq; 116 + unsigned long shares; 117 + 118 + atomic_t load_weight; 119 + #endif 120 + 121 + #ifdef CONFIG_RT_GROUP_SCHED 122 + struct sched_rt_entity **rt_se; 123 + struct rt_rq **rt_rq; 124 + 125 + struct rt_bandwidth rt_bandwidth; 126 + #endif 127 + 128 + struct rcu_head 
rcu; 129 + struct list_head list; 130 + 131 + struct task_group *parent; 132 + struct list_head siblings; 133 + struct list_head children; 134 + 135 + #ifdef CONFIG_SCHED_AUTOGROUP 136 + struct autogroup *autogroup; 137 + #endif 138 + 139 + struct cfs_bandwidth cfs_bandwidth; 140 + }; 141 + 142 + #ifdef CONFIG_FAIR_GROUP_SCHED 143 + #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD 144 + 145 + /* 146 + * A weight of 0 or 1 can cause arithmetics problems. 147 + * A weight of a cfs_rq is the sum of weights of which entities 148 + * are queued on this cfs_rq, so a weight of a entity should not be 149 + * too large, so as the shares value of a task group. 150 + * (The default weight is 1024 - so there's no practical 151 + * limitation from this.) 152 + */ 153 + #define MIN_SHARES (1UL << 1) 154 + #define MAX_SHARES (1UL << 18) 155 + #endif 156 + 157 + /* Default task group. 158 + * Every task in system belong to this group at bootup. 159 + */ 160 + extern struct task_group root_task_group; 161 + 162 + typedef int (*tg_visitor)(struct task_group *, void *); 163 + 164 + extern int walk_tg_tree_from(struct task_group *from, 165 + tg_visitor down, tg_visitor up, void *data); 166 + 167 + /* 168 + * Iterate the full tree, calling @down when first entering a node and @up when 169 + * leaving it for the final time. 170 + * 171 + * Caller must hold rcu_lock or sufficient equivalent. 
172 + */ 173 + static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 174 + { 175 + return walk_tg_tree_from(&root_task_group, down, up, data); 176 + } 177 + 178 + extern int tg_nop(struct task_group *tg, void *data); 179 + 180 + extern void free_fair_sched_group(struct task_group *tg); 181 + extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); 182 + extern void unregister_fair_sched_group(struct task_group *tg, int cpu); 183 + extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 184 + struct sched_entity *se, int cpu, 185 + struct sched_entity *parent); 186 + extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); 187 + extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 188 + 189 + extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); 190 + extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); 191 + extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); 192 + 193 + extern void free_rt_sched_group(struct task_group *tg); 194 + extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); 195 + extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 196 + struct sched_rt_entity *rt_se, int cpu, 197 + struct sched_rt_entity *parent); 198 + 199 + #else /* CONFIG_CGROUP_SCHED */ 200 + 201 + struct cfs_bandwidth { }; 202 + 203 + #endif /* CONFIG_CGROUP_SCHED */ 204 + 205 + /* CFS-related fields in a runqueue */ 206 + struct cfs_rq { 207 + struct load_weight load; 208 + unsigned long nr_running, h_nr_running; 209 + 210 + u64 exec_clock; 211 + u64 min_vruntime; 212 + #ifndef CONFIG_64BIT 213 + u64 min_vruntime_copy; 214 + #endif 215 + 216 + struct rb_root tasks_timeline; 217 + struct rb_node *rb_leftmost; 218 + 219 + struct list_head tasks; 220 + struct list_head *balance_iterator; 221 + 222 + /* 223 + * 'curr' points to currently running entity on this cfs_rq. 
224 + * It is set to NULL otherwise (i.e when none are currently running). 225 + */ 226 + struct sched_entity *curr, *next, *last, *skip; 227 + 228 + #ifdef CONFIG_SCHED_DEBUG 229 + unsigned int nr_spread_over; 230 + #endif 231 + 232 + #ifdef CONFIG_FAIR_GROUP_SCHED 233 + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 234 + 235 + /* 236 + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 237 + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 238 + * (like users, containers etc.) 239 + * 240 + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 241 + * list is used during load balance. 242 + */ 243 + int on_list; 244 + struct list_head leaf_cfs_rq_list; 245 + struct task_group *tg; /* group that "owns" this runqueue */ 246 + 247 + #ifdef CONFIG_SMP 248 + /* 249 + * the part of load.weight contributed by tasks 250 + */ 251 + unsigned long task_weight; 252 + 253 + /* 254 + * h_load = weight * f(tg) 255 + * 256 + * Where f(tg) is the recursive weight fraction assigned to 257 + * this group. 
258 + */ 259 + unsigned long h_load; 260 + 261 + /* 262 + * Maintaining per-cpu shares distribution for group scheduling 263 + * 264 + * load_stamp is the last time we updated the load average 265 + * load_last is the last time we updated the load average and saw load 266 + * load_unacc_exec_time is currently unaccounted execution time 267 + */ 268 + u64 load_avg; 269 + u64 load_period; 270 + u64 load_stamp, load_last, load_unacc_exec_time; 271 + 272 + unsigned long load_contribution; 273 + #endif /* CONFIG_SMP */ 274 + #ifdef CONFIG_CFS_BANDWIDTH 275 + int runtime_enabled; 276 + u64 runtime_expires; 277 + s64 runtime_remaining; 278 + 279 + u64 throttled_timestamp; 280 + int throttled, throttle_count; 281 + struct list_head throttled_list; 282 + #endif /* CONFIG_CFS_BANDWIDTH */ 283 + #endif /* CONFIG_FAIR_GROUP_SCHED */ 284 + }; 285 + 286 + static inline int rt_bandwidth_enabled(void) 287 + { 288 + return sysctl_sched_rt_runtime >= 0; 289 + } 290 + 291 + /* Real-Time classes' related field in a runqueue: */ 292 + struct rt_rq { 293 + struct rt_prio_array active; 294 + unsigned long rt_nr_running; 295 + #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 296 + struct { 297 + int curr; /* highest queued rt task prio */ 298 + #ifdef CONFIG_SMP 299 + int next; /* next highest */ 300 + #endif 301 + } highest_prio; 302 + #endif 303 + #ifdef CONFIG_SMP 304 + unsigned long rt_nr_migratory; 305 + unsigned long rt_nr_total; 306 + int overloaded; 307 + struct plist_head pushable_tasks; 308 + #endif 309 + int rt_throttled; 310 + u64 rt_time; 311 + u64 rt_runtime; 312 + /* Nests inside the rq lock: */ 313 + raw_spinlock_t rt_runtime_lock; 314 + 315 + #ifdef CONFIG_RT_GROUP_SCHED 316 + unsigned long rt_nr_boosted; 317 + 318 + struct rq *rq; 319 + struct list_head leaf_rt_rq_list; 320 + struct task_group *tg; 321 + #endif 322 + }; 323 + 324 + #ifdef CONFIG_SMP 325 + 326 + /* 327 + * We add the notion of a root-domain which will be used to define per-domain 328 + * variables. 
Each exclusive cpuset essentially defines an island domain by 329 + * fully partitioning the member cpus from any other cpuset. Whenever a new 330 + * exclusive cpuset is created, we also create and attach a new root-domain 331 + * object. 332 + * 333 + */ 334 + struct root_domain { 335 + atomic_t refcount; 336 + atomic_t rto_count; 337 + struct rcu_head rcu; 338 + cpumask_var_t span; 339 + cpumask_var_t online; 340 + 341 + /* 342 + * The "RT overload" flag: it gets set if a CPU has more than 343 + * one runnable RT task. 344 + */ 345 + cpumask_var_t rto_mask; 346 + struct cpupri cpupri; 347 + }; 348 + 349 + extern struct root_domain def_root_domain; 350 + 351 + #endif /* CONFIG_SMP */ 352 + 353 + /* 354 + * This is the main, per-CPU runqueue data structure. 355 + * 356 + * Locking rule: those places that want to lock multiple runqueues 357 + * (such as the load balancing or the thread migration code), lock 358 + * acquire operations must be ordered by ascending &runqueue. 359 + */ 360 + struct rq { 361 + /* runqueue lock: */ 362 + raw_spinlock_t lock; 363 + 364 + /* 365 + * nr_running and cpu_load should be in the same cacheline because 366 + * remote CPUs use both these fields when doing load calculation. 
367 + */ 368 + unsigned long nr_running; 369 + #define CPU_LOAD_IDX_MAX 5 370 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 371 + unsigned long last_load_update_tick; 372 + #ifdef CONFIG_NO_HZ 373 + u64 nohz_stamp; 374 + unsigned char nohz_balance_kick; 375 + #endif 376 + int skip_clock_update; 377 + 378 + /* capture load from *all* tasks on this cpu: */ 379 + struct load_weight load; 380 + unsigned long nr_load_updates; 381 + u64 nr_switches; 382 + 383 + struct cfs_rq cfs; 384 + struct rt_rq rt; 385 + 386 + #ifdef CONFIG_FAIR_GROUP_SCHED 387 + /* list of leaf cfs_rq on this cpu: */ 388 + struct list_head leaf_cfs_rq_list; 389 + #endif 390 + #ifdef CONFIG_RT_GROUP_SCHED 391 + struct list_head leaf_rt_rq_list; 392 + #endif 393 + 394 + /* 395 + * This is part of a global counter where only the total sum 396 + * over all CPUs matters. A task can increase this counter on 397 + * one CPU and if it got migrated afterwards it may decrease 398 + * it on another CPU. Always updated under the runqueue lock: 399 + */ 400 + unsigned long nr_uninterruptible; 401 + 402 + struct task_struct *curr, *idle, *stop; 403 + unsigned long next_balance; 404 + struct mm_struct *prev_mm; 405 + 406 + u64 clock; 407 + u64 clock_task; 408 + 409 + atomic_t nr_iowait; 410 + 411 + #ifdef CONFIG_SMP 412 + struct root_domain *rd; 413 + struct sched_domain *sd; 414 + 415 + unsigned long cpu_power; 416 + 417 + unsigned char idle_balance; 418 + /* For active balancing */ 419 + int post_schedule; 420 + int active_balance; 421 + int push_cpu; 422 + struct cpu_stop_work active_balance_work; 423 + /* cpu of this runqueue: */ 424 + int cpu; 425 + int online; 426 + 427 + u64 rt_avg; 428 + u64 age_stamp; 429 + u64 idle_stamp; 430 + u64 avg_idle; 431 + #endif 432 + 433 + #ifdef CONFIG_IRQ_TIME_ACCOUNTING 434 + u64 prev_irq_time; 435 + #endif 436 + #ifdef CONFIG_PARAVIRT 437 + u64 prev_steal_time; 438 + #endif 439 + #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 440 + u64 prev_steal_time_rq; 441 + #endif 442 + 443 + /* 
calc_load related fields */ 444 + unsigned long calc_load_update; 445 + long calc_load_active; 446 + 447 + #ifdef CONFIG_SCHED_HRTICK 448 + #ifdef CONFIG_SMP 449 + int hrtick_csd_pending; 450 + struct call_single_data hrtick_csd; 451 + #endif 452 + struct hrtimer hrtick_timer; 453 + #endif 454 + 455 + #ifdef CONFIG_SCHEDSTATS 456 + /* latency stats */ 457 + struct sched_info rq_sched_info; 458 + unsigned long long rq_cpu_time; 459 + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 460 + 461 + /* sys_sched_yield() stats */ 462 + unsigned int yld_count; 463 + 464 + /* schedule() stats */ 465 + unsigned int sched_switch; 466 + unsigned int sched_count; 467 + unsigned int sched_goidle; 468 + 469 + /* try_to_wake_up() stats */ 470 + unsigned int ttwu_count; 471 + unsigned int ttwu_local; 472 + #endif 473 + 474 + #ifdef CONFIG_SMP 475 + struct llist_head wake_list; 476 + #endif 477 + }; 478 + 479 + static inline int cpu_of(struct rq *rq) 480 + { 481 + #ifdef CONFIG_SMP 482 + return rq->cpu; 483 + #else 484 + return 0; 485 + #endif 486 + } 487 + 488 + DECLARE_PER_CPU(struct rq, runqueues); 489 + 490 + #define rcu_dereference_check_sched_domain(p) \ 491 + rcu_dereference_check((p), \ 492 + lockdep_is_held(&sched_domains_mutex)) 493 + 494 + /* 495 + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 496 + * See detach_destroy_domains: synchronize_sched for details. 497 + * 498 + * The domain tree of any CPU may only be accessed from within 499 + * preempt-disabled sections. 
500 + */ 501 + #define for_each_domain(cpu, __sd) \ 502 + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 503 + 504 + #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 505 + #define this_rq() (&__get_cpu_var(runqueues)) 506 + #define task_rq(p) cpu_rq(task_cpu(p)) 507 + #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 508 + #define raw_rq() (&__raw_get_cpu_var(runqueues)) 509 + 510 + #include "sched_stats.h" 511 + #include "sched_autogroup.h" 512 + 513 + #ifdef CONFIG_CGROUP_SCHED 514 + 515 + /* 516 + * Return the group to which this tasks belongs. 517 + * 518 + * We use task_subsys_state_check() and extend the RCU verification with 519 + * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each 520 + * task it moves into the cgroup. Therefore by holding either of those locks, 521 + * we pin the task to the current cgroup. 522 + */ 523 + static inline struct task_group *task_group(struct task_struct *p) 524 + { 525 + struct task_group *tg; 526 + struct cgroup_subsys_state *css; 527 + 528 + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 529 + lockdep_is_held(&p->pi_lock) || 530 + lockdep_is_held(&task_rq(p)->lock)); 531 + tg = container_of(css, struct task_group, css); 532 + 533 + return autogroup_task_group(p, tg); 534 + } 535 + 536 + /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 537 + static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 538 + { 539 + #if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) 540 + struct task_group *tg = task_group(p); 541 + #endif 542 + 543 + #ifdef CONFIG_FAIR_GROUP_SCHED 544 + p->se.cfs_rq = tg->cfs_rq[cpu]; 545 + p->se.parent = tg->se[cpu]; 546 + #endif 547 + 548 + #ifdef CONFIG_RT_GROUP_SCHED 549 + p->rt.rt_rq = tg->rt_rq[cpu]; 550 + p->rt.parent = tg->rt_se[cpu]; 551 + #endif 552 + } 553 + 554 + #else /* CONFIG_CGROUP_SCHED */ 555 + 556 + static inline void set_task_rq(struct task_struct *p, 
unsigned int cpu) { } 557 + static inline struct task_group *task_group(struct task_struct *p) 558 + { 559 + return NULL; 560 + } 561 + 562 + #endif /* CONFIG_CGROUP_SCHED */ 563 + 564 + static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 565 + { 566 + set_task_rq(p, cpu); 567 + #ifdef CONFIG_SMP 568 + /* 569 + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 570 + * successfuly executed on another CPU. We must ensure that updates of 571 + * per-task data have been completed by this moment. 572 + */ 573 + smp_wmb(); 574 + task_thread_info(p)->cpu = cpu; 575 + #endif 576 + } 577 + 578 + /* 579 + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 580 + */ 581 + #ifdef CONFIG_SCHED_DEBUG 582 + # define const_debug __read_mostly 583 + #else 584 + # define const_debug const 585 + #endif 586 + 587 + extern const_debug unsigned int sysctl_sched_features; 588 + 589 + #define SCHED_FEAT(name, enabled) \ 590 + __SCHED_FEAT_##name , 591 + 592 + enum { 593 + #include "sched_features.h" 594 + }; 595 + 596 + #undef SCHED_FEAT 597 + 598 + #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 599 + 600 + static inline u64 global_rt_period(void) 601 + { 602 + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 603 + } 604 + 605 + static inline u64 global_rt_runtime(void) 606 + { 607 + if (sysctl_sched_rt_runtime < 0) 608 + return RUNTIME_INF; 609 + 610 + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 611 + } 612 + 613 + 614 + 615 + static inline int task_current(struct rq *rq, struct task_struct *p) 616 + { 617 + return rq->curr == p; 618 + } 619 + 620 + static inline int task_running(struct rq *rq, struct task_struct *p) 621 + { 622 + #ifdef CONFIG_SMP 623 + return p->on_cpu; 624 + #else 625 + return task_current(rq, p); 626 + #endif 627 + } 628 + 629 + 630 + #ifndef prepare_arch_switch 631 + # define prepare_arch_switch(next) do { } while (0) 632 + #endif 633 + #ifndef finish_arch_switch 634 + # 
define finish_arch_switch(prev) do { } while (0) 635 + #endif 636 + 637 + #ifndef __ARCH_WANT_UNLOCKED_CTXSW 638 + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 639 + { 640 + #ifdef CONFIG_SMP 641 + /* 642 + * We can optimise this out completely for !SMP, because the 643 + * SMP rebalancing from interrupt is the only thing that cares 644 + * here. 645 + */ 646 + next->on_cpu = 1; 647 + #endif 648 + } 649 + 650 + static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 651 + { 652 + #ifdef CONFIG_SMP 653 + /* 654 + * After ->on_cpu is cleared, the task can be moved to a different CPU. 655 + * We must ensure this doesn't happen until the switch is completely 656 + * finished. 657 + */ 658 + smp_wmb(); 659 + prev->on_cpu = 0; 660 + #endif 661 + #ifdef CONFIG_DEBUG_SPINLOCK 662 + /* this is a valid case when another task releases the spinlock */ 663 + rq->lock.owner = current; 664 + #endif 665 + /* 666 + * If we are tracking spinlock dependencies then we have to 667 + * fix up the runqueue lock - which gets 'carried over' from 668 + * prev into current: 669 + */ 670 + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 671 + 672 + raw_spin_unlock_irq(&rq->lock); 673 + } 674 + 675 + #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 676 + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 677 + { 678 + #ifdef CONFIG_SMP 679 + /* 680 + * We can optimise this out completely for !SMP, because the 681 + * SMP rebalancing from interrupt is the only thing that cares 682 + * here. 683 + */ 684 + next->on_cpu = 1; 685 + #endif 686 + #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 687 + raw_spin_unlock_irq(&rq->lock); 688 + #else 689 + raw_spin_unlock(&rq->lock); 690 + #endif 691 + } 692 + 693 + static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 694 + { 695 + #ifdef CONFIG_SMP 696 + /* 697 + * After ->on_cpu is cleared, the task can be moved to a different CPU. 
698 + * We must ensure this doesn't happen until the switch is completely 699 + * finished. 700 + */ 701 + smp_wmb(); 702 + prev->on_cpu = 0; 703 + #endif 704 + #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 705 + local_irq_enable(); 706 + #endif 707 + } 708 + #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 709 + 710 + 711 + static inline void update_load_add(struct load_weight *lw, unsigned long inc) 712 + { 713 + lw->weight += inc; 714 + lw->inv_weight = 0; 715 + } 716 + 717 + static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 718 + { 719 + lw->weight -= dec; 720 + lw->inv_weight = 0; 721 + } 722 + 723 + static inline void update_load_set(struct load_weight *lw, unsigned long w) 724 + { 725 + lw->weight = w; 726 + lw->inv_weight = 0; 727 + } 728 + 729 + /* 730 + * To aid in avoiding the subversion of "niceness" due to uneven distribution 731 + * of tasks with abnormal "nice" values across CPUs the contribution that 732 + * each task makes to its run queue's load is weighted according to its 733 + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 734 + * scaled version of the new time slice allocation that they receive on time 735 + * slice expiry etc. 736 + */ 737 + 738 + #define WEIGHT_IDLEPRIO 3 739 + #define WMULT_IDLEPRIO 1431655765 740 + 741 + /* 742 + * Nice levels are multiplicative, with a gentle 10% change for every 743 + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 744 + * nice 1, it will get ~10% less CPU time than another CPU-bound task 745 + * that remained on nice 0. 746 + * 747 + * The "10% effect" is relative and cumulative: from _any_ nice level, 748 + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 749 + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 750 + * If a task goes up by ~10% and another task goes down by ~10% then 751 + * the relative distance between them is ~25%.) 
752 + */ 753 + static const int prio_to_weight[40] = { 754 + /* -20 */ 88761, 71755, 56483, 46273, 36291, 755 + /* -15 */ 29154, 23254, 18705, 14949, 11916, 756 + /* -10 */ 9548, 7620, 6100, 4904, 3906, 757 + /* -5 */ 3121, 2501, 1991, 1586, 1277, 758 + /* 0 */ 1024, 820, 655, 526, 423, 759 + /* 5 */ 335, 272, 215, 172, 137, 760 + /* 10 */ 110, 87, 70, 56, 45, 761 + /* 15 */ 36, 29, 23, 18, 15, 762 + }; 763 + 764 + /* 765 + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 766 + * 767 + * In cases where the weight does not change often, we can use the 768 + * precalculated inverse to speed up arithmetics by turning divisions 769 + * into multiplications: 770 + */ 771 + static const u32 prio_to_wmult[40] = { 772 + /* -20 */ 48388, 59856, 76040, 92818, 118348, 773 + /* -15 */ 147320, 184698, 229616, 287308, 360437, 774 + /* -10 */ 449829, 563644, 704093, 875809, 1099582, 775 + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 776 + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 777 + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 778 + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 779 + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 780 + }; 781 + 782 + /* Time spent by the tasks of the cpu accounting group executing in ... */ 783 + enum cpuacct_stat_index { 784 + CPUACCT_STAT_USER, /* ... user mode */ 785 + CPUACCT_STAT_SYSTEM, /* ... 
kernel mode */ 786 + 787 + CPUACCT_STAT_NSTATS, 788 + }; 789 + 790 + 791 + #define sched_class_highest (&stop_sched_class) 792 + #define for_each_class(class) \ 793 + for (class = sched_class_highest; class; class = class->next) 794 + 795 + extern const struct sched_class stop_sched_class; 796 + extern const struct sched_class rt_sched_class; 797 + extern const struct sched_class fair_sched_class; 798 + extern const struct sched_class idle_sched_class; 799 + 800 + 801 + #ifdef CONFIG_SMP 802 + 803 + extern void trigger_load_balance(struct rq *rq, int cpu); 804 + extern void idle_balance(int this_cpu, struct rq *this_rq); 805 + 806 + #else /* CONFIG_SMP */ 807 + 808 + static inline void idle_balance(int cpu, struct rq *rq) 809 + { 810 + } 811 + 812 + #endif 813 + 814 + extern void sysrq_sched_debug_show(void); 815 + extern void sched_init_granularity(void); 816 + extern void update_max_interval(void); 817 + extern void update_group_power(struct sched_domain *sd, int cpu); 818 + extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); 819 + extern void init_sched_rt_class(void); 820 + extern void init_sched_fair_class(void); 821 + 822 + extern void resched_task(struct task_struct *p); 823 + extern void resched_cpu(int cpu); 824 + 825 + extern struct rt_bandwidth def_rt_bandwidth; 826 + extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 827 + 828 + extern void update_cpu_load(struct rq *this_rq); 829 + 830 + #ifdef CONFIG_CGROUP_CPUACCT 831 + extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); 832 + extern void cpuacct_update_stats(struct task_struct *tsk, 833 + enum cpuacct_stat_index idx, cputime_t val); 834 + #else 835 + static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 836 + static inline void cpuacct_update_stats(struct task_struct *tsk, 837 + enum cpuacct_stat_index idx, cputime_t val) {} 838 + #endif 839 + 840 + static inline void inc_nr_running(struct rq *rq) 
841 + { 842 + rq->nr_running++; 843 + } 844 + 845 + static inline void dec_nr_running(struct rq *rq) 846 + { 847 + rq->nr_running--; 848 + } 849 + 850 + extern void update_rq_clock(struct rq *rq); 851 + 852 + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); 853 + extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); 854 + 855 + extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); 856 + 857 + extern const_debug unsigned int sysctl_sched_time_avg; 858 + extern const_debug unsigned int sysctl_sched_nr_migrate; 859 + extern const_debug unsigned int sysctl_sched_migration_cost; 860 + 861 + static inline u64 sched_avg_period(void) 862 + { 863 + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 864 + } 865 + 866 + void calc_load_account_idle(struct rq *this_rq); 867 + 868 + #ifdef CONFIG_SCHED_HRTICK 869 + 870 + /* 871 + * Use hrtick when: 872 + * - enabled by features 873 + * - hrtimer is actually high res 874 + */ 875 + static inline int hrtick_enabled(struct rq *rq) 876 + { 877 + if (!sched_feat(HRTICK)) 878 + return 0; 879 + if (!cpu_active(cpu_of(rq))) 880 + return 0; 881 + return hrtimer_is_hres_active(&rq->hrtick_timer); 882 + } 883 + 884 + void hrtick_start(struct rq *rq, u64 delay); 885 + 886 + #endif /* CONFIG_SCHED_HRTICK */ 887 + 888 + #ifdef CONFIG_SMP 889 + extern void sched_avg_update(struct rq *rq); 890 + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 891 + { 892 + rq->rt_avg += rt_delta; 893 + sched_avg_update(rq); 894 + } 895 + #else 896 + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } 897 + static inline void sched_avg_update(struct rq *rq) { } 898 + #endif 899 + 900 + extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); 901 + 902 + #ifdef CONFIG_SMP 903 + #ifdef CONFIG_PREEMPT 904 + 905 + static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); 906 + 907 + /* 908 + * fair 
double_lock_balance: Safely acquires both rq->locks in a fair 909 + * way at the expense of forcing extra atomic operations in all 910 + * invocations. This assures that the double_lock is acquired using the 911 + * same underlying policy as the spinlock_t on this architecture, which 912 + * reduces latency compared to the unfair variant below. However, it 913 + * also adds more overhead and therefore may reduce throughput. 914 + */ 915 + static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 916 + __releases(this_rq->lock) 917 + __acquires(busiest->lock) 918 + __acquires(this_rq->lock) 919 + { 920 + raw_spin_unlock(&this_rq->lock); 921 + double_rq_lock(this_rq, busiest); 922 + 923 + return 1; 924 + } 925 + 926 + #else 927 + /* 928 + * Unfair double_lock_balance: Optimizes throughput at the expense of 929 + * latency by eliminating extra atomic operations when the locks are 930 + * already in proper order on entry. This favors lower cpu-ids and will 931 + * grant the double lock to lower cpus over higher ids under contention, 932 + * regardless of entry order into the function. 933 + */ 934 + static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 935 + __releases(this_rq->lock) 936 + __acquires(busiest->lock) 937 + __acquires(this_rq->lock) 938 + { 939 + int ret = 0; 940 + 941 + if (unlikely(!raw_spin_trylock(&busiest->lock))) { 942 + if (busiest < this_rq) { 943 + raw_spin_unlock(&this_rq->lock); 944 + raw_spin_lock(&busiest->lock); 945 + raw_spin_lock_nested(&this_rq->lock, 946 + SINGLE_DEPTH_NESTING); 947 + ret = 1; 948 + } else 949 + raw_spin_lock_nested(&busiest->lock, 950 + SINGLE_DEPTH_NESTING); 951 + } 952 + return ret; 953 + } 954 + 955 + #endif /* CONFIG_PREEMPT */ 956 + 957 + /* 958 + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
959 + */ 960 + static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) 961 + { 962 + if (unlikely(!irqs_disabled())) { 963 + /* printk() doesn't work good under rq->lock */ 964 + raw_spin_unlock(&this_rq->lock); 965 + BUG_ON(1); 966 + } 967 + 968 + return _double_lock_balance(this_rq, busiest); 969 + } 970 + 971 + static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 972 + __releases(busiest->lock) 973 + { 974 + raw_spin_unlock(&busiest->lock); 975 + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 976 + } 977 + 978 + /* 979 + * double_rq_lock - safely lock two runqueues 980 + * 981 + * Note this does not disable interrupts like task_rq_lock, 982 + * you need to do so manually before calling. 983 + */ 984 + static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) 985 + __acquires(rq1->lock) 986 + __acquires(rq2->lock) 987 + { 988 + BUG_ON(!irqs_disabled()); 989 + if (rq1 == rq2) { 990 + raw_spin_lock(&rq1->lock); 991 + __acquire(rq2->lock); /* Fake it out ;) */ 992 + } else { 993 + if (rq1 < rq2) { 994 + raw_spin_lock(&rq1->lock); 995 + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 996 + } else { 997 + raw_spin_lock(&rq2->lock); 998 + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 999 + } 1000 + } 1001 + } 1002 + 1003 + /* 1004 + * double_rq_unlock - safely unlock two runqueues 1005 + * 1006 + * Note this does not restore interrupts like task_rq_unlock, 1007 + * you need to do so manually after calling. 
1008 + */ 1009 + static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1010 + __releases(rq1->lock) 1011 + __releases(rq2->lock) 1012 + { 1013 + raw_spin_unlock(&rq1->lock); 1014 + if (rq1 != rq2) 1015 + raw_spin_unlock(&rq2->lock); 1016 + else 1017 + __release(rq2->lock); 1018 + } 1019 + 1020 + #else /* CONFIG_SMP */ 1021 + 1022 + /* 1023 + * double_rq_lock - safely lock two runqueues 1024 + * 1025 + * Note this does not disable interrupts like task_rq_lock, 1026 + * you need to do so manually before calling. 1027 + */ 1028 + static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) 1029 + __acquires(rq1->lock) 1030 + __acquires(rq2->lock) 1031 + { 1032 + BUG_ON(!irqs_disabled()); 1033 + BUG_ON(rq1 != rq2); 1034 + raw_spin_lock(&rq1->lock); 1035 + __acquire(rq2->lock); /* Fake it out ;) */ 1036 + } 1037 + 1038 + /* 1039 + * double_rq_unlock - safely unlock two runqueues 1040 + * 1041 + * Note this does not restore interrupts like task_rq_unlock, 1042 + * you need to do so manually after calling. 1043 + */ 1044 + static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1045 + __releases(rq1->lock) 1046 + __releases(rq2->lock) 1047 + { 1048 + BUG_ON(rq1 != rq2); 1049 + raw_spin_unlock(&rq1->lock); 1050 + __release(rq2->lock); 1051 + } 1052 + 1053 + #endif 1054 + 1055 + extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); 1056 + extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); 1057 + extern void print_cfs_stats(struct seq_file *m, int cpu); 1058 + extern void print_rt_stats(struct seq_file *m, int cpu); 1059 + 1060 + extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1061 + extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1062 + extern void unthrottle_offline_cfs_rqs(struct rq *rq); 1063 + 1064 + extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
+8 -25
kernel/sched_autogroup.c
··· 1 1 #ifdef CONFIG_SCHED_AUTOGROUP 2 2 3 + #include "sched.h" 4 + 3 5 #include <linux/proc_fs.h> 4 6 #include <linux/seq_file.h> 5 7 #include <linux/kallsyms.h> 6 8 #include <linux/utsname.h> 9 + #include <linux/security.h> 10 + #include <linux/export.h> 7 11 8 12 unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 9 13 static struct autogroup autogroup_default; 10 14 static atomic_t autogroup_seq_nr; 11 15 12 - static void __init autogroup_init(struct task_struct *init_task) 16 + void __init autogroup_init(struct task_struct *init_task) 13 17 { 14 18 autogroup_default.tg = &root_task_group; 15 19 kref_init(&autogroup_default.kref); ··· 21 17 init_task->signal->autogroup = &autogroup_default; 22 18 } 23 19 24 - static inline void autogroup_free(struct task_group *tg) 20 + void autogroup_free(struct task_group *tg) 25 21 { 26 22 kfree(tg->autogroup); 27 23 } ··· 62 58 63 59 return ag; 64 60 } 65 - 66 - #ifdef CONFIG_RT_GROUP_SCHED 67 - static void free_rt_sched_group(struct task_group *tg); 68 - #endif 69 61 70 62 static inline struct autogroup *autogroup_create(void) 71 63 { ··· 108 108 return autogroup_kref_get(&autogroup_default); 109 109 } 110 110 111 - static inline bool 112 - task_wants_autogroup(struct task_struct *p, struct task_group *tg) 111 + bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) 113 112 { 114 113 if (tg != &root_task_group) 115 114 return false; ··· 124 125 return false; 125 126 126 127 return true; 127 - } 128 - 129 - static inline bool task_group_is_autogroup(struct task_group *tg) 130 - { 131 - return !!tg->autogroup; 132 - } 133 - 134 - static inline struct task_group * 135 - autogroup_task_group(struct task_struct *p, struct task_group *tg) 136 - { 137 - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 138 - 139 - if (enabled && task_wants_autogroup(p, tg)) 140 - return p->signal->autogroup->tg; 141 - 142 - return tg; 143 128 } 144 129 145 130 static void ··· 246 263 #endif /* CONFIG_PROC_FS 
*/ 247 264 248 265 #ifdef CONFIG_SCHED_DEBUG 249 - static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 266 + int autogroup_path(struct task_group *tg, char *buf, int buflen) 250 267 { 251 268 if (!task_group_is_autogroup(tg)) 252 269 return 0;
+24 -2
kernel/sched_autogroup.h
··· 1 1 #ifdef CONFIG_SCHED_AUTOGROUP 2 2 3 + #include <linux/kref.h> 4 + #include <linux/rwsem.h> 5 + 3 6 struct autogroup { 4 7 /* 5 8 * reference doesn't mean how many thread attach to this ··· 16 13 int nice; 17 14 }; 18 15 19 - static inline bool task_group_is_autogroup(struct task_group *tg); 16 + extern void autogroup_init(struct task_struct *init_task); 17 + extern void autogroup_free(struct task_group *tg); 18 + 19 + static inline bool task_group_is_autogroup(struct task_group *tg) 20 + { 21 + return !!tg->autogroup; 22 + } 23 + 24 + extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); 25 + 20 26 static inline struct task_group * 21 - autogroup_task_group(struct task_struct *p, struct task_group *tg); 27 + autogroup_task_group(struct task_struct *p, struct task_group *tg) 28 + { 29 + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 30 + 31 + if (enabled && task_wants_autogroup(p, tg)) 32 + return p->signal->autogroup->tg; 33 + 34 + return tg; 35 + } 36 + 37 + extern int autogroup_path(struct task_group *tg, char *buf, int buflen); 22 38 23 39 #else /* !CONFIG_SCHED_AUTOGROUP */ 24 40
+3 -1
kernel/sched_debug.c
··· 16 16 #include <linux/kallsyms.h> 17 17 #include <linux/utsname.h> 18 18 19 + #include "sched.h" 20 + 19 21 static DEFINE_SPINLOCK(sched_debug_lock); 20 22 21 23 /* ··· 375 373 return 0; 376 374 } 377 375 378 - static void sysrq_sched_debug_show(void) 376 + void sysrq_sched_debug_show(void) 379 377 { 380 378 sched_debug_show(NULL, NULL); 381 379 }
+542 -38
kernel/sched_fair.c
··· 23 23 #include <linux/latencytop.h> 24 24 #include <linux/sched.h> 25 25 #include <linux/cpumask.h> 26 + #include <linux/slab.h> 27 + #include <linux/profile.h> 28 + #include <linux/interrupt.h> 29 + 30 + #include <trace/events/sched.h> 31 + 32 + #include "sched.h" 26 33 27 34 /* 28 35 * Targeted preemption latency for CPU-bound tasks: ··· 110 103 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 111 104 #endif 112 105 113 - static const struct sched_class fair_sched_class; 106 + /* 107 + * Increase the granularity value when there are more CPUs, 108 + * because with more CPUs the 'effective latency' as visible 109 + * to users decreases. But the relationship is not linear, 110 + * so pick a second-best guess by going with the log2 of the 111 + * number of CPUs. 112 + * 113 + * This idea comes from the SD scheduler of Con Kolivas: 114 + */ 115 + static int get_update_sysctl_factor(void) 116 + { 117 + unsigned int cpus = min_t(int, num_online_cpus(), 8); 118 + unsigned int factor; 119 + 120 + switch (sysctl_sched_tunable_scaling) { 121 + case SCHED_TUNABLESCALING_NONE: 122 + factor = 1; 123 + break; 124 + case SCHED_TUNABLESCALING_LINEAR: 125 + factor = cpus; 126 + break; 127 + case SCHED_TUNABLESCALING_LOG: 128 + default: 129 + factor = 1 + ilog2(cpus); 130 + break; 131 + } 132 + 133 + return factor; 134 + } 135 + 136 + static void update_sysctl(void) 137 + { 138 + unsigned int factor = get_update_sysctl_factor(); 139 + 140 + #define SET_SYSCTL(name) \ 141 + (sysctl_##name = (factor) * normalized_sysctl_##name) 142 + SET_SYSCTL(sched_min_granularity); 143 + SET_SYSCTL(sched_latency); 144 + SET_SYSCTL(sched_wakeup_granularity); 145 + #undef SET_SYSCTL 146 + } 147 + 148 + void sched_init_granularity(void) 149 + { 150 + update_sysctl(); 151 + } 152 + 153 + #if BITS_PER_LONG == 32 154 + # define WMULT_CONST (~0UL) 155 + #else 156 + # define WMULT_CONST (1UL << 32) 157 + #endif 158 + 159 + #define WMULT_SHIFT 32 160 + 161 + /* 162 + * Shift right and round: 
163 + */ 164 + #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 165 + 166 + /* 167 + * delta *= weight / lw 168 + */ 169 + static unsigned long 170 + calc_delta_mine(unsigned long delta_exec, unsigned long weight, 171 + struct load_weight *lw) 172 + { 173 + u64 tmp; 174 + 175 + /* 176 + * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched 177 + * entities since MIN_SHARES = 2. Treat weight as 1 if less than 178 + * 2^SCHED_LOAD_RESOLUTION. 179 + */ 180 + if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) 181 + tmp = (u64)delta_exec * scale_load_down(weight); 182 + else 183 + tmp = (u64)delta_exec; 184 + 185 + if (!lw->inv_weight) { 186 + unsigned long w = scale_load_down(lw->weight); 187 + 188 + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 189 + lw->inv_weight = 1; 190 + else if (unlikely(!w)) 191 + lw->inv_weight = WMULT_CONST; 192 + else 193 + lw->inv_weight = WMULT_CONST / w; 194 + } 195 + 196 + /* 197 + * Check whether we'd overflow the 64-bit multiplication: 198 + */ 199 + if (unlikely(tmp > WMULT_CONST)) 200 + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 201 + WMULT_SHIFT/2); 202 + else 203 + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 204 + 205 + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 206 + } 207 + 208 + 209 + const struct sched_class fair_sched_class; 114 210 115 211 /************************************************************** 116 212 * CFS operations on generic schedulable entities: ··· 523 413 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 524 414 } 525 415 526 - static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 416 + struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 527 417 { 528 418 struct rb_node *left = cfs_rq->rb_leftmost; 529 419 ··· 544 434 } 545 435 546 436 #ifdef CONFIG_SCHED_DEBUG 547 - static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 437 + struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 548 438 { 549 439 
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 550 440 ··· 794 684 { 795 685 update_load_add(&cfs_rq->load, se->load.weight); 796 686 if (!parent_entity(se)) 797 - inc_cpu_load(rq_of(cfs_rq), se->load.weight); 687 + update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 798 688 if (entity_is_task(se)) { 799 689 add_cfs_task_weight(cfs_rq, se->load.weight); 800 690 list_add(&se->group_node, &cfs_rq->tasks); ··· 807 697 { 808 698 update_load_sub(&cfs_rq->load, se->load.weight); 809 699 if (!parent_entity(se)) 810 - dec_cpu_load(rq_of(cfs_rq), se->load.weight); 700 + update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 811 701 if (entity_is_task(se)) { 812 702 add_cfs_task_weight(cfs_rq, -se->load.weight); 813 703 list_del_init(&se->group_node); ··· 1397 1287 */ 1398 1288 1399 1289 #ifdef CONFIG_CFS_BANDWIDTH 1290 + 1291 + #ifdef HAVE_JUMP_LABEL 1292 + static struct jump_label_key __cfs_bandwidth_used; 1293 + 1294 + static inline bool cfs_bandwidth_used(void) 1295 + { 1296 + return static_branch(&__cfs_bandwidth_used); 1297 + } 1298 + 1299 + void account_cfs_bandwidth_used(int enabled, int was_enabled) 1300 + { 1301 + /* only need to count groups transitioning between enabled/!enabled */ 1302 + if (enabled && !was_enabled) 1303 + jump_label_inc(&__cfs_bandwidth_used); 1304 + else if (!enabled && was_enabled) 1305 + jump_label_dec(&__cfs_bandwidth_used); 1306 + } 1307 + #else /* HAVE_JUMP_LABEL */ 1308 + static bool cfs_bandwidth_used(void) 1309 + { 1310 + return true; 1311 + } 1312 + 1313 + void account_cfs_bandwidth_used(int enabled, int was_enabled) {} 1314 + #endif /* HAVE_JUMP_LABEL */ 1315 + 1400 1316 /* 1401 1317 * default period for cfs group bandwidth. 
1402 1318 * default: 0.1s, units: nanoseconds ··· 1444 1308 * 1445 1309 * requires cfs_b->lock 1446 1310 */ 1447 - static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 1311 + void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 1448 1312 { 1449 1313 u64 now; 1450 1314 ··· 1454 1318 now = sched_clock_cpu(smp_processor_id()); 1455 1319 cfs_b->runtime = cfs_b->quota; 1456 1320 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); 1321 + } 1322 + 1323 + static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) 1324 + { 1325 + return &tg->cfs_bandwidth; 1457 1326 } 1458 1327 1459 1328 /* returns 0 on failure to allocate runtime */ ··· 1671 1530 raw_spin_unlock(&cfs_b->lock); 1672 1531 } 1673 1532 1674 - static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 1533 + void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 1675 1534 { 1676 1535 struct rq *rq = rq_of(cfs_rq); 1677 1536 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); ··· 1980 1839 1981 1840 throttle_cfs_rq(cfs_rq); 1982 1841 } 1983 - #else 1842 + 1843 + static inline u64 default_cfs_period(void); 1844 + static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); 1845 + static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); 1846 + 1847 + static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 1848 + { 1849 + struct cfs_bandwidth *cfs_b = 1850 + container_of(timer, struct cfs_bandwidth, slack_timer); 1851 + do_sched_cfs_slack_timer(cfs_b); 1852 + 1853 + return HRTIMER_NORESTART; 1854 + } 1855 + 1856 + static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) 1857 + { 1858 + struct cfs_bandwidth *cfs_b = 1859 + container_of(timer, struct cfs_bandwidth, period_timer); 1860 + ktime_t now; 1861 + int overrun; 1862 + int idle = 0; 1863 + 1864 + for (;;) { 1865 + now = hrtimer_cb_get_time(timer); 1866 + overrun = hrtimer_forward(timer, now, cfs_b->period); 1867 + 1868 + if (!overrun) 1869 + break; 
1870 + 1871 + idle = do_sched_cfs_period_timer(cfs_b, overrun); 1872 + } 1873 + 1874 + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 1875 + } 1876 + 1877 + void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 1878 + { 1879 + raw_spin_lock_init(&cfs_b->lock); 1880 + cfs_b->runtime = 0; 1881 + cfs_b->quota = RUNTIME_INF; 1882 + cfs_b->period = ns_to_ktime(default_cfs_period()); 1883 + 1884 + INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); 1885 + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1886 + cfs_b->period_timer.function = sched_cfs_period_timer; 1887 + hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1888 + cfs_b->slack_timer.function = sched_cfs_slack_timer; 1889 + } 1890 + 1891 + static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1892 + { 1893 + cfs_rq->runtime_enabled = 0; 1894 + INIT_LIST_HEAD(&cfs_rq->throttled_list); 1895 + } 1896 + 1897 + /* requires cfs_b->lock, may release to reprogram timer */ 1898 + void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 1899 + { 1900 + /* 1901 + * The timer may be active because we're trying to set a new bandwidth 1902 + * period or because we're racing with the tear-down path 1903 + * (timer_active==0 becomes visible before the hrtimer call-back 1904 + * terminates). 
In either case we ensure that it's re-programmed 1905 + */ 1906 + while (unlikely(hrtimer_active(&cfs_b->period_timer))) { 1907 + raw_spin_unlock(&cfs_b->lock); 1908 + /* ensure cfs_b->lock is available while we wait */ 1909 + hrtimer_cancel(&cfs_b->period_timer); 1910 + 1911 + raw_spin_lock(&cfs_b->lock); 1912 + /* if someone else restarted the timer then we're done */ 1913 + if (cfs_b->timer_active) 1914 + return; 1915 + } 1916 + 1917 + cfs_b->timer_active = 1; 1918 + start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); 1919 + } 1920 + 1921 + static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 1922 + { 1923 + hrtimer_cancel(&cfs_b->period_timer); 1924 + hrtimer_cancel(&cfs_b->slack_timer); 1925 + } 1926 + 1927 + void unthrottle_offline_cfs_rqs(struct rq *rq) 1928 + { 1929 + struct cfs_rq *cfs_rq; 1930 + 1931 + for_each_leaf_cfs_rq(rq, cfs_rq) { 1932 + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 1933 + 1934 + if (!cfs_rq->runtime_enabled) 1935 + continue; 1936 + 1937 + /* 1938 + * clock_task is not advancing so we just need to make sure 1939 + * there's some valid quota amount 1940 + */ 1941 + cfs_rq->runtime_remaining = cfs_b->quota; 1942 + if (cfs_rq_throttled(cfs_rq)) 1943 + unthrottle_cfs_rq(cfs_rq); 1944 + } 1945 + } 1946 + 1947 + #else /* CONFIG_CFS_BANDWIDTH */ 1984 1948 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1985 1949 unsigned long delta_exec) {} 1986 1950 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} ··· 2107 1861 { 2108 1862 return 0; 2109 1863 } 1864 + 1865 + void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 1866 + 1867 + #ifdef CONFIG_FAIR_GROUP_SCHED 1868 + static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2110 1869 #endif 1870 + 1871 + static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) 1872 + { 1873 + return NULL; 1874 + } 1875 + static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 1876 + void unthrottle_offline_cfs_rqs(struct 
rq *rq) {} 1877 + 1878 + #endif /* CONFIG_CFS_BANDWIDTH */ 2111 1879 2112 1880 /************************************************** 2113 1881 * CFS operations on tasks: ··· 2289 2029 } 2290 2030 2291 2031 #ifdef CONFIG_SMP 2032 + /* Used instead of source_load when we know the type == 0 */ 2033 + static unsigned long weighted_cpuload(const int cpu) 2034 + { 2035 + return cpu_rq(cpu)->load.weight; 2036 + } 2037 + 2038 + /* 2039 + * Return a low guess at the load of a migration-source cpu weighted 2040 + * according to the scheduling class and "nice" value. 2041 + * 2042 + * We want to under-estimate the load of migration sources, to 2043 + * balance conservatively. 2044 + */ 2045 + static unsigned long source_load(int cpu, int type) 2046 + { 2047 + struct rq *rq = cpu_rq(cpu); 2048 + unsigned long total = weighted_cpuload(cpu); 2049 + 2050 + if (type == 0 || !sched_feat(LB_BIAS)) 2051 + return total; 2052 + 2053 + return min(rq->cpu_load[type-1], total); 2054 + } 2055 + 2056 + /* 2057 + * Return a high guess at the load of a migration-target cpu weighted 2058 + * according to the scheduling class and "nice" value. 
2059 + */ 2060 + static unsigned long target_load(int cpu, int type) 2061 + { 2062 + struct rq *rq = cpu_rq(cpu); 2063 + unsigned long total = weighted_cpuload(cpu); 2064 + 2065 + if (type == 0 || !sched_feat(LB_BIAS)) 2066 + return total; 2067 + 2068 + return max(rq->cpu_load[type-1], total); 2069 + } 2070 + 2071 + static unsigned long power_of(int cpu) 2072 + { 2073 + return cpu_rq(cpu)->cpu_power; 2074 + } 2075 + 2076 + static unsigned long cpu_avg_load_per_task(int cpu) 2077 + { 2078 + struct rq *rq = cpu_rq(cpu); 2079 + unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 2080 + 2081 + if (nr_running) 2082 + return rq->load.weight / nr_running; 2083 + 2084 + return 0; 2085 + } 2086 + 2292 2087 2293 2088 static void task_waking_fair(struct task_struct *p) 2294 2089 { ··· 3098 2783 } 3099 2784 3100 2785 /* 2786 + * Is this task likely cache-hot: 2787 + */ 2788 + static int 2789 + task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 2790 + { 2791 + s64 delta; 2792 + 2793 + if (p->sched_class != &fair_sched_class) 2794 + return 0; 2795 + 2796 + if (unlikely(p->policy == SCHED_IDLE)) 2797 + return 0; 2798 + 2799 + /* 2800 + * Buddy candidates are cache hot: 2801 + */ 2802 + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 2803 + (&p->se == cfs_rq_of(&p->se)->next || 2804 + &p->se == cfs_rq_of(&p->se)->last)) 2805 + return 1; 2806 + 2807 + if (sysctl_sched_migration_cost == -1) 2808 + return 1; 2809 + if (sysctl_sched_migration_cost == 0) 2810 + return 0; 2811 + 2812 + delta = now - p->se.exec_start; 2813 + 2814 + return delta < (s64)sysctl_sched_migration_cost; 2815 + } 2816 + 2817 + /* 3101 2818 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3102 2819 */ 3103 2820 static ··· 3509 3162 }; 3510 3163 3511 3164 /** 3512 - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 3513 - * @group: The group whose first cpu is to be returned. 
3514 - */ 3515 - static inline unsigned int group_first_cpu(struct sched_group *group) 3516 - { 3517 - return cpumask_first(sched_group_cpus(group)); 3518 - } 3519 - 3520 - /** 3521 3165 * get_sd_load_idx - Obtain the load index for a given sched domain. 3522 3166 * @sd: The sched_domain whose load_idx is to be obtained. 3523 3167 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. ··· 3757 3419 sdg->sgp->power = power; 3758 3420 } 3759 3421 3760 - static void update_group_power(struct sched_domain *sd, int cpu) 3422 + void update_group_power(struct sched_domain *sd, int cpu) 3761 3423 { 3762 3424 struct sched_domain *child = sd->child; 3763 3425 struct sched_group *group, *sdg = sd->groups; ··· 4021 3683 update_sd_power_savings_stats(sg, sds, local_group, &sgs); 4022 3684 sg = sg->next; 4023 3685 } while (sg != sd->groups); 4024 - } 4025 - 4026 - int __weak arch_sd_sibling_asym_packing(void) 4027 - { 4028 - return 0*SD_ASYM_PACKING; 4029 3686 } 4030 3687 4031 3688 /** ··· 4386 4053 #define MAX_PINNED_INTERVAL 512 4387 4054 4388 4055 /* Working cpumask for load_balance and load_balance_newidle. */ 4389 - static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4056 + DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4390 4057 4391 4058 static int need_active_balance(struct sched_domain *sd, int idle, 4392 4059 int busiest_cpu, int this_cpu) ··· 4589 4256 * idle_balance is called by schedule() if this_cpu is about to become 4590 4257 * idle. Attempts to pull tasks from other CPUs. 4591 4258 */ 4592 - static void idle_balance(int this_cpu, struct rq *this_rq) 4259 + void idle_balance(int this_cpu, struct rq *this_rq) 4593 4260 { 4594 4261 struct sched_domain *sd; 4595 4262 int pulled_task = 0; ··· 4964 4631 * Scale the max load_balance interval with the number of CPUs in the system. 4965 4632 * This trades load-balance latency on larger machines for less cross talk. 
4966 4633 */ 4967 - static void update_max_interval(void) 4634 + void update_max_interval(void) 4968 4635 { 4969 4636 max_load_balance_interval = HZ*num_online_cpus()/10; 4970 4637 } ··· 5166 4833 /* 5167 4834 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 5168 4835 */ 5169 - static inline void trigger_load_balance(struct rq *rq, int cpu) 4836 + void trigger_load_balance(struct rq *rq, int cpu) 5170 4837 { 5171 4838 /* Don't need to rebalance while attached to NULL domain */ 5172 4839 if (time_after_eq(jiffies, rq->next_balance) && ··· 5186 4853 static void rq_offline_fair(struct rq *rq) 5187 4854 { 5188 4855 update_sysctl(); 5189 - } 5190 - 5191 - #else /* CONFIG_SMP */ 5192 - 5193 - /* 5194 - * on UP we do not need to balance between CPUs: 5195 - */ 5196 - static inline void idle_balance(int cpu, struct rq *rq) 5197 - { 5198 4856 } 5199 4857 5200 4858 #endif /* CONFIG_SMP */ ··· 5330 5006 } 5331 5007 } 5332 5008 5009 + void init_cfs_rq(struct cfs_rq *cfs_rq) 5010 + { 5011 + cfs_rq->tasks_timeline = RB_ROOT; 5012 + INIT_LIST_HEAD(&cfs_rq->tasks); 5013 + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 5014 + #ifndef CONFIG_64BIT 5015 + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5016 + #endif 5017 + } 5018 + 5333 5019 #ifdef CONFIG_FAIR_GROUP_SCHED 5334 5020 static void task_move_group_fair(struct task_struct *p, int on_rq) 5335 5021 { ··· 5362 5028 if (!on_rq) 5363 5029 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5364 5030 } 5031 + 5032 + void free_fair_sched_group(struct task_group *tg) 5033 + { 5034 + int i; 5035 + 5036 + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); 5037 + 5038 + for_each_possible_cpu(i) { 5039 + if (tg->cfs_rq) 5040 + kfree(tg->cfs_rq[i]); 5041 + if (tg->se) 5042 + kfree(tg->se[i]); 5043 + } 5044 + 5045 + kfree(tg->cfs_rq); 5046 + kfree(tg->se); 5047 + } 5048 + 5049 + int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 5050 + { 5051 + struct cfs_rq *cfs_rq; 5052 + struct sched_entity 
*se; 5053 + int i; 5054 + 5055 + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 5056 + if (!tg->cfs_rq) 5057 + goto err; 5058 + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 5059 + if (!tg->se) 5060 + goto err; 5061 + 5062 + tg->shares = NICE_0_LOAD; 5063 + 5064 + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); 5065 + 5066 + for_each_possible_cpu(i) { 5067 + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 5068 + GFP_KERNEL, cpu_to_node(i)); 5069 + if (!cfs_rq) 5070 + goto err; 5071 + 5072 + se = kzalloc_node(sizeof(struct sched_entity), 5073 + GFP_KERNEL, cpu_to_node(i)); 5074 + if (!se) 5075 + goto err_free_rq; 5076 + 5077 + init_cfs_rq(cfs_rq); 5078 + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 5079 + } 5080 + 5081 + return 1; 5082 + 5083 + err_free_rq: 5084 + kfree(cfs_rq); 5085 + err: 5086 + return 0; 5087 + } 5088 + 5089 + void unregister_fair_sched_group(struct task_group *tg, int cpu) 5090 + { 5091 + struct rq *rq = cpu_rq(cpu); 5092 + unsigned long flags; 5093 + 5094 + /* 5095 + * Only empty task groups can be destroyed; so we can speculatively 5096 + * check on_list without danger of it being re-added. 
5097 + */ 5098 + if (!tg->cfs_rq[cpu]->on_list) 5099 + return; 5100 + 5101 + raw_spin_lock_irqsave(&rq->lock, flags); 5102 + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 5103 + raw_spin_unlock_irqrestore(&rq->lock, flags); 5104 + } 5105 + 5106 + void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 5107 + struct sched_entity *se, int cpu, 5108 + struct sched_entity *parent) 5109 + { 5110 + struct rq *rq = cpu_rq(cpu); 5111 + 5112 + cfs_rq->tg = tg; 5113 + cfs_rq->rq = rq; 5114 + #ifdef CONFIG_SMP 5115 + /* allow initial update_cfs_load() to truncate */ 5116 + cfs_rq->load_stamp = 1; 5365 5117 #endif 5118 + init_cfs_rq_runtime(cfs_rq); 5119 + 5120 + tg->cfs_rq[cpu] = cfs_rq; 5121 + tg->se[cpu] = se; 5122 + 5123 + /* se could be NULL for root_task_group */ 5124 + if (!se) 5125 + return; 5126 + 5127 + if (!parent) 5128 + se->cfs_rq = &rq->cfs; 5129 + else 5130 + se->cfs_rq = parent->my_q; 5131 + 5132 + se->my_q = cfs_rq; 5133 + update_load_set(&se->load, 0); 5134 + se->parent = parent; 5135 + } 5136 + 5137 + static DEFINE_MUTEX(shares_mutex); 5138 + 5139 + int sched_group_set_shares(struct task_group *tg, unsigned long shares) 5140 + { 5141 + int i; 5142 + unsigned long flags; 5143 + 5144 + /* 5145 + * We can't change the weight of the root cgroup. 
5146 + */ 5147 + if (!tg->se[0]) 5148 + return -EINVAL; 5149 + 5150 + shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); 5151 + 5152 + mutex_lock(&shares_mutex); 5153 + if (tg->shares == shares) 5154 + goto done; 5155 + 5156 + tg->shares = shares; 5157 + for_each_possible_cpu(i) { 5158 + struct rq *rq = cpu_rq(i); 5159 + struct sched_entity *se; 5160 + 5161 + se = tg->se[i]; 5162 + /* Propagate contribution to hierarchy */ 5163 + raw_spin_lock_irqsave(&rq->lock, flags); 5164 + for_each_sched_entity(se) 5165 + update_cfs_shares(group_cfs_rq(se)); 5166 + raw_spin_unlock_irqrestore(&rq->lock, flags); 5167 + } 5168 + 5169 + done: 5170 + mutex_unlock(&shares_mutex); 5171 + return 0; 5172 + } 5173 + #else /* CONFIG_FAIR_GROUP_SCHED */ 5174 + 5175 + void free_fair_sched_group(struct task_group *tg) { } 5176 + 5177 + int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 5178 + { 5179 + return 1; 5180 + } 5181 + 5182 + void unregister_fair_sched_group(struct task_group *tg, int cpu) { } 5183 + 5184 + #endif /* CONFIG_FAIR_GROUP_SCHED */ 5185 + 5366 5186 5367 5187 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 5368 5188 { ··· 5536 5048 /* 5537 5049 * All the scheduling class methods: 5538 5050 */ 5539 - static const struct sched_class fair_sched_class = { 5051 + const struct sched_class fair_sched_class = { 5540 5052 .next = &idle_sched_class, 5541 5053 .enqueue_task = enqueue_task_fair, 5542 5054 .dequeue_task = dequeue_task_fair, ··· 5573 5085 }; 5574 5086 5575 5087 #ifdef CONFIG_SCHED_DEBUG 5576 - static void print_cfs_stats(struct seq_file *m, int cpu) 5088 + void print_cfs_stats(struct seq_file *m, int cpu) 5577 5089 { 5578 5090 struct cfs_rq *cfs_rq; 5579 5091 ··· 5583 5095 rcu_read_unlock(); 5584 5096 } 5585 5097 #endif 5098 + 5099 + __init void init_sched_fair_class(void) 5100 + { 5101 + #ifdef CONFIG_SMP 5102 + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 5103 + 5104 + #ifdef 
CONFIG_NO_HZ 5105 + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 5106 + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 5107 + atomic_set(&nohz.load_balancer, nr_cpu_ids); 5108 + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); 5109 + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); 5110 + #endif 5111 + #endif /* SMP */ 5112 + 5113 + }
+3 -1
kernel/sched_idletask.c
··· 1 + #include "sched.h" 2 + 1 3 /* 2 4 * idle-task scheduling class. 3 5 * ··· 73 71 /* 74 72 * Simple, special scheduling class for the per-CPU idle tasks: 75 73 */ 76 - static const struct sched_class idle_sched_class = { 74 + const struct sched_class idle_sched_class = { 77 75 /* .next is NULL */ 78 76 /* no enqueue/yield_task for idle tasks */ 79 77
+203 -6
kernel/sched_rt.c
··· 3 3 * policies) 4 4 */ 5 5 6 + #include "sched.h" 7 + 8 + #include <linux/slab.h> 9 + 10 + static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 11 + 12 + struct rt_bandwidth def_rt_bandwidth; 13 + 14 + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) 15 + { 16 + struct rt_bandwidth *rt_b = 17 + container_of(timer, struct rt_bandwidth, rt_period_timer); 18 + ktime_t now; 19 + int overrun; 20 + int idle = 0; 21 + 22 + for (;;) { 23 + now = hrtimer_cb_get_time(timer); 24 + overrun = hrtimer_forward(timer, now, rt_b->rt_period); 25 + 26 + if (!overrun) 27 + break; 28 + 29 + idle = do_sched_rt_period_timer(rt_b, overrun); 30 + } 31 + 32 + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 33 + } 34 + 35 + void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) 36 + { 37 + rt_b->rt_period = ns_to_ktime(period); 38 + rt_b->rt_runtime = runtime; 39 + 40 + raw_spin_lock_init(&rt_b->rt_runtime_lock); 41 + 42 + hrtimer_init(&rt_b->rt_period_timer, 43 + CLOCK_MONOTONIC, HRTIMER_MODE_REL); 44 + rt_b->rt_period_timer.function = sched_rt_period_timer; 45 + } 46 + 47 + static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 48 + { 49 + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 50 + return; 51 + 52 + if (hrtimer_active(&rt_b->rt_period_timer)) 53 + return; 54 + 55 + raw_spin_lock(&rt_b->rt_runtime_lock); 56 + start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); 57 + raw_spin_unlock(&rt_b->rt_runtime_lock); 58 + } 59 + 60 + void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 61 + { 62 + struct rt_prio_array *array; 63 + int i; 64 + 65 + array = &rt_rq->active; 66 + for (i = 0; i < MAX_RT_PRIO; i++) { 67 + INIT_LIST_HEAD(array->queue + i); 68 + __clear_bit(i, array->bitmap); 69 + } 70 + /* delimiter for bitsearch: */ 71 + __set_bit(MAX_RT_PRIO, array->bitmap); 72 + 73 + #if defined CONFIG_SMP 74 + rt_rq->highest_prio.curr = MAX_RT_PRIO; 75 + rt_rq->highest_prio.next = 
MAX_RT_PRIO; 76 + rt_rq->rt_nr_migratory = 0; 77 + rt_rq->overloaded = 0; 78 + plist_head_init(&rt_rq->pushable_tasks); 79 + #endif 80 + 81 + rt_rq->rt_time = 0; 82 + rt_rq->rt_throttled = 0; 83 + rt_rq->rt_runtime = 0; 84 + raw_spin_lock_init(&rt_rq->rt_runtime_lock); 85 + } 86 + 6 87 #ifdef CONFIG_RT_GROUP_SCHED 88 + static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 89 + { 90 + hrtimer_cancel(&rt_b->rt_period_timer); 91 + } 7 92 8 93 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 9 94 ··· 108 23 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 109 24 { 110 25 return rt_se->rt_rq; 26 + } 27 + 28 + void free_rt_sched_group(struct task_group *tg) 29 + { 30 + int i; 31 + 32 + if (tg->rt_se) 33 + destroy_rt_bandwidth(&tg->rt_bandwidth); 34 + 35 + for_each_possible_cpu(i) { 36 + if (tg->rt_rq) 37 + kfree(tg->rt_rq[i]); 38 + if (tg->rt_se) 39 + kfree(tg->rt_se[i]); 40 + } 41 + 42 + kfree(tg->rt_rq); 43 + kfree(tg->rt_se); 44 + } 45 + 46 + void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 47 + struct sched_rt_entity *rt_se, int cpu, 48 + struct sched_rt_entity *parent) 49 + { 50 + struct rq *rq = cpu_rq(cpu); 51 + 52 + rt_rq->highest_prio.curr = MAX_RT_PRIO; 53 + rt_rq->rt_nr_boosted = 0; 54 + rt_rq->rq = rq; 55 + rt_rq->tg = tg; 56 + 57 + tg->rt_rq[cpu] = rt_rq; 58 + tg->rt_se[cpu] = rt_se; 59 + 60 + if (!rt_se) 61 + return; 62 + 63 + if (!parent) 64 + rt_se->rt_rq = &rq->rt; 65 + else 66 + rt_se->rt_rq = parent->my_q; 67 + 68 + rt_se->my_q = rt_rq; 69 + rt_se->parent = parent; 70 + INIT_LIST_HEAD(&rt_se->run_list); 71 + } 72 + 73 + int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 74 + { 75 + struct rt_rq *rt_rq; 76 + struct sched_rt_entity *rt_se; 77 + int i; 78 + 79 + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 80 + if (!tg->rt_rq) 81 + goto err; 82 + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 83 + if (!tg->rt_se) 84 + goto err; 85 + 86 + 
init_rt_bandwidth(&tg->rt_bandwidth, 87 + ktime_to_ns(def_rt_bandwidth.rt_period), 0); 88 + 89 + for_each_possible_cpu(i) { 90 + rt_rq = kzalloc_node(sizeof(struct rt_rq), 91 + GFP_KERNEL, cpu_to_node(i)); 92 + if (!rt_rq) 93 + goto err; 94 + 95 + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 96 + GFP_KERNEL, cpu_to_node(i)); 97 + if (!rt_se) 98 + goto err_free_rq; 99 + 100 + init_rt_rq(rt_rq, cpu_rq(i)); 101 + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 102 + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 103 + } 104 + 105 + return 1; 106 + 107 + err_free_rq: 108 + kfree(rt_rq); 109 + err: 110 + return 0; 111 111 } 112 112 113 113 #else /* CONFIG_RT_GROUP_SCHED */ ··· 217 47 return &rq->rt; 218 48 } 219 49 50 + void free_rt_sched_group(struct task_group *tg) { } 51 + 52 + int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 53 + { 54 + return 1; 55 + } 220 56 #endif /* CONFIG_RT_GROUP_SCHED */ 221 57 222 58 #ifdef CONFIG_SMP ··· 730 554 raw_spin_lock_irqsave(&rq->lock, flags); 731 555 __enable_runtime(rq); 732 556 raw_spin_unlock_irqrestore(&rq->lock, flags); 557 + } 558 + 559 + int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) 560 + { 561 + int cpu = (int)(long)hcpu; 562 + 563 + switch (action) { 564 + case CPU_DOWN_PREPARE: 565 + case CPU_DOWN_PREPARE_FROZEN: 566 + disable_runtime(cpu_rq(cpu)); 567 + return NOTIFY_OK; 568 + 569 + case CPU_DOWN_FAILED: 570 + case CPU_DOWN_FAILED_FROZEN: 571 + case CPU_ONLINE: 572 + case CPU_ONLINE_FROZEN: 573 + enable_runtime(cpu_rq(cpu)); 574 + return NOTIFY_OK; 575 + 576 + default: 577 + return NOTIFY_DONE; 578 + } 733 579 } 734 580 735 581 static int balance_runtime(struct rt_rq *rt_rq) ··· 1376 1178 /* Only try algorithms three times */ 1377 1179 #define RT_MAX_TRIES 3 1378 1180 1379 - static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 1380 - 1381 1181 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 
1382 1182 { 1383 1183 if (!task_running(rq, p) && ··· 1849 1653 pull_rt_task(rq); 1850 1654 } 1851 1655 1852 - static inline void init_sched_rt_class(void) 1656 + void init_sched_rt_class(void) 1853 1657 { 1854 1658 unsigned int i; 1855 1659 1856 - for_each_possible_cpu(i) 1660 + for_each_possible_cpu(i) { 1857 1661 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1858 1662 GFP_KERNEL, cpu_to_node(i)); 1663 + } 1859 1664 } 1860 1665 #endif /* CONFIG_SMP */ 1861 1666 ··· 1997 1800 return 0; 1998 1801 } 1999 1802 2000 - static const struct sched_class rt_sched_class = { 1803 + const struct sched_class rt_sched_class = { 2001 1804 .next = &fair_sched_class, 2002 1805 .enqueue_task = enqueue_task_rt, 2003 1806 .dequeue_task = dequeue_task_rt, ··· 2032 1835 #ifdef CONFIG_SCHED_DEBUG 2033 1836 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2034 1837 2035 - static void print_rt_stats(struct seq_file *m, int cpu) 1838 + void print_rt_stats(struct seq_file *m, int cpu) 2036 1839 { 2037 1840 rt_rq_iter_t iter; 2038 1841 struct rt_rq *rt_rq;
+111
kernel/sched_stats.c
··· 1 + 2 + #include <linux/slab.h> 3 + #include <linux/fs.h> 4 + #include <linux/seq_file.h> 5 + #include <linux/proc_fs.h> 6 + 7 + #include "sched.h" 8 + 9 + /* 10 + * bump this up when changing the output format or the meaning of an existing 11 + * format, so that tools can adapt (or abort) 12 + */ 13 + #define SCHEDSTAT_VERSION 15 14 + 15 + static int show_schedstat(struct seq_file *seq, void *v) 16 + { 17 + int cpu; 18 + int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; 19 + char *mask_str = kmalloc(mask_len, GFP_KERNEL); 20 + 21 + if (mask_str == NULL) 22 + return -ENOMEM; 23 + 24 + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 25 + seq_printf(seq, "timestamp %lu\n", jiffies); 26 + for_each_online_cpu(cpu) { 27 + struct rq *rq = cpu_rq(cpu); 28 + #ifdef CONFIG_SMP 29 + struct sched_domain *sd; 30 + int dcount = 0; 31 + #endif 32 + 33 + /* runqueue-specific stats */ 34 + seq_printf(seq, 35 + "cpu%d %u %u %u %u %u %u %llu %llu %lu", 36 + cpu, rq->yld_count, 37 + rq->sched_switch, rq->sched_count, rq->sched_goidle, 38 + rq->ttwu_count, rq->ttwu_local, 39 + rq->rq_cpu_time, 40 + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); 41 + 42 + seq_printf(seq, "\n"); 43 + 44 + #ifdef CONFIG_SMP 45 + /* domain-specific stats */ 46 + rcu_read_lock(); 47 + for_each_domain(cpu, sd) { 48 + enum cpu_idle_type itype; 49 + 50 + cpumask_scnprintf(mask_str, mask_len, 51 + sched_domain_span(sd)); 52 + seq_printf(seq, "domain%d %s", dcount++, mask_str); 53 + for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 54 + itype++) { 55 + seq_printf(seq, " %u %u %u %u %u %u %u %u", 56 + sd->lb_count[itype], 57 + sd->lb_balanced[itype], 58 + sd->lb_failed[itype], 59 + sd->lb_imbalance[itype], 60 + sd->lb_gained[itype], 61 + sd->lb_hot_gained[itype], 62 + sd->lb_nobusyq[itype], 63 + sd->lb_nobusyg[itype]); 64 + } 65 + seq_printf(seq, 66 + " %u %u %u %u %u %u %u %u %u %u %u %u\n", 67 + sd->alb_count, sd->alb_failed, sd->alb_pushed, 68 + sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, 
69 + sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, 70 + sd->ttwu_wake_remote, sd->ttwu_move_affine, 71 + sd->ttwu_move_balance); 72 + } 73 + rcu_read_unlock(); 74 + #endif 75 + } 76 + kfree(mask_str); 77 + return 0; 78 + } 79 + 80 + static int schedstat_open(struct inode *inode, struct file *file) 81 + { 82 + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 83 + char *buf = kmalloc(size, GFP_KERNEL); 84 + struct seq_file *m; 85 + int res; 86 + 87 + if (!buf) 88 + return -ENOMEM; 89 + res = single_open(file, show_schedstat, NULL); 90 + if (!res) { 91 + m = file->private_data; 92 + m->buf = buf; 93 + m->size = size; 94 + } else 95 + kfree(buf); 96 + return res; 97 + } 98 + 99 + static const struct file_operations proc_schedstat_operations = { 100 + .open = schedstat_open, 101 + .read = seq_read, 102 + .llseek = seq_lseek, 103 + .release = single_release, 104 + }; 105 + 106 + static int __init proc_schedstat_init(void) 107 + { 108 + proc_create("schedstat", 0, NULL, &proc_schedstat_operations); 109 + return 0; 110 + } 111 + module_init(proc_schedstat_init);
-103
kernel/sched_stats.h
··· 1 1 2 2 #ifdef CONFIG_SCHEDSTATS 3 - /* 4 - * bump this up when changing the output format or the meaning of an existing 5 - * format, so that tools can adapt (or abort) 6 - */ 7 - #define SCHEDSTAT_VERSION 15 8 - 9 - static int show_schedstat(struct seq_file *seq, void *v) 10 - { 11 - int cpu; 12 - int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; 13 - char *mask_str = kmalloc(mask_len, GFP_KERNEL); 14 - 15 - if (mask_str == NULL) 16 - return -ENOMEM; 17 - 18 - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 19 - seq_printf(seq, "timestamp %lu\n", jiffies); 20 - for_each_online_cpu(cpu) { 21 - struct rq *rq = cpu_rq(cpu); 22 - #ifdef CONFIG_SMP 23 - struct sched_domain *sd; 24 - int dcount = 0; 25 - #endif 26 - 27 - /* runqueue-specific stats */ 28 - seq_printf(seq, 29 - "cpu%d %u %u %u %u %u %u %llu %llu %lu", 30 - cpu, rq->yld_count, 31 - rq->sched_switch, rq->sched_count, rq->sched_goidle, 32 - rq->ttwu_count, rq->ttwu_local, 33 - rq->rq_cpu_time, 34 - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); 35 - 36 - seq_printf(seq, "\n"); 37 - 38 - #ifdef CONFIG_SMP 39 - /* domain-specific stats */ 40 - rcu_read_lock(); 41 - for_each_domain(cpu, sd) { 42 - enum cpu_idle_type itype; 43 - 44 - cpumask_scnprintf(mask_str, mask_len, 45 - sched_domain_span(sd)); 46 - seq_printf(seq, "domain%d %s", dcount++, mask_str); 47 - for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 48 - itype++) { 49 - seq_printf(seq, " %u %u %u %u %u %u %u %u", 50 - sd->lb_count[itype], 51 - sd->lb_balanced[itype], 52 - sd->lb_failed[itype], 53 - sd->lb_imbalance[itype], 54 - sd->lb_gained[itype], 55 - sd->lb_hot_gained[itype], 56 - sd->lb_nobusyq[itype], 57 - sd->lb_nobusyg[itype]); 58 - } 59 - seq_printf(seq, 60 - " %u %u %u %u %u %u %u %u %u %u %u %u\n", 61 - sd->alb_count, sd->alb_failed, sd->alb_pushed, 62 - sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, 63 - sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, 64 - sd->ttwu_wake_remote, sd->ttwu_move_affine, 65 - 
sd->ttwu_move_balance); 66 - } 67 - rcu_read_unlock(); 68 - #endif 69 - } 70 - kfree(mask_str); 71 - return 0; 72 - } 73 - 74 - static int schedstat_open(struct inode *inode, struct file *file) 75 - { 76 - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 77 - char *buf = kmalloc(size, GFP_KERNEL); 78 - struct seq_file *m; 79 - int res; 80 - 81 - if (!buf) 82 - return -ENOMEM; 83 - res = single_open(file, show_schedstat, NULL); 84 - if (!res) { 85 - m = file->private_data; 86 - m->buf = buf; 87 - m->size = size; 88 - } else 89 - kfree(buf); 90 - return res; 91 - } 92 - 93 - static const struct file_operations proc_schedstat_operations = { 94 - .open = schedstat_open, 95 - .read = seq_read, 96 - .llseek = seq_lseek, 97 - .release = single_release, 98 - }; 99 - 100 - static int __init proc_schedstat_init(void) 101 - { 102 - proc_create("schedstat", 0, NULL, &proc_schedstat_operations); 103 - return 0; 104 - } 105 - module_init(proc_schedstat_init); 106 3 107 4 /* 108 5 * Expects runqueue lock to be held for atomicity of update
+3 -1
kernel/sched_stoptask.c
··· 1 + #include "sched.h" 2 + 1 3 /* 2 4 * stop-task scheduling class. 3 5 * ··· 82 80 /* 83 81 * Simple, special scheduling class for the per-CPU stop tasks: 84 82 */ 85 - static const struct sched_class stop_sched_class = { 83 + const struct sched_class stop_sched_class = { 86 84 .next = &rt_sched_class, 87 85 88 86 .enqueue_task = enqueue_task_stop,