Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

-21

arch/x86/include/asm/context_tracking.h

··· 1 1 #ifndef _ASM_X86_CONTEXT_TRACKING_H 2 2 #define _ASM_X86_CONTEXT_TRACKING_H 3 3 4 - #ifndef __ASSEMBLY__ 5 - #include <linux/context_tracking.h> 6 - #include <asm/ptrace.h> 7 - 8 - static inline void exception_enter(struct pt_regs *regs) 9 - { 10 - user_exit(); 11 - } 12 - 13 - static inline void exception_exit(struct pt_regs *regs) 14 - { 15 - #ifdef CONFIG_CONTEXT_TRACKING 16 - if (user_mode(regs)) 17 - user_enter(); 18 - #endif 19 - } 20 - 21 - #else /* __ASSEMBLY__ */ 22 - 23 4 #ifdef CONFIG_CONTEXT_TRACKING 24 5 # define SCHEDULE_USER call schedule_user 25 6 #else 26 7 # define SCHEDULE_USER call schedule 27 8 #endif 28 - 29 - #endif /* !__ASSEMBLY__ */ 30 9 31 10 #endif

+5 -3

arch/x86/kernel/kvm.c

··· 20 20 * Authors: Anthony Liguori <aliguori@us.ibm.com> 21 21 */ 22 22 23 + #include <linux/context_tracking.h> 23 24 #include <linux/module.h> 24 25 #include <linux/kernel.h> 25 26 #include <linux/kvm_para.h> ··· 44 43 #include <asm/apicdef.h> 45 44 #include <asm/hypervisor.h> 46 45 #include <asm/kvm_guest.h> 47 - #include <asm/context_tracking.h> 48 46 49 47 static int kvmapf = 1; 50 48 ··· 254 254 dotraplinkage void __kprobes 255 255 do_async_page_fault(struct pt_regs *regs, unsigned long error_code) 256 256 { 257 + enum ctx_state prev_state; 258 + 257 259 switch (kvm_read_and_reset_pf_reason()) { 258 260 default: 259 261 do_page_fault(regs, error_code); 260 262 break; 261 263 case KVM_PV_REASON_PAGE_NOT_PRESENT: 262 264 /* page is swapped out by the host. */ 263 - exception_enter(regs); 265 + prev_state = exception_enter(); 264 266 exit_idle(); 265 267 kvm_async_pf_task_wait((u32)read_cr2()); 266 - exception_exit(regs); 268 + exception_exit(prev_state); 267 269 break; 268 270 case KVM_PV_REASON_PAGE_READY: 269 271 rcu_irq_enter();

+42 -26

arch/x86/kernel/traps.c

··· 12 12 13 13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 14 14 15 + #include <linux/context_tracking.h> 15 16 #include <linux/interrupt.h> 16 17 #include <linux/kallsyms.h> 17 18 #include <linux/spinlock.h> ··· 56 55 #include <asm/i387.h> 57 56 #include <asm/fpu-internal.h> 58 57 #include <asm/mce.h> 59 - #include <asm/context_tracking.h> 60 - 61 58 #include <asm/mach_traps.h> 62 59 63 60 #ifdef CONFIG_X86_64 ··· 175 176 #define DO_ERROR(trapnr, signr, str, name) \ 176 177 dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ 177 178 { \ 178 - exception_enter(regs); \ 179 + enum ctx_state prev_state; \ 180 + \ 181 + prev_state = exception_enter(); \ 179 182 if (notify_die(DIE_TRAP, str, regs, error_code, \ 180 183 trapnr, signr) == NOTIFY_STOP) { \ 181 - exception_exit(regs); \ 184 + exception_exit(prev_state); \ 182 185 return; \ 183 186 } \ 184 187 conditional_sti(regs); \ 185 188 do_trap(trapnr, signr, str, regs, error_code, NULL); \ 186 - exception_exit(regs); \ 189 + exception_exit(prev_state); \ 187 190 } 188 191 189 192 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 190 193 dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ 191 194 { \ 192 195 siginfo_t info; \ 196 + enum ctx_state prev_state; \ 197 + \ 193 198 info.si_signo = signr; \ 194 199 info.si_errno = 0; \ 195 200 info.si_code = sicode; \ 196 201 info.si_addr = (void __user *)siaddr; \ 197 - exception_enter(regs); \ 202 + prev_state = exception_enter(); \ 198 203 if (notify_die(DIE_TRAP, str, regs, error_code, \ 199 204 trapnr, signr) == NOTIFY_STOP) { \ 200 - exception_exit(regs); \ 205 + exception_exit(prev_state); \ 201 206 return; \ 202 207 } \ 203 208 conditional_sti(regs); \ 204 209 do_trap(trapnr, signr, str, regs, error_code, &info); \ 205 - exception_exit(regs); \ 210 + exception_exit(prev_state); \ 206 211 } 207 212 208 213 DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, ··· 229 226 /* Runs on IST stack */ 
230 227 dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) 231 228 { 232 - exception_enter(regs); 229 + enum ctx_state prev_state; 230 + 231 + prev_state = exception_enter(); 233 232 if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 234 233 X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { 235 234 preempt_conditional_sti(regs); 236 235 do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); 237 236 preempt_conditional_cli(regs); 238 237 } 239 - exception_exit(regs); 238 + exception_exit(prev_state); 240 239 } 241 240 242 241 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) ··· 246 241 static const char str[] = "double fault"; 247 242 struct task_struct *tsk = current; 248 243 249 - exception_enter(regs); 244 + exception_enter(); 250 245 /* Return not checked because double check cannot be ignored */ 251 246 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); 252 247 ··· 266 261 do_general_protection(struct pt_regs *regs, long error_code) 267 262 { 268 263 struct task_struct *tsk; 264 + enum ctx_state prev_state; 269 265 270 - exception_enter(regs); 266 + prev_state = exception_enter(); 271 267 conditional_sti(regs); 272 268 273 269 #ifdef CONFIG_X86_32 ··· 306 300 307 301 force_sig(SIGSEGV, tsk); 308 302 exit: 309 - exception_exit(regs); 303 + exception_exit(prev_state); 310 304 } 311 305 312 306 /* May run on IST stack. */ 313 307 dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) 314 308 { 309 + enum ctx_state prev_state; 310 + 315 311 #ifdef CONFIG_DYNAMIC_FTRACE 316 312 /* 317 313 * ftrace must be first, everything else may cause a recursive crash. 
··· 323 315 ftrace_int3_handler(regs)) 324 316 return; 325 317 #endif 326 - exception_enter(regs); 318 + prev_state = exception_enter(); 327 319 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 328 320 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 329 321 SIGTRAP) == NOTIFY_STOP) ··· 344 336 preempt_conditional_cli(regs); 345 337 debug_stack_usage_dec(); 346 338 exit: 347 - exception_exit(regs); 339 + exception_exit(prev_state); 348 340 } 349 341 350 342 #ifdef CONFIG_X86_64 ··· 401 393 dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 402 394 { 403 395 struct task_struct *tsk = current; 396 + enum ctx_state prev_state; 404 397 int user_icebp = 0; 405 398 unsigned long dr6; 406 399 int si_code; 407 400 408 - exception_enter(regs); 401 + prev_state = exception_enter(); 409 402 410 403 get_debugreg(dr6, 6); 411 404 ··· 476 467 debug_stack_usage_dec(); 477 468 478 469 exit: 479 - exception_exit(regs); 470 + exception_exit(prev_state); 480 471 } 481 472 482 473 /* ··· 570 561 571 562 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 572 563 { 573 - exception_enter(regs); 564 + enum ctx_state prev_state; 565 + 566 + prev_state = exception_enter(); 574 567 math_error(regs, error_code, X86_TRAP_MF); 575 - exception_exit(regs); 568 + exception_exit(prev_state); 576 569 } 577 570 578 571 dotraplinkage void 579 572 do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 580 573 { 581 - exception_enter(regs); 574 + enum ctx_state prev_state; 575 + 576 + prev_state = exception_enter(); 582 577 math_error(regs, error_code, X86_TRAP_XF); 583 - exception_exit(regs); 578 + exception_exit(prev_state); 584 579 } 585 580 586 581 dotraplinkage void ··· 652 639 dotraplinkage void __kprobes 653 640 do_device_not_available(struct pt_regs *regs, long error_code) 654 641 { 655 - exception_enter(regs); 642 + enum ctx_state prev_state; 643 + 644 + prev_state = exception_enter(); 656 645 BUG_ON(use_eager_fpu()); 657 646 658 
647 #ifdef CONFIG_MATH_EMULATION ··· 665 650 666 651 info.regs = regs; 667 652 math_emulate(&info); 668 - exception_exit(regs); 653 + exception_exit(prev_state); 669 654 return; 670 655 } 671 656 #endif ··· 673 658 #ifdef CONFIG_X86_32 674 659 conditional_sti(regs); 675 660 #endif 676 - exception_exit(regs); 661 + exception_exit(prev_state); 677 662 } 678 663 679 664 #ifdef CONFIG_X86_32 680 665 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) 681 666 { 682 667 siginfo_t info; 668 + enum ctx_state prev_state; 683 669 684 - exception_enter(regs); 670 + prev_state = exception_enter(); 685 671 local_irq_enable(); 686 672 687 673 info.si_signo = SIGILL; ··· 694 678 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, 695 679 &info); 696 680 } 697 - exception_exit(regs); 681 + exception_exit(prev_state); 698 682 } 699 683 #endif 700 684

+5 -3

arch/x86/mm/fault.c

··· 13 13 #include <linux/perf_event.h> /* perf_sw_event */ 14 14 #include <linux/hugetlb.h> /* hstate_index_to_shift */ 15 15 #include <linux/prefetch.h> /* prefetchw */ 16 + #include <linux/context_tracking.h> /* exception_enter(), ... */ 16 17 17 18 #include <asm/traps.h> /* dotraplinkage, ... */ 18 19 #include <asm/pgalloc.h> /* pgd_*(), ... */ 19 20 #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ 20 21 #include <asm/fixmap.h> /* VSYSCALL_START */ 21 - #include <asm/context_tracking.h> /* exception_enter(), ... */ 22 22 23 23 /* 24 24 * Page fault error code bits: ··· 1224 1224 dotraplinkage void __kprobes 1225 1225 do_page_fault(struct pt_regs *regs, unsigned long error_code) 1226 1226 { 1227 - exception_enter(regs); 1227 + enum ctx_state prev_state; 1228 + 1229 + prev_state = exception_enter(); 1228 1230 __do_page_fault(regs, error_code); 1229 - exception_exit(regs); 1231 + exception_exit(prev_state); 1230 1232 }

-1

include/linux/cgroup.h

··· 586 586 void (*bind)(struct cgroup *root); 587 587 588 588 int subsys_id; 589 - int active; 590 589 int disabled; 591 590 int early_init; 592 591 /*

+22 -2

include/linux/context_tracking.h

··· 1 1 #ifndef _LINUX_CONTEXT_TRACKING_H 2 2 #define _LINUX_CONTEXT_TRACKING_H 3 3 4 - #ifdef CONFIG_CONTEXT_TRACKING 5 4 #include <linux/sched.h> 6 5 #include <linux/percpu.h> 6 + #include <asm/ptrace.h> 7 7 8 8 struct context_tracking { 9 9 /* ··· 13 13 * may be further optimized using static keys. 14 14 */ 15 15 bool active; 16 - enum { 16 + enum ctx_state { 17 17 IN_KERNEL = 0, 18 18 IN_USER, 19 19 } state; 20 20 }; 21 21 22 + #ifdef CONFIG_CONTEXT_TRACKING 22 23 DECLARE_PER_CPU(struct context_tracking, context_tracking); 23 24 24 25 static inline bool context_tracking_in_user(void) ··· 34 33 35 34 extern void user_enter(void); 36 35 extern void user_exit(void); 36 + 37 + static inline enum ctx_state exception_enter(void) 38 + { 39 + enum ctx_state prev_ctx; 40 + 41 + prev_ctx = this_cpu_read(context_tracking.state); 42 + user_exit(); 43 + 44 + return prev_ctx; 45 + } 46 + 47 + static inline void exception_exit(enum ctx_state prev_ctx) 48 + { 49 + if (prev_ctx == IN_USER) 50 + user_enter(); 51 + } 52 + 37 53 extern void context_tracking_task_switch(struct task_struct *prev, 38 54 struct task_struct *next); 39 55 #else 40 56 static inline bool context_tracking_in_user(void) { return false; } 41 57 static inline void user_enter(void) { } 42 58 static inline void user_exit(void) { } 59 + static inline enum ctx_state exception_enter(void) { return 0; } 60 + static inline void exception_exit(enum ctx_state prev_ctx) { } 43 61 static inline void context_tracking_task_switch(struct task_struct *prev, 44 62 struct task_struct *next) { } 45 63 #endif /* !CONFIG_CONTEXT_TRACKING */

+18 -1

include/linux/math64.h

··· 30 30 } 31 31 32 32 /** 33 + * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder 34 + */ 35 + static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) 36 + { 37 + *remainder = dividend % divisor; 38 + return dividend / divisor; 39 + } 40 + 41 + /** 33 42 * div64_u64 - unsigned 64bit divide with 64bit divisor 34 43 */ 35 44 static inline u64 div64_u64(u64 dividend, u64 divisor) ··· 70 61 extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); 71 62 #endif 72 63 64 + #ifndef div64_u64_rem 65 + extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder); 66 + #endif 67 + 73 68 #ifndef div64_u64 74 - extern u64 div64_u64(u64 dividend, u64 divisor); 69 + static inline u64 div64_u64(u64 dividend, u64 divisor) 70 + { 71 + u64 remainder; 72 + return div64_u64_rem(dividend, divisor, &remainder); 73 + } 75 74 #endif 76 75 77 76 #ifndef div64_s64

+9 -195

include/linux/sched.h

··· 127 127 extern void proc_sched_set_task(struct task_struct *p); 128 128 extern void 129 129 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); 130 - #else 131 - static inline void 132 - proc_sched_show_task(struct task_struct *p, struct seq_file *m) 133 - { 134 - } 135 - static inline void proc_sched_set_task(struct task_struct *p) 136 - { 137 - } 138 - static inline void 139 - print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 140 - { 141 - } 142 130 #endif 143 131 144 132 /* ··· 558 570 cputime_t utime, stime, cutime, cstime; 559 571 cputime_t gtime; 560 572 cputime_t cgtime; 561 - #ifndef CONFIG_VIRT_CPU_ACCOUNTING 573 + #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 562 574 struct cputime prev_cputime; 563 575 #endif 564 576 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; ··· 756 768 }; 757 769 758 770 /* 759 - * Increase resolution of nice-level calculations for 64-bit architectures. 760 - * The extra resolution improves shares distribution and load balancing of 761 - * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup 762 - * hierarchies, especially on larger systems. This is not a user-visible change 763 - * and does not change the user-interface for setting shares/weights. 764 - * 765 - * We increase resolution only if we have enough bits to allow this increased 766 - * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution 767 - * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the 768 - * increased costs. 
769 - */ 770 - #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ 771 - # define SCHED_LOAD_RESOLUTION 10 772 - # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) 773 - # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) 774 - #else 775 - # define SCHED_LOAD_RESOLUTION 0 776 - # define scale_load(w) (w) 777 - # define scale_load_down(w) (w) 778 - #endif 779 - 780 - #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) 781 - #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) 782 - 783 - /* 784 771 * Increase resolution of cpu_power calculations 785 772 */ 786 773 #define SCHED_POWER_SHIFT 10 ··· 780 817 781 818 extern int __weak arch_sd_sibiling_asym_packing(void); 782 819 783 - struct sched_group_power { 784 - atomic_t ref; 785 - /* 786 - * CPU power of this group, SCHED_LOAD_SCALE being max power for a 787 - * single CPU. 788 - */ 789 - unsigned int power, power_orig; 790 - unsigned long next_update; 791 - /* 792 - * Number of busy cpus in this group. 793 - */ 794 - atomic_t nr_busy_cpus; 795 - 796 - unsigned long cpumask[0]; /* iteration mask */ 797 - }; 798 - 799 - struct sched_group { 800 - struct sched_group *next; /* Must be a circular list */ 801 - atomic_t ref; 802 - 803 - unsigned int group_weight; 804 - struct sched_group_power *sgp; 805 - 806 - /* 807 - * The CPUs this group covers. 808 - * 809 - * NOTE: this field is variable length. (Allocated dynamically 810 - * by attaching extra space to the end of the structure, 811 - * depending on how many CPUs the kernel has booted up with) 812 - */ 813 - unsigned long cpumask[0]; 814 - }; 815 - 816 - static inline struct cpumask *sched_group_cpus(struct sched_group *sg) 817 - { 818 - return to_cpumask(sg->cpumask); 819 - } 820 - 821 - /* 822 - * cpumask masking which cpus in the group are allowed to iterate up the domain 823 - * tree. 
824 - */ 825 - static inline struct cpumask *sched_group_mask(struct sched_group *sg) 826 - { 827 - return to_cpumask(sg->sgp->cpumask); 828 - } 829 - 830 - /** 831 - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 832 - * @group: The group whose first cpu is to be returned. 833 - */ 834 - static inline unsigned int group_first_cpu(struct sched_group *group) 835 - { 836 - return cpumask_first(sched_group_cpus(group)); 837 - } 838 - 839 820 struct sched_domain_attr { 840 821 int relax_domain_level; 841 822 }; ··· 789 882 } 790 883 791 884 extern int sched_domain_level_max; 885 + 886 + struct sched_group; 792 887 793 888 struct sched_domain { 794 889 /* These fields must be setup */ ··· 808 899 unsigned int wake_idx; 809 900 unsigned int forkexec_idx; 810 901 unsigned int smt_gain; 902 + 903 + int nohz_idle; /* NOHZ IDLE status */ 811 904 int flags; /* See SD_* */ 812 905 int level; 813 906 ··· 882 971 cpumask_var_t *alloc_sched_domains(unsigned int ndoms); 883 972 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); 884 973 885 - /* Test a flag in parent sched domain */ 886 - static inline int test_sd_parent(struct sched_domain *sd, int flag) 887 - { 888 - if (sd->parent && (sd->parent->flags & flag)) 889 - return 1; 890 - 891 - return 0; 892 - } 893 - 894 - unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); 895 - unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); 896 - 897 974 bool cpus_share_cache(int this_cpu, int that_cpu); 898 975 899 976 #else /* CONFIG_SMP */ ··· 915 1016 struct mempolicy; 916 1017 struct pipe_inode_info; 917 1018 struct uts_namespace; 918 - 919 - struct rq; 920 - struct sched_domain; 921 - 922 - /* 923 - * wake flags 924 - */ 925 - #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ 926 - #define WF_FORK 0x02 /* child wakeup after fork */ 927 - #define WF_MIGRATED 0x04 /* internal use, task got migrated */ 928 - 929 - #define ENQUEUE_WAKEUP 1 930 - 
#define ENQUEUE_HEAD 2 931 - #ifdef CONFIG_SMP 932 - #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ 933 - #else 934 - #define ENQUEUE_WAKING 0 935 - #endif 936 - 937 - #define DEQUEUE_SLEEP 1 938 - 939 - struct sched_class { 940 - const struct sched_class *next; 941 - 942 - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 943 - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 944 - void (*yield_task) (struct rq *rq); 945 - bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); 946 - 947 - void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 948 - 949 - struct task_struct * (*pick_next_task) (struct rq *rq); 950 - void (*put_prev_task) (struct rq *rq, struct task_struct *p); 951 - 952 - #ifdef CONFIG_SMP 953 - int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 954 - void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 955 - 956 - void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 957 - void (*post_schedule) (struct rq *this_rq); 958 - void (*task_waking) (struct task_struct *task); 959 - void (*task_woken) (struct rq *this_rq, struct task_struct *task); 960 - 961 - void (*set_cpus_allowed)(struct task_struct *p, 962 - const struct cpumask *newmask); 963 - 964 - void (*rq_online)(struct rq *rq); 965 - void (*rq_offline)(struct rq *rq); 966 - #endif 967 - 968 - void (*set_curr_task) (struct rq *rq); 969 - void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 970 - void (*task_fork) (struct task_struct *p); 971 - 972 - void (*switched_from) (struct rq *this_rq, struct task_struct *task); 973 - void (*switched_to) (struct rq *this_rq, struct task_struct *task); 974 - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 975 - int oldprio); 976 - 977 - unsigned int (*get_rr_interval) (struct rq *rq, 978 - struct task_struct *task); 979 - 980 - #ifdef CONFIG_FAIR_GROUP_SCHED 981 - void 
(*task_move_group) (struct task_struct *p, int on_rq); 982 - #endif 983 - }; 984 1019 985 1020 struct load_weight { 986 1021 unsigned long weight, inv_weight; ··· 1107 1274 int exit_code, exit_signal; 1108 1275 int pdeath_signal; /* The signal sent when the parent dies */ 1109 1276 unsigned int jobctl; /* JOBCTL_*, siglock protected */ 1110 - /* ??? */ 1277 + 1278 + /* Used for emulating ABI behavior of previous Linux versions */ 1111 1279 unsigned int personality; 1280 + 1112 1281 unsigned did_exec:1; 1113 1282 unsigned in_execve:1; /* Tell the LSMs that the process is doing an 1114 1283 * execve */ ··· 1162 1327 1163 1328 cputime_t utime, stime, utimescaled, stimescaled; 1164 1329 cputime_t gtime; 1165 - #ifndef CONFIG_VIRT_CPU_ACCOUNTING 1330 + #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 1166 1331 struct cputime prev_cputime; 1167 1332 #endif 1168 1333 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN ··· 2516 2681 extern long sched_getaffinity(pid_t pid, struct cpumask *mask); 2517 2682 2518 2683 #ifdef CONFIG_CGROUP_SCHED 2519 - 2520 2684 extern struct task_group root_task_group; 2521 - 2522 - extern struct task_group *sched_create_group(struct task_group *parent); 2523 - extern void sched_online_group(struct task_group *tg, 2524 - struct task_group *parent); 2525 - extern void sched_destroy_group(struct task_group *tg); 2526 - extern void sched_offline_group(struct task_group *tg); 2527 - extern void sched_move_task(struct task_struct *tsk); 2528 - #ifdef CONFIG_FAIR_GROUP_SCHED 2529 - extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 2530 - extern unsigned long sched_group_shares(struct task_group *tg); 2531 - #endif 2532 - #ifdef CONFIG_RT_GROUP_SCHED 2533 - extern int sched_group_set_rt_runtime(struct task_group *tg, 2534 - long rt_runtime_us); 2535 - extern long sched_group_rt_runtime(struct task_group *tg); 2536 - extern int sched_group_set_rt_period(struct task_group *tg, 2537 - long rt_period_us); 2538 - extern long 
sched_group_rt_period(struct task_group *tg); 2539 - extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); 2540 - #endif 2541 2685 #endif /* CONFIG_CGROUP_SCHED */ 2542 2686 2543 2687 extern int task_can_switch_user(struct user_struct *up,

+1

init/Kconfig

··· 505 505 config CONTEXT_TRACKING_FORCE 506 506 bool "Force context tracking" 507 507 depends on CONTEXT_TRACKING 508 + default CONTEXT_TRACKING 508 509 help 509 510 Probe on user/kernel boundaries by default in order to 510 511 test the features that rely on it such as userspace RCU extended

-3

kernel/cgroup.c

··· 4380 4380 * need to invoke fork callbacks here. */ 4381 4381 BUG_ON(!list_empty(&init_task.tasks)); 4382 4382 4383 - ss->active = 1; 4384 4383 BUG_ON(online_css(ss, dummytop)); 4385 4384 4386 4385 mutex_unlock(&cgroup_mutex); ··· 4484 4485 } 4485 4486 write_unlock(&css_set_lock); 4486 4487 4487 - ss->active = 1; 4488 4488 ret = online_css(ss, dummytop); 4489 4489 if (ret) 4490 4490 goto err_unload; ··· 4524 4526 mutex_lock(&cgroup_mutex); 4525 4527 4526 4528 offline_css(ss, dummytop); 4527 - ss->active = 0; 4528 4529 4529 4530 if (ss->use_id) 4530 4531 idr_destroy(&ss->idr);

+1 -1

kernel/fork.c

··· 1233 1233 1234 1234 p->utime = p->stime = p->gtime = 0; 1235 1235 p->utimescaled = p->stimescaled = 0; 1236 - #ifndef CONFIG_VIRT_CPU_ACCOUNTING 1236 + #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 1237 1237 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1238 1238 #endif 1239 1239 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN

+1

kernel/sched/Makefile

··· 16 16 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 17 17 obj-$(CONFIG_SCHEDSTATS) += stats.o 18 18 obj-$(CONFIG_SCHED_DEBUG) += debug.o 19 + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o

+18 -236

kernel/sched/core.c

··· 1288 1288 static void 1289 1289 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1290 1290 { 1291 - trace_sched_wakeup(p, true); 1292 1291 check_preempt_curr(rq, p, wake_flags); 1292 + trace_sched_wakeup(p, true); 1293 1293 1294 1294 p->state = TASK_RUNNING; 1295 1295 #ifdef CONFIG_SMP ··· 3039 3039 asmlinkage void __sched preempt_schedule_irq(void) 3040 3040 { 3041 3041 struct thread_info *ti = current_thread_info(); 3042 + enum ctx_state prev_state; 3042 3043 3043 3044 /* Catch callers which need to be fixed */ 3044 3045 BUG_ON(ti->preempt_count || !irqs_disabled()); 3045 3046 3046 - user_exit(); 3047 + prev_state = exception_enter(); 3048 + 3047 3049 do { 3048 3050 add_preempt_count(PREEMPT_ACTIVE); 3049 3051 local_irq_enable(); ··· 3059 3057 */ 3060 3058 barrier(); 3061 3059 } while (need_resched()); 3060 + 3061 + exception_exit(prev_state); 3062 3062 } 3063 3063 3064 3064 #endif /* CONFIG_PREEMPT */ ··· 6208 6204 * 'level' contains the number of unique distances, excluding the 6209 6205 * identity distance node_distance(i,i). 6210 6206 * 6211 - * The sched_domains_nume_distance[] array includes the actual distance 6207 + * The sched_domains_numa_distance[] array includes the actual distance 6212 6208 * numbers. 6213 6209 */ 6214 6210 ··· 6821 6817 } 6822 6818 6823 6819 #ifdef CONFIG_CGROUP_SCHED 6820 + /* 6821 + * Default task group. 6822 + * Every task in system belongs to this group at bootup. 
6823 + */ 6824 6824 struct task_group root_task_group; 6825 6825 LIST_HEAD(task_groups); 6826 6826 #endif 6827 6827 6828 - DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6828 + DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 6829 6829 6830 6830 void __init sched_init(void) 6831 6831 { ··· 6866 6858 #endif /* CONFIG_RT_GROUP_SCHED */ 6867 6859 #ifdef CONFIG_CPUMASK_OFFSTACK 6868 6860 for_each_possible_cpu(i) { 6869 - per_cpu(load_balance_tmpmask, i) = (void *)ptr; 6861 + per_cpu(load_balance_mask, i) = (void *)ptr; 6870 6862 ptr += cpumask_size(); 6871 6863 } 6872 6864 #endif /* CONFIG_CPUMASK_OFFSTACK */ ··· 6892 6884 6893 6885 #endif /* CONFIG_CGROUP_SCHED */ 6894 6886 6895 - #ifdef CONFIG_CGROUP_CPUACCT 6896 - root_cpuacct.cpustat = &kernel_cpustat; 6897 - root_cpuacct.cpuusage = alloc_percpu(u64); 6898 - /* Too early, not expected to fail */ 6899 - BUG_ON(!root_cpuacct.cpuusage); 6900 - #endif 6901 6887 for_each_possible_cpu(i) { 6902 6888 struct rq *rq; 6903 6889 ··· 7413 7411 return err; 7414 7412 } 7415 7413 7416 - int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7414 + static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7417 7415 { 7418 7416 u64 rt_runtime, rt_period; 7419 7417 ··· 7425 7423 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7426 7424 } 7427 7425 7428 - long sched_group_rt_runtime(struct task_group *tg) 7426 + static long sched_group_rt_runtime(struct task_group *tg) 7429 7427 { 7430 7428 u64 rt_runtime_us; 7431 7429 ··· 7437 7435 return rt_runtime_us; 7438 7436 } 7439 7437 7440 - int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7438 + static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7441 7439 { 7442 7440 u64 rt_runtime, rt_period; 7443 7441 ··· 7450 7448 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7451 7449 } 7452 7450 7453 - long sched_group_rt_period(struct task_group *tg) 7451 + static long 
sched_group_rt_period(struct task_group *tg) 7454 7452 { 7455 7453 u64 rt_period_us; 7456 7454 ··· 7485 7483 return ret; 7486 7484 } 7487 7485 7488 - int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7486 + static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7489 7487 { 7490 7488 /* Don't accept realtime tasks when there is no way for them to run */ 7491 7489 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) ··· 7992 7990 }; 7993 7991 7994 7992 #endif /* CONFIG_CGROUP_SCHED */ 7995 - 7996 - #ifdef CONFIG_CGROUP_CPUACCT 7997 - 7998 - /* 7999 - * CPU accounting code for task groups. 8000 - * 8001 - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 8002 - * (balbir@in.ibm.com). 8003 - */ 8004 - 8005 - struct cpuacct root_cpuacct; 8006 - 8007 - /* create a new cpu accounting group */ 8008 - static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) 8009 - { 8010 - struct cpuacct *ca; 8011 - 8012 - if (!cgrp->parent) 8013 - return &root_cpuacct.css; 8014 - 8015 - ca = kzalloc(sizeof(*ca), GFP_KERNEL); 8016 - if (!ca) 8017 - goto out; 8018 - 8019 - ca->cpuusage = alloc_percpu(u64); 8020 - if (!ca->cpuusage) 8021 - goto out_free_ca; 8022 - 8023 - ca->cpustat = alloc_percpu(struct kernel_cpustat); 8024 - if (!ca->cpustat) 8025 - goto out_free_cpuusage; 8026 - 8027 - return &ca->css; 8028 - 8029 - out_free_cpuusage: 8030 - free_percpu(ca->cpuusage); 8031 - out_free_ca: 8032 - kfree(ca); 8033 - out: 8034 - return ERR_PTR(-ENOMEM); 8035 - } 8036 - 8037 - /* destroy an existing cpu accounting group */ 8038 - static void cpuacct_css_free(struct cgroup *cgrp) 8039 - { 8040 - struct cpuacct *ca = cgroup_ca(cgrp); 8041 - 8042 - free_percpu(ca->cpustat); 8043 - free_percpu(ca->cpuusage); 8044 - kfree(ca); 8045 - } 8046 - 8047 - static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 8048 - { 8049 - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8050 - u64 data; 8051 - 8052 - #ifndef 
CONFIG_64BIT 8053 - /* 8054 - * Take rq->lock to make 64-bit read safe on 32-bit platforms. 8055 - */ 8056 - raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8057 - data = *cpuusage; 8058 - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8059 - #else 8060 - data = *cpuusage; 8061 - #endif 8062 - 8063 - return data; 8064 - } 8065 - 8066 - static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 8067 - { 8068 - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8069 - 8070 - #ifndef CONFIG_64BIT 8071 - /* 8072 - * Take rq->lock to make 64-bit write safe on 32-bit platforms. 8073 - */ 8074 - raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8075 - *cpuusage = val; 8076 - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8077 - #else 8078 - *cpuusage = val; 8079 - #endif 8080 - } 8081 - 8082 - /* return total cpu usage (in nanoseconds) of a group */ 8083 - static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 8084 - { 8085 - struct cpuacct *ca = cgroup_ca(cgrp); 8086 - u64 totalcpuusage = 0; 8087 - int i; 8088 - 8089 - for_each_present_cpu(i) 8090 - totalcpuusage += cpuacct_cpuusage_read(ca, i); 8091 - 8092 - return totalcpuusage; 8093 - } 8094 - 8095 - static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 8096 - u64 reset) 8097 - { 8098 - struct cpuacct *ca = cgroup_ca(cgrp); 8099 - int err = 0; 8100 - int i; 8101 - 8102 - if (reset) { 8103 - err = -EINVAL; 8104 - goto out; 8105 - } 8106 - 8107 - for_each_present_cpu(i) 8108 - cpuacct_cpuusage_write(ca, i, 0); 8109 - 8110 - out: 8111 - return err; 8112 - } 8113 - 8114 - static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 8115 - struct seq_file *m) 8116 - { 8117 - struct cpuacct *ca = cgroup_ca(cgroup); 8118 - u64 percpu; 8119 - int i; 8120 - 8121 - for_each_present_cpu(i) { 8122 - percpu = cpuacct_cpuusage_read(ca, i); 8123 - seq_printf(m, "%llu ", (unsigned long long) percpu); 8124 - } 8125 - seq_printf(m, "\n"); 8126 - return 0; 8127 - } 8128 - 8129 - static const char 
*cpuacct_stat_desc[] = { 8130 - [CPUACCT_STAT_USER] = "user", 8131 - [CPUACCT_STAT_SYSTEM] = "system", 8132 - }; 8133 - 8134 - static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8135 - struct cgroup_map_cb *cb) 8136 - { 8137 - struct cpuacct *ca = cgroup_ca(cgrp); 8138 - int cpu; 8139 - s64 val = 0; 8140 - 8141 - for_each_online_cpu(cpu) { 8142 - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 8143 - val += kcpustat->cpustat[CPUTIME_USER]; 8144 - val += kcpustat->cpustat[CPUTIME_NICE]; 8145 - } 8146 - val = cputime64_to_clock_t(val); 8147 - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 8148 - 8149 - val = 0; 8150 - for_each_online_cpu(cpu) { 8151 - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 8152 - val += kcpustat->cpustat[CPUTIME_SYSTEM]; 8153 - val += kcpustat->cpustat[CPUTIME_IRQ]; 8154 - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; 8155 - } 8156 - 8157 - val = cputime64_to_clock_t(val); 8158 - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 8159 - 8160 - return 0; 8161 - } 8162 - 8163 - static struct cftype files[] = { 8164 - { 8165 - .name = "usage", 8166 - .read_u64 = cpuusage_read, 8167 - .write_u64 = cpuusage_write, 8168 - }, 8169 - { 8170 - .name = "usage_percpu", 8171 - .read_seq_string = cpuacct_percpu_seq_read, 8172 - }, 8173 - { 8174 - .name = "stat", 8175 - .read_map = cpuacct_stats_show, 8176 - }, 8177 - { } /* terminate */ 8178 - }; 8179 - 8180 - /* 8181 - * charge this task's execution time to its accounting group. 8182 - * 8183 - * called with rq->lock held. 
8184 - */ 8185 - void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8186 - { 8187 - struct cpuacct *ca; 8188 - int cpu; 8189 - 8190 - if (unlikely(!cpuacct_subsys.active)) 8191 - return; 8192 - 8193 - cpu = task_cpu(tsk); 8194 - 8195 - rcu_read_lock(); 8196 - 8197 - ca = task_ca(tsk); 8198 - 8199 - for (; ca; ca = parent_ca(ca)) { 8200 - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8201 - *cpuusage += cputime; 8202 - } 8203 - 8204 - rcu_read_unlock(); 8205 - } 8206 - 8207 - struct cgroup_subsys cpuacct_subsys = { 8208 - .name = "cpuacct", 8209 - .css_alloc = cpuacct_css_alloc, 8210 - .css_free = cpuacct_css_free, 8211 - .subsys_id = cpuacct_subsys_id, 8212 - .base_cftypes = files, 8213 - }; 8214 - #endif /* CONFIG_CGROUP_CPUACCT */ 8215 7993 8216 7994 void dump_cpu_task(int cpu) 8217 7995 {

+296

kernel/sched/cpuacct.c

··· 1 + #include <linux/cgroup.h> 2 + #include <linux/slab.h> 3 + #include <linux/percpu.h> 4 + #include <linux/spinlock.h> 5 + #include <linux/cpumask.h> 6 + #include <linux/seq_file.h> 7 + #include <linux/rcupdate.h> 8 + #include <linux/kernel_stat.h> 9 + #include <linux/err.h> 10 + 11 + #include "sched.h" 12 + 13 + /* 14 + * CPU accounting code for task groups. 15 + * 16 + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 17 + * (balbir@in.ibm.com). 18 + */ 19 + 20 + /* Time spent by the tasks of the cpu accounting group executing in ... */ 21 + enum cpuacct_stat_index { 22 + CPUACCT_STAT_USER, /* ... user mode */ 23 + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 24 + 25 + CPUACCT_STAT_NSTATS, 26 + }; 27 + 28 + /* track cpu usage of a group of tasks and its child groups */ 29 + struct cpuacct { 30 + struct cgroup_subsys_state css; 31 + /* cpuusage holds pointer to a u64-type object on every cpu */ 32 + u64 __percpu *cpuusage; 33 + struct kernel_cpustat __percpu *cpustat; 34 + }; 35 + 36 + /* return cpu accounting group corresponding to this container */ 37 + static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 38 + { 39 + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 40 + struct cpuacct, css); 41 + } 42 + 43 + /* return cpu accounting group to which this task belongs */ 44 + static inline struct cpuacct *task_ca(struct task_struct *tsk) 45 + { 46 + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 47 + struct cpuacct, css); 48 + } 49 + 50 + static inline struct cpuacct *__parent_ca(struct cpuacct *ca) 51 + { 52 + return cgroup_ca(ca->css.cgroup->parent); 53 + } 54 + 55 + static inline struct cpuacct *parent_ca(struct cpuacct *ca) 56 + { 57 + if (!ca->css.cgroup->parent) 58 + return NULL; 59 + return cgroup_ca(ca->css.cgroup->parent); 60 + } 61 + 62 + static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 63 + static struct cpuacct root_cpuacct = { 64 + .cpustat = &kernel_cpustat, 65 + .cpuusage = 
&root_cpuacct_cpuusage, 66 + }; 67 + 68 + /* create a new cpu accounting group */ 69 + static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) 70 + { 71 + struct cpuacct *ca; 72 + 73 + if (!cgrp->parent) 74 + return &root_cpuacct.css; 75 + 76 + ca = kzalloc(sizeof(*ca), GFP_KERNEL); 77 + if (!ca) 78 + goto out; 79 + 80 + ca->cpuusage = alloc_percpu(u64); 81 + if (!ca->cpuusage) 82 + goto out_free_ca; 83 + 84 + ca->cpustat = alloc_percpu(struct kernel_cpustat); 85 + if (!ca->cpustat) 86 + goto out_free_cpuusage; 87 + 88 + return &ca->css; 89 + 90 + out_free_cpuusage: 91 + free_percpu(ca->cpuusage); 92 + out_free_ca: 93 + kfree(ca); 94 + out: 95 + return ERR_PTR(-ENOMEM); 96 + } 97 + 98 + /* destroy an existing cpu accounting group */ 99 + static void cpuacct_css_free(struct cgroup *cgrp) 100 + { 101 + struct cpuacct *ca = cgroup_ca(cgrp); 102 + 103 + free_percpu(ca->cpustat); 104 + free_percpu(ca->cpuusage); 105 + kfree(ca); 106 + } 107 + 108 + static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 109 + { 110 + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 111 + u64 data; 112 + 113 + #ifndef CONFIG_64BIT 114 + /* 115 + * Take rq->lock to make 64-bit read safe on 32-bit platforms. 116 + */ 117 + raw_spin_lock_irq(&cpu_rq(cpu)->lock); 118 + data = *cpuusage; 119 + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 120 + #else 121 + data = *cpuusage; 122 + #endif 123 + 124 + return data; 125 + } 126 + 127 + static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 128 + { 129 + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 130 + 131 + #ifndef CONFIG_64BIT 132 + /* 133 + * Take rq->lock to make 64-bit write safe on 32-bit platforms. 
134 + */ 135 + raw_spin_lock_irq(&cpu_rq(cpu)->lock); 136 + *cpuusage = val; 137 + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 138 + #else 139 + *cpuusage = val; 140 + #endif 141 + } 142 + 143 + /* return total cpu usage (in nanoseconds) of a group */ 144 + static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 145 + { 146 + struct cpuacct *ca = cgroup_ca(cgrp); 147 + u64 totalcpuusage = 0; 148 + int i; 149 + 150 + for_each_present_cpu(i) 151 + totalcpuusage += cpuacct_cpuusage_read(ca, i); 152 + 153 + return totalcpuusage; 154 + } 155 + 156 + static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 157 + u64 reset) 158 + { 159 + struct cpuacct *ca = cgroup_ca(cgrp); 160 + int err = 0; 161 + int i; 162 + 163 + if (reset) { 164 + err = -EINVAL; 165 + goto out; 166 + } 167 + 168 + for_each_present_cpu(i) 169 + cpuacct_cpuusage_write(ca, i, 0); 170 + 171 + out: 172 + return err; 173 + } 174 + 175 + static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 176 + struct seq_file *m) 177 + { 178 + struct cpuacct *ca = cgroup_ca(cgroup); 179 + u64 percpu; 180 + int i; 181 + 182 + for_each_present_cpu(i) { 183 + percpu = cpuacct_cpuusage_read(ca, i); 184 + seq_printf(m, "%llu ", (unsigned long long) percpu); 185 + } 186 + seq_printf(m, "\n"); 187 + return 0; 188 + } 189 + 190 + static const char * const cpuacct_stat_desc[] = { 191 + [CPUACCT_STAT_USER] = "user", 192 + [CPUACCT_STAT_SYSTEM] = "system", 193 + }; 194 + 195 + static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 196 + struct cgroup_map_cb *cb) 197 + { 198 + struct cpuacct *ca = cgroup_ca(cgrp); 199 + int cpu; 200 + s64 val = 0; 201 + 202 + for_each_online_cpu(cpu) { 203 + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 204 + val += kcpustat->cpustat[CPUTIME_USER]; 205 + val += kcpustat->cpustat[CPUTIME_NICE]; 206 + } 207 + val = cputime64_to_clock_t(val); 208 + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 209 + 210 + val = 
0; 211 + for_each_online_cpu(cpu) { 212 + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 213 + val += kcpustat->cpustat[CPUTIME_SYSTEM]; 214 + val += kcpustat->cpustat[CPUTIME_IRQ]; 215 + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; 216 + } 217 + 218 + val = cputime64_to_clock_t(val); 219 + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 220 + 221 + return 0; 222 + } 223 + 224 + static struct cftype files[] = { 225 + { 226 + .name = "usage", 227 + .read_u64 = cpuusage_read, 228 + .write_u64 = cpuusage_write, 229 + }, 230 + { 231 + .name = "usage_percpu", 232 + .read_seq_string = cpuacct_percpu_seq_read, 233 + }, 234 + { 235 + .name = "stat", 236 + .read_map = cpuacct_stats_show, 237 + }, 238 + { } /* terminate */ 239 + }; 240 + 241 + /* 242 + * charge this task's execution time to its accounting group. 243 + * 244 + * called with rq->lock held. 245 + */ 246 + void cpuacct_charge(struct task_struct *tsk, u64 cputime) 247 + { 248 + struct cpuacct *ca; 249 + int cpu; 250 + 251 + cpu = task_cpu(tsk); 252 + 253 + rcu_read_lock(); 254 + 255 + ca = task_ca(tsk); 256 + 257 + while (true) { 258 + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 259 + *cpuusage += cputime; 260 + 261 + ca = parent_ca(ca); 262 + if (!ca) 263 + break; 264 + } 265 + 266 + rcu_read_unlock(); 267 + } 268 + 269 + /* 270 + * Add user/system time to cpuacct. 271 + * 272 + * Note: it's the caller that updates the account of the root cgroup. 
273 + */ 274 + void cpuacct_account_field(struct task_struct *p, int index, u64 val) 275 + { 276 + struct kernel_cpustat *kcpustat; 277 + struct cpuacct *ca; 278 + 279 + rcu_read_lock(); 280 + ca = task_ca(p); 281 + while (ca != &root_cpuacct) { 282 + kcpustat = this_cpu_ptr(ca->cpustat); 283 + kcpustat->cpustat[index] += val; 284 + ca = __parent_ca(ca); 285 + } 286 + rcu_read_unlock(); 287 + } 288 + 289 + struct cgroup_subsys cpuacct_subsys = { 290 + .name = "cpuacct", 291 + .css_alloc = cpuacct_css_alloc, 292 + .css_free = cpuacct_css_free, 293 + .subsys_id = cpuacct_subsys_id, 294 + .base_cftypes = files, 295 + .early_init = 1, 296 + };

+17

kernel/sched/cpuacct.h

··· 1 + #ifdef CONFIG_CGROUP_CPUACCT 2 + 3 + extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); 4 + extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); 5 + 6 + #else 7 + 8 + static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9 + { 10 + } 11 + 12 + static inline void 13 + cpuacct_account_field(struct task_struct *p, int index, u64 val) 14 + { 15 + } 16 + 17 + #endif

+113 -101

kernel/sched/cputime.c

··· 115 115 static inline void task_group_account_field(struct task_struct *p, int index, 116 116 u64 tmp) 117 117 { 118 - #ifdef CONFIG_CGROUP_CPUACCT 119 - struct kernel_cpustat *kcpustat; 120 - struct cpuacct *ca; 121 - #endif 122 118 /* 123 119 * Since all updates are sure to touch the root cgroup, we 124 120 * get ourselves ahead and touch it first. If the root cgroup ··· 123 127 */ 124 128 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 125 129 126 - #ifdef CONFIG_CGROUP_CPUACCT 127 - if (unlikely(!cpuacct_subsys.active)) 128 - return; 129 - 130 - rcu_read_lock(); 131 - ca = task_ca(p); 132 - while (ca && (ca != &root_cpuacct)) { 133 - kcpustat = this_cpu_ptr(ca->cpustat); 134 - kcpustat->cpustat[index] += tmp; 135 - ca = parent_ca(ca); 136 - } 137 - rcu_read_unlock(); 138 - #endif 130 + cpuacct_account_field(p, index, tmp); 139 131 } 140 132 141 133 /* ··· 372 388 struct rq *rq) {} 373 389 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 374 390 375 - #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 376 - /* 377 - * Account a single tick of cpu time. 378 - * @p: the process that the cpu time gets accounted to 379 - * @user_tick: indicates if the tick is a user or a system tick 380 - */ 381 - void account_process_tick(struct task_struct *p, int user_tick) 382 - { 383 - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 384 - struct rq *rq = this_rq(); 385 - 386 - if (vtime_accounting_enabled()) 387 - return; 388 - 389 - if (sched_clock_irqtime) { 390 - irqtime_account_process_tick(p, user_tick, rq); 391 - return; 392 - } 393 - 394 - if (steal_account_process_tick()) 395 - return; 396 - 397 - if (user_tick) 398 - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 399 - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 400 - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 401 - one_jiffy_scaled); 402 - else 403 - account_idle_time(cputime_one_jiffy); 404 - } 405 - 406 - /* 407 - * Account multiple ticks of steal time. 
408 - * @p: the process from which the cpu time has been stolen 409 - * @ticks: number of stolen ticks 410 - */ 411 - void account_steal_ticks(unsigned long ticks) 412 - { 413 - account_steal_time(jiffies_to_cputime(ticks)); 414 - } 415 - 416 - /* 417 - * Account multiple ticks of idle time. 418 - * @ticks: number of stolen ticks 419 - */ 420 - void account_idle_ticks(unsigned long ticks) 421 - { 422 - 423 - if (sched_clock_irqtime) { 424 - irqtime_account_idle_ticks(ticks); 425 - return; 426 - } 427 - 428 - account_idle_time(jiffies_to_cputime(ticks)); 429 - } 430 - #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 431 - 432 391 /* 433 392 * Use precise platform statistics if available: 434 393 */ 435 394 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 436 - void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 437 - { 438 - *ut = p->utime; 439 - *st = p->stime; 440 - } 441 - 442 - void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 443 - { 444 - struct task_cputime cputime; 445 - 446 - thread_group_cputime(p, &cputime); 447 - 448 - *ut = cputime.utime; 449 - *st = cputime.stime; 450 - } 451 395 452 396 #ifndef __ARCH_HAS_VTIME_TASK_SWITCH 453 397 void vtime_task_switch(struct task_struct *prev) ··· 430 518 } 431 519 EXPORT_SYMBOL_GPL(vtime_account_irq_enter); 432 520 #endif /* __ARCH_HAS_VTIME_ACCOUNT */ 521 + #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 433 522 434 - #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ 435 523 436 - static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) 524 + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 525 + void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 437 526 { 438 - u64 temp = (__force u64) rtime; 527 + *ut = p->utime; 528 + *st = p->stime; 529 + } 439 530 440 - temp *= (__force u64) stime; 531 + void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 532 + { 533 + struct task_cputime cputime; 441 534 442 - if 
(sizeof(cputime_t) == 4) 443 - temp = div_u64(temp, (__force u32) total); 535 + thread_group_cputime(p, &cputime); 536 + 537 + *ut = cputime.utime; 538 + *st = cputime.stime; 539 + } 540 + #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 541 + /* 542 + * Account a single tick of cpu time. 543 + * @p: the process that the cpu time gets accounted to 544 + * @user_tick: indicates if the tick is a user or a system tick 545 + */ 546 + void account_process_tick(struct task_struct *p, int user_tick) 547 + { 548 + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 549 + struct rq *rq = this_rq(); 550 + 551 + if (vtime_accounting_enabled()) 552 + return; 553 + 554 + if (sched_clock_irqtime) { 555 + irqtime_account_process_tick(p, user_tick, rq); 556 + return; 557 + } 558 + 559 + if (steal_account_process_tick()) 560 + return; 561 + 562 + if (user_tick) 563 + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 564 + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 565 + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 566 + one_jiffy_scaled); 444 567 else 445 - temp = div64_u64(temp, (__force u64) total); 568 + account_idle_time(cputime_one_jiffy); 569 + } 446 570 447 - return (__force cputime_t) temp; 571 + /* 572 + * Account multiple ticks of steal time. 573 + * @p: the process from which the cpu time has been stolen 574 + * @ticks: number of stolen ticks 575 + */ 576 + void account_steal_ticks(unsigned long ticks) 577 + { 578 + account_steal_time(jiffies_to_cputime(ticks)); 579 + } 580 + 581 + /* 582 + * Account multiple ticks of idle time. 
583 + * @ticks: number of idle ticks 584 + */ 585 + void account_idle_ticks(unsigned long ticks) 586 + { 587 + 588 + if (sched_clock_irqtime) { 589 + irqtime_account_idle_ticks(ticks); 590 + return; 591 + } 592 + 593 + account_idle_time(jiffies_to_cputime(ticks)); 594 + } 595 + 596 + /* 597 + * Perform (stime * rtime) / total with reduced chances 598 + * of multiplication overflows by using smaller factors 599 + * like quotient and remainders of divisions between 600 + * rtime and total. 601 + */ 602 + static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) 603 + { 604 + u64 rem, res, scaled; 605 + 606 + if (rtime >= total) { 607 + /* 608 + * Scale up to rtime / total then add 609 + * the remainder scaled to stime / total. 610 + */ 611 + res = div64_u64_rem(rtime, total, &rem); 612 + scaled = stime * res; 613 + scaled += div64_u64(stime * rem, total); 614 + } else { 615 + /* 616 + * Same in reverse: scale down to total / rtime 617 + * then subtract that result scaled to 618 + * the remaining part. 
619 + */ 620 + res = div64_u64_rem(total, rtime, &rem); 621 + scaled = div64_u64(stime, res); 622 + scaled -= div64_u64(scaled * rem, total); 623 + } 624 + 625 + return (__force cputime_t) scaled; 448 626 } 449 627 450 628 /* ··· 546 544 cputime_t *ut, cputime_t *st) 547 545 { 548 546 cputime_t rtime, stime, total; 547 + 548 + if (vtime_accounting_enabled()) { 549 + *ut = curr->utime; 550 + *st = curr->stime; 551 + return; 552 + } 549 553 550 554 stime = curr->stime; 551 555 total = stime + curr->utime; ··· 568 560 */ 569 561 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 570 562 571 - if (total) 572 - stime = scale_stime(stime, rtime, total); 573 - else 563 + if (!rtime) { 564 + stime = 0; 565 + } else if (!total) { 574 566 stime = rtime; 567 + } else { 568 + stime = scale_stime((__force u64)stime, 569 + (__force u64)rtime, (__force u64)total); 570 + } 575 571 576 572 /* 577 573 * If the tick based count grows faster than the scheduler one, ··· 609 597 thread_group_cputime(p, &cputime); 610 598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 611 599 } 612 - #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ 600 + #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 613 601 614 602 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 615 603 static unsigned long long vtime_delta(struct task_struct *tsk)

+90 -58

kernel/sched/fair.c

··· 431 431 * Scheduling class tree data structure manipulation methods: 432 432 */ 433 433 434 - static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) 434 + static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) 435 435 { 436 - s64 delta = (s64)(vruntime - min_vruntime); 436 + s64 delta = (s64)(vruntime - max_vruntime); 437 437 if (delta > 0) 438 - min_vruntime = vruntime; 438 + max_vruntime = vruntime; 439 439 440 - return min_vruntime; 440 + return max_vruntime; 441 441 } 442 442 443 443 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) ··· 473 473 vruntime = min_vruntime(vruntime, se->vruntime); 474 474 } 475 475 476 + /* ensure we never gain time by being placed backwards. */ 476 477 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 477 478 #ifndef CONFIG_64BIT 478 479 smp_wmb(); ··· 653 652 } 654 653 655 654 /* 656 - * We calculate the vruntime slice of a to be inserted task 655 + * We calculate the vruntime slice of a to-be-inserted task. 657 656 * 658 657 * vs = s/w 659 658 */ ··· 1563 1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 1564 1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */ 1565 1564 } 1565 + 1566 + /* 1567 + * Update the rq's load with the elapsed running time before entering 1568 + * idle. if the last scheduled task is not a CFS task, idle_enter will 1569 + * be the only way to update the runnable statistic. 1570 + */ 1571 + void idle_enter_fair(struct rq *this_rq) 1572 + { 1573 + update_rq_runnable_avg(this_rq, 1); 1574 + } 1575 + 1576 + /* 1577 + * Update the rq's load with the elapsed idle time before a task is 1578 + * scheduled. if the newly scheduled task is not a CFS task, idle_exit will 1579 + * be the only way to update the runnable statistic. 
1580 + */ 1581 + void idle_exit_fair(struct rq *this_rq) 1582 + { 1583 + update_rq_runnable_avg(this_rq, 0); 1584 + } 1585 + 1566 1586 #else 1567 1587 static inline void update_entity_load_avg(struct sched_entity *se, 1568 1588 int update_cfs_rq) {} ··· 3896 3874 int tsk_cache_hot = 0; 3897 3875 /* 3898 3876 * We do not migrate tasks that are: 3899 - * 1) running (obviously), or 3877 + * 1) throttled_lb_pair, or 3900 3878 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3901 - * 3) are cache-hot on their current CPU. 3879 + * 3) running (obviously), or 3880 + * 4) are cache-hot on their current CPU. 3902 3881 */ 3882 + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 3883 + return 0; 3884 + 3903 3885 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3904 - int new_dst_cpu; 3886 + int cpu; 3905 3887 3906 3888 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3907 3889 ··· 3920 3894 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 3921 3895 return 0; 3922 3896 3923 - new_dst_cpu = cpumask_first_and(env->dst_grpmask, 3924 - tsk_cpus_allowed(p)); 3925 - if (new_dst_cpu < nr_cpu_ids) { 3926 - env->flags |= LBF_SOME_PINNED; 3927 - env->new_dst_cpu = new_dst_cpu; 3897 + /* Prevent to re-select dst_cpu via env's cpus */ 3898 + for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 3899 + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 3900 + env->flags |= LBF_SOME_PINNED; 3901 + env->new_dst_cpu = cpu; 3902 + break; 3903 + } 3928 3904 } 3905 + 3929 3906 return 0; 3930 3907 } 3931 3908 ··· 3949 3920 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); 3950 3921 if (!tsk_cache_hot || 3951 3922 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 3952 - #ifdef CONFIG_SCHEDSTATS 3923 + 3953 3924 if (tsk_cache_hot) { 3954 3925 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 3955 3926 schedstat_inc(p, se.statistics.nr_forced_migrations); 3956 3927 } 3957 - #endif 3928 + 3958 3929 return 1; 3959 3930 } 3960 
3931 3961 - if (tsk_cache_hot) { 3962 - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 3963 - return 0; 3964 - } 3965 - return 1; 3932 + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 3933 + return 0; 3966 3934 } 3967 3935 3968 3936 /* ··· 3974 3948 struct task_struct *p, *n; 3975 3949 3976 3950 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 3977 - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) 3978 - continue; 3979 - 3980 3951 if (!can_migrate_task(p, env)) 3981 3952 continue; 3982 3953 ··· 4025 4002 break; 4026 4003 } 4027 4004 4028 - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 4005 + if (!can_migrate_task(p, env)) 4029 4006 goto next; 4030 4007 4031 4008 load = task_h_load(p); ··· 4034 4011 goto next; 4035 4012 4036 4013 if ((load / 2) > env->imbalance) 4037 - goto next; 4038 - 4039 - if (!can_migrate_task(p, env)) 4040 4014 goto next; 4041 4015 4042 4016 move_task(p, env); ··· 4265 4245 return load_idx; 4266 4246 } 4267 4247 4268 - unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 4248 + static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 4269 4249 { 4270 4250 return SCHED_POWER_SCALE; 4271 4251 } ··· 4275 4255 return default_scale_freq_power(sd, cpu); 4276 4256 } 4277 4257 4278 - unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 4258 + static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 4279 4259 { 4280 4260 unsigned long weight = sd->span_weight; 4281 4261 unsigned long smt_gain = sd->smt_gain; ··· 4290 4270 return default_scale_smt_power(sd, cpu); 4291 4271 } 4292 4272 4293 - unsigned long scale_rt_power(int cpu) 4273 + static unsigned long scale_rt_power(int cpu) 4294 4274 { 4295 4275 struct rq *rq = cpu_rq(cpu); 4296 4276 u64 total, available, age_stamp, avg; ··· 4980 4960 #define MAX_PINNED_INTERVAL 512 4981 4961 4982 4962 /* Working cpumask for load_balance and 
load_balance_newidle. */ 4983 - DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4963 + DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); 4984 4964 4985 4965 static int need_active_balance(struct lb_env *env) 4986 4966 { ··· 5011 4991 int *balance) 5012 4992 { 5013 4993 int ld_moved, cur_ld_moved, active_balance = 0; 5014 - int lb_iterations, max_lb_iterations; 5015 4994 struct sched_group *group; 5016 4995 struct rq *busiest; 5017 4996 unsigned long flags; 5018 - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4997 + struct cpumask *cpus = __get_cpu_var(load_balance_mask); 5019 4998 5020 4999 struct lb_env env = { 5021 5000 .sd = sd, ··· 5026 5007 .cpus = cpus, 5027 5008 }; 5028 5009 5010 + /* 5011 + * For NEWLY_IDLE load_balancing, we don't need to consider 5012 + * other cpus in our group 5013 + */ 5014 + if (idle == CPU_NEWLY_IDLE) 5015 + env.dst_grpmask = NULL; 5016 + 5029 5017 cpumask_copy(cpus, cpu_active_mask); 5030 - max_lb_iterations = cpumask_weight(env.dst_grpmask); 5031 5018 5032 5019 schedstat_inc(sd, lb_count[idle]); 5033 5020 ··· 5059 5034 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 5060 5035 5061 5036 ld_moved = 0; 5062 - lb_iterations = 1; 5063 5037 if (busiest->nr_running > 1) { 5064 5038 /* 5065 5039 * Attempt to move tasks. If find_busiest_group has found ··· 5085 5061 double_rq_unlock(env.dst_rq, busiest); 5086 5062 local_irq_restore(flags); 5087 5063 5088 - if (env.flags & LBF_NEED_BREAK) { 5089 - env.flags &= ~LBF_NEED_BREAK; 5090 - goto more_balance; 5091 - } 5092 - 5093 5064 /* 5094 5065 * some other cpu did the load balance for us. 
5095 5066 */ 5096 5067 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 5097 5068 resched_cpu(env.dst_cpu); 5069 + 5070 + if (env.flags & LBF_NEED_BREAK) { 5071 + env.flags &= ~LBF_NEED_BREAK; 5072 + goto more_balance; 5073 + } 5098 5074 5099 5075 /* 5100 5076 * Revisit (affine) tasks on src_cpu that couldn't be moved to ··· 5115 5091 * moreover subsequent load balance cycles should correct the 5116 5092 * excess load moved. 5117 5093 */ 5118 - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && 5119 - lb_iterations++ < max_lb_iterations) { 5094 + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 5120 5095 5121 5096 env.dst_rq = cpu_rq(env.new_dst_cpu); 5122 5097 env.dst_cpu = env.new_dst_cpu; 5123 5098 env.flags &= ~LBF_SOME_PINNED; 5124 5099 env.loop = 0; 5125 5100 env.loop_break = sched_nr_migrate_break; 5101 + 5102 + /* Prevent to re-select dst_cpu via env's cpus */ 5103 + cpumask_clear_cpu(env.dst_cpu, env.cpus); 5104 + 5126 5105 /* 5127 5106 * Go back to "more_balance" rather than "redo" since we 5128 5107 * need to continue with same src_cpu. ··· 5245 5218 5246 5219 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5247 5220 return; 5248 - 5249 - update_rq_runnable_avg(this_rq, 1); 5250 5221 5251 5222 /* 5252 5223 * Drop the rq->lock, but keep IRQ/preempt disabled. 
··· 5420 5395 struct sched_domain *sd; 5421 5396 int cpu = smp_processor_id(); 5422 5397 5423 - if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) 5424 - return; 5425 - clear_bit(NOHZ_IDLE, nohz_flags(cpu)); 5426 - 5427 5398 rcu_read_lock(); 5428 - for_each_domain(cpu, sd) 5399 + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5400 + 5401 + if (!sd || !sd->nohz_idle) 5402 + goto unlock; 5403 + sd->nohz_idle = 0; 5404 + 5405 + for (; sd; sd = sd->parent) 5429 5406 atomic_inc(&sd->groups->sgp->nr_busy_cpus); 5407 + unlock: 5430 5408 rcu_read_unlock(); 5431 5409 } 5432 5410 ··· 5438 5410 struct sched_domain *sd; 5439 5411 int cpu = smp_processor_id(); 5440 5412 5441 - if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) 5442 - return; 5443 - set_bit(NOHZ_IDLE, nohz_flags(cpu)); 5444 - 5445 5413 rcu_read_lock(); 5446 - for_each_domain(cpu, sd) 5414 + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5415 + 5416 + if (!sd || sd->nohz_idle) 5417 + goto unlock; 5418 + sd->nohz_idle = 1; 5419 + 5420 + for (; sd; sd = sd->parent) 5447 5421 atomic_dec(&sd->groups->sgp->nr_busy_cpus); 5422 + unlock: 5448 5423 rcu_read_unlock(); 5449 5424 } 5450 5425 ··· 5499 5468 * It checks each scheduling domain to see if it is due to be balanced, 5500 5469 * and initiates a balancing operation if so. 5501 5470 * 5502 - * Balancing parameters are set up in arch_init_sched_domains. 5471 + * Balancing parameters are set up in init_sched_domains. 5503 5472 */ 5504 5473 static void rebalance_domains(int cpu, enum cpu_idle_type idle) 5505 5474 { ··· 5537 5506 if (time_after_eq(jiffies, sd->last_balance + interval)) { 5538 5507 if (load_balance(cpu, rq, sd, idle, &balance)) { 5539 5508 /* 5540 - * We've pulled tasks over so either we're no 5541 - * longer idle. 5509 + * The LBF_SOME_PINNED logic could have changed 5510 + * env->dst_cpu, so we can't know our idle 5511 + * state even if we migrated tasks. Update it. 5542 5512 */ 5543 - idle = CPU_NOT_IDLE; 5513 + idle = idle_cpu(cpu) ? 
CPU_IDLE : CPU_NOT_IDLE; 5544 5514 } 5545 5515 sd->last_balance = jiffies; 5546 5516 }

+16

kernel/sched/idle_task.c

··· 13 13 { 14 14 return task_cpu(p); /* IDLE tasks are never migrated */ 15 15 } 16 + 17 + static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) 18 + { 19 + idle_exit_fair(rq); 20 + } 21 + 22 + static void post_schedule_idle(struct rq *rq) 23 + { 24 + idle_enter_fair(rq); 25 + } 16 26 #endif /* CONFIG_SMP */ 17 27 /* 18 28 * Idle tasks are unconditionally rescheduled: ··· 35 25 static struct task_struct *pick_next_task_idle(struct rq *rq) 36 26 { 37 27 schedstat_inc(rq, sched_goidle); 28 + #ifdef CONFIG_SMP 29 + /* Trigger the post schedule to do an idle_enter for CFS */ 30 + rq->post_schedule = 1; 31 + #endif 38 32 return rq->idle; 39 33 } 40 34 ··· 100 86 101 87 #ifdef CONFIG_SMP 102 88 .select_task_rq = select_task_rq_idle, 89 + .pre_schedule = pre_schedule_idle, 90 + .post_schedule = post_schedule_idle, 103 91 #endif 104 92 105 93 .set_curr_task = set_curr_task_idle,

+167 -52

kernel/sched/sched.h

··· 7 7 #include <linux/stop_machine.h> 8 8 9 9 #include "cpupri.h" 10 + #include "cpuacct.h" 10 11 11 12 extern __read_mostly int scheduler_running; 12 13 ··· 33 32 * Helpers for converting nanosecond timing to jiffy resolution 34 33 */ 35 34 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 35 + 36 + /* 37 + * Increase resolution of nice-level calculations for 64-bit architectures. 38 + * The extra resolution improves shares distribution and load balancing of 39 + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup 40 + * hierarchies, especially on larger systems. This is not a user-visible change 41 + * and does not change the user-interface for setting shares/weights. 42 + * 43 + * We increase resolution only if we have enough bits to allow this increased 44 + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution 45 + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the 46 + * increased costs. 47 + */ 48 + #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ 49 + # define SCHED_LOAD_RESOLUTION 10 50 + # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) 51 + # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) 52 + #else 53 + # define SCHED_LOAD_RESOLUTION 0 54 + # define scale_load(w) (w) 55 + # define scale_load_down(w) (w) 56 + #endif 57 + 58 + #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) 59 + #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) 36 60 37 61 #define NICE_0_LOAD SCHED_LOAD_SCALE 38 62 #define NICE_0_SHIFT SCHED_LOAD_SHIFT ··· 180 154 #define MAX_SHARES (1UL << 18) 181 155 #endif 182 156 183 - /* Default task group. 184 - * Every task in system belong to this group at bootup. 
185 - */ 186 - extern struct task_group root_task_group; 187 - 188 157 typedef int (*tg_visitor)(struct task_group *, void *); 189 158 190 159 extern int walk_tg_tree_from(struct task_group *from, ··· 216 195 extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 217 196 struct sched_rt_entity *rt_se, int cpu, 218 197 struct sched_rt_entity *parent); 198 + 199 + extern struct task_group *sched_create_group(struct task_group *parent); 200 + extern void sched_online_group(struct task_group *tg, 201 + struct task_group *parent); 202 + extern void sched_destroy_group(struct task_group *tg); 203 + extern void sched_offline_group(struct task_group *tg); 204 + 205 + extern void sched_move_task(struct task_struct *tsk); 206 + 207 + #ifdef CONFIG_FAIR_GROUP_SCHED 208 + extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 209 + #endif 219 210 220 211 #else /* CONFIG_CGROUP_SCHED */ 221 212 ··· 580 547 DECLARE_PER_CPU(struct sched_domain *, sd_llc); 581 548 DECLARE_PER_CPU(int, sd_llc_id); 582 549 550 + struct sched_group_power { 551 + atomic_t ref; 552 + /* 553 + * CPU power of this group, SCHED_LOAD_SCALE being max power for a 554 + * single CPU. 555 + */ 556 + unsigned int power, power_orig; 557 + unsigned long next_update; 558 + /* 559 + * Number of busy cpus in this group. 560 + */ 561 + atomic_t nr_busy_cpus; 562 + 563 + unsigned long cpumask[0]; /* iteration mask */ 564 + }; 565 + 566 + struct sched_group { 567 + struct sched_group *next; /* Must be a circular list */ 568 + atomic_t ref; 569 + 570 + unsigned int group_weight; 571 + struct sched_group_power *sgp; 572 + 573 + /* 574 + * The CPUs this group covers. 575 + * 576 + * NOTE: this field is variable length. 
(Allocated dynamically 577 + * by attaching extra space to the end of the structure, 578 + * depending on how many CPUs the kernel has booted up with) 579 + */ 580 + unsigned long cpumask[0]; 581 + }; 582 + 583 + static inline struct cpumask *sched_group_cpus(struct sched_group *sg) 584 + { 585 + return to_cpumask(sg->cpumask); 586 + } 587 + 588 + /* 589 + * cpumask masking which cpus in the group are allowed to iterate up the domain 590 + * tree. 591 + */ 592 + static inline struct cpumask *sched_group_mask(struct sched_group *sg) 593 + { 594 + return to_cpumask(sg->sgp->cpumask); 595 + } 596 + 597 + /** 598 + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 599 + * @group: The group whose first cpu is to be returned. 600 + */ 601 + static inline unsigned int group_first_cpu(struct sched_group *group) 602 + { 603 + return cpumask_first(sched_group_cpus(group)); 604 + } 605 + 583 606 extern int group_balance_cpu(struct sched_group *sg); 584 607 585 608 #endif /* CONFIG_SMP */ ··· 873 784 } 874 785 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 875 786 787 + /* 788 + * wake flags 789 + */ 790 + #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ 791 + #define WF_FORK 0x02 /* child wakeup after fork */ 792 + #define WF_MIGRATED 0x4 /* internal use, task got migrated */ 876 793 877 794 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 878 795 { ··· 951 856 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 952 857 }; 953 858 954 - /* Time spent by the tasks of the cpu accounting group executing in ... */ 955 - enum cpuacct_stat_index { 956 - CPUACCT_STAT_USER, /* ... user mode */ 957 - CPUACCT_STAT_SYSTEM, /* ... 
kernel mode */ 859 + #define ENQUEUE_WAKEUP 1 860 + #define ENQUEUE_HEAD 2 861 + #ifdef CONFIG_SMP 862 + #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ 863 + #else 864 + #define ENQUEUE_WAKING 0 865 + #endif 958 866 959 - CPUACCT_STAT_NSTATS, 867 + #define DEQUEUE_SLEEP 1 868 + 869 + struct sched_class { 870 + const struct sched_class *next; 871 + 872 + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 873 + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 874 + void (*yield_task) (struct rq *rq); 875 + bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); 876 + 877 + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 878 + 879 + struct task_struct * (*pick_next_task) (struct rq *rq); 880 + void (*put_prev_task) (struct rq *rq, struct task_struct *p); 881 + 882 + #ifdef CONFIG_SMP 883 + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 884 + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 885 + 886 + void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 887 + void (*post_schedule) (struct rq *this_rq); 888 + void (*task_waking) (struct task_struct *task); 889 + void (*task_woken) (struct rq *this_rq, struct task_struct *task); 890 + 891 + void (*set_cpus_allowed)(struct task_struct *p, 892 + const struct cpumask *newmask); 893 + 894 + void (*rq_online)(struct rq *rq); 895 + void (*rq_offline)(struct rq *rq); 896 + #endif 897 + 898 + void (*set_curr_task) (struct rq *rq); 899 + void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 900 + void (*task_fork) (struct task_struct *p); 901 + 902 + void (*switched_from) (struct rq *this_rq, struct task_struct *task); 903 + void (*switched_to) (struct rq *this_rq, struct task_struct *task); 904 + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 905 + int oldprio); 906 + 907 + unsigned int (*get_rr_interval) (struct rq *rq, 908 + 
struct task_struct *task); 909 + 910 + #ifdef CONFIG_FAIR_GROUP_SCHED 911 + void (*task_move_group) (struct task_struct *p, int on_rq); 912 + #endif 960 913 }; 961 - 962 914 963 915 #define sched_class_highest (&stop_sched_class) 964 916 #define for_each_class(class) \ ··· 1019 877 1020 878 #ifdef CONFIG_SMP 1021 879 880 + extern void update_group_power(struct sched_domain *sd, int cpu); 881 + 1022 882 extern void trigger_load_balance(struct rq *rq, int cpu); 1023 883 extern void idle_balance(int this_cpu, struct rq *this_rq); 884 + 885 + /* 886 + * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg 887 + * becomes useful in lb 888 + */ 889 + #if defined(CONFIG_FAIR_GROUP_SCHED) 890 + extern void idle_enter_fair(struct rq *this_rq); 891 + extern void idle_exit_fair(struct rq *this_rq); 892 + #else 893 + static inline void idle_enter_fair(struct rq *this_rq) {} 894 + static inline void idle_exit_fair(struct rq *this_rq) {} 895 + #endif 1024 896 1025 897 #else /* CONFIG_SMP */ 1026 898 ··· 1047 891 extern void sysrq_sched_debug_show(void); 1048 892 extern void sched_init_granularity(void); 1049 893 extern void update_max_interval(void); 1050 - extern void update_group_power(struct sched_domain *sd, int cpu); 1051 894 extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); 1052 895 extern void init_sched_rt_class(void); 1053 896 extern void init_sched_fair_class(void); ··· 1058 903 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 1059 904 1060 905 extern void update_idle_cpu_load(struct rq *this_rq); 1061 - 1062 - #ifdef CONFIG_CGROUP_CPUACCT 1063 - #include <linux/cgroup.h> 1064 - /* track cpu usage of a group of tasks and its child groups */ 1065 - struct cpuacct { 1066 - struct cgroup_subsys_state css; 1067 - /* cpuusage holds pointer to a u64-type object on every cpu */ 1068 - u64 __percpu *cpuusage; 1069 - struct kernel_cpustat __percpu *cpustat; 1070 - }; 1071 - 1072 - extern 
struct cgroup_subsys cpuacct_subsys; 1073 - extern struct cpuacct root_cpuacct; 1074 - 1075 - /* return cpu accounting group corresponding to this container */ 1076 - static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 1077 - { 1078 - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 1079 - struct cpuacct, css); 1080 - } 1081 - 1082 - /* return cpu accounting group to which this task belongs */ 1083 - static inline struct cpuacct *task_ca(struct task_struct *tsk) 1084 - { 1085 - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 1086 - struct cpuacct, css); 1087 - } 1088 - 1089 - static inline struct cpuacct *parent_ca(struct cpuacct *ca) 1090 - { 1091 - if (!ca || !ca->css.cgroup->parent) 1092 - return NULL; 1093 - return cgroup_ca(ca->css.cgroup->parent); 1094 - } 1095 - 1096 - extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1097 - #else 1098 - static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1099 - #endif 1100 906 1101 907 #ifdef CONFIG_PARAVIRT 1102 908 static inline u64 steal_ticks(u64 steal) ··· 1303 1187 enum rq_nohz_flag_bits { 1304 1188 NOHZ_TICK_STOPPED, 1305 1189 NOHZ_BALANCE_KICK, 1306 - NOHZ_IDLE, 1307 1190 }; 1308 1191 1309 1192 #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)

+13 -6

lib/div64.c

··· 79 79 #endif 80 80 81 81 /** 82 - * div64_u64 - unsigned 64bit divide with 64bit divisor 82 + * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder 83 83 * @dividend: 64bit dividend 84 84 * @divisor: 64bit divisor 85 + * @remainder: 64bit remainder 85 86 * 86 87 * This implementation is a modified version of the algorithm proposed 87 88 * by the book 'Hacker's Delight'. The original source and full proof ··· 90 89 * 91 90 * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' 92 91 */ 93 - #ifndef div64_u64 94 - u64 div64_u64(u64 dividend, u64 divisor) 92 + #ifndef div64_u64_rem 93 + u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) 95 94 { 96 95 u32 high = divisor >> 32; 97 96 u64 quot; 98 97 99 98 if (high == 0) { 100 - quot = div_u64(dividend, divisor); 99 + u32 rem32; 100 + quot = div_u64_rem(dividend, divisor, &rem32); 101 + *remainder = rem32; 101 102 } else { 102 103 int n = 1 + fls(high); 103 104 quot = div_u64(dividend >> n, divisor >> n); 104 105 105 106 if (quot != 0) 106 107 quot--; 107 - if ((dividend - quot * divisor) >= divisor) 108 + 109 + *remainder = dividend - quot * divisor; 110 + if (*remainder >= divisor) { 108 111 quot++; 112 + *remainder -= divisor; 113 + } 109 114 } 110 115 111 116 return quot; 112 117 } 113 - EXPORT_SYMBOL(div64_u64); 118 + EXPORT_SYMBOL(div64_u64_rem); 114 119 #endif 115 120 116 121 /**