Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:
"This scheduler update provides:

- The (hopefully) final fix for the vtime accounting issues which
were around for quite some time

- Use types known to user space in UAPI headers to unbreak user space
builds

- Make load balancing respect the current scheduling domain again
instead of evaluating unrelated CPUs"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/headers/uapi: Fix linux/sched/types.h userspace compilation errors
sched/fair: Fix load_balance() affinity redo path
sched/cputime: Accumulate vtime on top of nsec clocksource
sched/cputime: Move the vtime task fields to their own struct
sched/cputime: Rename vtime fields
sched/cputime: Always set tsk->vtime_snap_whence after accounting vtime
vtime, sched/cputime: Remove vtime_account_user()
Revert "sched/cputime: Refactor the cputime_adjust() code"

+167 -115
+3 -3
include/linux/init_task.h
··· 170 170 171 171 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 172 172 # define INIT_VTIME(tsk) \ 173 - .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \ 174 - .vtime_snap = 0, \ 175 - .vtime_snap_whence = VTIME_SYS, 173 + .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount), \ 174 + .vtime.starttime = 0, \ 175 + .vtime.state = VTIME_SYS, 176 176 #else 177 177 # define INIT_VTIME(tsk) 178 178 #endif
+19 -10
include/linux/sched.h
··· 223 223 #define prof_exp stime 224 224 #define sched_exp sum_exec_runtime 225 225 226 + enum vtime_state { 227 + /* Task is sleeping or running in a CPU with VTIME inactive: */ 228 + VTIME_INACTIVE = 0, 229 + /* Task runs in userspace in a CPU with VTIME active: */ 230 + VTIME_USER, 231 + /* Task runs in kernelspace in a CPU with VTIME active: */ 232 + VTIME_SYS, 233 + }; 234 + 235 + struct vtime { 236 + seqcount_t seqcount; 237 + unsigned long long starttime; 238 + enum vtime_state state; 239 + u64 utime; 240 + u64 stime; 241 + u64 gtime; 242 + }; 243 + 226 244 struct sched_info { 227 245 #ifdef CONFIG_SCHED_INFO 228 246 /* Cumulative counters: */ ··· 706 688 u64 gtime; 707 689 struct prev_cputime prev_cputime; 708 690 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 709 - seqcount_t vtime_seqcount; 710 - unsigned long long vtime_snap; 711 - enum { 712 - /* Task is sleeping or running in a CPU with VTIME inactive: */ 713 - VTIME_INACTIVE = 0, 714 - /* Task runs in userspace in a CPU with VTIME active: */ 715 - VTIME_USER, 716 - /* Task runs in kernelspace in a CPU with VTIME active: */ 717 - VTIME_SYS, 718 - } vtime_snap_whence; 691 + struct vtime vtime; 719 692 #endif 720 693 721 694 #ifdef CONFIG_NO_HZ_FULL
+1 -8
include/linux/vtime.h
··· 67 67 68 68 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 69 69 extern void arch_vtime_task_switch(struct task_struct *tsk); 70 - extern void vtime_account_user(struct task_struct *tsk); 71 70 extern void vtime_user_enter(struct task_struct *tsk); 72 - 73 - static inline void vtime_user_exit(struct task_struct *tsk) 74 - { 75 - vtime_account_user(tsk); 76 - } 77 - 71 + extern void vtime_user_exit(struct task_struct *tsk); 78 72 extern void vtime_guest_enter(struct task_struct *tsk); 79 73 extern void vtime_guest_exit(struct task_struct *tsk); 80 74 extern void vtime_init_idle(struct task_struct *tsk, int cpu); 81 75 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 82 - static inline void vtime_account_user(struct task_struct *tsk) { } 83 76 static inline void vtime_user_enter(struct task_struct *tsk) { } 84 77 static inline void vtime_user_exit(struct task_struct *tsk) { } 85 78 static inline void vtime_guest_enter(struct task_struct *tsk) { }
+8 -8
include/uapi/linux/sched/types.h
··· 54 54 * available in the scheduling class file or in Documentation/. 55 55 */ 56 56 struct sched_attr { 57 - u32 size; 57 + __u32 size; 58 58 59 - u32 sched_policy; 60 - u64 sched_flags; 59 + __u32 sched_policy; 60 + __u64 sched_flags; 61 61 62 62 /* SCHED_NORMAL, SCHED_BATCH */ 63 - s32 sched_nice; 63 + __s32 sched_nice; 64 64 65 65 /* SCHED_FIFO, SCHED_RR */ 66 - u32 sched_priority; 66 + __u32 sched_priority; 67 67 68 68 /* SCHED_DEADLINE */ 69 - u64 sched_runtime; 70 - u64 sched_deadline; 71 - u64 sched_period; 69 + __u64 sched_runtime; 70 + __u64 sched_deadline; 71 + __u64 sched_period; 72 72 }; 73 73 74 74 #endif /* _UAPI_LINUX_SCHED_TYPES_H */
+3 -3
kernel/fork.c
··· 1637 1637 prev_cputime_init(&p->prev_cputime); 1638 1638 1639 1639 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1640 - seqcount_init(&p->vtime_seqcount); 1641 - p->vtime_snap = 0; 1642 - p->vtime_snap_whence = VTIME_INACTIVE; 1640 + seqcount_init(&p->vtime.seqcount); 1641 + p->vtime.starttime = 0; 1642 + p->vtime.state = VTIME_INACTIVE; 1643 1643 #endif 1644 1644 1645 1645 #if defined(SPLIT_RSS_COUNTING)
+113 -71
kernel/sched/cputime.c
··· 611 611 utime = curr->utime; 612 612 613 613 /* 614 - * If either stime or both stime and utime are 0, assume all runtime is 615 - * userspace. Once a task gets some ticks, the monotonicy code at 616 - * 'update' will ensure things converge to the observed ratio. 614 + * If either stime or utime are 0, assume all runtime is userspace. 615 + * Once a task gets some ticks, the monotonicy code at 'update:' 616 + * will ensure things converge to the observed ratio. 617 617 */ 618 - if (stime != 0) { 619 - if (utime == 0) 620 - stime = rtime; 621 - else 622 - stime = scale_stime(stime, rtime, stime + utime); 618 + if (stime == 0) { 619 + utime = rtime; 620 + goto update; 623 621 } 624 622 623 + if (utime == 0) { 624 + stime = rtime; 625 + goto update; 626 + } 627 + 628 + stime = scale_stime(stime, rtime, stime + utime); 629 + 630 + update: 625 631 /* 626 632 * Make sure stime doesn't go backwards; this preserves monotonicity 627 633 * for utime because rtime is monotonic. ··· 679 673 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 680 674 681 675 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 682 - static u64 vtime_delta(struct task_struct *tsk) 676 + static u64 vtime_delta(struct vtime *vtime) 683 677 { 684 - unsigned long now = READ_ONCE(jiffies); 678 + unsigned long long clock; 685 679 686 - if (time_before(now, (unsigned long)tsk->vtime_snap)) 680 + clock = sched_clock_cpu(smp_processor_id()); 681 + if (clock < vtime->starttime) 687 682 return 0; 688 683 689 - return jiffies_to_nsecs(now - tsk->vtime_snap); 684 + return clock - vtime->starttime; 690 685 } 691 686 692 - static u64 get_vtime_delta(struct task_struct *tsk) 687 + static u64 get_vtime_delta(struct vtime *vtime) 693 688 { 694 - unsigned long now = READ_ONCE(jiffies); 695 - u64 delta, other; 689 + u64 delta = vtime_delta(vtime); 690 + u64 other; 696 691 697 692 /* 698 693 * Unlike tick based timing, vtime based timing never has lost ··· 702 695 * elapsed time. Limit account_other_time to prevent rounding 703 696 * errors from causing elapsed vtime to go negative. 704 697 */ 705 - delta = jiffies_to_nsecs(now - tsk->vtime_snap); 706 698 other = account_other_time(delta); 707 - WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); 708 - tsk->vtime_snap = now; 699 + WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); 700 + vtime->starttime += delta; 709 701 710 702 return delta - other; 711 703 } 712 704 713 - static void __vtime_account_system(struct task_struct *tsk) 705 + static void __vtime_account_system(struct task_struct *tsk, 706 + struct vtime *vtime) 714 707 { 715 - account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); 708 + vtime->stime += get_vtime_delta(vtime); 709 + if (vtime->stime >= TICK_NSEC) { 710 + account_system_time(tsk, irq_count(), vtime->stime); 711 + vtime->stime = 0; 712 + } 713 + } 714 + 715 + static void vtime_account_guest(struct task_struct *tsk, 716 + struct vtime *vtime) 717 + { 718 + vtime->gtime += get_vtime_delta(vtime); 719 + if (vtime->gtime >= TICK_NSEC) { 720 + account_guest_time(tsk, vtime->gtime); 721 + vtime->gtime = 0; 722 + } 716 723 } 717 724 718 725 void vtime_account_system(struct task_struct *tsk) 719 726 { 720 - if (!vtime_delta(tsk)) 727 + struct vtime *vtime = &tsk->vtime; 728 + 729 + if (!vtime_delta(vtime)) 721 730 return; 722 731 723 - write_seqcount_begin(&tsk->vtime_seqcount); 724 - __vtime_account_system(tsk); 725 - write_seqcount_end(&tsk->vtime_seqcount); 726 - } 727 - 728 - void vtime_account_user(struct task_struct *tsk) 729 - { 730 - write_seqcount_begin(&tsk->vtime_seqcount); 731 - tsk->vtime_snap_whence = VTIME_SYS; 732 - if (vtime_delta(tsk)) 733 - account_user_time(tsk, get_vtime_delta(tsk)); 734 - write_seqcount_end(&tsk->vtime_seqcount); 732 + write_seqcount_begin(&vtime->seqcount); 733 + /* We might have scheduled out from guest path */ 734 + if (current->flags & PF_VCPU) 735 + vtime_account_guest(tsk, vtime); 736 + else 737 + __vtime_account_system(tsk, vtime); 738 + write_seqcount_end(&vtime->seqcount); 735 739 } 736 740 737 741 void vtime_user_enter(struct task_struct *tsk) 738 742 { 739 - write_seqcount_begin(&tsk->vtime_seqcount); 740 - if (vtime_delta(tsk)) 741 - __vtime_account_system(tsk); 742 - tsk->vtime_snap_whence = VTIME_USER; 743 - write_seqcount_end(&tsk->vtime_seqcount); 743 + struct vtime *vtime = &tsk->vtime; 744 + 745 + write_seqcount_begin(&vtime->seqcount); 746 + __vtime_account_system(tsk, vtime); 747 + vtime->state = VTIME_USER; 748 + write_seqcount_end(&vtime->seqcount); 749 + } 750 + 751 + void vtime_user_exit(struct task_struct *tsk) 752 + { 753 + struct vtime *vtime = &tsk->vtime; 754 + 755 + write_seqcount_begin(&vtime->seqcount); 756 + vtime->utime += get_vtime_delta(vtime); 757 + if (vtime->utime >= TICK_NSEC) { 758 + account_user_time(tsk, vtime->utime); 759 + vtime->utime = 0; 760 + } 761 + vtime->state = VTIME_SYS; 762 + write_seqcount_end(&vtime->seqcount); 744 763 } 745 764 746 765 void vtime_guest_enter(struct task_struct *tsk) 747 766 { 767 + struct vtime *vtime = &tsk->vtime; 748 768 /* 749 769 * The flags must be updated under the lock with 750 - * the vtime_snap flush and update. 770 + * the vtime_starttime flush and update. 751 771 * That enforces a right ordering and update sequence 752 772 * synchronization against the reader (task_gtime()) 753 773 * that can thus safely catch up with a tickless delta. 754 774 */ 755 - write_seqcount_begin(&tsk->vtime_seqcount); 756 - if (vtime_delta(tsk)) 757 - __vtime_account_system(tsk); 775 + write_seqcount_begin(&vtime->seqcount); 776 + __vtime_account_system(tsk, vtime); 758 777 current->flags |= PF_VCPU; 759 - write_seqcount_end(&tsk->vtime_seqcount); 778 + write_seqcount_end(&vtime->seqcount); 760 779 } 761 780 EXPORT_SYMBOL_GPL(vtime_guest_enter); 762 781 763 782 void vtime_guest_exit(struct task_struct *tsk) 764 783 { 765 - write_seqcount_begin(&tsk->vtime_seqcount); 766 - __vtime_account_system(tsk); 784 + struct vtime *vtime = &tsk->vtime; 785 + 786 + write_seqcount_begin(&vtime->seqcount); 787 + vtime_account_guest(tsk, vtime); 767 788 current->flags &= ~PF_VCPU; 768 - write_seqcount_end(&tsk->vtime_seqcount); 789 + write_seqcount_end(&vtime->seqcount); 769 790 } 770 791 EXPORT_SYMBOL_GPL(vtime_guest_exit); 771 792 772 793 void vtime_account_idle(struct task_struct *tsk) 773 794 { 774 - account_idle_time(get_vtime_delta(tsk)); 795 + account_idle_time(get_vtime_delta(&tsk->vtime)); 775 796 } 776 797 777 798 void arch_vtime_task_switch(struct task_struct *prev) 778 799 { 779 - write_seqcount_begin(&prev->vtime_seqcount); 780 - prev->vtime_snap_whence = VTIME_INACTIVE; 781 - write_seqcount_end(&prev->vtime_seqcount); 800 + struct vtime *vtime = &prev->vtime; 782 801 783 - write_seqcount_begin(&current->vtime_seqcount); 784 - current->vtime_snap_whence = VTIME_SYS; 785 - current->vtime_snap = jiffies; 786 - write_seqcount_end(&current->vtime_seqcount); 802 + write_seqcount_begin(&vtime->seqcount); 803 + vtime->state = VTIME_INACTIVE; 804 + write_seqcount_end(&vtime->seqcount); 805 + 806 + vtime = &current->vtime; 807 + 808 + write_seqcount_begin(&vtime->seqcount); 809 + vtime->state = VTIME_SYS; 810 + vtime->starttime = sched_clock_cpu(smp_processor_id()); 811 + write_seqcount_end(&vtime->seqcount); 787 812 } 788 813 789 814 void vtime_init_idle(struct task_struct *t, int cpu) 790 815 { 816 + struct vtime *vtime = &t->vtime; 791 817 unsigned long flags; 792 818 793 819 local_irq_save(flags); 794 - write_seqcount_begin(&t->vtime_seqcount); 795 - t->vtime_snap_whence = VTIME_SYS; 796 - t->vtime_snap = jiffies; 797 - write_seqcount_end(&t->vtime_seqcount); 820 + write_seqcount_begin(&vtime->seqcount); 821 + vtime->state = VTIME_SYS; 822 + vtime->starttime = sched_clock_cpu(cpu); 823 + write_seqcount_end(&vtime->seqcount); 798 824 local_irq_restore(flags); 799 825 } 800 826 801 827 u64 task_gtime(struct task_struct *t) 802 828 { 829 + struct vtime *vtime = &t->vtime; 803 830 unsigned int seq; 804 831 u64 gtime; 805 832 ··· 841 800 return t->gtime; 842 801 843 802 do { 844 - seq = read_seqcount_begin(&t->vtime_seqcount); 803 + seq = read_seqcount_begin(&vtime->seqcount); 845 804 846 805 gtime = t->gtime; 847 - if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) 848 - gtime += vtime_delta(t); 806 + if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) 807 + gtime += vtime->gtime + vtime_delta(vtime); 849 808 850 - } while (read_seqcount_retry(&t->vtime_seqcount, seq)); 809 + } while (read_seqcount_retry(&vtime->seqcount, seq)); 851 810 852 811 return gtime; 853 812 } ··· 859 818 */ 860 819 void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) 861 820 { 862 - u64 delta; 821 + struct vtime *vtime = &t->vtime; 863 822 unsigned int seq; 823 + u64 delta; 864 824 865 825 if (!vtime_accounting_enabled()) { 866 826 *utime = t->utime; ··· 870 828 } 871 829 872 830 do { 873 - seq = read_seqcount_begin(&t->vtime_seqcount); 831 + seq = read_seqcount_begin(&vtime->seqcount); 874 832 875 833 *utime = t->utime; 876 834 *stime = t->stime; 877 835 878 836 /* Task is sleeping, nothing to add */ 879 - if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) 837 + if (vtime->state == VTIME_INACTIVE || is_idle_task(t)) 880 838 continue; 881 839 882 - delta = vtime_delta(t); 840 + delta = vtime_delta(vtime); 883 841 884 842 /* 885 843 * Task runs either in user or kernel space, add pending nohz time to 886 844 * the right place. 887 845 */ 888 - if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) 889 - *utime += delta; 890 - else if (t->vtime_snap_whence == VTIME_SYS) 891 - *stime += delta; 892 - } while (read_seqcount_retry(&t->vtime_seqcount, seq)); 846 + if (vtime->state == VTIME_USER || t->flags & PF_VCPU) 847 + *utime += vtime->utime + delta; 848 + else if (vtime->state == VTIME_SYS) 849 + *stime += vtime->stime + delta; 850 + } while (read_seqcount_retry(&vtime->seqcount, seq)); 893 851 } 894 852 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
+20 -12
kernel/sched/fair.c
··· 6646 6646 * our sched_group. We may want to revisit it if we couldn't 6647 6647 * meet load balance goals by pulling other tasks on src_cpu. 6648 6648 * 6649 - * Also avoid computing new_dst_cpu if we have already computed 6650 - * one in current iteration. 6649 + * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have 6650 + * already computed one in current iteration. 6651 6651 */ 6652 - if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) 6652 + if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) 6653 6653 return 0; 6654 6654 6655 6655 /* Prevent to re-select dst_cpu via env's cpus */ ··· 8022 8022 .tasks = LIST_HEAD_INIT(env.tasks), 8023 8023 }; 8024 8024 8025 - /* 8026 - * For NEWLY_IDLE load_balancing, we don't need to consider 8027 - * other cpus in our group 8028 - */ 8029 - if (idle == CPU_NEWLY_IDLE) 8030 - env.dst_grpmask = NULL; 8031 - 8032 - cpumask_copy(cpus, cpu_active_mask); 8025 + cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); 8033 8026 8034 8027 schedstat_inc(sd->lb_count[idle]); 8035 8028 ··· 8144 8151 /* All tasks on this runqueue were pinned by CPU affinity */ 8145 8152 if (unlikely(env.flags & LBF_ALL_PINNED)) { 8146 8153 cpumask_clear_cpu(cpu_of(busiest), cpus); 8147 - if (!cpumask_empty(cpus)) { 8154 + /* 8155 + * Attempting to continue load balancing at the current 8156 + * sched_domain level only makes sense if there are 8157 + * active CPUs remaining as possible busiest CPUs to 8158 + * pull load from which are not contained within the 8159 + * destination group that is receiving any migrated 8160 + * load. 8161 + */ 8162 + if (!cpumask_subset(cpus, env.dst_grpmask)) { 8148 8163 env.loop = 0; 8149 8164 env.loop_break = sched_nr_migrate_break; 8150 8165 goto redo; ··· 8448 8447 .src_cpu = busiest_rq->cpu, 8449 8448 .src_rq = busiest_rq, 8450 8449 .idle = CPU_IDLE, 8450 + /* 8451 + * can_migrate_task() doesn't need to compute new_dst_cpu 8452 + * for active balancing. Since we have CPU_IDLE, but no 8453 + * @dst_grpmask we need to make that test go away with lying 8454 + * about DST_PINNED. 8455 + */ 8456 + .flags = LBF_DST_PINNED, 8451 8457 }; 8452 8458 8453 8459 schedstat_inc(sd->alb_count);