Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'sched-urgent-2026-03-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:

- Fix zero_vruntime tracking when there's a single task running

- Fix slice protection logic

- Fix the ->vprot logic for reniced tasks

- Fix lag clamping in mixed slice workloads

- Fix objtool uaccess warning (and bug) in the
  !CONFIG_RSEQ_SLICE_EXTENSION case, caused by unexpected un-inlining
  that occurs with older compilers

- Fix a comment in the rseq registration rseq_size bound check code

- Fix a legacy RSEQ ABI quirk that handled 32-byte area sizes
  differently — a special size the feature area has now reached
  naturally, and which we therefore want to avoid exposing. The visible
  ugliness of the new reserved field will go away the next time the
  RSEQ area is extended.

* tag 'sched-urgent-2026-03-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
rseq: slice ext: Ensure rseq feature size differs from original rseq size
rseq: Clarify rseq registration rseq_size bound check comment
sched/core: Fix wakeup_preempt's next_class tracking
rseq: Mark rseq_arm_slice_extension_timer() __always_inline
sched/fair: Fix lag clamp
sched/eevdf: Update se->vprot in reweight_entity()
sched/fair: Only set slice protection at pick time
sched/fair: Fix zero_vruntime tracking

+172 -52
+2 -1
fs/binfmt_elf.c
··· 47 47 #include <linux/dax.h> 48 48 #include <linux/uaccess.h> 49 49 #include <uapi/linux/rseq.h> 50 + #include <linux/rseq.h> 50 51 #include <asm/param.h> 51 52 #include <asm/page.h> 52 53 ··· 287 286 } 288 287 #ifdef CONFIG_RSEQ 289 288 NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end)); 290 - NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq)); 289 + NEW_AUX_ENT(AT_RSEQ_ALIGN, rseq_alloc_align()); 291 290 #endif 292 291 #undef NEW_AUX_ENT 293 292 /* AT_NULL is zero; clear the rest too */
+12
include/linux/rseq.h
··· 146 146 t->rseq = current->rseq; 147 147 } 148 148 149 + /* 150 + * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq 151 + * registration. This is the active rseq area size rounded up to next 152 + * power of 2, which guarantees that the rseq structure will always be 153 + * aligned on the nearest power of two large enough to contain it, even 154 + * as it grows. 155 + */ 156 + static inline unsigned int rseq_alloc_align(void) 157 + { 158 + return 1U << get_count_order(offsetof(struct rseq, end)); 159 + } 160 + 149 161 #else /* CONFIG_RSEQ */ 150 162 static inline void rseq_handle_slowpath(struct pt_regs *regs) { } 151 163 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
+4 -4
include/linux/rseq_entry.h
··· 216 216 } 217 217 218 218 #else /* CONFIG_RSEQ_SLICE_EXTENSION */ 219 - static inline bool rseq_slice_extension_enabled(void) { return false; } 220 - static inline bool rseq_arm_slice_extension_timer(void) { return false; } 221 - static inline void rseq_slice_clear_grant(struct task_struct *t) { } 222 - static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } 219 + static __always_inline bool rseq_slice_extension_enabled(void) { return false; } 220 + static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; } 221 + static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { } 222 + static __always_inline bool rseq_grant_slice_extension(bool work_pending) { return false; } 223 223 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ 224 224 225 225 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
+1
include/linux/sched.h
··· 579 579 u64 deadline; 580 580 u64 min_vruntime; 581 581 u64 min_slice; 582 + u64 max_slice; 582 583 583 584 struct list_head group_node; 584 585 unsigned char on_rq;
+22 -4
include/uapi/linux/rseq.h
··· 87 87 }; 88 88 89 89 /* 90 - * struct rseq is aligned on 4 * 8 bytes to ensure it is always 91 - * contained within a single cache-line. 90 + * The original size and alignment of the allocation for struct rseq is 91 + * 32 bytes. 92 92 * 93 - * A single struct rseq per thread is allowed. 93 + * The allocation size needs to be greater or equal to 94 + * max(getauxval(AT_RSEQ_FEATURE_SIZE), 32), and the allocation needs to 95 + * be aligned on max(getauxval(AT_RSEQ_ALIGN), 32). 96 + * 97 + * As an alternative, userspace is allowed to use both the original size 98 + * and alignment of 32 bytes for backward compatibility. 99 + * 100 + * A single active struct rseq registration per thread is allowed. 94 101 */ 95 102 struct rseq { 96 103 /* ··· 188 181 struct rseq_slice_ctrl slice_ctrl; 189 182 190 183 /* 184 + * Before rseq became extensible, its original size was 32 bytes even 185 + * though the active rseq area was only 20 bytes. 186 + * Exposing a 32 bytes feature size would make life needlessly painful 187 + * for userspace. Therefore, add a reserved byte after byte 32 188 + * to bump the rseq feature size from 32 to 33. 189 + * The next field to be added to the rseq area will be larger 190 + * than one byte, and will replace this reserved byte. 191 + */ 192 + __u8 __reserved; 193 + 194 + /* 191 195 * Flexible array member at end of structure, after last feature field. 192 196 */ 193 197 char end[]; 194 - } __attribute__((aligned(4 * sizeof(__u64)))); 198 + } __attribute__((aligned(32))); 195 199 196 200 #endif /* _UAPI_LINUX_RSEQ_H */
+5 -3
kernel/rseq.c
··· 80 80 #include <linux/syscalls.h> 81 81 #include <linux/uaccess.h> 82 82 #include <linux/types.h> 83 + #include <linux/rseq.h> 83 84 #include <asm/ptrace.h> 84 85 85 86 #define CREATE_TRACE_POINTS ··· 450 449 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq 451 450 * size, the required alignment is the original struct rseq alignment. 452 451 * 453 - * In order to be valid, rseq_len is either the original rseq size, or 454 - * large enough to contain all supported fields, as communicated to 452 + * The rseq_len is required to be greater or equal to the original rseq 453 + * size. In order to be valid, rseq_len is either the original rseq size, 454 + * or large enough to contain all supported fields, as communicated to 455 455 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. 456 456 */ 457 457 if (rseq_len < ORIG_RSEQ_SIZE || 458 458 (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || 459 - (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || 459 + (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) || 460 460 rseq_len < offsetof(struct rseq, end)))) 461 461 return -EINVAL; 462 462 if (!access_ok(rseq, rseq_len))
+1
kernel/sched/core.c
··· 6830 6830 /* SCX must consult the BPF scheduler to tell if rq is empty */ 6831 6831 if (!rq->nr_running && !scx_enabled()) { 6832 6832 next = prev; 6833 + rq->next_class = &idle_sched_class; 6833 6834 goto picked; 6834 6835 } 6835 6836 } else if (!preempt && prev_state) {
+2 -2
kernel/sched/ext.c
··· 2460 2460 /* see kick_cpus_irq_workfn() */ 2461 2461 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 2462 2462 2463 - rq->next_class = &ext_sched_class; 2463 + rq_modified_begin(rq, &ext_sched_class); 2464 2464 2465 2465 rq_unpin_lock(rq, rf); 2466 2466 balance_one(rq, prev); ··· 2475 2475 * If @force_scx is true, always try to pick a SCHED_EXT task, 2476 2476 * regardless of any higher-priority sched classes activity. 2477 2477 */ 2478 - if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class)) 2478 + if (!force_scx && rq_modified_above(rq, &ext_sched_class)) 2479 2479 return RETRY_TASK; 2480 2480 2481 2481 keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+112 -38
kernel/sched/fair.c
··· 589 589 return vruntime_cmp(a->deadline, "<", b->deadline); 590 590 } 591 591 592 + /* 593 + * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale 594 + * and this value should be no more than two lag bounds. Which puts it in the 595 + * general order of: 596 + * 597 + * (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT 598 + * 599 + * which is around 44 bits in size (on 64bit); that is 20 for 600 + * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for 601 + * however many msec the actual slice+tick ends up begin. 602 + * 603 + * (disregarding the actual divide-by-weight part makes for the worst case 604 + * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually 605 + * being the zero-lag point). 606 + */ 592 607 static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 593 608 { 594 609 return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime); ··· 691 676 } 692 677 693 678 static inline 694 - void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) 679 + void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta) 695 680 { 696 681 /* 697 - * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight 682 + * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight 698 683 */ 699 684 cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta; 685 + cfs_rq->zero_vruntime += delta; 700 686 } 701 687 702 688 /* 703 - * Specifically: avg_runtime() + 0 must result in entity_eligible() := true 689 + * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true 704 690 * For this to be so, the result of this function must have a left bias. 691 + * 692 + * Called in: 693 + * - place_entity() -- before enqueue 694 + * - update_entity_lag() -- before dequeue 695 + * - entity_tick() 696 + * 697 + * This means it is one entry 'behind' but that puts it close enough to where 698 + * the bound on entity_key() is at most two lag bounds. 
705 699 */ 706 700 u64 avg_vruntime(struct cfs_rq *cfs_rq) 707 701 { 708 702 struct sched_entity *curr = cfs_rq->curr; 709 - s64 avg = cfs_rq->sum_w_vruntime; 710 - long load = cfs_rq->sum_weight; 703 + long weight = cfs_rq->sum_weight; 704 + s64 delta = 0; 711 705 712 - if (curr && curr->on_rq) { 713 - unsigned long weight = scale_load_down(curr->load.weight); 706 + if (curr && !curr->on_rq) 707 + curr = NULL; 714 708 715 - avg += entity_key(cfs_rq, curr) * weight; 716 - load += weight; 717 - } 709 + if (weight) { 710 + s64 runtime = cfs_rq->sum_w_vruntime; 718 711 719 - if (load) { 712 + if (curr) { 713 + unsigned long w = scale_load_down(curr->load.weight); 714 + 715 + runtime += entity_key(cfs_rq, curr) * w; 716 + weight += w; 717 + } 718 + 720 719 /* sign flips effective floor / ceiling */ 721 - if (avg < 0) 722 - avg -= (load - 1); 723 - avg = div_s64(avg, load); 720 + if (runtime < 0) 721 + runtime -= (weight - 1); 722 + 723 + delta = div_s64(runtime, weight); 724 + } else if (curr) { 725 + /* 726 + * When there is but one element, it is the average. 727 + */ 728 + delta = curr->vruntime - cfs_rq->zero_vruntime; 724 729 } 725 730 726 - return cfs_rq->zero_vruntime + avg; 731 + update_zero_vruntime(cfs_rq, delta); 732 + 733 + return cfs_rq->zero_vruntime; 727 734 } 735 + 736 + static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq); 728 737 729 738 /* 730 739 * lag_i = S - s_i = w_i * (V - v_i) ··· 763 724 * EEVDF gives the following limit for a steady state system: 764 725 * 765 726 * -r_max < lag < max(r_max, q) 766 - * 767 - * XXX could add max_slice to the augmented data to track this. 
768 727 */ 769 728 static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) 770 729 { 730 + u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC; 771 731 s64 vlag, limit; 772 732 773 733 WARN_ON_ONCE(!se->on_rq); 774 734 775 735 vlag = avg_vruntime(cfs_rq) - se->vruntime; 776 - limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); 736 + limit = calc_delta_fair(max_slice, se); 777 737 778 738 se->vlag = clamp(vlag, -limit, limit); 779 739 } ··· 815 777 return vruntime_eligible(cfs_rq, se->vruntime); 816 778 } 817 779 818 - static void update_zero_vruntime(struct cfs_rq *cfs_rq) 819 - { 820 - u64 vruntime = avg_vruntime(cfs_rq); 821 - s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); 822 - 823 - sum_w_vruntime_update(cfs_rq, delta); 824 - 825 - cfs_rq->zero_vruntime = vruntime; 826 - } 827 - 828 780 static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) 829 781 { 830 782 struct sched_entity *root = __pick_root_entity(cfs_rq); ··· 828 800 min_slice = min(min_slice, root->min_slice); 829 801 830 802 return min_slice; 803 + } 804 + 805 + static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq) 806 + { 807 + struct sched_entity *root = __pick_root_entity(cfs_rq); 808 + struct sched_entity *curr = cfs_rq->curr; 809 + u64 max_slice = 0ULL; 810 + 811 + if (curr && curr->on_rq) 812 + max_slice = curr->slice; 813 + 814 + if (root) 815 + max_slice = max(max_slice, root->max_slice); 816 + 817 + return max_slice; 831 818 } 832 819 833 820 static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) ··· 869 826 } 870 827 } 871 828 829 + static inline void __max_slice_update(struct sched_entity *se, struct rb_node *node) 830 + { 831 + if (node) { 832 + struct sched_entity *rse = __node_2_se(node); 833 + if (rse->max_slice > se->max_slice) 834 + se->max_slice = rse->max_slice; 835 + } 836 + } 837 + 872 838 /* 873 839 * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) 874 840 */ ··· 885 833 { 886 834 
u64 old_min_vruntime = se->min_vruntime; 887 835 u64 old_min_slice = se->min_slice; 836 + u64 old_max_slice = se->max_slice; 888 837 struct rb_node *node = &se->run_node; 889 838 890 839 se->min_vruntime = se->vruntime; ··· 896 843 __min_slice_update(se, node->rb_right); 897 844 __min_slice_update(se, node->rb_left); 898 845 846 + se->max_slice = se->slice; 847 + __max_slice_update(se, node->rb_right); 848 + __max_slice_update(se, node->rb_left); 849 + 899 850 return se->min_vruntime == old_min_vruntime && 900 - se->min_slice == old_min_slice; 851 + se->min_slice == old_min_slice && 852 + se->max_slice == old_max_slice; 901 853 } 902 854 903 855 RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, ··· 914 856 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 915 857 { 916 858 sum_w_vruntime_add(cfs_rq, se); 917 - update_zero_vruntime(cfs_rq); 918 859 se->min_vruntime = se->vruntime; 919 860 se->min_slice = se->slice; 920 861 rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, ··· 925 868 rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, 926 869 &min_vruntime_cb); 927 870 sum_w_vruntime_sub(cfs_rq, se); 928 - update_zero_vruntime(cfs_rq); 929 871 } 930 872 931 873 struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) ··· 3846 3790 unsigned long weight) 3847 3791 { 3848 3792 bool curr = cfs_rq->curr == se; 3793 + bool rel_vprot = false; 3794 + u64 vprot; 3849 3795 3850 3796 if (se->on_rq) { 3851 3797 /* commit outstanding execution time */ ··· 3855 3797 update_entity_lag(cfs_rq, se); 3856 3798 se->deadline -= se->vruntime; 3857 3799 se->rel_deadline = 1; 3800 + if (curr && protect_slice(se)) { 3801 + vprot = se->vprot - se->vruntime; 3802 + rel_vprot = true; 3803 + } 3804 + 3858 3805 cfs_rq->nr_queued--; 3859 3806 if (!curr) 3860 3807 __dequeue_entity(cfs_rq, se); ··· 3875 3812 if (se->rel_deadline) 3876 3813 se->deadline = div_s64(se->deadline * se->load.weight, weight); 3877 3814 3815 + 
if (rel_vprot) 3816 + vprot = div_s64(vprot * se->load.weight, weight); 3817 + 3878 3818 update_load_set(&se->load, weight); 3879 3819 3880 3820 do { ··· 3889 3823 enqueue_load_avg(cfs_rq, se); 3890 3824 if (se->on_rq) { 3891 3825 place_entity(cfs_rq, se, 0); 3826 + if (rel_vprot) 3827 + se->vprot = se->vruntime + vprot; 3892 3828 update_load_add(&cfs_rq->load, se->load.weight); 3893 3829 if (!curr) 3894 3830 __enqueue_entity(cfs_rq, se); ··· 5488 5420 } 5489 5421 5490 5422 static void 5491 - set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 5423 + set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first) 5492 5424 { 5493 5425 clear_buddies(cfs_rq, se); 5494 5426 ··· 5503 5435 __dequeue_entity(cfs_rq, se); 5504 5436 update_load_avg(cfs_rq, se, UPDATE_TG); 5505 5437 5506 - set_protect_slice(cfs_rq, se); 5438 + if (first) 5439 + set_protect_slice(cfs_rq, se); 5507 5440 } 5508 5441 5509 5442 update_stats_curr_start(cfs_rq, se); ··· 5592 5523 */ 5593 5524 update_load_avg(cfs_rq, curr, UPDATE_TG); 5594 5525 update_cfs_group(curr); 5526 + 5527 + /* 5528 + * Pulls along cfs_rq::zero_vruntime. 
5529 + */ 5530 + avg_vruntime(cfs_rq); 5595 5531 5596 5532 #ifdef CONFIG_SCHED_HRTICK 5597 5533 /* ··· 9022 8948 pse = parent_entity(pse); 9023 8949 } 9024 8950 if (se_depth >= pse_depth) { 9025 - set_next_entity(cfs_rq_of(se), se); 8951 + set_next_entity(cfs_rq_of(se), se, true); 9026 8952 se = parent_entity(se); 9027 8953 } 9028 8954 } 9029 8955 9030 8956 put_prev_entity(cfs_rq, pse); 9031 - set_next_entity(cfs_rq, se); 8957 + set_next_entity(cfs_rq, se, true); 9032 8958 9033 8959 __set_next_task_fair(rq, p, true); 9034 8960 } ··· 12982 12908 t0 = sched_clock_cpu(this_cpu); 12983 12909 __sched_balance_update_blocked_averages(this_rq); 12984 12910 12985 - this_rq->next_class = &fair_sched_class; 12911 + rq_modified_begin(this_rq, &fair_sched_class); 12986 12912 raw_spin_rq_unlock(this_rq); 12987 12913 12988 12914 for_each_domain(this_cpu, sd) { ··· 13049 12975 pulled_task = 1; 13050 12976 13051 12977 /* If a higher prio class was modified, restart the pick */ 13052 - if (sched_class_above(this_rq->next_class, &fair_sched_class)) 12978 + if (rq_modified_above(this_rq, &fair_sched_class)) 13053 12979 pulled_task = -1; 13054 12980 13055 12981 out: ··· 13642 13568 for_each_sched_entity(se) { 13643 13569 struct cfs_rq *cfs_rq = cfs_rq_of(se); 13644 13570 13645 - set_next_entity(cfs_rq, se); 13571 + set_next_entity(cfs_rq, se, first); 13646 13572 /* ensure bandwidth has been allocated on our new cfs_rq */ 13647 13573 account_cfs_rq_runtime(cfs_rq, 0); 13648 13574 }
+11
kernel/sched/sched.h
··· 2748 2748 2749 2749 #define sched_class_above(_a, _b) ((_a) < (_b)) 2750 2750 2751 + static inline void rq_modified_begin(struct rq *rq, const struct sched_class *class) 2752 + { 2753 + if (sched_class_above(rq->next_class, class)) 2754 + rq->next_class = class; 2755 + } 2756 + 2757 + static inline bool rq_modified_above(struct rq *rq, const struct sched_class *class) 2758 + { 2759 + return sched_class_above(rq->next_class, class); 2760 + } 2761 + 2751 2762 static inline bool sched_stop_runnable(struct rq *rq) 2752 2763 { 2753 2764 return rq->stop && task_on_rq_queued(rq->stop);