Merge tag 'sched-urgent-2026-02-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
"Miscellaneous MMCID fixes to address bugs and performance regressions
in the recent rewrite of the SCHED_MM_CID management code:

- Fix livelock triggered by BPF CI testing

- Fix hard lockup on weakly ordered systems

- Simplify the dropping of CIDs in the exit path by removing an
unintended transition phase

- Fix performance/scalability regression on a thread-pool benchmark
by optimizing transitional CIDs when scheduling out"

* tag 'sched-urgent-2026-02-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/mmcid: Optimize transitional CIDs when scheduling out
sched/mmcid: Drop per CPU CID immediately when switching to per task mode
sched/mmcid: Protect transition on weakly ordered systems
sched/mmcid: Prevent live lock on task to CPU mode transition

+163 -71
+2 -4
include/linux/rseq_types.h
···
121 121    /**
122 122     * struct mm_mm_cid - Storage for per MM CID data
123 123     * @pcpu:	Per CPU storage for CIDs associated to a CPU
124     -   * @percpu:	Set, when CIDs are in per CPU mode
125     -   * @transit:	Set to MM_CID_TRANSIT during a mode change transition phase
    124 +   * @mode:	Indicates per CPU and transition mode
126 125     * @max_cids:	The exclusive maximum CID value for allocation and convergence
127 126     * @irq_work:	irq_work to handle the affinity mode change case
128 127     * @work:	Regular work to handle the affinity mode change case
···
138 139    struct mm_mm_cid {
139 140    	/* Hotpath read mostly members */
140 141    	struct mm_cid_pcpu	__percpu *pcpu;
141     -  	unsigned int		percpu;
142     -  	unsigned int		transit;
    142 +  	unsigned int		mode;
143 143    	unsigned int		max_cids;
144 144   
145 145    	/* Rarely used. Moves @lock and @mutex into the second cacheline */
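
The two fields collapse into a single mode word whose states are spelled out in the kernel/sched/core.c comment further down. A small stand-alone sketch of that encoding, with bit positions assumed for illustration rather than taken from the patch:

/* Illustrative only: bit positions are assumed, not taken from the patch */
#include <stdbool.h>
#include <stdio.h>

#define MM_CID_ONCPU	(1u << 31)	/* assumed: CID/mode is CPU owned      */
#define MM_CID_TRANSIT	(1u << 30)	/* assumed: ownership is in transition */

static bool cid_on_cpu(unsigned int v)     { return v & MM_CID_ONCPU; }
static bool cid_in_transit(unsigned int v) { return v & MM_CID_TRANSIT; }

int main(void)
{
	static const struct { unsigned int mode; const char *what; } tbl[] = {
		{ 0,				 "per task ownership"      },
		{ MM_CID_TRANSIT,		 "transition: CPU -> task" },
		{ MM_CID_ONCPU,			 "per CPU ownership"       },
		{ MM_CID_ONCPU | MM_CID_TRANSIT, "transition: task -> CPU" },
	};

	for (unsigned int i = 0; i < 4; i++) {
		printf("%-25s on_cpu=%d transit=%d\n", tbl[i].what,
		       cid_on_cpu(tbl[i].mode), cid_in_transit(tbl[i].mode));
	}
	return 0;
}
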
+126 -58
kernel/sched/core.c
···
10269 10269    * Serialization rules:
10270 10270    *
10271 10271    * mm::mm_cid::mutex:	Serializes fork() and exit() and therefore
10272       -  *			protects mm::mm_cid::users.
      10272 +  *			protects mm::mm_cid::users and mode switch
      10273 +  *			transitions
10273 10274    *
10274 10275    * mm::mm_cid::lock:	Serializes mm_update_max_cids() and
10275 10276    *			mm_update_cpus_allowed(). Nests in mm_cid::mutex
···
10286 10285    *
10287 10286    * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
10288 10287    * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
10289       -  * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
10290       -  * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
10291       -  * task needs to drop the CID into the pool when scheduling out. Both bits
10292       -  * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
10293       -  * actually handed over to user space in the RSEQ memory.
      10288 +  * MM_CID_ONCPU bit set.
      10289 +  *
      10290 +  * During the transition of ownership mode, the MM_CID_TRANSIT bit is set
      10291 +  * on the CIDs. When this bit is set the tasks drop the CID back into the
      10292 +  * pool when scheduling out.
      10293 +  *
      10294 +  * Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the
      10295 +  * CID is actually handed over to user space in the RSEQ memory.
10294 10296    *
10295 10297    * Mode switching:
      10298 +  *
      10299 +  * The ownership mode is per process and stored in mm:mm_cid::mode with the
      10300 +  * following possible states:
      10301 +  *
      10302 +  *  0:				Per task ownership
      10303 +  *  0 | MM_CID_TRANSIT:		Transition from per CPU to per task
      10304 +  *  MM_CID_ONCPU:			Per CPU ownership
      10305 +  *  MM_CID_ONCPU | MM_CID_TRANSIT:	Transition from per task to per CPU
      10306 +  *
      10307 +  * All transitions of ownership mode happen in two phases:
      10308 +  *
      10309 +  * 1) mm:mm_cid::mode has the MM_CID_TRANSIT bit set. This is OR'ed on the
      10310 +  *    CIDs and denotes that the CID is only temporarily owned by a
      10311 +  *    task. When the task schedules out it drops the CID back into the
      10312 +  *    pool if this bit is set.
      10313 +  *
      10314 +  * 2) The initiating context walks the per CPU space or the tasks to fixup
      10315 +  *    or drop the CIDs and after completion it clears MM_CID_TRANSIT in
      10316 +  *    mm:mm_cid::mode. After that point the CIDs are strictly task or CPU
      10317 +  *    owned again.
      10318 +  *
      10319 +  * This two phase transition is required to prevent CID space exhaustion
      10320 +  * during the transition as a direct transfer of ownership would fail:
      10321 +  *
      10322 +  * - On task to CPU mode switch if a task is scheduled in on one CPU and
      10323 +  *   then migrated to another CPU before the fixup freed enough per task
      10324 +  *   CIDs.
      10325 +  *
      10326 +  * - On CPU to task mode switch if two tasks are scheduled in on the same
      10327 +  *   CPU before the fixup freed per CPU CIDs.
      10328 +  *
      10329 +  * Both scenarios can result in a live lock because sched_in() is invoked
      10330 +  * with runqueue lock held and loops in search of a CID and the fixup
      10331 +  * thread can't make progress freeing them up because it is stuck on the
      10332 +  * same runqueue lock.
      10333 +  *
      10334 +  * While MM_CID_TRANSIT is active during the transition phase the MM_CID
      10335 +  * bitmap can be contended, but that's a temporary contention bound to the
      10336 +  * transition period. After that everything goes back into steady state and
      10337 +  * nothing except fork() and exit() will touch the bitmap. This is an
      10338 +  * acceptable tradeoff as it completely avoids complex serialization,
      10339 +  * memory barriers and atomic operations for the common case.
      10340 +  *
      10341 +  * Aside of that this mechanism also ensures RT compability:
      10342 +  *
      10343 +  * - The task which runs the fixup is fully preemptible except for the
      10344 +  *   short runqueue lock held sections.
      10345 +  *
      10346 +  * - The transient impact of the bitmap contention is only problematic
      10347 +  *   when there is a thundering herd scenario of tasks scheduling in and
      10348 +  *   out concurrently. There is not much which can be done about that
      10349 +  *   except for avoiding mode switching by a proper overall system
      10350 +  *   configuration.
10296 10351    *
10297 10352    * Switching to per CPU mode happens when the user count becomes greater
10298 10353    * than the maximum number of CIDs, which is calculated by:
···
10363 10306    *
10364 10307    * At the point of switching to per CPU mode the new user is not yet
10365 10308    * visible in the system, so the task which initiated the fork() runs the
10366       -  * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
10367       -  * either transfers each tasks owned CID to the CPU the task runs on or
10368       -  * drops it into the CID pool if a task is not on a CPU at that point in
10369       -  * time. Tasks which schedule in before the task walk reaches them do the
10370       -  * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
10371       -  * it's guaranteed that no task related to that MM owns a CID anymore.
      10309 +  * fixup function. mm_cid_fixup_tasks_to_cpu() walks the thread list and
      10310 +  * either marks each task owned CID with MM_CID_TRANSIT if the task is
      10311 +  * running on a CPU or drops it into the CID pool if a task is not on a
      10312 +  * CPU. Tasks which schedule in before the task walk reaches them do the
      10313 +  * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus()
      10314 +  * completes it is guaranteed that no task related to that MM owns a CID
      10315 +  * anymore.
10372 10316    *
10373 10317    * Switching back to task mode happens when the user count goes below the
10374 10318    * threshold which was recorded on the per CPU mode switch:
···
10385 10327    * run either in the deferred update function in context of a workqueue or
10386 10328    * by a task which forks a new one or by a task which exits. Whatever
10387 10329    * happens first. mm_cid_fixup_cpus_to_task() walks through the possible
10388       -  * CPUs and either transfers the CPU owned CIDs to a related task which
10389       -  * runs on the CPU or drops it into the pool. Tasks which schedule in on a
10390       -  * CPU which the walk did not cover yet do the handover themself.
10391       -  *
10392       -  * This transition from CPU to per task ownership happens in two phases:
10393       -  *
10394       -  * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
10395       -  *    CID and denotes that the CID is only temporarily owned by the
10396       -  *    task. When it schedules out the task drops the CID back into the
10397       -  *    pool if this bit is set.
10398       -  *
10399       -  * 2) The initiating context walks the per CPU space and after completion
10400       -  *    clears mm:mm_cid.transit. So after that point the CIDs are strictly
10401       -  *    task owned again.
10402       -  *
10403       -  * This two phase transition is required to prevent CID space exhaustion
10404       -  * during the transition as a direct transfer of ownership would fail if
10405       -  * two tasks are scheduled in on the same CPU before the fixup freed per
10406       -  * CPU CIDs.
10407       -  *
10408       -  * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
10409       -  * related to that MM is owned by a CPU anymore.
      10330 +  * CPUs and either marks the CPU owned CIDs with MM_CID_TRANSIT if a
      10331 +  * related task is running on the CPU or drops it into the pool. Tasks
      10332 +  * which are scheduled in before the fixup covered them do the handover
      10333 +  * themself. When mm_cid_fixup_cpus_to_tasks() completes it is guaranteed
      10334 +  * that no CID related to that MM is owned by a CPU anymore.
10410 10335    */
10411 10336   
10412 10337    /*
···
10420 10379    static bool mm_update_max_cids(struct mm_struct *mm)
10421 10380    {
10422 10381    	struct mm_mm_cid *mc = &mm->mm_cid;
      10382 +  	bool percpu = cid_on_cpu(mc->mode);
10423 10383   
10424 10384    	lockdep_assert_held(&mm->mm_cid.lock);
10425 10385   
···
10429 10387    	__mm_update_max_cids(mc);
10430 10388   
10431 10389    	/* Check whether owner mode must be changed */
10432       -  	if (!mc->percpu) {
      10390 +  	if (!percpu) {
10433 10391    		/* Enable per CPU mode when the number of users is above max_cids */
10434 10392    		if (mc->users > mc->max_cids)
10435 10393    			mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
···
10440 10398    	}
10441 10399   
10442 10400    	/* Mode change required? */
10443       -  	if (!!mc->percpu == !!mc->pcpu_thrs)
      10401 +  	if (percpu == !!mc->pcpu_thrs)
10444 10402    		return false;
10445       -  	/* When switching back to per TASK mode, set the transition flag */
10446       -  	if (!mc->pcpu_thrs)
10447       -  		WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
10448       -  	WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
      10403 +  
      10404 +  	/* Flip the mode and set the transition flag to bridge the transfer */
      10405 +  	WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
      10406 +  	/*
      10407 +  	 * Order the store against the subsequent fixups so that
      10408 +  	 * acquire(rq::lock) cannot be reordered by the CPU before the
      10409 +  	 * store.
      10410 +  	 */
      10411 +  	smp_mb();
10449 10412    	return true;
10450 10413    }
···
10475 10428   
10476 10429    	WRITE_ONCE(mc->nr_cpus_allowed, weight);
10477 10430    	__mm_update_max_cids(mc);
10478       -  	if (!mc->percpu)
      10431 +  	if (!cid_on_cpu(mc->mode))
10479 10432    		return;
10480 10433   
10481 10434    	/* Adjust the threshold to the wider set */
···
10491 10444    	/* Queue the irq work, which schedules the real work */
10492 10445    	mc->update_deferred = true;
10493 10446    	irq_work_queue(&mc->irq_work);
      10447 +  }
      10448 +  
      10449 +  static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
      10450 +  {
      10451 +  	/*
      10452 +  	 * Ensure that the store removing the TRANSIT bit cannot be
      10453 +  	 * reordered by the CPU before the fixups have been completed.
      10454 +  	 */
      10455 +  	smp_mb();
      10456 +  	WRITE_ONCE(mm->mm_cid.mode, mode);
10494 10457    }
10495 10458   
10496 10459    static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
···
10546 10489    			}
10547 10490    		}
10548 10491    	}
10549       -  	/* Clear the transition bit */
10550       -  	WRITE_ONCE(mm->mm_cid.transit, 0);
      10492 +  	mm_cid_complete_transit(mm, 0);
10551 10493    }
10552 10494   
10553       -  static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
      10495 +  static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
10554 10496    {
10555 10497    	if (cid_on_task(t->mm_cid.cid)) {
10556       -  		t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
      10498 +  		t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid);
10557 10499    		pcp->cid = t->mm_cid.cid;
10558 10500    	}
10559 10501    }
···
10565 10509    	if (!t->mm_cid.active)
10566 10510    		return false;
10567 10511    	if (cid_on_task(t->mm_cid.cid)) {
10568       -  		/* If running on the CPU, transfer the CID, otherwise drop it */
      10512 +  		/* If running on the CPU, put the CID in transit mode, otherwise drop it */
10569 10513    		if (task_rq(t)->curr == t)
10570       -  			mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
      10514 +  			mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
10571 10515    		else
10572 10516    			mm_unset_cid_on_task(t);
10573 10517    	}
10574 10518    	return true;
10575 10519    }
10576 10520   
10577       -  static void mm_cid_fixup_tasks_to_cpus(void)
      10521 +  static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
10578 10522    {
10579       -  	struct mm_struct *mm = current->mm;
10580 10523    	struct task_struct *p, *t;
10581 10524    	unsigned int users;
10582 10525   
···
10613 10558    		}
10614 10559    	}
10615 10560    }
      10561 +  
      10562 +  static void mm_cid_fixup_tasks_to_cpus(void)
      10563 +  {
      10564 +  	struct mm_struct *mm = current->mm;
      10565 +  
      10566 +  	mm_cid_do_fixup_tasks_to_cpus(mm);
      10567 +  	mm_cid_complete_transit(mm, MM_CID_ONCPU);
      10568 +  }
      10569 +  
10616 10570    static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
10617 10571    {
10618 10572    	t->mm_cid.active = 1;
···
10649 10586    		}
10650 10587   
10651 10588    		if (!sched_mm_cid_add_user(t, mm)) {
10652       -  			if (!mm->mm_cid.percpu)
      10589 +  			if (!cid_on_cpu(mm->mm_cid.mode))
10653 10590    				t->mm_cid.cid = mm_get_cid(mm);
10654 10591    			return;
10655 10592    		}
10656 10593   
10657 10594    		/* Handle the mode change and transfer current's CID */
10658       -  		percpu = !!mm->mm_cid.percpu;
      10595 +  		percpu = cid_on_cpu(mm->mm_cid.mode);
10659 10596    		if (!percpu)
10660 10597    			mm_cid_transit_to_task(current, pcp);
10661 10598    		else
10662       -  			mm_cid_transfer_to_cpu(current, pcp);
      10599 +  			mm_cid_transit_to_cpu(current, pcp);
10663 10600    	}
10664 10601   
10665 10602    	if (percpu) {
···
10694 10631    	 * affinity change increased the number of allowed CPUs and the
10695 10632    	 * deferred fixup did not run yet.
10696 10633    	 */
10697       -  	if (WARN_ON_ONCE(mm->mm_cid.percpu))
      10634 +  	if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
10698 10635    		return false;
10699 10636    	/*
10700 10637    	 * A failed fork(2) cleanup never gets here, so @current must have
···
10727 10664    	scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
10728 10665    		if (!__sched_mm_cid_exit(t))
10729 10666    			return;
10730       -  		/* Mode change required. Transfer currents CID */
10731       -  		mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
      10667 +  		/*
      10668 +  		 * Mode change. The task has the CID unset
      10669 +  		 * already. The CPU CID is still valid and
      10670 +  		 * does not have MM_CID_TRANSIT set as the
      10671 +  		 * mode change has just taken effect under
      10672 +  		 * mm::mm_cid::lock. Drop it.
      10673 +  		 */
      10674 +  		mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
10732 10675    	}
10733 10676    	mm_cid_fixup_cpus_to_tasks(mm);
10734 10677    	return;
···
10791 10722    		if (!mm_update_max_cids(mm))
10792 10723    			return;
10793 10724    		/* Affinity changes can only switch back to task mode */
10794       -  		if (WARN_ON_ONCE(mm->mm_cid.percpu))
      10725 +  		if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
10795 10726    			return;
10796 10727    	}
10797 10728    	mm_cid_fixup_cpus_to_tasks(mm);
···
10812 10743    void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
10813 10744    {
10814 10745    	mm->mm_cid.max_cids = 0;
10815       -  	mm->mm_cid.percpu = 0;
10816       -  	mm->mm_cid.transit = 0;
      10746 +  	mm->mm_cid.mode = 0;
10817 10747    	mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
10818 10748    	mm->mm_cid.users = 0;
10819 10749    	mm->mm_cid.pcpu_thrs = 0;
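
A condensed user space model of the two phase switch documented in the comment block above, with C11 fences standing in for the smp_mb() calls the patches add. Only the ordering contract is modeled here (the mode flip must be visible before the fixup walk starts, and TRANSIT may only be cleared after the fixups are done); the fixup walk is a placeholder and the bit values are assumed:

/* Model only: C11 fences stand in for smp_mb(); bit values are assumed */
#include <stdatomic.h>
#include <stdio.h>

#define MM_CID_ONCPU	(1u << 31)
#define MM_CID_TRANSIT	(1u << 30)

static _Atomic unsigned int mode;	/* models mm::mm_cid::mode */

static void fixup_tasks_to_cpus(void)
{
	/* Placeholder for the thread walk done by mm_cid_do_fixup_tasks_to_cpus() */
}

static void switch_task_to_percpu(void)
{
	/* Phase 1: flip ownership and flag the transition in one store */
	atomic_store_explicit(&mode, MM_CID_ONCPU | MM_CID_TRANSIT, memory_order_relaxed);
	/* Like smp_mb(): the flip must be visible before the fixup walk starts */
	atomic_thread_fence(memory_order_seq_cst);

	fixup_tasks_to_cpus();

	/* Phase 2: fixups done; only now may the TRANSIT bit disappear */
	atomic_thread_fence(memory_order_seq_cst);
	atomic_store_explicit(&mode, MM_CID_ONCPU, memory_order_relaxed);
}

int main(void)
{
	switch_task_to_percpu();
	printf("mode=%#x\n", atomic_load(&mode));
	return 0;
}
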
+35 -9
kernel/sched/sched.h
···
3816 3816    	__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
3817 3817    }
3818 3818   
3819      -  static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
     3819 +  static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid,
     3820 +  					     unsigned int mode)
3820 3821    {
3821 3822    	unsigned int max_cids, tcid = t->mm_cid.cid;
3822 3823    	struct mm_struct *mm = t->mm;
···
3842 3841    		/* Still nothing, allocate a new one */
3843 3842    		if (!cid_on_cpu(cpu_cid))
3844 3843    			cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
     3844 +  
     3845 +  		/* Handle the transition mode flag if required */
     3846 +  		if (mode & MM_CID_TRANSIT)
     3847 +  			cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT;
3845 3848    	}
3846 3849    	mm_cid_update_pcpu_cid(mm, cpu_cid);
3847 3850    	mm_cid_update_task_cid(t, cpu_cid);
3848 3851    }
3849 3852   
3850      -  static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
     3853 +  static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid,
     3854 +  					      unsigned int mode)
3851 3855    {
3852 3856    	unsigned int max_cids, tcid = t->mm_cid.cid;
3853 3857    	struct mm_struct *mm = t->mm;
···
3878 3872    		if (!cid_on_task(tcid))
3879 3873    			tcid = mm_get_cid(mm);
3880 3874    		/* Set the transition mode flag if required */
3881      -  		tcid |= READ_ONCE(mm->mm_cid.transit);
     3875 +  		tcid |= mode & MM_CID_TRANSIT;
3882 3876    	}
3883 3877    	mm_cid_update_pcpu_cid(mm, tcid);
3884 3878    	mm_cid_update_task_cid(t, tcid);
···
3887 3881    static __always_inline void mm_cid_schedin(struct task_struct *next)
3888 3882    {
3889 3883    	struct mm_struct *mm = next->mm;
3890      -  	unsigned int cpu_cid;
     3884 +  	unsigned int cpu_cid, mode;
3891 3885   
3892 3886    	if (!next->mm_cid.active)
3893 3887    		return;
3894 3888   
3895 3889    	cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
3896      -  	if (likely(!READ_ONCE(mm->mm_cid.percpu)))
3897      -  		mm_cid_from_task(next, cpu_cid);
     3890 +  	mode = READ_ONCE(mm->mm_cid.mode);
     3891 +  	if (likely(!cid_on_cpu(mode)))
     3892 +  		mm_cid_from_task(next, cpu_cid, mode);
3898 3893    	else
3899      -  		mm_cid_from_cpu(next, cpu_cid);
     3894 +  		mm_cid_from_cpu(next, cpu_cid, mode);
3900 3895    }
3901 3896   
3902 3897    static __always_inline void mm_cid_schedout(struct task_struct *prev)
3903 3898    {
     3899 +  	struct mm_struct *mm = prev->mm;
     3900 +  	unsigned int mode, cid;
     3901 +  
3904 3902    	/* During mode transitions CIDs are temporary and need to be dropped */
3905 3903    	if (likely(!cid_in_transit(prev->mm_cid.cid)))
3906 3904    		return;
3907 3905   
3908      -  	mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
3909      -  	prev->mm_cid.cid = MM_CID_UNSET;
     3906 +  	mode = READ_ONCE(mm->mm_cid.mode);
     3907 +  	cid = cid_from_transit_cid(prev->mm_cid.cid);
     3908 +  
     3909 +  	/*
     3910 +  	 * If transition mode is done, transfer ownership when the CID is
     3911 +  	 * within the convergence range to optimize the next schedule in.
     3912 +  	 */
     3913 +  	if (!cid_in_transit(mode) && cid < READ_ONCE(mm->mm_cid.max_cids)) {
     3914 +  		if (cid_on_cpu(mode))
     3915 +  			cid = cid_to_cpu_cid(cid);
     3916 +  
     3917 +  		/* Update both so that the next schedule in goes into the fast path */
     3918 +  		mm_cid_update_pcpu_cid(mm, cid);
     3919 +  		prev->mm_cid.cid = cid;
     3920 +  	} else {
     3921 +  		mm_drop_cid(mm, cid);
     3922 +  		prev->mm_cid.cid = MM_CID_UNSET;
     3923 +  	}
3910 3924    }
3911 3925   
3912 3926    static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
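
The schedule-out decision added by the "Optimize transitional CIDs when scheduling out" patch, reduced to a stand-alone sketch: a transitional CID is kept and converted to the final ownership mode when the transition has already completed and the CID lies inside the convergence range, otherwise it goes back into the pool. The helpers and bit positions below are simplified assumptions, not the kernel implementations:

/* Illustrative only: helper names and bit positions are assumptions */
#include <stdbool.h>
#include <stdio.h>

#define MM_CID_ONCPU	(1u << 31)
#define MM_CID_TRANSIT	(1u << 30)

static bool cid_in_transit(unsigned int v) { return v & MM_CID_TRANSIT; }
static bool cid_on_cpu(unsigned int v)     { return v & MM_CID_ONCPU; }

static unsigned int schedout_cid(unsigned int tcid, unsigned int mode,
				 unsigned int max_cids, bool *drop)
{
	unsigned int cid = tcid & ~MM_CID_TRANSIT;	/* strip the TRANSIT bit */

	if (!cid_in_transit(mode) && cid < max_cids) {
		/* Transition finished: hand the CID over in the final mode */
		*drop = false;
		return cid_on_cpu(mode) ? (cid | MM_CID_ONCPU) : cid;
	}
	/* Still transitioning or out of range: give it back to the pool */
	*drop = true;
	return cid;
}

int main(void)
{
	bool drop;
	unsigned int cid = schedout_cid(5 | MM_CID_TRANSIT, MM_CID_ONCPU, 8, &drop);

	printf("cid=%#x drop=%d\n", cid, drop);
	return 0;
}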