Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched/mmcid: Switch over to the new mechanism

Now that all pieces are in place, change the implementations of
sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict
ownership scheme and switch context_switch() over to use the new
mm_cid_schedin() functionality.

The common case is that there is no mode change required, which makes
fork() and exit() just update the user count and the constraints.

In case a new user would exceed the CID space limit, the fork() context
handles the transition to per CPU mode with mm::mm_cid::mutex held. exit()
handles the transition back to per task mode when the user count drops
below the switch back threshold. fork() might also be forced to handle a
deferred switch back to per task mode, when an affinity change increased the
number of allowed CPUs enough.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.280380631@linutronix.de

+103 -116
-19
include/linux/rseq.h
··· 84 84 t->rseq.event.ids_changed = true; 85 85 } 86 86 87 - /* 88 - * Invoked from switch_mm_cid() in context switch when the task gets a MM 89 - * CID assigned. 90 - * 91 - * This does not raise TIF_NOTIFY_RESUME as that happens in 92 - * rseq_sched_switch_event(). 93 - */ 94 - static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) 95 - { 96 - /* 97 - * Requires a comparison as the switch_mm_cid() code does not 98 - * provide a conditional for it readily. So avoid excessive updates 99 - * when nothing changes. 100 - */ 101 - if (t->rseq.ids.mm_cid != cid) 102 - t->rseq.event.ids_changed = true; 103 - } 104 - 105 87 /* Enforce a full update after RSEQ registration and when execve() failed */ 106 88 static inline void rseq_force_update(void) 107 89 { ··· 151 169 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } 152 170 static inline void rseq_sched_switch_event(struct task_struct *t) { } 153 171 static inline void rseq_sched_set_ids_changed(struct task_struct *t) { } 154 - static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { } 155 172 static inline void rseq_force_update(void) { } 156 173 static inline void rseq_virt_userspace_exit(void) { } 157 174 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
+4 -4
include/linux/rseq_types.h
··· 101 101 /** 102 102 * struct sched_mm_cid - Storage for per task MM CID data 103 103 * @active: MM CID is active for the task 104 - * @cid: The CID associated to the task 105 - * @last_cid: The last CID associated to the task 104 + * @cid: The CID associated to the task either permanently or 105 + * borrowed from the CPU 106 106 */ 107 107 struct sched_mm_cid { 108 108 unsigned int active; 109 109 unsigned int cid; 110 - unsigned int last_cid; 111 110 }; 112 111 113 112 /** 114 113 * struct mm_cid_pcpu - Storage for per CPU MM_CID data 115 - * @cid: The CID associated to the CPU 114 + * @cid: The CID associated to the CPU either permanently or 115 + * while a task with a CID is running 116 116 */ 117 117 struct mm_cid_pcpu { 118 118 unsigned int cid;
-1
kernel/fork.c
··· 956 956 957 957 #ifdef CONFIG_SCHED_MM_CID 958 958 tsk->mm_cid.cid = MM_CID_UNSET; 959 - tsk->mm_cid.last_cid = MM_CID_UNSET; 960 959 tsk->mm_cid.active = 0; 961 960 #endif 962 961 return tsk;
+99 -16
kernel/sched/core.c
··· 5307 5307 } 5308 5308 } 5309 5309 5310 - switch_mm_cid(prev, next); 5310 + mm_cid_switch_to(prev, next); 5311 5311 5312 5312 /* 5313 5313 * Tell rseq that the task was scheduled in. Must be after ··· 10624 10624 return true; 10625 10625 } 10626 10626 10627 - static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void) 10627 + static void mm_cid_fixup_tasks_to_cpus(void) 10628 10628 { 10629 10629 struct mm_struct *mm = current->mm; 10630 10630 struct task_struct *p, *t; ··· 10674 10674 void sched_mm_cid_fork(struct task_struct *t) 10675 10675 { 10676 10676 struct mm_struct *mm = t->mm; 10677 + bool percpu; 10677 10678 10678 10679 WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); 10679 10680 10680 10681 guard(mutex)(&mm->mm_cid.mutex); 10681 - scoped_guard(raw_spinlock, &mm->mm_cid.lock) { 10682 - sched_mm_cid_add_user(t, mm); 10683 - /* Preset last_cid for mm_cid_select() */ 10684 - t->mm_cid.last_cid = mm->mm_cid.max_cids - 1; 10682 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10683 + struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu); 10684 + 10685 + /* First user ? 
*/ 10686 + if (!mm->mm_cid.users) { 10687 + sched_mm_cid_add_user(t, mm); 10688 + t->mm_cid.cid = mm_get_cid(mm); 10689 + /* Required for execve() */ 10690 + pcp->cid = t->mm_cid.cid; 10691 + return; 10692 + } 10693 + 10694 + if (!sched_mm_cid_add_user(t, mm)) { 10695 + if (!mm->mm_cid.percpu) 10696 + t->mm_cid.cid = mm_get_cid(mm); 10697 + return; 10698 + } 10699 + 10700 + /* Handle the mode change and transfer current's CID */ 10701 + percpu = !!mm->mm_cid.percpu; 10702 + if (!percpu) 10703 + mm_cid_transit_to_task(current, pcp); 10704 + else 10705 + mm_cid_transfer_to_cpu(current, pcp); 10706 + } 10707 + 10708 + if (percpu) { 10709 + mm_cid_fixup_tasks_to_cpus(); 10710 + } else { 10711 + mm_cid_fixup_cpus_to_tasks(mm); 10712 + t->mm_cid.cid = mm_get_cid(mm); 10685 10713 } 10686 10714 } 10687 10715 10688 10716 static bool sched_mm_cid_remove_user(struct task_struct *t) 10689 10717 { 10690 10718 t->mm_cid.active = 0; 10691 - mm_unset_cid_on_task(t); 10719 + scoped_guard(preempt) { 10720 + /* Clear the transition bit */ 10721 + t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); 10722 + mm_unset_cid_on_task(t); 10723 + } 10692 10724 t->mm->mm_cid.users--; 10693 10725 return mm_update_max_cids(t->mm); 10726 + } 10727 + 10728 + static bool __sched_mm_cid_exit(struct task_struct *t) 10729 + { 10730 + struct mm_struct *mm = t->mm; 10731 + 10732 + if (!sched_mm_cid_remove_user(t)) 10733 + return false; 10734 + /* 10735 + * Contrary to fork() this only deals with a switch back to per 10736 + * task mode either because the above decreased users or an 10737 + * affinity change increased the number of allowed CPUs and the 10738 + * deferred fixup did not run yet. 10739 + */ 10740 + if (WARN_ON_ONCE(mm->mm_cid.percpu)) 10741 + return false; 10742 + /* 10743 + * A failed fork(2) cleanup never gets here, so @current must have 10744 + * the same MM as @t. That's true for exit() and the failed 10745 + * pthread_create() cleanup case. 
10746 + */ 10747 + if (WARN_ON_ONCE(current->mm != mm)) 10748 + return false; 10749 + return true; 10694 10750 } 10695 10751 10696 10752 /* ··· 10759 10703 10760 10704 if (!mm || !t->mm_cid.active) 10761 10705 return; 10706 + /* 10707 + * Ensure that only one instance is doing MM CID operations within 10708 + * a MM. The common case is uncontended. The rare fixup case adds 10709 + * some overhead. 10710 + */ 10711 + scoped_guard(mutex, &mm->mm_cid.mutex) { 10712 + /* mm_cid::mutex is sufficient to protect mm_cid::users */ 10713 + if (likely(mm->mm_cid.users > 1)) { 10714 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10715 + if (!__sched_mm_cid_exit(t)) 10716 + return; 10717 + /* Mode change required. Transfer currents CID */ 10718 + mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu)); 10719 + } 10720 + mm_cid_fixup_cpus_to_tasks(mm); 10721 + return; 10722 + } 10723 + /* Last user */ 10724 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10725 + /* Required across execve() */ 10726 + if (t == current) 10727 + mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu)); 10728 + /* Ignore mode change. There is nothing to do. */ 10729 + sched_mm_cid_remove_user(t); 10730 + } 10731 + } 10762 10732 10763 - guard(mutex)(&mm->mm_cid.mutex); 10764 - scoped_guard(raw_spinlock, &mm->mm_cid.lock) 10765 - sched_mm_cid_remove_user(t); 10733 + /* 10734 + * As this is the last user (execve(), process exit or failed 10735 + * fork(2)) there is no concurrency anymore. 10736 + * 10737 + * Synchronize eventually pending work to ensure that there are no 10738 + * dangling references left. @t->mm_cid.users is zero so nothing 10739 + * can queue this work anymore. 
10740 + */ 10741 + irq_work_sync(&mm->mm_cid.irq_work); 10742 + cancel_work_sync(&mm->mm_cid.work); 10766 10743 } 10767 10744 10768 10745 /* Deactivate MM CID allocation across execve() */ ··· 10808 10719 void sched_mm_cid_after_execve(struct task_struct *t) 10809 10720 { 10810 10721 sched_mm_cid_fork(t); 10811 - guard(preempt)(); 10812 - mm_cid_select(t); 10813 10722 } 10814 10723 10815 10724 static void mm_cid_work_fn(struct work_struct *work) 10816 10725 { 10817 10726 struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work); 10818 - 10819 - /* Make it compile, but not functional yet */ 10820 - if (!IS_ENABLED(CONFIG_NEW_MM_CID)) 10821 - return; 10822 10727 10823 10728 guard(mutex)(&mm->mm_cid.mutex); 10824 10729 /* Did the last user task exit already? */
-76
kernel/sched/sched.h
··· 3745 3745 mm_cid_schedin(next); 3746 3746 } 3747 3747 3748 - /* Active implementation */ 3749 - static inline void init_sched_mm_cid(struct task_struct *t) 3750 - { 3751 - struct mm_struct *mm = t->mm; 3752 - unsigned int max_cid; 3753 - 3754 - if (!mm) 3755 - return; 3756 - 3757 - /* Preset last_mm_cid */ 3758 - max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users)); 3759 - t->mm_cid.last_cid = max_cid - 1; 3760 - } 3761 - 3762 - static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids) 3763 - { 3764 - struct mm_struct *mm = t->mm; 3765 - 3766 - if (cid >= max_cids) 3767 - return false; 3768 - if (test_and_set_bit(cid, mm_cidmask(mm))) 3769 - return false; 3770 - t->mm_cid.cid = t->mm_cid.last_cid = cid; 3771 - __this_cpu_write(mm->mm_cid.pcpu->cid, cid); 3772 - return true; 3773 - } 3774 - 3775 - static inline bool mm_cid_get(struct task_struct *t) 3776 - { 3777 - struct mm_struct *mm = t->mm; 3778 - unsigned int max_cids; 3779 - 3780 - max_cids = READ_ONCE(mm->mm_cid.max_cids); 3781 - 3782 - /* Try to reuse the last CID of this task */ 3783 - if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids)) 3784 - return true; 3785 - 3786 - /* Try to reuse the last CID of this mm on this CPU */ 3787 - if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids)) 3788 - return true; 3789 - 3790 - /* Try the first zero bit in the cidmask. */ 3791 - return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), num_possible_cpus()), max_cids); 3792 - } 3793 - 3794 - static inline void mm_cid_select(struct task_struct *t) 3795 - { 3796 - /* 3797 - * mm_cid_get() can fail when the maximum CID, which is determined 3798 - * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently. 3799 - * That's a transient failure as there cannot be more tasks 3800 - * concurrently on a CPU (or about to be scheduled in) than that. 
3801 - */ 3802 - for (;;) { 3803 - if (mm_cid_get(t)) 3804 - break; 3805 - } 3806 - } 3807 - 3808 - static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) 3809 - { 3810 - if (prev->mm_cid.active) { 3811 - if (prev->mm_cid.cid != MM_CID_UNSET) 3812 - clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm)); 3813 - prev->mm_cid.cid = MM_CID_UNSET; 3814 - } 3815 - 3816 - if (next->mm_cid.active) { 3817 - mm_cid_select(next); 3818 - rseq_sched_set_task_mm_cid(next, next->mm_cid.cid); 3819 - } 3820 - } 3821 - 3822 3748 #else /* !CONFIG_SCHED_MM_CID: */ 3823 - static inline void mm_cid_select(struct task_struct *t) { } 3824 - static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } 3825 3749 static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { } 3826 3750 #endif /* !CONFIG_SCHED_MM_CID */ 3827 3751