Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

sched: Introduce per-memory-map concurrency ID

This feature allows the scheduler to expose a per-memory-map concurrency
ID to user-space. The concurrency ID lies within the range of possible
CPUs, and is temporarily (and uniquely) assigned to each thread while it
is actively running within a memory map. If a memory map has fewer
threads than cores, or is restricted to run on only a few cores
concurrently through sched affinity or cgroup cpusets, the concurrency
IDs stay close to 0, allowing efficient use of user-space memory for
per-cpu data structures.

This feature is meant to be exposed by a new rseq thread area field.

The primary purpose of this feature is to do the heavy-lifting needed
by memory allocators to allow them to use per-cpu data structures
efficiently in the following situations:

- Single-threaded applications,
- Multi-threaded applications on large systems (many cores) with limited
cpu affinity mask,
- Multi-threaded applications on large systems (many cores) with
restricted cgroup cpuset per container.

One of the key concerns raised by scheduler maintainers is the overhead
of additional spin locks or atomic operations in the scheduler
fast path. This is why the following optimization is implemented.

On context switch between threads belonging to the same memory map,
transfer the mm_cid from prev to next without any atomic ops. This
takes care of use-cases involving frequent context switches between
threads belonging to the same memory map.

Additional optimizations can be done if the spin locks added when
context switching between threads belonging to different memory maps end
up being a performance bottleneck. Those are left out of this patch
though. A performance impact would have to be clearly demonstrated to
justify the added complexity.

Credit goes to Paul Turner (Google) for the original virtual cpu id
idea. This feature is implemented based on discussions with Paul
Turner and Peter Oskolkov (Google), but I took the liberty of
implementing scheduler fast-path optimizations and my own NUMA-awareness
scheme. Rumor has it that Google has been running an rseq vcpu_id
extension internally in production for a year. The tcmalloc source code
indeed contains comments hinting at a vcpu_id prototype extension to the
rseq system call [1].

The following benchmarks do not show any significant overhead added to
the scheduler context switch by this feature:

* perf bench sched messaging (process)

Baseline: 86.5±0.3 ms
With mm_cid: 86.7±2.6 ms

* perf bench sched messaging (threaded)

Baseline: 84.3±3.0 ms
With mm_cid: 84.7±2.6 ms

* hackbench (process)

Baseline: 82.9±2.7 ms
With mm_cid: 82.9±2.9 ms

* hackbench (threaded)

Baseline: 85.2±2.6 ms
With mm_cid: 84.4±2.9 ms

[1] https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/linux_syscall_support.h#L26

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-8-mathieu.desnoyers@efficios.com

commit af7f588d (parent 99babd04)
authored by Mathieu Desnoyers, committed by Peter Zijlstra
9 files changed: +198 -2
fs/exec.c (+4):

···
     active_mm = tsk->active_mm;
     tsk->active_mm = mm;
     tsk->mm = mm;
+    mm_init_cid(mm);
     /*
      * This prevents preemption while active_mm is being loaded and
      * it and mm are being updated, which could cause problems for
···
      */
     check_unsafe_exec(bprm);
     current->in_execve = 1;
+    sched_mm_cid_before_execve(current);

     file = do_open_execat(fd, filename, flags);
     retval = PTR_ERR(file);
···
     if (retval < 0)
         goto out;

+    sched_mm_cid_after_execve(current);
     /* execve succeeded */
     current->fs->in_exec = 0;
     current->in_execve = 0;
···
         force_fatal_sig(SIGSEGV);

 out_unmark:
+    sched_mm_cid_after_execve(current);
     current->fs->in_exec = 0;
     current->in_execve = 0;
include/linux/mm.h (+25):

···
 /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
 #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))

+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_before_execve(struct task_struct *t);
+void sched_mm_cid_after_execve(struct task_struct *t);
+void sched_mm_cid_fork(struct task_struct *t);
+void sched_mm_cid_exit_signals(struct task_struct *t);
+static inline int task_mm_cid(struct task_struct *t)
+{
+    return t->mm_cid;
+}
+#else
+static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
+static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline int task_mm_cid(struct task_struct *t)
+{
+    /*
+     * Use the processor id as a fall-back when the mm cid feature is
+     * disabled. This provides functional per-cpu data structure accesses
+     * in user-space, although it won't provide the memory usage benefits.
+     */
+    return raw_smp_processor_id();
+}
+#endif
+
 #ifdef CONFIG_MMU
 extern bool can_do_mlock(void);
 #else
include/linux/mm_types.h (+42 -1):

···
      * &struct mm_struct is freed.
      */
     atomic_t mm_count;
-
+#ifdef CONFIG_SCHED_MM_CID
+    /**
+     * @cid_lock: Protect cid bitmap updates vs lookups.
+     *
+     * Prevent situations where updates to the cid bitmap happen
+     * concurrently with lookups. Those can lead to situations
+     * where a lookup cannot find a free bit simply because it was
+     * unlucky enough to load, non-atomically, bitmap words as they
+     * were being concurrently updated by the updaters.
+     */
+    raw_spinlock_t cid_lock;
+#endif
 #ifdef CONFIG_MMU
     atomic_long_t pgtables_bytes; /* PTE page table pages */
 #endif
···
     vmi->mas.index = addr;
     vmi->mas.node = MAS_START;
 }
+
+#ifdef CONFIG_SCHED_MM_CID
+/* Accessor for struct mm_struct's cidmask. */
+static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
+{
+    unsigned long cid_bitmap = (unsigned long)mm;
+
+    cid_bitmap += offsetof(struct mm_struct, cpu_bitmap);
+    /* Skip cpu_bitmap */
+    cid_bitmap += cpumask_size();
+    return (struct cpumask *)cid_bitmap;
+}
+
+static inline void mm_init_cid(struct mm_struct *mm)
+{
+    raw_spin_lock_init(&mm->cid_lock);
+    cpumask_clear(mm_cidmask(mm));
+}
+
+static inline unsigned int mm_cid_size(void)
+{
+    return cpumask_size();
+}
+#else /* CONFIG_SCHED_MM_CID */
+static inline void mm_init_cid(struct mm_struct *mm) { }
+static inline unsigned int mm_cid_size(void)
+{
+    return 0;
+}
+#endif /* CONFIG_SCHED_MM_CID */

 struct mmu_gather;
 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
include/linux/sched.h (+5):

···
     unsigned long rseq_event_mask;
 #endif

+#ifdef CONFIG_SCHED_MM_CID
+    int mm_cid;        /* Current cid in mm */
+    int mm_cid_active; /* Whether cid bitmap is active */
+#endif
+
     struct tlbflush_unmap_batch tlb_ubc;

     union {
init/Kconfig (+4):

···

 endif #CGROUP_SCHED

+config SCHED_MM_CID
+    def_bool y
+    depends on SMP && RSEQ
+
 config UCLAMP_TASK_GROUP
     bool "Utilization clamping per group of tasks"
     depends on CGROUP_SCHED
kernel/fork.c (+7 -1):

···
     tsk->reported_split_lock = 0;
 #endif

+#ifdef CONFIG_SCHED_MM_CID
+    tsk->mm_cid = -1;
+    tsk->mm_cid_active = 0;
+#endif
     return tsk;

 free_stack:
···

     mm->user_ns = get_user_ns(user_ns);
     lru_gen_init_mm(mm);
+    mm_init_cid(mm);
     return mm;

 fail_pcpu:
···

     tsk->mm = mm;
     tsk->active_mm = mm;
+    sched_mm_cid_fork(tsk);
     return 0;
 }
···
      * dynamically sized based on the maximum CPU number this system
      * can have, taking hotplug into account (nr_cpu_ids).
      */
-    mm_size = sizeof(struct mm_struct) + cpumask_size();
+    mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();

     mm_cachep = kmem_cache_create_usercopy("mm_struct",
             mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
kernel/sched/core.c (+51):

···
     sched_info_switch(rq, prev, next);
     perf_event_task_sched_out(prev, next);
     rseq_preempt(prev);
+    switch_mm_cid(prev, next);
     fire_sched_out_preempt_notifiers(prev, next);
     kmap_local_sched_out();
     prepare_task(next);
···
 {
     trace_sched_update_nr_running_tp(rq, count);
 }
+
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+    struct mm_struct *mm = t->mm;
+    unsigned long flags;
+
+    if (!mm)
+        return;
+    local_irq_save(flags);
+    mm_cid_put(mm, t->mm_cid);
+    t->mm_cid = -1;
+    t->mm_cid_active = 0;
+    local_irq_restore(flags);
+}
+
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+    struct mm_struct *mm = t->mm;
+    unsigned long flags;
+
+    if (!mm)
+        return;
+    local_irq_save(flags);
+    mm_cid_put(mm, t->mm_cid);
+    t->mm_cid = -1;
+    t->mm_cid_active = 0;
+    local_irq_restore(flags);
+}
+
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+    struct mm_struct *mm = t->mm;
+    unsigned long flags;
+
+    WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm);
+
+    local_irq_save(flags);
+    t->mm_cid = mm_cid_get(mm);
+    t->mm_cid_active = 1;
+    local_irq_restore(flags);
+    rseq_set_notify_resume(t);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+    WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm || t->mm_cid != -1);
+    t->mm_cid_active = 1;
+}
+#endif
kernel/sched/sched.h (+58):

···
     cgroup_account_cputime(curr, delta_exec);
 }

+#ifdef CONFIG_SCHED_MM_CID
+static inline int __mm_cid_get(struct mm_struct *mm)
+{
+    struct cpumask *cpumask;
+    int cid;
+
+    cpumask = mm_cidmask(mm);
+    cid = cpumask_first_zero(cpumask);
+    if (cid >= nr_cpu_ids)
+        return -1;
+    __cpumask_set_cpu(cid, cpumask);
+    return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm, int cid)
+{
+    lockdep_assert_irqs_disabled();
+    if (cid < 0)
+        return;
+    raw_spin_lock(&mm->cid_lock);
+    __cpumask_clear_cpu(cid, mm_cidmask(mm));
+    raw_spin_unlock(&mm->cid_lock);
+}
+
+static inline int mm_cid_get(struct mm_struct *mm)
+{
+    int ret;
+
+    lockdep_assert_irqs_disabled();
+    raw_spin_lock(&mm->cid_lock);
+    ret = __mm_cid_get(mm);
+    raw_spin_unlock(&mm->cid_lock);
+    return ret;
+}
+
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
+{
+    if (prev->mm_cid_active) {
+        if (next->mm_cid_active && next->mm == prev->mm) {
+            /*
+             * Context switch between threads in same mm, hand over
+             * the mm_cid from prev to next.
+             */
+            next->mm_cid = prev->mm_cid;
+            prev->mm_cid = -1;
+            return;
+        }
+        mm_cid_put(prev->mm, prev->mm_cid);
+        prev->mm_cid = -1;
+    }
+    if (next->mm_cid_active)
+        next->mm_cid = mm_cid_get(next->mm);
+}
+
+#else
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+#endif
+
 #endif /* _KERNEL_SCHED_SCHED_H */
kernel/signal.c (+2):

···
     cgroup_threadgroup_change_begin(tsk);

     if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+        sched_mm_cid_exit_signals(tsk);
         tsk->flags |= PF_EXITING;
         cgroup_threadgroup_change_end(tsk);
         return;
···
      * From now this task is not visible for group-wide signals,
      * see wants_signal(), do_signal_stop().
      */
+    sched_mm_cid_exit_signals(tsk);
     tsk->flags |= PF_EXITING;

     cgroup_threadgroup_change_end(tsk);