Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched: Fix performance regression introduced by mm_cid

Introduce per-mm/cpu current concurrency id (mm_cid) to fix a PostgreSQL
sysbench regression reported by Aaron Lu.

Keep track of the currently allocated mm_cid for each mm/cpu rather than
freeing them immediately on context switch. This eliminates most atomic
operations when context switching back and forth between threads
belonging to different memory spaces in multi-threaded scenarios (many
processes, each with many threads). The per-mm/per-cpu mm_cid values are
serialized by their respective runqueue locks.

Thread migration is handled by introducing an invocation of
sched_mm_cid_migrate_to() (with the destination runqueue lock held) in
activate_task() for migrating tasks. If the destination cpu's mm_cid is
unset, and if the source runqueue is not actively using its mm_cid, then
the source cpu's mm_cid is moved to the destination cpu on migration.

Introduce a task-work executed periodically, similarly to NUMA work,
which delays reclaim of cid values when they are unused for a period of
time.

Keep track of the allocation time for each per-cpu cid, and let the task
work clear them when they are observed to be older than
SCHED_MM_CID_PERIOD_NS and unused. This task work also clears all
mm_cids which are greater than or equal to the Hamming weight of the mm
cidmask, to keep concurrency ids compact.

Because we want to ensure the mm_cid converges towards the smaller
values as migrations happen, the prior optimization that was done when
context switching between threads belonging to the same mm is removed,
because it could delay the lazy release of the destination runqueue
mm_cid after it has been replaced by a migration. Removing this prior
optimization is not an issue performance-wise because the introduced
per-mm/per-cpu mm_cid tracking also covers this more specific case.

Fixes: af7f588d8f73 ("sched: Introduce per-memory-map concurrency ID")
Reported-by: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Aaron Lu <aaron.lu@intel.com>
Link: https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/

Authored by Mathieu Desnoyers and committed by Peter Zijlstra
(commit 223baf9d, parent 5a4d3b38)

+804 -57
+74 -8
include/linux/mm_types.h
··· 550 550 struct vm_userfaultfd_ctx vm_userfaultfd_ctx; 551 551 } __randomize_layout; 552 552 553 + #ifdef CONFIG_SCHED_MM_CID 554 + struct mm_cid { 555 + u64 time; 556 + int cid; 557 + }; 558 + #endif 559 + 553 560 struct kioctx_table; 554 561 struct mm_struct { 555 562 struct { ··· 607 600 atomic_t mm_count; 608 601 #ifdef CONFIG_SCHED_MM_CID 609 602 /** 610 - * @cid_lock: Protect cid bitmap updates vs lookups. 603 + * @pcpu_cid: Per-cpu current cid. 611 604 * 612 - * Prevent situations where updates to the cid bitmap happen 613 - * concurrently with lookups. Those can lead to situations 614 - * where a lookup cannot find a free bit simply because it was 615 - * unlucky enough to load, non-atomically, bitmap words as they 616 - * were being concurrently updated by the updaters. 605 + * Keep track of the currently allocated mm_cid for each cpu. 606 + * The per-cpu mm_cid values are serialized by their respective 607 + * runqueue locks. 617 608 */ 618 - raw_spinlock_t cid_lock; 609 + struct mm_cid __percpu *pcpu_cid; 610 + /* 611 + * @mm_cid_next_scan: Next mm_cid scan (in jiffies). 612 + * 613 + * When the next mm_cid scan is due (in jiffies). 614 + */ 615 + unsigned long mm_cid_next_scan; 619 616 #endif 620 617 #ifdef CONFIG_MMU 621 618 atomic_long_t pgtables_bytes; /* size of all page tables */ ··· 884 873 } 885 874 886 875 #ifdef CONFIG_SCHED_MM_CID 876 + 877 + enum mm_cid_state { 878 + MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. 
*/ 879 + MM_CID_LAZY_PUT = (1U << 31), 880 + }; 881 + 882 + static inline bool mm_cid_is_unset(int cid) 883 + { 884 + return cid == MM_CID_UNSET; 885 + } 886 + 887 + static inline bool mm_cid_is_lazy_put(int cid) 888 + { 889 + return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); 890 + } 891 + 892 + static inline bool mm_cid_is_valid(int cid) 893 + { 894 + return !(cid & MM_CID_LAZY_PUT); 895 + } 896 + 897 + static inline int mm_cid_set_lazy_put(int cid) 898 + { 899 + return cid | MM_CID_LAZY_PUT; 900 + } 901 + 902 + static inline int mm_cid_clear_lazy_put(int cid) 903 + { 904 + return cid & ~MM_CID_LAZY_PUT; 905 + } 906 + 887 907 /* Accessor for struct mm_struct's cidmask. */ 888 908 static inline cpumask_t *mm_cidmask(struct mm_struct *mm) 889 909 { ··· 928 886 929 887 static inline void mm_init_cid(struct mm_struct *mm) 930 888 { 931 - raw_spin_lock_init(&mm->cid_lock); 889 + int i; 890 + 891 + for_each_possible_cpu(i) { 892 + struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); 893 + 894 + pcpu_cid->cid = MM_CID_UNSET; 895 + pcpu_cid->time = 0; 896 + } 932 897 cpumask_clear(mm_cidmask(mm)); 898 + } 899 + 900 + static inline int mm_alloc_cid(struct mm_struct *mm) 901 + { 902 + mm->pcpu_cid = alloc_percpu(struct mm_cid); 903 + if (!mm->pcpu_cid) 904 + return -ENOMEM; 905 + mm_init_cid(mm); 906 + return 0; 907 + } 908 + 909 + static inline void mm_destroy_cid(struct mm_struct *mm) 910 + { 911 + free_percpu(mm->pcpu_cid); 912 + mm->pcpu_cid = NULL; 933 913 } 934 914 935 915 static inline unsigned int mm_cid_size(void) ··· 960 896 } 961 897 #else /* CONFIG_SCHED_MM_CID */ 962 898 static inline void mm_init_cid(struct mm_struct *mm) { } 899 + static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; } 900 + static inline void mm_destroy_cid(struct mm_struct *mm) { } 963 901 static inline unsigned int mm_cid_size(void) 964 902 { 965 903 return 0;
+3
include/linux/sched.h
··· 1314 1314 1315 1315 #ifdef CONFIG_SCHED_MM_CID 1316 1316 int mm_cid; /* Current cid in mm */ 1317 + int last_mm_cid; /* Most recent cid in mm */ 1318 + int migrate_from_cpu; 1317 1319 int mm_cid_active; /* Whether cid bitmap is active */ 1320 + struct callback_head cid_work; 1318 1321 #endif 1319 1322 1320 1323 struct tlbflush_unmap_batch tlb_ubc;
+5
include/linux/sched/mm.h
··· 37 37 atomic_inc(&mm->mm_count); 38 38 } 39 39 40 + static inline void smp_mb__after_mmgrab(void) 41 + { 42 + smp_mb__after_atomic(); 43 + } 44 + 40 45 extern void __mmdrop(struct mm_struct *mm); 41 46 42 47 static inline void mmdrop(struct mm_struct *mm)
+8 -1
kernel/fork.c
··· 793 793 check_mm(mm); 794 794 put_user_ns(mm->user_ns); 795 795 mm_pasid_drop(mm); 796 + mm_destroy_cid(mm); 796 797 797 798 for (i = 0; i < NR_MM_COUNTERS; i++) 798 799 percpu_counter_destroy(&mm->rss_stat[i]); ··· 1058 1057 1059 1058 #ifdef CONFIG_SCHED_MM_CID 1060 1059 tsk->mm_cid = -1; 1060 + tsk->last_mm_cid = -1; 1061 1061 tsk->mm_cid_active = 0; 1062 + tsk->migrate_from_cpu = -1; 1062 1063 #endif 1063 1064 return tsk; 1064 1065 ··· 1165 1162 if (init_new_context(p, mm)) 1166 1163 goto fail_nocontext; 1167 1164 1165 + if (mm_alloc_cid(mm)) 1166 + goto fail_cid; 1167 + 1168 1168 for (i = 0; i < NR_MM_COUNTERS; i++) 1169 1169 if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) 1170 1170 goto fail_pcpu; 1171 1171 1172 1172 mm->user_ns = get_user_ns(user_ns); 1173 1173 lru_gen_init_mm(mm); 1174 - mm_init_cid(mm); 1175 1174 return mm; 1176 1175 1177 1176 fail_pcpu: 1178 1177 while (i > 0) 1179 1178 percpu_counter_destroy(&mm->rss_stat[--i]); 1179 + mm_destroy_cid(mm); 1180 + fail_cid: 1180 1181 fail_nocontext: 1181 1182 mm_free_pgd(mm); 1182 1183 fail_nopgd:
+506 -17
kernel/sched/core.c
··· 2101 2101 { 2102 2102 if (task_on_rq_migrating(p)) 2103 2103 flags |= ENQUEUE_MIGRATED; 2104 + if (flags & ENQUEUE_MIGRATED) 2105 + sched_mm_cid_migrate_to(rq, p); 2104 2106 2105 2107 enqueue_task(rq, p, flags); 2106 2108 ··· 3212 3210 p->sched_class->migrate_task_rq(p, new_cpu); 3213 3211 p->se.nr_migrations++; 3214 3212 rseq_migrate(p); 3213 + sched_mm_cid_migrate_from(p); 3215 3214 perf_event_task_migrate(p); 3216 3215 } 3217 3216 ··· 4486 4483 p->wake_entry.u_flags = CSD_TYPE_TTWU; 4487 4484 p->migration_pending = NULL; 4488 4485 #endif 4486 + init_sched_mm_cid(p); 4489 4487 } 4490 4488 4491 4489 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); ··· 5133 5129 sched_info_switch(rq, prev, next); 5134 5130 perf_event_task_sched_out(prev, next); 5135 5131 rseq_preempt(prev); 5136 - switch_mm_cid(prev, next); 5137 5132 fire_sched_out_preempt_notifiers(prev, next); 5138 5133 kmap_local_sched_out(); 5139 5134 prepare_task(next); ··· 5288 5285 * 5289 5286 * kernel -> user switch + mmdrop() active 5290 5287 * user -> user switch 5288 + * 5289 + * switch_mm_cid() needs to be updated if the barriers provided 5290 + * by context_switch() are modified. 5291 5291 */ 5292 5292 if (!next->mm) { // to kernel 5293 5293 enter_lazy_tlb(prev->active_mm, next); ··· 5319 5313 prev->active_mm = NULL; 5320 5314 } 5321 5315 } 5316 + 5317 + /* switch_mm_cid() requires the memory barriers above. */ 5318 + switch_mm_cid(rq, prev, next); 5322 5319 5323 5320 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 5324 5321 ··· 5611 5602 resched_latency = cpu_resched_latency(rq); 5612 5603 calc_global_load_tick(rq); 5613 5604 sched_core_tick(rq); 5605 + task_tick_mm_cid(rq, curr); 5614 5606 5615 5607 rq_unlock(rq, &rf); 5616 5608 ··· 11479 11469 } 11480 11470 11481 11471 #ifdef CONFIG_SCHED_MM_CID 11482 - void sched_mm_cid_exit_signals(struct task_struct *t) 11472 + 11473 + /** 11474 + * @cid_lock: Guarantee forward-progress of cid allocation. 
11475 + * 11476 + * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock 11477 + * is only used when contention is detected by the lock-free allocation so 11478 + * forward progress can be guaranteed. 11479 + */ 11480 + DEFINE_RAW_SPINLOCK(cid_lock); 11481 + 11482 + /** 11483 + * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. 11484 + * 11485 + * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is 11486 + * detected, it is set to 1 to ensure that all newly coming allocations are 11487 + * serialized by @cid_lock until the allocation which detected contention 11488 + * completes and sets @use_cid_lock back to 0. This guarantees forward progress 11489 + * of a cid allocation. 11490 + */ 11491 + int use_cid_lock; 11492 + 11493 + /* 11494 + * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid 11495 + * concurrently with respect to the execution of the source runqueue context 11496 + * switch. 11497 + * 11498 + * There is one basic properties we want to guarantee here: 11499 + * 11500 + * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively 11501 + * used by a task. That would lead to concurrent allocation of the cid and 11502 + * userspace corruption. 11503 + * 11504 + * Provide this guarantee by introducing a Dekker memory ordering to guarantee 11505 + * that a pair of loads observe at least one of a pair of stores, which can be 11506 + * shown as: 11507 + * 11508 + * X = Y = 0 11509 + * 11510 + * w[X]=1 w[Y]=1 11511 + * MB MB 11512 + * r[Y]=y r[X]=x 11513 + * 11514 + * Which guarantees that x==0 && y==0 is impossible. But rather than using 11515 + * values 0 and 1, this algorithm cares about specific state transitions of the 11516 + * runqueue current task (as updated by the scheduler context switch), and the 11517 + * per-mm/cpu cid value. 
11518 + * 11519 + * Let's introduce task (Y) which has task->mm == mm and task (N) which has 11520 + * task->mm != mm for the rest of the discussion. There are two scheduler state 11521 + * transitions on context switch we care about: 11522 + * 11523 + * (TSA) Store to rq->curr with transition from (N) to (Y) 11524 + * 11525 + * (TSB) Store to rq->curr with transition from (Y) to (N) 11526 + * 11527 + * On the remote-clear side, there is one transition we care about: 11528 + * 11529 + * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag 11530 + * 11531 + * There is also a transition to UNSET state which can be performed from all 11532 + * sides (scheduler, remote-clear). It is always performed with a cmpxchg which 11533 + * guarantees that only a single thread will succeed: 11534 + * 11535 + * (TMB) cmpxchg to *pcpu_cid to mark UNSET 11536 + * 11537 + * Just to be clear, what we do _not_ want to happen is a transition to UNSET 11538 + * when a thread is actively using the cid (property (1)). 11539 + * 11540 + * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. 11541 + * 11542 + * Scenario A) (TSA)+(TMA) (from next task perspective) 11543 + * 11544 + * CPU0 CPU1 11545 + * 11546 + * Context switch CS-1 Remote-clear 11547 + * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) 11548 + * (implied barrier after cmpxchg) 11549 + * - switch_mm_cid() 11550 + * - memory barrier (see switch_mm_cid() 11551 + * comment explaining how this barrier 11552 + * is combined with other scheduler 11553 + * barriers) 11554 + * - mm_cid_get (next) 11555 + * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) 11556 + * 11557 + * This Dekker ensures that either task (Y) is observed by the 11558 + * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are 11559 + * observed. 11560 + * 11561 + * If task (Y) store is observed by rcu_dereference(), it means that there is 11562 + * still an active task on the cpu. 
Remote-clear will therefore not transition 11563 + * to UNSET, which fulfills property (1). 11564 + * 11565 + * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), 11566 + * it will move its state to UNSET, which clears the percpu cid perhaps 11567 + * uselessly (which is not an issue for correctness). Because task (Y) is not 11568 + * observed, CPU1 can move ahead to set the state to UNSET. Because moving 11569 + * state to UNSET is done with a cmpxchg expecting that the old state has the 11570 + * LAZY flag set, only one thread will successfully UNSET. 11571 + * 11572 + * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 11573 + * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and 11574 + * CPU1 will observe task (Y) and do nothing more, which is fine. 11575 + * 11576 + * What we are effectively preventing with this Dekker is a scenario where 11577 + * neither LAZY flag nor store (Y) are observed, which would fail property (1) 11578 + * because this would UNSET a cid which is actively used. 11579 + */ 11580 + 11581 + void sched_mm_cid_migrate_from(struct task_struct *t) 11582 + { 11583 + t->migrate_from_cpu = task_cpu(t); 11584 + } 11585 + 11586 + static 11587 + int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, 11588 + struct task_struct *t, 11589 + struct mm_cid *src_pcpu_cid) 11483 11590 { 11484 11591 struct mm_struct *mm = t->mm; 11485 - unsigned long flags; 11592 + struct task_struct *src_task; 11593 + int src_cid, last_mm_cid; 11594 + 11595 + if (!mm) 11596 + return -1; 11597 + 11598 + last_mm_cid = t->last_mm_cid; 11599 + /* 11600 + * If the migrated task has no last cid, or if the current 11601 + * task on src rq uses the cid, it means the source cid does not need 11602 + * to be moved to the destination cpu. 
11603 + */ 11604 + if (last_mm_cid == -1) 11605 + return -1; 11606 + src_cid = READ_ONCE(src_pcpu_cid->cid); 11607 + if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) 11608 + return -1; 11609 + 11610 + /* 11611 + * If we observe an active task using the mm on this rq, it means we 11612 + * are not the last task to be migrated from this cpu for this mm, so 11613 + * there is no need to move src_cid to the destination cpu. 11614 + */ 11615 + rcu_read_lock(); 11616 + src_task = rcu_dereference(src_rq->curr); 11617 + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 11618 + rcu_read_unlock(); 11619 + t->last_mm_cid = -1; 11620 + return -1; 11621 + } 11622 + rcu_read_unlock(); 11623 + 11624 + return src_cid; 11625 + } 11626 + 11627 + static 11628 + int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, 11629 + struct task_struct *t, 11630 + struct mm_cid *src_pcpu_cid, 11631 + int src_cid) 11632 + { 11633 + struct task_struct *src_task; 11634 + struct mm_struct *mm = t->mm; 11635 + int lazy_cid; 11636 + 11637 + if (src_cid == -1) 11638 + return -1; 11639 + 11640 + /* 11641 + * Attempt to clear the source cpu cid to move it to the destination 11642 + * cpu. 11643 + */ 11644 + lazy_cid = mm_cid_set_lazy_put(src_cid); 11645 + if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) 11646 + return -1; 11647 + 11648 + /* 11649 + * The implicit barrier after cmpxchg per-mm/cpu cid before loading 11650 + * rq->curr->mm matches the scheduler barrier in context_switch() 11651 + * between store to rq->curr and load of prev and next task's 11652 + * per-mm/cpu cid. 11653 + * 11654 + * The implicit barrier after cmpxchg per-mm/cpu cid before loading 11655 + * rq->curr->mm_cid_active matches the barrier in 11656 + * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 11657 + * sched_mm_cid_after_execve() between store to t->mm_cid_active and 11658 + * load of per-mm/cpu cid. 
11659 + */ 11660 + 11661 + /* 11662 + * If we observe an active task using the mm on this rq after setting 11663 + * the lazy-put flag, this task will be responsible for transitioning 11664 + * from lazy-put flag set to MM_CID_UNSET. 11665 + */ 11666 + rcu_read_lock(); 11667 + src_task = rcu_dereference(src_rq->curr); 11668 + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 11669 + rcu_read_unlock(); 11670 + /* 11671 + * We observed an active task for this mm, there is therefore 11672 + * no point in moving this cid to the destination cpu. 11673 + */ 11674 + t->last_mm_cid = -1; 11675 + return -1; 11676 + } 11677 + rcu_read_unlock(); 11678 + 11679 + /* 11680 + * The src_cid is unused, so it can be unset. 11681 + */ 11682 + if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 11683 + return -1; 11684 + return src_cid; 11685 + } 11686 + 11687 + /* 11688 + * Migration to dst cpu. Called with dst_rq lock held. 11689 + * Interrupts are disabled, which keeps the window of cid ownership without the 11690 + * source rq lock held small. 11691 + */ 11692 + void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) 11693 + { 11694 + struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; 11695 + struct mm_struct *mm = t->mm; 11696 + int src_cid, dst_cid, src_cpu; 11697 + struct rq *src_rq; 11698 + 11699 + lockdep_assert_rq_held(dst_rq); 11486 11700 11487 11701 if (!mm) 11488 11702 return; 11703 + src_cpu = t->migrate_from_cpu; 11704 + if (src_cpu == -1) { 11705 + t->last_mm_cid = -1; 11706 + return; 11707 + } 11708 + /* 11709 + * Move the src cid if the dst cid is unset. This keeps id 11710 + * allocation closest to 0 in cases where few threads migrate around 11711 + * many cpus. 11712 + * 11713 + * If destination cid is already set, we may have to just clear 11714 + * the src cid to ensure compactness in frequent migrations 11715 + * scenarios. 
11716 + * 11717 + * It is not useful to clear the src cid when the number of threads is 11718 + * greater or equal to the number of allowed cpus, because user-space 11719 + * can expect that the number of allowed cids can reach the number of 11720 + * allowed cpus. 11721 + */ 11722 + dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); 11723 + dst_cid = READ_ONCE(dst_pcpu_cid->cid); 11724 + if (!mm_cid_is_unset(dst_cid) && 11725 + atomic_read(&mm->mm_users) >= t->nr_cpus_allowed) 11726 + return; 11727 + src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); 11728 + src_rq = cpu_rq(src_cpu); 11729 + src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); 11730 + if (src_cid == -1) 11731 + return; 11732 + src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, 11733 + src_cid); 11734 + if (src_cid == -1) 11735 + return; 11736 + if (!mm_cid_is_unset(dst_cid)) { 11737 + __mm_cid_put(mm, src_cid); 11738 + return; 11739 + } 11740 + /* Move src_cid to dst cpu. */ 11741 + mm_cid_snapshot_time(dst_rq, mm); 11742 + WRITE_ONCE(dst_pcpu_cid->cid, src_cid); 11743 + } 11744 + 11745 + static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, 11746 + int cpu) 11747 + { 11748 + struct rq *rq = cpu_rq(cpu); 11749 + struct task_struct *t; 11750 + unsigned long flags; 11751 + int cid, lazy_cid; 11752 + 11753 + cid = READ_ONCE(pcpu_cid->cid); 11754 + if (!mm_cid_is_valid(cid)) 11755 + return; 11756 + 11757 + /* 11758 + * Clear the cpu cid if it is set to keep cid allocation compact. If 11759 + * there happens to be other tasks left on the source cpu using this 11760 + * mm, the next task using this mm will reallocate its cid on context 11761 + * switch. 
11762 + */ 11763 + lazy_cid = mm_cid_set_lazy_put(cid); 11764 + if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) 11765 + return; 11766 + 11767 + /* 11768 + * The implicit barrier after cmpxchg per-mm/cpu cid before loading 11769 + * rq->curr->mm matches the scheduler barrier in context_switch() 11770 + * between store to rq->curr and load of prev and next task's 11771 + * per-mm/cpu cid. 11772 + * 11773 + * The implicit barrier after cmpxchg per-mm/cpu cid before loading 11774 + * rq->curr->mm_cid_active matches the barrier in 11775 + * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 11776 + * sched_mm_cid_after_execve() between store to t->mm_cid_active and 11777 + * load of per-mm/cpu cid. 11778 + */ 11779 + 11780 + /* 11781 + * If we observe an active task using the mm on this rq after setting 11782 + * the lazy-put flag, that task will be responsible for transitioning 11783 + * from lazy-put flag set to MM_CID_UNSET. 11784 + */ 11785 + rcu_read_lock(); 11786 + t = rcu_dereference(rq->curr); 11787 + if (READ_ONCE(t->mm_cid_active) && t->mm == mm) { 11788 + rcu_read_unlock(); 11789 + return; 11790 + } 11791 + rcu_read_unlock(); 11792 + 11793 + /* 11794 + * The cid is unused, so it can be unset. 11795 + * Disable interrupts to keep the window of cid ownership without rq 11796 + * lock small. 11797 + */ 11489 11798 local_irq_save(flags); 11490 - mm_cid_put(mm, t->mm_cid); 11491 - t->mm_cid = -1; 11492 - t->mm_cid_active = 0; 11799 + if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 11800 + __mm_cid_put(mm, cid); 11493 11801 local_irq_restore(flags); 11802 + } 11803 + 11804 + static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) 11805 + { 11806 + struct rq *rq = cpu_rq(cpu); 11807 + struct mm_cid *pcpu_cid; 11808 + struct task_struct *curr; 11809 + u64 rq_clock; 11810 + 11811 + /* 11812 + * rq->clock load is racy on 32-bit but one spurious clear once in a 11813 + * while is irrelevant. 
11814 + */ 11815 + rq_clock = READ_ONCE(rq->clock); 11816 + pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 11817 + 11818 + /* 11819 + * In order to take care of infrequently scheduled tasks, bump the time 11820 + * snapshot associated with this cid if an active task using the mm is 11821 + * observed on this rq. 11822 + */ 11823 + rcu_read_lock(); 11824 + curr = rcu_dereference(rq->curr); 11825 + if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { 11826 + WRITE_ONCE(pcpu_cid->time, rq_clock); 11827 + rcu_read_unlock(); 11828 + return; 11829 + } 11830 + rcu_read_unlock(); 11831 + 11832 + if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) 11833 + return; 11834 + sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 11835 + } 11836 + 11837 + static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, 11838 + int weight) 11839 + { 11840 + struct mm_cid *pcpu_cid; 11841 + int cid; 11842 + 11843 + pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 11844 + cid = READ_ONCE(pcpu_cid->cid); 11845 + if (!mm_cid_is_valid(cid) || cid < weight) 11846 + return; 11847 + sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 11848 + } 11849 + 11850 + static void task_mm_cid_work(struct callback_head *work) 11851 + { 11852 + unsigned long now = jiffies, old_scan, next_scan; 11853 + struct task_struct *t = current; 11854 + struct cpumask *cidmask; 11855 + struct mm_struct *mm; 11856 + int weight, cpu; 11857 + 11858 + SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); 11859 + 11860 + work->next = work; /* Prevent double-add */ 11861 + if (t->flags & PF_EXITING) 11862 + return; 11863 + mm = t->mm; 11864 + if (!mm) 11865 + return; 11866 + old_scan = READ_ONCE(mm->mm_cid_next_scan); 11867 + next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); 11868 + if (!old_scan) { 11869 + unsigned long res; 11870 + 11871 + res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); 11872 + if (res != old_scan) 11873 + old_scan = res; 11874 + else 11875 + old_scan = next_scan; 11876 + } 
11877 + if (time_before(now, old_scan)) 11878 + return; 11879 + if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) 11880 + return; 11881 + cidmask = mm_cidmask(mm); 11882 + /* Clear cids that were not recently used. */ 11883 + for_each_possible_cpu(cpu) 11884 + sched_mm_cid_remote_clear_old(mm, cpu); 11885 + weight = cpumask_weight(cidmask); 11886 + /* 11887 + * Clear cids that are greater or equal to the cidmask weight to 11888 + * recompact it. 11889 + */ 11890 + for_each_possible_cpu(cpu) 11891 + sched_mm_cid_remote_clear_weight(mm, cpu, weight); 11892 + } 11893 + 11894 + void init_sched_mm_cid(struct task_struct *t) 11895 + { 11896 + struct mm_struct *mm = t->mm; 11897 + int mm_users = 0; 11898 + 11899 + if (mm) { 11900 + mm_users = atomic_read(&mm->mm_users); 11901 + if (mm_users == 1) 11902 + mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); 11903 + } 11904 + t->cid_work.next = &t->cid_work; /* Protect against double add */ 11905 + init_task_work(&t->cid_work, task_mm_cid_work); 11906 + } 11907 + 11908 + void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) 11909 + { 11910 + struct callback_head *work = &curr->cid_work; 11911 + unsigned long now = jiffies; 11912 + 11913 + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || 11914 + work->next != work) 11915 + return; 11916 + if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) 11917 + return; 11918 + task_work_add(curr, work, TWA_RESUME); 11919 + } 11920 + 11921 + void sched_mm_cid_exit_signals(struct task_struct *t) 11922 + { 11923 + struct mm_struct *mm = t->mm; 11924 + struct rq_flags rf; 11925 + struct rq *rq; 11926 + 11927 + if (!mm) 11928 + return; 11929 + 11930 + preempt_disable(); 11931 + rq = this_rq(); 11932 + rq_lock_irqsave(rq, &rf); 11933 + preempt_enable_no_resched(); /* holding spinlock */ 11934 + WRITE_ONCE(t->mm_cid_active, 0); 11935 + /* 11936 + * Store t->mm_cid_active before loading per-mm/cpu cid. 
11937 + * Matches barrier in sched_mm_cid_remote_clear_old(). 11938 + */ 11939 + smp_mb(); 11940 + mm_cid_put(mm); 11941 + t->last_mm_cid = t->mm_cid = -1; 11942 + rq_unlock_irqrestore(rq, &rf); 11494 11943 } 11495 11944 11496 11945 void sched_mm_cid_before_execve(struct task_struct *t) 11497 11946 { 11498 11947 struct mm_struct *mm = t->mm; 11499 - unsigned long flags; 11948 + struct rq_flags rf; 11949 + struct rq *rq; 11500 11950 11501 11951 if (!mm) 11502 11952 return; 11503 - local_irq_save(flags); 11504 - mm_cid_put(mm, t->mm_cid); 11505 - t->mm_cid = -1; 11506 - t->mm_cid_active = 0; 11507 - local_irq_restore(flags); 11953 + 11954 + preempt_disable(); 11955 + rq = this_rq(); 11956 + rq_lock_irqsave(rq, &rf); 11957 + preempt_enable_no_resched(); /* holding spinlock */ 11958 + WRITE_ONCE(t->mm_cid_active, 0); 11959 + /* 11960 + * Store t->mm_cid_active before loading per-mm/cpu cid. 11961 + * Matches barrier in sched_mm_cid_remote_clear_old(). 11962 + */ 11963 + smp_mb(); 11964 + mm_cid_put(mm); 11965 + t->last_mm_cid = t->mm_cid = -1; 11966 + rq_unlock_irqrestore(rq, &rf); 11508 11967 } 11509 11968 11510 11969 void sched_mm_cid_after_execve(struct task_struct *t) 11511 11970 { 11512 11971 struct mm_struct *mm = t->mm; 11513 - unsigned long flags; 11972 + struct rq_flags rf; 11973 + struct rq *rq; 11514 11974 11515 11975 if (!mm) 11516 11976 return; 11517 - local_irq_save(flags); 11518 - t->mm_cid = mm_cid_get(mm); 11519 - t->mm_cid_active = 1; 11520 - local_irq_restore(flags); 11977 + 11978 + preempt_disable(); 11979 + rq = this_rq(); 11980 + rq_lock_irqsave(rq, &rf); 11981 + preempt_enable_no_resched(); /* holding spinlock */ 11982 + WRITE_ONCE(t->mm_cid_active, 1); 11983 + /* 11984 + * Store t->mm_cid_active before loading per-mm/cpu cid. 11985 + * Matches barrier in sched_mm_cid_remote_clear_old(). 
11986 + */ 11987 + smp_mb(); 11988 + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); 11989 + rq_unlock_irqrestore(rq, &rf); 11521 11990 rseq_set_notify_resume(t); 11522 11991 } 11523 11992
+208 -31
kernel/sched/sched.h
··· 3253 3253 } 3254 3254 3255 3255 #ifdef CONFIG_SCHED_MM_CID 3256 - static inline int __mm_cid_get(struct mm_struct *mm) 3256 + 3257 + #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ 3258 + #define MM_CID_SCAN_DELAY 100 /* 100ms */ 3259 + 3260 + extern raw_spinlock_t cid_lock; 3261 + extern int use_cid_lock; 3262 + 3263 + extern void sched_mm_cid_migrate_from(struct task_struct *t); 3264 + extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); 3265 + extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); 3266 + extern void init_sched_mm_cid(struct task_struct *t); 3267 + 3268 + static inline void __mm_cid_put(struct mm_struct *mm, int cid) 3269 + { 3270 + if (cid < 0) 3271 + return; 3272 + cpumask_clear_cpu(cid, mm_cidmask(mm)); 3273 + } 3274 + 3275 + /* 3276 + * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to 3277 + * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to 3278 + * be held to transition to other states. 3279 + * 3280 + * State transitions synchronized with cmpxchg or try_cmpxchg need to be 3281 + * consistent across cpus, which prevents use of this_cpu_cmpxchg. 
 */
static inline void mm_cid_put_lazy(struct task_struct *t)
{
	struct mm_struct *mm = t->mm;
	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
	int cid;

	lockdep_assert_irqs_disabled();
	cid = __this_cpu_read(pcpu_cid->cid);
	/*
	 * Only reclaim if this cpu's cid is in lazy-put state; the cmpxchg
	 * may lose against a concurrent remote-clear, in which case the
	 * remote side owns the reclaim and we must not double-put.
	 */
	if (!mm_cid_is_lazy_put(cid) ||
	    !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
		return;
	__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}

/*
 * Atomically transition this cpu's cid for @mm from valid or lazy-put
 * state to unset. Returns the prior cid value, or MM_CID_UNSET if it
 * was already unset. Loops because a concurrent remote-clear can change
 * the observed value between the read and the cmpxchg.
 */
static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
{
	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
	int cid, res;

	lockdep_assert_irqs_disabled();
	cid = __this_cpu_read(pcpu_cid->cid);
	for (;;) {
		if (mm_cid_is_unset(cid))
			return MM_CID_UNSET;
		/*
		 * Attempt transition from valid or lazy-put to unset.
		 */
		res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
		if (res == cid)
			break;
		cid = res;
	}
	return cid;
}

/*
 * Unconditionally release this cpu's cid for @mm back to the mm's
 * cidmask (via __mm_cid_put), unless it is already unset. Works for
 * both valid and lazy-put states: the lazy-put flag is cleared before
 * the put.
 */
static inline void mm_cid_put(struct mm_struct *mm)
{
	int cid;

	lockdep_assert_irqs_disabled();
	cid = mm_cid_pcpu_unset(mm);
	if (cid == MM_CID_UNSET)
		return;
	__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}

/*
 * Try to claim the lowest free cid in the mm's cidmask. Returns the
 * claimed cid, or -1 if a concurrent allocator won the race for the
 * chosen bit (caller is expected to retry, see __mm_cid_get()).
 */
static inline int __mm_cid_try_get(struct mm_struct *mm)
{
	struct cpumask *cpumask;
	int cid;

	cpumask = mm_cidmask(mm);
	/*
	 * Retry finding first zero bit if the mask is temporarily
	 * filled. This only happens during concurrent remote-clear
	 * which owns a cid without holding a rq lock.
	 */
	for (;;) {
		cid = cpumask_first_zero(cpumask);
		if (cid < nr_cpu_ids)
			break;
		cpu_relax();
	}
	/* test_and_set is the atomic claim; losing it means someone raced us. */
	if (cpumask_test_and_set_cpu(cid, cpumask))
		return -1;
	return cid;
}

/*
 * Save a snapshot of the current runqueue time of this cpu
 * with the per-cpu cid value, allowing to estimate how recently it was used.
 */
static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
{
	struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));

	lockdep_assert_rq_held(rq);
	WRITE_ONCE(pcpu_cid->time, rq->clock);
}

/*
 * Slow-path cid allocation. Lock-free in the common case; falls back to
 * cid_lock (flagged through use_cid_lock) only when allocations keep
 * racing, to guarantee forward progress. Snapshots the rq clock for the
 * allocated cid before returning.
 */
static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
{
	int cid;

	/*
	 * All allocations (even those using the cid_lock) are lock-free. If
	 * use_cid_lock is set, hold the cid_lock to perform cid allocation to
	 * guarantee forward progress.
	 */
	if (!READ_ONCE(use_cid_lock)) {
		cid = __mm_cid_try_get(mm);
		if (cid >= 0)
			goto end;
		raw_spin_lock(&cid_lock);
	} else {
		raw_spin_lock(&cid_lock);
		cid = __mm_cid_try_get(mm);
		if (cid >= 0)
			goto unlock;
	}

	/*
	 * cid concurrently allocated. Retry while forcing following
	 * allocations to use the cid_lock to ensure forward progress.
	 */
	WRITE_ONCE(use_cid_lock, 1);
	/*
	 * Set use_cid_lock before allocation. Only care about program order
	 * because this is only required for forward progress.
	 */
	barrier();
	/*
	 * Retry until it succeeds. It is guaranteed to eventually succeed once
	 * all newcoming allocations observe the use_cid_lock flag set.
	 */
	do {
		cid = __mm_cid_try_get(mm);
		cpu_relax();
	} while (cid < 0);
	/*
	 * Allocate before clearing use_cid_lock. Only care about
	 * program order because this is for forward progress.
	 */
	barrier();
	WRITE_ONCE(use_cid_lock, 0);
unlock:
	raw_spin_unlock(&cid_lock);
end:
	mm_cid_snapshot_time(rq, mm);
	return cid;
}

/*
 * Get a cid for @mm on this cpu. Fast path: reuse the cached per-cpu
 * cid when it is still valid (just refresh its timestamp). If it was
 * marked lazy-put by a remote reclaimer, finish the put before
 * allocating a fresh cid through __mm_cid_get().
 */
static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
{
	struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
	struct cpumask *cpumask;
	int cid;

	lockdep_assert_rq_held(rq);
	cpumask = mm_cidmask(mm);
	cid = __this_cpu_read(pcpu_cid->cid);
	if (mm_cid_is_valid(cid)) {
		mm_cid_snapshot_time(rq, mm);
		return cid;
	}
	if (mm_cid_is_lazy_put(cid)) {
		/* Losing this cmpxchg means the remote-clear reclaimed it for us. */
		if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
			__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
	}
	cid = __mm_cid_get(rq, mm);
	__this_cpu_write(pcpu_cid->cid, cid);
	return cid;
}

/*
 * Context-switch hook: hand the prev task's cid to lazy reclaim and
 * install a cid for next. Also responsible for the memory barrier
 * pairing described below, which depends on the kind of mm transition.
 */
static inline void switch_mm_cid(struct rq *rq,
				 struct task_struct *prev,
				 struct task_struct *next)
{
	/*
	 * Provide a memory barrier between rq->curr store and load of
	 * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
	 *
	 * Should be adapted if context_switch() is modified.
	 */
	if (!next->mm) {				// to kernel
		/*
		 * user -> kernel transition does not guarantee a barrier, but
		 * we can use the fact that it performs an atomic operation in
		 * mmgrab().
		 */
		if (prev->mm)				// from user
			smp_mb__after_mmgrab();
		/*
		 * kernel -> kernel transition does not change rq->curr->mm
		 * state. It stays NULL.
		 */
	} else {					// to user
		/*
		 * kernel -> user transition does not provide a barrier
		 * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
		 * Provide it here.
		 */
		if (!prev->mm)				// from kernel
			smp_mb();
		/*
		 * user -> user transition guarantees a memory barrier through
		 * switch_mm() when current->mm changes. If current->mm is
		 * unchanged, no barrier is needed.
		 */
	}
	if (prev->mm_cid_active) {
		mm_cid_snapshot_time(rq, prev->mm);
		mm_cid_put_lazy(prev);
		prev->mm_cid = -1;
	}
	if (next->mm_cid_active)
		next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
}

#else
/* !CONFIG_SCHED_MM_CID: all mm_cid hooks compile away to no-ops. */
static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif

#endif /* _KERNEL_SCHED_SCHED_H */