Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched_ext: Don't use double locking to migrate tasks across CPUs

consume_remote_task() and dispatch_to_local_dsq() use
move_task_to_local_dsq() to migrate the task to the target CPU. Currently,
move_task_to_local_dsq() expects the caller to lock both the source and
destination rq's. While this may save a few lock operations while the rq's
are not contended, under contention, the double locking can exacerbate the
situation significantly (refer to the linked message below).

Update the migration path so that double locking is not used.
move_task_to_local_dsq() now expects the caller to be locking the source rq,
drops it and then acquires the destination rq lock. Code is simpler this way
and, on a 2-way NUMA machine w/ Xeon Gold 6138, `hackbench 100 thread 5000`
shows ~3% improvement with scx_simple.

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20240806082716.GP37996@noisy.programming.kicks-ass.net
Acked-by: David Vernet <void@manifault.com>

+46 -88
+46 -88
kernel/sched/ext.c
··· 2097 2097 #ifdef CONFIG_SMP 2098 2098 /** 2099 2099 * move_task_to_local_dsq - Move a task from a different rq to a local DSQ 2100 - * @rq: rq to move the task into, currently locked 2101 2100 * @p: task to move 2102 2101 * @enq_flags: %SCX_ENQ_* 2102 + * @src_rq: rq to move the task from, locked on entry, released on return 2103 + * @dst_rq: rq to move the task into, locked on return 2103 2104 * 2104 - * Move @p which is currently on a different rq to @rq's local DSQ. The caller 2105 + * Move @p which is currently on @src_rq to @dst_rq's local DSQ. The caller 2105 2106 * must: 2106 2107 * 2107 2108 * 1. Start with exclusive access to @p either through its DSQ lock or ··· 2110 2109 * 2111 2110 * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). 2112 2111 * 2113 - * 3. Remember task_rq(@p). Release the exclusive access so that we don't 2114 - * deadlock with dequeue. 2112 + * 3. Remember task_rq(@p) as @src_rq. Release the exclusive access so that we 2113 + * don't deadlock with dequeue. 2115 2114 * 2116 - * 4. Lock @rq and the task_rq from #3. 2115 + * 4. Lock @src_rq from #3. 2117 2116 * 2118 2117 * 5. Call this function. 2119 2118 * 2120 2119 * Returns %true if @p was successfully moved. %false after racing dequeue and 2121 - * losing. 2120 + * losing. On return, @src_rq is unlocked and @dst_rq is locked. 2122 2121 */ 2123 - static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, 2124 - u64 enq_flags) 2122 + static bool move_task_to_local_dsq(struct task_struct *p, u64 enq_flags, 2123 + struct rq *src_rq, struct rq *dst_rq) 2125 2124 { 2126 - struct rq *task_rq; 2127 - 2128 - lockdep_assert_rq_held(rq); 2125 + lockdep_assert_rq_held(src_rq); 2129 2126 2130 2127 /* 2131 - * If dequeue got to @p while we were trying to lock both rq's, it'd 2132 - * have cleared @p->scx.holding_cpu to -1. 
While other cpus may have 2133 - * updated it to different values afterwards, as this operation can't be 2128 + * If dequeue got to @p while we were trying to lock @src_rq, it'd have 2129 + * cleared @p->scx.holding_cpu to -1. While other cpus may have updated 2130 + * it to different values afterwards, as this operation can't be 2134 2131 * preempted or recurse, @p->scx.holding_cpu can never become 2135 2132 * raw_smp_processor_id() again before we're done. Thus, we can tell 2136 2133 * whether we lost to dequeue by testing whether @p->scx.holding_cpu is 2137 2134 * still raw_smp_processor_id(). 2138 2135 * 2136 + * @p->rq couldn't have changed if we're still the holding cpu. 2137 + * 2139 2138 * See dispatch_dequeue() for the counterpart. 2140 2139 */ 2141 - if (unlikely(p->scx.holding_cpu != raw_smp_processor_id())) 2140 + if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()) || 2141 + WARN_ON_ONCE(src_rq != task_rq(p))) { 2142 + raw_spin_rq_unlock(src_rq); 2143 + raw_spin_rq_lock(dst_rq); 2142 2144 return false; 2145 + } 2143 2146 2144 - /* @p->rq couldn't have changed if we're still the holding cpu */ 2145 - task_rq = task_rq(p); 2146 - lockdep_assert_rq_held(task_rq); 2147 + /* the following marks @p MIGRATING which excludes dequeue */ 2148 + deactivate_task(src_rq, p, 0); 2149 + set_task_cpu(p, cpu_of(dst_rq)); 2150 + p->scx.sticky_cpu = cpu_of(dst_rq); 2147 2151 2148 - WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)); 2149 - deactivate_task(task_rq, p, 0); 2150 - set_task_cpu(p, cpu_of(rq)); 2151 - p->scx.sticky_cpu = cpu_of(rq); 2152 + raw_spin_rq_unlock(src_rq); 2153 + raw_spin_rq_lock(dst_rq); 2152 2154 2153 2155 /* 2154 2156 * We want to pass scx-specific enq_flags but activate_task() will 2155 2157 * truncate the upper 32 bit. As we own @rq, we can pass them through 2156 2158 * @rq->scx.extra_enq_flags instead. 
2157 2159 */ 2158 - WARN_ON_ONCE(rq->scx.extra_enq_flags); 2159 - rq->scx.extra_enq_flags = enq_flags; 2160 - activate_task(rq, p, 0); 2161 - rq->scx.extra_enq_flags = 0; 2160 + WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); 2161 + WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); 2162 + dst_rq->scx.extra_enq_flags = enq_flags; 2163 + activate_task(dst_rq, p, 0); 2164 + dst_rq->scx.extra_enq_flags = 0; 2162 2165 2163 2166 return true; 2164 2167 } 2165 2168 2166 - /** 2167 - * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked 2168 - * @rq: current rq which is locked 2169 - * @src_rq: rq to move task from 2170 - * @dst_rq: rq to move task to 2171 - * 2172 - * We're holding @rq lock and trying to dispatch a task from @src_rq to 2173 - * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether 2174 - * @rq stays locked isn't important as long as the state is restored after 2175 - * dispatch_to_local_dsq_unlock(). 2176 - */ 2177 - static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq *src_rq, 2178 - struct rq *dst_rq) 2179 - { 2180 - if (src_rq == dst_rq) { 2181 - raw_spin_rq_unlock(rq); 2182 - raw_spin_rq_lock(dst_rq); 2183 - } else if (rq == src_rq) { 2184 - double_lock_balance(rq, dst_rq); 2185 - } else if (rq == dst_rq) { 2186 - double_lock_balance(rq, src_rq); 2187 - } else { 2188 - raw_spin_rq_unlock(rq); 2189 - double_rq_lock(src_rq, dst_rq); 2190 - } 2191 - } 2192 - 2193 - /** 2194 - * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() 2195 - * @rq: current rq which is locked 2196 - * @src_rq: rq to move task from 2197 - * @dst_rq: rq to move task to 2198 - * 2199 - * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. 
2200 - */ 2201 - static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq *src_rq, 2202 - struct rq *dst_rq) 2203 - { 2204 - if (src_rq == dst_rq) { 2205 - raw_spin_rq_unlock(dst_rq); 2206 - raw_spin_rq_lock(rq); 2207 - } else if (rq == src_rq) { 2208 - double_unlock_balance(rq, dst_rq); 2209 - } else if (rq == dst_rq) { 2210 - double_unlock_balance(rq, src_rq); 2211 - } else { 2212 - double_rq_unlock(src_rq, dst_rq); 2213 - raw_spin_rq_lock(rq); 2214 - } 2215 - } 2216 2169 #endif /* CONFIG_SMP */ 2217 2170 2218 2171 static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, ··· 2218 2263 static bool consume_remote_task(struct rq *rq, struct scx_dispatch_q *dsq, 2219 2264 struct task_struct *p, struct rq *task_rq) 2220 2265 { 2221 - bool moved = false; 2222 - 2223 2266 lockdep_assert_held(&dsq->lock); /* released on return */ 2224 2267 2225 2268 /* ··· 2233 2280 p->scx.holding_cpu = raw_smp_processor_id(); 2234 2281 raw_spin_unlock(&dsq->lock); 2235 2282 2236 - double_lock_balance(rq, task_rq); 2283 + raw_spin_rq_unlock(rq); 2284 + raw_spin_rq_lock(task_rq); 2237 2285 2238 - moved = move_task_to_local_dsq(rq, p, 0); 2239 - 2240 - double_unlock_balance(rq, task_rq); 2241 - 2242 - return moved; 2286 + return move_task_to_local_dsq(p, 0, task_rq, rq); 2243 2287 } 2244 2288 #else /* CONFIG_SMP */ 2245 2289 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; } ··· 2330 2380 2331 2381 #ifdef CONFIG_SMP 2332 2382 if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { 2333 - struct rq *locked_dst_rq = dst_rq; 2334 2383 bool dsp; 2335 2384 2336 2385 /* ··· 2348 2399 /* store_release ensures that dequeue sees the above */ 2349 2400 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); 2350 2401 2351 - dispatch_to_local_dsq_lock(rq, src_rq, locked_dst_rq); 2402 + /* switch to @src_rq lock */ 2403 + if (rq != src_rq) { 2404 + raw_spin_rq_unlock(rq); 2405 + raw_spin_rq_lock(src_rq); 2406 + } 2352 2407 2353 2408 
/* 2354 2409 * We don't require the BPF scheduler to avoid dispatching to ··· 2379 2426 enq_flags); 2380 2427 } 2381 2428 } else { 2382 - dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); 2429 + dsp = move_task_to_local_dsq(p, enq_flags, 2430 + src_rq, dst_rq); 2383 2431 } 2384 2432 2385 2433 /* if the destination CPU is idle, wake it up */ ··· 2388 2434 dst_rq->curr->sched_class)) 2389 2435 resched_curr(dst_rq); 2390 2436 2391 - dispatch_to_local_dsq_unlock(rq, src_rq, locked_dst_rq); 2437 + /* switch back to @rq lock */ 2438 + if (rq != dst_rq) { 2439 + raw_spin_rq_unlock(dst_rq); 2440 + raw_spin_rq_lock(rq); 2441 + } 2392 2442 2393 2443 return dsp ? DTL_DISPATCHED : DTL_LOST; 2394 2444 }