Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rcu: Add synchronize_sched_expedited() primitive

This adds the synchronize_sched_expedited() primitive that
implements the "big hammer" expedited RCU grace periods.

This primitive is placed in kernel/sched.c rather than
kernel/rcupdate.c due to its need to interact closely with the
migration_thread() kthread.

The idea is to wake up this kthread with req->task set to NULL,
in response to which the kthread reports the quiescent state
resulting from the kthread having been scheduled.

Because this patch needs to fall back to the slow versions of
the primitives in response to some races with CPU onlining and
offlining, a new synchronize_rcu_bh() primitive is added as
well.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Cc: davem@davemloft.net
Cc: dada1@cosmosbay.com
Cc: zbr@ioremap.net
Cc: jeff.chua.linux@gmail.com
Cc: paulus@samba.org
Cc: laijs@cn.fujitsu.com
Cc: jengelh@medozas.de
Cc: r000n@r000n.net
Cc: benh@kernel.crashing.org
Cc: mathieu.desnoyers@polymtl.ca
LKML-Reference: <12459460982947-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Authored by Paul E. McKenney; committed by Ingo Molnar.
03b042bf c17ef453

+186 -15
+13 -12
include/linux/rcupdate.h
··· 51 51 void (*func)(struct rcu_head *head); 52 52 }; 53 53 54 - /* Internal to kernel, but needed by rcupreempt.h. */ 54 + /* Exported common interfaces */ 55 + extern void synchronize_rcu(void); 56 + extern void synchronize_rcu_bh(void); 57 + extern void rcu_barrier(void); 58 + extern void rcu_barrier_bh(void); 59 + extern void rcu_barrier_sched(void); 60 + extern void synchronize_sched_expedited(void); 61 + extern int sched_expedited_torture_stats(char *page); 62 + 63 + /* Internal to kernel */ 64 + extern void rcu_init(void); 65 + extern void rcu_scheduler_starting(void); 66 + extern int rcu_needs_cpu(int cpu); 55 67 extern int rcu_scheduler_active; 56 68 57 69 #if defined(CONFIG_TREE_RCU) ··· 268 256 */ 269 257 extern void call_rcu_bh(struct rcu_head *head, 270 258 void (*func)(struct rcu_head *head)); 271 - 272 - /* Exported common interfaces */ 273 - extern void synchronize_rcu(void); 274 - extern void rcu_barrier(void); 275 - extern void rcu_barrier_bh(void); 276 - extern void rcu_barrier_sched(void); 277 - 278 - /* Internal to kernel */ 279 - extern void rcu_init(void); 280 - extern void rcu_scheduler_starting(void); 281 - extern int rcu_needs_cpu(int cpu); 282 259 283 260 #endif /* __LINUX_RCUPDATE_H */
+10
include/linux/rcupreempt.h
··· 74 74 75 75 extern void __synchronize_sched(void); 76 76 77 + static inline void synchronize_rcu_expedited(void) 78 + { 79 + synchronize_rcu(); /* Placeholder for new rcupreempt implementation. */ 80 + } 81 + 82 + static inline void synchronize_rcu_bh_expedited(void) 83 + { 84 + synchronize_rcu_bh(); /* Placeholder for new rcupreempt impl. */ 85 + } 86 + 77 87 extern void __rcu_init(void); 78 88 extern void rcu_init_sched(void); 79 89 extern void rcu_check_callbacks(int cpu, int user);
+11 -1
include/linux/rcutree.h
··· 286 286 287 287 #define call_rcu_sched(head, func) call_rcu(head, func) 288 288 289 - static inline void rcu_init_sched(void) 289 + static inline void synchronize_rcu_expedited(void) 290 290 { 291 + synchronize_sched_expedited(); 292 + } 293 + 294 + static inline void synchronize_rcu_bh_expedited(void) 295 + { 296 + synchronize_sched_expedited(); 291 297 } 292 298 293 299 extern void __rcu_init(void); ··· 302 296 303 297 extern long rcu_batches_completed(void); 304 298 extern long rcu_batches_completed_bh(void); 299 + 300 + static inline void rcu_init_sched(void) 301 + { 302 + } 305 303 306 304 #ifdef CONFIG_NO_HZ 307 305 void rcu_enter_nohz(void);
+25
kernel/rcupdate.c
··· 98 98 } 99 99 EXPORT_SYMBOL_GPL(synchronize_rcu); 100 100 101 + /** 102 + * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. 103 + * 104 + * Control will return to the caller some time after a full rcu_bh grace 105 + * period has elapsed, in other words after all currently executing rcu_bh 106 + * read-side critical sections have completed. RCU read-side critical 107 + * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 108 + * and may be nested. 109 + */ 110 + void synchronize_rcu_bh(void) 111 + { 112 + struct rcu_synchronize rcu; 113 + 114 + if (rcu_blocking_is_gp()) 115 + return; 116 + 117 + init_completion(&rcu.completion); 118 + /* Will wake me after RCU finished. */ 119 + call_rcu_bh(&rcu.head, wakeme_after_rcu); 120 + /* Wait for it. */ 121 + wait_for_completion(&rcu.completion); 122 + } 123 + EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 124 + 101 125 static void rcu_barrier_callback(struct rcu_head *notused) 102 126 { 103 127 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) ··· 153 129 static inline void wait_migrated_callbacks(void) 154 130 { 155 131 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); 132 + smp_mb(); /* In case we didn't sleep. */ 156 133 } 157 134 158 135 /*
+127 -2
kernel/sched.c
··· 7024 7024 return ret; 7025 7025 } 7026 7026 7027 + #define RCU_MIGRATION_IDLE 0 7028 + #define RCU_MIGRATION_NEED_QS 1 7029 + #define RCU_MIGRATION_GOT_QS 2 7030 + #define RCU_MIGRATION_MUST_SYNC 3 7031 + 7027 7032 /* 7028 7033 * migration_thread - this is a highprio system thread that performs 7029 7034 * thread migration by bumping thread off CPU then 'pushing' onto ··· 7036 7031 */ 7037 7032 static int migration_thread(void *data) 7038 7033 { 7034 + int badcpu; 7039 7035 int cpu = (long)data; 7040 7036 struct rq *rq; 7041 7037 ··· 7071 7065 req = list_entry(head->next, struct migration_req, list); 7072 7066 list_del_init(head->next); 7073 7067 7074 - spin_unlock(&rq->lock); 7075 - __migrate_task(req->task, cpu, req->dest_cpu); 7068 + if (req->task != NULL) { 7069 + spin_unlock(&rq->lock); 7070 + __migrate_task(req->task, cpu, req->dest_cpu); 7071 + } else if (likely(cpu == (badcpu = smp_processor_id()))) { 7072 + req->dest_cpu = RCU_MIGRATION_GOT_QS; 7073 + spin_unlock(&rq->lock); 7074 + } else { 7075 + req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 7076 + spin_unlock(&rq->lock); 7077 + WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 7078 + } 7076 7079 local_irq_enable(); 7077 7080 7078 7081 complete(&req->done); ··· 10569 10554 .subsys_id = cpuacct_subsys_id, 10570 10555 }; 10571 10556 #endif /* CONFIG_CGROUP_CPUACCT */ 10557 + 10558 + #ifndef CONFIG_SMP 10559 + 10560 + int rcu_expedited_torture_stats(char *page) 10561 + { 10562 + return 0; 10563 + } 10564 + EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 10565 + 10566 + void synchronize_sched_expedited(void) 10567 + { 10568 + } 10569 + EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 10570 + 10571 + #else /* #ifndef CONFIG_SMP */ 10572 + 10573 + static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 10574 + static DEFINE_MUTEX(rcu_sched_expedited_mutex); 10575 + 10576 + #define RCU_EXPEDITED_STATE_POST -2 10577 + #define RCU_EXPEDITED_STATE_IDLE -1 10578 + 10579 + static int 
rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10580 + 10581 + int rcu_expedited_torture_stats(char *page) 10582 + { 10583 + int cnt = 0; 10584 + int cpu; 10585 + 10586 + cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 10587 + for_each_online_cpu(cpu) { 10588 + cnt += sprintf(&page[cnt], " %d:%d", 10589 + cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 10590 + } 10591 + cnt += sprintf(&page[cnt], "\n"); 10592 + return cnt; 10593 + } 10594 + EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 10595 + 10596 + static long synchronize_sched_expedited_count; 10597 + 10598 + /* 10599 + * Wait for an rcu-sched grace period to elapse, but use "big hammer" 10600 + * approach to force grace period to end quickly. This consumes 10601 + * significant time on all CPUs, and is thus not recommended for 10602 + * any sort of common-case code. 10603 + * 10604 + * Note that it is illegal to call this function while holding any 10605 + * lock that is acquired by a CPU-hotplug notifier. Failing to 10606 + * observe this restriction will result in deadlock. 10607 + */ 10608 + void synchronize_sched_expedited(void) 10609 + { 10610 + int cpu; 10611 + unsigned long flags; 10612 + bool need_full_sync = 0; 10613 + struct rq *rq; 10614 + struct migration_req *req; 10615 + long snap; 10616 + int trycount = 0; 10617 + 10618 + smp_mb(); /* ensure prior mod happens before capturing snap. 
*/ 10619 + snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 10620 + get_online_cpus(); 10621 + while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 10622 + put_online_cpus(); 10623 + if (trycount++ < 10) 10624 + udelay(trycount * num_online_cpus()); 10625 + else { 10626 + synchronize_sched(); 10627 + return; 10628 + } 10629 + if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 10630 + smp_mb(); /* ensure test happens before caller kfree */ 10631 + return; 10632 + } 10633 + get_online_cpus(); 10634 + } 10635 + rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 10636 + for_each_online_cpu(cpu) { 10637 + rq = cpu_rq(cpu); 10638 + req = &per_cpu(rcu_migration_req, cpu); 10639 + init_completion(&req->done); 10640 + req->task = NULL; 10641 + req->dest_cpu = RCU_MIGRATION_NEED_QS; 10642 + spin_lock_irqsave(&rq->lock, flags); 10643 + list_add(&req->list, &rq->migration_queue); 10644 + spin_unlock_irqrestore(&rq->lock, flags); 10645 + wake_up_process(rq->migration_thread); 10646 + } 10647 + for_each_online_cpu(cpu) { 10648 + rcu_expedited_state = cpu; 10649 + req = &per_cpu(rcu_migration_req, cpu); 10650 + rq = cpu_rq(cpu); 10651 + wait_for_completion(&req->done); 10652 + spin_lock_irqsave(&rq->lock, flags); 10653 + if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 10654 + need_full_sync = 1; 10655 + req->dest_cpu = RCU_MIGRATION_IDLE; 10656 + spin_unlock_irqrestore(&rq->lock, flags); 10657 + } 10658 + rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10659 + mutex_unlock(&rcu_sched_expedited_mutex); 10660 + put_online_cpus(); 10661 + if (need_full_sync) 10662 + synchronize_sched(); 10663 + } 10664 + EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 10665 + 10666 + #endif /* #else #ifndef CONFIG_SMP */