Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rcu: Make cond_resched_rcu_qs() apply to normal RCU flavors

Although cond_resched_rcu_qs() only applies to TASKS_RCU, it is used
in places where it would be useful for it to apply to the normal RCU
flavors, rcu_preempt, rcu_sched, and rcu_bh. This is especially the
case for workloads that aggressively overload the system, particularly
those that generate large numbers of RCU updates on systems running
NO_HZ_FULL CPUs. This commit therefore communicates quiescent states
from cond_resched_rcu_qs() to the normal RCU flavors.

Note that it is unfortunately necessary to leave the old ->passed_quiesce
mechanism in place to allow quiescent states that apply to only one
flavor to be recorded. (Yes, we could decrement ->rcu_qs_ctr_snap in
that case, but that is not so good for debugging of RCU internals.)
In addition, if one of the RCU flavors' grace periods has stalled, this
will invoke rcu_momentary_dyntick_idle(), resulting in a heavy-weight
quiescent state visible from other CPUs.

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Merge commit from Sasha Levin fixing a bug where __this_cpu()
was used in preemptible code. ]

+65 -25
+16 -16
Documentation/RCU/trace.txt
··· 56 56 57 57 The output of "cat rcu/rcu_preempt/rcudata" looks as follows: 58 58 59 - 0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 60 - 1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 61 - 2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 62 - 3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 63 - 4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 64 - 5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 65 - 6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 66 - 7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 59 + 0!c=30455 g=30456 pq=1/0 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 60 + 1!c=30719 g=30720 pq=1/0 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 61 + 2!c=30150 g=30151 pq=1/1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 62 + 3 c=31249 g=31250 pq=1/1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 63 + 4!c=29502 g=29503 pq=1/0 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 64 + 5 c=31201 g=31202 pq=1/0 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 65 + 6!c=30253 g=30254 pq=1/0 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 66 + 7 c=31178 g=31178 pq=1/0 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... 
b=10 ci=109819 nci=0 co=1115 ca=969 67 67 68 68 This file has one line per CPU, or eight for this 8-CPU system. 69 69 The fields are as follows: ··· 188 188 Kernels compiled with CONFIG_RCU_BOOST=y display the following from 189 189 /debug/rcu/rcu_preempt/rcudata: 190 190 191 - 0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 192 - 1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 193 - 2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 194 - 3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 195 - 4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 196 - 5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 197 - 6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 198 - 7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 191 + 0!c=12865 g=12866 pq=1/0 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 192 + 1 c=14407 g=14408 pq=1/0 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 193 + 2 c=14407 g=14408 pq=1/0 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 194 + 3 c=14407 g=14408 pq=1/0 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 195 + 4 c=14405 g=14406 pq=1/0 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... 
kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 196 + 5!c=14168 g=14169 pq=1/0 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 197 + 6 c=14404 g=14405 pq=1/0 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 198 + 7 c=14407 g=14408 pq=1/0 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 199 199 200 200 This is similar to the output discussed above, but contains the following 201 201 additional fields:
+2 -1
include/linux/rcupdate.h
··· 331 331 extern struct srcu_struct tasks_rcu_exit_srcu; 332 332 #define rcu_note_voluntary_context_switch(t) \ 333 333 do { \ 334 + rcu_all_qs(); \ 334 335 if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \ 335 336 ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \ 336 337 } while (0) 337 338 #else /* #ifdef CONFIG_TASKS_RCU */ 338 339 #define TASKS_RCU(x) do { } while (0) 339 - #define rcu_note_voluntary_context_switch(t) do { } while (0) 340 + #define rcu_note_voluntary_context_switch(t) rcu_all_qs() 340 341 #endif /* #else #ifdef CONFIG_TASKS_RCU */ 341 342 342 343 /**
+4 -1
include/linux/rcutiny.h
··· 154 154 return true; 155 155 } 156 156 157 - 158 157 #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ 158 + 159 + static inline void rcu_all_qs(void) 160 + { 161 + } 159 162 160 163 #endif /* __LINUX_RCUTINY_H */
+2
include/linux/rcutree.h
··· 97 97 98 98 bool rcu_is_watching(void); 99 99 100 + void rcu_all_qs(void); 101 + 100 102 #endif /* __LINUX_RCUTREE_H */
+33 -5
kernel/rcu/tree.c
··· 219 219 #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 220 220 }; 221 221 222 + DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); 223 + EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); 224 + 222 225 /* 223 226 * Let the RCU core know that this CPU has gone through the scheduler, 224 227 * which is a quiescent state. This is called when the need for a ··· 290 287 trace_rcu_utilization(TPS("End context switch")); 291 288 } 292 289 EXPORT_SYMBOL_GPL(rcu_note_context_switch); 290 + 291 + /* 292 + * Register a quiescent state for all RCU flavors. If there is an 293 + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight 294 + * dyntick-idle quiescent state visible to other CPUs (but only for those 295 + * RCU flavors in desperate need of a quiescent state, which will normally 296 + * be none of them). Either way, do a lightweight quiescent state for 297 + * all RCU flavors. 298 + */ 299 + void rcu_all_qs(void) 300 + { 301 + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 302 + rcu_momentary_dyntick_idle(); 303 + this_cpu_inc(rcu_qs_ctr); 304 + } 305 + EXPORT_SYMBOL_GPL(rcu_all_qs); 293 306 294 307 static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 295 308 static long qhimark = 10000; /* If this many pending, ignore blimit. 
*/ ··· 1628 1609 rdp->gpnum = rnp->gpnum; 1629 1610 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1630 1611 rdp->passed_quiesce = 0; 1612 + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 1631 1613 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1632 1614 zero_cpu_stall_ticks(rdp); 1633 1615 ACCESS_ONCE(rdp->gpwrap) = false; ··· 2095 2075 rnp = rdp->mynode; 2096 2076 raw_spin_lock_irqsave(&rnp->lock, flags); 2097 2077 smp_mb__after_unlock_lock(); 2098 - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || 2099 - rnp->completed == rnp->gpnum || rdp->gpwrap) { 2078 + if ((rdp->passed_quiesce == 0 && 2079 + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || 2080 + rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || 2081 + rdp->gpwrap) { 2100 2082 2101 2083 /* 2102 2084 * The grace period in which this quiescent state was ··· 2107 2085 * within the current grace period. 2108 2086 */ 2109 2087 rdp->passed_quiesce = 0; /* need qs for new gp. */ 2088 + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 2110 2089 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2111 2090 return; 2112 2091 } ··· 2152 2129 * Was there a quiescent state since the beginning of the grace 2153 2130 * period? If no, then exit and wait for the next call. 2154 2131 */ 2155 - if (!rdp->passed_quiesce) 2132 + if (!rdp->passed_quiesce && 2133 + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) 2156 2134 return; 2157 2135 2158 2136 /* ··· 3198 3174 3199 3175 /* Is the RCU core waiting for a quiescent state from this CPU? 
*/ 3200 3176 if (rcu_scheduler_fully_active && 3201 - rdp->qs_pending && !rdp->passed_quiesce) { 3177 + rdp->qs_pending && !rdp->passed_quiesce && 3178 + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { 3202 3179 rdp->n_rp_qs_pending++; 3203 - } else if (rdp->qs_pending && rdp->passed_quiesce) { 3180 + } else if (rdp->qs_pending && 3181 + (rdp->passed_quiesce || 3182 + rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { 3204 3183 rdp->n_rp_report_qs++; 3205 3184 return 1; 3206 3185 } ··· 3537 3510 rdp->gpnum = rnp->completed; 3538 3511 rdp->completed = rnp->completed; 3539 3512 rdp->passed_quiesce = 0; 3513 + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 3540 3514 rdp->qs_pending = 0; 3541 3515 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 3542 3516 }
+2
kernel/rcu/tree.h
··· 257 257 /* in order to detect GP end. */ 258 258 unsigned long gpnum; /* Highest gp number that this CPU */ 259 259 /* is aware of having started. */ 260 + unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ 261 + /* for rcu_all_qs() invocations. */ 260 262 bool passed_quiesce; /* User-mode/idle loop etc. */ 261 263 bool qs_pending; /* Core waits for quiesc state. */ 262 264 bool beenonline; /* CPU online at least once. */
+6 -2
kernel/rcu/tree_trace.c
··· 46 46 #define RCU_TREE_NONCORE 47 47 #include "tree.h" 48 48 49 + DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); 50 + 49 51 static int r_open(struct inode *inode, struct file *file, 50 52 const struct seq_operations *op) 51 53 { ··· 117 115 118 116 if (!rdp->beenonline) 119 117 return; 120 - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", 118 + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", 121 119 rdp->cpu, 122 120 cpu_is_offline(rdp->cpu) ? '!' : ' ', 123 121 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 124 - rdp->passed_quiesce, rdp->qs_pending); 122 + rdp->passed_quiesce, 123 + rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), 124 + rdp->qs_pending); 125 125 seq_printf(m, " dt=%d/%llx/%d df=%lu", 126 126 atomic_read(&rdp->dynticks->dynticks), 127 127 rdp->dynticks->dynticks_nesting,