Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rcu: Kill rnp->ofl_seq and use only rcu_state.ofl_lock for exclusion

If we allow architectures to bring APs online in parallel, then we end
up requiring rcu_cpu_starting() to be reentrant. But currently, the
manipulation of rnp->ofl_seq is not thread-safe.

However, rnp->ofl_seq is also fairly much pointless anyway since both
rcu_cpu_starting() and rcu_report_dead() hold rcu_state.ofl_lock for
fairly much the whole time that rnp->ofl_seq is set to an odd number
to indicate that an operation is in progress.

So drop rnp->ofl_seq completely, and use only rcu_state.ofl_lock.

This has a couple of minor complexities: lockdep will complain when we
take rcu_state.ofl_lock, and currently accepts the 'excuse' of having
an odd value in rnp->ofl_seq. So switch it to an arch_spinlock_t to
avoid that false positive complaint. Since we're killing rnp->ofl_seq
of course that 'excuse' has to be changed too, so make it check for
arch_spin_is_locked(rcu_state.ofl_lock).

There's no arch_spin_lock_irqsave() so we have to manually save and
restore local interrupts around the locking.

At Paul's request based on Neeraj's analysis, make rcu_gp_init not just
wait but *exclude* any CPU online/offline activity, which was fairly
much true already by virtue of it holding rcu_state.ofl_lock.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>

Authored by David Woodhouse; committed by Paul E. McKenney.
Commit hashes shown on the page: 82980b16, da123016.

Overall diffstat: +37 insertions, -38 deletions.

kernel/rcu/tree.c (+36 -35):
··· 91 91 .abbr = RCU_ABBR, 92 92 .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), 93 93 .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), 94 - .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock), 94 + .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, 95 95 }; 96 96 97 97 /* Dump rcu_node combining tree at boot to verify correct setup. */ ··· 1175 1175 preempt_disable_notrace(); 1176 1176 rdp = this_cpu_ptr(&rcu_data); 1177 1177 rnp = rdp->mynode; 1178 - if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1) 1178 + /* 1179 + * Strictly, we care here about the case where the current CPU is 1180 + * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask 1181 + * not being up to date. So arch_spin_is_locked() might have a 1182 + * false positive if it's held by some *other* CPU, but that's 1183 + * OK because that just means a false *negative* on the warning. 1184 + */ 1185 + if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || 1186 + arch_spin_is_locked(&rcu_state.ofl_lock)) 1179 1187 ret = true; 1180 1188 preempt_enable_notrace(); 1181 1189 return ret; ··· 1747 1739 */ 1748 1740 static noinline_for_stack bool rcu_gp_init(void) 1749 1741 { 1750 - unsigned long firstseq; 1751 1742 unsigned long flags; 1752 1743 unsigned long oldmask; 1753 1744 unsigned long mask; ··· 1789 1782 * of RCU's Requirements documentation. 1790 1783 */ 1791 1784 WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); 1785 + /* Exclude CPU hotplug operations. */ 1792 1786 rcu_for_each_leaf_node(rnp) { 1793 - // Wait for CPU-hotplug operations that might have 1794 - // started before this grace period did. 1795 - smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values. 1796 - firstseq = READ_ONCE(rnp->ofl_seq); 1797 - if (firstseq & 0x1) 1798 - while (firstseq == READ_ONCE(rnp->ofl_seq)) 1799 - schedule_timeout_idle(1); // Can't wake unless RCU is watching. 1800 - smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values. 
1801 - raw_spin_lock(&rcu_state.ofl_lock); 1802 - raw_spin_lock_irq_rcu_node(rnp); 1787 + local_irq_save(flags); 1788 + arch_spin_lock(&rcu_state.ofl_lock); 1789 + raw_spin_lock_rcu_node(rnp); 1803 1790 if (rnp->qsmaskinit == rnp->qsmaskinitnext && 1804 1791 !rnp->wait_blkd_tasks) { 1805 1792 /* Nothing to do on this leaf rcu_node structure. */ 1806 - raw_spin_unlock_irq_rcu_node(rnp); 1807 - raw_spin_unlock(&rcu_state.ofl_lock); 1793 + raw_spin_unlock_rcu_node(rnp); 1794 + arch_spin_unlock(&rcu_state.ofl_lock); 1795 + local_irq_restore(flags); 1808 1796 continue; 1809 1797 } 1810 1798 ··· 1834 1832 rcu_cleanup_dead_rnp(rnp); 1835 1833 } 1836 1834 1837 - raw_spin_unlock_irq_rcu_node(rnp); 1838 - raw_spin_unlock(&rcu_state.ofl_lock); 1835 + raw_spin_unlock_rcu_node(rnp); 1836 + arch_spin_unlock(&rcu_state.ofl_lock); 1837 + local_irq_restore(flags); 1839 1838 } 1840 1839 rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ 1841 1840 ··· 4290 4287 4291 4288 rnp = rdp->mynode; 4292 4289 mask = rdp->grpmask; 4293 - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); 4294 - WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); 4290 + local_irq_save(flags); 4291 + arch_spin_lock(&rcu_state.ofl_lock); 4295 4292 rcu_dynticks_eqs_online(); 4296 - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). 4297 - raw_spin_lock_irqsave_rcu_node(rnp, flags); 4293 + raw_spin_lock_rcu_node(rnp); 4298 4294 WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); 4299 4295 newcpu = !(rnp->expmaskinitnext & mask); 4300 4296 rnp->expmaskinitnext |= mask; ··· 4306 4304 4307 4305 /* An incoming CPU should never be blocking a grace period. */ 4308 4306 if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ 4307 + /* rcu_report_qs_rnp() *really* wants some flags to restore */ 4308 + unsigned long flags2; 4309 + 4310 + local_irq_save(flags2); 4309 4311 rcu_disable_urgency_upon_qs(rdp); 4310 4312 /* Report QS -after- changing ->qsmaskinitnext! 
*/ 4311 - rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); 4313 + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2); 4312 4314 } else { 4313 - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 4315 + raw_spin_unlock_rcu_node(rnp); 4314 4316 } 4315 - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). 4316 - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); 4317 - WARN_ON_ONCE(rnp->ofl_seq & 0x1); 4317 + arch_spin_unlock(&rcu_state.ofl_lock); 4318 + local_irq_restore(flags); 4318 4319 smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ 4319 4320 } 4320 4321 ··· 4331 4326 */ 4332 4327 void rcu_report_dead(unsigned int cpu) 4333 4328 { 4334 - unsigned long flags; 4329 + unsigned long flags, seq_flags; 4335 4330 unsigned long mask; 4336 4331 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 4337 4332 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ ··· 4345 4340 4346 4341 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ 4347 4342 mask = rdp->grpmask; 4348 - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); 4349 - WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); 4350 - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). 4351 - raw_spin_lock(&rcu_state.ofl_lock); 4343 + local_irq_save(seq_flags); 4344 + arch_spin_lock(&rcu_state.ofl_lock); 4352 4345 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ 4353 4346 rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); 4354 4347 rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); ··· 4357 4354 } 4358 4355 WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); 4359 4356 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 4360 - raw_spin_unlock(&rcu_state.ofl_lock); 4361 - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). 
4362 - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); 4363 - WARN_ON_ONCE(rnp->ofl_seq & 0x1); 4357 + arch_spin_unlock(&rcu_state.ofl_lock); 4358 + local_irq_restore(seq_flags); 4364 4359 4365 4360 rdp->cpu_started = false; 4366 4361 }
kernel/rcu/tree.h (+1 -3):
··· 56 56 /* Initialized from ->qsmaskinitnext at the */ 57 57 /* beginning of each grace period. */ 58 58 unsigned long qsmaskinitnext; 59 - unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */ 60 - /* Online CPUs for next grace period. */ 61 59 unsigned long expmask; /* CPUs or groups that need to check in */ 62 60 /* to allow the current expedited GP */ 63 61 /* to complete. */ ··· 353 355 const char *name; /* Name of structure. */ 354 356 char abbr; /* Abbreviated name. */ 355 357 356 - raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; 358 + arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; 357 359 /* Synchronize offline with */ 358 360 /* GP pre-initialization. */ 359 361 };