Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

rcu: Create transitive rnp->lock acquisition functions

Providing RCU's memory-ordering guarantees requires that the rcu_node
tree's locking provide transitive memory ordering, which the Linux kernel's
spinlocks currently do not provide unless smp_mb__after_unlock_lock()
is used. Having a separate smp_mb__after_unlock_lock() after each and
every lock acquisition is error-prone, hard to read, and a bit annoying,
so this commit provides wrapper functions that pull in the
smp_mb__after_unlock_lock() invocations.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
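
The conversion that follows is mechanical at each call site. As a rough sketch of the before/after shape (the enclosing function rcu_example_quiesce() below is hypothetical and only illustrative; it is not part of the patch):

/* Minimal sketch of the per-call-site conversion, assuming a valid rcu_node *rnp. */
static void rcu_example_quiesce(struct rcu_node *rnp)
{
        unsigned long flags;

        /* Before this patch: the barrier had to follow every acquisition by hand. */
        raw_spin_lock_irqsave(&rnp->lock, flags);
        smp_mb__after_unlock_lock();    /* Restores transitive ordering. */
        raw_spin_unlock_irqrestore(&rnp->lock, flags);

        /* After this patch: the wrapper (defined in kernel/rcu/tree.h below) supplies it. */
        raw_spin_lock_irqsave_rcu_node(rnp, flags);
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
}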

Authored by Peter Zijlstra, committed by Paul E. McKenney
2a67e741 1ec21837

3 files changed, 71 insertions(+), 64 deletions(-)

kernel/rcu/tree.c  (+26 -52)
@@ -1534,10 +1534,8 @@
          * hold it, acquire the root rcu_node structure's lock in order to
          * start one (if needed).
          */
-        if (rnp != rnp_root) {
-                raw_spin_lock(&rnp_root->lock);
-                smp_mb__after_unlock_lock();
-        }
+        if (rnp != rnp_root)
+                raw_spin_lock_rcu_node(rnp_root);

         /*
          * Get a new grace-period number. If there really is no grace
@@ -1784,11 +1786,10 @@
         if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
              rdp->completed == READ_ONCE(rnp->completed) &&
              !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
-            !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
+            !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
                 local_irq_restore(flags);
                 return;
         }
-        smp_mb__after_unlock_lock();
         needwake = __note_gp_changes(rsp, rnp, rdp);
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
         if (needwake)
@@ -1811,8 +1814,7 @@
         struct rcu_node *rnp = rcu_get_root(rsp);

         WRITE_ONCE(rsp->gp_activity, jiffies);
-        raw_spin_lock_irq(&rnp->lock);
-        smp_mb__after_unlock_lock();
+        raw_spin_lock_irq_rcu_node(rnp);
         if (!READ_ONCE(rsp->gp_flags)) {
                 /* Spurious wakeup, tell caller to go back to sleep. */
                 raw_spin_unlock_irq(&rnp->lock);
@@ -1843,8 +1847,7 @@
          */
         rcu_for_each_leaf_node(rsp, rnp) {
                 rcu_gp_slow(rsp, gp_preinit_delay);
-                raw_spin_lock_irq(&rnp->lock);
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_irq_rcu_node(rnp);
                 if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
                     !rnp->wait_blkd_tasks) {
                         /* Nothing to do on this leaf rcu_node structure. */
@@ -1899,8 +1904,7 @@
          */
         rcu_for_each_node_breadth_first(rsp, rnp) {
                 rcu_gp_slow(rsp, gp_init_delay);
-                raw_spin_lock_irq(&rnp->lock);
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_irq_rcu_node(rnp);
                 rdp = this_cpu_ptr(rsp->rda);
                 rcu_preempt_check_blocked_tasks(rnp);
                 rnp->qsmask = rnp->qsmaskinit;
@@ -1967,8 +1973,7 @@
         }
         /* Clear flag to prevent immediate re-entry. */
         if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
-                raw_spin_lock_irq(&rnp->lock);
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_irq_rcu_node(rnp);
                 WRITE_ONCE(rsp->gp_flags,
                            READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
                 raw_spin_unlock_irq(&rnp->lock);
@@ -1986,8 +1993,7 @@
         struct rcu_node *rnp = rcu_get_root(rsp);

         WRITE_ONCE(rsp->gp_activity, jiffies);
-        raw_spin_lock_irq(&rnp->lock);
-        smp_mb__after_unlock_lock();
+        raw_spin_lock_irq_rcu_node(rnp);
         gp_duration = jiffies - rsp->gp_start;
         if (gp_duration > rsp->gp_max)
                 rsp->gp_max = gp_duration;
@@ -2011,8 +2019,7 @@
          * grace period is recorded in any of the rcu_node structures.
          */
         rcu_for_each_node_breadth_first(rsp, rnp) {
-                raw_spin_lock_irq(&rnp->lock);
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_irq_rcu_node(rnp);
                 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
                 WARN_ON_ONCE(rnp->qsmask);
                 WRITE_ONCE(rnp->completed, rsp->gpnum);
@@ -2026,8 +2035,7 @@
                 rcu_gp_slow(rsp, gp_cleanup_delay);
         }
         rnp = rcu_get_root(rsp);
-        raw_spin_lock_irq(&rnp->lock);
-        smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
+        raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
         rcu_nocb_gp_set(rnp, nocb);

         /* Declare grace period done. */
@@ -2274,8 +2284,7 @@
                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
                 rnp_c = rnp;
                 rnp = rnp->parent;
-                raw_spin_lock_irqsave(&rnp->lock, flags);
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_irqsave_rcu_node(rnp, flags);
                 oldmask = rnp_c->qsmask;
         }

@@ -2321,8 +2332,7 @@
         gps = rnp->gpnum;
         mask = rnp->grpmask;
         raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-        raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
-        smp_mb__after_unlock_lock();
+        raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
         rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
 }

@@ -2343,8 +2355,7 @@
         struct rcu_node *rnp;

         rnp = rdp->mynode;
-        raw_spin_lock_irqsave(&rnp->lock, flags);
-        smp_mb__after_unlock_lock();
+        raw_spin_lock_irqsave_rcu_node(rnp, flags);
         if ((rdp->cpu_no_qs.b.norm &&
              rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
             rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
@@ -2569,8 +2582,7 @@
                 rnp = rnp->parent;
                 if (!rnp)
                         break;
-                raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-                smp_mb__after_unlock_lock(); /* GP memory ordering. */
+                raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
                 rnp->qsmaskinit &= ~mask;
                 rnp->qsmask &= ~mask;
                 if (rnp->qsmaskinit) {
@@ -2597,8 +2611,7 @@

         /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
         mask = rdp->grpmask;
-        raw_spin_lock_irqsave(&rnp->lock, flags);
-        smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
+        raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
         rnp->qsmaskinitnext &= ~mask;
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
@@ -2794,8 +2809,7 @@
         rcu_for_each_leaf_node(rsp, rnp) {
                 cond_resched_rcu_qs();
                 mask = 0;
-                raw_spin_lock_irqsave(&rnp->lock, flags);
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_irqsave_rcu_node(rnp, flags);
                 if (rnp->qsmask == 0) {
                         if (rcu_state_p == &rcu_sched_state ||
                             rsp != rcu_state_p ||
@@ -2865,8 +2881,7 @@
         /* rnp_old == rcu_get_root(rsp), rnp == NULL. */

         /* Reached the root of the rcu_node tree, acquire lock. */
-        raw_spin_lock_irqsave(&rnp_old->lock, flags);
-        smp_mb__after_unlock_lock();
+        raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
         raw_spin_unlock(&rnp_old->fqslock);
         if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
                 rsp->n_force_qs_lh++;
@@ -2988,8 +3005,7 @@
         if (!rcu_gp_in_progress(rsp)) {
                 struct rcu_node *rnp_root = rcu_get_root(rsp);

-                raw_spin_lock(&rnp_root->lock);
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_rcu_node(rnp_root);
                 needwake = rcu_start_gp(rsp);
                 raw_spin_unlock(&rnp_root->lock);
                 if (needwake)
@@ -3408,8 +3426,7 @@
          * CPUs for the current rcu_node structure up the rcu_node tree.
          */
         rcu_for_each_leaf_node(rsp, rnp) {
-                raw_spin_lock_irqsave(&rnp->lock, flags);
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_irqsave_rcu_node(rnp, flags);
                 if (rnp->expmaskinit == rnp->expmaskinitnext) {
                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
                         continue; /* No new CPUs, nothing to do. */
@@ -3428,8 +3447,7 @@
                 rnp_up = rnp->parent;
                 done = false;
                 while (rnp_up) {
-                        raw_spin_lock_irqsave(&rnp_up->lock, flags);
-                        smp_mb__after_unlock_lock();
+                        raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
                         if (rnp_up->expmaskinit)
                                 done = true;
                         rnp_up->expmaskinit |= mask;
@@ -3452,8 +3472,7 @@

         sync_exp_reset_tree_hotplug(rsp);
         rcu_for_each_node_breadth_first(rsp, rnp) {
-                raw_spin_lock_irqsave(&rnp->lock, flags);
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_irqsave_rcu_node(rnp, flags);
                 WARN_ON_ONCE(rnp->expmask);
                 rnp->expmask = rnp->expmaskinit;
                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -3510,8 +3531,7 @@
                 mask = rnp->grpmask;
                 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
                 rnp = rnp->parent;
-                raw_spin_lock(&rnp->lock); /* irqs already disabled */
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
                 WARN_ON_ONCE(!(rnp->expmask & mask));
                 rnp->expmask &= ~mask;
         }
@@ -3527,8 +3549,7 @@
 {
         unsigned long flags;

-        raw_spin_lock_irqsave(&rnp->lock, flags);
-        smp_mb__after_unlock_lock();
+        raw_spin_lock_irqsave_rcu_node(rnp, flags);
         __rcu_report_exp_rnp(rsp, rnp, wake, flags);
 }

@@ -3541,8 +3564,7 @@
 {
         unsigned long flags;

-        raw_spin_lock_irqsave(&rnp->lock, flags);
-        smp_mb__after_unlock_lock();
+        raw_spin_lock_irqsave_rcu_node(rnp, flags);
         if (!(rnp->expmask & mask)) {
                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
                 return;
@@ -3684,8 +3708,7 @@

         sync_exp_reset_tree(rsp);
         rcu_for_each_leaf_node(rsp, rnp) {
-                raw_spin_lock_irqsave(&rnp->lock, flags);
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_irqsave_rcu_node(rnp, flags);

                 /* Each pass checks a CPU for identity, offline, and idle. */
                 mask_ofl_test = 0;
@@ -4173,8 +4198,7 @@
          */
         rnp = rdp->mynode;
         mask = rdp->grpmask;
-        raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-        smp_mb__after_unlock_lock();
+        raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
         rnp->qsmaskinitnext |= mask;
         rnp->expmaskinitnext |= mask;
         if (!rdp->beenonline)
kernel/rcu/tree.h  (+39)

@@ -664,3 +664,42 @@
 #else /* #ifdef CONFIG_PPC */
 #define smp_mb__after_unlock_lock() do { } while (0)
 #endif /* #else #ifdef CONFIG_PPC */
+
+/*
+ * Wrappers for the rcu_node::lock acquire.
+ *
+ * Because the rcu_nodes form a tree, the tree traversal locking will observe
+ * different lock values, this in turn means that an UNLOCK of one level
+ * followed by a LOCK of another level does not imply a full memory barrier;
+ * and most importantly transitivity is lost.
+ *
+ * In order to restore full ordering between tree levels, augment the regular
+ * lock acquire functions with smp_mb__after_unlock_lock().
+ */
+static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
+{
+        raw_spin_lock(&rnp->lock);
+        smp_mb__after_unlock_lock();
+}
+
+static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
+{
+        raw_spin_lock_irq(&rnp->lock);
+        smp_mb__after_unlock_lock();
+}
+
+#define raw_spin_lock_irqsave_rcu_node(rnp, flags)      \
+do {                                                    \
+        typecheck(unsigned long, flags);                \
+        raw_spin_lock_irqsave(&(rnp)->lock, flags);     \
+        smp_mb__after_unlock_lock();                    \
+} while (0)
+
+static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
+{
+        bool locked = raw_spin_trylock(&rnp->lock);
+
+        if (locked)
+                smp_mb__after_unlock_lock();
+        return locked;
+}
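
Two details of the wrappers above are worth noting. raw_spin_lock_irqsave_rcu_node() is a macro rather than an inline function because raw_spin_lock_irqsave() writes the saved interrupt state into the caller's flags variable, which a function taking flags by value could not do; the typecheck() catches callers that pass anything other than an unsigned long. raw_spin_trylock_rcu_node() issues smp_mb__after_unlock_lock() only when the lock was actually acquired, so the contended path costs no more than a plain raw_spin_trylock(). The snippet below is an illustrative caller, not code from the patch:

        /* Illustrative use of the trylock wrapper (hypothetical caller, rnp assumed valid). */
        if (!raw_spin_trylock_rcu_node(rnp))
                return;         /* Contended; try again later. */
        /* ... critical section, fully ordered against the previous lock holder ... */
        raw_spin_unlock(&rnp->lock);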
kernel/rcu/tree_plugin.h  (+6 -12)

@@ -301,8 +301,7 @@
                 /* Possibly blocking in an RCU read-side critical section. */
                 rdp = this_cpu_ptr(rcu_state_p->rda);
                 rnp = rdp->mynode;
-                raw_spin_lock_irqsave(&rnp->lock, flags);
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_irqsave_rcu_node(rnp, flags);
                 t->rcu_read_unlock_special.b.blocked = true;
                 t->rcu_blocked_node = rnp;

@@ -456,8 +457,7 @@
                  */
                 for (;;) {
                         rnp = t->rcu_blocked_node;
-                        raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-                        smp_mb__after_unlock_lock();
+                        raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
                         if (rnp == t->rcu_blocked_node)
                                 break;
                         WARN_ON_ONCE(1);
@@ -987,8 +989,7 @@
             READ_ONCE(rnp->boost_tasks) == NULL)
                 return 0; /* Nothing left to boost. */

-        raw_spin_lock_irqsave(&rnp->lock, flags);
-        smp_mb__after_unlock_lock();
+        raw_spin_lock_irqsave_rcu_node(rnp, flags);

         /*
          * Recheck under the lock: all tasks in need of boosting
@@ -1173,8 +1176,7 @@
                            "rcub/%d", rnp_index);
         if (IS_ERR(t))
                 return PTR_ERR(t);
-        raw_spin_lock_irqsave(&rnp->lock, flags);
-        smp_mb__after_unlock_lock();
+        raw_spin_lock_irqsave_rcu_node(rnp, flags);
         rnp->boost_kthread_task = t;
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
         sp.sched_priority = kthread_prio;
@@ -1563,8 +1567,7 @@
                 if (!*rdp->nxttail[RCU_DONE_TAIL])
                         continue;
                 rnp = rdp->mynode;
-                raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-                smp_mb__after_unlock_lock();
+                raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
                 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
                 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
                 if (needwake)
@@ -2063,8 +2068,7 @@
         bool needwake;
         struct rcu_node *rnp = rdp->mynode;

-        raw_spin_lock_irqsave(&rnp->lock, flags);
-        smp_mb__after_unlock_lock();
+        raw_spin_lock_irqsave_rcu_node(rnp, flags);
         needwake = rcu_start_future_gp(rnp, rdp, &c);
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
         if (needwake)