Merge branch 'strictgp.2020.08.24a' into HEAD

strictgp.2020.08.24a: Strict grace periods for KASAN testing.

+129 -16
+9
Documentation/admin-guide/kernel-parameters.txt
···
4152 4152 			This wake_up() will be accompanied by a
4153 4153 			WARN_ONCE() splat and an ftrace_dump().
4154 4154
4155 + 	rcutree.rcu_unlock_delay= [KNL]
4156 + 			In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels,
4157 + 			this specifies an rcu_read_unlock()-time delay
4158 + 			in microseconds.  This defaults to zero.
4159 + 			Larger delays increase the probability of
4160 + 			catching RCU pointer leaks, that is, buggy use
4161 + 			of RCU-protected pointers after the relevant
4162 + 			rcu_read_unlock() has completed.
4163 +
4155 4164 	rcutree.sysrq_rcu= [KNL]
4156 4165 			Commandeer a sysrq key to dump out Tree RCU's
4157 4166 			rcu_node tree with an eye towards determining
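To make the pointer-leak failure mode concrete, here is a hedged sketch of the bug class this parameter targets (the my_conf/read_threshold names are illustrative only, not part of this series): the pointer picked up under rcu_read_lock() is still dereferenced after rcu_read_unlock(), so once the grace period completes and the updater frees the old structure, KASAN reports a use-after-free.

#include <linux/rcupdate.h>

/* Illustrative sketch only; not from this series. */
struct my_conf {
	int threshold;
};

static struct my_conf __rcu *global_conf;

static int read_threshold(void)
{
	struct my_conf *p;

	rcu_read_lock();
	p = rcu_dereference(global_conf);
	rcu_read_unlock();	/* Read-side critical section ends here... */
	return p->threshold;	/* ...but p is dereferenced anyway: pointer leak. */
}

Booting with CONFIG_RCU_STRICT_GRACE_PERIOD=y and, say, rcutree.rcu_unlock_delay=100 stretches the window between rcu_read_unlock() and the leaked dereference, making it more likely that the now-short grace period ends and the updater's kfree() runs before the buggy access.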
+7
include/linux/rcupdate.h
···
55 55
56 56 #else /* #ifdef CONFIG_PREEMPT_RCU */
57 57
58 + #ifdef CONFIG_TINY_RCU
59 + #define rcu_read_unlock_strict() do { } while (0)
60 + #else
61 + void rcu_read_unlock_strict(void);
62 + #endif
63 +
58 64 static inline void __rcu_read_lock(void)
59 65 {
60 66 	preempt_disable();
···
69 63 static inline void __rcu_read_unlock(void)
70 64 {
71 65 	preempt_enable();
66 + 	rcu_read_unlock_strict();
72 67 }
73 68
74 69 static inline int rcu_preempt_depth(void)
+5 -3
kernel/rcu/Kconfig
···
135 135
136 136 config RCU_FANOUT_LEAF
137 137 	int "Tree-based hierarchical RCU leaf-level fanout value"
138 - 	range 2 64 if 64BIT
139 - 	range 2 32 if !64BIT
138 + 	range 2 64 if 64BIT && !RCU_STRICT_GRACE_PERIOD
139 + 	range 2 32 if !64BIT && !RCU_STRICT_GRACE_PERIOD
140 + 	range 2 3 if RCU_STRICT_GRACE_PERIOD
140 141 	depends on TREE_RCU && RCU_EXPERT
141 - 	default 16
142 + 	default 16 if !RCU_STRICT_GRACE_PERIOD
143 + 	default 2 if RCU_STRICT_GRACE_PERIOD
142 144 	help
143 145 	  This option controls the leaf-level fanout of hierarchical
144 146 	  implementations of RCU, and allows trading off cache misses
+15
kernel/rcu/Kconfig.debug
···
114 114 	  Say N here if you need ultimate kernel/user switch latencies
115 115 	  Say Y if you are unsure
116 116
117 + config RCU_STRICT_GRACE_PERIOD
118 + 	bool "Provide debug RCU implementation with short grace periods"
119 + 	depends on DEBUG_KERNEL && RCU_EXPERT
120 + 	default n
121 + 	select PREEMPT_COUNT if PREEMPT=n
122 + 	help
123 + 	  Select this option to build an RCU variant that is strict about
124 + 	  grace periods, making them as short as it can.  This limits
125 + 	  scalability, destroys real-time response, degrades battery
126 + 	  lifetime and kills performance.  Don't try this on large
127 + 	  machines, as in systems with more than about 10 or 20 CPUs.
128 + 	  But in conjunction with tools like KASAN, it can be helpful
129 + 	  when looking for certain types of RCU usage bugs, for example,
130 + 	  too-short RCU read-side critical sections.
131 +
117 132 endmenu # "RCU Debugging"
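The "too-short RCU read-side critical sections" mentioned in the help text can be pictured with a hedged sketch like the one below (hypothetical my_node/key_present names, not taken from this series): the rcu_read_lock()/rcu_read_unlock() pair covers only the loop body, so the fetch of ->list.next between iterations runs with no RCU protection at all.

#include <linux/rculist.h>

/* Illustrative sketch only; not from this series. */
struct my_node {
	struct list_head list;
	int key;
};

static LIST_HEAD(my_list);

static bool key_present(int key)
{
	struct my_node *n;
	bool found = false;

	/* BUG: the read-side critical section is too short.  The list
	 * traversal itself, including the fetch of ->list.next at the
	 * top of each iteration, runs outside of RCU protection. */
	list_for_each_entry_rcu(n, &my_list, list) {
		rcu_read_lock();
		if (n->key == key)
			found = true;
		rcu_read_unlock();
	}
	return found;
}

With strict, very short grace periods, a concurrent list_del_rcu()/kfree_rcu() of the current node is far more likely to complete between iterations, at which point KASAN flags the unprotected ->list.next access.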
+62 -11
kernel/rcu/tree.c
···
165 165 static int gp_cleanup_delay;
166 166 module_param(gp_cleanup_delay, int, 0444);
167 167
168 + // Add delay to rcu_read_unlock() for strict grace periods.
169 + static int rcu_unlock_delay;
170 + #ifdef CONFIG_RCU_STRICT_GRACE_PERIOD
171 + module_param(rcu_unlock_delay, int, 0444);
172 + #endif
173 +
168 174 /*
169 175  * This rcu parameter is runtime-read-only.  It reflects
170 176  * a minimum allowed number of objects which can be cached
···
461 455 	return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
462 456 }
463 457
464 - #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */
465 - #define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */
458 + #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
459 + 				// Maximum callbacks per rcu_do_batch ...
460 + #define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood.
466 461 static long blimit = DEFAULT_RCU_BLIMIT;
467 - #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
462 + #define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit.
468 463 static long qhimark = DEFAULT_RCU_QHIMARK;
469 - #define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
464 + #define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit.
470 465 static long qlowmark = DEFAULT_RCU_QLOMARK;
471 466 #define DEFAULT_RCU_QOVLD_MULT 2
472 467 #define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
473 - static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */
474 - static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */
468 + static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS.
469 + static long qovld_calc = -1; // No pre-initialization lock acquisitions!
475 470
476 471 module_param(blimit, long, 0444);
477 472 module_param(qhimark, long, 0444);
478 473 module_param(qlowmark, long, 0444);
479 474 module_param(qovld, long, 0444);
480 475
481 - static ulong jiffies_till_first_fqs = ULONG_MAX;
476 + static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX;
482 477 static ulong jiffies_till_next_fqs = ULONG_MAX;
483 478 static bool rcu_kick_kthreads;
484 479 static int rcu_divisor = 7;
···
1579 1572 }
1580 1573
1581 1574 /*
1575 +  * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a
1576 +  * quiescent state.  This is intended to be invoked when the CPU notices
1577 +  * a new grace period.
1578 +  */
1579 + static void rcu_strict_gp_check_qs(void)
1580 + {
1581 + 	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
1582 + 		rcu_read_lock();
1583 + 		rcu_read_unlock();
1584 + 	}
1585 + }
1586 +
1587 + /*
1582 1588  * Update CPU-local rcu_data state to record the beginnings and ends of
1583 1589  * grace periods.  The caller must hold the ->lock of the leaf rcu_node
1584 1590  * structure corresponding to the current CPU, and must have irqs disabled.
···
1661 1641 	}
1662 1642 	needwake = __note_gp_changes(rnp, rdp);
1663 1643 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1644 + 	rcu_strict_gp_check_qs();
1664 1645 	if (needwake)
1665 1646 		rcu_gp_kthread_wake();
1666 1647 }
···
1697 1676 		schedule_timeout_idle(duration);
1698 1677 		pr_alert("%s: Wait complete\n", __func__);
1699 1678 	}
1679 + }
1680 +
1681 + /*
1682 +  * Handler for on_each_cpu() to invoke the target CPU's RCU core
1683 +  * processing.
1684 +  */
1685 + static void rcu_strict_gp_boundary(void *unused)
1686 + {
1687 + 	invoke_rcu_core();
1700 1688 }
1701 1689
1702 1690 /*
···
1838 1808 		cond_resched_tasks_rcu_qs();
1839 1809 		WRITE_ONCE(rcu_state.gp_activity, jiffies);
1840 1810 	}
1811 +
1812 + 	// If strict, make all CPUs aware of new grace period.
1813 + 	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
1814 + 		on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
1841 1815
1842 1816 	return true;
1843 1817 }
···
2059 2025 				  rcu_state.gp_flags & RCU_GP_FLAG_INIT);
2060 2026 	}
2061 2027 	raw_spin_unlock_irq_rcu_node(rnp);
2028 +
2029 + 	// If strict, make all CPUs aware of the end of the old grace period.
2030 + 	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
2031 + 		on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
2062 2032 }
2063 2033
2064 2034 /*
···
2241 2203  * structure.  This must be called from the specified CPU.
2242 2204  */
2243 2205 static void
2244 - rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
2206 + rcu_report_qs_rdp(struct rcu_data *rdp)
2245 2207 {
2246 2208 	unsigned long flags;
2247 2209 	unsigned long mask;
···
2250 2212 		rcu_segcblist_is_offloaded(&rdp->cblist);
2251 2213 	struct rcu_node *rnp;
2252 2214
2215 + 	WARN_ON_ONCE(rdp->cpu != smp_processor_id());
2253 2216 	rnp = rdp->mynode;
2254 2217 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
2255 2218 	if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq ||
···
2267 2228 		return;
2268 2229 	}
2269 2230 	mask = rdp->grpmask;
2270 - 	if (rdp->cpu == smp_processor_id())
2271 - 		rdp->core_needs_qs = false;
2231 + 	rdp->core_needs_qs = false;
2272 2232 	if ((rnp->qsmask & mask) == 0) {
2273 2233 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2274 2234 	} else {
···
2316 2278 	 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
2317 2279 	 * judge of that).
2318 2280 	 */
2319 - 	rcu_report_qs_rdp(rdp->cpu, rdp);
2281 + 	rcu_report_qs_rdp(rdp);
2320 2282 }
2321 2283
2322 2284 /*
···
2659 2621 }
2660 2622 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
2661 2623
2624 + // Workqueue handler for an RCU reader for kernels enforcing struct RCU
2625 + // grace periods.
2626 + static void strict_work_handler(struct work_struct *work)
2627 + {
2628 + 	rcu_read_lock();
2629 + 	rcu_read_unlock();
2630 + }
2631 +
2662 2632 /* Perform RCU core processing work for the current CPU. */
2663 2633 static __latent_entropy void rcu_core(void)
2664 2634 {
···
2711 2665 	/* Do any needed deferred wakeups of rcuo kthreads. */
2712 2666 	do_nocb_deferred_wakeup(rdp);
2713 2667 	trace_rcu_utilization(TPS("End RCU core"));
2668 +
2669 + 	// If strict GPs, schedule an RCU reader in a clean environment.
2670 + 	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
2671 + 		queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work);
2714 2672 }
2715 2673
2716 2674 static void rcu_core_si(struct softirq_action *h)
···
3912 3862
3913 3863 	/* Set up local state, ensuring consistent view of global state. */
3914 3864 	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
3865 + 	INIT_WORK(&rdp->strict_work, strict_work_handler);
3915 3866 	WARN_ON_ONCE(rdp->dynticks_nesting != 1);
3916 3867 	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
3917 3868 	rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
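The practical effect of the tree.c changes above (the strict blimit and jiffies_till_first_fqs defaults, the on_each_cpu() calls at grace-period boundaries, and the per-CPU strict_work readers) is that grace periods become very short. A hedged, throwaway test module along these lines, not part of this series, can be used to observe that:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical module: report the latency of a few synchronize_rcu() calls. */
#include <linux/module.h>
#include <linux/timekeeping.h>
#include <linux/rcupdate.h>

static int __init gp_latency_init(void)
{
	ktime_t start = ktime_get();
	int i;

	for (i = 0; i < 10; i++)
		synchronize_rcu();
	pr_info("10 RCU grace periods took %lld us\n",
		ktime_to_us(ktime_sub(ktime_get(), start)));
	return 0;
}

static void __exit gp_latency_exit(void)
{
}

module_init(gp_latency_init);
module_exit(gp_latency_exit);
MODULE_LICENSE("GPL");

On a CONFIG_RCU_STRICT_GRACE_PERIOD=y kernel the reported time should be noticeably smaller than on a default configuration, where each grace period typically takes on the order of milliseconds.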
+1
kernel/rcu/tree.h
···
165 165 					/*  period it is aware of. */
166 166 	struct irq_work defer_qs_iw;	/* Obtain later scheduler attention. */
167 167 	bool defer_qs_iw_pending;	/* Scheduler attention pending? */
168 + 	struct work_struct strict_work;	/* Schedule readers for strict GPs. */
168 169
169 170 	/* 2) batch handling */
170 171 	struct rcu_segcblist cblist;	/* Segmented callback list, with */
+30 -2
kernel/rcu/tree_plugin.h
···
36 36 		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
37 37 	if (IS_ENABLED(CONFIG_PROVE_RCU))
38 38 		pr_info("\tRCU lockdep checking is enabled.\n");
39 + 	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
40 + 		pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n");
39 41 	if (RCU_NUM_LVLS >= 4)
40 42 		pr_info("\tFour(or more)-level hierarchy is enabled.\n");
41 43 	if (RCU_FANOUT_LEAF != 16)
···
376 374 	rcu_preempt_read_enter();
377 375 	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
378 376 		WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
377 + 	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread)
378 + 		WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true);
379 379 	barrier();  /* critical section after entry code. */
380 380 }
381 381 EXPORT_SYMBOL_GPL(__rcu_read_lock);
···
459 455 		return;
460 456 	}
461 457 	t->rcu_read_unlock_special.s = 0;
462 - 	if (special.b.need_qs)
463 - 		rcu_qs();
458 + 	if (special.b.need_qs) {
459 + 		if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
460 + 			rcu_report_qs_rdp(rdp);
461 + 			udelay(rcu_unlock_delay);
462 + 		} else {
463 + 			rcu_qs();
464 + 		}
465 + 	}
464 466
465 467 	/*
466 468 	 * Respond to a request by an expedited grace period for a
···
777 767 }
778 768
779 769 #else /* #ifdef CONFIG_PREEMPT_RCU */
770 +
771 + /*
772 +  * If strict grace periods are enabled, and if the calling
773 +  * __rcu_read_unlock() marks the beginning of a quiescent state, immediately
774 +  * report that quiescent state and, if requested, spin for a bit.
775 +  */
776 + void rcu_read_unlock_strict(void)
777 + {
778 + 	struct rcu_data *rdp;
779 +
780 + 	if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
781 + 	    irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
782 + 		return;
783 + 	rdp = this_cpu_ptr(&rcu_data);
784 + 	rcu_report_qs_rdp(rdp);
785 + 	udelay(rcu_unlock_delay);
786 + }
787 + EXPORT_SYMBOL_GPL(rcu_read_unlock_strict);
780 788
781 789 /*
782 790  * Tell them what RCU they are running.