Merge branches 'doc.2013.08.19a', 'fixes.2013.08.20a', 'sysidle.2013.08.31a' and 'torture.2013.08.20a' into HEAD

+10

Documentation/RCU/torture.txt

··· 42 42 fqs_stutter Wait time (in seconds) between consecutive bursts 43 43 of calls to force_quiescent_state(). 44 44 45 + gp_normal Make the fake writers use normal synchronous grace-period 46 + primitives. 47 + 48 + gp_exp Make the fake writers use expedited synchronous grace-period 49 + primitives. If both gp_normal and gp_exp are set, or 50 + if neither gp_normal nor gp_exp is set, then randomly 51 + choose the primitive so that about 50% are normal and 52 + 50% expedited. By default, neither is set, which 53 + gives best overall test coverage. 54 + 45 55 irqreader Says to invoke RCU readers from irq level. This is currently 46 56 done via timers. Defaults to "1" for variants of RCU that 47 57 permit this. (Or, more accurately, variants of RCU that do

+34 -10

Documentation/timers/NO_HZ.txt

··· 24 24 workloads, you will normally -not- want this option. 25 25 26 26 These three cases are described in the following three sections, followed 27 - by a third section on RCU-specific considerations and a fourth and final 28 - section listing known issues. 27 + by a fourth section on RCU-specific considerations, a fifth section 28 + discussing testing, and a sixth and final section listing known issues. 29 29 30 30 31 31 NEVER OMIT SCHEDULING-CLOCK TICKS ··· 121 121 "nohz_full=1,6-8" says that CPUs 1, 6, 7, and 8 are to be adaptive-ticks 122 122 CPUs. Note that you are prohibited from marking all of the CPUs as 123 123 adaptive-tick CPUs: At least one non-adaptive-tick CPU must remain 124 - online to handle timekeeping tasks in order to ensure that system calls 125 - like gettimeofday() returns accurate values on adaptive-tick CPUs. 126 - (This is not an issue for CONFIG_NO_HZ_IDLE=y because there are no 127 - running user processes to observe slight drifts in clock rate.) 128 - Therefore, the boot CPU is prohibited from entering adaptive-ticks 129 - mode. Specifying a "nohz_full=" mask that includes the boot CPU will 130 - result in a boot-time error message, and the boot CPU will be removed 131 - from the mask. 124 + online to handle timekeeping tasks in order to ensure that system 125 + calls like gettimeofday() return accurate values on adaptive-tick CPUs. 126 + (This is not an issue for CONFIG_NO_HZ_IDLE=y because there are no running 127 + user processes to observe slight drifts in clock rate.) Therefore, the 128 + boot CPU is prohibited from entering adaptive-ticks mode. Specifying a 129 + "nohz_full=" mask that includes the boot CPU will result in a boot-time 130 + error message, and the boot CPU will be removed from the mask. Note that 131 + this means that your system must have at least two CPUs in order for 132 + CONFIG_NO_HZ_FULL=y to do anything for you. 
132 133 133 134 Alternatively, the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter specifies 134 135 that all CPUs other than the boot CPU are adaptive-ticks CPUs. This ··· 231 230 pin the "rcuo" kthreads to specific CPUs if desired. Otherwise, the 232 231 scheduler will decide where to run them, which might or might not be 233 232 where you want them to run. 233 + 234 + 235 + TESTING 236 + 237 + So you enable all the OS-jitter features described in this document, 238 + but do not see any change in your workload's behavior. Is this because 239 + your workload isn't affected that much by OS jitter, or is it because 240 + something else is in the way? This section helps answer this question 241 + by providing a simple OS-jitter test suite, which is available on branch 242 + master of the following git archive: 243 + 244 + git://git.kernel.org/pub/scm/linux/kernel/git/frederic/dynticks-testing.git 245 + 246 + Clone this archive and follow the instructions in the README file. 247 + This test procedure will produce a trace that will allow you to evaluate 248 + whether or not you have succeeded in removing OS jitter from your system. 249 + If this trace shows that you have removed OS jitter as much as is 250 + possible, then you can conclude that your workload is not all that 251 + sensitive to OS jitter. 252 + 253 + Note: this test requires that your system have at least two CPUs. 254 + We do not currently have a good way to remove OS jitter from single-CPU 255 + systems. 234 256 235 257 236 258 KNOWN ISSUES

+3 -3

include/linux/debugobjects.h

··· 63 63 extern void debug_object_init (void *addr, struct debug_obj_descr *descr); 64 64 extern void 65 65 debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr); 66 - extern void debug_object_activate (void *addr, struct debug_obj_descr *descr); 66 + extern int debug_object_activate (void *addr, struct debug_obj_descr *descr); 67 67 extern void debug_object_deactivate(void *addr, struct debug_obj_descr *descr); 68 68 extern void debug_object_destroy (void *addr, struct debug_obj_descr *descr); 69 69 extern void debug_object_free (void *addr, struct debug_obj_descr *descr); ··· 85 85 debug_object_init (void *addr, struct debug_obj_descr *descr) { } 86 86 static inline void 87 87 debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr) { } 88 - static inline void 89 - debug_object_activate (void *addr, struct debug_obj_descr *descr) { } 88 + static inline int 89 + debug_object_activate (void *addr, struct debug_obj_descr *descr) { return 0; } 90 90 static inline void 91 91 debug_object_deactivate(void *addr, struct debug_obj_descr *descr) { } 92 92 static inline void

+4 -4

include/linux/jiffies.h

··· 101 101 #define time_after(a,b) \ 102 102 (typecheck(unsigned long, a) && \ 103 103 typecheck(unsigned long, b) && \ 104 - ((long)(b) - (long)(a) < 0)) 104 + ((long)((b) - (a)) < 0)) 105 105 #define time_before(a,b) time_after(b,a) 106 106 107 107 #define time_after_eq(a,b) \ 108 108 (typecheck(unsigned long, a) && \ 109 109 typecheck(unsigned long, b) && \ 110 - ((long)(a) - (long)(b) >= 0)) 110 + ((long)((a) - (b)) >= 0)) 111 111 #define time_before_eq(a,b) time_after_eq(b,a) 112 112 113 113 /* ··· 130 130 #define time_after64(a,b) \ 131 131 (typecheck(__u64, a) && \ 132 132 typecheck(__u64, b) && \ 133 - ((__s64)(b) - (__s64)(a) < 0)) 133 + ((__s64)((b) - (a)) < 0)) 134 134 #define time_before64(a,b) time_after64(b,a) 135 135 136 136 #define time_after_eq64(a,b) \ 137 137 (typecheck(__u64, a) && \ 138 138 typecheck(__u64, b) && \ 139 - ((__s64)(a) - (__s64)(b) >= 0)) 139 + ((__s64)((a) - (b)) >= 0)) 140 140 #define time_before_eq64(a,b) time_after_eq64(b,a) 141 141 142 142 #define time_in_range64(a, b, c) \

+3 -2

include/linux/rculist.h

··· 267 267 */ 268 268 #define list_first_or_null_rcu(ptr, type, member) \ 269 269 ({struct list_head *__ptr = (ptr); \ 270 - struct list_head __rcu *__next = list_next_rcu(__ptr); \ 271 - likely(__ptr != __next) ? container_of(__next, type, member) : NULL; \ 270 + struct list_head *__next = ACCESS_ONCE(__ptr->next); \ 271 + likely(__ptr != __next) ? \ 272 + list_entry_rcu(__next, type, member) : NULL; \ 272 273 }) 273 274 274 275 /**

+18 -4

include/linux/rcupdate.h

··· 229 229 #ifdef CONFIG_RCU_USER_QS 230 230 extern void rcu_user_enter(void); 231 231 extern void rcu_user_exit(void); 232 - extern void rcu_user_enter_after_irq(void); 233 - extern void rcu_user_exit_after_irq(void); 234 232 #else 235 233 static inline void rcu_user_enter(void) { } 236 234 static inline void rcu_user_exit(void) { } 237 - static inline void rcu_user_enter_after_irq(void) { } 238 - static inline void rcu_user_exit_after_irq(void) { } 239 235 static inline void rcu_user_hooks_switch(struct task_struct *prev, 240 236 struct task_struct *next) { } 241 237 #endif /* CONFIG_RCU_USER_QS */ ··· 1009 1013 #else 1010 1014 static inline bool rcu_is_nocb_cpu(int cpu) { return false; } 1011 1015 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 1016 + 1017 + 1018 + /* Only for use by adaptive-ticks code. */ 1019 + #ifdef CONFIG_NO_HZ_FULL_SYSIDLE 1020 + extern bool rcu_sys_is_idle(void); 1021 + extern void rcu_sysidle_force_exit(void); 1022 + #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 1023 + 1024 + static inline bool rcu_sys_is_idle(void) 1025 + { 1026 + return false; 1027 + } 1028 + 1029 + static inline void rcu_sysidle_force_exit(void) 1030 + { 1031 + } 1032 + 1033 + #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 1012 1034 1013 1035 1014 1036 #endif /* __LINUX_RCUPDATE_H */

+1

init/Kconfig

··· 470 470 config TREE_PREEMPT_RCU 471 471 bool "Preemptible tree-based hierarchical RCU" 472 472 depends on PREEMPT 473 + select IRQ_WORK 473 474 help 474 475 This option selects the RCU implementation that is 475 476 designed for very large SMP systems with hundreds or

+7 -3

kernel/rcu.h

··· 67 67 68 68 extern struct debug_obj_descr rcuhead_debug_descr; 69 69 70 - static inline void debug_rcu_head_queue(struct rcu_head *head) 70 + static inline int debug_rcu_head_queue(struct rcu_head *head) 71 71 { 72 - debug_object_activate(head, &rcuhead_debug_descr); 72 + int r1; 73 + 74 + r1 = debug_object_activate(head, &rcuhead_debug_descr); 73 75 debug_object_active_state(head, &rcuhead_debug_descr, 74 76 STATE_RCU_HEAD_READY, 75 77 STATE_RCU_HEAD_QUEUED); 78 + return r1; 76 79 } 77 80 78 81 static inline void debug_rcu_head_unqueue(struct rcu_head *head) ··· 86 83 debug_object_deactivate(head, &rcuhead_debug_descr); 87 84 } 88 85 #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 89 - static inline void debug_rcu_head_queue(struct rcu_head *head) 86 + static inline int debug_rcu_head_queue(struct rcu_head *head) 90 87 { 88 + return 0; 91 89 } 92 90 93 91 static inline void debug_rcu_head_unqueue(struct rcu_head *head)

-100

kernel/rcupdate.c

··· 212 212 } 213 213 214 214 /* 215 - * fixup_init is called when: 216 - * - an active object is initialized 217 - */ 218 - static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) 219 - { 220 - struct rcu_head *head = addr; 221 - 222 - switch (state) { 223 - case ODEBUG_STATE_ACTIVE: 224 - /* 225 - * Ensure that queued callbacks are all executed. 226 - * If we detect that we are nested in a RCU read-side critical 227 - * section, we should simply fail, otherwise we would deadlock. 228 - * In !PREEMPT configurations, there is no way to tell if we are 229 - * in a RCU read-side critical section or not, so we never 230 - * attempt any fixup and just print a warning. 231 - */ 232 - #ifndef CONFIG_PREEMPT 233 - WARN_ON_ONCE(1); 234 - return 0; 235 - #endif 236 - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 237 - irqs_disabled()) { 238 - WARN_ON_ONCE(1); 239 - return 0; 240 - } 241 - rcu_barrier(); 242 - rcu_barrier_sched(); 243 - rcu_barrier_bh(); 244 - debug_object_init(head, &rcuhead_debug_descr); 245 - return 1; 246 - default: 247 - return 0; 248 - } 249 - } 250 - 251 - /* 252 215 * fixup_activate is called when: 253 216 * - an active object is activated 254 217 * - an unknown object is activated (might be a statically initialized object) ··· 231 268 debug_object_init(head, &rcuhead_debug_descr); 232 269 debug_object_activate(head, &rcuhead_debug_descr); 233 270 return 0; 234 - 235 - case ODEBUG_STATE_ACTIVE: 236 - /* 237 - * Ensure that queued callbacks are all executed. 238 - * If we detect that we are nested in a RCU read-side critical 239 - * section, we should simply fail, otherwise we would deadlock. 240 - * In !PREEMPT configurations, there is no way to tell if we are 241 - * in a RCU read-side critical section or not, so we never 242 - * attempt any fixup and just print a warning. 
243 - */ 244 - #ifndef CONFIG_PREEMPT 245 - WARN_ON_ONCE(1); 246 - return 0; 247 - #endif 248 - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 249 - irqs_disabled()) { 250 - WARN_ON_ONCE(1); 251 - return 0; 252 - } 253 - rcu_barrier(); 254 - rcu_barrier_sched(); 255 - rcu_barrier_bh(); 256 - debug_object_activate(head, &rcuhead_debug_descr); 257 - return 1; 258 271 default: 259 - return 0; 260 - } 261 - } 262 - 263 - /* 264 - * fixup_free is called when: 265 - * - an active object is freed 266 - */ 267 - static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) 268 - { 269 - struct rcu_head *head = addr; 270 - 271 - switch (state) { 272 - case ODEBUG_STATE_ACTIVE: 273 - /* 274 - * Ensure that queued callbacks are all executed. 275 - * If we detect that we are nested in a RCU read-side critical 276 - * section, we should simply fail, otherwise we would deadlock. 277 - * In !PREEMPT configurations, there is no way to tell if we are 278 - * in a RCU read-side critical section or not, so we never 279 - * attempt any fixup and just print a warning. 280 - */ 281 - #ifndef CONFIG_PREEMPT 282 - WARN_ON_ONCE(1); 283 - return 0; 284 - #endif 285 - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 286 - irqs_disabled()) { 287 - WARN_ON_ONCE(1); 288 - return 0; 289 - } 290 - rcu_barrier(); 291 - rcu_barrier_sched(); 292 - rcu_barrier_bh(); 293 - debug_object_free(head, &rcuhead_debug_descr); 294 272 return 1; 295 - default: 296 - return 0; 297 273 } 298 274 } 299 275 ··· 271 369 272 370 struct debug_obj_descr rcuhead_debug_descr = { 273 371 .name = "rcu_head", 274 - .fixup_init = rcuhead_fixup_init, 275 372 .fixup_activate = rcuhead_fixup_activate, 276 - .fixup_free = rcuhead_fixup_free, 277 373 }; 278 374 EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 279 375 #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */

+176 -214

kernel/rcutorture.c

··· 52 52 MODULE_LICENSE("GPL"); 53 53 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 54 54 55 - static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 56 - static int nfakewriters = 4; /* # fake writer threads */ 57 - static int stat_interval = 60; /* Interval between stats, in seconds. */ 58 - /* Zero means "only at end of test". */ 59 - static bool verbose; /* Print more debug info. */ 60 - static bool test_no_idle_hz = true; 61 - /* Test RCU support for tickless idle CPUs. */ 62 - static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 63 - static int stutter = 5; /* Start/stop testing interval (in sec) */ 64 - static int irqreader = 1; /* RCU readers from irq (timers). */ 65 - static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ 66 - static int fqs_holdoff; /* Hold time within burst (us). */ 67 - static int fqs_stutter = 3; /* Wait time between bursts (s). */ 68 - static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ 69 - static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ 70 - static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ 71 - static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ 72 - static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */ 73 - static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */ 74 - static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 75 - static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 76 - static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 77 - static char *torture_type = "rcu"; /* What RCU implementation to torture. 
*/ 78 - 79 - module_param(nreaders, int, 0444); 80 - MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 81 - module_param(nfakewriters, int, 0444); 82 - MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 83 - module_param(stat_interval, int, 0644); 84 - MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 85 - module_param(verbose, bool, 0444); 86 - MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 87 - module_param(test_no_idle_hz, bool, 0444); 88 - MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 89 - module_param(shuffle_interval, int, 0444); 90 - MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 91 - module_param(stutter, int, 0444); 92 - MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 93 - module_param(irqreader, int, 0444); 94 - MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 55 + static int fqs_duration; 95 56 module_param(fqs_duration, int, 0444); 96 - MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); 57 + MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); 58 + static int fqs_holdoff; 97 59 module_param(fqs_holdoff, int, 0444); 98 60 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 61 + static int fqs_stutter = 3; 99 62 module_param(fqs_stutter, int, 0444); 100 63 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 64 + static bool gp_exp; 65 + module_param(gp_exp, bool, 0444); 66 + MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); 67 + static bool gp_normal; 68 + module_param(gp_normal, bool, 0444); 69 + MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); 70 + static int irqreader = 1; 71 + module_param(irqreader, int, 0444); 72 + MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 73 + static int n_barrier_cbs; 101 74 module_param(n_barrier_cbs, int, 0444); 102 75 
MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); 103 - module_param(onoff_interval, int, 0444); 104 - MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 76 + static int nfakewriters = 4; 77 + module_param(nfakewriters, int, 0444); 78 + MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 79 + static int nreaders = -1; 80 + module_param(nreaders, int, 0444); 81 + MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 82 + static int object_debug; 83 + module_param(object_debug, int, 0444); 84 + MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); 85 + static int onoff_holdoff; 105 86 module_param(onoff_holdoff, int, 0444); 106 87 MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); 88 + static int onoff_interval; 89 + module_param(onoff_interval, int, 0444); 90 + MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 91 + static int shuffle_interval = 3; 92 + module_param(shuffle_interval, int, 0444); 93 + MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 94 + static int shutdown_secs; 107 95 module_param(shutdown_secs, int, 0444); 108 - MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); 96 + MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); 97 + static int stall_cpu; 109 98 module_param(stall_cpu, int, 0444); 110 99 MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); 100 + static int stall_cpu_holdoff = 10; 111 101 module_param(stall_cpu_holdoff, int, 0444); 112 102 MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); 103 + static int stat_interval = 60; 104 + module_param(stat_interval, int, 0644); 105 + MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 106 + static int stutter = 5; 107 + module_param(stutter, int, 0444); 108 + MODULE_PARM_DESC(stutter, "Number of seconds to 
run/halt test"); 109 + static int test_boost = 1; 113 110 module_param(test_boost, int, 0444); 114 111 MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 115 - module_param(test_boost_interval, int, 0444); 116 - MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); 112 + static int test_boost_duration = 4; 117 113 module_param(test_boost_duration, int, 0444); 118 114 MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); 115 + static int test_boost_interval = 7; 116 + module_param(test_boost_interval, int, 0444); 117 + MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); 118 + static bool test_no_idle_hz = true; 119 + module_param(test_no_idle_hz, bool, 0444); 120 + MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 121 + static char *torture_type = "rcu"; 119 122 module_param(torture_type, charp, 0444); 120 - MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 123 + MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); 124 + static bool verbose; 125 + module_param(verbose, bool, 0444); 126 + MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 121 127 122 128 #define TORTURE_FLAG "-torture:" 123 129 #define PRINTK_STRING(s) \ ··· 366 360 int (*completed)(void); 367 361 void (*deferred_free)(struct rcu_torture *p); 368 362 void (*sync)(void); 363 + void (*exp_sync)(void); 369 364 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 370 365 void (*cb_barrier)(void); 371 366 void (*fqs)(void); ··· 450 443 call_rcu(&p->rtort_rcu, rcu_torture_cb); 451 444 } 452 445 446 + static void rcu_sync_torture_init(void) 447 + { 448 + INIT_LIST_HEAD(&rcu_torture_removed); 449 + } 450 + 453 451 static struct rcu_torture_ops rcu_ops = { 454 - .init = NULL, 452 + .init = rcu_sync_torture_init, 455 453 .readlock = rcu_torture_read_lock, 456 454 .read_delay = rcu_read_delay, 457 
455 .readunlock = rcu_torture_read_unlock, 458 456 .completed = rcu_torture_completed, 459 457 .deferred_free = rcu_torture_deferred_free, 460 458 .sync = synchronize_rcu, 459 + .exp_sync = synchronize_rcu_expedited, 461 460 .call = call_rcu, 462 461 .cb_barrier = rcu_barrier, 463 462 .fqs = rcu_force_quiescent_state, ··· 471 458 .irq_capable = 1, 472 459 .can_boost = rcu_can_boost(), 473 460 .name = "rcu" 474 - }; 475 - 476 - static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 477 - { 478 - int i; 479 - struct rcu_torture *rp; 480 - struct rcu_torture *rp1; 481 - 482 - cur_ops->sync(); 483 - list_add(&p->rtort_free, &rcu_torture_removed); 484 - list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { 485 - i = rp->rtort_pipe_count; 486 - if (i > RCU_TORTURE_PIPE_LEN) 487 - i = RCU_TORTURE_PIPE_LEN; 488 - atomic_inc(&rcu_torture_wcount[i]); 489 - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 490 - rp->rtort_mbtest = 0; 491 - list_del(&rp->rtort_free); 492 - rcu_torture_free(rp); 493 - } 494 - } 495 - } 496 - 497 - static void rcu_sync_torture_init(void) 498 - { 499 - INIT_LIST_HEAD(&rcu_torture_removed); 500 - } 501 - 502 - static struct rcu_torture_ops rcu_sync_ops = { 503 - .init = rcu_sync_torture_init, 504 - .readlock = rcu_torture_read_lock, 505 - .read_delay = rcu_read_delay, 506 - .readunlock = rcu_torture_read_unlock, 507 - .completed = rcu_torture_completed, 508 - .deferred_free = rcu_sync_torture_deferred_free, 509 - .sync = synchronize_rcu, 510 - .call = NULL, 511 - .cb_barrier = NULL, 512 - .fqs = rcu_force_quiescent_state, 513 - .stats = NULL, 514 - .irq_capable = 1, 515 - .can_boost = rcu_can_boost(), 516 - .name = "rcu_sync" 517 - }; 518 - 519 - static struct rcu_torture_ops rcu_expedited_ops = { 520 - .init = rcu_sync_torture_init, 521 - .readlock = rcu_torture_read_lock, 522 - .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ 523 - .readunlock = rcu_torture_read_unlock, 524 - .completed = rcu_no_completed, 525 - .deferred_free = rcu_sync_torture_deferred_free, 526 - .sync = synchronize_rcu_expedited, 527 - .call = NULL, 528 - .cb_barrier = NULL, 529 - .fqs = rcu_force_quiescent_state, 530 - .stats = NULL, 531 - .irq_capable = 1, 532 - .can_boost = rcu_can_boost(), 533 - .name = "rcu_expedited" 534 461 }; 535 462 536 463 /* ··· 499 546 } 500 547 501 548 static struct rcu_torture_ops rcu_bh_ops = { 502 - .init = NULL, 549 + .init = rcu_sync_torture_init, 503 550 .readlock = rcu_bh_torture_read_lock, 504 551 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 505 552 .readunlock = rcu_bh_torture_read_unlock, 506 553 .completed = rcu_bh_torture_completed, 507 554 .deferred_free = rcu_bh_torture_deferred_free, 508 555 .sync = synchronize_rcu_bh, 556 + .exp_sync = synchronize_rcu_bh_expedited, 509 557 .call = call_rcu_bh, 510 558 .cb_barrier = rcu_barrier_bh, 511 559 .fqs = rcu_bh_force_quiescent_state, 512 560 .stats = NULL, 513 561 .irq_capable = 1, 514 562 .name = "rcu_bh" 515 - }; 516 - 517 - static struct rcu_torture_ops rcu_bh_sync_ops = { 518 - .init = rcu_sync_torture_init, 519 - .readlock = rcu_bh_torture_read_lock, 520 - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 521 - .readunlock = rcu_bh_torture_read_unlock, 522 - .completed = rcu_bh_torture_completed, 523 - .deferred_free = rcu_sync_torture_deferred_free, 524 - .sync = synchronize_rcu_bh, 525 - .call = NULL, 526 - .cb_barrier = NULL, 527 - .fqs = rcu_bh_force_quiescent_state, 528 - .stats = NULL, 529 - .irq_capable = 1, 530 - .name = "rcu_bh_sync" 531 - }; 532 - 533 - static struct rcu_torture_ops rcu_bh_expedited_ops = { 534 - .init = rcu_sync_torture_init, 535 - .readlock = rcu_bh_torture_read_lock, 536 - .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ 537 - .readunlock = rcu_bh_torture_read_unlock, 538 - .completed = rcu_bh_torture_completed, 539 - .deferred_free = rcu_sync_torture_deferred_free, 540 - .sync = synchronize_rcu_bh_expedited, 541 - .call = NULL, 542 - .cb_barrier = NULL, 543 - .fqs = rcu_bh_force_quiescent_state, 544 - .stats = NULL, 545 - .irq_capable = 1, 546 - .name = "rcu_bh_expedited" 547 563 }; 548 564 549 565 /* ··· 589 667 return cnt; 590 668 } 591 669 670 + static void srcu_torture_synchronize_expedited(void) 671 + { 672 + synchronize_srcu_expedited(&srcu_ctl); 673 + } 674 + 592 675 static struct rcu_torture_ops srcu_ops = { 593 676 .init = rcu_sync_torture_init, 594 677 .readlock = srcu_torture_read_lock, ··· 602 675 .completed = srcu_torture_completed, 603 676 .deferred_free = srcu_torture_deferred_free, 604 677 .sync = srcu_torture_synchronize, 678 + .exp_sync = srcu_torture_synchronize_expedited, 605 679 .call = srcu_torture_call, 606 680 .cb_barrier = srcu_torture_barrier, 607 681 .stats = srcu_torture_stats, 608 682 .name = "srcu" 609 - }; 610 - 611 - static struct rcu_torture_ops srcu_sync_ops = { 612 - .init = rcu_sync_torture_init, 613 - .readlock = srcu_torture_read_lock, 614 - .read_delay = srcu_read_delay, 615 - .readunlock = srcu_torture_read_unlock, 616 - .completed = srcu_torture_completed, 617 - .deferred_free = rcu_sync_torture_deferred_free, 618 - .sync = srcu_torture_synchronize, 619 - .call = NULL, 620 - .cb_barrier = NULL, 621 - .stats = srcu_torture_stats, 622 - .name = "srcu_sync" 623 - }; 624 - 625 - static void srcu_torture_synchronize_expedited(void) 626 - { 627 - synchronize_srcu_expedited(&srcu_ctl); 628 - } 629 - 630 - static struct rcu_torture_ops srcu_expedited_ops = { 631 - .init = rcu_sync_torture_init, 632 - .readlock = srcu_torture_read_lock, 633 - .read_delay = srcu_read_delay, 634 - .readunlock = srcu_torture_read_unlock, 635 - .completed = srcu_torture_completed, 636 - .deferred_free = rcu_sync_torture_deferred_free, 637 - .sync = 
srcu_torture_synchronize_expedited, 638 - .call = NULL, 639 - .cb_barrier = NULL, 640 - .stats = srcu_torture_stats, 641 - .name = "srcu_expedited" 642 683 }; 643 684 644 685 /* ··· 637 742 .completed = rcu_no_completed, 638 743 .deferred_free = rcu_sched_torture_deferred_free, 639 744 .sync = synchronize_sched, 745 + .exp_sync = synchronize_sched_expedited, 746 + .call = call_rcu_sched, 640 747 .cb_barrier = rcu_barrier_sched, 641 748 .fqs = rcu_sched_force_quiescent_state, 642 749 .stats = NULL, 643 750 .irq_capable = 1, 644 751 .name = "sched" 645 - }; 646 - 647 - static struct rcu_torture_ops sched_sync_ops = { 648 - .init = rcu_sync_torture_init, 649 - .readlock = sched_torture_read_lock, 650 - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 651 - .readunlock = sched_torture_read_unlock, 652 - .completed = rcu_no_completed, 653 - .deferred_free = rcu_sync_torture_deferred_free, 654 - .sync = synchronize_sched, 655 - .cb_barrier = NULL, 656 - .fqs = rcu_sched_force_quiescent_state, 657 - .stats = NULL, 658 - .name = "sched_sync" 659 - }; 660 - 661 - static struct rcu_torture_ops sched_expedited_ops = { 662 - .init = rcu_sync_torture_init, 663 - .readlock = sched_torture_read_lock, 664 - .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ 665 - .readunlock = sched_torture_read_unlock, 666 - .completed = rcu_no_completed, 667 - .deferred_free = rcu_sync_torture_deferred_free, 668 - .sync = synchronize_sched_expedited, 669 - .cb_barrier = NULL, 670 - .fqs = rcu_sched_force_quiescent_state, 671 - .stats = NULL, 672 - .irq_capable = 1, 673 - .name = "sched_expedited" 674 752 }; 675 753 676 754 /* ··· 795 927 static int 796 928 rcu_torture_writer(void *arg) 797 929 { 930 + bool exp; 798 931 int i; 799 - long oldbatch = rcu_batches_completed(); 800 932 struct rcu_torture *rp; 933 + struct rcu_torture *rp1; 801 934 struct rcu_torture *old_rp; 802 935 static DEFINE_RCU_RANDOM(rand); 803 936 ··· 823 954 i = RCU_TORTURE_PIPE_LEN; 824 955 atomic_inc(&rcu_torture_wcount[i]); 825 956 old_rp->rtort_pipe_count++; 826 - cur_ops->deferred_free(old_rp); 957 + if (gp_normal == gp_exp) 958 + exp = !!(rcu_random(&rand) & 0x80); 959 + else 960 + exp = gp_exp; 961 + if (!exp) { 962 + cur_ops->deferred_free(old_rp); 963 + } else { 964 + cur_ops->exp_sync(); 965 + list_add(&old_rp->rtort_free, 966 + &rcu_torture_removed); 967 + list_for_each_entry_safe(rp, rp1, 968 + &rcu_torture_removed, 969 + rtort_free) { 970 + i = rp->rtort_pipe_count; 971 + if (i > RCU_TORTURE_PIPE_LEN) 972 + i = RCU_TORTURE_PIPE_LEN; 973 + atomic_inc(&rcu_torture_wcount[i]); 974 + if (++rp->rtort_pipe_count >= 975 + RCU_TORTURE_PIPE_LEN) { 976 + rp->rtort_mbtest = 0; 977 + list_del(&rp->rtort_free); 978 + rcu_torture_free(rp); 979 + } 980 + } 981 + } 827 982 } 828 983 rcutorture_record_progress(++rcu_torture_current_version); 829 - oldbatch = cur_ops->completed(); 830 984 rcu_stutter_wait("rcu_torture_writer"); 831 985 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 832 986 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); ··· 875 983 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 876 984 udelay(rcu_random(&rand) & 0x3ff); 877 985 if (cur_ops->cb_barrier != NULL && 878 - rcu_random(&rand) % (nfakewriters * 
8) == 0) 986 + rcu_random(&rand) % (nfakewriters * 8) == 0) { 879 987 cur_ops->cb_barrier(); 880 - else 988 + } else if (gp_normal == gp_exp) { 989 + if (rcu_random(&rand) & 0x80) 990 + cur_ops->sync(); 991 + else 992 + cur_ops->exp_sync(); 993 + } else if (gp_normal) { 881 994 cur_ops->sync(); 995 + } else { 996 + cur_ops->exp_sync(); 997 + } 882 998 rcu_stutter_wait("rcu_torture_fakewriter"); 883 999 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 884 1000 ··· 1434 1534 torture_type, cpu); 1435 1535 starttime = jiffies; 1436 1536 n_online_attempts++; 1437 - if (cpu_up(cpu) == 0) { 1537 + ret = cpu_up(cpu); 1538 + if (ret) { 1539 + if (verbose) 1540 + pr_alert("%s" TORTURE_FLAG 1541 + "rcu_torture_onoff task: online %d failed: errno %d\n", 1542 + torture_type, cpu, ret); 1543 + } else { 1438 1544 if (verbose) 1439 1545 pr_alert("%s" TORTURE_FLAG 1440 1546 "rcu_torture_onoff task: onlined %d\n", ··· 1840 1934 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1841 1935 } 1842 1936 1937 + #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 1938 + static void rcu_torture_leak_cb(struct rcu_head *rhp) 1939 + { 1940 + } 1941 + 1942 + static void rcu_torture_err_cb(struct rcu_head *rhp) 1943 + { 1944 + /* 1945 + * This -might- happen due to race conditions, but is unlikely. 1946 + * The scenario that leads to this happening is that the 1947 + * first of the pair of duplicate callbacks is queued, 1948 + * someone else starts a grace period that includes that 1949 + * callback, then the second of the pair must wait for the 1950 + * next grace period. Unlikely, but can happen. If it 1951 + * does happen, the debug-objects subsystem won't have splatted. 1952 + */ 1953 + pr_alert("rcutorture: duplicated callback was invoked.\n"); 1954 + } 1955 + #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 1956 + 1957 + /* 1958 + * Verify that double-free causes debug-objects to complain, but only 1959 + * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. 
Otherwise, say that the test 1960 + * cannot be carried out. 1961 + */ 1962 + static void rcu_test_debug_objects(void) 1963 + { 1964 + #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 1965 + struct rcu_head rh1; 1966 + struct rcu_head rh2; 1967 + 1968 + init_rcu_head_on_stack(&rh1); 1969 + init_rcu_head_on_stack(&rh2); 1970 + pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); 1971 + 1972 + /* Try to queue the rh2 pair of callbacks for the same grace period. */ 1973 + preempt_disable(); /* Prevent preemption from interrupting test. */ 1974 + rcu_read_lock(); /* Make it impossible to finish a grace period. */ 1975 + call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ 1976 + local_irq_disable(); /* Make it harder to start a new grace period. */ 1977 + call_rcu(&rh2, rcu_torture_leak_cb); 1978 + call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ 1979 + local_irq_enable(); 1980 + rcu_read_unlock(); 1981 + preempt_enable(); 1982 + 1983 + /* Wait for them all to get done so we can safely return. 
*/ 1984 + rcu_barrier(); 1985 + pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); 1986 + destroy_rcu_head_on_stack(&rh1); 1987 + destroy_rcu_head_on_stack(&rh2); 1988 + #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 1989 + pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); 1990 + #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 1991 + } 1992 + 1843 1993 static int __init 1844 1994 rcu_torture_init(void) 1845 1995 { ··· 1903 1941 int cpu; 1904 1942 int firsterr = 0; 1905 1943 int retval; 1906 - static struct rcu_torture_ops *torture_ops[] = 1907 - { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1908 - &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1909 - &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, 1910 - &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1944 + static struct rcu_torture_ops *torture_ops[] = { 1945 + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, 1946 + }; 1911 1947 1912 1948 mutex_lock(&fullstop_mutex); 1913 1949 ··· 2123 2163 firsterr = retval; 2124 2164 goto unwind; 2125 2165 } 2166 + if (object_debug) 2167 + rcu_test_debug_objects(); 2126 2168 rcutorture_record_test_transition(); 2127 2169 mutex_unlock(&fullstop_mutex); 2128 2170 return 0;

+92 -58

kernel/rcutree.c

··· 54 54 #include <linux/stop_machine.h> 55 55 #include <linux/random.h> 56 56 #include <linux/ftrace_event.h> 57 + #include <linux/suspend.h> 57 58 58 59 #include "rcutree.h" 59 60 #include <trace/events/rcu.h> ··· 225 224 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 226 225 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 227 226 .dynticks = ATOMIC_INIT(1), 227 + #ifdef CONFIG_NO_HZ_FULL_SYSIDLE 228 + .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, 229 + .dynticks_idle = ATOMIC_INIT(1), 230 + #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 228 231 }; 229 232 230 233 static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ ··· 247 242 248 243 static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 249 244 struct rcu_data *rdp); 250 - static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); 245 + static void force_qs_rnp(struct rcu_state *rsp, 246 + int (*f)(struct rcu_data *rsp, bool *isidle, 247 + unsigned long *maxj), 248 + bool *isidle, unsigned long *maxj); 251 249 static void force_quiescent_state(struct rcu_state *rsp); 252 250 static int rcu_pending(int cpu); 253 251 ··· 435 427 436 428 local_irq_save(flags); 437 429 rcu_eqs_enter(false); 430 + rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); 438 431 local_irq_restore(flags); 439 432 } 440 433 EXPORT_SYMBOL_GPL(rcu_idle_enter); ··· 452 443 void rcu_user_enter(void) 453 444 { 454 445 rcu_eqs_enter(1); 455 - } 456 - 457 - /** 458 - * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace 459 - * after the current irq returns. 460 - * 461 - * This is similar to rcu_user_enter() but in the context of a non-nesting 462 - * irq. After this call, RCU enters into idle mode when the interrupt 463 - * returns. 
464 - */ 465 - void rcu_user_enter_after_irq(void) 466 - { 467 - unsigned long flags; 468 - struct rcu_dynticks *rdtp; 469 - 470 - local_irq_save(flags); 471 - rdtp = &__get_cpu_var(rcu_dynticks); 472 - /* Ensure this irq is interrupting a non-idle RCU state. */ 473 - WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); 474 - rdtp->dynticks_nesting = 1; 475 - local_irq_restore(flags); 476 446 } 477 447 #endif /* CONFIG_RCU_USER_QS */ 478 448 ··· 486 498 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); 487 499 else 488 500 rcu_eqs_enter_common(rdtp, oldval, true); 501 + rcu_sysidle_enter(rdtp, 1); 489 502 local_irq_restore(flags); 490 503 } 491 504 ··· 555 566 556 567 local_irq_save(flags); 557 568 rcu_eqs_exit(false); 569 + rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); 558 570 local_irq_restore(flags); 559 571 } 560 572 EXPORT_SYMBOL_GPL(rcu_idle_exit); ··· 570 580 void rcu_user_exit(void) 571 581 { 572 582 rcu_eqs_exit(1); 573 - } 574 - 575 - /** 576 - * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace 577 - * idle mode after the current non-nesting irq returns. 578 - * 579 - * This is similar to rcu_user_exit() but in the context of an irq. 580 - * This is called when the irq has interrupted a userspace RCU idle mode 581 - * context. When the current non-nesting interrupt returns after this call, 582 - * the CPU won't restore the RCU idle mode. 583 - */ 584 - void rcu_user_exit_after_irq(void) 585 - { 586 - unsigned long flags; 587 - struct rcu_dynticks *rdtp; 588 - 589 - local_irq_save(flags); 590 - rdtp = &__get_cpu_var(rcu_dynticks); 591 - /* Ensure we are interrupting an RCU idle mode. 
*/ 592 - WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); 593 - rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; 594 - local_irq_restore(flags); 595 583 } 596 584 #endif /* CONFIG_RCU_USER_QS */ 597 585 ··· 607 639 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); 608 640 else 609 641 rcu_eqs_exit_common(rdtp, oldval, true); 642 + rcu_sysidle_exit(rdtp, 1); 610 643 local_irq_restore(flags); 611 644 } 612 645 ··· 731 762 * credit them with an implicit quiescent state. Return 1 if this CPU 732 763 * is in dynticks idle mode, which is an extended quiescent state. 733 764 */ 734 - static int dyntick_save_progress_counter(struct rcu_data *rdp) 765 + static int dyntick_save_progress_counter(struct rcu_data *rdp, 766 + bool *isidle, unsigned long *maxj) 735 767 { 736 768 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 769 + rcu_sysidle_check_cpu(rdp, isidle, maxj); 737 770 return (rdp->dynticks_snap & 0x1) == 0; 738 771 } 739 772 ··· 745 774 * idle state since the last call to dyntick_save_progress_counter() 746 775 * for this same CPU, or by virtue of having been offline. 747 776 */ 748 - static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 777 + static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, 778 + bool *isidle, unsigned long *maxj) 749 779 { 750 780 unsigned int curr; 751 781 unsigned int snap; ··· 1304 1332 struct rcu_data *rdp; 1305 1333 struct rcu_node *rnp = rcu_get_root(rsp); 1306 1334 1335 + rcu_bind_gp_kthread(); 1307 1336 raw_spin_lock_irq(&rnp->lock); 1308 1337 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1309 1338 ··· 1369 1396 int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1370 1397 { 1371 1398 int fqs_state = fqs_state_in; 1399 + bool isidle = false; 1400 + unsigned long maxj; 1372 1401 struct rcu_node *rnp = rcu_get_root(rsp); 1373 1402 1374 1403 rsp->n_force_qs++; 1375 1404 if (fqs_state == RCU_SAVE_DYNTICK) { 1376 1405 /* Collect dyntick-idle snapshots. 
*/ 1377 - force_qs_rnp(rsp, dyntick_save_progress_counter); 1406 + if (is_sysidle_rcu_state(rsp)) { 1407 + isidle = 1; 1408 + maxj = jiffies - ULONG_MAX / 4; 1409 + } 1410 + force_qs_rnp(rsp, dyntick_save_progress_counter, 1411 + &isidle, &maxj); 1412 + rcu_sysidle_report_gp(rsp, isidle, maxj); 1378 1413 fqs_state = RCU_FORCE_QS; 1379 1414 } else { 1380 1415 /* Handle dyntick-idle and offline CPUs. */ 1381 - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); 1416 + isidle = 0; 1417 + force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); 1382 1418 } 1383 1419 /* Clear flag to prevent immediate re-entry. */ 1384 1420 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { ··· 1557 1575 1558 1576 /* 1559 1577 * We can't do wakeups while holding the rnp->lock, as that 1560 - * could cause possible deadlocks with the rq->lock. Deter 1561 - * the wakeup to interrupt context. 1578 + * could cause possible deadlocks with the rq->lock. Defer 1579 + * the wakeup to interrupt context. And don't bother waking 1580 + * up the running kthread. 1562 1581 */ 1563 - irq_work_queue(&rsp->wakeup_work); 1582 + if (current != rsp->gp_kthread) 1583 + irq_work_queue(&rsp->wakeup_work); 1564 1584 } 1565 1585 1566 1586 /* ··· 2088 2104 * 2089 2105 * The caller must have suppressed start of new grace periods. 
2090 2106 */ 2091 - static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 2107 + static void force_qs_rnp(struct rcu_state *rsp, 2108 + int (*f)(struct rcu_data *rsp, bool *isidle, 2109 + unsigned long *maxj), 2110 + bool *isidle, unsigned long *maxj) 2092 2111 { 2093 2112 unsigned long bit; 2094 2113 int cpu; ··· 2114 2127 cpu = rnp->grplo; 2115 2128 bit = 1; 2116 2129 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2117 - if ((rnp->qsmask & bit) != 0 && 2118 - f(per_cpu_ptr(rsp->rda, cpu))) 2119 - mask |= bit; 2130 + if ((rnp->qsmask & bit) != 0) { 2131 + if ((rnp->qsmaskinit & bit) != 0) 2132 + *isidle = 0; 2133 + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2134 + mask |= bit; 2135 + } 2120 2136 } 2121 2137 if (mask != 0) { 2122 2138 ··· 2294 2304 } 2295 2305 2296 2306 /* 2307 + * RCU callback function to leak a callback. 2308 + */ 2309 + static void rcu_leak_callback(struct rcu_head *rhp) 2310 + { 2311 + } 2312 + 2313 + /* 2297 2314 * Helper function for call_rcu() and friends. The cpu argument will 2298 2315 * normally be -1, indicating "currently running CPU". It may specify 2299 2316 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() ··· 2314 2317 struct rcu_data *rdp; 2315 2318 2316 2319 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ 2317 - debug_rcu_head_queue(head); 2320 + if (debug_rcu_head_queue(head)) { 2321 + /* Probable double call_rcu(), so leak the callback. */ 2322 + ACCESS_ONCE(head->func) = rcu_leak_callback; 2323 + WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); 2324 + return; 2325 + } 2318 2326 head->func = func; 2319 2327 head->next = NULL; 2320 2328 ··· 2804 2802 * transition. The "if" expression below therefore rounds the old 2805 2803 * value up to the next even number and adds two before comparing. 
2806 2804 */ 2807 - snap_done = ACCESS_ONCE(rsp->n_barrier_done); 2805 + snap_done = rsp->n_barrier_done; 2808 2806 _rcu_barrier_trace(rsp, "Check", -1, snap_done); 2809 - if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { 2807 + 2808 + /* 2809 + * If the value in snap is odd, we needed to wait for the current 2810 + * rcu_barrier() to complete, then wait for the next one, in other 2811 + * words, we need the value of snap_done to be three larger than 2812 + * the value of snap. On the other hand, if the value in snap is 2813 + * even, we only had to wait for the next rcu_barrier() to complete, 2814 + * in other words, we need the value of snap_done to be only two 2815 + * greater than the value of snap. The "(snap + 3) & ~0x1" computes 2816 + * this for us (thank you, Linus!). 2817 + */ 2818 + if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { 2810 2819 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); 2811 2820 smp_mb(); /* caller's subsequent code after above check. */ 2812 2821 mutex_unlock(&rsp->barrier_mutex); ··· 2960 2947 rdp->blimit = blimit; 2961 2948 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 2962 2949 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 2950 + rcu_sysidle_init_percpu_data(rdp->dynticks); 2963 2951 atomic_set(&rdp->dynticks->dynticks, 2964 2952 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2965 2953 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ ··· 3043 3029 break; 3044 3030 } 3045 3031 trace_rcu_utilization(TPS("End CPU hotplug")); 3032 + return NOTIFY_OK; 3033 + } 3034 + 3035 + static int rcu_pm_notify(struct notifier_block *self, 3036 + unsigned long action, void *hcpu) 3037 + { 3038 + switch (action) { 3039 + case PM_HIBERNATION_PREPARE: 3040 + case PM_SUSPEND_PREPARE: 3041 + if (nr_cpu_ids <= 256) /* Expediting bad for large systems. 
*/ 3042 + rcu_expedited = 1; 3043 + break; 3044 + case PM_POST_HIBERNATION: 3045 + case PM_POST_SUSPEND: 3046 + rcu_expedited = 0; 3047 + break; 3048 + default: 3049 + break; 3050 + } 3046 3051 return NOTIFY_OK; 3047 3052 } 3048 3053 ··· 3306 3273 * or the scheduler are operational. 3307 3274 */ 3308 3275 cpu_notifier(rcu_cpu_notify, 0); 3276 + pm_notifier(rcu_pm_notify, 0); 3309 3277 for_each_online_cpu(cpu) 3310 3278 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3311 3279 }

+17

kernel/rcutree.h

··· 88 88 /* Process level is worth LLONG_MAX/2. */ 89 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 90 90 atomic_t dynticks; /* Even value for idle, else odd. */ 91 + #ifdef CONFIG_NO_HZ_FULL_SYSIDLE 92 + long long dynticks_idle_nesting; 93 + /* irq/process nesting level from idle. */ 94 + atomic_t dynticks_idle; /* Even value for idle, else odd. */ 95 + /* "Idle" excludes userspace execution. */ 96 + unsigned long dynticks_idle_jiffies; 97 + /* End of last non-NMI non-idle period. */ 98 + #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 91 99 #ifdef CONFIG_RCU_FAST_NO_HZ 92 100 bool all_lazy; /* Are all CPU's CBs lazy? */ 93 101 unsigned long nonlazy_posted; ··· 553 545 static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 554 546 static void rcu_kick_nohz_cpu(int cpu); 555 547 static bool init_nocb_callback_list(struct rcu_data *rdp); 548 + static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 549 + static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); 550 + static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 551 + unsigned long *maxj); 552 + static bool is_sysidle_rcu_state(struct rcu_state *rsp); 553 + static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 554 + unsigned long maxj); 555 + static void rcu_bind_gp_kthread(void); 556 + static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); 556 557 557 558 #endif /* #ifndef RCU_TREE_NONCORE */ 558 559

+423 -1

kernel/rcutree_plugin.h

··· 28 28 #include <linux/gfp.h> 29 29 #include <linux/oom.h> 30 30 #include <linux/smpboot.h> 31 - #include <linux/tick.h> 31 + #include "time/tick-internal.h" 32 32 33 33 #define RCU_KTHREAD_PRIO 1 34 34 ··· 2373 2373 smp_send_reschedule(cpu); 2374 2374 #endif /* #ifdef CONFIG_NO_HZ_FULL */ 2375 2375 } 2376 + 2377 + 2378 + #ifdef CONFIG_NO_HZ_FULL_SYSIDLE 2379 + 2380 + /* 2381 + * Define RCU flavor that holds sysidle state. This needs to be the 2382 + * most active flavor of RCU. 2383 + */ 2384 + #ifdef CONFIG_PREEMPT_RCU 2385 + static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; 2386 + #else /* #ifdef CONFIG_PREEMPT_RCU */ 2387 + static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; 2388 + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 2389 + 2390 + static int full_sysidle_state; /* Current system-idle state. */ 2391 + #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ 2392 + #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ 2393 + #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ 2394 + #define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ 2395 + #define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ 2396 + 2397 + /* 2398 + * Invoked to note exit from irq or task transition to idle. Note that 2399 + * usermode execution does -not- count as idle here! After all, we want 2400 + * to detect full-system idle states, not RCU quiescent states and grace 2401 + * periods. The caller must have disabled interrupts. 2402 + */ 2403 + static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 2404 + { 2405 + unsigned long j; 2406 + 2407 + /* Adjust nesting, check for fully idle. */ 2408 + if (irq) { 2409 + rdtp->dynticks_idle_nesting--; 2410 + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); 2411 + if (rdtp->dynticks_idle_nesting != 0) 2412 + return; /* Still not fully idle. 
*/ 2413 + } else { 2414 + if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == 2415 + DYNTICK_TASK_NEST_VALUE) { 2416 + rdtp->dynticks_idle_nesting = 0; 2417 + } else { 2418 + rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; 2419 + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); 2420 + return; /* Still not fully idle. */ 2421 + } 2422 + } 2423 + 2424 + /* Record start of fully idle period. */ 2425 + j = jiffies; 2426 + ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; 2427 + smp_mb__before_atomic_inc(); 2428 + atomic_inc(&rdtp->dynticks_idle); 2429 + smp_mb__after_atomic_inc(); 2430 + WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); 2431 + } 2432 + 2433 + /* 2434 + * Unconditionally force exit from full system-idle state. This is 2435 + * invoked when a normal CPU exits idle, but must be called separately 2436 + * for the timekeeping CPU (tick_do_timer_cpu). The reason for this 2437 + * is that the timekeeping CPU is permitted to take scheduling-clock 2438 + * interrupts while the system is in system-idle state, and of course 2439 + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock 2440 + * interrupt from any other type of interrupt. 2441 + */ 2442 + void rcu_sysidle_force_exit(void) 2443 + { 2444 + int oldstate = ACCESS_ONCE(full_sysidle_state); 2445 + int newoldstate; 2446 + 2447 + /* 2448 + * Each pass through the following loop attempts to exit full 2449 + * system-idle state. If contention proves to be a problem, 2450 + * a trylock-based contention tree could be used here. 2451 + */ 2452 + while (oldstate > RCU_SYSIDLE_SHORT) { 2453 + newoldstate = cmpxchg(&full_sysidle_state, 2454 + oldstate, RCU_SYSIDLE_NOT); 2455 + if (oldstate == newoldstate && 2456 + oldstate == RCU_SYSIDLE_FULL_NOTED) { 2457 + rcu_kick_nohz_cpu(tick_do_timer_cpu); 2458 + return; /* We cleared it, done! */ 2459 + } 2460 + oldstate = newoldstate; 2461 + } 2462 + smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. 
*/ 2463 + } 2464 + 2465 + /* 2466 + * Invoked to note entry to irq or task transition from idle. Note that 2467 + * usermode execution does -not- count as idle here! The caller must 2468 + * have disabled interrupts. 2469 + */ 2470 + static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 2471 + { 2472 + /* Adjust nesting, check for already non-idle. */ 2473 + if (irq) { 2474 + rdtp->dynticks_idle_nesting++; 2475 + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); 2476 + if (rdtp->dynticks_idle_nesting != 1) 2477 + return; /* Already non-idle. */ 2478 + } else { 2479 + /* 2480 + * Allow for irq misnesting. Yes, it really is possible 2481 + * to enter an irq handler then never leave it, and maybe 2482 + * also vice versa. Handle both possibilities. 2483 + */ 2484 + if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { 2485 + rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; 2486 + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); 2487 + return; /* Already non-idle. */ 2488 + } else { 2489 + rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; 2490 + } 2491 + } 2492 + 2493 + /* Record end of idle period. */ 2494 + smp_mb__before_atomic_inc(); 2495 + atomic_inc(&rdtp->dynticks_idle); 2496 + smp_mb__after_atomic_inc(); 2497 + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); 2498 + 2499 + /* 2500 + * If we are the timekeeping CPU, we are permitted to be non-idle 2501 + * during a system-idle state. This must be the case, because 2502 + * the timekeeping CPU has to take scheduling-clock interrupts 2503 + * during the time that the system is transitioning to full 2504 + * system-idle state. This means that the timekeeping CPU must 2505 + * invoke rcu_sysidle_force_exit() directly if it does anything 2506 + * more than take a scheduling-clock interrupt. 2507 + */ 2508 + if (smp_processor_id() == tick_do_timer_cpu) 2509 + return; 2510 + 2511 + /* Update system-idle state: We are clearly no longer fully idle! 
*/ 2512 + rcu_sysidle_force_exit(); 2513 + } 2514 + 2515 + /* 2516 + * Check to see if the current CPU is idle. Note that usermode execution 2517 + * does not count as idle. The caller must have disabled interrupts. 2518 + */ 2519 + static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 2520 + unsigned long *maxj) 2521 + { 2522 + int cur; 2523 + unsigned long j; 2524 + struct rcu_dynticks *rdtp = rdp->dynticks; 2525 + 2526 + /* 2527 + * If some other CPU has already reported non-idle, if this is 2528 + * not the flavor of RCU that tracks sysidle state, or if this 2529 + * is an offline or the timekeeping CPU, nothing to do. 2530 + */ 2531 + if (!*isidle || rdp->rsp != rcu_sysidle_state || 2532 + cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) 2533 + return; 2534 + if (rcu_gp_in_progress(rdp->rsp)) 2535 + WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); 2536 + 2537 + /* Pick up current idle and NMI-nesting counter and check. */ 2538 + cur = atomic_read(&rdtp->dynticks_idle); 2539 + if (cur & 0x1) { 2540 + *isidle = false; /* We are not idle! */ 2541 + return; 2542 + } 2543 + smp_mb(); /* Read counters before timestamps. */ 2544 + 2545 + /* Pick up timestamps. */ 2546 + j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); 2547 + /* If this CPU entered idle more recently, update maxj timestamp. */ 2548 + if (ULONG_CMP_LT(*maxj, j)) 2549 + *maxj = j; 2550 + } 2551 + 2552 + /* 2553 + * Is this the flavor of RCU that is handling full-system idle? 2554 + */ 2555 + static bool is_sysidle_rcu_state(struct rcu_state *rsp) 2556 + { 2557 + return rsp == rcu_sysidle_state; 2558 + } 2559 + 2560 + /* 2561 + * Bind the grace-period kthread for the sysidle flavor of RCU to the 2562 + * timekeeping CPU. 
2563 + */ 2564 + static void rcu_bind_gp_kthread(void) 2565 + { 2566 + int cpu = ACCESS_ONCE(tick_do_timer_cpu); 2567 + 2568 + if (cpu < 0 || cpu >= nr_cpu_ids) 2569 + return; 2570 + if (raw_smp_processor_id() != cpu) 2571 + set_cpus_allowed_ptr(current, cpumask_of(cpu)); 2572 + } 2573 + 2574 + /* 2575 + * Return a delay in jiffies based on the number of CPUs, rcu_node 2576 + * leaf fanout, and jiffies tick rate. The idea is to allow larger 2577 + * systems more time to transition to full-idle state in order to 2578 + * avoid the cache thrashing that otherwise occur on the state variable. 2579 + * Really small systems (less than a couple of tens of CPUs) should 2580 + * instead use a single global atomically incremented counter, and later 2581 + * versions of this will automatically reconfigure themselves accordingly. 2582 + */ 2583 + static unsigned long rcu_sysidle_delay(void) 2584 + { 2585 + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) 2586 + return 0; 2587 + return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); 2588 + } 2589 + 2590 + /* 2591 + * Advance the full-system-idle state. This is invoked when all of 2592 + * the non-timekeeping CPUs are idle. 2593 + */ 2594 + static void rcu_sysidle(unsigned long j) 2595 + { 2596 + /* Check the current state. */ 2597 + switch (ACCESS_ONCE(full_sysidle_state)) { 2598 + case RCU_SYSIDLE_NOT: 2599 + 2600 + /* First time all are idle, so note a short idle period. */ 2601 + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; 2602 + break; 2603 + 2604 + case RCU_SYSIDLE_SHORT: 2605 + 2606 + /* 2607 + * Idle for a bit, time to advance to next state? 2608 + * cmpxchg failure means race with non-idle, let them win. 2609 + */ 2610 + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) 2611 + (void)cmpxchg(&full_sysidle_state, 2612 + RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); 2613 + break; 2614 + 2615 + case RCU_SYSIDLE_LONG: 2616 + 2617 + /* 2618 + * Do an additional check pass before advancing to full. 
2619 + * cmpxchg failure means race with non-idle, let them win. 2620 + */ 2621 + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) 2622 + (void)cmpxchg(&full_sysidle_state, 2623 + RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); 2624 + break; 2625 + 2626 + default: 2627 + break; 2628 + } 2629 + } 2630 + 2631 + /* 2632 + * Found a non-idle non-timekeeping CPU, so kick the system-idle state 2633 + * back to the beginning. 2634 + */ 2635 + static void rcu_sysidle_cancel(void) 2636 + { 2637 + smp_mb(); 2638 + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; 2639 + } 2640 + 2641 + /* 2642 + * Update the sysidle state based on the results of a force-quiescent-state 2643 + * scan of the CPUs' dyntick-idle state. 2644 + */ 2645 + static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, 2646 + unsigned long maxj, bool gpkt) 2647 + { 2648 + if (rsp != rcu_sysidle_state) 2649 + return; /* Wrong flavor, ignore. */ 2650 + if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) 2651 + return; /* Running state machine from timekeeping CPU. */ 2652 + if (isidle) 2653 + rcu_sysidle(maxj); /* More idle! */ 2654 + else 2655 + rcu_sysidle_cancel(); /* Idle is over. */ 2656 + } 2657 + 2658 + /* 2659 + * Wrapper for rcu_sysidle_report() when called from the grace-period 2660 + * kthread's context. 2661 + */ 2662 + static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 2663 + unsigned long maxj) 2664 + { 2665 + rcu_sysidle_report(rsp, isidle, maxj, true); 2666 + } 2667 + 2668 + /* Callback and function for forcing an RCU grace period. */ 2669 + struct rcu_sysidle_head { 2670 + struct rcu_head rh; 2671 + int inuse; 2672 + }; 2673 + 2674 + static void rcu_sysidle_cb(struct rcu_head *rhp) 2675 + { 2676 + struct rcu_sysidle_head *rshp; 2677 + 2678 + /* 2679 + * The following memory barrier is needed to replace the 2680 + * memory barriers that would normally be in the memory 2681 + * allocator. 2682 + */ 2683 + smp_mb(); /* grace period precedes setting inuse. 
*/ 2684 + 2685 + rshp = container_of(rhp, struct rcu_sysidle_head, rh); 2686 + ACCESS_ONCE(rshp->inuse) = 0; 2687 + } 2688 + 2689 + /* 2690 + * Check to see if the system is fully idle, other than the timekeeping CPU. 2691 + * The caller must have disabled interrupts. 2692 + */ 2693 + bool rcu_sys_is_idle(void) 2694 + { 2695 + static struct rcu_sysidle_head rsh; 2696 + int rss = ACCESS_ONCE(full_sysidle_state); 2697 + 2698 + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) 2699 + return false; 2700 + 2701 + /* Handle small-system case by doing a full scan of CPUs. */ 2702 + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { 2703 + int oldrss = rss - 1; 2704 + 2705 + /* 2706 + * One pass to advance to each state up to _FULL. 2707 + * Give up if any pass fails to advance the state. 2708 + */ 2709 + while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { 2710 + int cpu; 2711 + bool isidle = true; 2712 + unsigned long maxj = jiffies - ULONG_MAX / 4; 2713 + struct rcu_data *rdp; 2714 + 2715 + /* Scan all the CPUs looking for nonidle CPUs. */ 2716 + for_each_possible_cpu(cpu) { 2717 + rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); 2718 + rcu_sysidle_check_cpu(rdp, &isidle, &maxj); 2719 + if (!isidle) 2720 + break; 2721 + } 2722 + rcu_sysidle_report(rcu_sysidle_state, 2723 + isidle, maxj, false); 2724 + oldrss = rss; 2725 + rss = ACCESS_ONCE(full_sysidle_state); 2726 + } 2727 + } 2728 + 2729 + /* If this is the first observation of an idle period, record it. */ 2730 + if (rss == RCU_SYSIDLE_FULL) { 2731 + rss = cmpxchg(&full_sysidle_state, 2732 + RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); 2733 + return rss == RCU_SYSIDLE_FULL; 2734 + } 2735 + 2736 + smp_mb(); /* ensure rss load happens before later caller actions. */ 2737 + 2738 + /* If already fully idle, tell the caller (in case of races). 
*/ 2739 + if (rss == RCU_SYSIDLE_FULL_NOTED) 2740 + return true; 2741 + 2742 + /* 2743 + * If we aren't there yet, and a grace period is not in flight, 2744 + * initiate a grace period. Either way, tell the caller that 2745 + * we are not there yet. We use an xchg() rather than an assignment 2746 + * to make up for the memory barriers that would otherwise be 2747 + * provided by the memory allocator. 2748 + */ 2749 + if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && 2750 + !rcu_gp_in_progress(rcu_sysidle_state) && 2751 + !rsh.inuse && xchg(&rsh.inuse, 1) == 0) 2752 + call_rcu(&rsh.rh, rcu_sysidle_cb); 2753 + return false; 2754 + } 2755 + 2756 + /* 2757 + * Initialize dynticks sysidle state for CPUs coming online. 2758 + */ 2759 + static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) 2760 + { 2761 + rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; 2762 + } 2763 + 2764 + #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 2765 + 2766 + static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 2767 + { 2768 + } 2769 + 2770 + static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 2771 + { 2772 + } 2773 + 2774 + static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 2775 + unsigned long *maxj) 2776 + { 2777 + } 2778 + 2779 + static bool is_sysidle_rcu_state(struct rcu_state *rsp) 2780 + { 2781 + return false; 2782 + } 2783 + 2784 + static void rcu_bind_gp_kthread(void) 2785 + { 2786 + } 2787 + 2788 + static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 2789 + unsigned long maxj) 2790 + { 2791 + } 2792 + 2793 + static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) 2794 + { 2795 + } 2796 + 2797 + #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */

+50

kernel/time/Kconfig

··· 134 134 Note the boot CPU will still be kept outside the range to 135 135 handle the timekeeping duty. 136 136 137 + config NO_HZ_FULL_SYSIDLE 138 + bool "Detect full-system idle state for full dynticks system" 139 + depends on NO_HZ_FULL 140 + default n 141 + help 142 + At least one CPU must keep the scheduling-clock tick running for 143 + timekeeping purposes whenever there is a non-idle CPU, where 144 + "non-idle" also includes dynticks CPUs as long as they are 145 + running non-idle tasks. Because the underlying adaptive-tick 146 + support cannot distinguish between all CPUs being idle and 147 + all CPUs each running a single task in dynticks mode, the 148 + underlying support simply ensures that there is always a CPU 149 + handling the scheduling-clock tick, whether or not all CPUs 150 + are idle. This Kconfig option enables scalable detection of 151 + the all-CPUs-idle state, thus allowing the scheduling-clock 152 + tick to be disabled when all CPUs are idle. Note that scalable 153 + detection of the all-CPUs-idle state means that larger systems 154 + will be slower to declare the all-CPUs-idle state. 155 + 156 + Say Y if you would like to help debug all-CPUs-idle detection. 157 + 158 + Say N if you are unsure. 159 + 160 + config NO_HZ_FULL_SYSIDLE_SMALL 161 + int "Number of CPUs above which large-system approach is used" 162 + depends on NO_HZ_FULL_SYSIDLE 163 + range 1 NR_CPUS 164 + default 8 165 + help 166 + The full-system idle detection mechanism takes a lazy approach 167 + on large systems, as is required to attain decent scalability. 168 + However, on smaller systems, scalability is not anywhere near as 169 + large a concern as is energy efficiency. The sysidle subsystem 170 + therefore uses a fast but non-scalable algorithm for small 171 + systems and a lazier but scalable algorithm for large systems. 172 + This Kconfig parameter defines the number of CPUs in the largest 173 + system that will be considered to be "small". 
174 + 175 + The default value will be fine in most cases. Battery-powered 176 + systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger 177 + numbers of CPUs, and (3) are suffering from battery-lifetime 178 + problems due to long sysidle latencies might wish to experiment 179 + with larger values for this Kconfig parameter. On the other 180 + hand, they might be even better served by disabling NO_HZ_FULL 181 + entirely, given that NO_HZ_FULL is intended for HPC and 182 + real-time workloads that at present do not tend to be run on 183 + battery-powered systems. 184 + 185 + Take the default if you are unsure. 186 + 137 187 config NO_HZ 138 188 bool "Old Idle dynticks config" 139 189 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS

+14 -6

lib/debugobjects.c

··· 381 381 * debug_object_activate - debug checks when an object is activated 382 382 * @addr: address of the object 383 383 * @descr: pointer to an object specific debug description structure 384 + * Returns 0 for success, -EINVAL for check failed. 384 385 */ 385 - void debug_object_activate(void *addr, struct debug_obj_descr *descr) 386 + int debug_object_activate(void *addr, struct debug_obj_descr *descr) 386 387 { 387 388 enum debug_obj_state state; 388 389 struct debug_bucket *db; 389 390 struct debug_obj *obj; 390 391 unsigned long flags; 392 + int ret; 391 393 struct debug_obj o = { .object = addr, 392 394 .state = ODEBUG_STATE_NOTAVAILABLE, 393 395 .descr = descr }; 394 396 395 397 if (!debug_objects_enabled) 396 - return; 398 + return 0; 397 399 398 400 db = get_bucket((unsigned long) addr); 399 401 ··· 407 405 case ODEBUG_STATE_INIT: 408 406 case ODEBUG_STATE_INACTIVE: 409 407 obj->state = ODEBUG_STATE_ACTIVE; 408 + ret = 0; 410 409 break; 411 410 412 411 case ODEBUG_STATE_ACTIVE: 413 412 debug_print_object(obj, "activate"); 414 413 state = obj->state; 415 414 raw_spin_unlock_irqrestore(&db->lock, flags); 416 - debug_object_fixup(descr->fixup_activate, addr, state); 417 - return; 415 + ret = debug_object_fixup(descr->fixup_activate, addr, state); 416 + return ret ? -EINVAL : 0; 418 417 419 418 case ODEBUG_STATE_DESTROYED: 420 419 debug_print_object(obj, "activate"); 420 + ret = -EINVAL; 421 421 break; 422 422 default: 423 + ret = 0; 423 424 break; 424 425 } 425 426 raw_spin_unlock_irqrestore(&db->lock, flags); 426 - return; 427 + return ret; 427 428 } 428 429 429 430 raw_spin_unlock_irqrestore(&db->lock, flags); ··· 436 431 * true or not. 437 432 */ 438 433 if (debug_object_fixup(descr->fixup_activate, addr, 439 - ODEBUG_STATE_NOTAVAILABLE)) 434 + ODEBUG_STATE_NOTAVAILABLE)) { 440 435 debug_print_object(&o, "activate"); 436 + return -EINVAL; 437 + } 438 + return 0; 441 439 } 442 440 443 441 /**