Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RCU changes from Ingo Molnar:
"This is the v3.5 RCU tree from Paul E. McKenney:

1) A set of improvements and fixes to the RCU_FAST_NO_HZ feature (with
more on the way for 3.6). Posted to LKML:
https://lkml.org/lkml/2012/4/23/324 (commits 1-3 and 5),
https://lkml.org/lkml/2012/4/16/611 (commit 4),
https://lkml.org/lkml/2012/4/30/390 (commit 6), and
https://lkml.org/lkml/2012/5/4/410 (commit 7, combined with
the other commits for the convenience of the tester).

2) Changes to make rcu_barrier() avoid disrupting execution of CPUs
that have no RCU callbacks. Posted to LKML:
https://lkml.org/lkml/2012/4/23/322.

3) A couple of commits that improve the efficiency of the interaction
between preemptible RCU and the scheduler; these two are all that
survived an abortive attempt to allow preemptible RCU's
__rcu_read_lock() to be inlined. The full set was posted to LKML at
https://lkml.org/lkml/2012/4/14/143, and the first and third patches
of that set remain.

4) Lai Jiangshan's algorithmic implementation of SRCU, which includes
call_srcu() and srcu_barrier(). A major feature of this new
implementation is that synchronize_srcu() no longer disturbs the
execution of other CPUs. This work is based on earlier
implementations by Peter Zijlstra and Paul E. McKenney. Posted to
LKML: https://lkml.org/lkml/2012/2/22/82.

5) A number of miscellaneous bug fixes and improvements, posted to
LKML at https://lkml.org/lkml/2012/4/23/353, with subsequent
updates also posted to LKML."

* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
rcu: Make rcu_barrier() less disruptive
rcu: Explicitly initialize RCU_FAST_NO_HZ per-CPU variables
rcu: Make RCU_FAST_NO_HZ handle timer migration
rcu: Update RCU maintainership
rcu: Make exit_rcu() more precise and consolidate
rcu: Move PREEMPT_RCU preemption to switch_to() invocation
rcu: Ensure that RCU_FAST_NO_HZ timers expire on correct CPU
rcu: Add rcutorture test for call_srcu()
rcu: Implement per-domain single-threaded call_srcu() state machine
rcu: Use single value to handle expedited SRCU grace periods
rcu: Improve srcu_readers_active_idx()'s cache locality
rcu: Remove unused srcu_barrier()
rcu: Implement a variant of Peter's SRCU algorithm
rcu: Improve SRCU's wait_idx() comments
rcu: Flip ->completed only once per SRCU grace period
rcu: Increment upper bit only for srcu_read_lock()
rcu: Remove fast check path from __synchronize_srcu()
rcu: Direct algorithmic SRCU implementation
rcu: Introduce rcutorture testing for rcu_barrier()
timer: Fix mod_timer_pinned() header comment
...

+1358 -353
+14 -1
Documentation/RCU/torture.txt
@@ -47,6 +47,16 @@
 		permit this.  (Or, more accurately, variants of RCU that do
 		-not- permit this know to ignore this variable.)
 
+n_barrier_cbs	If this is nonzero, RCU barrier testing will be conducted,
+		in which case n_barrier_cbs specifies the number of
+		RCU callbacks (and corresponding kthreads) to use for
+		this testing.  The value cannot be negative.  If you
+		specify this to be non-zero when torture_type indicates a
+		synchronous RCU implementation (one for which a member of
+		the synchronize_rcu() rather than the call_rcu() family is
+		used -- see the documentation for torture_type below), an
+		error will be reported and no testing will be carried out.
+
 nfakewriters	This is the number of RCU fake writer threads to run.  Fake
 		writer threads repeatedly use the synchronous "wait for
 		current readers" function of the interface selected by
@@ -198,7 +188,7 @@
 The statistics output is as follows:
 
 	rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4
-	rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767
+	rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767
 	rcu-torture: Reader Pipe: 727860534 34213 0 0 0 0 0 0 0 0 0
 	rcu-torture: Reader Batch: 727877838 17003 0 0 0 0 0 0 0 0 0
 	rcu-torture: Free-Block Circulation: 155440 155440 155440 155440 155440 155440 155440 155440 155440 155440 0
@@ -239,6 +229,9 @@
 o	"rtmbe": A non-zero value indicates that rcutorture believes that
 	rcu_assign_pointer() and rcu_dereference() are not working
 	correctly.  This value should be zero.
+
+o	"rtbe": A non-zero value indicates that one of the rcu_barrier()
+	family of functions is not working correctly.
 
 o	"rtbke": rcutorture was unable to create the real-time kthreads
 	used to force RCU priority inversion.  This value should be zero.
+85 -3
Documentation/kernel-parameters.txt
@@ -2333,17 +2333,99 @@
 	ramdisk_size=	[RAM] Sizes of RAM disks in kilobytes
 			See Documentation/blockdev/ramdisk.txt.
 
-	rcupdate.blimit=	[KNL,BOOT]
+	rcutree.blimit=	[KNL,BOOT]
 			Set maximum number of finished RCU callbacks to process
 			in one batch.
 
-	rcupdate.qhimark=	[KNL,BOOT]
+	rcutree.qhimark=	[KNL,BOOT]
 			Set threshold of queued
 			RCU callbacks over which batch limiting is disabled.
 
-	rcupdate.qlowmark=	[KNL,BOOT]
+	rcutree.qlowmark=	[KNL,BOOT]
 			Set threshold of queued RCU callbacks below which
 			batch limiting is re-enabled.
+
+	rcutree.rcu_cpu_stall_suppress=	[KNL,BOOT]
+			Suppress RCU CPU stall warning messages.
+
+	rcutree.rcu_cpu_stall_timeout= [KNL,BOOT]
+			Set timeout for RCU CPU stall warning messages.
+
+	rcutorture.fqs_duration= [KNL,BOOT]
+			Set duration of force_quiescent_state bursts.
+
+	rcutorture.fqs_holdoff= [KNL,BOOT]
+			Set holdoff time within force_quiescent_state bursts.
+
+	rcutorture.fqs_stutter= [KNL,BOOT]
+			Set wait time between force_quiescent_state bursts.
+
+	rcutorture.irqreader= [KNL,BOOT]
+			Test RCU readers from irq handlers.
+
+	rcutorture.n_barrier_cbs= [KNL,BOOT]
+			Set callbacks/threads for rcu_barrier() testing.
+
+	rcutorture.nfakewriters= [KNL,BOOT]
+			Set number of concurrent RCU writers.  These just
+			stress RCU, they don't participate in the actual
+			test, hence the "fake".
+
+	rcutorture.nreaders= [KNL,BOOT]
+			Set number of RCU readers.
+
+	rcutorture.onoff_holdoff= [KNL,BOOT]
+			Set time (s) after boot for CPU-hotplug testing.
+
+	rcutorture.onoff_interval= [KNL,BOOT]
+			Set time (s) between CPU-hotplug operations, or
+			zero to disable CPU-hotplug testing.
+
+	rcutorture.shuffle_interval= [KNL,BOOT]
+			Set task-shuffle interval (s).  Shuffling tasks
+			allows some CPUs to go into dyntick-idle mode
+			during the rcutorture test.
+
+	rcutorture.shutdown_secs= [KNL,BOOT]
+			Set time (s) after boot system shutdown.  This
+			is useful for hands-off automated testing.
+
+	rcutorture.stall_cpu= [KNL,BOOT]
+			Duration of CPU stall (s) to test RCU CPU stall
+			warnings, zero to disable.
+
+	rcutorture.stall_cpu_holdoff= [KNL,BOOT]
+			Time to wait (s) after boot before inducing stall.
+
+	rcutorture.stat_interval= [KNL,BOOT]
+			Time (s) between statistics printk()s.
+
+	rcutorture.stutter= [KNL,BOOT]
+			Time (s) to stutter testing, for example, specifying
+			five seconds causes the test to run for five seconds,
+			wait for five seconds, and so on.  This tests RCU's
+			ability to transition abruptly to and from idle.
+
+	rcutorture.test_boost= [KNL,BOOT]
+			Test RCU priority boosting?  0=no, 1=maybe, 2=yes.
+			"Maybe" means test if the RCU implementation
+			under test support RCU priority boosting.
+
+	rcutorture.test_boost_duration= [KNL,BOOT]
+			Duration (s) of each individual boost test.
+
+	rcutorture.test_boost_interval= [KNL,BOOT]
+			Interval (s) between each boost test.
+
+	rcutorture.test_no_idle_hz= [KNL,BOOT]
+			Test RCU's dyntick-idle handling.  See also the
+			rcutorture.shuffle_interval parameter.
+
+	rcutorture.torture_type= [KNL,BOOT]
+			Specify the RCU implementation to test.
+
+	rcutorture.verbose= [KNL,BOOT]
+			Enable additional printk() statements.
 
 	rdinit=		[KNL]
 			Format: <full_path>
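For orientation, the new `rcutree.*` and `rcutorture.*` parameters above are ordinary module parameters, so they can be combined on the kernel command line. A purely illustrative boot line (the parameter values here are made up, not recommendations) might look like:

```
rcutorture.torture_type=srcu rcutorture.n_barrier_cbs=4 \
rcutorture.stat_interval=15 rcutree.rcu_cpu_stall_timeout=60
```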
+11 -3
MAINTAINERS
@@ -5598,14 +5598,13 @@
 READ-COPY UPDATE (RCU)
 M:	Dipankar Sarma <dipankar@in.ibm.com>
 M:	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
-W:	http://www.rdrop.com/users/paulmck/rclock/
+W:	http://www.rdrop.com/users/paulmck/RCU/
 S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
 F:	Documentation/RCU/
+X:	Documentation/RCU/torture.txt
 F:	include/linux/rcu*
-F:	include/linux/srcu*
 F:	kernel/rcu*
-F:	kernel/srcu*
 X:	kernel/rcutorture.c
 
 REAL TIME CLOCK (RTC) SUBSYSTEM
@@ -6120,6 +6121,15 @@
 S:	Maintained
 F:	include/linux/sl?b*.h
 F:	mm/sl?b.c
+
+SLEEPABLE READ-COPY UPDATE (SRCU)
+M:	Lai Jiangshan <laijs@cn.fujitsu.com>
+M:	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+W:	http://www.rdrop.com/users/paulmck/RCU/
+S:	Supported
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
+F:	include/linux/srcu*
+F:	kernel/srcu*
 
 SMC91x ETHERNET DRIVER
 M:	Nicolas Pitre <nico@fluxnic.net>
+1
arch/um/drivers/mconsole_kern.c
@@ -705,6 +705,7 @@
 	struct task_struct *from = current, *to = arg;
 
 	to->thread.saved_task = from;
+	rcu_switch_from(from);
 	switch_to(from, to, from);
 }
 
+35 -5
include/linux/rculist.h
@@ -30,6 +30,7 @@
  * This is only for internal list manipulation where we know
  * the prev/next entries already!
  */
+#ifndef CONFIG_DEBUG_LIST
 static inline void __list_add_rcu(struct list_head *new,
 		struct list_head *prev, struct list_head *next)
 {
@@ -39,6 +38,10 @@
 	rcu_assign_pointer(list_next_rcu(prev), new);
 	next->prev = new;
 }
+#else
+extern void __list_add_rcu(struct list_head *new,
+		struct list_head *prev, struct list_head *next);
+#endif
 
 /**
  * list_add_rcu - add a new entry to rcu-protected list
@@ -113,7 +108,7 @@
  */
 static inline void list_del_rcu(struct list_head *entry)
 {
-	__list_del(entry->prev, entry->next);
+	__list_del_entry(entry);
 	entry->prev = LIST_POISON2;
 }
 
@@ -233,15 +228,40 @@
 })
 
 /**
- * list_first_entry_rcu - get the first element from a list
+ * Where are list_empty_rcu() and list_first_entry_rcu()?
+ *
+ * Implementing those functions following their counterparts list_empty() and
+ * list_first_entry() is not advisable because they lead to subtle race
+ * conditions as the following snippet shows:
+ *
+ * if (!list_empty_rcu(mylist)) {
+ *	struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member);
+ *	do_something(bar);
+ * }
+ *
+ * The list may not be empty when list_empty_rcu checks it, but it may be when
+ * list_first_entry_rcu rereads the ->next pointer.
+ *
+ * Rereading the ->next pointer is not a problem for list_empty() and
+ * list_first_entry() because they would be protected by a lock that blocks
+ * writers.
+ *
+ * See list_first_or_null_rcu for an alternative.
+ */
+
+/**
+ * list_first_or_null_rcu - get the first element from a list
  * @ptr:	the list head to take the element from.
  * @type:	the type of the struct this is embedded in.
  * @member:	the name of the list_struct within the struct.
  *
- * Note, that list is expected to be not empty.
+ * Note that if the list is empty, it returns NULL.
  *
  * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
  */
-#define list_first_entry_rcu(ptr, type, member) \
-	list_entry_rcu((ptr)->next, type, member)
+#define list_first_or_null_rcu(ptr, type, member) \
+	({struct list_head *__ptr = (ptr); \
+	  struct list_head __rcu *__next = list_next_rcu(__ptr); \
+	  likely(__ptr != __next) ? container_of(__next, type, member) : NULL; \
+	})
 
 /**
  * list_for_each_entry_rcu - iterate over rcu list of given type
+20
include/linux/rcupdate.h
@@ -184,12 +184,14 @@
 /* Internal to kernel */
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
+extern void rcu_preempt_note_context_switch(void);
 extern void rcu_check_callbacks(int cpu, int user);
 struct notifier_block;
 extern void rcu_idle_enter(void);
 extern void rcu_idle_exit(void);
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
+extern void exit_rcu(void);
 
 /**
  * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers
@@ -922,6 +924,21 @@
 	kfree_call_rcu(head, (rcu_callback)offset);
 }
 
+/*
+ * Does the specified offset indicate that the corresponding rcu_head
+ * structure can be handled by kfree_rcu()?
+ */
+#define __is_kfree_rcu_offset(offset) ((offset) < 4096)
+
+/*
+ * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
+ */
+#define __kfree_rcu(head, offset) \
+	do { \
+		BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \
+		call_rcu(head, (void (*)(struct rcu_head *))(unsigned long)(offset)); \
+	} while (0)
+
 /**
  * kfree_rcu() - kfree an object after a grace period.
  * @ptr:	pointer to kfree
@@ -944,6 +961,9 @@
 *
 * Note that the allowable offset might decrease in the future, for example,
 * to allow something like kmem_cache_free_rcu().
+ *
+ * The BUILD_BUG_ON check must not involve any function calls, hence the
+ * checks are done in macros here.
 */
 #define kfree_rcu(ptr, rcu_head) \
 	__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
-11
include/linux/rcutiny.h
@@ -87,14 +87,6 @@
 
 #ifdef CONFIG_TINY_RCU
 
-static inline void rcu_preempt_note_context_switch(void)
-{
-}
-
-static inline void exit_rcu(void)
-{
-}
-
 static inline int rcu_needs_cpu(int cpu)
 {
 	return 0;
@@ -102,8 +94,6 @@
 
 #else /* #ifdef CONFIG_TINY_RCU */
 
-void rcu_preempt_note_context_switch(void);
-extern void exit_rcu(void);
 int rcu_preempt_needs_cpu(void);
 
 static inline int rcu_needs_cpu(int cpu)
@@ -116,7 +106,6 @@
 static inline void rcu_note_context_switch(int cpu)
 {
 	rcu_sched_qs(cpu);
-	rcu_preempt_note_context_switch();
 }
 
 /*
-19
include/linux/rcutree.h
@@ -45,18 +45,6 @@
 	rcu_note_context_switch(cpu);
 }
 
-#ifdef CONFIG_TREE_PREEMPT_RCU
-
-extern void exit_rcu(void);
-
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-
-static inline void exit_rcu(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
-
 extern void synchronize_rcu_bh(void);
 extern void synchronize_sched_expedited(void);
 extern void synchronize_rcu_expedited(void);
@@ -97,13 +85,6 @@
 extern void rcu_force_quiescent_state(void);
 extern void rcu_bh_force_quiescent_state(void);
 extern void rcu_sched_force_quiescent_state(void);
-
-/* A context switch is a grace period for RCU-sched and RCU-bh. */
-static inline int rcu_blocking_is_gp(void)
-{
-	might_sleep();  /* Check for RCU read-side critical section. */
-	return num_online_cpus() == 1;
-}
 
 extern void rcu_scheduler_starting(void);
 extern int rcu_scheduler_active __read_mostly;
+10
include/linux/sched.h
@@ -1905,9 +1905,19 @@
 	INIT_LIST_HEAD(&p->rcu_node_entry);
 }
 
+static inline void rcu_switch_from(struct task_struct *prev)
+{
+	if (prev->rcu_read_lock_nesting != 0)
+		rcu_preempt_note_context_switch();
+}
+
 #else
 
 static inline void rcu_copy_process(struct task_struct *p)
+{
+}
+
+static inline void rcu_switch_from(struct task_struct *prev)
 {
 }
 
+39 -9
include/linux/srcu.h
@@ -29,23 +29,32 @@
 
 #include <linux/mutex.h>
 #include <linux/rcupdate.h>
+#include <linux/workqueue.h>
 
 struct srcu_struct_array {
-	int c[2];
+	unsigned long c[2];
+	unsigned long seq[2];
+};
+
+struct rcu_batch {
+	struct rcu_head *head, **tail;
 };
 
 struct srcu_struct {
-	int completed;
+	unsigned completed;
 	struct srcu_struct_array __percpu *per_cpu_ref;
-	struct mutex mutex;
+	spinlock_t queue_lock; /* protect ->batch_queue, ->running */
+	bool running;
+	/* callbacks just queued */
+	struct rcu_batch batch_queue;
+	/* callbacks try to do the first check_zero */
+	struct rcu_batch batch_check0;
+	/* callbacks done with the first check_zero and the flip */
+	struct rcu_batch batch_check1;
+	struct rcu_batch batch_done;
+	struct delayed_work work;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 };
-
-#ifndef CONFIG_PREEMPT
-#define srcu_barrier() barrier()
-#else /* #ifndef CONFIG_PREEMPT */
-#define srcu_barrier()
-#endif /* #else #ifndef CONFIG_PREEMPT */
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
@@ -76,8 +67,28 @@
 
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
+/**
+ * call_srcu() - Queue a callback for invocation after an SRCU grace period
+ * @sp: srcu_struct in queue the callback
+ * @head: structure to be used for queueing the SRCU callback.
+ * @func: function to be invoked after the SRCU grace period
+ *
+ * The callback function will be invoked some time after a full SRCU
+ * grace period elapses, in other words after all pre-existing SRCU
+ * read-side critical sections have completed.  However, the callback
+ * function might well execute concurrently with other SRCU read-side
+ * critical sections that started after call_srcu() was invoked.  SRCU
+ * read-side critical sections are delimited by srcu_read_lock() and
+ * srcu_read_unlock(), and may be nested.
+ *
+ * The callback will be invoked from process context, but must nevertheless
+ * be fast and must not block.
+ */
+void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
+	       void (*func)(struct rcu_head *head));
+
 void cleanup_srcu_struct(struct srcu_struct *sp);
 int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
 void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
 void synchronize_srcu(struct srcu_struct *sp);
 void synchronize_srcu_expedited(struct srcu_struct *sp);
 long srcu_batches_completed(struct srcu_struct *sp);
+void srcu_barrier(struct srcu_struct *sp);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
+2
include/trace/events/rcu.h
@@ -292,6 +292,8 @@
 * "More callbacks": Still more callbacks, try again to clear them out.
 * "Callbacks drained": All callbacks processed, off to dyntick idle!
 * "Timer": Timer fired to cause CPU to continue processing callbacks.
+ * "Demigrate": Timer fired on wrong CPU, woke up correct CPU.
+ * "Cleanup after idle": Idle exited, timer canceled.
 */
 TRACE_EVENT(rcu_prep_idle,
 
+46 -4
init/Kconfig
@@ -458,6 +458,33 @@
 	  Select a specific number if testing RCU itself.
 	  Take the default if unsure.
 
+config RCU_FANOUT_LEAF
+	int "Tree-based hierarchical RCU leaf-level fanout value"
+	range 2 RCU_FANOUT if 64BIT
+	range 2 RCU_FANOUT if !64BIT
+	depends on TREE_RCU || TREE_PREEMPT_RCU
+	default 16
+	help
+	  This option controls the leaf-level fanout of hierarchical
+	  implementations of RCU, and allows trading off cache misses
+	  against lock contention.  Systems that synchronize their
+	  scheduling-clock interrupts for energy-efficiency reasons will
+	  want the default because the smaller leaf-level fanout keeps
+	  lock contention levels acceptably low.  Very large systems
+	  (hundreds or thousands of CPUs) will instead want to set this
+	  value to the maximum value possible in order to reduce the
+	  number of cache misses incurred during RCU's grace-period
+	  initialization.  These systems tend to run CPU-bound, and thus
+	  are not helped by synchronized interrupts, and thus tend to
+	  skew them, which reduces lock contention enough that large
+	  leaf-level fanouts work well.
+
+	  Select a specific number if testing RCU itself.
+
+	  Select the maximum permissible value for large systems.
+
+	  Take the default if unsure.
+
 config RCU_FANOUT_EXACT
 	bool "Disable tree-based hierarchical RCU auto-balancing"
 	depends on TREE_RCU || TREE_PREEMPT_RCU
@@ -515,10 +542,27 @@
 	depends on RCU_BOOST
 	default 1
 	help
-	  This option specifies the real-time priority to which preempted
-	  RCU readers are to be boosted.  If you are working with CPU-bound
-	  real-time applications, you should specify a priority higher then
-	  the highest-priority CPU-bound application.
+	  This option specifies the real-time priority to which long-term
+	  preempted RCU readers are to be boosted.  If you are working
+	  with a real-time application that has one or more CPU-bound
+	  threads running at a real-time priority level, you should set
+	  RCU_BOOST_PRIO to a priority higher then the highest-priority
+	  real-time CPU-bound thread.  The default RCU_BOOST_PRIO value
+	  of 1 is appropriate in the common case, which is real-time
+	  applications that do not have any CPU-bound threads.
+
+	  Some real-time applications might not have a single real-time
+	  thread that saturates a given CPU, but instead might have
+	  multiple real-time threads that, taken together, fully utilize
+	  that CPU.  In this case, you should set RCU_BOOST_PRIO to
+	  a priority higher than the lowest-priority thread that is
+	  conspiring to prevent the CPU from running any non-real-time
+	  tasks.  For example, if one thread at priority 10 and another
+	  thread at priority 5 are between themselves fully consuming
+	  the CPU time on a given CPU, then RCU_BOOST_PRIO should be
+	  set to priority 6 or higher.
 
 	  Specify the real-time priority, or take the default if unsure.
 
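As a point of reference, the new symbol would appear in a kernel `.config` alongside the existing fanout settings; a purely hypothetical fragment for a large 64-bit system (values chosen for illustration only):

```
CONFIG_TREE_RCU=y
CONFIG_RCU_FANOUT=64
CONFIG_RCU_FANOUT_LEAF=64
```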
+28
kernel/rcupdate.c
@@ -51,6 +51,34 @@
 
 #include "rcu.h"
 
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Check for a task exiting while in a preemptible-RCU read-side
+ * critical section, clean up if so.  No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+	struct task_struct *t = current;
+
+	if (likely(list_empty(&current->rcu_node_entry)))
+		return;
+	t->rcu_read_lock_nesting = 1;
+	barrier();
+	t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
+	__rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+void exit_rcu(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
 struct lockdep_map rcu_lock_map =
-16
kernel/rcutiny_plugin.h
@@ -851,22 +851,6 @@
 	return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
 }
 
-/*
- * Check for a task exiting while in a preemptible -RCU read-side
- * critical section, clean up if so.  No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
-	struct task_struct *t = current;
-
-	if (t->rcu_read_lock_nesting == 0)
-		return;
-	t->rcu_read_lock_nesting = 1;
-	__rcu_read_unlock();
-}
-
 #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
 
 #ifdef CONFIG_RCU_TRACE
+242 -15
kernel/rcutorture.c
@@ -64,6 +64,7 @@
 static int fqs_duration;	/* Duration of bursts (us), 0 to disable. */
 static int fqs_holdoff;		/* Hold time within burst (us). */
 static int fqs_stutter = 3;	/* Wait time between bursts (s). */
+static int n_barrier_cbs;	/* Number of callbacks to test RCU barriers. */
 static int onoff_interval;	/* Wait time between CPU hotplugs, 0=disable. */
 static int onoff_holdoff;	/* Seconds after boot before CPU hotplugs. */
 static int shutdown_secs;	/* Shutdown time (s).  <=0 for no shutdown. */
@@ -97,6 +96,8 @@
 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
 module_param(fqs_stutter, int, 0444);
 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(n_barrier_cbs, int, 0444);
+MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
 module_param(onoff_interval, int, 0444);
 MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
 module_param(onoff_holdoff, int, 0444);
@@ -142,6 +139,8 @@
 static struct task_struct *onoff_task;
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static struct task_struct *stall_task;
+static struct task_struct **barrier_cbs_tasks;
+static struct task_struct *barrier_task;
 
 #define RCU_TORTURE_PIPE_LEN 10
 
@@ -169,6 +164,7 @@
 static atomic_t n_rcu_torture_free;
 static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_barrier_error;
 static long n_rcu_torture_boost_ktrerror;
 static long n_rcu_torture_boost_rterror;
 static long n_rcu_torture_boost_failure;
@@ -179,5 +173,7 @@
 static long n_offline_successes;
 static long n_online_attempts;
 static long n_online_successes;
+static long n_barrier_attempts;
+static long n_barrier_successes;
 static struct list_head rcu_torture_removed;
 static cpumask_var_t shuffle_tmp_mask;
@@ -205,5 +197,9 @@
 static unsigned long boost_starttime;	/* jiffies of next boost test start. */
 DEFINE_MUTEX(boost_mutex);		/* protect setting boost_starttime */
 					/*  and boost task create/destroy. */
+static atomic_t barrier_cbs_count;	/* Barrier callbacks registered. */
+static atomic_t barrier_cbs_invoked;	/* Barrier callbacks invoked. */
+static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
+static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
 
 /* Mediate rmmod and system shutdown.  Concurrent rmmod & shutdown illegal! */
@@ -339,6 +327,7 @@
 	int (*completed)(void);
 	void (*deferred_free)(struct rcu_torture *p);
 	void (*sync)(void);
+	void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
 	void (*cb_barrier)(void);
 	void (*fqs)(void);
 	int (*stats)(char *page);
@@ -430,6 +417,7 @@
 	.completed	= rcu_torture_completed,
 	.deferred_free	= rcu_torture_deferred_free,
 	.sync		= synchronize_rcu,
+	.call		= call_rcu,
 	.cb_barrier	= rcu_barrier,
 	.fqs		= rcu_force_quiescent_state,
 	.stats		= NULL,
@@ -474,6 +460,7 @@
 	.completed	= rcu_torture_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= synchronize_rcu,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_force_quiescent_state,
 	.stats		= NULL,
@@ -492,6 +477,7 @@
 	.completed	= rcu_no_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= synchronize_rcu_expedited,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_force_quiescent_state,
 	.stats		= NULL,
@@ -535,6 +519,7 @@
 	.completed	= rcu_bh_torture_completed,
 	.deferred_free	= rcu_bh_torture_deferred_free,
 	.sync		= synchronize_rcu_bh,
+	.call		= call_rcu_bh,
 	.cb_barrier	= rcu_barrier_bh,
 	.fqs		= rcu_bh_force_quiescent_state,
 	.stats		= NULL,
@@ -552,6 +535,7 @@
 	.completed	= rcu_bh_torture_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= synchronize_rcu_bh,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_bh_force_quiescent_state,
 	.stats		= NULL,
@@ -569,6 +551,7 @@
 	.completed	= rcu_bh_torture_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= synchronize_rcu_bh_expedited,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_bh_force_quiescent_state,
 	.stats		= NULL,
@@ -625,6 +606,11 @@
 	return srcu_batches_completed(&srcu_ctl);
 }
 
+static void srcu_torture_deferred_free(struct rcu_torture *rp)
+{
+	call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+}
+
 static void srcu_torture_synchronize(void)
 {
 	synchronize_srcu(&srcu_ctl);
@@ -644,7 +620,7 @@
 	cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
 		       torture_type, TORTURE_FLAG, idx);
 	for_each_possible_cpu(cpu) {
-		cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
+		cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,
 			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
 			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
 	}
@@ -659,11 +635,27 @@
 	.read_delay	= srcu_read_delay,
 	.readunlock	= srcu_torture_read_unlock,
 	.completed	= srcu_torture_completed,
-	.deferred_free	= rcu_sync_torture_deferred_free,
+	.deferred_free	= srcu_torture_deferred_free,
 	.sync		= srcu_torture_synchronize,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.stats		= srcu_torture_stats,
 	.name		= "srcu"
+};
+
+static struct rcu_torture_ops srcu_sync_ops = {
+	.init		= srcu_torture_init,
+	.cleanup	= srcu_torture_cleanup,
+	.readlock	= srcu_torture_read_lock,
+	.read_delay	= srcu_read_delay,
+	.readunlock	= srcu_torture_read_unlock,
+	.completed	= srcu_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= srcu_torture_synchronize,
+	.call		= NULL,
+	.cb_barrier	= NULL,
+	.stats		= srcu_torture_stats,
+	.name		= "srcu_sync"
 };
 
 static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
@@ -699,11 +659,27 @@
 	.read_delay	= srcu_read_delay,
 	.readunlock	= srcu_torture_read_unlock_raw,
 	.completed	= srcu_torture_completed,
-	.deferred_free	= rcu_sync_torture_deferred_free,
+	.deferred_free	= srcu_torture_deferred_free,
 	.sync		= srcu_torture_synchronize,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.stats		= srcu_torture_stats,
 	.name		= "srcu_raw"
+};
+
+static struct rcu_torture_ops srcu_raw_sync_ops = {
+	.init		= srcu_torture_init,
+	.cleanup	= srcu_torture_cleanup,
+	.readlock	= srcu_torture_read_lock_raw,
+	.read_delay	= srcu_read_delay,
+	.readunlock	= srcu_torture_read_unlock_raw,
+	.completed	= srcu_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= srcu_torture_synchronize,
+	.call		= NULL,
+	.cb_barrier	= NULL,
+	.stats		= srcu_torture_stats,
+	.name		= "srcu_raw_sync"
 };
 
 static void srcu_torture_synchronize_expedited(void)
@@ -736,6 +680,7 @@
 	.completed	= srcu_torture_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= srcu_torture_synchronize_expedited,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.stats		= srcu_torture_stats,
 	.name		= "srcu_expedited"
@@ -1186,7 +1129,8 @@
 		       "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
 		       "rtmbe: %d rtbke: %ld rtbre: %ld "
 		       "rtbf: %ld rtb: %ld nt: %ld "
-		       "onoff: %ld/%ld:%ld/%ld",
+		       "onoff: %ld/%ld:%ld/%ld "
+		       "barrier: %ld/%ld:%ld",
 		       rcu_torture_current,
 		       rcu_torture_current_version,
 		       list_empty(&rcu_torture_freelist),
@@ -1203,14 +1145,17 @@
 		       n_online_successes,
 		       n_online_attempts,
 		       n_offline_successes,
-		       n_offline_attempts);
+		       n_offline_attempts,
+		       n_barrier_successes,
+		       n_barrier_attempts,
+		       n_rcu_torture_barrier_error);
+	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
 	if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+	    n_rcu_torture_barrier_error != 0 ||
 	    n_rcu_torture_boost_ktrerror != 0 ||
 	    n_rcu_torture_boost_rterror != 0 ||
-	    n_rcu_torture_boost_failure != 0)
-		cnt += sprintf(&page[cnt], " !!!");
-	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
-	if (i > 1) {
+	    n_rcu_torture_boost_failure != 0 ||
+	    i > 1) {
 		cnt += sprintf(&page[cnt], "!!! ");
 		atomic_inc(&n_rcu_torture_error);
 		WARN_ON_ONCE(1);
@@ -1398,6 +1337,7 @@
 
 	/* This must be outside of the mutex, otherwise deadlock! */
 	kthread_stop(t);
+	boost_tasks[cpu] = NULL;
 }
 
 static int rcutorture_booster_init(int cpu)
@@ -1546,13 +1484,15 @@
 		return;
 	VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
 	kthread_stop(onoff_task);
+	onoff_task = NULL;
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 
-static void
+static int
 rcu_torture_onoff_init(void)
 {
+	return 0;
 }
 
 static void rcu_torture_onoff_cleanup(void)
@@ -1618,4 +1554,51 @@
 		return;
 	VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
 	kthread_stop(stall_task);
+	stall_task = NULL;
 }
+
+/* Callback function for RCU barrier testing. */
+void rcu_torture_barrier_cbf(struct rcu_head *rcu)
+{
+	atomic_inc(&barrier_cbs_invoked);
+}
+
+/* kthread function to register callbacks used to test RCU barriers. */
+static int rcu_torture_barrier_cbs(void *arg)
+{
+	long myid = (long)arg;
+	struct rcu_head rcu;
+
+	init_rcu_head_on_stack(&rcu);
+	VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
+	set_user_nice(current, 19);
+	do {
+		wait_event(barrier_cbs_wq[myid],
+			   atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
+			   kthread_should_stop() ||
+			   fullstop != FULLSTOP_DONTSTOP);
+		if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+			break;
+		cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+		if (atomic_dec_and_test(&barrier_cbs_count))
+			wake_up(&barrier_wq);
+	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+	VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
+	rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
+	while (!kthread_should_stop())
+		schedule_timeout_interruptible(1);
+	cur_ops->cb_barrier();
+	destroy_rcu_head_on_stack(&rcu);
+	return 0;
+}
+
+/* kthread function to drive and coordinate RCU barrier testing. */
+static int rcu_torture_barrier(void *arg)
+{
+	int i;
+
+	VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
+	do {
+		atomic_set(&barrier_cbs_invoked, 0);
+		atomic_set(&barrier_cbs_count, n_barrier_cbs);
+		/* wake_up() path contains the required barriers.
*/ 1605 + for (i = 0; i < n_barrier_cbs; i++) 1606 + wake_up(&barrier_cbs_wq[i]); 1607 + wait_event(barrier_wq, 1608 + atomic_read(&barrier_cbs_count) == 0 || 1609 + kthread_should_stop() || 1610 + fullstop != FULLSTOP_DONTSTOP); 1611 + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1612 + break; 1613 + n_barrier_attempts++; 1614 + cur_ops->cb_barrier(); 1615 + if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { 1616 + n_rcu_torture_barrier_error++; 1617 + WARN_ON_ONCE(1); 1618 + } 1619 + n_barrier_successes++; 1620 + schedule_timeout_interruptible(HZ / 10); 1621 + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1622 + VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); 1623 + rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); 1624 + while (!kthread_should_stop()) 1625 + schedule_timeout_interruptible(1); 1626 + return 0; 1627 + } 1628 + 1629 + /* Initialize RCU barrier testing. */ 1630 + static int rcu_torture_barrier_init(void) 1631 + { 1632 + int i; 1633 + int ret; 1634 + 1635 + if (n_barrier_cbs == 0) 1636 + return 0; 1637 + if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { 1638 + printk(KERN_ALERT "%s" TORTURE_FLAG 1639 + " Call or barrier ops missing for %s,\n", 1640 + torture_type, cur_ops->name); 1641 + printk(KERN_ALERT "%s" TORTURE_FLAG 1642 + " RCU barrier testing omitted from run.\n", 1643 + torture_type); 1644 + return 0; 1645 + } 1646 + atomic_set(&barrier_cbs_count, 0); 1647 + atomic_set(&barrier_cbs_invoked, 0); 1648 + barrier_cbs_tasks = 1649 + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), 1650 + GFP_KERNEL); 1651 + barrier_cbs_wq = 1652 + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), 1653 + GFP_KERNEL); 1654 + if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) 1655 + return -ENOMEM; 1656 + for (i = 0; i < n_barrier_cbs; i++) { 1657 + init_waitqueue_head(&barrier_cbs_wq[i]); 1658 + barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, 1659 + (void *)(long)i, 1660 + 
"rcu_torture_barrier_cbs"); 1661 + if (IS_ERR(barrier_cbs_tasks[i])) { 1662 + ret = PTR_ERR(barrier_cbs_tasks[i]); 1663 + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); 1664 + barrier_cbs_tasks[i] = NULL; 1665 + return ret; 1666 + } 1667 + } 1668 + barrier_task = kthread_run(rcu_torture_barrier, NULL, 1669 + "rcu_torture_barrier"); 1670 + if (IS_ERR(barrier_task)) { 1671 + ret = PTR_ERR(barrier_task); 1672 + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); 1673 + barrier_task = NULL; 1674 + } 1675 + return 0; 1676 + } 1677 + 1678 + /* Clean up after RCU barrier testing. */ 1679 + static void rcu_torture_barrier_cleanup(void) 1680 + { 1681 + int i; 1682 + 1683 + if (barrier_task != NULL) { 1684 + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); 1685 + kthread_stop(barrier_task); 1686 + barrier_task = NULL; 1687 + } 1688 + if (barrier_cbs_tasks != NULL) { 1689 + for (i = 0; i < n_barrier_cbs; i++) { 1690 + if (barrier_cbs_tasks[i] != NULL) { 1691 + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); 1692 + kthread_stop(barrier_cbs_tasks[i]); 1693 + barrier_cbs_tasks[i] = NULL; 1694 + } 1695 + } 1696 + kfree(barrier_cbs_tasks); 1697 + barrier_cbs_tasks = NULL; 1698 + } 1699 + if (barrier_cbs_wq != NULL) { 1700 + kfree(barrier_cbs_wq); 1701 + barrier_cbs_wq = NULL; 1702 + } 1621 1703 } 1622 1704 1623 1705 static int rcutorture_cpu_notify(struct notifier_block *self, ··· 1808 1598 fullstop = FULLSTOP_RMMOD; 1809 1599 mutex_unlock(&fullstop_mutex); 1810 1600 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1601 + rcu_torture_barrier_cleanup(); 1811 1602 rcu_torture_stall_cleanup(); 1812 1603 if (stutter_task) { 1813 1604 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); ··· 1876 1665 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); 1877 1666 kthread_stop(shutdown_task); 1878 1667 } 1668 + shutdown_task = NULL; 1879 1669 rcu_torture_onoff_cleanup(); 1880 1670 1881 1671 /* Wait for all 
RCU callbacks to fire. */ ··· 1888 1676 1889 1677 if (cur_ops->cleanup) 1890 1678 cur_ops->cleanup(); 1891 - if (atomic_read(&n_rcu_torture_error)) 1679 + if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1892 1680 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1893 1681 else if (n_online_successes != n_online_attempts || 1894 1682 n_offline_successes != n_offline_attempts) ··· 1904 1692 int i; 1905 1693 int cpu; 1906 1694 int firsterr = 0; 1695 + int retval; 1907 1696 static struct rcu_torture_ops *torture_ops[] = 1908 1697 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1909 1698 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1910 - &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, 1699 + &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, 1700 + &srcu_raw_sync_ops, &srcu_expedited_ops, 1911 1701 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1912 1702 1913 1703 mutex_lock(&fullstop_mutex); ··· 1963 1749 atomic_set(&n_rcu_torture_free, 0); 1964 1750 atomic_set(&n_rcu_torture_mberror, 0); 1965 1751 atomic_set(&n_rcu_torture_error, 0); 1752 + n_rcu_torture_barrier_error = 0; 1966 1753 n_rcu_torture_boost_ktrerror = 0; 1967 1754 n_rcu_torture_boost_rterror = 0; 1968 1755 n_rcu_torture_boost_failure = 0; ··· 2087 1872 test_boost_duration = 2; 2088 1873 if ((test_boost == 1 && cur_ops->can_boost) || 2089 1874 test_boost == 2) { 2090 - int retval; 2091 1875 2092 1876 boost_starttime = jiffies + test_boost_interval * HZ; 2093 1877 register_cpu_notifier(&rcutorture_cpu_nb); ··· 2111 1897 goto unwind; 2112 1898 } 2113 1899 } 2114 - rcu_torture_onoff_init(); 1900 + i = rcu_torture_onoff_init(); 1901 + if (i != 0) { 1902 + firsterr = i; 1903 + goto unwind; 1904 + } 2115 1905 register_reboot_notifier(&rcutorture_shutdown_nb); 2116 - rcu_torture_stall_init(); 1906 + i = rcu_torture_stall_init(); 1907 + if (i != 0) { 1908 + firsterr = i; 1909 + goto unwind; 1910 + } 1911 + retval = rcu_torture_barrier_init(); 1912 + if (retval != 0) { 1913 
+ firsterr = retval; 1914 + goto unwind; 1915 + } 2117 1916 rcutorture_record_test_transition(); 2118 1917 mutex_unlock(&fullstop_mutex); 2119 1918 return 0;
kernel/rcutree.c  +244 -88
···
 	.gpnum = -300, \
 	.completed = -300, \
 	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
+	.orphan_nxttail = &structname##_state.orphan_nxtlist, \
+	.orphan_donetail = &structname##_state.orphan_donelist, \
 	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
 	.n_force_qs = 0, \
 	.n_force_qs_ngp = 0, \
···
 unsigned long rcutorture_testseq;
 unsigned long rcutorture_vernum;
 
+/* State information for rcu_barrier() and friends. */
+
+static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
+static atomic_t rcu_barrier_cpu_count;
+static DEFINE_MUTEX(rcu_barrier_mutex);
+static struct completion rcu_barrier_completion;
+
 /*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
···
 {
 	trace_rcu_utilization("Start context switch");
 	rcu_sched_qs(cpu);
-	rcu_preempt_note_context_switch(cpu);
 	trace_rcu_utilization("End context switch");
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
···
 #ifdef CONFIG_HOTPLUG_CPU
 
 /*
- * Move a dying CPU's RCU callbacks to online CPU's callback list.
- * Also record a quiescent state for this CPU for the current grace period.
- * Synchronization and interrupt disabling are not required because
- * this function executes in stop_machine() context.  Therefore, cleanup
- * operations that might block must be done later from the CPU_DEAD
- * notifier.
- *
- * Note that the outgoing CPU's bit has already been cleared in the
- * cpu_online_mask.  This allows us to randomly pick a callback
- * destination from the bits set in that mask.
+ * Send the specified CPU's RCU callbacks to the orphanage.  The
+ * specified CPU must be offline, and the caller must hold the
+ * ->onofflock.
  */
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+static void
+rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
+			  struct rcu_node *rnp, struct rcu_data *rdp)
 {
 	int i;
-	unsigned long mask;
-	int receive_cpu = cpumask_any(cpu_online_mask);
-	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
-	struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
-	RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
 
-	/* First, adjust the counts. */
+	/*
+	 * Orphan the callbacks.  First adjust the counts.  This is safe
+	 * because ->onofflock excludes _rcu_barrier()'s adoption of
+	 * the callbacks, thus no memory barrier is required.
+	 */
 	if (rdp->nxtlist != NULL) {
-		receive_rdp->qlen_lazy += rdp->qlen_lazy;
-		receive_rdp->qlen += rdp->qlen;
+		rsp->qlen_lazy += rdp->qlen_lazy;
+		rsp->qlen += rdp->qlen;
+		rdp->n_cbs_orphaned += rdp->qlen;
 		rdp->qlen_lazy = 0;
 		rdp->qlen = 0;
 	}
 
 	/*
-	 * Next, move ready-to-invoke callbacks to be invoked on some
-	 * other CPU.  These will not be required to pass through another
-	 * grace period:  They are done, regardless of CPU.
+	 * Next, move those callbacks still needing a grace period to
+	 * the orphanage, where some other CPU will pick them up.
+	 * Some of the callbacks might have gone partway through a grace
+	 * period, but that is too bad.  They get to start over because we
+	 * cannot assume that grace periods are synchronized across CPUs.
+	 * We don't bother updating the ->nxttail[] array yet, instead
+	 * we just reset the whole thing later on.
 	 */
-	if (rdp->nxtlist != NULL &&
-	    rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) {
-		struct rcu_head *oldhead;
-		struct rcu_head **oldtail;
-		struct rcu_head **newtail;
-
-		oldhead = rdp->nxtlist;
-		oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
-		rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
-		*rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
-		*receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
-		newtail = rdp->nxttail[RCU_DONE_TAIL];
-		for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
-			if (receive_rdp->nxttail[i] == oldtail)
-				receive_rdp->nxttail[i] = newtail;
-			if (rdp->nxttail[i] == newtail)
-				rdp->nxttail[i] = &rdp->nxtlist;
-		}
+	if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
+		*rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
+		rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
+		*rdp->nxttail[RCU_DONE_TAIL] = NULL;
 	}
 
 	/*
-	 * Finally, put the rest of the callbacks at the end of the list.
-	 * The ones that made it partway through get to start over:  We
-	 * cannot assume that grace periods are synchronized across CPUs.
-	 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
-	 * this does not seem compelling.  Not yet, anyway.)
+	 * Then move the ready-to-invoke callbacks to the orphanage,
+	 * where some other CPU will pick them up.  These will not be
+	 * required to pass though another grace period: They are done.
 	 */
 	if (rdp->nxtlist != NULL) {
-		*receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
-		receive_rdp->nxttail[RCU_NEXT_TAIL] =
-				rdp->nxttail[RCU_NEXT_TAIL];
-		receive_rdp->n_cbs_adopted += rdp->qlen;
-		rdp->n_cbs_orphaned += rdp->qlen;
-
-		rdp->nxtlist = NULL;
-		for (i = 0; i < RCU_NEXT_SIZE; i++)
-			rdp->nxttail[i] = &rdp->nxtlist;
+		*rsp->orphan_donetail = rdp->nxtlist;
+		rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
 	}
 
+	/* Finally, initialize the rcu_data structure's list to empty.  */
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+}
+
+/*
+ * Adopt the RCU callbacks from the specified rcu_state structure's
+ * orphanage.  The caller must hold the ->onofflock.
+ */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+	int i;
+	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+
 	/*
-	 * Record a quiescent state for the dying CPU.  This is safe
-	 * only because we have already cleared out the callbacks.
-	 * (Otherwise, the RCU core might try to schedule the invocation
-	 * of callbacks on this now-offline CPU, which would be bad.)
+	 * If there is an rcu_barrier() operation in progress, then
+	 * only the task doing that operation is permitted to adopt
+	 * callbacks.  To do otherwise breaks rcu_barrier() and friends
+	 * by causing them to fail to wait for the callbacks in the
+	 * orphanage.
 	 */
-	mask = rdp->grpmask;	/* rnp->grplo is constant. */
+	if (rsp->rcu_barrier_in_progress &&
+	    rsp->rcu_barrier_in_progress != current)
+		return;
+
+	/* Do the accounting first. */
+	rdp->qlen_lazy += rsp->qlen_lazy;
+	rdp->qlen += rsp->qlen;
+	rdp->n_cbs_adopted += rsp->qlen;
+	rsp->qlen_lazy = 0;
+	rsp->qlen = 0;
+
+	/*
+	 * We do not need a memory barrier here because the only way we
+	 * can get here if there is an rcu_barrier() in flight is if
+	 * we are the task doing the rcu_barrier().
+	 */
+
+	/* First adopt the ready-to-invoke callbacks. */
+	if (rsp->orphan_donelist != NULL) {
+		*rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
+		*rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
+		for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
+			if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+				rdp->nxttail[i] = rsp->orphan_donetail;
+		rsp->orphan_donelist = NULL;
+		rsp->orphan_donetail = &rsp->orphan_donelist;
+	}
+
+	/* And then adopt the callbacks that still need a grace period. */
+	if (rsp->orphan_nxtlist != NULL) {
+		*rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
+		rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
+		rsp->orphan_nxtlist = NULL;
+		rsp->orphan_nxttail = &rsp->orphan_nxtlist;
+	}
+}
+
+/*
+ * Trace the fact that this CPU is going offline.
+ */
+static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+{
+	RCU_TRACE(unsigned long mask);
+	RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
+	RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+
+	RCU_TRACE(mask = rdp->grpmask);
 	trace_rcu_grace_period(rsp->name,
 			       rnp->gpnum + 1 - !!(rnp->qsmask & mask),
 			       "cpuofl");
-	rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
-	/* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
 }
 
 /*
  * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context.  Do the remainder of the cleanup.
+ * this fact from process context.  Do the remainder of the cleanup,
+ * including orphaning the outgoing CPU's RCU callbacks, and also
+ * adopting them, if there is no _rcu_barrier() instance running.
  * There can only be one CPU hotplug operation at a time, so no other
  * CPU can be attempting to update rcu_cpu_kthread_task.
  */
···
 	unsigned long mask;
 	int need_report = 0;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rnp. */
+	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
 	/* Adjust any no-longer-needed kthreads. */
 	rcu_stop_cpu_kthread(cpu);
 	rcu_node_kthread_setaffinity(rnp, -1);
 
-	/* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */
+	/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
 
 	/* Exclude any attempts to start a new grace period. */
 	raw_spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
+	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
+	rcu_adopt_orphan_cbs(rsp);
 
 	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
 	mask = rdp->grpmask;	/* rnp->grplo is constant. */
···
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+}
 
 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 {
···
 			rcu_is_callbacks_kthread());
 
 	/* Update count, and requeue any remaining callbacks. */
-	rdp->qlen_lazy -= count_lazy;
-	rdp->qlen -= count;
-	rdp->n_cbs_invoked += count;
 	if (list != NULL) {
 		*tail = rdp->nxtlist;
 		rdp->nxtlist = list;
···
 		else
 			break;
 	}
+	smp_mb(); /* List handling before counting for rcu_barrier(). */
+	rdp->qlen_lazy -= count_lazy;
+	rdp->qlen -= count;
+	rdp->n_cbs_invoked += count;
 
 	/* Reinstate batch limit if we have worked down the excess. */
 	if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
···
 	rdp = this_cpu_ptr(rsp->rda);
 
 	/* Add the callback to our list. */
-	*rdp->nxttail[RCU_NEXT_TAIL] = head;
-	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
 	rdp->qlen++;
 	if (lazy)
 		rdp->qlen_lazy++;
+	else
+		rcu_idle_count_callbacks_posted();
+	smp_mb();  /* Count before adding callback for rcu_barrier(). */
+	*rdp->nxttail[RCU_NEXT_TAIL] = head;
+	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
 
 	if (__is_kfree_rcu_offset((unsigned long)func))
 		trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
···
 	__call_rcu(head, func, &rcu_bh_state, 0);
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Because a context switch is a grace period for RCU-sched and RCU-bh,
+ * any blocking grace-period wait automatically implies a grace period
+ * if there is only one CPU online at any point time during execution
+ * of either synchronize_sched() or synchronize_rcu_bh().  It is OK to
+ * occasionally incorrectly indicate that there are multiple CPUs online
+ * when there was in fact only one the whole time, as this just adds
+ * some overhead: RCU still operates correctly.
+ *
+ * Of course, sampling num_online_cpus() with preemption enabled can
+ * give erroneous results if there are concurrent CPU-hotplug operations.
+ * For example, given a demonic sequence of preemptions in num_online_cpus()
+ * and CPU-hotplug operations, there could be two or more CPUs online at
+ * all times, but num_online_cpus() might well return one (or even zero).
+ *
+ * However, all such demonic sequences require at least one CPU-offline
+ * operation.  Furthermore, rcu_blocking_is_gp() giving the wrong answer
+ * is only a problem if there is an RCU read-side critical section executing
+ * throughout.  But RCU-sched and RCU-bh read-side critical sections
+ * disable either preemption or bh, which prevents a CPU from going offline.
+ * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
+ * that there is only one CPU when in fact there was more than one throughout
+ * is when there were no RCU readers in the system.  If there are no
+ * RCU readers, the grace period by definition can be of zero length,
+ * regardless of the number of online CPUs.
+ */
+static inline int rcu_blocking_is_gp(void)
+{
+	might_sleep();  /* Check for RCU read-side critical section. */
+	return num_online_cpus() <= 1;
+}
 
 /**
  * synchronize_sched - wait until an rcu-sched grace period has elapsed.
···
 	       rcu_preempt_cpu_has_callbacks(cpu);
 }
 
-static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
-
+/*
+ * RCU callback function for _rcu_barrier().  If we are last, wake
+ * up the task executing _rcu_barrier().
+ */
 static void rcu_barrier_callback(struct rcu_head *notused)
 {
 	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
···
 			 void (*call_rcu_func)(struct rcu_head *head,
 					       void (*func)(struct rcu_head *head)))
 {
-	BUG_ON(in_interrupt());
+	int cpu;
+	unsigned long flags;
+	struct rcu_data *rdp;
+	struct rcu_head rh;
+
+	init_rcu_head_on_stack(&rh);
+
 	/* Take mutex to serialize concurrent rcu_barrier() requests. */
 	mutex_lock(&rcu_barrier_mutex);
-	init_completion(&rcu_barrier_completion);
+
+	smp_mb();  /* Prevent any prior operations from leaking in. */
+
 	/*
-	 * Initialize rcu_barrier_cpu_count to 1, then invoke
-	 * rcu_barrier_func() on each CPU, so that each CPU also has
-	 * incremented rcu_barrier_cpu_count.  Only then is it safe to
-	 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
-	 * might complete its grace period before all of the other CPUs
-	 * did their increment, causing this function to return too
-	 * early.  Note that on_each_cpu() disables irqs, which prevents
-	 * any CPUs from coming online or going offline until each online
-	 * CPU has queued its RCU-barrier callback.
+	 * Initialize the count to one rather than to zero in order to
+	 * avoid a too-soon return to zero in case of a short grace period
+	 * (or preemption of this task).  Also flag this task as doing
+	 * an rcu_barrier().  This will prevent anyone else from adopting
+	 * orphaned callbacks, which could cause otherwise failure if a
+	 * CPU went offline and quickly came back online.  To see this,
+	 * consider the following sequence of events:
+	 *
+	 * 1.	We cause CPU 0 to post an rcu_barrier_callback() callback.
+	 * 2.	CPU 1 goes offline, orphaning its callbacks.
+	 * 3.	CPU 0 adopts CPU 1's orphaned callbacks.
+	 * 4.	CPU 1 comes back online.
+	 * 5.	We cause CPU 1 to post an rcu_barrier_callback() callback.
+	 * 6.	Both rcu_barrier_callback() callbacks are invoked, awakening
+	 *	us -- but before CPU 1's orphaned callbacks are invoked!!!
 	 */
+	init_completion(&rcu_barrier_completion);
 	atomic_set(&rcu_barrier_cpu_count, 1);
-	on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
+	raw_spin_lock_irqsave(&rsp->onofflock, flags);
+	rsp->rcu_barrier_in_progress = current;
+	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+
+	/*
+	 * Force every CPU with callbacks to register a new callback
+	 * that will tell us when all the preceding callbacks have
+	 * been invoked.  If an offline CPU has callbacks, wait for
+	 * it to either come back online or to finish orphaning those
+	 * callbacks.
+	 */
+	for_each_possible_cpu(cpu) {
+		preempt_disable();
+		rdp = per_cpu_ptr(rsp->rda, cpu);
+		if (cpu_is_offline(cpu)) {
+			preempt_enable();
+			while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
+				schedule_timeout_interruptible(1);
+		} else if (ACCESS_ONCE(rdp->qlen)) {
+			smp_call_function_single(cpu, rcu_barrier_func,
+						 (void *)call_rcu_func, 1);
+			preempt_enable();
+		} else {
+			preempt_enable();
+		}
+	}
+
+	/*
+	 * Now that all online CPUs have rcu_barrier_callback() callbacks
+	 * posted, we can adopt all of the orphaned callbacks and place
+	 * an rcu_barrier_callback() callback after them.  When that is done,
+	 * we are guaranteed to have an rcu_barrier_callback() callback
+	 * following every callback that could possibly have been
+	 * registered before _rcu_barrier() was called.
+	 */
+	raw_spin_lock_irqsave(&rsp->onofflock, flags);
+	rcu_adopt_orphan_cbs(rsp);
+	rsp->rcu_barrier_in_progress = NULL;
+	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+	atomic_inc(&rcu_barrier_cpu_count);
+	smp_mb__after_atomic_inc();  /* Ensure atomic_inc() before callback. */
+	call_rcu_func(&rh, rcu_barrier_callback);
+
+	/*
+	 * Now that we have an rcu_barrier_callback() callback on each
+	 * CPU, and thus each counted, remove the initial count.
+	 */
 	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
 		complete(&rcu_barrier_completion);
+
+	/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
 	wait_for_completion(&rcu_barrier_completion);
+
+	/* Other rcu_barrier() invocations can now safely proceed. */
 	mutex_unlock(&rcu_barrier_mutex);
+
+	destroy_rcu_head_on_stack(&rh);
 }
 
 /**
···
 
 	for (i = NUM_RCU_LVLS - 1; i > 0; i--)
 		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
-	rsp->levelspread[0] = RCU_FANOUT_LEAF;
+	rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
 }
 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
 static void __init rcu_init_levelspread(struct rcu_state *rsp)
kernel/rcutree.h  +15 -8
···
 #include <linux/seqlock.h>
 
 /*
- * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
+ * CONFIG_RCU_FANOUT_LEAF.
  * In theory, it should be possible to add more levels straightforwardly.
  * In practice, this did work well going from three levels to four.
  * Of course, your mileage may vary.
  */
 #define MAX_RCU_LVLS 4
-#if CONFIG_RCU_FANOUT > 16
-#define RCU_FANOUT_LEAF       16
-#else /* #if CONFIG_RCU_FANOUT > 16 */
-#define RCU_FANOUT_LEAF       (CONFIG_RCU_FANOUT)
-#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
-#define RCU_FANOUT_1	      (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_1	      (CONFIG_RCU_FANOUT_LEAF)
 #define RCU_FANOUT_2	      (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
 #define RCU_FANOUT_3	      (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
 #define RCU_FANOUT_4	      (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
···
 
 	raw_spinlock_t onofflock;		/* exclude on/offline and */
 						/*  starting new GP. */
+	struct rcu_head *orphan_nxtlist;	/* Orphaned callbacks that */
+						/*  need a grace period. */
+	struct rcu_head **orphan_nxttail;	/* Tail of above. */
+	struct rcu_head *orphan_donelist;	/* Orphaned callbacks that */
+						/*  are ready to invoke. */
+	struct rcu_head **orphan_donetail;	/* Tail of above. */
+	long qlen_lazy;				/* Number of lazy callbacks. */
+	long qlen;				/* Total number of callbacks. */
+	struct task_struct *rcu_barrier_in_progress;
+						/* Task doing rcu_barrier(), */
+						/*  or NULL if no barrier. */
 	raw_spinlock_t fqslock;			/* Only one task forcing */
 						/*  quiescent states. */
 	unsigned long jiffies_force_qs;		/* Time at which to invoke */
···
 /* Forward declarations for rcutree_plugin.h */
 static void rcu_bootup_announce(void);
 long rcu_batches_completed(void);
-static void rcu_preempt_note_context_switch(int cpu);
 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
···
 static void rcu_prepare_for_idle_init(int cpu);
 static void rcu_cleanup_after_idle(int cpu);
 static void rcu_prepare_for_idle(int cpu);
+static void rcu_idle_count_callbacks_posted(void);
 static void print_cpu_stall_info_begin(void);
 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
 static void print_cpu_stall_info_end(void);
+99 -55
kernel/rcutree_plugin.h
···
  *
  * Caller must disable preemption.
  */
-static void rcu_preempt_note_context_switch(int cpu)
+void rcu_preempt_note_context_switch(void)
 {
 	struct task_struct *t = current;
 	unsigned long flags;
···
 	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
 
 		/* Possibly blocking in an RCU read-side critical section. */
-		rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
+		rdp = __this_cpu_ptr(rcu_preempt_state.rda);
 		rnp = rdp->mynode;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
···
 	 * means that we continue to block the current grace period.
 	 */
 	local_irq_save(flags);
-	rcu_preempt_qs(cpu);
+	rcu_preempt_qs(smp_processor_id());
 	local_irq_restore(flags);
 }
 
···
 	rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
 }
 
-/*
- * Check for a task exiting while in a preemptible-RCU read-side
- * critical section, clean up if so.  No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
-	struct task_struct *t = current;
-
-	if (t->rcu_read_lock_nesting == 0)
-		return;
-	t->rcu_read_lock_nesting = 1;
-	__rcu_read_unlock();
-}
-
 #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 
 static struct rcu_state *rcu_state = &rcu_sched_state;
···
 	rcu_sched_force_quiescent_state();
 }
 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * CPUs being in quiescent states.
- */
-static void rcu_preempt_note_context_switch(int cpu)
-{
-}
 
 /*
  * Because preemptible RCU does not exist, there are never any preempted
···
 {
 }
 
+/*
+ * Don't bother keeping a running count of the number of RCU callbacks
+ * posted because CONFIG_RCU_FAST_NO_HZ=n.
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+}
+
 #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
 
 /*
···
 #define RCU_IDLE_GP_DELAY 6		/* Roughly one grace period. */
 #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */
 
+/* Loop counter for rcu_prepare_for_idle(). */
 static DEFINE_PER_CPU(int, rcu_dyntick_drain);
+/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
 static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
-static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
-static ktime_t rcu_idle_gp_wait;	/* If some non-lazy callbacks. */
-static ktime_t rcu_idle_lazy_gp_wait;	/* If only lazy callbacks. */
+/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
+static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
+/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
+static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
+/* Enable special processing on first attempt to enter dyntick-idle mode. */
+static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
+/* Running count of non-lazy callbacks posted, never decremented. */
+static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
+/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
+static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
 
 /*
  * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
···
  */
 int rcu_needs_cpu(int cpu)
 {
+	/* Flag a new idle sojourn to the idle-entry state machine. */
+	per_cpu(rcu_idle_first_pass, cpu) = 1;
 	/* If no callbacks, RCU doesn't need the CPU. */
 	if (!rcu_cpu_has_callbacks(cpu))
 		return 0;
···
 }
 
 /*
+ * Handler for smp_call_function_single().  The only point of this
+ * handler is to wake the CPU up, so the handler does only tracing.
+ */
+void rcu_idle_demigrate(void *unused)
+{
+	trace_rcu_prep_idle("Demigrate");
+}
+
+/*
  * Timer handler used to force CPU to start pushing its remaining RCU
  * callbacks in the case where it entered dyntick-idle mode with callbacks
  * pending.  The handler doesn't really need to do anything because the
  * real work is done upon re-entry to idle, or by the next scheduling-clock
  * interrupt should idle not be re-entered.
+ *
+ * One special case: the timer gets migrated without awakening the CPU
+ * on which the timer was scheduled.  In this case, we must wake up
+ * that CPU.  We do so with smp_call_function_single().
  */
-static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
+static void rcu_idle_gp_timer_func(unsigned long cpu_in)
 {
+	int cpu = (int)cpu_in;
+
 	trace_rcu_prep_idle("Timer");
-	return HRTIMER_NORESTART;
+	if (cpu != smp_processor_id())
+		smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
+	else
+		WARN_ON_ONCE(1); /* Getting here can hang the system... */
 }
 
 /*
···
  */
 static void rcu_prepare_for_idle_init(int cpu)
 {
-	static int firsttime = 1;
-	struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
-
-	hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hrtp->function = rcu_idle_gp_timer_func;
-	if (firsttime) {
-		unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
-
-		rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
-		upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
-		rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
-		firsttime = 0;
-	}
+	per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
+	setup_timer(&per_cpu(rcu_idle_gp_timer, cpu),
+		    rcu_idle_gp_timer_func, cpu);
+	per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1;
+	per_cpu(rcu_idle_first_pass, cpu) = 1;
 }
 
 /*
···
  */
 static void rcu_cleanup_after_idle(int cpu)
 {
-	hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
+	del_timer(&per_cpu(rcu_idle_gp_timer, cpu));
+	trace_rcu_prep_idle("Cleanup after idle");
 }
 
 /*
···
  */
 static void rcu_prepare_for_idle(int cpu)
 {
+	struct timer_list *tp;
+
+	/*
+	 * If this is an idle re-entry, for example, due to use of
+	 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
+	 * loop, then don't take any state-machine actions, unless the
+	 * momentary exit from idle queued additional non-lazy callbacks.
+	 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks
+	 * pending.
+	 */
+	if (!per_cpu(rcu_idle_first_pass, cpu) &&
+	    (per_cpu(rcu_nonlazy_posted, cpu) ==
+	     per_cpu(rcu_nonlazy_posted_snap, cpu))) {
+		if (rcu_cpu_has_callbacks(cpu)) {
+			tp = &per_cpu(rcu_idle_gp_timer, cpu);
+			mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
+		}
+		return;
+	}
+	per_cpu(rcu_idle_first_pass, cpu) = 0;
+	per_cpu(rcu_nonlazy_posted_snap, cpu) =
+		per_cpu(rcu_nonlazy_posted, cpu) - 1;
+
 	/*
 	 * If there are no callbacks on this CPU, enter dyntick-idle mode.
 	 * Also reset state to avoid prejudicing later attempts.
···
 		per_cpu(rcu_dyntick_drain, cpu) = 0;
 		per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
 		if (rcu_cpu_has_nonlazy_callbacks(cpu))
-			hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
-				      rcu_idle_gp_wait, HRTIMER_MODE_REL);
+			per_cpu(rcu_idle_gp_timer_expires, cpu) =
+				jiffies + RCU_IDLE_GP_DELAY;
 		else
-			hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
-				      rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL);
+			per_cpu(rcu_idle_gp_timer_expires, cpu) =
+				jiffies + RCU_IDLE_LAZY_GP_DELAY;
+		tp = &per_cpu(rcu_idle_gp_timer, cpu);
+		mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
+		per_cpu(rcu_nonlazy_posted_snap, cpu) =
+			per_cpu(rcu_nonlazy_posted, cpu);
 		return; /* Nothing more to do immediately. */
 	} else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
 		/* We have hit the limit, so time to give up. */
···
 	trace_rcu_prep_idle("Callbacks drained");
 }
 
+/*
+ * Keep a running count of the number of non-lazy callbacks posted
+ * on this CPU.  This running counter (which is never decremented) allows
+ * rcu_prepare_for_idle() to detect when something out of the idle loop
+ * posts a callback, even if an equal number of callbacks are invoked.
+ * Of course, callbacks should only be posted from within a trace event
+ * designed to be called from idle or from within RCU_NONIDLE().
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+	__this_cpu_add(rcu_nonlazy_posted, 1);
+}
+
 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
 
 #ifdef CONFIG_RCU_CPU_STALL_INFO
···
 
 static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
 {
-	struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
+	struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu);
 
-	sprintf(cp, "drain=%d %c timer=%lld",
+	sprintf(cp, "drain=%d %c timer=%lu",
 		per_cpu(rcu_dyntick_drain, cpu),
 		per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
-		hrtimer_active(hrtp)
-			? ktime_to_us(hrtimer_get_remaining(hrtp))
-			: -1);
+		timer_pending(tltp) ? tltp->expires - jiffies : -1);
 }
 
 #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
kernel/rcutree_trace.c  +2 -2
···
 
 	gpnum = rsp->gpnum;
 	seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
-		   "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
+		   "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
 		   rsp->completed, gpnum, rsp->fqs_state,
 		   (long)(rsp->jiffies_force_qs - jiffies),
 		   (int)(jiffies & 0xffff),
 		   rsp->n_force_qs, rsp->n_force_qs_ngp,
 		   rsp->n_force_qs - rsp->n_force_qs_ngp,
-		   rsp->n_force_qs_lh);
+		   rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
 	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
 		if (rnp->level != level) {
 			seq_puts(m, "\n");
kernel/sched/core.c  +1
···
 #endif
 
 	/* Here we just switch the register state and the stack. */
+	rcu_switch_from(prev);
 	switch_to(prev, next, prev);
 
 	barrier();
kernel/srcu.c  +435 -113
···
 #include <linux/delay.h>
 #include <linux/srcu.h>
 
+/*
+ * Initialize an rcu_batch structure to empty.
+ */
+static inline void rcu_batch_init(struct rcu_batch *b)
+{
+	b->head = NULL;
+	b->tail = &b->head;
+}
+
+/*
+ * Enqueue a callback onto the tail of the specified rcu_batch structure.
+ */
+static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
+{
+	*b->tail = head;
+	b->tail = &head->next;
+}
+
+/*
+ * Is the specified rcu_batch structure empty?
+ */
+static inline bool rcu_batch_empty(struct rcu_batch *b)
+{
+	return b->tail == &b->head;
+}
+
+/*
+ * Remove the callback at the head of the specified rcu_batch structure
+ * and return a pointer to it, or return NULL if the structure is empty.
+ */
+static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
+{
+	struct rcu_head *head;
+
+	if (rcu_batch_empty(b))
+		return NULL;
+
+	head = b->head;
+	b->head = head->next;
+	if (b->tail == &head->next)
+		rcu_batch_init(b);
+
+	return head;
+}
+
+/*
+ * Move all callbacks from the rcu_batch structure specified by "from" to
+ * the structure specified by "to".
+ */
+static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
+{
+	if (!rcu_batch_empty(from)) {
+		*to->tail = from->head;
+		to->tail = from->tail;
+		rcu_batch_init(from);
+	}
+}
+
+/* single-thread state-machine */
+static void process_srcu(struct work_struct *work);
+
 static int init_srcu_struct_fields(struct srcu_struct *sp)
 {
 	sp->completed = 0;
-	mutex_init(&sp->mutex);
+	spin_lock_init(&sp->queue_lock);
+	sp->running = false;
+	rcu_batch_init(&sp->batch_queue);
+	rcu_batch_init(&sp->batch_check0);
+	rcu_batch_init(&sp->batch_check1);
+	rcu_batch_init(&sp->batch_done);
+	INIT_DELAYED_WORK(&sp->work, process_srcu);
 	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
 	return sp->per_cpu_ref ? 0 : -ENOMEM;
 }
···
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /*
- * srcu_readers_active_idx -- returns approximate number of readers
- * active on the specified rank of per-CPU counters.
+ * Returns approximate total of the readers' ->seq[] values for the
+ * rank of per-CPU counters specified by idx.
  */
-
-static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
 {
 	int cpu;
-	int sum;
+	unsigned long sum = 0;
+	unsigned long t;
 
-	sum = 0;
-	for_each_possible_cpu(cpu)
-		sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
+	for_each_possible_cpu(cpu) {
+		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+		sum += t;
+	}
 	return sum;
+}
+
+/*
+ * Returns approximate number of readers active on the specified rank
+ * of the per-CPU ->c[] counters.
+ */
+static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+{
+	int cpu;
+	unsigned long sum = 0;
+	unsigned long t;
+
+	for_each_possible_cpu(cpu) {
+		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+		sum += t;
+	}
+	return sum;
+}
+
+/*
+ * Return true if the number of pre-existing readers is determined to
+ * be stably zero.  An example unstable zero can occur if the call
+ * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
+ * but due to task migration, sees the corresponding __srcu_read_unlock()
+ * decrement.  This can happen because srcu_readers_active_idx() takes
+ * time to sum the array, and might in fact be interrupted or preempted
+ * partway through the summation.
+ */
+static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
+{
+	unsigned long seq;
+
+	seq = srcu_readers_seq_idx(sp, idx);
+
+	/*
+	 * The following smp_mb() A pairs with the smp_mb() B located in
+	 * __srcu_read_lock().  This pairing ensures that if an
+	 * __srcu_read_lock() increments its counter after the summation
+	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
+	 * critical section will see any changes made prior to the start
+	 * of the current SRCU grace period.
+	 *
+	 * Also, if the above call to srcu_readers_seq_idx() saw the
+	 * increment of ->seq[], then the call to srcu_readers_active_idx()
+	 * must see the increment of ->c[].
+	 */
+	smp_mb(); /* A */
+
+	/*
+	 * Note that srcu_readers_active_idx() can incorrectly return
+	 * zero even though there is a pre-existing reader throughout.
+	 * To see this, suppose that task A is in a very long SRCU
+	 * read-side critical section that started on CPU 0, and that
+	 * no other reader exists, so that the sum of the counters
+	 * is equal to one.  Then suppose that task B starts executing
+	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
+	 * task C starts reading on CPU 0, so that its increment is not
+	 * summed, but finishes reading on CPU 2, so that its decrement
+	 * -is- summed.  Then when task B completes its sum, it will
+	 * incorrectly get zero, despite the fact that task A has been
+	 * in its SRCU read-side critical section the whole time.
+	 *
+	 * We therefore do a validation step should srcu_readers_active_idx()
+	 * return zero.
+	 */
+	if (srcu_readers_active_idx(sp, idx) != 0)
+		return false;
+
+	/*
+	 * The remainder of this function is the validation step.
+	 * The following smp_mb() D pairs with the smp_mb() C in
+	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
+	 * by srcu_readers_active_idx() above, then any destructive
+	 * operation performed after the grace period will happen after
+	 * the corresponding SRCU read-side critical section.
+	 *
+	 * Note that there can be at most NR_CPUS worth of readers using
+	 * the old index, which is not enough to overflow even a 32-bit
+	 * integer.  (Yes, this does mean that systems having more than
+	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
+	 * the sum of the ->seq[] counters cannot possibly overflow.
+	 * Therefore, the only way that the return values of the two
+	 * calls to srcu_readers_seq_idx() can be equal is if there were
+	 * no increments of the corresponding rank of ->seq[] counts
+	 * in the interim.  But the missed-increment scenario laid out
+	 * above includes an increment of the ->seq[] counter by
+	 * the corresponding __srcu_read_lock().  Therefore, if this
+	 * scenario occurs, the return values from the two calls to
+	 * srcu_readers_seq_idx() will differ, and thus the validation
+	 * step below suffices.
+	 */
+	smp_mb(); /* D */
+
+	return srcu_readers_seq_idx(sp, idx) == seq;
 }
 
 /**
···
  */
 static int srcu_readers_active(struct srcu_struct *sp)
 {
-	return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
+	int cpu;
+	unsigned long sum = 0;
+
+	for_each_possible_cpu(cpu) {
+		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+	}
+	return sum;
 }
 
 /**
···
 	int idx;
 
 	preempt_disable();
-	idx = sp->completed & 0x1;
-	barrier();  /* ensure compiler looks -once- at sp->completed. */
-	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
-	srcu_barrier();  /* ensure compiler won't misorder critical section. */
+	idx = rcu_dereference_index_check(sp->completed,
+					  rcu_read_lock_sched_held()) & 0x1;
+	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
+	smp_mb(); /* B */  /* Avoid leaking the critical section. */
+	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
 	preempt_enable();
 	return idx;
 }
···
 void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 {
 	preempt_disable();
-	srcu_barrier();  /* ensure compiler won't misorder critical section. */
-	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
+	smp_mb(); /* C */  /* Avoid leaking the critical section. */
+	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
···
  * we repeatedly block for 1-millisecond time periods.  This approach
  * has done well in testing, so there is no need for a config parameter.
  */
-#define SYNCHRONIZE_SRCU_READER_DELAY 10
+#define SRCU_RETRY_CHECK_DELAY		5
+#define SYNCHRONIZE_SRCU_TRYCOUNT	2
+#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT	12
+
+/*
+ * Wait until all pre-existing readers complete.  Such readers
+ * will have used the index specified by "idx".  The caller should
+ * ensure that ->completed is not changed while checking, and that
+ * idx == (->completed & 1) ^ 1.
+ */
+static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
+{
+	for (;;) {
+		if (srcu_readers_active_idx_check(sp, idx))
+			return true;
+		if (--trycount <= 0)
+			return false;
+		udelay(SRCU_RETRY_CHECK_DELAY);
+	}
+}
+
+/*
+ * Increment the ->completed counter so that future SRCU readers will
+ * use the other rank of the ->c[] and ->seq[] arrays.  This allows
+ * us to wait for pre-existing readers in a starvation-free manner.
+ */
+static void srcu_flip(struct srcu_struct *sp)
+{
+	sp->completed++;
+}
+
+/*
+ * Enqueue an SRCU callback on the specified srcu_struct structure,
+ * initiating grace-period processing if it is not already running.
+ */
+void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
+	       void (*func)(struct rcu_head *head))
+{
+	unsigned long flags;
+
+	head->next = NULL;
+	head->func = func;
+	spin_lock_irqsave(&sp->queue_lock, flags);
+	rcu_batch_queue(&sp->batch_queue, head);
+	if (!sp->running) {
+		sp->running = true;
+		queue_delayed_work(system_nrt_wq, &sp->work, 0);
+	}
+	spin_unlock_irqrestore(&sp->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
+
+struct rcu_synchronize {
+	struct rcu_head head;
+	struct completion completion;
+};
+
+/*
+ * Awaken the corresponding synchronize_srcu() instance now that a
+ * grace period has elapsed.
+ */
+static void wakeme_after_rcu(struct rcu_head *head)
+{
+	struct rcu_synchronize *rcu;
+
+	rcu = container_of(head, struct rcu_synchronize, head);
+	complete(&rcu->completion);
+}
+
+static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
+static void srcu_reschedule(struct srcu_struct *sp);
 
 /*
  * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
  */
-static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
+static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
 {
-	int idx;
+	struct rcu_synchronize rcu;
+	struct rcu_head *head = &rcu.head;
+	bool done = false;
 
 	rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
 			   !lock_is_held(&rcu_bh_lock_map) &&
···
 			   !lock_is_held(&rcu_sched_lock_map),
 			   "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
 
-	idx = sp->completed;
-	mutex_lock(&sp->mutex);
+	init_completion(&rcu.completion);
 
-	/*
-	 * Check to see if someone else did the work for us while we were
-	 * waiting to acquire the lock.  We need -two- advances of
-	 * the counter, not just one.  If there was but one, we might have
-	 * shown up -after- our helper's first synchronize_sched(), thus
-	 * having failed to prevent CPU-reordering races with concurrent
-	 * srcu_read_unlock()s on other CPUs (see comment below).  So we
-	 * either (1) wait for two or (2) supply the second ourselves.
-	 */
+	head->next = NULL;
+	head->func = wakeme_after_rcu;
+	spin_lock_irq(&sp->queue_lock);
+	if (!sp->running) {
+		/* steal the processing owner */
+		sp->running = true;
+		rcu_batch_queue(&sp->batch_check0, head);
+		spin_unlock_irq(&sp->queue_lock);
 
-	if ((sp->completed - idx) >= 2) {
-		mutex_unlock(&sp->mutex);
-		return;
+		srcu_advance_batches(sp, trycount);
+		if (!rcu_batch_empty(&sp->batch_done)) {
+			BUG_ON(sp->batch_done.head != head);
+			rcu_batch_dequeue(&sp->batch_done);
+			done = true;
+		}
+		/* give the processing owner to work_struct */
+		srcu_reschedule(sp);
+	} else {
+		rcu_batch_queue(&sp->batch_queue, head);
+		spin_unlock_irq(&sp->queue_lock);
 	}
 
-	sync_func();  /* Force memory barrier on all CPUs. */
-
-	/*
-	 * The preceding synchronize_sched() ensures that any CPU that
-	 * sees the new value of sp->completed will also see any preceding
-	 * changes to data structures made by this CPU.  This prevents
-	 * some other CPU from reordering the accesses in its SRCU
-	 * read-side critical section to precede the corresponding
-	 * srcu_read_lock() -- ensuring that such references will in
-	 * fact be protected.
-	 *
-	 * So it is now safe to do the flip.
-	 */
-
-	idx = sp->completed & 0x1;
-	sp->completed++;
-
-	sync_func();  /* Force memory barrier on all CPUs. */
-
-	/*
-	 * At this point, because of the preceding synchronize_sched(),
-	 * all srcu_read_lock() calls using the old counters have completed.
-	 * Their corresponding critical sections might well be still
-	 * executing, but the srcu_read_lock() primitives themselves
-	 * will have finished executing.  We initially give readers
-	 * an arbitrarily chosen 10 microseconds to get out of their
-	 * SRCU read-side critical sections, then loop waiting 1/HZ
-	 * seconds per iteration.  The 10-microsecond value has done
-	 * very well in testing.
-	 */
-
-	if (srcu_readers_active_idx(sp, idx))
-		udelay(SYNCHRONIZE_SRCU_READER_DELAY);
-	while (srcu_readers_active_idx(sp, idx))
-		schedule_timeout_interruptible(1);
-
-	sync_func();  /* Force memory barrier on all CPUs. */
-
-	/*
-	 * The preceding synchronize_sched() forces all srcu_read_unlock()
-	 * primitives that were executing concurrently with the preceding
-	 * for_each_possible_cpu() loop to have completed by this point.
-	 * More importantly, it also forces the corresponding SRCU read-side
-	 * critical sections to have also completed, and the corresponding
-	 * references to SRCU-protected data items to be dropped.
-	 *
-	 * Note:
-	 *
-	 * Despite what you might think at first glance, the
-	 * preceding synchronize_sched() -must- be within the
-	 * critical section ended by the following mutex_unlock().
-	 * Otherwise, a task taking the early exit can race
-	 * with a srcu_read_unlock(), which might have executed
-	 * just before the preceding srcu_readers_active() check,
-	 * and whose CPU might have reordered the srcu_read_unlock()
-	 * with the preceding critical section.  In this case, there
-	 * is nothing preventing the synchronize_sched() task that is
-	 * taking the early exit from freeing a data structure that
-	 * is still being referenced (out of order) by the task
-	 * doing the srcu_read_unlock().
-	 *
-	 * Alternatively, the comparison with "2" on the early exit
-	 * could be changed to "3", but this increases synchronize_srcu()
-	 * latency for bulk loads.  So the current code is preferred.
-	 */
-
-	mutex_unlock(&sp->mutex);
+	if (!done)
+		wait_for_completion(&rcu.completion);
 }
 
 /**
···
  */
 void synchronize_srcu(struct srcu_struct *sp)
 {
-	__synchronize_srcu(sp, synchronize_sched);
+	__synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
 }
 EXPORT_SYMBOL_GPL(synchronize_srcu);
···
  * synchronize_srcu_expedited - Brute-force SRCU grace period
  * @sp: srcu_struct with which to synchronize.
  *
- * Wait for an SRCU grace period to elapse, but use a "big hammer"
- * approach to force the grace period to end quickly.  This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code.  In fact,
- * if you are using synchronize_srcu_expedited() in a loop, please
- * restructure your code to batch your updates, and then use a single
- * synchronize_srcu() instead.
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
  *
  * Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal
- * to call this function from a CPU-hotplug notifier.  Failing to observe
- * these restriction will result in deadlock.  It is also illegal to call
+ * that is acquired by a CPU-hotplug notifier.  It is also illegal to call
  * synchronize_srcu_expedited() from the corresponding SRCU read-side
  * critical section; doing so will result in deadlock.  However, it is
  * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
···
  */
 void synchronize_srcu_expedited(struct srcu_struct *sp)
 {
-	__synchronize_srcu(sp, synchronize_sched_expedited);
+	__synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
 }
 EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
+
+/**
+ * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
+ */
+void srcu_barrier(struct srcu_struct *sp)
+{
+	synchronize_srcu(sp);
+}
+EXPORT_SYMBOL_GPL(srcu_barrier);
 
 /**
  * srcu_batches_completed - return batches completed.
···
  * Report the number of batches, correlated with, but not necessarily
  * precisely the same as, the number of grace periods that have elapsed.
  */
-
 long srcu_batches_completed(struct srcu_struct *sp)
 {
 	return sp->completed;
 }
 EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+#define SRCU_CALLBACK_BATCH	10
+#define SRCU_INTERVAL		1
+
+/*
+ * Move any new SRCU callbacks to the first stage of the SRCU grace
+ * period pipeline.
+ */
+static void srcu_collect_new(struct srcu_struct *sp)
+{
+	if (!rcu_batch_empty(&sp->batch_queue)) {
+		spin_lock_irq(&sp->queue_lock);
+		rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
+		spin_unlock_irq(&sp->queue_lock);
+	}
+}
+
+/*
+ * Core SRCU state machine.  Advance callbacks from ->batch_check0 to
+ * ->batch_check1 and then to ->batch_done as readers drain.
+ */
+static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
+{
+	int idx = 1 ^ (sp->completed & 1);
+
+	/*
+	 * Because readers might be delayed for an extended period after
+	 * fetching ->completed for their index, at any point in time there
+	 * might well be readers using both idx=0 and idx=1.  We therefore
+	 * need to wait for readers to clear from both index values before
+	 * invoking a callback.
+	 */
+
+	if (rcu_batch_empty(&sp->batch_check0) &&
+	    rcu_batch_empty(&sp->batch_check1))
+		return; /* no callbacks need to be advanced */
+
+	if (!try_check_zero(sp, idx, trycount))
+		return; /* failed to advance, will try after SRCU_INTERVAL */
+
+	/*
+	 * The callbacks in ->batch_check1 already completed their first
+	 * zero check and flip back when they were enqueued on
+	 * ->batch_check0 in a previous invocation of srcu_advance_batches().
+	 * (Presumably try_check_zero() returned false during that
+	 * invocation, leaving the callbacks stranded on ->batch_check1.)
+	 * They are therefore ready to invoke, so move them to ->batch_done.
+	 */
+	rcu_batch_move(&sp->batch_done, &sp->batch_check1);
+
+	if (rcu_batch_empty(&sp->batch_check0))
+		return; /* no callbacks need to be advanced */
+	srcu_flip(sp);
+
+	/*
+	 * The callbacks in ->batch_check0 just finished their
+	 * first check zero and flip, so move them to ->batch_check1
+	 * for future checking on the other idx.
+	 */
+	rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
+
+	/*
+	 * SRCU read-side critical sections are normally short, so check
+	 * at least twice in quick succession after a flip.
+	 */
+	trycount = trycount < 2 ? 2 : trycount;
+	if (!try_check_zero(sp, idx^1, trycount))
+		return; /* failed to advance, will try after SRCU_INTERVAL */
+
+	/*
+	 * The callbacks in ->batch_check1 have now waited for all
+	 * pre-existing readers using both idx values.  They are therefore
+	 * ready to invoke, so move them to ->batch_done.
+	 */
+	rcu_batch_move(&sp->batch_done, &sp->batch_check1);
+}
+
+/*
+ * Invoke a limited number of SRCU callbacks that have passed through
+ * their grace period.  If there are more to do, SRCU will reschedule
+ * the workqueue.
+ */
+static void srcu_invoke_callbacks(struct srcu_struct *sp)
+{
+	int i;
+	struct rcu_head *head;
+
+	for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
+		head = rcu_batch_dequeue(&sp->batch_done);
+		if (!head)
+			break;
+		local_bh_disable();
+		head->func(head);
+		local_bh_enable();
+	}
+}
+
+/*
+ * Finished one round of SRCU grace period.  Start another if there are
+ * more SRCU callbacks queued, otherwise put SRCU into not-running state.
+ */
+static void srcu_reschedule(struct srcu_struct *sp)
+{
+	bool pending = true;
+
+	if (rcu_batch_empty(&sp->batch_done) &&
+	    rcu_batch_empty(&sp->batch_check1) &&
+	    rcu_batch_empty(&sp->batch_check0) &&
+	    rcu_batch_empty(&sp->batch_queue)) {
+		spin_lock_irq(&sp->queue_lock);
+		if (rcu_batch_empty(&sp->batch_done) &&
+		    rcu_batch_empty(&sp->batch_check1) &&
+		    rcu_batch_empty(&sp->batch_check0) &&
+		    rcu_batch_empty(&sp->batch_queue)) {
+			sp->running = false;
+			pending = false;
+		}
+		spin_unlock_irq(&sp->queue_lock);
+	}
+
+	if (pending)
+		queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL);
+}
+
+/*
+ * This is the work-queue function that handles SRCU grace periods.
+ */
+static void process_srcu(struct work_struct *work)
+{
+	struct srcu_struct *sp;
+
+	sp = container_of(work, struct srcu_struct, work.work);
+
+	srcu_collect_new(sp);
+	srcu_advance_batches(sp, 1);
+	srcu_invoke_callbacks(sp);
+	srcu_reschedule(sp);
+}
kernel/timer.c  +7 -1
···
  *
  * mod_timer_pinned() is a way to update the expire field of an
  * active timer (if the timer is inactive it will be activated)
- * and not allow the timer to be migrated to a different CPU.
+ * and to ensure that the timer is scheduled on the current CPU.
+ *
+ * Note that this does not prevent the timer from being migrated
+ * when the current CPU goes offline.  If this is a problem for
+ * you, use CPU-hotplug notifiers to handle it correctly, for
+ * example, cancelling the timer when the corresponding CPU goes
+ * offline.
  *
  * mod_timer_pinned(timer, expires) is equivalent to:
  *
lib/list_debug.c  +22
···
 #include <linux/list.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
+#include <linux/rculist.h>
 
 /*
  * Insert a new entry between two known consecutive entries.
···
 	entry->prev = LIST_POISON2;
 }
 EXPORT_SYMBOL(list_del);
+
+/*
+ * RCU variants.
+ */
+void __list_add_rcu(struct list_head *new,
+		    struct list_head *prev, struct list_head *next)
+{
+	WARN(next->prev != prev,
+	     "list_add_rcu corruption. next->prev should be "
+	     "prev (%p), but was %p. (next=%p).\n",
+	     prev, next->prev, next);
+	WARN(prev->next != next,
+	     "list_add_rcu corruption. prev->next should be "
+	     "next (%p), but was %p. (prev=%p).\n",
+	     next, prev->next, prev);
+	new->next = next;
+	new->prev = prev;
+	rcu_assign_pointer(list_next_rcu(prev), new);
+	next->prev = new;
+}
+EXPORT_SYMBOL(__list_add_rcu);