Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

documentation: Record RCU requirements

This commit adds RCU requirements as published in a 2015 LWN series.
Bringing these requirements in-tree allows them to be updated as changes
are discovered.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Updates to charset and URLs as suggested by Josh Triplett. ]

+6800
Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png

This is a binary file and will not be displayed.

+374
Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
··· 1 + <?xml version="1.0" encoding="UTF-8" standalone="no"?> 2 + <!-- Created with Inkscape (http://www.inkscape.org/) --> 3 + 4 + <svg 5 + xmlns:dc="http://purl.org/dc/elements/1.1/" 6 + xmlns:cc="http://creativecommons.org/ns#" 7 + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" 8 + xmlns:svg="http://www.w3.org/2000/svg" 9 + xmlns="http://www.w3.org/2000/svg" 10 + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" 11 + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" 12 + width="447.99197" 13 + height="428.19299" 14 + id="svg2" 15 + version="1.1" 16 + inkscape:version="0.48.3.1 r9886" 17 + sodipodi:docname="GPpartitionReaders1.svg"> 18 + <defs 19 + id="defs4"> 20 + <marker 21 + inkscape:stockid="Arrow2Lend" 22 + orient="auto" 23 + refY="0" 24 + refX="0" 25 + id="Arrow2Lend" 26 + style="overflow:visible"> 27 + <path 28 + id="path3792" 29 + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" 30 + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" 31 + transform="matrix(-1.1,0,0,-1.1,-1.1,0)" 32 + inkscape:connector-curvature="0" /> 33 + </marker> 34 + <marker 35 + inkscape:stockid="Arrow2Lstart" 36 + orient="auto" 37 + refY="0" 38 + refX="0" 39 + id="Arrow2Lstart" 40 + style="overflow:visible"> 41 + <path 42 + id="path3789" 43 + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" 44 + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" 45 + transform="matrix(1.1,0,0,1.1,1.1,0)" 46 + inkscape:connector-curvature="0" /> 47 + </marker> 48 + </defs> 49 + <sodipodi:namedview 50 + id="base" 51 + pagecolor="#ffffff" 52 + bordercolor="#666666" 53 + borderopacity="1.0" 54 + inkscape:pageopacity="0.0" 55 + inkscape:pageshadow="2" 56 + inkscape:zoom="1.6184291" 57 + inkscape:cx="223.99599" 58 + inkscape:cy="214.0965" 59 + inkscape:document-units="px" 60 
+ inkscape:current-layer="layer1" 61 + showgrid="false" 62 + inkscape:window-width="979" 63 + inkscape:window-height="836" 64 + inkscape:window-x="571" 65 + inkscape:window-y="335" 66 + inkscape:window-maximized="0" 67 + fit-margin-top="5" 68 + fit-margin-left="5" 69 + fit-margin-right="5" 70 + fit-margin-bottom="5" /> 71 + <metadata 72 + id="metadata7"> 73 + <rdf:RDF> 74 + <cc:Work 75 + rdf:about=""> 76 + <dc:format>image/svg+xml</dc:format> 77 + <dc:type 78 + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> 79 + <dc:title></dc:title> 80 + </cc:Work> 81 + </rdf:RDF> 82 + </metadata> 83 + <g 84 + inkscape:label="Layer 1" 85 + inkscape:groupmode="layer" 86 + id="layer1" 87 + transform="translate(-28.441125,-185.60612)"> 88 + <flowRoot 89 + xml:space="preserve" 90 + id="flowRoot2985" 91 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion 92 + id="flowRegion2987"><rect 93 + id="rect2989" 94 + width="82.85714" 95 + height="11.428572" 96 + x="240" 97 + y="492.36218" /></flowRegion><flowPara 98 + id="flowPara2991"></flowPara></flowRoot> <g 99 + id="g4433" 100 + transform="translate(2,0)"> 101 + <text 102 + sodipodi:linespacing="125%" 103 + id="text2993" 104 + y="-261.66608" 105 + x="412.12299" 106 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 107 + xml:space="preserve" 108 + transform="matrix(0,1,-1,0,0,0)"><tspan 109 + y="-261.66608" 110 + x="412.12299" 111 + id="tspan2995" 112 + sodipodi:role="line">synchronize_rcu()</tspan></text> 113 + <g 114 
+ id="g4417" 115 + transform="matrix(0,1,-1,0,730.90257,222.4928)"> 116 + <path 117 + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)" 118 + d="m 97.580736,477.4048 183.140664,0" 119 + id="path2997" 120 + inkscape:connector-curvature="0" 121 + sodipodi:nodetypes="cc" /> 122 + <path 123 + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 124 + d="m 96.752718,465.38398 0,22.62742" 125 + id="path4397" 126 + inkscape:connector-curvature="0" 127 + sodipodi:nodetypes="cc" /> 128 + <path 129 + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 130 + d="m 281.54942,465.38397 0,22.62742" 131 + id="path4397-5" 132 + inkscape:connector-curvature="0" 133 + sodipodi:nodetypes="cc" /> 134 + </g> 135 + </g> 136 + <text 137 + xml:space="preserve" 138 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 139 + x="112.04738" 140 + y="268.18076" 141 + id="text4429" 142 + sodipodi:linespacing="125%"><tspan 143 + sodipodi:role="line" 144 + id="tspan4431" 145 + x="112.04738" 146 + y="268.18076">WRITE_ONCE(a, 1);</tspan></text> 147 + <text 148 + xml:space="preserve" 149 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 150 + x="112.04738" 151 + y="439.13766" 152 + id="text4441" 153 + sodipodi:linespacing="125%"><tspan 154 + sodipodi:role="line" 155 + 
id="tspan4443" 156 + x="112.04738" 157 + y="439.13766">WRITE_ONCE(b, 1);</tspan></text> 158 + <text 159 + xml:space="preserve" 160 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 161 + x="255.60869" 162 + y="309.29346" 163 + id="text4445" 164 + sodipodi:linespacing="125%"><tspan 165 + sodipodi:role="line" 166 + id="tspan4447" 167 + x="255.60869" 168 + y="309.29346">r1 = READ_ONCE(a);</tspan></text> 169 + <text 170 + xml:space="preserve" 171 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 172 + x="255.14423" 173 + y="520.61786" 174 + id="text4449" 175 + sodipodi:linespacing="125%"><tspan 176 + sodipodi:role="line" 177 + id="tspan4451" 178 + x="255.14423" 179 + y="520.61786">WRITE_ONCE(c, 1);</tspan></text> 180 + <text 181 + xml:space="preserve" 182 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 183 + x="396.10254" 184 + y="384.71124" 185 + id="text4453" 186 + sodipodi:linespacing="125%"><tspan 187 + sodipodi:role="line" 188 + id="tspan4455" 189 + x="396.10254" 190 + y="384.71124">r2 = READ_ONCE(b);</tspan></text> 191 + <text 192 + xml:space="preserve" 193 + 
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 194 + x="396.10254" 195 + y="582.13617" 196 + id="text4457" 197 + sodipodi:linespacing="125%"><tspan 198 + sodipodi:role="line" 199 + id="tspan4459" 200 + x="396.10254" 201 + y="582.13617">r3 = READ_ONCE(c);</tspan></text> 202 + <text 203 + xml:space="preserve" 204 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 205 + x="112.08231" 206 + y="213.91006" 207 + id="text4461" 208 + sodipodi:linespacing="125%"><tspan 209 + sodipodi:role="line" 210 + id="tspan4463" 211 + x="112.08231" 212 + y="213.91006">thread0()</tspan></text> 213 + <text 214 + xml:space="preserve" 215 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 216 + x="252.34512" 217 + y="213.91006" 218 + id="text4461-6" 219 + sodipodi:linespacing="125%"><tspan 220 + sodipodi:role="line" 221 + id="tspan4463-0" 222 + x="252.34512" 223 + y="213.91006">thread1()</tspan></text> 224 + <text 225 + xml:space="preserve" 226 + 
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 227 + x="396.42557" 228 + y="213.91006" 229 + id="text4461-2" 230 + sodipodi:linespacing="125%"><tspan 231 + sodipodi:role="line" 232 + id="tspan4463-2" 233 + x="396.42557" 234 + y="213.91006">thread2()</tspan></text> 235 + <rect 236 + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 237 + id="rect4495" 238 + width="436.28488" 239 + height="416.4859" 240 + x="34.648232" 241 + y="191.10612" /> 242 + <path 243 + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 244 + d="m 183.14066,191.10612 0,417.193 -0.70711,0" 245 + id="path4497" 246 + inkscape:connector-curvature="0" /> 247 + <path 248 + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 249 + d="m 325.13867,191.10612 0,417.193 -0.70711,0" 250 + id="path4497-5" 251 + inkscape:connector-curvature="0" /> 252 + <text 253 + xml:space="preserve" 254 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 255 + x="111.75929" 256 + y="251.53981" 257 + id="text4429-8" 258 + sodipodi:linespacing="125%"><tspan 259 + sodipodi:role="line" 260 + id="tspan4431-9" 261 + x="111.75929" 262 + y="251.53981">rcu_read_lock();</tspan></text> 263 + <text 264 + xml:space="preserve" 
265 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 266 + x="396.10254" 267 + y="367.91556" 268 + id="text4429-8-9" 269 + sodipodi:linespacing="125%"><tspan 270 + sodipodi:role="line" 271 + id="tspan4431-9-4" 272 + x="396.10254" 273 + y="367.91556">rcu_read_lock();</tspan></text> 274 + <text 275 + xml:space="preserve" 276 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 277 + x="396.10254" 278 + y="597.40289" 279 + id="text4429-8-9-3" 280 + sodipodi:linespacing="125%"><tspan 281 + sodipodi:role="line" 282 + id="tspan4431-9-4-4" 283 + x="396.10254" 284 + y="597.40289">rcu_read_unlock();</tspan></text> 285 + <text 286 + xml:space="preserve" 287 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 288 + x="111.75929" 289 + y="453.15311" 290 + id="text4429-8-9-3-1" 291 + sodipodi:linespacing="125%"><tspan 292 + sodipodi:role="line" 293 + id="tspan4431-9-4-4-6" 294 + x="111.75929" 295 + y="453.15311">rcu_read_unlock();</tspan></text> 296 + <path 297 + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 298 + d="m 33.941125,227.87568 436.284885,0 0,0.7071" 299 + id="path4608" 300 + inkscape:connector-curvature="0" /> 301 + <text 302 + xml:space="preserve" 303 + 
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 304 + x="394.94427" 305 + y="345.66351" 306 + id="text4648" 307 + sodipodi:linespacing="125%"><tspan 308 + sodipodi:role="line" 309 + id="tspan4650" 310 + x="394.94427" 311 + y="345.66351">QS</tspan></text> 312 + <path 313 + sodipodi:type="arc" 314 + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 315 + id="path4652" 316 + sodipodi:cx="358.85669" 317 + sodipodi:cy="142.87541" 318 + sodipodi:rx="10.960155" 319 + sodipodi:ry="10.253048" 320 + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" 321 + transform="translate(36.441125,199.60612)" 322 + sodipodi:start="4.7135481" 323 + sodipodi:end="10.994651" 324 + sodipodi:open="true" /> 325 + <text 326 + xml:space="preserve" 327 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 328 + x="112.11968" 329 + y="475.77856" 330 + id="text4648-4" 331 + sodipodi:linespacing="125%"><tspan 332 + sodipodi:role="line" 333 + id="tspan4650-4" 334 + x="112.11968" 335 + y="475.77856">QS</tspan></text> 336 + <path 337 + sodipodi:type="arc" 338 + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 339 + id="path4652-7" 340 + sodipodi:cx="358.85669" 341 + sodipodi:cy="142.87541" 342 + sodipodi:rx="10.960155" 343 + sodipodi:ry="10.253048" 344 + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" 345 + 
transform="translate(-246.38346,329.72117)" 346 + sodipodi:start="4.7135481" 347 + sodipodi:end="10.994651" 348 + sodipodi:open="true" /> 349 + <path 350 + sodipodi:type="arc" 351 + style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 352 + id="path4652-7-7" 353 + sodipodi:cx="358.85669" 354 + sodipodi:cy="142.87541" 355 + sodipodi:rx="10.960155" 356 + sodipodi:ry="10.253048" 357 + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" 358 + transform="translate(-103.65246,202.90878)" 359 + sodipodi:start="4.7135481" 360 + sodipodi:end="10.994651" 361 + sodipodi:open="true" /> 362 + <text 363 + xml:space="preserve" 364 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 365 + x="254.85066" 366 + y="348.96619" 367 + id="text4648-4-3" 368 + sodipodi:linespacing="125%"><tspan 369 + sodipodi:role="line" 370 + id="tspan4650-4-5" 371 + x="254.85066" 372 + y="348.96619">QS</tspan></text> 373 + </g> 374 + </svg>
+237
Documentation/RCU/Design/Requirements/RCUApplicability.svg
··· 1 + <?xml version="1.0" encoding="UTF-8" standalone="no"?> 2 + <!-- Creator: fig2dev Version 3.2 Patchlevel 5d --> 3 + 4 + <!-- CreationDate: Tue Mar 4 18:34:25 2014 --> 5 + 6 + <!-- Magnification: 3.000 --> 7 + 8 + <svg 9 + xmlns:dc="http://purl.org/dc/elements/1.1/" 10 + xmlns:cc="http://creativecommons.org/ns#" 11 + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" 12 + xmlns:svg="http://www.w3.org/2000/svg" 13 + xmlns="http://www.w3.org/2000/svg" 14 + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" 15 + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" 16 + width="1089.1382" 17 + height="668.21368" 18 + viewBox="-2121 -36 14554.634 8876.4061" 19 + id="svg2" 20 + version="1.1" 21 + inkscape:version="0.48.3.1 r9886" 22 + sodipodi:docname="RCUApplicability.svg"> 23 + <metadata 24 + id="metadata40"> 25 + <rdf:RDF> 26 + <cc:Work 27 + rdf:about=""> 28 + <dc:format>image/svg+xml</dc:format> 29 + <dc:type 30 + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> 31 + <dc:title /> 32 + </cc:Work> 33 + </rdf:RDF> 34 + </metadata> 35 + <defs 36 + id="defs38" /> 37 + <sodipodi:namedview 38 + pagecolor="#ffffff" 39 + bordercolor="#666666" 40 + borderopacity="1" 41 + objecttolerance="10" 42 + gridtolerance="10" 43 + guidetolerance="10" 44 + inkscape:pageopacity="0" 45 + inkscape:pageshadow="2" 46 + inkscape:window-width="849" 47 + inkscape:window-height="639" 48 + id="namedview36" 49 + showgrid="false" 50 + inkscape:zoom="0.51326165" 51 + inkscape:cx="544.56912" 52 + inkscape:cy="334.10686" 53 + inkscape:window-x="149" 54 + inkscape:window-y="448" 55 + inkscape:window-maximized="0" 56 + inkscape:current-layer="g4" 57 + fit-margin-top="5" 58 + fit-margin-left="5" 59 + fit-margin-right="5" 60 + fit-margin-bottom="5" /> 61 + <g 62 + style="fill:none;stroke-width:0.025in" 63 + id="g4" 64 + transform="translate(-2043.6828,14.791398)"> 65 + <!-- Line: box --> 66 + <rect 67 + x="0" 68 + y="0" 69 + width="14400" 70 + height="8775" 71 + 
rx="0" 72 + style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" 73 + id="rect6" /> 74 + <!-- Line: box --> 75 + <rect 76 + x="1350" 77 + y="0" 78 + width="11700" 79 + height="6075" 80 + rx="0" 81 + style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" 82 + id="rect8" /> 83 + <!-- Line: box --> 84 + <rect 85 + x="2700" 86 + y="0" 87 + width="9000" 88 + height="4275" 89 + rx="0" 90 + style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" 91 + id="rect10" /> 92 + <!-- Line: box --> 93 + <rect 94 + x="4050" 95 + y="0" 96 + width="6300" 97 + height="2475" 98 + rx="0" 99 + style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" 100 + id="rect12" /> 101 + <!-- Text --> 102 + <text 103 + xml:space="preserve" 104 + x="7200" 105 + y="900" 106 + font-style="normal" 107 + font-weight="normal" 108 + font-size="324" 109 + id="text14" 110 + sodipodi:linespacing="125%" 111 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan 112 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" 113 + id="tspan3017">Read-Mostly, Stale &amp;</tspan></text> 114 + <!-- Text --> 115 + <text 116 + xml:space="preserve" 117 + x="7200" 118 + y="1350" 119 + font-style="normal" 120 + font-weight="normal" 121 + font-size="324" 122 + id="text16" 123 + sodipodi:linespacing="125%" 124 + 
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan 125 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" 126 + id="tspan3019">Inconsistent Data OK</tspan></text> 127 + <!-- Text --> 128 + <text 129 + xml:space="preserve" 130 + x="7200" 131 + y="1800" 132 + font-style="normal" 133 + font-weight="normal" 134 + font-size="324" 135 + id="text18" 136 + sodipodi:linespacing="125%" 137 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan 138 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" 139 + id="tspan3021">(RCU Works Great!!!)</tspan></text> 140 + <!-- Text --> 141 + <text 142 + xml:space="preserve" 143 + x="7200" 144 + y="3825" 145 + font-style="normal" 146 + font-weight="normal" 147 + font-size="324" 148 + id="text20" 149 + sodipodi:linespacing="125%" 150 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan 151 + 
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" 152 + id="tspan3023">(RCU Works Well)</tspan></text> 153 + <!-- Text --> 154 + <text 155 + xml:space="preserve" 156 + x="7200" 157 + y="3375" 158 + font-style="normal" 159 + font-weight="normal" 160 + font-size="324" 161 + id="text22" 162 + sodipodi:linespacing="125%" 163 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan 164 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" 165 + id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text> 166 + <!-- Text --> 167 + <text 168 + xml:space="preserve" 169 + x="7200" 170 + y="5175" 171 + font-style="normal" 172 + font-weight="normal" 173 + font-size="324" 174 + id="text24" 175 + sodipodi:linespacing="125%" 176 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan 177 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" 178 + id="tspan3027">Read-Write, Need Consistent Data</tspan></text> 179 + <!-- Text --> 180 + <text 181 + 
xml:space="preserve" 182 + x="7200" 183 + y="6975" 184 + font-style="normal" 185 + font-weight="normal" 186 + font-size="324" 187 + id="text26" 188 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" 189 + sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text> 190 + <!-- Text --> 191 + <text 192 + xml:space="preserve" 193 + x="7200" 194 + y="5625" 195 + font-style="normal" 196 + font-weight="normal" 197 + font-size="324" 198 + id="text28" 199 + sodipodi:linespacing="125%" 200 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan 201 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" 202 + id="tspan3029">(RCU Might Be OK...)</tspan></text> 203 + <!-- Text --> 204 + <text 205 + xml:space="preserve" 206 + x="7200" 207 + y="7875" 208 + font-style="normal" 209 + font-weight="normal" 210 + font-size="324" 211 + id="text30" 212 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" 213 + sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text> 214 + <!-- Text --> 215 + <text 216 + xml:space="preserve" 217 + x="7200" 218 + y="8325" 219 + font-style="normal" 220 + 
font-weight="normal" 221 + font-size="324" 222 + id="text32" 223 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" 224 + sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text> 225 + <!-- Text --> 226 + <text 227 + xml:space="preserve" 228 + x="7200" 229 + y="7425" 230 + font-style="normal" 231 + font-weight="normal" 232 + font-size="324" 233 + id="text34" 234 + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" 235 + sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text> 236 + </g> 237 + </svg>
+639
Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
··· 1 + <?xml version="1.0" encoding="UTF-8" standalone="no"?> 2 + <!-- Created with Inkscape (http://www.inkscape.org/) --> 3 + 4 + <svg 5 + xmlns:dc="http://purl.org/dc/elements/1.1/" 6 + xmlns:cc="http://creativecommons.org/ns#" 7 + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" 8 + xmlns:svg="http://www.w3.org/2000/svg" 9 + xmlns="http://www.w3.org/2000/svg" 10 + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" 11 + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" 12 + width="735.25" 13 + height="516.21875" 14 + id="svg2" 15 + version="1.1" 16 + inkscape:version="0.48.3.1 r9886" 17 + sodipodi:docname="ReadersPartitionGP1.svg"> 18 + <defs 19 + id="defs4"> 20 + <marker 21 + inkscape:stockid="Arrow2Lend" 22 + orient="auto" 23 + refY="0" 24 + refX="0" 25 + id="Arrow2Lend" 26 + style="overflow:visible"> 27 + <path 28 + id="path3792" 29 + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" 30 + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" 31 + transform="matrix(-1.1,0,0,-1.1,-1.1,0)" 32 + inkscape:connector-curvature="0" /> 33 + </marker> 34 + <marker 35 + inkscape:stockid="Arrow2Lstart" 36 + orient="auto" 37 + refY="0" 38 + refX="0" 39 + id="Arrow2Lstart" 40 + style="overflow:visible"> 41 + <path 42 + id="path3789" 43 + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" 44 + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" 45 + transform="matrix(1.1,0,0,1.1,1.1,0)" 46 + inkscape:connector-curvature="0" /> 47 + </marker> 48 + <marker 49 + inkscape:stockid="Arrow2Lstart" 50 + orient="auto" 51 + refY="0" 52 + refX="0" 53 + id="Arrow2Lstart-4" 54 + style="overflow:visible"> 55 + <path 56 + id="path3789-9" 57 + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" 58 + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 
8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" 59 + transform="matrix(1.1,0,0,1.1,1.1,0)" 60 + inkscape:connector-curvature="0" /> 61 + </marker> 62 + <marker 63 + inkscape:stockid="Arrow2Lend" 64 + orient="auto" 65 + refY="0" 66 + refX="0" 67 + id="Arrow2Lend-4" 68 + style="overflow:visible"> 69 + <path 70 + id="path3792-4" 71 + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" 72 + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" 73 + transform="matrix(-1.1,0,0,-1.1,-1.1,0)" 74 + inkscape:connector-curvature="0" /> 75 + </marker> 76 + </defs> 77 + <sodipodi:namedview 78 + id="base" 79 + pagecolor="#ffffff" 80 + bordercolor="#666666" 81 + borderopacity="1.0" 82 + inkscape:pageopacity="0.0" 83 + inkscape:pageshadow="2" 84 + inkscape:zoom="1.3670394" 85 + inkscape:cx="367.26465" 86 + inkscape:cy="258.46182" 87 + inkscape:document-units="px" 88 + inkscape:current-layer="g4433-6" 89 + showgrid="false" 90 + inkscape:window-width="1351" 91 + inkscape:window-height="836" 92 + inkscape:window-x="438" 93 + inkscape:window-y="335" 94 + inkscape:window-maximized="0" 95 + fit-margin-top="5" 96 + fit-margin-left="5" 97 + fit-margin-right="5" 98 + fit-margin-bottom="5" /> 99 + <metadata 100 + id="metadata7"> 101 + <rdf:RDF> 102 + <cc:Work 103 + rdf:about=""> 104 + <dc:format>image/svg+xml</dc:format> 105 + <dc:type 106 + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> 107 + <dc:title /> 108 + </cc:Work> 109 + </rdf:RDF> 110 + </metadata> 111 + <g 112 + inkscape:label="Layer 1" 113 + inkscape:groupmode="layer" 114 + id="layer1" 115 + transform="translate(-29.15625,-185.59375)"> 116 + <flowRoot 117 + xml:space="preserve" 118 + id="flowRoot2985" 119 + 
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion 120 + id="flowRegion2987"><rect 121 + id="rect2989" 122 + width="82.85714" 123 + height="11.428572" 124 + x="240" 125 + y="492.36218" /></flowRegion><flowPara 126 + id="flowPara2991" /></flowRoot> <g 127 + id="g4433" 128 + transform="translate(2,-12)"> 129 + <text 130 + sodipodi:linespacing="125%" 131 + id="text2993" 132 + y="-261.66608" 133 + x="436.12299" 134 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 135 + xml:space="preserve" 136 + transform="matrix(0,1,-1,0,0,0)"><tspan 137 + y="-261.66608" 138 + x="436.12299" 139 + id="tspan2995" 140 + sodipodi:role="line">synchronize_rcu()</tspan></text> 141 + <g 142 + id="g4417" 143 + transform="matrix(0,1,-1,0,730.90257,222.4928)"> 144 + <path 145 + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)" 146 + d="M 97.580736,477.4048 327.57913,476.09759" 147 + id="path2997" 148 + inkscape:connector-curvature="0" 149 + sodipodi:nodetypes="cc" /> 150 + <path 151 + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 152 + d="m 96.752718,465.38398 0,22.62742" 153 + id="path4397" 154 + inkscape:connector-curvature="0" 155 + sodipodi:nodetypes="cc" /> 156 + <path 157 + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 158 + d="m 
328.40703,465.38397 0,22.62742" 159 + id="path4397-5" 160 + inkscape:connector-curvature="0" 161 + sodipodi:nodetypes="cc" /> 162 + </g> 163 + </g> 164 + <text 165 + xml:space="preserve" 166 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 167 + x="112.04738" 168 + y="268.18076" 169 + id="text4429" 170 + sodipodi:linespacing="125%"><tspan 171 + sodipodi:role="line" 172 + id="tspan4431" 173 + x="112.04738" 174 + y="268.18076">WRITE_ONCE(a, 1);</tspan></text> 175 + <text 176 + xml:space="preserve" 177 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 178 + x="112.04738" 179 + y="487.13766" 180 + id="text4441" 181 + sodipodi:linespacing="125%"><tspan 182 + sodipodi:role="line" 183 + id="tspan4443" 184 + x="112.04738" 185 + y="487.13766">WRITE_ONCE(b, 1);</tspan></text> 186 + <text 187 + xml:space="preserve" 188 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 189 + x="255.60869" 190 + y="297.29346" 191 + id="text4445" 192 + sodipodi:linespacing="125%"><tspan 193 + sodipodi:role="line" 194 + id="tspan4447" 195 + x="255.60869" 196 + y="297.29346">r1 = READ_ONCE(a);</tspan></text> 197 + <text 198 + xml:space="preserve" 199 + 
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 200 + x="255.14423" 201 + y="554.61786" 202 + id="text4449" 203 + sodipodi:linespacing="125%"><tspan 204 + sodipodi:role="line" 205 + id="tspan4451" 206 + x="255.14423" 207 + y="554.61786">WRITE_ONCE(c, 1);</tspan></text> 208 + <text 209 + xml:space="preserve" 210 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 211 + x="396.10254" 212 + y="370.71124" 213 + id="text4453" 214 + sodipodi:linespacing="125%"><tspan 215 + sodipodi:role="line" 216 + id="tspan4455" 217 + x="396.10254" 218 + y="370.71124">WRITE_ONCE(d, 1);</tspan></text> 219 + <text 220 + xml:space="preserve" 221 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 222 + x="396.10254" 223 + y="572.13617" 224 + id="text4457" 225 + sodipodi:linespacing="125%"><tspan 226 + sodipodi:role="line" 227 + id="tspan4459" 228 + x="396.10254" 229 + y="572.13617">r2 = READ_ONCE(c);</tspan></text> 230 + <text 231 + xml:space="preserve" 232 + 
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 233 + x="112.08231" 234 + y="213.91006" 235 + id="text4461" 236 + sodipodi:linespacing="125%"><tspan 237 + sodipodi:role="line" 238 + id="tspan4463" 239 + x="112.08231" 240 + y="213.91006">thread0()</tspan></text> 241 + <text 242 + xml:space="preserve" 243 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 244 + x="252.34512" 245 + y="213.91006" 246 + id="text4461-6" 247 + sodipodi:linespacing="125%"><tspan 248 + sodipodi:role="line" 249 + id="tspan4463-0" 250 + x="252.34512" 251 + y="213.91006">thread1()</tspan></text> 252 + <text 253 + xml:space="preserve" 254 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 255 + x="396.42557" 256 + y="213.91006" 257 + id="text4461-2" 258 + sodipodi:linespacing="125%"><tspan 259 + sodipodi:role="line" 260 + id="tspan4463-2" 261 + x="396.42557" 262 + y="213.91006">thread2()</tspan></text> 263 + <rect 264 + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 265 + id="rect4495" 266 + width="724.25244" 267 + height="505.21201" 268 + x="34.648232" 269 + y="191.10612" /> 270 + <path 271 + 
style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 272 + d="m 183.14066,191.10612 0,504.24243" 273 + id="path4497" 274 + inkscape:connector-curvature="0" 275 + sodipodi:nodetypes="cc" /> 276 + <path 277 + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 278 + d="m 325.13867,191.10612 0,504.24243" 279 + id="path4497-5" 280 + inkscape:connector-curvature="0" 281 + sodipodi:nodetypes="cc" /> 282 + <text 283 + xml:space="preserve" 284 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 285 + x="111.75929" 286 + y="251.53981" 287 + id="text4429-8" 288 + sodipodi:linespacing="125%"><tspan 289 + sodipodi:role="line" 290 + id="tspan4431-9" 291 + x="111.75929" 292 + y="251.53981">rcu_read_lock();</tspan></text> 293 + <text 294 + xml:space="preserve" 295 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 296 + x="396.10254" 297 + y="353.91556" 298 + id="text4429-8-9" 299 + sodipodi:linespacing="125%"><tspan 300 + sodipodi:role="line" 301 + id="tspan4431-9-4" 302 + x="396.10254" 303 + y="353.91556">rcu_read_lock();</tspan></text> 304 + <text 305 + xml:space="preserve" 306 + 
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 307 + x="396.10254" 308 + y="587.40289" 309 + id="text4429-8-9-3" 310 + sodipodi:linespacing="125%"><tspan 311 + sodipodi:role="line" 312 + id="tspan4431-9-4-4" 313 + x="396.10254" 314 + y="587.40289">rcu_read_unlock();</tspan></text> 315 + <text 316 + xml:space="preserve" 317 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 318 + x="111.75929" 319 + y="501.15311" 320 + id="text4429-8-9-3-1" 321 + sodipodi:linespacing="125%"><tspan 322 + sodipodi:role="line" 323 + id="tspan4431-9-4-4-6" 324 + x="111.75929" 325 + y="501.15311">rcu_read_unlock();</tspan></text> 326 + <path 327 + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 328 + d="m 33.941125,227.87568 724.941765,0" 329 + id="path4608" 330 + inkscape:connector-curvature="0" 331 + sodipodi:nodetypes="cc" /> 332 + <text 333 + xml:space="preserve" 334 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 335 + x="394.94427" 336 + y="331.66351" 337 + id="text4648" 338 + sodipodi:linespacing="125%"><tspan 339 + sodipodi:role="line" 340 + id="tspan4650" 341 + x="394.94427" 342 + y="331.66351">QS</tspan></text> 343 + <path 344 + sodipodi:type="arc" 345 + 
style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 346 + id="path4652" 347 + sodipodi:cx="358.85669" 348 + sodipodi:cy="142.87541" 349 + sodipodi:rx="10.960155" 350 + sodipodi:ry="10.253048" 351 + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" 352 + transform="translate(36.441125,185.60612)" 353 + sodipodi:start="4.7135481" 354 + sodipodi:end="10.994651" 355 + sodipodi:open="true" /> 356 + <text 357 + xml:space="preserve" 358 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 359 + x="112.11968" 360 + y="523.77856" 361 + id="text4648-4" 362 + sodipodi:linespacing="125%"><tspan 363 + sodipodi:role="line" 364 + id="tspan4650-4" 365 + x="112.11968" 366 + y="523.77856">QS</tspan></text> 367 + <path 368 + sodipodi:type="arc" 369 + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 370 + id="path4652-7" 371 + sodipodi:cx="358.85669" 372 + sodipodi:cy="142.87541" 373 + sodipodi:rx="10.960155" 374 + sodipodi:ry="10.253048" 375 + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" 376 + transform="translate(-246.38346,377.72117)" 377 + sodipodi:start="4.7135481" 378 + sodipodi:end="10.994651" 379 + sodipodi:open="true" /> 380 + <path 381 + sodipodi:type="arc" 382 + style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 383 + id="path4652-7-7" 384 + sodipodi:cx="358.85669" 385 + sodipodi:cy="142.87541" 386 + sodipodi:rx="10.960155" 387 + sodipodi:ry="10.253048" 388 + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" 389 + 
transform="translate(-103.65246,190.90878)" 390 + sodipodi:start="4.7135481" 391 + sodipodi:end="10.994651" 392 + sodipodi:open="true" /> 393 + <text 394 + xml:space="preserve" 395 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 396 + x="254.85066" 397 + y="336.96619" 398 + id="text4648-4-3" 399 + sodipodi:linespacing="125%"><tspan 400 + sodipodi:role="line" 401 + id="tspan4650-4-5" 402 + x="254.85066" 403 + y="336.96619">QS</tspan></text> 404 + <path 405 + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 406 + d="m 470.93311,190.39903 0,504.24243" 407 + id="path4497-5-6" 408 + inkscape:connector-curvature="0" 409 + sodipodi:nodetypes="cc" /> 410 + <path 411 + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 412 + d="m 616.22755,190.38323 0,504.24243" 413 + id="path4497-5-2" 414 + inkscape:connector-curvature="0" 415 + sodipodi:nodetypes="cc" /> 416 + <g 417 + id="g4433-6" 418 + transform="translate(288.0964,78.32827)"> 419 + <text 420 + sodipodi:linespacing="125%" 421 + id="text2993-7" 422 + y="-261.66608" 423 + x="440.12299" 424 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 425 + xml:space="preserve" 426 + transform="matrix(0,1,-1,0,0,0)"><tspan 427 + y="-261.66608" 428 + x="440.12299" 429 + id="tspan2995-1" 430 + 
sodipodi:role="line">synchronize_rcu()</tspan></text> 431 + <g 432 + id="g4417-1" 433 + transform="matrix(0,1,-1,0,730.90257,222.4928)"> 434 + <path 435 + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)" 436 + d="M 97.580736,477.4048 328.5624,477.07246" 437 + id="path2997-2" 438 + inkscape:connector-curvature="0" 439 + sodipodi:nodetypes="cc" /> 440 + <path 441 + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 442 + d="m 96.752718,465.38398 0,22.62742" 443 + id="path4397-3" 444 + inkscape:connector-curvature="0" 445 + sodipodi:nodetypes="cc" /> 446 + <path 447 + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" 448 + d="m 329.39039,465.38397 0,22.62742" 449 + id="path4397-5-4" 450 + inkscape:connector-curvature="0" 451 + sodipodi:nodetypes="cc" /> 452 + </g> 453 + </g> 454 + <text 455 + xml:space="preserve" 456 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 457 + x="541.70508" 458 + y="387.6217" 459 + id="text4445-0" 460 + sodipodi:linespacing="125%"><tspan 461 + sodipodi:role="line" 462 + id="tspan4447-5" 463 + x="541.70508" 464 + y="387.6217">r3 = READ_ONCE(d);</tspan></text> 465 + <text 466 + xml:space="preserve" 467 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 468 + x="541.2406" 469 + y="646.94611" 470 + 
id="text4449-6" 471 + sodipodi:linespacing="125%"><tspan 472 + sodipodi:role="line" 473 + id="tspan4451-6" 474 + x="541.2406" 475 + y="646.94611">WRITE_ONCE(e, 1);</tspan></text> 476 + <path 477 + sodipodi:type="arc" 478 + style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 479 + id="path4652-7-7-5" 480 + sodipodi:cx="358.85669" 481 + sodipodi:cy="142.87541" 482 + sodipodi:rx="10.960155" 483 + sodipodi:ry="10.253048" 484 + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" 485 + transform="translate(182.44393,281.23704)" 486 + sodipodi:start="4.7135481" 487 + sodipodi:end="10.994651" 488 + sodipodi:open="true" /> 489 + <text 490 + xml:space="preserve" 491 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 492 + x="540.94702" 493 + y="427.29443" 494 + id="text4648-4-3-1" 495 + sodipodi:linespacing="125%"><tspan 496 + sodipodi:role="line" 497 + id="tspan4650-4-5-7" 498 + x="540.94702" 499 + y="427.29443">QS</tspan></text> 500 + <text 501 + xml:space="preserve" 502 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 503 + x="686.27747" 504 + y="461.83929" 505 + id="text4453-7" 506 + sodipodi:linespacing="125%"><tspan 507 + sodipodi:role="line" 508 + id="tspan4455-1" 509 + x="686.27747" 510 + y="461.83929">r4 = READ_ONCE(b);</tspan></text> 511 + <text 512 + xml:space="preserve" 513 + 
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 514 + x="686.27747" 515 + y="669.26422" 516 + id="text4457-9" 517 + sodipodi:linespacing="125%"><tspan 518 + sodipodi:role="line" 519 + id="tspan4459-2" 520 + x="686.27747" 521 + y="669.26422">r5 = READ_ONCE(e);</tspan></text> 522 + <text 523 + xml:space="preserve" 524 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 525 + x="686.27747" 526 + y="445.04358" 527 + id="text4429-8-9-33" 528 + sodipodi:linespacing="125%"><tspan 529 + sodipodi:role="line" 530 + id="tspan4431-9-4-2" 531 + x="686.27747" 532 + y="445.04358">rcu_read_lock();</tspan></text> 533 + <text 534 + xml:space="preserve" 535 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 536 + x="686.27747" 537 + y="684.53094" 538 + id="text4429-8-9-3-8" 539 + sodipodi:linespacing="125%"><tspan 540 + sodipodi:role="line" 541 + id="tspan4431-9-4-4-5" 542 + x="686.27747" 543 + y="684.53094">rcu_read_unlock();</tspan></text> 544 + <text 545 + xml:space="preserve" 546 + 
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 547 + x="685.11914" 548 + y="422.79153" 549 + id="text4648-9" 550 + sodipodi:linespacing="125%"><tspan 551 + sodipodi:role="line" 552 + id="tspan4650-7" 553 + x="685.11914" 554 + y="422.79153">QS</tspan></text> 555 + <path 556 + sodipodi:type="arc" 557 + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 558 + id="path4652-8" 559 + sodipodi:cx="358.85669" 560 + sodipodi:cy="142.87541" 561 + sodipodi:rx="10.960155" 562 + sodipodi:ry="10.253048" 563 + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" 564 + transform="translate(326.61602,276.73415)" 565 + sodipodi:start="4.7135481" 566 + sodipodi:end="10.994651" 567 + sodipodi:open="true" /> 568 + <text 569 + xml:space="preserve" 570 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 571 + x="397.85934" 572 + y="609.59003" 573 + id="text4648-5" 574 + sodipodi:linespacing="125%"><tspan 575 + sodipodi:role="line" 576 + id="tspan4650-77" 577 + x="397.85934" 578 + y="609.59003">QS</tspan></text> 579 + <path 580 + sodipodi:type="arc" 581 + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 582 + id="path4652-80" 583 + sodipodi:cx="358.85669" 584 + sodipodi:cy="142.87541" 585 + sodipodi:rx="10.960155" 586 + sodipodi:ry="10.253048" 587 + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" 588 + 
transform="translate(39.356201,463.53264)" 589 + sodipodi:start="4.7135481" 590 + sodipodi:end="10.994651" 591 + sodipodi:open="true" /> 592 + <text 593 + xml:space="preserve" 594 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 595 + x="256.75986" 596 + y="586.99133" 597 + id="text4648-5-2" 598 + sodipodi:linespacing="125%"><tspan 599 + sodipodi:role="line" 600 + id="tspan4650-77-7" 601 + x="256.75986" 602 + y="586.99133">QS</tspan></text> 603 + <path 604 + sodipodi:type="arc" 605 + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 606 + id="path4652-80-5" 607 + sodipodi:cx="358.85669" 608 + sodipodi:cy="142.87541" 609 + sodipodi:rx="10.960155" 610 + sodipodi:ry="10.253048" 611 + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" 612 + transform="translate(-101.74328,440.93395)" 613 + sodipodi:start="4.7135481" 614 + sodipodi:end="10.994651" 615 + sodipodi:open="true" /> 616 + <text 617 + xml:space="preserve" 618 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 619 + x="546.22791" 620 + y="213.91006" 621 + id="text4461-2-5" 622 + sodipodi:linespacing="125%"><tspan 623 + sodipodi:role="line" 624 + id="tspan4463-2-6" 625 + x="546.22791" 626 + y="213.91006">thread3()</tspan></text> 627 + <text 628 + xml:space="preserve" 629 + 
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" 630 + x="684.00067" 631 + y="213.91006" 632 + id="text4461-2-1" 633 + sodipodi:linespacing="125%"><tspan 634 + sodipodi:role="line" 635 + id="tspan4463-2-0" 636 + x="684.00067" 637 + y="213.91006">thread4()</tspan></text> 638 + </g> 639 + </svg>
+2799
Documentation/RCU/Design/Requirements/Requirements.html
··· 1 + <!-- DO NOT HAND EDIT. --> 2 + <!-- Instead, edit Requirements.htmlx and run 'sh htmlqqz.sh Requirements' --> 3 + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 4 + "http://www.w3.org/TR/html4/loose.dtd"> 5 + <html> 6 + <head><title>A Tour Through RCU's Requirements [LWN.net]</title> 7 + <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> 8 + 9 + <h1>A Tour Through RCU's Requirements</h1> 10 + 11 + <p>Copyright IBM Corporation, 2015</p> 12 + <p>Author: Paul E.&nbsp;McKenney</p> 13 + <p><i>The initial version of this document appeared in the 14 + <a href="https://lwn.net/">LWN</a> articles 15 + <a href="https://lwn.net/Articles/652156/">here</a>, 16 + <a href="https://lwn.net/Articles/652677/">here</a>, and 17 + <a href="https://lwn.net/Articles/653326/">here</a>.</i></p> 18 + 19 + <h2>Introduction</h2> 20 + 21 + <p> 22 + Read-copy update (RCU) is a synchronization mechanism that is often 23 + used as a replacement for reader-writer locking. 24 + RCU is unusual in that updaters do not block readers, 25 + which means that RCU's read-side primitives can be exceedingly fast 26 + and scalable. 27 + In addition, updaters can make useful forward progress concurrently 28 + with readers. 29 + However, all this concurrency between RCU readers and updaters does raise 30 + the question of exactly what RCU readers are doing, which in turn 31 + raises the question of exactly what RCU's requirements are. 32 + 33 + <p> 34 + This document therefore summarizes RCU's requirements, and can be thought 35 + of as an informal, high-level specification for RCU. 36 + It is important to understand that RCU's specification is primarily 37 + empirical in nature; 38 + in fact, I learned about many of these requirements the hard way. 
39 + This situation might cause some consternation. However, not only 40 + has this learning process been a lot of fun, but it has also been 41 + a great privilege to work with so many people willing to apply 42 + technologies in interesting new ways. 43 + 44 + <p> 45 + All that aside, here are the categories of currently known RCU requirements: 46 + </p> 47 + 48 + <ol> 49 + <li> <a href="#Fundamental Requirements"> 50 + Fundamental Requirements</a> 51 + <li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a> 52 + <li> <a href="#Parallelism Facts of Life"> 53 + Parallelism Facts of Life</a> 54 + <li> <a href="#Quality-of-Implementation Requirements"> 55 + Quality-of-Implementation Requirements</a> 56 + <li> <a href="#Linux Kernel Complications"> 57 + Linux Kernel Complications</a> 58 + <li> <a href="#Software-Engineering Requirements"> 59 + Software-Engineering Requirements</a> 60 + <li> <a href="#Other RCU Flavors"> 61 + Other RCU Flavors</a> 62 + <li> <a href="#Possible Future Changes"> 63 + Possible Future Changes</a> 64 + </ol> 65 + 66 + <p> 67 + This is followed by a <a href="#Summary">summary</a>, 68 + which is in turn followed by the inevitable 69 + <a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. 70 + 71 + <h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> 72 + 73 + <p> 74 + RCU's fundamental requirements are the closest thing RCU has to hard 75 + mathematical requirements. 
76 + These are: 77 + 78 + <ol> 79 + <li> <a href="#Grace-Period Guarantee"> 80 + Grace-Period Guarantee</a> 81 + <li> <a href="#Publish-Subscribe Guarantee"> 82 + Publish-Subscribe Guarantee</a> 83 + <li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally"> 84 + RCU Primitives Guaranteed to Execute Unconditionally</a> 85 + <li> <a href="#Guaranteed Read-to-Write Upgrade"> 86 + Guaranteed Read-to-Write Upgrade</a> 87 + </ol> 88 + 89 + <h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3> 90 + 91 + <p> 92 + RCU's grace-period guarantee is unusual in being premeditated: 93 + Jack Slingwine and I had this guarantee firmly in mind when we started 94 + work on RCU (then called &ldquo;rclock&rdquo;) in the early 1990s. 95 + That said, the past two decades of experience with RCU have produced 96 + a much more detailed understanding of this guarantee. 97 + 98 + <p> 99 + RCU's grace-period guarantee allows updaters to wait for the completion 100 + of all pre-existing RCU read-side critical sections. 101 + An RCU read-side critical section 102 + begins with the marker <tt>rcu_read_lock()</tt> and ends with 103 + the marker <tt>rcu_read_unlock()</tt>. 104 + These markers may be nested, and RCU treats a nested set as one 105 + big RCU read-side critical section. 106 + Production-quality implementations of <tt>rcu_read_lock()</tt> and 107 + <tt>rcu_read_unlock()</tt> are extremely lightweight, and in 108 + fact have exactly zero overhead in Linux kernels built for production 109 + use with <tt>CONFIG_PREEMPT=n</tt>. 
110 + 111 + <p> 112 + This guarantee allows ordering to be enforced with extremely low 113 + overhead to readers, for example: 114 + 115 + <blockquote> 116 + <pre> 117 + 1 int x, y; 118 + 2 119 + 3 void thread0(void) 120 + 4 { 121 + 5 rcu_read_lock(); 122 + 6 r1 = READ_ONCE(x); 123 + 7 r2 = READ_ONCE(y); 124 + 8 rcu_read_unlock(); 125 + 9 } 126 + 10 127 + 11 void thread1(void) 128 + 12 { 129 + 13 WRITE_ONCE(x, 1); 130 + 14 synchronize_rcu(); 131 + 15 WRITE_ONCE(y, 1); 132 + 16 } 133 + </pre> 134 + </blockquote> 135 + 136 + <p> 137 + Because the <tt>synchronize_rcu()</tt> on line&nbsp;14 waits for 138 + all pre-existing readers, any instance of <tt>thread0()</tt> that 139 + loads a value of zero from <tt>x</tt> must complete before 140 + <tt>thread1()</tt> stores to <tt>y</tt>, so that instance must 141 + also load a value of zero from <tt>y</tt>. 142 + Similarly, any instance of <tt>thread0()</tt> that loads a value of 143 + one from <tt>y</tt> must have started after the 144 + <tt>synchronize_rcu()</tt> started, and must therefore also load 145 + a value of one from <tt>x</tt>. 146 + Therefore, the outcome: 147 + <blockquote> 148 + <pre> 149 + (r1 == 0 &amp;&amp; r2 == 1) 150 + </pre> 151 + </blockquote> 152 + cannot happen. 153 + 154 + <p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a> 155 + Wait a minute! 156 + You said that updaters can make useful forward progress concurrently 157 + with readers, but pre-existing readers will block 158 + <tt>synchronize_rcu()</tt>!!! 159 + Just who are you trying to fool??? 
160 + <br><a href="#qq1answer">Answer</a> 161 + 162 + <p> 163 + This scenario resembles one of the first uses of RCU in 164 + <a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>, 165 + which managed a distributed lock manager's transition into 166 + a state suitable for handling recovery from node failure, 167 + more or less as follows: 168 + 169 + <blockquote> 170 + <pre> 171 + 1 #define STATE_NORMAL 0 172 + 2 #define STATE_WANT_RECOVERY 1 173 + 3 #define STATE_RECOVERING 2 174 + 4 #define STATE_WANT_NORMAL 3 175 + 5 176 + 6 int state = STATE_NORMAL; 177 + 7 178 + 8 void do_something_dlm(void) 179 + 9 { 180 + 10 int state_snap; 181 + 11 182 + 12 rcu_read_lock(); 183 + 13 state_snap = READ_ONCE(state); 184 + 14 if (state_snap == STATE_NORMAL) 185 + 15 do_something(); 186 + 16 else 187 + 17 do_something_carefully(); 188 + 18 rcu_read_unlock(); 189 + 19 } 190 + 20 191 + 21 void start_recovery(void) 192 + 22 { 193 + 23 WRITE_ONCE(state, STATE_WANT_RECOVERY); 194 + 24 synchronize_rcu(); 195 + 25 WRITE_ONCE(state, STATE_RECOVERING); 196 + 26 recovery(); 197 + 27 WRITE_ONCE(state, STATE_WANT_NORMAL); 198 + 28 synchronize_rcu(); 199 + 29 WRITE_ONCE(state, STATE_NORMAL); 200 + 30 } 201 + </pre> 202 + </blockquote> 203 + 204 + <p> 205 + The RCU read-side critical section in <tt>do_something_dlm()</tt> 206 + works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt> 207 + to guarantee that <tt>do_something()</tt> never runs concurrently 208 + with <tt>recovery()</tt>, but with little or no synchronization 209 + overhead in <tt>do_something_dlm()</tt>. 210 + 211 + <p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a> 212 + Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed? 213 + <br><a href="#qq2answer">Answer</a> 214 + 215 + <p> 216 + In order to avoid fatal problems such as deadlocks, 217 + an RCU read-side critical section must not contain calls to 218 + <tt>synchronize_rcu()</tt>. 
219 + Similarly, an RCU read-side critical section must not 220 + contain anything that waits, directly or indirectly, on completion of 221 + an invocation of <tt>synchronize_rcu()</tt>. 222 + 223 + <p> 224 + Although RCU's grace-period guarantee is useful in and of itself, with 225 + <a href="https://lwn.net/Articles/573497/">quite a few use cases</a>, 226 + it would be good to be able to use RCU to coordinate read-side 227 + access to linked data structures. 228 + For this, the grace-period guarantee is not sufficient, as can 229 + be seen in function <tt>add_gp_buggy()</tt> below. 230 + We will look at the reader's code later, but in the meantime, just think of 231 + the reader as locklessly picking up the <tt>gp</tt> pointer, 232 + and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the 233 + <tt>-&gt;a</tt> and <tt>-&gt;b</tt> fields. 234 + 235 + <blockquote> 236 + <pre> 237 + 1 bool add_gp_buggy(int a, int b) 238 + 2 { 239 + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); 240 + 4 if (!p) 241 + 5 return -ENOMEM; 242 + 6 spin_lock(&amp;gp_lock); 243 + 7 if (rcu_access_pointer(gp)) { 244 + 8 spin_unlock(&amp;gp_lock); 245 + 9 return false; 246 + 10 } 247 + 11 p-&gt;a = a; 248 + 12 p-&gt;b = b; 249 + 13 gp = p; /* ORDERING BUG */ 250 + 14 spin_unlock(&amp;gp_lock); 251 + 15 return true; 252 + 16 } 253 + </pre> 254 + </blockquote> 255 + 256 + <p> 257 + The problem is that both the compiler and weakly ordered CPUs are within 258 + their rights to reorder this code as follows: 259 + 260 + <blockquote> 261 + <pre> 262 + 1 bool add_gp_buggy_optimized(int a, int b) 263 + 2 { 264 + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); 265 + 4 if (!p) 266 + 5 return -ENOMEM; 267 + 6 spin_lock(&amp;gp_lock); 268 + 7 if (rcu_access_pointer(gp)) { 269 + 8 spin_unlock(&amp;gp_lock); 270 + 9 return false; 271 + 10 } 272 + <b>11 gp = p; /* ORDERING BUG */ 273 + 12 p-&gt;a = a; 274 + 13 p-&gt;b = b;</b> 275 + 14 spin_unlock(&amp;gp_lock); 276 + 15 return true; 277 + 16 } 278 + </pre>
279 + </blockquote> 280 + 281 + <p> 282 + If an RCU reader fetches <tt>gp</tt> just after 283 + <tt>add_gp_buggy_optimized</tt> executes line&nbsp;11, 284 + it will see garbage in the <tt>-&gt;a</tt> and <tt>-&gt;b</tt> 285 + fields. 286 + And this is but one of many ways in which compiler and hardware optimizations 287 + could cause trouble. 288 + Therefore, we clearly need some way to prevent the compiler and the CPU from 289 + reordering in this manner, which brings us to the publish-subscribe 290 + guarantee discussed in the next section. 291 + 292 + <h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3> 293 + 294 + <p> 295 + RCU's publish-subscribe guarantee allows data to be inserted 296 + into a linked data structure without disrupting RCU readers. 297 + The updater uses <tt>rcu_assign_pointer()</tt> to insert the 298 + new data, and readers use <tt>rcu_dereference()</tt> to 299 + access data, whether new or old. 300 + The following shows an example of insertion: 301 + 302 + <blockquote> 303 + <pre> 304 + 1 bool add_gp(int a, int b) 305 + 2 { 306 + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); 307 + 4 if (!p) 308 + 5 return -ENOMEM; 309 + 6 spin_lock(&amp;gp_lock); 310 + 7 if (rcu_access_pointer(gp)) { 311 + 8 spin_unlock(&amp;gp_lock); 312 + 9 return false; 313 + 10 } 314 + 11 p-&gt;a = a; 315 + 12 p-&gt;b = b; 316 + 13 rcu_assign_pointer(gp, p); 317 + 14 spin_unlock(&amp;gp_lock); 318 + 15 return true; 319 + 16 } 320 + </pre> 321 + </blockquote> 322 + 323 + <p> 324 + The <tt>rcu_assign_pointer()</tt> on line&nbsp;13 is conceptually 325 + equivalent to a simple assignment statement, but also guarantees 326 + that its assignment will 327 + happen after the two assignments in lines&nbsp;11 and&nbsp;12, 328 + similar to the C11 <tt>memory_order_release</tt> store operation.
329 + It also prevents any number of &ldquo;interesting&rdquo; compiler 330 + optimizations, for example, the use of <tt>gp</tt> as a scratch 331 + location immediately preceding the assignment. 332 + 333 + <p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a> 334 + But <tt>rcu_assign_pointer()</tt> does nothing to prevent the 335 + two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> 336 + from being reordered. 337 + Can't that also cause problems? 338 + <br><a href="#qq3answer">Answer</a> 339 + 340 + <p> 341 + It is tempting to assume that the reader need not do anything special 342 + to control its accesses to the RCU-protected data, 343 + as shown in <tt>do_something_gp_buggy()</tt> below: 344 + 345 + <blockquote> 346 + <pre> 347 + 1 bool do_something_gp_buggy(void) 348 + 2 { 349 + 3 rcu_read_lock(); 350 + 4 p = gp; /* OPTIMIZATIONS GALORE!!! */ 351 + 5 if (p) { 352 + 6 do_something(p-&gt;a, p-&gt;b); 353 + 7 rcu_read_unlock(); 354 + 8 return true; 355 + 9 } 356 + 10 rcu_read_unlock(); 357 + 11 return false; 358 + 12 } 359 + </pre> 360 + </blockquote> 361 + 362 + <p> 363 + However, this temptation must be resisted because there are a 364 + surprisingly large number of ways that the compiler 365 + (to say nothing of 366 + <a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>) 367 + can trip this code up. 368 + For but one example, if the compiler were short of registers, it 369 + might choose to refetch from <tt>gp</tt> rather than keeping 370 + a separate copy in <tt>p</tt> as follows: 371 + 372 + <blockquote> 373 + <pre> 374 + 1 bool do_something_gp_buggy_optimized(void) 375 + 2 { 376 + 3 rcu_read_lock(); 377 + 4 if (gp) { /* OPTIMIZATIONS GALORE!!! 
*/ 378 + <b> 5 do_something(gp-&gt;a, gp-&gt;b);</b> 379 + 6 rcu_read_unlock(); 380 + 7 return true; 381 + 8 } 382 + 9 rcu_read_unlock(); 383 + 10 return false; 384 + 11 } 385 + </pre> 386 + </blockquote> 387 + 388 + <p> 389 + If this function ran concurrently with a series of updates that 390 + replaced the current structure with a new one, 391 + the fetches of <tt>gp-&gt;a</tt> 392 + and <tt>gp-&gt;b</tt> might well come from two different structures, 393 + which could cause serious confusion. 394 + To prevent this (and much else besides), <tt>do_something_gp()</tt> uses 395 + <tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>: 396 + 397 + <blockquote> 398 + <pre> 399 + 1 bool do_something_gp(void) 400 + 2 { 401 + 3 rcu_read_lock(); 402 + 4 p = rcu_dereference(gp); 403 + 5 if (p) { 404 + 6 do_something(p-&gt;a, p-&gt;b); 405 + 7 rcu_read_unlock(); 406 + 8 return true; 407 + 9 } 408 + 10 rcu_read_unlock(); 409 + 11 return false; 410 + 12 } 411 + </pre> 412 + </blockquote> 413 + 414 + <p> 415 + The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha) 416 + memory barriers in the Linux kernel. 417 + Should a 418 + <a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a> 419 + ever appear, then <tt>rcu_dereference()</tt> could be implemented 420 + as a <tt>memory_order_consume</tt> load. 421 + Regardless of the exact implementation, a pointer fetched by 422 + <tt>rcu_dereference()</tt> may not be used outside of the 423 + outermost RCU read-side critical section containing that 424 + <tt>rcu_dereference()</tt>, unless protection of 425 + the corresponding data element has been passed from RCU to some 426 + other synchronization mechanism, most commonly locking or 427 + <a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>. 
428 + 429 + <p> 430 + In short, updaters use <tt>rcu_assign_pointer()</tt> and readers 431 + use <tt>rcu_dereference()</tt>, and these two RCU API elements 432 + work together to ensure that readers have a consistent view of 433 + newly added data elements. 434 + 435 + <p> 436 + Of course, it is also necessary to remove elements from RCU-protected 437 + data structures, for example, using the following process: 438 + 439 + <ol> 440 + <li> Remove the data element from the enclosing structure. 441 + <li> Wait for all pre-existing RCU read-side critical sections 442 + to complete (because only pre-existing readers can possibly have 443 + a reference to the newly removed data element). 444 + <li> At this point, only the updater has a reference to the 445 + newly removed data element, so it can safely reclaim 446 + the data element, for example, by passing it to <tt>kfree()</tt>. 447 + </ol> 448 + 449 + This process is implemented by <tt>remove_gp_synchronous()</tt>: 450 + 451 + <blockquote> 452 + <pre> 453 + 1 bool remove_gp_synchronous(void) 454 + 2 { 455 + 3 struct foo *p; 456 + 4 457 + 5 spin_lock(&amp;gp_lock); 458 + 6 p = rcu_access_pointer(gp); 459 + 7 if (!p) { 460 + 8 spin_unlock(&amp;gp_lock); 461 + 9 return false; 462 + 10 } 463 + 11 rcu_assign_pointer(gp, NULL); 464 + 12 spin_unlock(&amp;gp_lock); 465 + 13 synchronize_rcu(); 466 + 14 kfree(p); 467 + 15 return true; 468 + 16 } 469 + </pre> 470 + </blockquote> 471 + 472 + <p> 473 + This function is straightforward, with line&nbsp;13 waiting for a grace 474 + period before line&nbsp;14 frees the old data element. 475 + This waiting ensures that readers will reach line&nbsp;7 of 476 + <tt>do_something_gp()</tt> before the data element referenced by 477 + <tt>p</tt> is freed. 478 + The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to 479 + <tt>rcu_dereference()</tt>, except that: 480 + 481 + <ol> 482 + <li> The value returned by <tt>rcu_access_pointer()</tt> 483 + cannot be dereferenced. 
484 + If you want to access the value pointed to as well as 485 + the pointer itself, use <tt>rcu_dereference()</tt> 486 + instead of <tt>rcu_access_pointer()</tt>. 487 + <li> The call to <tt>rcu_access_pointer()</tt> need not be 488 + protected. 489 + In contrast, <tt>rcu_dereference()</tt> must either be 490 + within an RCU read-side critical section or in a code 491 + segment where the pointer cannot change, for example, in 492 + code protected by the corresponding update-side lock. 493 + </ol> 494 + 495 + <p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a> 496 + Without the <tt>rcu_dereference()</tt> or the 497 + <tt>rcu_access_pointer()</tt>, what destructive optimizations 498 + might the compiler make use of? 499 + <br><a href="#qq4answer">Answer</a> 500 + 501 + <p> 502 + This simple linked-data-structure scenario clearly demonstrates the need 503 + for RCU's stringent memory-ordering guarantees on systems with more than 504 + one CPU: 505 + 506 + <ol> 507 + <li> Each CPU that has an RCU read-side critical section that 508 + begins before <tt>synchronize_rcu()</tt> starts is 509 + guaranteed to execute a full memory barrier between the time 510 + that the RCU read-side critical section ends and the time that 511 + <tt>synchronize_rcu()</tt> returns. 512 + Without this guarantee, a pre-existing RCU read-side critical section 513 + might hold a reference to the newly removed <tt>struct foo</tt> 514 + after the <tt>kfree()</tt> on line&nbsp;14 of 515 + <tt>remove_gp_synchronous()</tt>. 516 + <li> Each CPU that has an RCU read-side critical section that ends 517 + after <tt>synchronize_rcu()</tt> returns is guaranteed 518 + to execute a full memory barrier between the time that 519 + <tt>synchronize_rcu()</tt> begins and the time that the RCU 520 + read-side critical section begins. 
521 + Without this guarantee, a later RCU read-side critical section 522 + running after the <tt>kfree()</tt> on line&nbsp;14 of 523 + <tt>remove_gp_synchronous()</tt> might 524 + later run <tt>do_something_gp()</tt> and find the 525 + newly deleted <tt>struct foo</tt>. 526 + <li> If the task invoking <tt>synchronize_rcu()</tt> remains 527 + on a given CPU, then that CPU is guaranteed to execute a full 528 + memory barrier sometime during the execution of 529 + <tt>synchronize_rcu()</tt>. 530 + This guarantee ensures that the <tt>kfree()</tt> on 531 + line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does 532 + execute after the removal on line&nbsp;11. 533 + <li> If the task invoking <tt>synchronize_rcu()</tt> migrates 534 + among a group of CPUs during that invocation, then each of the 535 + CPUs in that group is guaranteed to execute a full memory barrier 536 + sometime during the execution of <tt>synchronize_rcu()</tt>. 537 + This guarantee also ensures that the <tt>kfree()</tt> on 538 + line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does 539 + execute after the removal on 540 + line&nbsp;11, but also in the case where the thread executing the 541 + <tt>synchronize_rcu()</tt> migrates in the meantime. 542 + </ol> 543 + 544 + <p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a> 545 + Given that multiple CPUs can start RCU read-side critical sections 546 + at any time without any ordering whatsoever, how can RCU possibly tell whether 547 + or not a given RCU read-side critical section starts before a 548 + given instance of <tt>synchronize_rcu()</tt>? 549 + <br><a href="#qq5answer">Answer</a> 550 + 551 + <p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a> 552 + The first and second guarantees require unbelievably strict ordering! 553 + Are all these memory barriers <i> really</i> required? 
554 + <br><a href="#qq6answer">Answer</a> 555 + 556 + <p> 557 + In short, RCU's publish-subscribe guarantee is provided by the combination 558 + of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>. 559 + This guarantee allows data elements to be safely added to RCU-protected 560 + linked data structures without disrupting RCU readers. 561 + This guarantee can be used in combination with the grace-period 562 + guarantee to also allow data elements to be removed from RCU-protected 563 + linked data structures, again without disrupting RCU readers. 564 + 565 + <p> 566 + This guarantee was only partially premeditated. 567 + DYNIX/ptx used an explicit memory barrier for publication, but had nothing 568 + resembling <tt>rcu_dereference()</tt> for subscription, nor did it 569 + have anything resembling the <tt>smp_read_barrier_depends()</tt> 570 + that was later subsumed into <tt>rcu_dereference()</tt>. 571 + The need for these operations made itself known quite suddenly at a 572 + late-1990s meeting with the DEC Alpha architects, back in the days when 573 + DEC was still a free-standing company. 574 + It took the Alpha architects a good hour to convince me that any sort 575 + of barrier would ever be needed, and it then took me a good <i>two</i> hours 576 + to convince them that their documentation did not make this point clear. 577 + More recent work with the C and C++ standards committees has provided 578 + much education on tricks and traps from the compiler. 579 + In short, compilers were much less tricky in the early 1990s, but in 580 + 2015, don't even think about omitting <tt>rcu_dereference()</tt>! 581 + 582 + <h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3> 583 + 584 + <p> 585 + The common-case RCU primitives are unconditional. 586 + They are invoked, they do their job, and they return, with no possibility 587 + of error, and no need to retry.
588 + This is a key RCU design philosophy. 589 + 590 + <p> 591 + However, this philosophy is pragmatic rather than pigheaded. 592 + If someone comes up with a good justification for a particular conditional 593 + RCU primitive, it might well be implemented and added. 594 + After all, this guarantee was reverse-engineered, not premeditated. 595 + The unconditional nature of the RCU primitives was initially an 596 + accident of implementation, and later experience with synchronization 597 + primitives with conditional primitives caused me to elevate this 598 + accident to a guarantee. 599 + Therefore, the justification for adding a conditional primitive to 600 + RCU would need to be based on detailed and compelling use cases. 601 + 602 + <h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3> 603 + 604 + <p> 605 + As far as RCU is concerned, it is always possible to carry out an 606 + update within an RCU read-side critical section. 607 + For example, that RCU read-side critical section might search for 608 + a given data element, and then might acquire the update-side 609 + spinlock in order to update that element, all while remaining 610 + in that RCU read-side critical section. 611 + Of course, it is necessary to exit the RCU read-side critical section 612 + before invoking <tt>synchronize_rcu()</tt>, however, this 613 + inconvenience can be avoided through use of the 614 + <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members 615 + described later in this document. 616 + 617 + <p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a> 618 + But how does the upgrade-to-write operation exclude other readers? 619 + <br><a href="#qq7answer">Answer</a> 620 + 621 + <p> 622 + This guarantee allows lookup code to be shared between read-side 623 + and update-side code, and was premeditated, appearing in the earliest 624 + DYNIX/ptx RCU documentation. 
625 + 626 + <h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2> 627 + 628 + <p> 629 + RCU provides extremely lightweight readers, and its read-side guarantees, 630 + though quite useful, are correspondingly lightweight. 631 + It is therefore all too easy to assume that RCU is guaranteeing more 632 + than it really is. 633 + Of course, the list of things that RCU does not guarantee is infinitely 634 + long, however, the following sections list a few non-guarantees that 635 + have caused confusion. 636 + Except where otherwise noted, these non-guarantees were premeditated. 637 + 638 + <ol> 639 + <li> <a href="#Readers Impose Minimal Ordering"> 640 + Readers Impose Minimal Ordering</a> 641 + <li> <a href="#Readers Do Not Exclude Updaters"> 642 + Readers Do Not Exclude Updaters</a> 643 + <li> <a href="#Updaters Only Wait For Old Readers"> 644 + Updaters Only Wait For Old Readers</a> 645 + <li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections"> 646 + Grace Periods Don't Partition Read-Side Critical Sections</a> 647 + <li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods"> 648 + Read-Side Critical Sections Don't Partition Grace Periods</a> 649 + <li> <a href="#Disabling Preemption Does Not Block Grace Periods"> 650 + Disabling Preemption Does Not Block Grace Periods</a> 651 + </ol> 652 + 653 + <h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3> 654 + 655 + <p> 656 + Reader-side markers such as <tt>rcu_read_lock()</tt> and 657 + <tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees 658 + except through their interaction with the grace-period APIs such as 659 + <tt>synchronize_rcu()</tt>. 
660 + To see this, consider the following pair of threads: 661 + 662 + <blockquote> 663 + <pre> 664 + 1 void thread0(void) 665 + 2 { 666 + 3 rcu_read_lock(); 667 + 4 WRITE_ONCE(x, 1); 668 + 5 rcu_read_unlock(); 669 + 6 rcu_read_lock(); 670 + 7 WRITE_ONCE(y, 1); 671 + 8 rcu_read_unlock(); 672 + 9 } 673 + 10 674 + 11 void thread1(void) 675 + 12 { 676 + 13 rcu_read_lock(); 677 + 14 r1 = READ_ONCE(y); 678 + 15 rcu_read_unlock(); 679 + 16 rcu_read_lock(); 680 + 17 r2 = READ_ONCE(x); 681 + 18 rcu_read_unlock(); 682 + 19 } 683 + </pre> 684 + </blockquote> 685 + 686 + <p> 687 + After <tt>thread0()</tt> and <tt>thread1()</tt> execute 688 + concurrently, it is quite possible to have 689 + 690 + <blockquote> 691 + <pre> 692 + (r1 == 1 &amp;&amp; r2 == 0) 693 + </pre> 694 + </blockquote> 695 + 696 + (that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>), 697 + which would not be possible if <tt>rcu_read_lock()</tt> and 698 + <tt>rcu_read_unlock()</tt> had much in the way of ordering 699 + properties. 700 + But they do not, so the CPU is within its rights 701 + to do significant reordering. 702 + This is by design: Any significant ordering constraints would slow down 703 + these fast-path APIs. 704 + 705 + <p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a> 706 + Can't the compiler also reorder this code? 707 + <br><a href="#qq8answer">Answer</a> 708 + 709 + <h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> 710 + 711 + <p> 712 + Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt> 713 + exclude updates. 714 + All they do is to prevent grace periods from ending. 715 + The following example illustrates this: 716 + 717 + <blockquote> 718 + <pre> 719 + 1 void thread0(void) 720 + 2 { 721 + 3 rcu_read_lock(); 722 + 4 r1 = READ_ONCE(y); 723 + 5 if (r1) { 724 + 6 do_something_with_nonzero_x(); 725 + 7 r2 = READ_ONCE(x); 726 + 8 WARN_ON(!r2); /* BUG!!! 
*/ 727 + 9 } 728 + 10 rcu_read_unlock(); 729 + 11 } 730 + 12 731 + 13 void thread1(void) 732 + 14 { 733 + 15 spin_lock(&amp;my_lock); 734 + 16 WRITE_ONCE(x, 1); 735 + 17 WRITE_ONCE(y, 1); 736 + 18 spin_unlock(&amp;my_lock); 737 + 19 } 738 + </pre> 739 + </blockquote> 740 + 741 + <p> 742 + If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt> 743 + excluded the <tt>thread1()</tt> function's update, 744 + the <tt>WARN_ON()</tt> could never fire. 745 + But the fact is that <tt>rcu_read_lock()</tt> does not exclude 746 + much of anything aside from subsequent grace periods, of which 747 + <tt>thread1()</tt> has none, so the 748 + <tt>WARN_ON()</tt> can and does fire. 749 + 750 + <h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3> 751 + 752 + <p> 753 + It might be tempting to assume that after <tt>synchronize_rcu()</tt> 754 + completes, there are no readers executing. 755 + This temptation must be avoided because 756 + new readers can start immediately after <tt>synchronize_rcu()</tt> 757 + starts, and <tt>synchronize_rcu()</tt> is under no 758 + obligation to wait for these new readers. 759 + 760 + <p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a> 761 + Suppose that synchronize_rcu() did wait until all readers had completed. 762 + Would the updater be able to rely on this? 763 + <br><a href="#qq9answer">Answer</a> 764 + 765 + <h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> 766 + Grace Periods Don't Partition Read-Side Critical Sections</a></h3> 767 + 768 + <p> 769 + It is tempting to assume that if any part of one RCU read-side critical 770 + section precedes a given grace period, and if any part of another RCU 771 + read-side critical section follows that same grace period, then all of 772 + the first RCU read-side critical section must precede all of the second. 773 + However, this just isn't the case: A single grace period does not 774 + partition the set of RCU read-side critical sections. 
775 + An example of this situation can be illustrated as follows, where 776 + <tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero: 777 + 778 + <blockquote> 779 + <pre> 780 + 1 void thread0(void) 781 + 2 { 782 + 3 rcu_read_lock(); 783 + 4 WRITE_ONCE(a, 1); 784 + 5 WRITE_ONCE(b, 1); 785 + 6 rcu_read_unlock(); 786 + 7 } 787 + 8 788 + 9 void thread1(void) 789 + 10 { 790 + 11 r1 = READ_ONCE(a); 791 + 12 synchronize_rcu(); 792 + 13 WRITE_ONCE(c, 1); 793 + 14 } 794 + 15 795 + 16 void thread2(void) 796 + 17 { 797 + 18 rcu_read_lock(); 798 + 19 r2 = READ_ONCE(b); 799 + 20 r3 = READ_ONCE(c); 800 + 21 rcu_read_unlock(); 801 + 22 } 802 + </pre> 803 + </blockquote> 804 + 805 + <p> 806 + It turns out that the outcome: 807 + 808 + <blockquote> 809 + <pre> 810 + (r1 == 1 &amp;&amp; r2 == 0 &amp;&amp; r3 == 1) 811 + </pre> 812 + </blockquote> 813 + 814 + is entirely possible. 815 + The following figure shows how this can happen, with each circled 816 + <tt>QS</tt> indicating the point at which RCU recorded a 817 + <i>quiescent state</i> for each thread, that is, a state in which 818 + RCU knows that the thread cannot be in the midst of an RCU read-side 819 + critical section that started before the current grace period: 820 + 821 + <p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p> 822 + 823 + <p> 824 + If it is necessary to partition RCU read-side critical sections in this 825 + manner, it is necessary to use two grace periods, where the first 826 + grace period is known to end before the second grace period starts: 827 + 828 + <blockquote> 829 + <pre> 830 + 1 void thread0(void) 831 + 2 { 832 + 3 rcu_read_lock(); 833 + 4 WRITE_ONCE(a, 1); 834 + 5 WRITE_ONCE(b, 1); 835 + 6 rcu_read_unlock(); 836 + 7 } 837 + 8 838 + 9 void thread1(void) 839 + 10 { 840 + 11 r1 = READ_ONCE(a); 841 + 12 synchronize_rcu(); 842 + 13 WRITE_ONCE(c, 1); 843 + 14 } 844 + 15 845 + 16 void thread2(void) 846 + 17 { 847 + 18 r2 = READ_ONCE(c); 848 + 19 
synchronize_rcu(); 849 + 20 WRITE_ONCE(d, 1); 850 + 21 } 851 + 22 852 + 23 void thread3(void) 853 + 24 { 854 + 25 rcu_read_lock(); 855 + 26 r3 = READ_ONCE(b); 856 + 27 r4 = READ_ONCE(d); 857 + 28 rcu_read_unlock(); 858 + 29 } 859 + </pre> 860 + </blockquote> 861 + 862 + <p> 863 + Here, if <tt>(r1 == 1)</tt>, then 864 + <tt>thread0()</tt>'s write to <tt>b</tt> must happen 865 + before the end of <tt>thread1()</tt>'s grace period. 866 + If in addition <tt>(r4 == 1)</tt>, then 867 + <tt>thread3()</tt>'s read from <tt>b</tt> must happen 868 + after the beginning of <tt>thread2()</tt>'s grace period. 869 + If it is also the case that <tt>(r2 == 1)</tt>, then the 870 + end of <tt>thread1()</tt>'s grace period must precede the 871 + beginning of <tt>thread2()</tt>'s grace period. 872 + This means that the two RCU read-side critical sections cannot overlap, 873 + guaranteeing that <tt>(r3 == 1)</tt>. 874 + As a result, the outcome: 875 + 876 + <blockquote> 877 + <pre> 878 + (r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 0 &amp;&amp; r4 == 1) 879 + </pre> 880 + </blockquote> 881 + 882 + cannot happen. 883 + 884 + <p> 885 + This non-requirement was also non-premeditated, but became apparent 886 + when studying RCU's interaction with memory ordering. 887 + 888 + <h3><a name="Read-Side Critical Sections Don't Partition Grace Periods"> 889 + Read-Side Critical Sections Don't Partition Grace Periods</a></h3> 890 + 891 + <p> 892 + It is also tempting to assume that if an RCU read-side critical section 893 + happens between a pair of grace periods, then those grace periods cannot 894 + overlap.
895 + However, this temptation leads nowhere good, as can be illustrated by 896 + the following, with all variables initially zero: 897 + 898 + <blockquote> 899 + <pre> 900 + 1 void thread0(void) 901 + 2 { 902 + 3 rcu_read_lock(); 903 + 4 WRITE_ONCE(a, 1); 904 + 5 WRITE_ONCE(b, 1); 905 + 6 rcu_read_unlock(); 906 + 7 } 907 + 8 908 + 9 void thread1(void) 909 + 10 { 910 + 11 r1 = READ_ONCE(a); 911 + 12 synchronize_rcu(); 912 + 13 WRITE_ONCE(c, 1); 913 + 14 } 914 + 15 915 + 16 void thread2(void) 916 + 17 { 917 + 18 rcu_read_lock(); 918 + 19 WRITE_ONCE(d, 1); 919 + 20 r2 = READ_ONCE(c); 920 + 21 rcu_read_unlock(); 921 + 22 } 922 + 23 923 + 24 void thread3(void) 924 + 25 { 925 + 26 r3 = READ_ONCE(d); 926 + 27 synchronize_rcu(); 927 + 28 WRITE_ONCE(e, 1); 928 + 29 } 929 + 30 930 + 31 void thread4(void) 931 + 32 { 932 + 33 rcu_read_lock(); 933 + 34 r4 = READ_ONCE(b); 934 + 35 r5 = READ_ONCE(e); 935 + 36 rcu_read_unlock(); 936 + 37 } 937 + </pre> 938 + </blockquote> 939 + 940 + <p> 941 + In this case, the outcome: 942 + 943 + <blockquote> 944 + <pre> 945 + (r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 1 &amp;&amp; r4 == 0 &amp;&amp; r5 == 1) 946 + </pre> 947 + </blockquote> 948 + 949 + is entirely possible, as illustrated below: 950 + 951 + <p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p> 952 + 953 + <p> 954 + Again, an RCU read-side critical section can overlap almost all of a 955 + given grace period, just so long as it does not overlap the entire 956 + grace period. 957 + As a result, an RCU read-side critical section cannot partition a pair 958 + of RCU grace periods. 959 + 960 + <p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a> 961 + How long a sequence of grace periods, each separated by an RCU read-side 962 + critical section, would be required to partition the RCU read-side 963 + critical sections at the beginning and end of the chain?
964 + <br><a href="#qq10answer">Answer</a> 965 + 966 + <h3><a name="Disabling Preemption Does Not Block Grace Periods"> 967 + Disabling Preemption Does Not Block Grace Periods</a></h3> 968 + 969 + <p> 970 + There was a time when disabling preemption on any given CPU would block 971 + subsequent grace periods. 972 + However, this was an accident of implementation and is not a requirement. 973 + And in the current Linux-kernel implementation, disabling preemption 974 + on a given CPU in fact does not block grace periods, as Oleg Nesterov 975 + <a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>. 976 + 977 + <p> 978 + If you need a preempt-disable region to block grace periods, you need to add 979 + <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example 980 + as follows: 981 + 982 + <blockquote> 983 + <pre> 984 + 1 preempt_disable(); 985 + 2 rcu_read_lock(); 986 + 3 do_something(); 987 + 4 rcu_read_unlock(); 988 + 5 preempt_enable(); 989 + 6 990 + 7 /* Spinlocks implicitly disable preemption. */ 991 + 8 spin_lock(&amp;mylock); 992 + 9 rcu_read_lock(); 993 + 10 do_something(); 994 + 11 rcu_read_unlock(); 995 + 12 spin_unlock(&amp;mylock); 996 + </pre> 997 + </blockquote> 998 + 999 + <p> 1000 + In theory, you could enter the RCU read-side critical section first, 1001 + but it is more efficient to keep the entire RCU read-side critical 1002 + section contained in the preempt-disable region as shown above. 1003 + Of course, RCU read-side critical sections that extend outside of 1004 + preempt-disable regions will work correctly, but such critical sections 1005 + can be preempted, which forces <tt>rcu_read_unlock()</tt> to do 1006 + more work. 1007 + And no, this is <i>not</i> an invitation to enclose all of your RCU 1008 + read-side critical sections within preempt-disable regions, because 1009 + doing so would degrade real-time response. 1010 + 1011 + <p> 1012 + This non-requirement appeared with preemptible RCU. 
1013 + If you need a grace period that waits on non-preemptible code regions, use 1014 + <a href="#Sched Flavor">RCU-sched</a>. 1015 + 1016 + <h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2> 1017 + 1018 + <p> 1019 + These parallelism facts of life are by no means specific to RCU, but 1020 + the RCU implementation must abide by them. 1021 + They therefore bear repeating: 1022 + 1023 + <ol> 1024 + <li> Any CPU or task may be delayed at any time, 1025 + and any attempts to avoid these delays by disabling 1026 + preemption, interrupts, or whatever are completely futile. 1027 + This is most obvious in preemptible user-level 1028 + environments and in virtualized environments (where 1029 + a given guest OS's VCPUs can be preempted at any time by 1030 + the underlying hypervisor), but can also happen in bare-metal 1031 + environments due to ECC errors, NMIs, and other hardware 1032 + events. 1033 + Although a delay of more than about 20 seconds can result 1034 + in splats, the RCU implementation is obligated to use 1035 + algorithms that can tolerate extremely long delays, but where 1036 + &ldquo;extremely long&rdquo; is not long enough to allow 1037 + wrap-around when incrementing a 64-bit counter. 1038 + <li> Both the compiler and the CPU can reorder memory accesses. 1039 + Where it matters, RCU must use compiler directives and 1040 + memory-barrier instructions to preserve ordering. 1041 + <li> Conflicting writes to memory locations in any given cache line 1042 + will result in expensive cache misses. 1043 + Greater numbers of concurrent writes and more-frequent 1044 + concurrent writes will result in more dramatic slowdowns. 1045 + RCU is therefore obligated to use algorithms that have 1046 + sufficient locality to avoid significant performance and 1047 + scalability problems. 1048 + <li> As a rough rule of thumb, only one CPU's worth of processing 1049 + may be carried out under the protection of any given exclusive 1050 + lock. 
1051 + RCU must therefore use scalable locking designs. 1052 + <li> Counters are finite, especially on 32-bit systems. 1053 + RCU's use of counters must therefore tolerate counter wrap, 1054 + or be designed such that counter wrap would take way more 1055 + time than a single system is likely to run. 1056 + An uptime of ten years is quite possible, a runtime 1057 + of a century much less so. 1058 + As an example of the latter, RCU's dyntick-idle nesting counter 1059 + allows 54 bits for interrupt nesting level (this counter 1060 + is 64 bits even on a 32-bit system). 1061 + Overflowing this counter requires 2<sup>54</sup> 1062 + half-interrupts on a given CPU without that CPU ever going idle. 1063 + If a half-interrupt happened every microsecond, it would take 1064 + 570 years of runtime to overflow this counter, which is currently 1065 + believed to be an acceptably long time. 1066 + <li> Linux systems can have thousands of CPUs running a single 1067 + Linux kernel in a single shared-memory environment. 1068 + RCU must therefore pay close attention to high-end scalability. 1069 + </ol> 1070 + 1071 + <p> 1072 + This last parallelism fact of life means that RCU must pay special 1073 + attention to the preceding facts of life. 1074 + The idea that Linux might scale to systems with thousands of CPUs would 1075 + have been met with some skepticism in the 1990s, but these requirements 1076 + would otherwise have been unsurprising, even in the early 1990s. 1077 + 1078 + <h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2> 1079 + 1080 + <p> 1081 + These sections list quality-of-implementation requirements. 1082 + Although an RCU implementation that ignores these requirements could 1083 + still be used, it would likely be subject to limitations that would 1084 + make it inappropriate for industrial-strength production use.
1085 + Classes of quality-of-implementation requirements are as follows: 1086 + 1087 + <ol> 1088 + <li> <a href="#Specialization">Specialization</a> 1089 + <li> <a href="#Performance and Scalability">Performance and Scalability</a> 1090 + <li> <a href="#Composability">Composability</a> 1091 + <li> <a href="#Corner Cases">Corner Cases</a> 1092 + </ol> 1093 + 1094 + <p> 1095 + These classes are covered in the following sections. 1096 + 1097 + <h3><a name="Specialization">Specialization</a></h3> 1098 + 1099 + <p> 1100 + RCU is and always has been intended primarily for read-mostly situations, as 1101 + illustrated by the following figure. 1102 + This means that RCU's read-side primitives are optimized, often at the 1103 + expense of its update-side primitives. 1104 + 1105 + <p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> 1106 + 1107 + <p> 1108 + This focus on read-mostly situations means that RCU must interoperate 1109 + with other synchronization primitives. 1110 + For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt> 1111 + examples discussed earlier use RCU to protect readers and locking to 1112 + coordinate updaters. 1113 + However, the need extends much farther, requiring that a variety of 1114 + synchronization primitives be legal within RCU read-side critical sections, 1115 + including spinlocks, sequence locks, atomic operations, reference 1116 + counters, and memory barriers. 1117 + 1118 + <p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a> 1119 + What about sleeping locks? 1120 + <br><a href="#qq11answer">Answer</a> 1121 + 1122 + <p> 1123 + It often comes as a surprise that many algorithms do not require a 1124 + consistent view of data, but many can function in that mode, 1125 + with network routing being the poster child.
1126 + Internet routing algorithms take significant time to propagate 1127 + updates, so that by the time an update arrives at a given system, 1128 + that system has been sending network traffic the wrong way for 1129 + a considerable length of time. 1130 + Having a few threads continue to send traffic the wrong way for a 1131 + few more milliseconds is clearly not a problem: In the worst case, 1132 + TCP retransmissions will eventually get the data where it needs to go. 1133 + In general, when tracking the state of the universe outside of the 1134 + computer, some level of inconsistency must be tolerated due to 1135 + speed-of-light delays if nothing else. 1136 + 1137 + <p> 1138 + Furthermore, uncertainty about external state is inherent in many cases. 1139 + For example, a pair of veterinarians might use heartbeat to determine 1140 + whether or not a given cat was alive. 1141 + But how long should they wait after the last heartbeat to decide that 1142 + the cat is in fact dead? 1143 + Waiting less than 400 milliseconds makes no sense because this would 1144 + mean that a relaxed cat would be considered to cycle between death 1145 + and life more than 100 times per minute. 1146 + Moreover, just as with human beings, a cat's heart might stop for 1147 + some period of time, so the exact wait period is a judgment call. 1148 + One of our pair of veterinarians might wait 30 seconds before pronouncing 1149 + the cat dead, while the other might insist on waiting a full minute. 1150 + The two veterinarians would then disagree on the state of the cat during 1151 + the final 30 seconds of the minute following the last heartbeat, as 1152 + fancifully illustrated below: 1153 + 1154 + <p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> 1155 + 1156 + <p> 1157 + Interestingly enough, this same situation applies to hardware. 1158 + When push comes to shove, how do we tell whether or not some 1159 + external server has failed?
1160 + We send messages to it periodically, and declare it failed if we 1161 + don't receive a response within a given period of time. 1162 + Policy decisions can usually tolerate short 1163 + periods of inconsistency. 1164 + The policy was decided some time ago, and is only now being put into 1165 + effect, so a few milliseconds of delay is normally inconsequential. 1166 + 1167 + <p> 1168 + However, there are algorithms that absolutely must see consistent data. 1169 + For example, the translation between a user-level SystemV semaphore 1170 + ID to the corresponding in-kernel data structure is protected by RCU, 1171 + but it is absolutely forbidden to update a semaphore that has just been 1172 + removed. 1173 + In the Linux kernel, this need for consistency is accommodated by acquiring 1174 + spinlocks located in the in-kernel data structure from within 1175 + the RCU read-side critical section, and this is indicated by the 1176 + green box in the figure above. 1177 + Many other techniques may be used, and are in fact used within the 1178 + Linux kernel. 1179 + 1180 + <p> 1181 + In short, RCU is not required to maintain consistency, and other 1182 + mechanisms may be used in concert with RCU when consistency is required. 1183 + RCU's specialization allows it to do its job extremely well, and its 1184 + ability to interoperate with other synchronization mechanisms allows 1185 + the right mix of synchronization tools to be used for a given job. 1186 + 1187 + <h3><a name="Performance and Scalability">Performance and Scalability</a></h3> 1188 + 1189 + <p> 1190 + Energy efficiency is a critical component of performance today, 1191 + and Linux-kernel RCU implementations must therefore avoid unnecessarily 1192 + awakening idle CPUs. 1193 + I cannot claim that this requirement was premeditated. 
1194 + In fact, I learned of it during a telephone conversation in which I 1195 + was given &ldquo;frank and open&rdquo; feedback on the importance 1196 + of energy efficiency in battery-powered systems and on specific 1197 + energy-efficiency shortcomings of the Linux-kernel RCU implementation. 1198 + In my experience, the battery-powered embedded community will consider 1199 + any unnecessary wakeups to be extremely unfriendly acts. 1200 + So much so that mere Linux-kernel-mailing-list posts are 1201 + insufficient to vent their ire. 1202 + 1203 + <p> 1204 + Memory consumption is not particularly important in most 1205 + situations, and has become decreasingly 1206 + so as memory sizes have expanded and memory 1207 + costs have plummeted. 1208 + However, as I learned from Matt Mackall's 1209 + <a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a> 1210 + efforts, memory footprint is critically important on single-CPU systems with 1211 + non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus 1212 + <a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a> 1213 + was born. 1214 + Josh Triplett has since taken over the small-memory banner with his 1215 + <a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a> 1216 + project, which resulted in 1217 + <a href="#Sleepable RCU">SRCU</a> 1218 + becoming optional for those kernels not needing it. 1219 + 1220 + <p> 1221 + The remaining performance requirements are, for the most part, 1222 + unsurprising. 1223 + For example, in keeping with RCU's read-side specialization, 1224 + <tt>rcu_dereference()</tt> should have negligible overhead (for 1225 + example, suppression of a few minor compiler optimizations). 1226 + Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and 1227 + <tt>rcu_read_unlock()</tt> should have exactly zero overhead.
1228 + 1229 + <p> 1230 + In preemptible environments, in the case where the RCU read-side 1231 + critical section was not preempted (as will be the case for the 1232 + highest-priority real-time process), <tt>rcu_read_lock()</tt> and 1233 + <tt>rcu_read_unlock()</tt> should have minimal overhead. 1234 + In particular, they should not contain atomic read-modify-write 1235 + operations, memory-barrier instructions, preemption disabling, 1236 + interrupt disabling, or backwards branches. 1237 + However, in the case where the RCU read-side critical section was preempted, 1238 + <tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts. 1239 + This is why it is better to nest an RCU read-side critical section 1240 + within a preempt-disable region than vice versa, at least in cases 1241 + where that critical section is short enough to avoid unduly degrading 1242 + real-time latencies. 1243 + 1244 + <p> 1245 + The <tt>synchronize_rcu()</tt> grace-period-wait primitive is 1246 + optimized for throughput. 1247 + It may therefore incur several milliseconds of latency in addition to 1248 + the duration of the longest RCU read-side critical section. 1249 + On the other hand, multiple concurrent invocations of 1250 + <tt>synchronize_rcu()</tt> are required to use batching optimizations 1251 + so that they can be satisfied by a single underlying grace-period-wait 1252 + operation. 1253 + For example, in the Linux kernel, it is not unusual for a single 1254 + grace-period-wait operation to serve more than 1255 + <a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a> 1256 + of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation 1257 + overhead down to nearly zero. 1258 + However, the grace-period optimization is also required to avoid 1259 + measurable degradation of real-time scheduling and interrupt latencies. 
1260 + 1261 + <p> 1262 + In some cases, the multi-millisecond <tt>synchronize_rcu()</tt> 1263 + latencies are unacceptable. 1264 + In these cases, <tt>synchronize_rcu_expedited()</tt> may be used 1265 + instead, reducing the grace-period latency down to a few tens of 1266 + microseconds on small systems, at least in cases where the RCU read-side 1267 + critical sections are short. 1268 + There are currently no special latency requirements for 1269 + <tt>synchronize_rcu_expedited()</tt> on large systems, but, 1270 + consistent with the empirical nature of the RCU specification, 1271 + that is subject to change. 1272 + However, there most definitely are scalability requirements: 1273 + A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096 1274 + CPUs should at least make reasonable forward progress. 1275 + In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> 1276 + is permitted to impose modest degradation of real-time latency 1277 + on non-idle online CPUs. 1278 + That said, it will likely be necessary to take further steps to reduce this 1279 + degradation, hopefully to roughly that of a scheduling-clock interrupt. 1280 + 1281 + <p> 1282 + There are a number of situations where even 1283 + <tt>synchronize_rcu_expedited()</tt>'s reduced grace-period 1284 + latency is unacceptable. 
1285 + In these situations, the asynchronous <tt>call_rcu()</tt> can be 1286 + used in place of <tt>synchronize_rcu()</tt> as follows: 1287 + 1288 + <blockquote> 1289 + <pre> 1290 + 1 struct foo { 1291 + 2 int a; 1292 + 3 int b; 1293 + 4 struct rcu_head rh; 1294 + 5 }; 1295 + 6 1296 + 7 static void remove_gp_cb(struct rcu_head *rhp) 1297 + 8 { 1298 + 9 struct foo *p = container_of(rhp, struct foo, rh); 1299 + 10 1300 + 11 kfree(p); 1301 + 12 } 1302 + 13 1303 + 14 bool remove_gp_asynchronous(void) 1304 + 15 { 1305 + 16 struct foo *p; 1306 + 17 1307 + 18 spin_lock(&amp;gp_lock); 1308 + 19 p = rcu_access_pointer(gp); 1309 + 20 if (!p) { 1310 + 21 spin_unlock(&amp;gp_lock); 1311 + 22 return false; 1312 + 23 } 1313 + 24 rcu_assign_pointer(gp, NULL); 1314 + 25 call_rcu(&amp;p-&gt;rh, remove_gp_cb); 1315 + 26 spin_unlock(&amp;gp_lock); 1316 + 27 return true; 1317 + 28 } 1318 + </pre> 1319 + </blockquote> 1320 + 1321 + <p> 1322 + A definition of <tt>struct foo</tt> is finally needed, and appears 1323 + on lines&nbsp;1-5. 1324 + The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt> 1325 + on line&nbsp;25, and will be invoked after the end of a subsequent 1326 + grace period. 1327 + This gets the same effect as <tt>remove_gp_synchronous()</tt>, 1328 + but without forcing the updater to wait for a grace period to elapse. 1329 + The <tt>call_rcu()</tt> function may be used in a number of 1330 + situations where neither <tt>synchronize_rcu()</tt> nor 1331 + <tt>synchronize_rcu_expedited()</tt> would be legal, 1332 + including within preempt-disable code, <tt>local_bh_disable()</tt> code, 1333 + interrupt-disable code, and interrupt handlers. 1334 + However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
1335 + The callback function (<tt>remove_gp_cb()</tt> in this case) will be 1336 + executed within softirq (software interrupt) environment within the 1337 + Linux kernel, 1338 + either within a real softirq handler or under the protection 1339 + of <tt>local_bh_disable()</tt>. 1340 + In both the Linux kernel and in userspace, it is bad practice to 1341 + write an RCU callback function that takes too long. 1342 + Long-running operations should be relegated to separate threads or 1343 + (in the Linux kernel) workqueues. 1344 + 1345 + <p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a> 1346 + Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>? 1347 + After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the 1348 + structure, which would interact badly with concurrent insertions. 1349 + Doesn't this mean that <tt>rcu_dereference()</tt> is required? 1350 + <br><a href="#qq12answer">Answer</a> 1351 + 1352 + <p> 1353 + However, all that <tt>remove_gp_cb()</tt> is doing is 1354 + invoking <tt>kfree()</tt> on the data element. 1355 + This is a common idiom, and is supported by <tt>kfree_rcu()</tt>, 1356 + which allows &ldquo;fire and forget&rdquo; operation as shown below: 1357 + 1358 + <blockquote> 1359 + <pre> 1360 + 1 struct foo { 1361 + 2 int a; 1362 + 3 int b; 1363 + 4 struct rcu_head rh; 1364 + 5 }; 1365 + 6 1366 + 7 bool remove_gp_faf(void) 1367 + 8 { 1368 + 9 struct foo *p; 1369 + 10 1370 + 11 spin_lock(&amp;gp_lock); 1371 + 12 p = rcu_dereference(gp); 1372 + 13 if (!p) { 1373 + 14 spin_unlock(&amp;gp_lock); 1374 + 15 return false; 1375 + 16 } 1376 + 17 rcu_assign_pointer(gp, NULL); 1377 + 18 kfree_rcu(p, rh); 1378 + 19 spin_unlock(&amp;gp_lock); 1379 + 20 return true; 1380 + 21 } 1381 + </pre> 1382 + </blockquote> 1383 + 1384 + <p> 1385 + Note that <tt>remove_gp_faf()</tt> simply invokes 1386 + <tt>kfree_rcu()</tt> and proceeds, without any need to pay any 1387 + further attention to the subsequent grace period and <tt>kfree()</tt>. 
1388 + It is permissible to invoke <tt>kfree_rcu()</tt> from the same 1389 + environments as for <tt>call_rcu()</tt>. 1390 + Interestingly enough, DYNIX/ptx had the equivalents of 1391 + <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not 1392 + <tt>synchronize_rcu()</tt>. 1393 + This was due to the fact that RCU was not heavily used within DYNIX/ptx, 1394 + so the very few places that needed something like 1395 + <tt>synchronize_rcu()</tt> simply open-coded it. 1396 + 1397 + <p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a> 1398 + Earlier it was claimed that <tt>call_rcu()</tt> and 1399 + <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked 1400 + by readers. 1401 + But how can that be correct, given that the invocation of the callback 1402 + and the freeing of the memory (respectively) must still wait for 1403 + a grace period to elapse? 1404 + <br><a href="#qq13answer">Answer</a> 1405 + 1406 + <p> 1407 + But what if the updater must wait for the completion of code to be 1408 + executed after the end of the grace period, but has other tasks 1409 + that can be carried out in the meantime? 
1410 + The polling-style <tt>get_state_synchronize_rcu()</tt> and 1411 + <tt>cond_synchronize_rcu()</tt> functions may be used for this 1412 + purpose, as shown below: 1413 + 1414 + <blockquote> 1415 + <pre> 1416 + 1 bool remove_gp_poll(void) 1417 + 2 { 1418 + 3 struct foo *p; 1419 + 4 unsigned long s; 1420 + 5 1421 + 6 spin_lock(&amp;gp_lock); 1422 + 7 p = rcu_access_pointer(gp); 1423 + 8 if (!p) { 1424 + 9 spin_unlock(&amp;gp_lock); 1425 + 10 return false; 1426 + 11 } 1427 + 12 rcu_assign_pointer(gp, NULL); 1428 + 13 spin_unlock(&amp;gp_lock); 1429 + 14 s = get_state_synchronize_rcu(); 1430 + 15 do_something_while_waiting(); 1431 + 16 cond_synchronize_rcu(s); 1432 + 17 kfree(p); 1433 + 18 return true; 1434 + 19 } 1435 + </pre> 1436 + </blockquote> 1437 + 1438 + <p> 1439 + On line&nbsp;14, <tt>get_state_synchronize_rcu()</tt> obtains a 1440 + &ldquo;cookie&rdquo; from RCU, 1441 + then line&nbsp;15 carries out other tasks, 1442 + and finally, line&nbsp;16 returns immediately if a grace period has 1443 + elapsed in the meantime, but otherwise waits as required. 1444 + The need for <tt>get_state_synchronize_rcu</tt> and 1445 + <tt>cond_synchronize_rcu()</tt> has appeared quite recently, 1446 + so it is too early to tell whether they will stand the test of time. 1447 + 1448 + <p> 1449 + RCU thus provides a range of tools to allow updaters to strike the 1450 + required tradeoff between latency, flexibility and CPU overhead. 1451 + 1452 + <h3><a name="Composability">Composability</a></h3> 1453 + 1454 + <p> 1455 + Composability has received much attention in recent years, perhaps in part 1456 + due to the collision of multicore hardware with object-oriented techniques 1457 + designed in single-threaded environments for single-threaded use. 1458 + And in theory, RCU read-side critical sections may be composed, and in 1459 + fact may be nested arbitrarily deeply. 
1460 + In practice, as with all real-world implementations of composable 1461 + constructs, there are limitations. 1462 + 1463 + <p> 1464 + Implementations of RCU for which <tt>rcu_read_lock()</tt> 1465 + and <tt>rcu_read_unlock()</tt> generate no code, such as 1466 + Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be 1467 + nested arbitrarily deeply. 1468 + After all, there is no overhead. 1469 + Except that if all these instances of <tt>rcu_read_lock()</tt> 1470 + and <tt>rcu_read_unlock()</tt> are visible to the compiler, 1471 + compilation will eventually fail due to exhausting memory, 1472 + mass storage, or user patience, whichever comes first. 1473 + If the nesting is not visible to the compiler, as is the case with 1474 + mutually recursive functions each in its own translation unit, 1475 + stack overflow will result. 1476 + If the nesting takes the form of loops, either the control variable 1477 + will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. 1478 + Nevertheless, this class of RCU implementations is one 1479 + of the most composable constructs in existence. 1480 + 1481 + <p> 1482 + RCU implementations that explicitly track nesting depth 1483 + are limited by the nesting-depth counter. 1484 + For example, the Linux kernel's preemptible RCU limits nesting to 1485 + <tt>INT_MAX</tt>. 1486 + This should suffice for almost all practical purposes. 1487 + That said, a consecutive pair of RCU read-side critical sections 1488 + between which there is an operation that waits for a grace period 1489 + cannot be enclosed in another RCU read-side critical section. 1490 + This is because it is not legal to wait for a grace period within 1491 + an RCU read-side critical section: To do so would result either 1492 + in deadlock or 1493 + in RCU implicitly splitting the enclosing RCU read-side critical 1494 + section, neither of which is conducive to a long-lived and prosperous 1495 + kernel. 
1496 + 1497 + <p> 1498 + In short, although RCU read-side critical sections are highly composable, 1499 + care is required in some situations, just as is the case for any other 1500 + composable synchronization mechanism. 1501 + 1502 + <h3><a name="Corner Cases">Corner Cases</a></h3> 1503 + 1504 + <p> 1505 + A given RCU workload might have an endless and intense stream of 1506 + RCU read-side critical sections, perhaps even so intense that there 1507 + was never a point in time during which there was not at least one 1508 + RCU read-side critical section in flight. 1509 + RCU cannot allow this situation to block grace periods: As long as 1510 + all the RCU read-side critical sections are finite, grace periods 1511 + must also be finite. 1512 + 1513 + <p> 1514 + That said, preemptible RCU implementations could potentially result 1515 + in RCU read-side critical sections being preempted for long durations, 1516 + which has the effect of creating a long-duration RCU read-side 1517 + critical section. 1518 + This situation can arise only in heavily loaded systems, but systems using 1519 + real-time priorities are of course more vulnerable. 1520 + Therefore, RCU priority boosting is provided to help deal with this 1521 + case. 1522 + That said, the exact requirements on RCU priority boosting will likely 1523 + evolve as more experience accumulates. 1524 + 1525 + <p> 1526 + Other workloads might have very high update rates. 1527 + Although one can argue that such workloads should instead use 1528 + something other than RCU, the fact remains that RCU must 1529 + handle such workloads gracefully. 1530 + This requirement is another factor driving batching of grace periods, 1531 + but it is also the driving force behind the checks for large numbers 1532 + of queued RCU callbacks in the <tt>call_rcu()</tt> code path. 
1533 + Finally, high update rates should not delay RCU read-side critical 1534 + sections, although some read-side delays can occur when using 1535 + <tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use 1536 + of <tt>try_stop_cpus()</tt>. 1537 + (In the future, <tt>synchronize_rcu_expedited()</tt> will be 1538 + converted to use lighter-weight inter-processor interrupts (IPIs), 1539 + but this will still disturb readers, though to a much smaller degree.) 1540 + 1541 + <p> 1542 + Although all three of these corner cases were understood in the early 1543 + 1990s, a simple user-level test consisting of <tt>close(open(path))</tt> 1544 + in a tight loop 1545 + in the early 2000s suddenly provided a much deeper appreciation of the 1546 + high-update-rate corner case. 1547 + This test also motivated addition of some RCU code to react to high update 1548 + rates, for example, if a given CPU finds itself with more than 10,000 1549 + RCU callbacks queued, it will cause RCU to take evasive action by 1550 + more aggressively starting grace periods and more aggressively forcing 1551 + completion of grace-period processing. 1552 + This evasive action causes the grace period to complete more quickly, 1553 + but at the cost of restricting RCU's batching optimizations, thus 1554 + increasing the CPU overhead incurred by that grace period. 1555 + 1556 + <h2><a name="Software-Engineering Requirements"> 1557 + Software-Engineering Requirements</a></h2> 1558 + 1559 + <p> 1560 + Between Murphy's Law and &ldquo;To err is human&rdquo;, it is necessary to 1561 + guard against mishaps and misuse: 1562 + 1563 + <ol> 1564 + <li> It is all too easy to forget to use <tt>rcu_read_lock()</tt> 1565 + everywhere that it is needed, so kernels built with 1566 + <tt>CONFIG_PROVE_RCU=y</tt> will splat if 1567 + <tt>rcu_dereference()</tt> is used outside of an 1568 + RCU read-side critical section.
1569 + Update-side code can use <tt>rcu_dereference_protected()</tt>, 1570 + which takes a 1571 + <a href="https://lwn.net/Articles/371986/">lockdep expression</a> 1572 + to indicate what is providing the protection. 1573 + If the indicated protection is not provided, a lockdep splat 1574 + is emitted. 1575 + 1576 + <p> 1577 + Code shared between readers and updaters can use 1578 + <tt>rcu_dereference_check()</tt>, which also takes a 1579 + lockdep expression, and emits a lockdep splat if neither 1580 + <tt>rcu_read_lock()</tt> nor the indicated protection 1581 + is in place. 1582 + In addition, <tt>rcu_dereference_raw()</tt> is used in those 1583 + (hopefully rare) cases where the required protection cannot 1584 + be easily described. 1585 + Finally, <tt>rcu_read_lock_held()</tt> is provided to 1586 + allow a function to verify that it has been invoked within 1587 + an RCU read-side critical section. 1588 + I was made aware of this set of requirements shortly after Thomas 1589 + Gleixner audited a number of RCU uses. 1590 + <li> A given function might wish to check for RCU-related preconditions 1591 + upon entry, before using any other RCU API. 1592 + The <tt>rcu_lockdep_assert()</tt> does this job, 1593 + asserting the expression in kernels having lockdep enabled 1594 + and doing nothing otherwise. 1595 + <li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt> 1596 + and <tt>rcu_dereference()</tt>, perhaps (incorrectly) 1597 + substituting a simple assignment. 1598 + To catch this sort of error, a given RCU-protected pointer may be 1599 + tagged with <tt>__rcu</tt>, after which running sparse 1600 + with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain 1601 + about simple-assignment accesses to that pointer. 1602 + Arnd Bergmann made me aware of this requirement, and also 1603 + supplied the needed 1604 + <a href="https://lwn.net/Articles/376011/">patch series</a>. 
1605 + <li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt> 1606 + will splat if a data element is passed to <tt>call_rcu()</tt> 1607 + twice in a row, without a grace period in between. 1608 + (This error is similar to a double free.) 1609 + The corresponding <tt>rcu_head</tt> structures that are 1610 + dynamically allocated are automatically tracked, but 1611 + <tt>rcu_head</tt> structures allocated on the stack 1612 + must be initialized with <tt>init_rcu_head_on_stack()</tt> 1613 + and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>. 1614 + Similarly, statically allocated non-stack <tt>rcu_head</tt> 1615 + structures must be initialized with <tt>init_rcu_head()</tt> 1616 + and cleaned up with <tt>destroy_rcu_head()</tt>. 1617 + Mathieu Desnoyers made me aware of this requirement, and also 1618 + supplied the needed 1619 + <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>. 1620 + <li> An infinite loop in an RCU read-side critical section will 1621 + eventually trigger an RCU CPU stall warning splat. 1622 + However, RCU is not obligated to produce this splat 1623 + unless there is a grace period waiting on that particular 1624 + RCU read-side critical section. 1625 + This requirement made itself known in the early 1990s, pretty 1626 + much the first time that it was necessary to debug a CPU stall. 1627 + <li> Although it would be very good to detect pointers leaking out 1628 + of RCU read-side critical sections, there is currently no 1629 + good way of doing this. 1630 + One complication is the need to distinguish between pointers 1631 + leaking and pointers that have been handed off from RCU to 1632 + some other synchronization mechanism, for example, reference 1633 + counting. 1634 + <li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related 1635 + information is provided via both debugfs and event tracing. 
1636 + <li> Open-coded use of <tt>rcu_assign_pointer()</tt> and 1637 + <tt>rcu_dereference()</tt> to create typical linked 1638 + data structures can be surprisingly error-prone. 1639 + Therefore, RCU-protected 1640 + <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a> 1641 + and, more recently, RCU-protected 1642 + <a href="https://lwn.net/Articles/612100/">hash tables</a> 1643 + are available. 1644 + Many other special-purpose RCU-protected data structures are 1645 + available in the Linux kernel and the userspace RCU library. 1646 + <li> Some linked structures are created at compile time, but still 1647 + require <tt>__rcu</tt> checking. 1648 + The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this 1649 + purpose. 1650 + <li> It is not necessary to use <tt>rcu_assign_pointer()</tt> 1651 + when creating linked structures that are to be published via 1652 + a single external pointer. 1653 + The <tt>RCU_INIT_POINTER()</tt> macro is provided for 1654 + this task and also for assigning <tt>NULL</tt> pointers 1655 + at runtime. 1656 + </ol> 1657 + 1658 + <p> 1659 + This is not a hard-and-fast list: RCU's diagnostic capabilities will 1660 + continue to be guided by the number and type of usage bugs found 1661 + in real-world RCU usage. 1662 + 1663 + <h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2> 1664 + 1665 + <p> 1666 + The Linux kernel provides an interesting environment for all kinds of 1667 + software, including RCU. 1668 + Some of the relevant points of interest are as follows: 1669 + 1670 + <ol> 1671 + <li> <a href="#Configuration">Configuration</a>. 1672 + <li> <a href="#Firmware Interface">Firmware Interface</a>. 1673 + <li> <a href="#Early Boot">Early Boot</a>. 1674 + <li> <a href="#Interrupts and NMIs"> 1675 + Interrupts and non-maskable interrupts (NMIs)</a>. 1676 + <li> <a href="#Loadable Modules">Loadable Modules</a>. 1677 + <li> <a href="#Hotplug CPU">Hotplug CPU</a>.
1678 + <li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. 1679 + <li> <a href="#Tracing and RCU">Tracing and RCU</a>. 1680 + <li> <a href="#Energy Efficiency">Energy Efficiency</a>. 1681 + <li> <a href="#Performance, Scalability, Response Time, and Reliability"> 1682 + Performance, Scalability, Response Time, and Reliability</a>. 1683 + </ol> 1684 + 1685 + <p> 1686 + This list is probably incomplete, but it does give a feel for the 1687 + most notable Linux-kernel complications. 1688 + Each of the following sections covers one of the above topics. 1689 + 1690 + <h3><a name="Configuration">Configuration</a></h3> 1691 + 1692 + <p> 1693 + RCU's goal is automatic configuration, so that almost nobody 1694 + needs to worry about RCU's <tt>Kconfig</tt> options. 1695 + And for almost all users, RCU does in fact work well 1696 + &ldquo;out of the box.&rdquo; 1697 + 1698 + <p> 1699 + However, there are specialized use cases that are handled by 1700 + kernel boot parameters and <tt>Kconfig</tt> options. 1701 + Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users 1702 + about new <tt>Kconfig</tt> options, which requires almost all of them 1703 + be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option. 1704 + 1705 + <p> 1706 + This all should be quite obvious, but the fact remains that 1707 + Linus Torvalds recently had to 1708 + <a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a> 1709 + me of this requirement. 1710 + 1711 + <h3><a name="Firmware Interface">Firmware Interface</a></h3> 1712 + 1713 + <p> 1714 + In many cases, the kernel obtains information about the system from the 1715 + firmware, and sometimes things are lost in translation. 1716 + Or the translation is accurate, but the original message is bogus. 1717 + 1718 + <p> 1719 + For example, some systems' firmware overreports the number of CPUs, 1720 + sometimes by a large factor.
1721 + If RCU naively believed the firmware, as it used to do, 1722 + it would create too many per-CPU kthreads. 1723 + Although the resulting system will still run correctly, the extra 1724 + kthreads needlessly consume memory and can cause confusion 1725 + when they show up in <tt>ps</tt> listings. 1726 + 1727 + <p> 1728 + RCU must therefore wait for a given CPU to actually come online before 1729 + it can allow itself to believe that the CPU actually exists. 1730 + The resulting &ldquo;ghost CPUs&rdquo; (which are never going to 1731 + come online) cause a number of 1732 + <a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>. 1733 + 1734 + <h3><a name="Early Boot">Early Boot</a></h3> 1735 + 1736 + <p> 1737 + The Linux kernel's boot sequence is an interesting process, 1738 + and RCU is used early, even before <tt>rcu_init()</tt> 1739 + is invoked. 1740 + In fact, a number of RCU's primitives can be used as soon as the 1741 + initial task's <tt>task_struct</tt> is available and the 1742 + boot CPU's per-CPU variables are set up. 1743 + The read-side primitives (<tt>rcu_read_lock()</tt>, 1744 + <tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>, 1745 + and <tt>rcu_access_pointer()</tt>) will operate normally very early on, 1746 + as will <tt>rcu_assign_pointer()</tt>. 1747 + 1748 + <p> 1749 + Although <tt>call_rcu()</tt> may be invoked at any 1750 + time during boot, callbacks are not guaranteed to be invoked until after 1751 + the scheduler is fully up and running. 1752 + This delay in callback invocation is due to the fact that RCU does not 1753 + invoke callbacks until it is fully initialized, and this full initialization 1754 + cannot occur until after the scheduler has initialized itself to the 1755 + point where RCU can spawn and run its kthreads. 
1756 + In theory, it would be possible to invoke callbacks earlier, 1757 + however, this is not a panacea because there would be severe restrictions 1758 + on what operations those callbacks could invoke. 1759 + 1760 + <p> 1761 + Perhaps surprisingly, <tt>synchronize_rcu()</tt>, 1762 + <a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> 1763 + (<a href="#Bottom-Half Flavor">discussed below</a>), 1764 + and 1765 + <a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> 1766 + will all operate normally 1767 + during very early boot, the reason being that there is only one CPU 1768 + and preemption is disabled. 1769 + This means that the call <tt>synchronize_rcu()</tt> (or friends) 1770 + itself is a quiescent 1771 + state and thus a grace period, so the early-boot implementation can 1772 + be a no-op. 1773 + 1774 + <p> 1775 + Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> 1776 + continue to operate normally through the remainder of boot, courtesy 1777 + of the fact that preemption is disabled across their RCU read-side 1778 + critical sections and also courtesy of the fact that there is still 1779 + only one CPU. 1780 + However, once the scheduler starts initializing, preemption is enabled. 1781 + There is still only a single CPU, but the fact that preemption is enabled 1782 + means that the no-op implementation of <tt>synchronize_rcu()</tt> no 1783 + longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. 1784 + Therefore, as soon as the scheduler starts initializing, the early-boot 1785 + fastpath is disabled. 1786 + This means that <tt>synchronize_rcu()</tt> switches to its runtime 1787 + mode of operation where it posts callbacks, which in turn means that 1788 + any call to <tt>synchronize_rcu()</tt> will block until the corresponding 1789 + callback is invoked. 
1790 + Unfortunately, the callback cannot be invoked until RCU's runtime 1791 + grace-period machinery is up and running, which cannot happen until 1792 + the scheduler has initialized itself sufficiently to allow RCU's 1793 + kthreads to be spawned. 1794 + Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler 1795 + initialization can result in deadlock. 1796 + 1797 + <p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a> 1798 + So what happens with <tt>synchronize_rcu()</tt> during 1799 + scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> 1800 + kernels? 1801 + <br><a href="#qq14answer">Answer</a> 1802 + 1803 + <p> 1804 + I learned of these boot-time requirements as a result of a series of 1805 + system hangs. 1806 + 1807 + <h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3> 1808 + 1809 + <p> 1810 + The Linux kernel has interrupts, and RCU read-side critical sections are 1811 + legal within interrupt handlers and within interrupt-disabled regions 1812 + of code, as are invocations of <tt>call_rcu()</tt>. 1813 + 1814 + <p> 1815 + Some Linux-kernel architectures can enter an interrupt handler from 1816 + non-idle process context, and then just never leave it, instead stealthily 1817 + transitioning back to process context. 1818 + This trick is sometimes used to invoke system calls from inside the kernel. 1819 + These &ldquo;half-interrupts&rdquo; mean that RCU has to be very careful 1820 + about how it counts interrupt nesting levels. 1821 + I learned of this requirement the hard way during a rewrite 1822 + of RCU's dyntick-idle code. 1823 + 1824 + <p> 1825 + The Linux kernel has non-maskable interrupts (NMIs), and 1826 + RCU read-side critical sections are legal within NMI handlers. 1827 + Thankfully, RCU update-side primitives, including 1828 + <tt>call_rcu()</tt>, are prohibited within NMI handlers. 
1829 + 1830 + <p> 1831 + The name notwithstanding, some Linux-kernel architectures 1832 + can have nested NMIs, which RCU must handle correctly. 1833 + Andy Lutomirski 1834 + <a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a> 1835 + with this requirement; 1836 + he also kindly surprised me with 1837 + <a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a> 1838 + that meets this requirement. 1839 + 1840 + <h3><a name="Loadable Modules">Loadable Modules</a></h3> 1841 + 1842 + <p> 1843 + The Linux kernel has loadable modules, and these modules can 1844 + also be unloaded. 1845 + After a given module has been unloaded, any attempt to call 1846 + one of its functions results in a segmentation fault. 1847 + The module-unload functions must therefore cancel any 1848 + delayed calls to loadable-module functions, for example, 1849 + any outstanding <tt>mod_timer()</tt> must be dealt with 1850 + via <tt>del_timer_sync()</tt> or similar. 1851 + 1852 + <p> 1853 + Unfortunately, there is no way to cancel an RCU callback; 1854 + once you invoke <tt>call_rcu()</tt>, the callback function is 1855 + going to eventually be invoked, unless the system goes down first. 1856 + Because it is normally considered socially irresponsible to crash the system 1857 + in response to a module unload request, we need some other way 1858 + to deal with in-flight RCU callbacks. 1859 + 1860 + <p> 1861 + RCU therefore provides 1862 + <tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>, 1863 + which waits until all in-flight RCU callbacks have been invoked. 1864 + If a module uses <tt>call_rcu()</tt>, its exit function should therefore 1865 + prevent any future invocation of <tt>call_rcu()</tt>, then invoke 1866 + <tt>rcu_barrier()</tt>. 
1867 + In theory, the underlying module-unload code could invoke 1868 + <tt>rcu_barrier()</tt> unconditionally, but in practice this would 1869 + incur unacceptable latencies. 1870 + 1871 + <p> 1872 + Nikita Danilov noted this requirement for an analogous filesystem-unmount 1873 + situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. 1874 + The need for <tt>rcu_barrier()</tt> for module unloading became 1875 + apparent later. 1876 + 1877 + <h3><a name="Hotplug CPU">Hotplug CPU</a></h3> 1878 + 1879 + <p> 1880 + The Linux kernel supports CPU hotplug, which means that CPUs 1881 + can come and go. 1882 + It is of course illegal to use any RCU API member from an offline CPU. 1883 + This requirement was present from day one in DYNIX/ptx, but 1884 + on the other hand, the Linux kernel's CPU-hotplug implementation 1885 + is &ldquo;interesting.&rdquo; 1886 + 1887 + <p> 1888 + The Linux-kernel CPU-hotplug implementation has notifiers that 1889 + are used to allow the various kernel subsystems (including RCU) 1890 + to respond appropriately to a given CPU-hotplug operation. 1891 + Most RCU operations may be invoked from CPU-hotplug notifiers, 1892 + including even normal synchronous grace-period operations 1893 + such as <tt>synchronize_rcu()</tt>. 1894 + However, expedited grace-period operations such as 1895 + <tt>synchronize_rcu_expedited()</tt> are not supported, 1896 + due to the fact that current implementations block CPU-hotplug 1897 + operations, which could result in deadlock. 1898 + 1899 + <p> 1900 + In addition, all-callback-wait operations such as 1901 + <tt>rcu_barrier()</tt> are also not supported, due to the 1902 + fact that there are phases of CPU-hotplug operations where 1903 + the outgoing CPU's callbacks will not be invoked until after 1904 + the CPU-hotplug operation ends, which could also result in deadlock. 
1905 + 1906 + <h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> 1907 + 1908 + <p> 1909 + RCU depends on the scheduler, and the scheduler uses RCU to 1910 + protect some of its data structures. 1911 + This means the scheduler is forbidden from acquiring 1912 + the runqueue locks and the priority-inheritance locks 1913 + in the middle of an outermost RCU read-side critical section unless 1914 + it also releases them before exiting that same 1915 + RCU read-side critical section. 1916 + This same prohibition also applies to any lock that is acquired 1917 + while holding any lock to which this prohibition applies. 1918 + Violating this rule results in deadlock. 1919 + 1920 + <p> 1921 + For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt> 1922 + implementation must be written carefully to avoid similar deadlocks. 1923 + In particular, <tt>rcu_read_unlock()</tt> must tolerate an 1924 + interrupt where the interrupt handler invokes both 1925 + <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. 1926 + This possibility requires <tt>rcu_read_unlock()</tt> to use 1927 + negative nesting levels to avoid destructive recursion via 1928 + interrupt handler's use of RCU. 1929 + 1930 + <p> 1931 + This pair of mutual scheduler-RCU requirements came as a 1932 + <a href="https://lwn.net/Articles/453002/">complete surprise</a>. 1933 + 1934 + <p> 1935 + As noted above, RCU makes use of kthreads, and it is necessary to 1936 + avoid excessive CPU-time accumulation by these kthreads. 1937 + This requirement was no surprise, but RCU's violation of it 1938 + when running context-switch-heavy workloads when built with 1939 + <tt>CONFIG_NO_HZ_FULL=y</tt> 1940 + <a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>. 
1941 + RCU has made good progress towards meeting this requirement, even 1942 + for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads, 1943 + but there is room for further improvement. 1944 + 1945 + <h3><a name="Tracing and RCU">Tracing and RCU</a></h3> 1946 + 1947 + <p> 1948 + It is possible to use tracing on RCU code, but tracing itself 1949 + uses RCU. 1950 + For this reason, <tt>rcu_dereference_raw_notrace()</tt> 1951 + is provided for use by tracing, which avoids the destructive 1952 + recursion that could otherwise ensue. 1953 + This API is also used by virtualization in some architectures, 1954 + where RCU readers execute in environments in which tracing 1955 + cannot be used. 1956 + The tracing folks both located the requirement and provided the 1957 + needed fix, so this surprise requirement was relatively painless. 1958 + 1959 + <h3><a name="Energy Efficiency">Energy Efficiency</a></h3> 1960 + 1961 + <p> 1962 + Interrupting idle CPUs is considered socially unacceptable, 1963 + especially by people with battery-powered embedded systems. 1964 + RCU therefore conserves energy by detecting which CPUs are 1965 + idle, including tracking CPUs that have been interrupted from idle. 1966 + This is a large part of the energy-efficiency requirement, 1967 + so I learned of this via an irate phone call. 1968 + 1969 + <p> 1970 + Because RCU avoids interrupting idle CPUs, it is illegal to 1971 + execute an RCU read-side critical section on an idle CPU. 1972 + (Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat 1973 + if you try it.) 1974 + The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt> 1975 + event tracing are provided to work around this restriction. 1976 + In addition, <tt>rcu_is_watching()</tt> may be used to 1977 + test whether or not it is currently legal to run RCU read-side 1978 + critical sections on this CPU. 
1979 + I learned of the need for diagnostics on the one hand 1980 + and <tt>RCU_NONIDLE()</tt> on the other while inspecting 1981 + idle-loop code. 1982 + Steven Rostedt supplied <tt>_rcuidle</tt> event tracing, 1983 + which is used quite heavily in the idle loop. 1984 + 1985 + <p> 1986 + It is similarly socially unacceptable to interrupt an 1987 + <tt>nohz_full</tt> CPU running in userspace. 1988 + RCU must therefore track <tt>nohz_full</tt> userspace 1989 + execution. 1990 + And in 1991 + <a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> 1992 + kernels, RCU must separately track idle CPUs on the one hand and 1993 + CPUs that are either idle or executing in userspace on the other. 1994 + In both cases, RCU must be able to sample state at two points in 1995 + time, and be able to determine whether or not some other CPU spent 1996 + any time idle and/or executing in userspace. 1997 + 1998 + <p> 1999 + These energy-efficiency requirements have proven quite difficult to 2000 + understand and to meet, for example, there have been more than five 2001 + clean-sheet rewrites of RCU's energy-efficiency code, the last of 2002 + which was finally able to demonstrate 2003 + <a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>. 2004 + As noted earlier, 2005 + I learned of many of these requirements via angry phone calls: 2006 + Flaming me on the Linux-kernel mailing list was apparently not 2007 + sufficient to fully vent their ire at RCU's energy-efficiency bugs! 
2008 + 2009 + <h3><a name="Performance, Scalability, Response Time, and Reliability"> 2010 + Performance, Scalability, Response Time, and Reliability</a></h3> 2011 + 2012 + <p> 2013 + Expanding on the 2014 + <a href="#Performance and Scalability">earlier discussion</a>, 2015 + RCU is used heavily by hot code paths in performance-critical 2016 + portions of the Linux kernel's networking, security, virtualization, 2017 + and scheduling code paths. 2018 + RCU must therefore use efficient implementations, especially in its 2019 + read-side primitives. 2020 + To that end, it would be good if preemptible RCU's implementation 2021 + of <tt>rcu_read_lock()</tt> could be inlined, however, doing 2022 + this requires resolving <tt>#include</tt> issues with the 2023 + <tt>task_struct</tt> structure. 2024 + 2025 + <p> 2026 + The Linux kernel supports hardware configurations with up to 2027 + 4096 CPUs, which means that RCU must be extremely scalable. 2028 + Algorithms that involve frequent acquisitions of global locks or 2029 + frequent atomic operations on global variables simply cannot be 2030 + tolerated within the RCU implementation. 2031 + RCU therefore makes heavy use of a combining tree based on the 2032 + <tt>rcu_node</tt> structure. 2033 + RCU is required to tolerate all CPUs continuously invoking any 2034 + combination of RCU's runtime primitives with minimal per-operation 2035 + overhead. 2036 + In fact, in many cases, increasing load must <i>decrease</i> the 2037 + per-operation overhead, witness the batching optimizations for 2038 + <tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>, 2039 + <tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>. 2040 + As a general rule, RCU must cheerfully accept whatever the 2041 + rest of the Linux kernel decides to throw at it. 
2042 + 2043 + <p> 2044 + The Linux kernel is used for real-time workloads, especially 2045 + in conjunction with the 2046 + <a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>. 2047 + The real-time-latency response requirements are such that the 2048 + traditional approach of disabling preemption across RCU 2049 + read-side critical sections is inappropriate. 2050 + Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore 2051 + use an RCU implementation that allows RCU read-side critical 2052 + sections to be preempted. 2053 + This requirement made its presence known after users made it 2054 + clear that an earlier 2055 + <a href="https://lwn.net/Articles/107930/">real-time patch</a> 2056 + did not meet their needs, in conjunction with some 2057 + <a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a> 2058 + encountered by a very early version of the -rt patchset. 2059 + 2060 + <p> 2061 + In addition, RCU must make do with a sub-100-microsecond real-time latency 2062 + budget. 2063 + In fact, on smaller systems with the -rt patchset, the Linux kernel 2064 + provides sub-20-microsecond real-time latencies for the whole kernel, 2065 + including RCU. 2066 + RCU's scalability and latency must therefore be sufficient for 2067 + these sorts of configurations. 2068 + To my surprise, the sub-100-microsecond real-time latency budget 2069 + <a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf"> 2070 + applies to even the largest systems [PDF]</a>, 2071 + up to and including systems with 4096 CPUs. 2072 + This real-time requirement motivated the grace-period kthread, which 2073 + also simplified handling of a number of race conditions. 2074 + 2075 + <p> 2076 + Finally, RCU's status as a synchronization primitive means that 2077 + any RCU failure can result in arbitrary memory corruption that can be 2078 + extremely difficult to debug. 
2079 + This means that RCU must be extremely reliable, which in 2080 + practice also means that RCU must have an aggressive stress-test 2081 + suite. 2082 + This stress-test suite is called <tt>rcutorture</tt>. 2083 + 2084 + <p> 2085 + Although the need for <tt>rcutorture</tt> was no surprise, 2086 + the current immense popularity of the Linux kernel is posing 2087 + interesting&mdash;and perhaps unprecedented&mdash;validation 2088 + challenges. 2089 + To see this, keep in mind that there are well over one billion 2090 + instances of the Linux kernel running today, given Android 2091 + smartphones, Linux-powered televisions, and servers. 2092 + This number can be expected to increase sharply with the advent of 2093 + the celebrated Internet of Things. 2094 + 2095 + <p> 2096 + Suppose that RCU contains a race condition that manifests on average 2097 + once per million years of runtime. 2098 + This bug will be occurring about three times per <i>day</i> across 2099 + the installed base. 2100 + RCU could simply hide behind hardware error rates, given that no one 2101 + should really expect their smartphone to last for a million years. 2102 + However, anyone taking too much comfort from this thought should 2103 + consider the fact that in most jurisdictions, a successful multi-year 2104 + test of a given mechanism, which might include a Linux kernel, 2105 + suffices for a number of types of safety-critical certifications. 2106 + In fact, rumor has it that the Linux kernel is already being used 2107 + in production for safety-critical applications. 2108 + I don't know about you, but I would feel quite bad if a bug in RCU 2109 + killed someone. 2110 + Which might explain my recent focus on validation and verification. 2111 + 2112 + <h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2> 2113 + 2114 + <p> 2115 + One of the more surprising things about RCU is that there are now 2116 + no fewer than five <i>flavors</i>, or API families. 
2117 + In addition, the primary flavor that has been the sole focus up to 2118 + this point has two different implementations, non-preemptible and 2119 + preemptible. 2120 + The other four flavors are listed below, with requirements for each 2121 + described in a separate section. 2122 + 2123 + <ol> 2124 + <li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a> 2125 + <li> <a href="#Sched Flavor">Sched Flavor</a> 2126 + <li> <a href="#Sleepable RCU">Sleepable RCU</a> 2127 + <li> <a href="#Tasks RCU">Tasks RCU</a> 2128 + </ol> 2129 + 2130 + <h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> 2131 + 2132 + <p> 2133 + The softirq-disable (AKA &ldquo;bottom-half&rdquo;, 2134 + hence the &ldquo;_bh&rdquo; abbreviations) 2135 + flavor of RCU, or <i>RCU-bh</i>, was developed by 2136 + Dipankar Sarma to provide a flavor of RCU that could withstand the 2137 + network-based denial-of-service attacks researched by Robert 2138 + Olsson. 2139 + These attacks placed so much networking load on the system 2140 + that some of the CPUs never exited softirq execution, 2141 + which in turn prevented those CPUs from ever executing a context switch, 2142 + which, in the RCU implementation of that time, prevented grace periods 2143 + from ever ending. 2144 + The result was an out-of-memory condition and a system hang. 2145 + 2146 + <p> 2147 + The solution was the creation of RCU-bh, which does 2148 + <tt>local_bh_disable()</tt> 2149 + across its read-side critical sections, and which uses the transition 2150 + from one type of softirq processing to another as a quiescent state 2151 + in addition to context switch, idle, user mode, and offline. 2152 + This means that RCU-bh grace periods can complete even when some of 2153 + the CPUs execute in softirq indefinitely, thus allowing algorithms 2154 + based on RCU-bh to withstand network-based denial-of-service attacks. 
2155 + 2156 + <p> 2157 + Because 2158 + <tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt> 2159 + disable and re-enable softirq handlers, any attempt to start a softirq 2160 + handler during the 2161 + RCU-bh read-side critical section will be deferred. 2162 + In this case, <tt>rcu_read_unlock_bh()</tt> 2163 + will invoke softirq processing, which can take considerable time. 2164 + One can of course argue that this softirq overhead should be associated 2165 + with the code following the RCU-bh read-side critical section rather 2166 + than <tt>rcu_read_unlock_bh()</tt>, but the fact 2167 + is that most profiling tools cannot be expected to make this sort 2168 + of fine distinction. 2169 + For example, suppose that a three-millisecond-long RCU-bh read-side 2170 + critical section executes during a time of heavy networking load. 2171 + There will very likely be an attempt to invoke at least one softirq 2172 + handler during that three milliseconds, but any such invocation will 2173 + be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>. 2174 + This can of course make it appear at first glance as if 2175 + <tt>rcu_read_unlock_bh()</tt> was executing very slowly. 2176 + 2177 + <p> 2178 + The 2179 + <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a> 2180 + includes 2181 + <tt>rcu_read_lock_bh()</tt>, 2182 + <tt>rcu_read_unlock_bh()</tt>, 2183 + <tt>rcu_dereference_bh()</tt>, 2184 + <tt>rcu_dereference_bh_check()</tt>, 2185 + <tt>synchronize_rcu_bh()</tt>, 2186 + <tt>synchronize_rcu_bh_expedited()</tt>, 2187 + <tt>call_rcu_bh()</tt>, 2188 + <tt>rcu_barrier_bh()</tt>, and 2189 + <tt>rcu_read_lock_bh_held()</tt>. 2190 + 2191 + <h3><a name="Sched Flavor">Sched Flavor</a></h3> 2192 + 2193 + <p> 2194 + Before preemptible RCU, waiting for an RCU grace period had the 2195 + side effect of also waiting for all pre-existing interrupt 2196 + and NMI handlers. 
2197 + However, there are legitimate preemptible-RCU implementations that 2198 + do not have this property, given that any point in the code outside 2199 + of an RCU read-side critical section can be a quiescent state. 2200 + Therefore, <i>RCU-sched</i> was created, which follows &ldquo;classic&rdquo; 2201 + RCU in that an RCU-sched grace period waits for pre-existing 2202 + interrupt and NMI handlers. 2203 + In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched 2204 + APIs have identical implementations, while kernels built with 2205 + <tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each. 2206 + 2207 + <p> 2208 + Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels, 2209 + <tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt> 2210 + disable and re-enable preemption, respectively. 2211 + This means that if there was a preemption attempt during the 2212 + RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt> 2213 + will enter the scheduler, with all the latency and overhead entailed. 2214 + Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look 2215 + as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly. 2216 + However, the highest-priority task won't be preempted, so that task 2217 + will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations. 2218 + 2219 + <p> 2220 + The 2221 + <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a> 2222 + includes 2223 + <tt>rcu_read_lock_sched()</tt>, 2224 + <tt>rcu_read_unlock_sched()</tt>, 2225 + <tt>rcu_read_lock_sched_notrace()</tt>, 2226 + <tt>rcu_read_unlock_sched_notrace()</tt>, 2227 + <tt>rcu_dereference_sched()</tt>, 2228 + <tt>rcu_dereference_sched_check()</tt>, 2229 + <tt>synchronize_sched()</tt>, 2230 + <tt>synchronize_rcu_sched_expedited()</tt>, 2231 + <tt>call_rcu_sched()</tt>, 2232 + <tt>rcu_barrier_sched()</tt>, and 2233 + <tt>rcu_read_lock_sched_held()</tt>. 
2234 + However, anything that disables preemption also marks an RCU-sched 2235 + read-side critical section, including 2236 + <tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>, 2237 + <tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>, 2238 + and so on. 2239 + 2240 + <h3><a name="Sleepable RCU">Sleepable RCU</a></h3> 2241 + 2242 + <p> 2243 + For well over a decade, someone saying &ldquo;I need to block within 2244 + an RCU read-side critical section&rdquo; was a reliable indication 2245 + that this someone did not understand RCU. 2246 + After all, if you are always blocking in an RCU read-side critical 2247 + section, you can probably afford to use a higher-overhead synchronization 2248 + mechanism. 2249 + However, that changed with the advent of the Linux kernel's notifiers, 2250 + whose RCU read-side critical 2251 + sections almost never sleep, but sometimes need to. 2252 + This resulted in the introduction of 2253 + <a href="https://lwn.net/Articles/202847/">sleepable RCU</a>, 2254 + or <i>SRCU</i>. 2255 + 2256 + <p> 2257 + SRCU allows different domains to be defined, with each such domain 2258 + defined by an instance of an <tt>srcu_struct</tt> structure. 2259 + A pointer to this structure must be passed in to each SRCU function, 2260 + for example, <tt>synchronize_srcu(&amp;ss)</tt>, where 2261 + <tt>ss</tt> is the <tt>srcu_struct</tt> structure. 2262 + The key benefit of these domains is that a slow SRCU reader in one 2263 + domain does not delay an SRCU grace period in some other domain. 
2264 + That said, one consequence of these domains is that read-side code 2265 + must pass a &ldquo;cookie&rdquo; from <tt>srcu_read_lock()</tt> 2266 + to <tt>srcu_read_unlock()</tt>, for example, as follows: 2267 + 2268 + <blockquote> 2269 + <pre> 2270 +  1 int idx; 2271 +  2 2272 +  3 idx = srcu_read_lock(&amp;ss); 2273 +  4 do_something(); 2274 +  5 srcu_read_unlock(&amp;ss, idx); 2275 + </pre> 2276 + </blockquote> 2277 + 2278 + <p> 2279 + As noted above, it is legal to block within SRCU read-side critical sections, 2280 + however, with great power comes great responsibility. 2281 + If you block forever in one of a given domain's SRCU read-side critical 2282 + sections, then that domain's grace periods will also be blocked forever. 2283 + Of course, one good way to block forever is to deadlock, which can 2284 + happen if any operation in a given domain's SRCU read-side critical 2285 + section can block waiting, either directly or indirectly, for that domain's 2286 + grace period to elapse. 2287 + For example, this results in a self-deadlock: 2288 + 2289 + <blockquote> 2290 + <pre> 2291 +  1 int idx; 2292 +  2 2293 +  3 idx = srcu_read_lock(&amp;ss); 2294 +  4 do_something(); 2295 +  5 synchronize_srcu(&amp;ss); 2296 +  6 srcu_read_unlock(&amp;ss, idx); 2297 + </pre> 2298 + </blockquote> 2299 + 2300 + <p> 2301 + However, if line&nbsp;5 acquired a mutex that was held across 2302 + a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>, 2303 + deadlock would still be possible. 2304 + Furthermore, if line&nbsp;5 acquired a mutex that was held across 2305 + a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>, 2306 + and if an <tt>ss1</tt>-domain SRCU read-side critical section 2307 + acquired another mutex that was held across an <tt>ss</tt>-domain 2308 + <tt>synchronize_srcu()</tt>, 2309 + deadlock would again be possible. 2310 + Such a deadlock cycle could extend across an arbitrarily large number 2311 + of different SRCU domains. 
2312 + Again, with great power comes great responsibility. 2313 + 2314 + <p> 2315 + Unlike the other RCU flavors, SRCU read-side critical sections can 2316 + run on idle and even offline CPUs. 2317 + This ability requires that <tt>srcu_read_lock()</tt> and 2318 + <tt>srcu_read_unlock()</tt> contain memory barriers, which means 2319 + that SRCU readers will run a bit slower than would RCU readers. 2320 + It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt> 2321 + API, which, in combination with <tt>srcu_read_unlock()</tt>, 2322 + guarantees a full memory barrier. 2323 + 2324 + <p> 2325 + The 2326 + <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> 2327 + includes 2328 + <tt>srcu_read_lock()</tt>, 2329 + <tt>srcu_read_unlock()</tt>, 2330 + <tt>srcu_dereference()</tt>, 2331 + <tt>srcu_dereference_check()</tt>, 2332 + <tt>synchronize_srcu()</tt>, 2333 + <tt>synchronize_srcu_expedited()</tt>, 2334 + <tt>call_srcu()</tt>, 2335 + <tt>srcu_barrier()</tt>, and 2336 + <tt>srcu_read_lock_held()</tt>. 2337 + It also includes 2338 + <tt>DEFINE_SRCU()</tt>, 2339 + <tt>DEFINE_STATIC_SRCU()</tt>, and 2340 + <tt>init_srcu_struct()</tt> 2341 + APIs for defining and initializing <tt>srcu_struct</tt> structures. 2342 + 2343 + <h3><a name="Tasks RCU">Tasks RCU</a></h3> 2344 + 2345 + <p> 2346 + Some forms of tracing use &ldquo;trampolines&rdquo; to handle the 2347 + binary rewriting required to install different types of probes. 2348 + It would be good to be able to free old trampolines, which sounds 2349 + like a job for some form of RCU. 2350 + However, because it is necessary to be able to install a trace 2351 + anywhere in the code, it is not possible to use read-side markers 2352 + such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. 2353 + In addition, it does not work to have these markers in the trampoline 2354 + itself, because there would need to be instructions following 2355 + <tt>rcu_read_unlock()</tt>. 
2356 + Although <tt>synchronize_rcu()</tt> would guarantee that execution 2357 + reached the <tt>rcu_read_unlock()</tt>, it would not be able to 2358 + guarantee that execution had completely left the trampoline. 2359 + 2360 + <p> 2361 + The solution, in the form of 2362 + <a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>, 2363 + is to have implicit 2364 + read-side critical sections that are delimited by voluntary context 2365 + switches, that is, calls to <tt>schedule()</tt>, 2366 + <tt>cond_resched_rcu_qs()</tt>, and 2367 + <tt>synchronize_rcu_tasks()</tt>. 2368 + In addition, transitions to and from userspace execution also delimit 2369 + tasks-RCU read-side critical sections. 2370 + 2371 + <p> 2372 + The tasks-RCU API is quite compact, consisting only of 2373 + <tt>call_rcu_tasks()</tt>, 2374 + <tt>synchronize_rcu_tasks()</tt>, and 2375 + <tt>rcu_barrier_tasks()</tt>. 2376 + 2377 + <h2><a name="Possible Future Changes">Possible Future Changes</a></h2> 2378 + 2379 + <p> 2380 + One of the tricks that RCU uses to attain update-side scalability is 2381 + to increase grace-period latency with increasing numbers of CPUs. 2382 + If this becomes a serious problem, it will be necessary to rework the 2383 + grace-period state machine so as to avoid the need for the additional 2384 + latency. 2385 + 2386 + <p> 2387 + Expedited grace periods scan the CPUs, so their latency and overhead 2388 + increases with increasing numbers of CPUs. 2389 + If this becomes a serious problem on large systems, it will be necessary 2390 + to do some redesign to avoid this scalability problem. 2391 + 2392 + <p> 2393 + RCU disables CPU hotplug in a few places, perhaps most notably in the 2394 + expedited grace-period and <tt>rcu_barrier()</tt> operations. 2395 + If there is a strong reason to use expedited grace periods in CPU-hotplug 2396 + notifiers, it will be necessary to avoid disabling CPU hotplug. 
2397 + This would introduce some complexity, so there had better be a <i>very</i> 2398 + good reason. 2399 + 2400 + <p> 2401 + The tradeoff between grace-period latency on the one hand and interruptions 2402 + of other CPUs on the other hand may need to be re-examined. 2403 + The desire is of course for zero grace-period latency as well as zero 2404 + interprocessor interrupts undertaken during an expedited grace period 2405 + operation. 2406 + While this ideal is unlikely to be achievable, it is quite possible that 2407 + further improvements can be made. 2408 + 2409 + <p> 2410 + The multiprocessor implementations of RCU use a combining tree that 2411 + groups CPUs so as to reduce lock contention and increase cache locality. 2412 + However, this combining tree does not spread its memory across NUMA 2413 + nodes nor does it align the CPU groups with hardware features such 2414 + as sockets or cores. 2415 + Such spreading and alignment is currently believed to be unnecessary 2416 + because the hotpath read-side primitives do not access the combining 2417 + tree, nor does <tt>call_rcu()</tt> in the common case. 2418 + If you believe that your architecture needs such spreading and alignment, 2419 + then your architecture should also benefit from the 2420 + <tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set 2421 + to the number of CPUs in a socket, NUMA node, or whatever. 2422 + If the number of CPUs is too large, use a fraction of the number of 2423 + CPUs. 2424 + If the number of CPUs is a large prime number, well, that certainly 2425 + is an &ldquo;interesting&rdquo; architectural choice! 2426 + More flexible arrangements might be considered, but only if 2427 + <tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only 2428 + if the inadequacy has been demonstrated by a carefully run and 2429 + realistic system-level workload. 
2430 + 2431 + <p> 2432 + Please note that arrangements that require RCU to remap CPU numbers will 2433 + require extremely good demonstration of need and full exploration of 2434 + alternatives. 2435 + 2436 + <p> 2437 + There is an embarrassingly large number of flavors of RCU, and this 2438 + number has been increasing over time. 2439 + Perhaps it will be possible to combine some at some future date. 2440 + 2441 + <p> 2442 + RCU's various kthreads are reasonably recent additions. 2443 + It is quite likely that adjustments will be required to more gracefully 2444 + handle extreme loads. 2445 + It might also be necessary to be able to relate CPU utilization by 2446 + RCU's kthreads and softirq handlers to the code that instigated this 2447 + CPU utilization. 2448 + For example, RCU callback overhead might be charged back to the 2449 + originating <tt>call_rcu()</tt> instance, though probably not 2450 + in production kernels. 2451 + 2452 + <h2><a name="Summary">Summary</a></h2> 2453 + 2454 + <p> 2455 + This document has presented more than two decades' worth of RCU 2456 + requirements. 2457 + Given that the requirements keep changing, this will not be the last 2458 + word on this subject, but at least it serves to get an important 2459 + subset of the requirements set forth. 2460 + 2461 + <h2><a name="Acknowledgments">Acknowledgments</a></h2> 2462 + 2463 + I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, 2464 + Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and 2465 + Andy Lutomirski for their help in rendering 2466 + this article human readable, and to Michelle Rankin for her support 2467 + of this effort. 2468 + Other contributions are acknowledged in the Linux kernel's git archive. 2469 + The cartoon is copyright (c) 2013 by Melissa Broussard, 2470 + and is provided 2471 + under the terms of the Creative Commons Attribution-Share Alike 3.0 2472 + United States license.
2473 + 2474 + <h3><a name="Answers to Quick Quizzes"> 2475 + Answers to Quick Quizzes</a></h3> 2476 + 2477 + <a name="qq1answer"></a> 2478 + <p><b>Quick Quiz 1</b>: 2479 + Wait a minute! 2480 + You said that updaters can make useful forward progress concurrently 2481 + with readers, but pre-existing readers will block 2482 + <tt>synchronize_rcu()</tt>!!! 2483 + Just who are you trying to fool??? 2484 + 2485 + 2486 + </p><p><b>Answer</b>: 2487 + First, if updaters do not wish to be blocked by readers, they can use 2488 + <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will 2489 + be discussed later. 2490 + Second, even when using <tt>synchronize_rcu()</tt>, the other 2491 + update-side code does run concurrently with readers, whether pre-existing 2492 + or not. 2493 + 2494 + 2495 + </p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a> 2496 + 2497 + <a name="qq2answer"></a> 2498 + <p><b>Quick Quiz 2</b>: 2499 + Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed? 2500 + 2501 + 2502 + </p><p><b>Answer</b>: 2503 + Without that extra grace period, memory reordering could result in 2504 + <tt>do_something_dlm()</tt> executing <tt>do_something()</tt> 2505 + concurrently with the last bits of <tt>recovery()</tt>. 2506 + 2507 + 2508 + </p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a> 2509 + 2510 + <a name="qq3answer"></a> 2511 + <p><b>Quick Quiz 3</b>: 2512 + But <tt>rcu_assign_pointer()</tt> does nothing to prevent the 2513 + two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> 2514 + from being reordered. 2515 + Can't that also cause problems? 2516 + 2517 + 2518 + </p><p><b>Answer</b>: 2519 + No, it cannot. 2520 + The readers cannot see either of these two fields until 2521 + the assignment to <tt>gp</tt>, by which time both fields are 2522 + fully initialized. 2523 + So reordering the assignments 2524 + to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> cannot possibly 2525 + cause any problems. 
2526 + 2527 + 2528 + </p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a> 2529 + 2530 + <a name="qq4answer"></a> 2531 + <p><b>Quick Quiz 4</b>: 2532 + Without the <tt>rcu_dereference()</tt> or the 2533 + <tt>rcu_access_pointer()</tt>, what destructive optimizations 2534 + might the compiler make use of? 2535 + 2536 + 2537 + </p><p><b>Answer</b>: 2538 + Let's start with what happens to <tt>do_something_gp()</tt> 2539 + if it fails to use <tt>rcu_dereference()</tt>. 2540 + It could reuse a value formerly fetched from this same pointer. 2541 + It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time 2542 + manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise 2543 + mash-up of two distinct pointer values. 2544 + It might even use value-speculation optimizations, where it makes a wrong 2545 + guess, but by the time it gets around to checking the value, an update 2546 + has changed the pointer to match the wrong guess. 2547 + Too bad about any dereferences that returned pre-initialization garbage 2548 + in the meantime! 2549 + 2550 + <p> 2551 + For <tt>remove_gp_synchronous()</tt>, as long as all modifications 2552 + to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, 2553 + the above optimizations are harmless. 2554 + However, 2555 + with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, 2556 + <tt>sparse</tt> will complain if you 2557 + define <tt>gp</tt> with <tt>__rcu</tt> and then 2558 + access it without using 2559 + either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. 2560 + 2561 + 2562 + </p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a> 2563 + 2564 + <a name="qq5answer"></a> 2565 + <p><b>Quick Quiz 5</b>: 2566 + Given that multiple CPUs can start RCU read-side critical sections 2567 + at any time without any ordering whatsoever, how can RCU possibly tell whether 2568 + or not a given RCU read-side critical section starts before a 2569 + given instance of <tt>synchronize_rcu()</tt>?
2570 + 2571 + 2572 + </p><p><b>Answer</b>: 2573 + If RCU cannot tell whether or not a given 2574 + RCU read-side critical section starts before a 2575 + given instance of <tt>synchronize_rcu()</tt>, 2576 + then it must assume that the RCU read-side critical section 2577 + started first. 2578 + In other words, a given instance of <tt>synchronize_rcu()</tt> 2579 + can avoid waiting on a given RCU read-side critical section only 2580 + if it can prove that <tt>synchronize_rcu()</tt> started first. 2581 + 2582 + 2583 + </p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a> 2584 + 2585 + <a name="qq6answer"></a> 2586 + <p><b>Quick Quiz 6</b>: 2587 + The first and second guarantees require unbelievably strict ordering! 2588 + Are all these memory barriers <i> really</i> required? 2589 + 2590 + 2591 + </p><p><b>Answer</b>: 2592 + Yes, they really are required. 2593 + To see why the first guarantee is required, consider the following 2594 + sequence of events: 2595 + 2596 + <ol> 2597 + <li> CPU 1: <tt>rcu_read_lock()</tt> 2598 + <li> CPU 1: <tt>q = rcu_dereference(gp); 2599 + /* Very likely to return p. */</tt> 2600 + <li> CPU 0: <tt>list_del_rcu(p);</tt> 2601 + <li> CPU 0: <tt>synchronize_rcu()</tt> starts. 2602 + <li> CPU 1: <tt>do_something_with(q-&gt;a); 2603 + /* No smp_mb(), so might happen after kfree(). */</tt> 2604 + <li> CPU 1: <tt>rcu_read_unlock()</tt> 2605 + <li> CPU 0: <tt>synchronize_rcu()</tt> returns. 2606 + <li> CPU 0: <tt>kfree(p);</tt> 2607 + </ol> 2608 + 2609 + <p> 2610 + Therefore, there absolutely must be a full memory barrier between the 2611 + end of the RCU read-side critical section and the end of the 2612 + grace period. 2613 + 2614 + <p> 2615 + The sequence of events demonstrating the necessity of the second rule 2616 + is roughly similar: 2617 + 2618 + <ol> 2619 + <li> CPU 0: <tt>list_del_rcu(p);</tt> 2620 + <li> CPU 0: <tt>synchronize_rcu()</tt> starts. 
2621 + <li> CPU 1: <tt>rcu_read_lock()</tt> 2622 + <li> CPU 1: <tt>q = rcu_dereference(gp); 2623 + /* Might return p if no memory barrier. */</tt> 2624 + <li> CPU 0: <tt>synchronize_rcu()</tt> returns. 2625 + <li> CPU 0: <tt>kfree(p);</tt> 2626 + <li> CPU 1: <tt>do_something_with(q-&gt;a); /* Boom!!! */</tt> 2627 + <li> CPU 1: <tt>rcu_read_unlock()</tt> 2628 + </ol> 2629 + 2630 + <p> 2631 + And similarly, without a memory barrier between the beginning of the 2632 + grace period and the beginning of the RCU read-side critical section, 2633 + CPU&nbsp;1 might end up accessing the freelist. 2634 + 2635 + <p> 2636 + The &ldquo;as if&rdquo; rule of course applies, so that any implementation 2637 + that acts as if the appropriate memory barriers were in place is a 2638 + correct implementation. 2639 + That said, it is much easier to fool yourself into believing that you have 2640 + adhered to the as-if rule than it is to actually adhere to it! 2641 + 2642 + 2643 + </p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a> 2644 + 2645 + <a name="qq7answer"></a> 2646 + <p><b>Quick Quiz 7</b>: 2647 + But how does the upgrade-to-write operation exclude other readers? 2648 + 2649 + 2650 + </p><p><b>Answer</b>: 2651 + It doesn't, just like normal RCU updates, which also do not exclude 2652 + RCU readers. 2653 + 2654 + 2655 + </p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a> 2656 + 2657 + <a name="qq8answer"></a> 2658 + <p><b>Quick Quiz 8</b>: 2659 + Can't the compiler also reorder this code? 2660 + 2661 + 2662 + </p><p><b>Answer</b>: 2663 + No, the volatile casts in <tt>READ_ONCE()</tt> and 2664 + <tt>WRITE_ONCE()</tt> prevent the compiler from reordering in 2665 + this particular case. 2666 + 2667 + 2668 + </p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a> 2669 + 2670 + <a name="qq9answer"></a> 2671 + <p><b>Quick Quiz 9</b>: 2672 + Suppose that synchronize_rcu() did wait until all readers had completed. 
2673 + Would the updater be able to rely on this? 2674 + 2675 + 2676 + </p><p><b>Answer</b>: 2677 + No. 2678 + Even if <tt>synchronize_rcu()</tt> were to wait until 2679 + all readers had completed, a new reader might start immediately after 2680 + <tt>synchronize_rcu()</tt> completed. 2681 + Therefore, the code following 2682 + <tt>synchronize_rcu()</tt> cannot rely on there being no readers 2683 + in any case. 2684 + 2685 + 2686 + </p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a> 2687 + 2688 + <a name="qq10answer"></a> 2689 + <p><b>Quick Quiz 10</b>: 2690 + How long a sequence of grace periods, each separated by an RCU read-side 2691 + critical section, would be required to partition the RCU read-side 2692 + critical sections at the beginning and end of the chain? 2693 + 2694 + 2695 + </p><p><b>Answer</b>: 2696 + In theory, an infinite number. 2697 + In practice, an unknown number that is sensitive to both implementation 2698 + details and timing considerations. 2699 + Therefore, even in practice, RCU users must abide by the theoretical rather 2700 + than the practical answer. 2701 + 2702 + 2703 + </p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a> 2704 + 2705 + <a name="qq11answer"></a> 2706 + <p><b>Quick Quiz 11</b>: 2707 + What about sleeping locks? 2708 + 2709 + 2710 + </p><p><b>Answer</b>: 2711 + These are forbidden within Linux-kernel RCU read-side critical sections 2712 + because it is not legal to place a quiescent state (in this case, 2713 + voluntary context switch) within an RCU read-side critical section. 2714 + However, sleeping locks may be used within userspace RCU read-side critical 2715 + sections, and also within Linux-kernel sleepable RCU 2716 + <a href="#Sleepable RCU">(SRCU)</a> 2717 + read-side critical sections. 
2718 + In addition, the -rt patchset turns spinlocks into sleeping locks so 2719 + that the corresponding critical sections can be preempted, which 2720 + also means that these sleeplockified spinlocks (but not other sleeping locks!) 2721 + may be acquired within -rt-Linux-kernel RCU read-side critical sections. 2722 + 2723 + <p> 2724 + Note that it <i>is</i> legal for a normal RCU read-side critical section 2725 + to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>), 2726 + but only as long as it does not loop indefinitely attempting to 2727 + conditionally acquire that sleeping lock. 2728 + The key point is that things like <tt>mutex_trylock()</tt> 2729 + either return with the mutex held, or return an error indication if 2730 + the mutex was not immediately available. 2731 + Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. 2732 + 2733 + 2734 + </p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a> 2735 + 2736 + <a name="qq12answer"></a> 2737 + <p><b>Quick Quiz 12</b>: 2738 + Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>? 2739 + After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the 2740 + structure, which would interact badly with concurrent insertions. 2741 + Doesn't this mean that <tt>rcu_dereference()</tt> is required? 2742 + 2743 + 2744 + </p><p><b>Answer</b>: 2745 + Presumably the <tt>-&gt;gp_lock</tt> acquired on line&nbsp;18 excludes 2746 + any changes, including any insertions that <tt>rcu_dereference()</tt> 2747 + would protect against. 2748 + Therefore, any insertions will be delayed until after <tt>-&gt;gp_lock</tt> 2749 + is released on line&nbsp;25, which in turn means that 2750 + <tt>rcu_access_pointer()</tt> suffices.
2751 + 2752 + 2753 + </p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a> 2754 + 2755 + <a name="qq13answer"></a> 2756 + <p><b>Quick Quiz 13</b>: 2757 + Earlier it was claimed that <tt>call_rcu()</tt> and 2758 + <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked 2759 + by readers. 2760 + But how can that be correct, given that the invocation of the callback 2761 + and the freeing of the memory (respectively) must still wait for 2762 + a grace period to elapse? 2763 + 2764 + 2765 + </p><p><b>Answer</b>: 2766 + We could define things this way, but keep in mind that this sort of 2767 + definition would say that updates in garbage-collected languages 2768 + cannot complete until the next time the garbage collector runs, 2769 + which does not seem at all reasonable. 2770 + The key point is that in most cases, an updater using either 2771 + <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the 2772 + next update as soon as it has invoked <tt>call_rcu()</tt> or 2773 + <tt>kfree_rcu()</tt>, without having to wait for a subsequent 2774 + grace period. 2775 + 2776 + 2777 + </p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a> 2778 + 2779 + <a name="qq14answer"></a> 2780 + <p><b>Quick Quiz 14</b>: 2781 + So what happens with <tt>synchronize_rcu()</tt> during 2782 + scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> 2783 + kernels? 2784 + 2785 + 2786 + </p><p><b>Answer</b>: 2787 + In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> 2788 + maps directly to <tt>synchronize_sched()</tt>. 2789 + Therefore, <tt>synchronize_rcu()</tt> works normally throughout 2790 + boot in <tt>CONFIG_PREEMPT=n</tt> kernels. 2791 + However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, 2792 + so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> 2793 + during scheduler initialization. 
2794 + 2795 + 2796 + </p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a> 2797 + 2798 + 2799 + </body></html>
+2643
Documentation/RCU/Design/Requirements/Requirements.htmlx
··· 1 + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2 + "http://www.w3.org/TR/html4/loose.dtd"> 3 + <html> 4 + <head><title>A Tour Through RCU's Requirements [LWN.net]</title> 5 + <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> 6 + 7 + <h1>A Tour Through RCU's Requirements</h1> 8 + 9 + <p>Copyright IBM Corporation, 2015</p> 10 + <p>Author: Paul E.&nbsp;McKenney</p> 11 + <p><i>The initial version of this document appeared in the 12 + <a href="https://lwn.net/">LWN</a> articles 13 + <a href="https://lwn.net/Articles/652156/">here</a>, 14 + <a href="https://lwn.net/Articles/652677/">here</a>, and 15 + <a href="https://lwn.net/Articles/653326/">here</a>.</i></p> 16 + 17 + <h2>Introduction</h2> 18 + 19 + <p> 20 + Read-copy update (RCU) is a synchronization mechanism that is often 21 + used as a replacement for reader-writer locking. 22 + RCU is unusual in that updaters do not block readers, 23 + which means that RCU's read-side primitives can be exceedingly fast 24 + and scalable. 25 + In addition, updaters can make useful forward progress concurrently 26 + with readers. 27 + However, all this concurrency between RCU readers and updaters does raise 28 + the question of exactly what RCU readers are doing, which in turn 29 + raises the question of exactly what RCU's requirements are. 30 + 31 + <p> 32 + This document therefore summarizes RCU's requirements, and can be thought 33 + of as an informal, high-level specification for RCU. 34 + It is important to understand that RCU's specification is primarily 35 + empirical in nature; 36 + in fact, I learned about many of these requirements the hard way. 37 + This situation might cause some consternation, however, not only 38 + has this learning process been a lot of fun, but it has also been 39 + a great privilege to work with so many people willing to apply 40 + technologies in interesting new ways. 
41 + 42 + <p> 43 + All that aside, here are the categories of currently known RCU requirements: 44 + </p> 45 + 46 + <ol> 47 + <li> <a href="#Fundamental Requirements"> 48 + Fundamental Requirements</a> 49 + <li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a> 50 + <li> <a href="#Parallelism Facts of Life"> 51 + Parallelism Facts of Life</a> 52 + <li> <a href="#Quality-of-Implementation Requirements"> 53 + Quality-of-Implementation Requirements</a> 54 + <li> <a href="#Linux Kernel Complications"> 55 + Linux Kernel Complications</a> 56 + <li> <a href="#Software-Engineering Requirements"> 57 + Software-Engineering Requirements</a> 58 + <li> <a href="#Other RCU Flavors"> 59 + Other RCU Flavors</a> 60 + <li> <a href="#Possible Future Changes"> 61 + Possible Future Changes</a> 62 + </ol> 63 + 64 + <p> 65 + This is followed by a <a href="#Summary">summary</a>, 66 + which is in turn followed by the inevitable 67 + <a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. 68 + 69 + <h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> 70 + 71 + <p> 72 + RCU's fundamental requirements are the closest thing RCU has to hard 73 + mathematical requirements. 74 + These are: 75 + 76 + <ol> 77 + <li> <a href="#Grace-Period Guarantee"> 78 + Grace-Period Guarantee</a> 79 + <li> <a href="#Publish-Subscribe Guarantee"> 80 + Publish-Subscribe Guarantee</a> 81 + <li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally"> 82 + RCU Primitives Guaranteed to Execute Unconditionally</a> 83 + <li> <a href="#Guaranteed Read-to-Write Upgrade"> 84 + Guaranteed Read-to-Write Upgrade</a> 85 + </ol> 86 + 87 + <h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3> 88 + 89 + <p> 90 + RCU's grace-period guarantee is unusual in being premeditated: 91 + Jack Slingwine and I had this guarantee firmly in mind when we started 92 + work on RCU (then called &ldquo;rclock&rdquo;) in the early 1990s. 
93 + That said, the past two decades of experience with RCU have produced 94 + a much more detailed understanding of this guarantee. 95 + 96 + <p> 97 + RCU's grace-period guarantee allows updaters to wait for the completion 98 + of all pre-existing RCU read-side critical sections. 99 + An RCU read-side critical section 100 + begins with the marker <tt>rcu_read_lock()</tt> and ends with 101 + the marker <tt>rcu_read_unlock()</tt>. 102 + These markers may be nested, and RCU treats a nested set as one 103 + big RCU read-side critical section. 104 + Production-quality implementations of <tt>rcu_read_lock()</tt> and 105 + <tt>rcu_read_unlock()</tt> are extremely lightweight, and in 106 + fact have exactly zero overhead in Linux kernels built for production 107 + use with <tt>CONFIG_PREEMPT=n</tt>. 108 + 109 + <p> 110 + This guarantee allows ordering to be enforced with extremely low 111 + overhead to readers, for example: 112 + 113 + <blockquote> 114 + <pre> 115 + 1 int x, y; 116 + 2 117 + 3 void thread0(void) 118 + 4 { 119 + 5 rcu_read_lock(); 120 + 6 r1 = READ_ONCE(x); 121 + 7 r2 = READ_ONCE(y); 122 + 8 rcu_read_unlock(); 123 + 9 } 124 + 10 125 + 11 void thread1(void) 126 + 12 { 127 + 13 WRITE_ONCE(x, 1); 128 + 14 synchronize_rcu(); 129 + 15 WRITE_ONCE(y, 1); 130 + 16 } 131 + </pre> 132 + </blockquote> 133 + 134 + <p> 135 + Because the <tt>synchronize_rcu()</tt> on line&nbsp;14 waits for 136 + all pre-existing readers, any instance of <tt>thread0()</tt> that 137 + loads a value of zero from <tt>x</tt> must complete before 138 + <tt>thread1()</tt> stores to <tt>y</tt>, so that instance must 139 + also load a value of zero from <tt>y</tt>. 140 + Similarly, any instance of <tt>thread0()</tt> that loads a value of 141 + one from <tt>y</tt> must have started after the 142 + <tt>synchronize_rcu()</tt> started, and must therefore also load 143 + a value of one from <tt>x</tt>. 
144 + Therefore, the outcome: 145 + <blockquote> 146 + <pre> 147 + (r1 == 0 &amp;&amp; r2 == 1) 148 + </pre> 149 + </blockquote> 150 + cannot happen. 151 + 152 + <p>@@QQ@@ 153 + Wait a minute! 154 + You said that updaters can make useful forward progress concurrently 155 + with readers, but pre-existing readers will block 156 + <tt>synchronize_rcu()</tt>!!! 157 + Just who are you trying to fool??? 158 + <p>@@QQA@@ 159 + First, if updaters do not wish to be blocked by readers, they can use 160 + <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will 161 + be discussed later. 162 + Second, even when using <tt>synchronize_rcu()</tt>, the other 163 + update-side code does run concurrently with readers, whether pre-existing 164 + or not. 165 + <p>@@QQE@@ 166 + 167 + <p> 168 + This scenario resembles one of the first uses of RCU in 169 + <a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>, 170 + which managed a distributed lock manager's transition into 171 + a state suitable for handling recovery from node failure, 172 + more or less as follows: 173 + 174 + <blockquote> 175 + <pre> 176 + 1 #define STATE_NORMAL 0 177 + 2 #define STATE_WANT_RECOVERY 1 178 + 3 #define STATE_RECOVERING 2 179 + 4 #define STATE_WANT_NORMAL 3 180 + 5 181 + 6 int state = STATE_NORMAL; 182 + 7 183 + 8 void do_something_dlm(void) 184 + 9 { 185 + 10 int state_snap; 186 + 11 187 + 12 rcu_read_lock(); 188 + 13 state_snap = READ_ONCE(state); 189 + 14 if (state_snap == STATE_NORMAL) 190 + 15 do_something(); 191 + 16 else 192 + 17 do_something_carefully(); 193 + 18 rcu_read_unlock(); 194 + 19 } 195 + 20 196 + 21 void start_recovery(void) 197 + 22 { 198 + 23 WRITE_ONCE(state, STATE_WANT_RECOVERY); 199 + 24 synchronize_rcu(); 200 + 25 WRITE_ONCE(state, STATE_RECOVERING); 201 + 26 recovery(); 202 + 27 WRITE_ONCE(state, STATE_WANT_NORMAL); 203 + 28 synchronize_rcu(); 204 + 29 WRITE_ONCE(state, STATE_NORMAL); 205 + 30 } 206 + </pre> 207 + </blockquote> 208 + 209 + <p> 210 + The RCU read-side 
critical section in <tt>do_something_dlm()</tt> 211 + works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt> 212 + to guarantee that <tt>do_something()</tt> never runs concurrently 213 + with <tt>recovery()</tt>, but with little or no synchronization 214 + overhead in <tt>do_something_dlm()</tt>. 215 + 216 + <p>@@QQ@@ 217 + Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed? 218 + <p>@@QQA@@ 219 + Without that extra grace period, memory reordering could result in 220 + <tt>do_something_dlm()</tt> executing <tt>do_something()</tt> 221 + concurrently with the last bits of <tt>recovery()</tt>. 222 + <p>@@QQE@@ 223 + 224 + <p> 225 + In order to avoid fatal problems such as deadlocks, 226 + an RCU read-side critical section must not contain calls to 227 + <tt>synchronize_rcu()</tt>. 228 + Similarly, an RCU read-side critical section must not 229 + contain anything that waits, directly or indirectly, on completion of 230 + an invocation of <tt>synchronize_rcu()</tt>. 231 + 232 + <p> 233 + Although RCU's grace-period guarantee is useful in and of itself, with 234 + <a href="https://lwn.net/Articles/573497/">quite a few use cases</a>, 235 + it would be good to be able to use RCU to coordinate read-side 236 + access to linked data structures. 237 + For this, the grace-period guarantee is not sufficient, as can 238 + be seen in function <tt>add_gp_buggy()</tt> below. 239 + We will look at the reader's code later, but in the meantime, just think of 240 + the reader as locklessly picking up the <tt>gp</tt> pointer, 241 + and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the 242 + <tt>-&gt;a</tt> and <tt>-&gt;b</tt> fields. 
243 + 244 + <blockquote> 245 + <pre> 246 + 1 bool add_gp_buggy(int a, int b) 247 + 2 { 248 + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); 249 + 4 if (!p) 250 + 5 return -ENOMEM; 251 + 6 spin_lock(&amp;gp_lock); 252 + 7 if (rcu_access_pointer(gp)) { 253 + 8 spin_unlock(&amp;gp_lock); 254 + 9 return false; 255 + 10 } 256 + 11 p-&gt;a = a; 257 + 12 p-&gt;b = a; 258 + 13 gp = p; /* ORDERING BUG */ 259 + 14 spin_unlock(&amp;gp_lock); 260 + 15 return true; 261 + 16 } 262 + </pre> 263 + </blockquote> 264 + 265 + <p> 266 + The problem is that both the compiler and weakly ordered CPUs are within 267 + their rights to reorder this code as follows: 268 + 269 + <blockquote> 270 + <pre> 271 + 1 bool add_gp_buggy_optimized(int a, int b) 272 + 2 { 273 + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); 274 + 4 if (!p) 275 + 5 return -ENOMEM; 276 + 6 spin_lock(&amp;gp_lock); 277 + 7 if (rcu_access_pointer(gp)) { 278 + 8 spin_unlock(&amp;gp_lock); 279 + 9 return false; 280 + 10 } 281 + <b>11 gp = p; /* ORDERING BUG */ 282 + 12 p-&gt;a = a; 283 + 13 p-&gt;b = a;</b> 284 + 14 spin_unlock(&amp;gp_lock); 285 + 15 return true; 286 + 16 } 287 + </pre> 288 + </blockquote> 289 + 290 + <p> 291 + If an RCU reader fetches <tt>gp</tt> just after 292 + <tt>add_gp_buggy_optimized</tt> executes line&nbsp;11, 293 + it will see garbage in the <tt>-&gt;a</tt> and <tt>-&gt;b</tt> 294 + fields. 295 + And this is but one of many ways in which compiler and hardware optimizations 296 + could cause trouble. 297 + Therefore, we clearly need some way to prevent the compiler and the CPU from 298 + reordering in this manner, which brings us to the publish-subscribe 299 + guarantee discussed in the next section. 300 + 301 + <h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3> 302 + 303 + <p> 304 + RCU's publish-subscribe guarantee allows data to be inserted 305 + into a linked data structure without disrupting RCU readers. 
306 + The updater uses <tt>rcu_assign_pointer()</tt> to insert the 307 + new data, and readers use <tt>rcu_dereference()</tt> to 308 + access data, whether new or old. 309 + The following shows an example of insertion: 310 + 311 + <blockquote> 312 + <pre> 313 + 1 bool add_gp(int a, int b) 314 + 2 { 315 + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); 316 + 4 if (!p) 317 + 5 return -ENOMEM; 318 + 6 spin_lock(&amp;gp_lock); 319 + 7 if (rcu_access_pointer(gp)) { 320 + 8 spin_unlock(&amp;gp_lock); 321 + 9 return false; 322 + 10 } 323 + 11 p-&gt;a = a; 324 + 12 p-&gt;b = a; 325 + 13 rcu_assign_pointer(gp, p); 326 + 14 spin_unlock(&amp;gp_lock); 327 + 15 return true; 328 + 16 } 329 + </pre> 330 + </blockquote> 331 + 332 + <p> 333 + The <tt>rcu_assign_pointer()</tt> on line&nbsp;13 is conceptually 334 + equivalent to a simple assignment statement, but also guarantees 335 + that its assignment will 336 + happen after the two assignments in lines&nbsp;11 and&nbsp;12, 337 + similar to the C11 <tt>memory_order_release</tt> store operation. 338 + It also prevents any number of &ldquo;interesting&rdquo; compiler 339 + optimizations, for example, the use of <tt>gp</tt> as a scratch 340 + location immediately preceding the assignment. 341 + 342 + <p>@@QQ@@ 343 + But <tt>rcu_assign_pointer()</tt> does nothing to prevent the 344 + two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> 345 + from being reordered. 346 + Can't that also cause problems? 347 + <p>@@QQA@@ 348 + No, it cannot. 349 + The readers cannot see either of these two fields until 350 + the assignment to <tt>gp</tt>, by which time both fields are 351 + fully initialized. 352 + So reordering the assignments 353 + to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> cannot possibly 354 + cause any problems. 
355 + <p>@@QQE@@ 356 + 357 + <p> 358 + It is tempting to assume that the reader need not do anything special 359 + to control its accesses to the RCU-protected data, 360 + as shown in <tt>do_something_gp_buggy()</tt> below: 361 + 362 + <blockquote> 363 + <pre> 364 + 1 bool do_something_gp_buggy(void) 365 + 2 { 366 + 3 rcu_read_lock(); 367 + 4 p = gp; /* OPTIMIZATIONS GALORE!!! */ 368 + 5 if (p) { 369 + 6 do_something(p-&gt;a, p-&gt;b); 370 + 7 rcu_read_unlock(); 371 + 8 return true; 372 + 9 } 373 + 10 rcu_read_unlock(); 374 + 11 return false; 375 + 12 } 376 + </pre> 377 + </blockquote> 378 + 379 + <p> 380 + However, this temptation must be resisted because there are a 381 + surprisingly large number of ways that the compiler 382 + (to say nothing of 383 + <a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>) 384 + can trip this code up. 385 + For but one example, if the compiler were short of registers, it 386 + might choose to refetch from <tt>gp</tt> rather than keeping 387 + a separate copy in <tt>p</tt> as follows: 388 + 389 + <blockquote> 390 + <pre> 391 + 1 bool do_something_gp_buggy_optimized(void) 392 + 2 { 393 + 3 rcu_read_lock(); 394 + 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */ 395 + <b> 5 do_something(gp-&gt;a, gp-&gt;b);</b> 396 + 6 rcu_read_unlock(); 397 + 7 return true; 398 + 8 } 399 + 9 rcu_read_unlock(); 400 + 10 return false; 401 + 11 } 402 + </pre> 403 + </blockquote> 404 + 405 + <p> 406 + If this function ran concurrently with a series of updates that 407 + replaced the current structure with a new one, 408 + the fetches of <tt>gp-&gt;a</tt> 409 + and <tt>gp-&gt;b</tt> might well come from two different structures, 410 + which could cause serious confusion. 
411 + To prevent this (and much else besides), <tt>do_something_gp()</tt> uses 412 + <tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>: 413 + 414 + <blockquote> 415 + <pre> 416 + 1 bool do_something_gp(void) 417 + 2 { 418 + 3 rcu_read_lock(); 419 + 4 p = rcu_dereference(gp); 420 + 5 if (p) { 421 + 6 do_something(p-&gt;a, p-&gt;b); 422 + 7 rcu_read_unlock(); 423 + 8 return true; 424 + 9 } 425 + 10 rcu_read_unlock(); 426 + 11 return false; 427 + 12 } 428 + </pre> 429 + </blockquote> 430 + 431 + <p> 432 + The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha) 433 + memory barriers in the Linux kernel. 434 + Should a 435 + <a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a> 436 + ever appear, then <tt>rcu_dereference()</tt> could be implemented 437 + as a <tt>memory_order_consume</tt> load. 438 + Regardless of the exact implementation, a pointer fetched by 439 + <tt>rcu_dereference()</tt> may not be used outside of the 440 + outermost RCU read-side critical section containing that 441 + <tt>rcu_dereference()</tt>, unless protection of 442 + the corresponding data element has been passed from RCU to some 443 + other synchronization mechanism, most commonly locking or 444 + <a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>. 445 + 446 + <p> 447 + In short, updaters use <tt>rcu_assign_pointer()</tt> and readers 448 + use <tt>rcu_dereference()</tt>, and these two RCU API elements 449 + work together to ensure that readers have a consistent view of 450 + newly added data elements. 451 + 452 + <p> 453 + Of course, it is also necessary to remove elements from RCU-protected 454 + data structures, for example, using the following process: 455 + 456 + <ol> 457 + <li> Remove the data element from the enclosing structure. 
458 + <li> Wait for all pre-existing RCU read-side critical sections 459 + to complete (because only pre-existing readers can possibly have 460 + a reference to the newly removed data element). 461 + <li> At this point, only the updater has a reference to the 462 + newly removed data element, so it can safely reclaim 463 + the data element, for example, by passing it to <tt>kfree()</tt>. 464 + </ol> 465 + 466 + This process is implemented by <tt>remove_gp_synchronous()</tt>: 467 + 468 + <blockquote> 469 + <pre> 470 + 1 bool remove_gp_synchronous(void) 471 + 2 { 472 + 3 struct foo *p; 473 + 4 474 + 5 spin_lock(&amp;gp_lock); 475 + 6 p = rcu_access_pointer(gp); 476 + 7 if (!p) { 477 + 8 spin_unlock(&amp;gp_lock); 478 + 9 return false; 479 + 10 } 480 + 11 rcu_assign_pointer(gp, NULL); 481 + 12 spin_unlock(&amp;gp_lock); 482 + 13 synchronize_rcu(); 483 + 14 kfree(p); 484 + 15 return true; 485 + 16 } 486 + </pre> 487 + </blockquote> 488 + 489 + <p> 490 + This function is straightforward, with line&nbsp;13 waiting for a grace 491 + period before line&nbsp;14 frees the old data element. 492 + This waiting ensures that readers will reach line&nbsp;7 of 493 + <tt>do_something_gp()</tt> before the data element referenced by 494 + <tt>p</tt> is freed. 495 + The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to 496 + <tt>rcu_dereference()</tt>, except that: 497 + 498 + <ol> 499 + <li> The value returned by <tt>rcu_access_pointer()</tt> 500 + cannot be dereferenced. 501 + If you want to access the value pointed to as well as 502 + the pointer itself, use <tt>rcu_dereference()</tt> 503 + instead of <tt>rcu_access_pointer()</tt>. 504 + <li> The call to <tt>rcu_access_pointer()</tt> need not be 505 + protected. 506 + In contrast, <tt>rcu_dereference()</tt> must either be 507 + within an RCU read-side critical section or in a code 508 + segment where the pointer cannot change, for example, in 509 + code protected by the corresponding update-side lock. 
510 + </ol> 511 + 512 + <p>@@QQ@@ 513 + Without the <tt>rcu_dereference()</tt> or the 514 + <tt>rcu_access_pointer()</tt>, what destructive optimizations 515 + might the compiler make use of? 516 + <p>@@QQA@@ 517 + Let's start with what happens to <tt>do_something_gp()</tt> 518 + if it fails to use <tt>rcu_dereference()</tt>. 519 + It could reuse a value formerly fetched from this same pointer. 520 + It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time 521 + manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise 522 + mash-up of two distinct pointer values. 523 + It might even use value-speculation optimizations, where it makes a wrong 524 + guess, but by the time it gets around to checking the value, an update 525 + has changed the pointer to match the wrong guess. 526 + Too bad about any dereferences that returned pre-initialization garbage 527 + in the meantime! 528 + 529 + <p> 530 + For <tt>remove_gp_synchronous()</tt>, as long as all modifications 531 + to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, 532 + the above optimizations are harmless. 533 + However, 534 + with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, 535 + <tt>sparse</tt> will complain if you 536 + define <tt>gp</tt> with <tt>__rcu</tt> and then 537 + access it without using 538 + either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. 539 + <p>@@QQE@@ 540 + 541 + <p> 542 + This simple linked-data-structure scenario clearly demonstrates the need 543 + for RCU's stringent memory-ordering guarantees on systems with more than 544 + one CPU: 545 + 546 + <ol> 547 + <li> Each CPU that has an RCU read-side critical section that 548 + begins before <tt>synchronize_rcu()</tt> starts is 549 + guaranteed to execute a full memory barrier between the time 550 + that the RCU read-side critical section ends and the time that 551 + <tt>synchronize_rcu()</tt> returns.
552 + Without this guarantee, a pre-existing RCU read-side critical section 553 + might hold a reference to the newly removed <tt>struct foo</tt> 554 + after the <tt>kfree()</tt> on line&nbsp;14 of 555 + <tt>remove_gp_synchronous()</tt>. 556 + <li> Each CPU that has an RCU read-side critical section that ends 557 + after <tt>synchronize_rcu()</tt> returns is guaranteed 558 + to execute a full memory barrier between the time that 559 + <tt>synchronize_rcu()</tt> begins and the time that the RCU 560 + read-side critical section begins. 561 + Without this guarantee, a later RCU read-side critical section 562 + running after the <tt>kfree()</tt> on line&nbsp;14 of 563 + <tt>remove_gp_synchronous()</tt> might 564 + later run <tt>do_something_gp()</tt> and find the 565 + newly deleted <tt>struct foo</tt>. 566 + <li> If the task invoking <tt>synchronize_rcu()</tt> remains 567 + on a given CPU, then that CPU is guaranteed to execute a full 568 + memory barrier sometime during the execution of 569 + <tt>synchronize_rcu()</tt>. 570 + This guarantee ensures that the <tt>kfree()</tt> on 571 + line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does 572 + execute after the removal on line&nbsp;11. 573 + <li> If the task invoking <tt>synchronize_rcu()</tt> migrates 574 + among a group of CPUs during that invocation, then each of the 575 + CPUs in that group is guaranteed to execute a full memory barrier 576 + sometime during the execution of <tt>synchronize_rcu()</tt>. 577 + This guarantee also ensures that the <tt>kfree()</tt> on 578 + line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does 579 + execute after the removal on 580 + line&nbsp;11, but also in the case where the thread executing the 581 + <tt>synchronize_rcu()</tt> migrates in the meantime. 
582 + </ol> 583 + 584 + <p>@@QQ@@ 585 + Given that multiple CPUs can start RCU read-side critical sections 586 + at any time without any ordering whatsoever, how can RCU possibly tell whether 587 + or not a given RCU read-side critical section starts before a 588 + given instance of <tt>synchronize_rcu()</tt>? 589 + <p>@@QQA@@ 590 + If RCU cannot tell whether or not a given 591 + RCU read-side critical section starts before a 592 + given instance of <tt>synchronize_rcu()</tt>, 593 + then it must assume that the RCU read-side critical section 594 + started first. 595 + In other words, a given instance of <tt>synchronize_rcu()</tt> 596 + can avoid waiting on a given RCU read-side critical section only 597 + if it can prove that <tt>synchronize_rcu()</tt> started first. 598 + <p>@@QQE@@ 599 + 600 + <p>@@QQ@@ 601 + The first and second guarantees require unbelievably strict ordering! 602 + Are all these memory barriers <i> really</i> required? 603 + <p>@@QQA@@ 604 + Yes, they really are required. 605 + To see why the first guarantee is required, consider the following 606 + sequence of events: 607 + 608 + <ol> 609 + <li> CPU 1: <tt>rcu_read_lock()</tt> 610 + <li> CPU 1: <tt>q = rcu_dereference(gp); 611 + /* Very likely to return p. */</tt> 612 + <li> CPU 0: <tt>list_del_rcu(p);</tt> 613 + <li> CPU 0: <tt>synchronize_rcu()</tt> starts. 614 + <li> CPU 1: <tt>do_something_with(q-&gt;a); 615 + /* No smp_mb(), so might happen after kfree(). */</tt> 616 + <li> CPU 1: <tt>rcu_read_unlock()</tt> 617 + <li> CPU 0: <tt>synchronize_rcu()</tt> returns. 618 + <li> CPU 0: <tt>kfree(p);</tt> 619 + </ol> 620 + 621 + <p> 622 + Therefore, there absolutely must be a full memory barrier between the 623 + end of the RCU read-side critical section and the end of the 624 + grace period. 
625 + 626 + <p> 627 + The sequence of events demonstrating the necessity of the second rule 628 + is roughly similar: 629 + 630 + <ol> 631 + <li> CPU 0: <tt>list_del_rcu(p);</tt> 632 + <li> CPU 0: <tt>synchronize_rcu()</tt> starts. 633 + <li> CPU 1: <tt>rcu_read_lock()</tt> 634 + <li> CPU 1: <tt>q = rcu_dereference(gp); 635 + /* Might return p if no memory barrier. */</tt> 636 + <li> CPU 0: <tt>synchronize_rcu()</tt> returns. 637 + <li> CPU 0: <tt>kfree(p);</tt> 638 + <li> CPU 1: <tt>do_something_with(q-&gt;a); /* Boom!!! */</tt> 639 + <li> CPU 1: <tt>rcu_read_unlock()</tt> 640 + </ol> 641 + 642 + <p> 643 + And similarly, without a memory barrier between the beginning of the 644 + grace period and the beginning of the RCU read-side critical section, 645 + CPU&nbsp;1 might end up accessing the freelist. 646 + 647 + <p> 648 + The &ldquo;as if&rdquo; rule of course applies, so that any implementation 649 + that acts as if the appropriate memory barriers were in place is a 650 + correct implementation. 651 + That said, it is much easier to fool yourself into believing that you have 652 + adhered to the as-if rule than it is to actually adhere to it! 653 + <p>@@QQE@@ 654 + 655 + <p> 656 + In short, RCU's publish-subscribe guarantee is provided by the combination 657 + of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>. 658 + This guarantee allows data elements to be safely added to RCU-protected 659 + linked data structures without disrupting RCU readers. 660 + This guarantee can be used in combination with the grace-period 661 + guarantee to also allow data elements to be removed from RCU-protected 662 + linked data structures, again without disrupting RCU readers. 663 + 664 + <p> 665 + This guarantee was only partially premeditated. 
666 + DYNIX/ptx used an explicit memory barrier for publication, but had nothing 667 + resembling <tt>rcu_dereference()</tt> for subscription, nor did it 668 + have anything resembling the <tt>smp_read_barrier_depends()</tt> 669 + that was later subsumed into <tt>rcu_dereference()</tt>. 670 + The need for these operations made itself known quite suddenly at a 671 + late-1990s meeting with the DEC Alpha architects, back in the days when 672 + DEC was still a free-standing company. 673 + It took the Alpha architects a good hour to convince me that any sort 674 + of barrier would ever be needed, and it then took me a good <i>two</i> hours 675 + to convince them that their documentation did not make this point clear. 676 + More recent work with the C and C++ standards committees have provided 677 + much education on tricks and traps from the compiler. 678 + In short, compilers were much less tricky in the early 1990s, but in 679 + 2015, don't even think about omitting <tt>rcu_dereference()</tt>! 680 + 681 + <h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3> 682 + 683 + <p> 684 + The common-case RCU primitives are unconditional. 685 + They are invoked, they do their job, and they return, with no possibility 686 + of error, and no need to retry. 687 + This is a key RCU design philosophy. 688 + 689 + <p> 690 + However, this philosophy is pragmatic rather than pigheaded. 691 + If someone comes up with a good justification for a particular conditional 692 + RCU primitive, it might well be implemented and added. 693 + After all, this guarantee was reverse-engineered, not premeditated. 694 + The unconditional nature of the RCU primitives was initially an 695 + accident of implementation, and later experience with synchronization 696 + primitives with conditional primitives caused me to elevate this 697 + accident to a guarantee. 
698 + Therefore, the justification for adding a conditional primitive to 699 + RCU would need to be based on detailed and compelling use cases. 700 + 701 + <h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3> 702 + 703 + <p> 704 + As far as RCU is concerned, it is always possible to carry out an 705 + update within an RCU read-side critical section. 706 + For example, that RCU read-side critical section might search for 707 + a given data element, and then might acquire the update-side 708 + spinlock in order to update that element, all while remaining 709 + in that RCU read-side critical section. 710 + Of course, it is necessary to exit the RCU read-side critical section 711 + before invoking <tt>synchronize_rcu()</tt>, however, this 712 + inconvenience can be avoided through use of the 713 + <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members 714 + described later in this document. 715 + 716 + <p>@@QQ@@ 717 + But how does the upgrade-to-write operation exclude other readers? 718 + <p>@@QQA@@ 719 + It doesn't, just like normal RCU updates, which also do not exclude 720 + RCU readers. 721 + <p>@@QQE@@ 722 + 723 + <p> 724 + This guarantee allows lookup code to be shared between read-side 725 + and update-side code, and was premeditated, appearing in the earliest 726 + DYNIX/ptx RCU documentation. 727 + 728 + <h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2> 729 + 730 + <p> 731 + RCU provides extremely lightweight readers, and its read-side guarantees, 732 + though quite useful, are correspondingly lightweight. 733 + It is therefore all too easy to assume that RCU is guaranteeing more 734 + than it really is. 735 + Of course, the list of things that RCU does not guarantee is infinitely 736 + long, however, the following sections list a few non-guarantees that 737 + have caused confusion. 738 + Except where otherwise noted, these non-guarantees were premeditated. 
739 + 740 + <ol> 741 + <li> <a href="#Readers Impose Minimal Ordering"> 742 + Readers Impose Minimal Ordering</a> 743 + <li> <a href="#Readers Do Not Exclude Updaters"> 744 + Readers Do Not Exclude Updaters</a> 745 + <li> <a href="#Updaters Only Wait For Old Readers"> 746 + Updaters Only Wait For Old Readers</a> 747 + <li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections"> 748 + Grace Periods Don't Partition Read-Side Critical Sections</a> 749 + <li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods"> 750 + Read-Side Critical Sections Don't Partition Grace Periods</a> 751 + <li> <a href="#Disabling Preemption Does Not Block Grace Periods"> 752 + Disabling Preemption Does Not Block Grace Periods</a> 753 + </ol> 754 + 755 + <h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3> 756 + 757 + <p> 758 + Reader-side markers such as <tt>rcu_read_lock()</tt> and 759 + <tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees 760 + except through their interaction with the grace-period APIs such as 761 + <tt>synchronize_rcu()</tt>. 
762 + To see this, consider the following pair of threads: 763 + 764 + <blockquote> 765 + <pre> 766 + 1 void thread0(void) 767 + 2 { 768 + 3 rcu_read_lock(); 769 + 4 WRITE_ONCE(x, 1); 770 + 5 rcu_read_unlock(); 771 + 6 rcu_read_lock(); 772 + 7 WRITE_ONCE(y, 1); 773 + 8 rcu_read_unlock(); 774 + 9 } 775 + 10 776 + 11 void thread1(void) 777 + 12 { 778 + 13 rcu_read_lock(); 779 + 14 r1 = READ_ONCE(y); 780 + 15 rcu_read_unlock(); 781 + 16 rcu_read_lock(); 782 + 17 r2 = READ_ONCE(x); 783 + 18 rcu_read_unlock(); 784 + 19 } 785 + </pre> 786 + </blockquote> 787 + 788 + <p> 789 + After <tt>thread0()</tt> and <tt>thread1()</tt> execute 790 + concurrently, it is quite possible to have 791 + 792 + <blockquote> 793 + <pre> 794 + (r1 == 1 &amp;&amp; r2 == 0) 795 + </pre> 796 + </blockquote> 797 + 798 + (that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>), 799 + which would not be possible if <tt>rcu_read_lock()</tt> and 800 + <tt>rcu_read_unlock()</tt> had much in the way of ordering 801 + properties. 802 + But they do not, so the CPU is within its rights 803 + to do significant reordering. 804 + This is by design: Any significant ordering constraints would slow down 805 + these fast-path APIs. 806 + 807 + <p>@@QQ@@ 808 + Can't the compiler also reorder this code? 809 + <p>@@QQA@@ 810 + No, the volatile casts in <tt>READ_ONCE()</tt> and 811 + <tt>WRITE_ONCE()</tt> prevent the compiler from reordering in 812 + this particular case. 813 + <p>@@QQE@@ 814 + 815 + <h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> 816 + 817 + <p> 818 + Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt> 819 + exclude updates. 820 + All they do is to prevent grace periods from ending. 
821 + The following example illustrates this: 822 + 823 + <blockquote> 824 + <pre> 825 + 1 void thread0(void) 826 + 2 { 827 + 3 rcu_read_lock(); 828 + 4 r1 = READ_ONCE(y); 829 + 5 if (r1) { 830 + 6 do_something_with_nonzero_x(); 831 + 7 r2 = READ_ONCE(x); 832 + 8 WARN_ON(!r2); /* BUG!!! */ 833 + 9 } 834 + 10 rcu_read_unlock(); 835 + 11 } 836 + 12 837 + 13 void thread1(void) 838 + 14 { 839 + 15 spin_lock(&amp;my_lock); 840 + 16 WRITE_ONCE(x, 1); 841 + 17 WRITE_ONCE(y, 1); 842 + 18 spin_unlock(&amp;my_lock); 843 + 19 } 844 + </pre> 845 + </blockquote> 846 + 847 + <p> 848 + If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt> 849 + excluded the <tt>thread1()</tt> function's update, 850 + the <tt>WARN_ON()</tt> could never fire. 851 + But the fact is that <tt>rcu_read_lock()</tt> does not exclude 852 + much of anything aside from subsequent grace periods, of which 853 + <tt>thread1()</tt> has none, so the 854 + <tt>WARN_ON()</tt> can and does fire. 855 + 856 + <h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3> 857 + 858 + <p> 859 + It might be tempting to assume that after <tt>synchronize_rcu()</tt> 860 + completes, there are no readers executing. 861 + This temptation must be avoided because 862 + new readers can start immediately after <tt>synchronize_rcu()</tt> 863 + starts, and <tt>synchronize_rcu()</tt> is under no 864 + obligation to wait for these new readers. 865 + 866 + <p>@@QQ@@ 867 + Suppose that synchronize_rcu() did wait until all readers had completed. 868 + Would the updater be able to rely on this? 869 + <p>@@QQA@@ 870 + No. 871 + Even if <tt>synchronize_rcu()</tt> were to wait until 872 + all readers had completed, a new reader might start immediately after 873 + <tt>synchronize_rcu()</tt> completed. 874 + Therefore, the code following 875 + <tt>synchronize_rcu()</tt> cannot rely on there being no readers 876 + in any case. 
877 + <p>@@QQE@@ 878 + 879 + <h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> 880 + Grace Periods Don't Partition Read-Side Critical Sections</a></h3> 881 + 882 + <p> 883 + It is tempting to assume that if any part of one RCU read-side critical 884 + section precedes a given grace period, and if any part of another RCU 885 + read-side critical section follows that same grace period, then all of 886 + the first RCU read-side critical section must precede all of the second. 887 + However, this just isn't the case: A single grace period does not 888 + partition the set of RCU read-side critical sections. 889 + An example of this situation can be illustrated as follows, where 890 + <tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero: 891 + 892 + <blockquote> 893 + <pre> 894 + 1 void thread0(void) 895 + 2 { 896 + 3 rcu_read_lock(); 897 + 4 WRITE_ONCE(a, 1); 898 + 5 WRITE_ONCE(b, 1); 899 + 6 rcu_read_unlock(); 900 + 7 } 901 + 8 902 + 9 void thread1(void) 903 + 10 { 904 + 11 r1 = READ_ONCE(a); 905 + 12 synchronize_rcu(); 906 + 13 WRITE_ONCE(c, 1); 907 + 14 } 908 + 15 909 + 16 void thread2(void) 910 + 17 { 911 + 18 rcu_read_lock(); 912 + 19 r2 = READ_ONCE(b); 913 + 20 r3 = READ_ONCE(c); 914 + 21 rcu_read_unlock(); 915 + 22 } 916 + </pre> 917 + </blockquote> 918 + 919 + <p> 920 + It turns out that the outcome: 921 + 922 + <blockquote> 923 + <pre> 924 + (r1 == 1 &amp;&amp; r2 == 0 &amp;&amp; r3 == 1) 925 + </pre> 926 + </blockquote> 927 + 928 + is entirely possible.
929 + The following figure show how this can happen, with each circled 930 + <tt>QS</tt> indicating the point at which RCU recorded a 931 + <i>quiescent state</i> for each thread, that is, a state in which 932 + RCU knows that the thread cannot be in the midst of an RCU read-side 933 + critical section that started before the current grace period: 934 + 935 + <p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p> 936 + 937 + <p> 938 + If it is necessary to partition RCU read-side critical sections in this 939 + manner, it is necessary to use two grace periods, where the first 940 + grace period is known to end before the second grace period starts: 941 + 942 + <blockquote> 943 + <pre> 944 + 1 void thread0(void) 945 + 2 { 946 + 3 rcu_read_lock(); 947 + 4 WRITE_ONCE(a, 1); 948 + 5 WRITE_ONCE(b, 1); 949 + 6 rcu_read_unlock(); 950 + 7 } 951 + 8 952 + 9 void thread1(void) 953 + 10 { 954 + 11 r1 = READ_ONCE(a); 955 + 12 synchronize_rcu(); 956 + 13 WRITE_ONCE(c, 1); 957 + 14 } 958 + 15 959 + 16 void thread2(void) 960 + 17 { 961 + 18 r2 = READ_ONCE(c); 962 + 19 synchronize_rcu(); 963 + 20 WRITE_ONCE(d, 1); 964 + 21 } 965 + 22 966 + 23 void thread3(void) 967 + 24 { 968 + 25 rcu_read_lock(); 969 + 26 r3 = READ_ONCE(b); 970 + 27 r4 = READ_ONCE(d); 971 + 28 rcu_read_unlock(); 972 + 29 } 973 + </pre> 974 + </blockquote> 975 + 976 + <p> 977 + Here, if <tt>(r1 == 1)</tt>, then 978 + <tt>thread0()</tt>'s write to <tt>b</tt> must happen 979 + before the end of <tt>thread1()</tt>'s grace period. 980 + If in addition <tt>(r4 == 1)</tt>, then 981 + <tt>thread3()</tt>'s read from <tt>b</tt> must happen 982 + after the beginning of <tt>thread2()</tt>'s grace period. 983 + If it is also the case that <tt>(r2 == 1)</tt>, then the 984 + end of <tt>thread1()</tt>'s grace period must precede the 985 + beginning of <tt>thread2()</tt>'s grace period. 
986 + This mean that the two RCU read-side critical sections cannot overlap, 987 + guaranteeing that <tt>(r3 == 1)</tt>. 988 + As a result, the outcome: 989 + 990 + <blockquote> 991 + <pre> 992 + (r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 0 &amp;&amp; r4 == 1) 993 + </pre> 994 + </blockquote> 995 + 996 + cannot happen. 997 + 998 + <p> 999 + This non-requirement was also non-premeditated, but became apparent 1000 + when studying RCU's interaction with memory ordering. 1001 + 1002 + <h3><a name="Read-Side Critical Sections Don't Partition Grace Periods"> 1003 + Read-Side Critical Sections Don't Partition Grace Periods</a></h3> 1004 + 1005 + <p> 1006 + It is also tempting to assume that if an RCU read-side critical section 1007 + happens between a pair of grace periods, then those grace periods cannot 1008 + overlap. 1009 + However, this temptation leads nowhere good, as can be illustrated by 1010 + the following, with all variables initially zero: 1011 + 1012 + <blockquote> 1013 + <pre> 1014 + 1 void thread0(void) 1015 + 2 { 1016 + 3 rcu_read_lock(); 1017 + 4 WRITE_ONCE(a, 1); 1018 + 5 WRITE_ONCE(b, 1); 1019 + 6 rcu_read_unlock(); 1020 + 7 } 1021 + 8 1022 + 9 void thread1(void) 1023 + 10 { 1024 + 11 r1 = READ_ONCE(a); 1025 + 12 synchronize_rcu(); 1026 + 13 WRITE_ONCE(c, 1); 1027 + 14 } 1028 + 15 1029 + 16 void thread2(void) 1030 + 17 { 1031 + 18 rcu_read_lock(); 1032 + 19 WRITE_ONCE(d, 1); 1033 + 20 r2 = READ_ONCE(c); 1034 + 21 rcu_read_unlock(); 1035 + 22 } 1036 + 23 1037 + 24 void thread3(void) 1038 + 25 { 1039 + 26 r3 = READ_ONCE(d); 1040 + 27 synchronize_rcu(); 1041 + 28 WRITE_ONCE(e, 1); 1042 + 29 } 1043 + 30 1044 + 31 void thread4(void) 1045 + 32 { 1046 + 33 rcu_read_lock(); 1047 + 34 r4 = READ_ONCE(b); 1048 + 35 r5 = READ_ONCE(e); 1049 + 36 rcu_read_unlock(); 1050 + 37 } 1051 + </pre> 1052 + </blockquote> 1053 + 1054 + <p> 1055 + In this case, the outcome: 1056 + 1057 + <blockquote> 1058 + <pre> 1059 + (r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 1 
&amp;&amp; r4 == 0 &amp;&amp; r5 == 1) 1060 + </pre> 1061 + </blockquote> 1062 + 1063 + is entirely possible, as illustrated below: 1064 + 1065 + <p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p> 1066 + 1067 + <p> 1068 + Again, an RCU read-side critical section can overlap almost all of a 1069 + given grace period, just so long as it does not overlap the entire 1070 + grace period. 1071 + As a result, an RCU read-side critical section cannot partition a pair 1072 + of RCU grace periods. 1073 + 1074 + <p>@@QQ@@ 1075 + How long a sequence of grace periods, each separated by an RCU read-side 1076 + critical section, would be required to partition the RCU read-side 1077 + critical sections at the beginning and end of the chain? 1078 + <p>@@QQA@@ 1079 + In theory, an infinite number. 1080 + In practice, an unknown number that is sensitive to both implementation 1081 + details and timing considerations. 1082 + Therefore, even in practice, RCU users must abide by the theoretical rather 1083 + than the practical answer. 1084 + <p>@@QQE@@ 1085 + 1086 + <h3><a name="Disabling Preemption Does Not Block Grace Periods"> 1087 + Disabling Preemption Does Not Block Grace Periods</a></h3> 1088 + 1089 + <p> 1090 + There was a time when disabling preemption on any given CPU would block 1091 + subsequent grace periods. 1092 + However, this was an accident of implementation and is not a requirement. 1093 + And in the current Linux-kernel implementation, disabling preemption 1094 + on a given CPU in fact does not block grace periods, as Oleg Nesterov 1095 + <a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
1096 + 1097 + <p> 1098 + If you need a preempt-disable region to block grace periods, you need to add 1099 + <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example 1100 + as follows: 1101 + 1102 + <blockquote> 1103 + <pre> 1104 + 1 preempt_disable(); 1105 + 2 rcu_read_lock(); 1106 + 3 do_something(); 1107 + 4 rcu_read_unlock(); 1108 + 5 preempt_enable(); 1109 + 6 1110 + 7 /* Spinlocks implicitly disable preemption. */ 1111 + 8 spin_lock(&amp;mylock); 1112 + 9 rcu_read_lock(); 1113 + 10 do_something(); 1114 + 11 rcu_read_unlock(); 1115 + 12 spin_unlock(&amp;mylock); 1116 + </pre> 1117 + </blockquote> 1118 + 1119 + <p> 1120 + In theory, you could enter the RCU read-side critical section first, 1121 + but it is more efficient to keep the entire RCU read-side critical 1122 + section contained in the preempt-disable region as shown above. 1123 + Of course, RCU read-side critical sections that extend outside of 1124 + preempt-disable regions will work correctly, but such critical sections 1125 + can be preempted, which forces <tt>rcu_read_unlock()</tt> to do 1126 + more work. 1127 + And no, this is <i>not</i> an invitation to enclose all of your RCU 1128 + read-side critical sections within preempt-disable regions, because 1129 + doing so would degrade real-time response. 1130 + 1131 + <p> 1132 + This non-requirement appeared with preemptible RCU. 1133 + If you need a grace period that waits on non-preemptible code regions, use 1134 + <a href="#Sched Flavor">RCU-sched</a>. 1135 + 1136 + <h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2> 1137 + 1138 + <p> 1139 + These parallelism facts of life are by no means specific to RCU, but 1140 + the RCU implementation must abide by them. 1141 + They therefore bear repeating: 1142 + 1143 + <ol> 1144 + <li> Any CPU or task may be delayed at any time, 1145 + and any attempts to avoid these delays by disabling 1146 + preemption, interrupts, or whatever are completely futile. 
1147 + This is most obvious in preemptible user-level 1148 + environments and in virtualized environments (where 1149 + a given guest OS's VCPUs can be preempted at any time by 1150 + the underlying hypervisor), but can also happen in bare-metal 1151 + environments due to ECC errors, NMIs, and other hardware 1152 + events. 1153 + Although a delay of more than about 20 seconds can result 1154 + in splats, the RCU implementation is obligated to use 1155 + algorithms that can tolerate extremely long delays, but where 1156 + &ldquo;extremely long&rdquo; is not long enough to allow 1157 + wrap-around when incrementing a 64-bit counter. 1158 + <li> Both the compiler and the CPU can reorder memory accesses. 1159 + Where it matters, RCU must use compiler directives and 1160 + memory-barrier instructions to preserve ordering. 1161 + <li> Conflicting writes to memory locations in any given cache line 1162 + will result in expensive cache misses. 1163 + Greater numbers of concurrent writes and more-frequent 1164 + concurrent writes will result in more dramatic slowdowns. 1165 + RCU is therefore obligated to use algorithms that have 1166 + sufficient locality to avoid significant performance and 1167 + scalability problems. 1168 + <li> As a rough rule of thumb, only one CPU's worth of processing 1169 + may be carried out under the protection of any given exclusive 1170 + lock. 1171 + RCU must therefore use scalable locking designs. 1172 + <li> Counters are finite, especially on 32-bit systems. 1173 + RCU's use of counters must therefore tolerate counter wrap, 1174 + or be designed such that counter wrap would take way more 1175 + time than a single system is likely to run. 1176 + An uptime of ten years is quite possible, a runtime 1177 + of a century much less so. 1178 + As an example of the latter, RCU's dyntick-idle nesting counter 1179 + allows 54 bits for interrupt nesting level (this counter 1180 + is 64 bits even on a 32-bit system). 
1181 + Overflowing this counter requires 2<sup>54</sup> 1182 + half-interrupts on a given CPU without that CPU ever going idle. 1183 + If a half-interrupt happened every microsecond, it would take 1184 + 570 years of runtime to overflow this counter, which is currently 1185 + believed to be an acceptably long time. 1186 + <li> Linux systems can have thousands of CPUs running a single 1187 + Linux kernel in a single shared-memory environment. 1188 + RCU must therefore pay close attention to high-end scalability. 1189 + </ol> 1190 + 1191 + <p> 1192 + This last parallelism fact of life means that RCU must pay special 1193 + attention to the preceding facts of life. 1194 + The idea that Linux might scale to systems with thousands of CPUs would 1195 + have been met with some skepticism in the 1990s, but these requirements 1196 + would otherwise have been unsurprising, even in the early 1990s. 1197 + 1198 + <h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2> 1199 + 1200 + <p> 1201 + These sections list quality-of-implementation requirements. 1202 + Although an RCU implementation that ignores these requirements could 1203 + still be used, it would likely be subject to limitations that would 1204 + make it inappropriate for industrial-strength production use. 1205 + Classes of quality-of-implementation requirements are as follows: 1206 + 1207 + <ol> 1208 + <li> <a href="#Specialization">Specialization</a> 1209 + <li> <a href="#Performance and Scalability">Performance and Scalability</a> 1210 + <li> <a href="#Composability">Composability</a> 1211 + <li> <a href="#Corner Cases">Corner Cases</a> 1212 + </ol> 1213 + 1214 + <p> 1215 + These classes are covered in the following sections. 1216 + 1217 + <h3><a name="Specialization">Specialization</a></h3> 1218 + 1219 + <p> 1220 + RCU is and always has been intended primarily for read-mostly situations, as 1221 + illustrated by the following figure.
1222 + This means that RCU's read-side primitives are optimized, often at the
1223 + expense of its update-side primitives.
1224 +
1225 + <p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p>
1226 +
1227 + <p>
1228 + This focus on read-mostly situations means that RCU must interoperate
1229 + with other synchronization primitives.
1230 + For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt>
1231 + examples discussed earlier use RCU to protect readers and locking to
1232 + coordinate updaters.
1233 + However, the need extends much farther, requiring that a variety of
1234 + synchronization primitives be legal within RCU read-side critical sections,
1235 + including spinlocks, sequence locks, atomic operations, reference
1236 + counters, and memory barriers.
1237 +
1238 + <p>@@QQ@@
1239 + What about sleeping locks?
1240 + <p>@@QQA@@
1241 + These are forbidden within Linux-kernel RCU read-side critical sections
1242 + because it is not legal to place a quiescent state (in this case,
1243 + voluntary context switch) within an RCU read-side critical section.
1244 + However, sleeping locks may be used within userspace RCU read-side critical
1245 + sections, and also within Linux-kernel sleepable RCU
1246 + <a href="#Sleepable RCU">(SRCU)</a>
1247 + read-side critical sections.
1248 + In addition, the -rt patchset turns spinlocks into sleeping locks so
1249 + that the corresponding critical sections can be preempted, which
1250 + also means that these sleeplockified spinlocks (but not other sleeping locks!)
1251 + may be acquired within -rt-Linux-kernel RCU read-side critical sections.
1252 +
1253 + <p>
1254 + Note that it <i>is</i> legal for a normal RCU read-side critical section
1255 + to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>),
1256 + but only as long as it does not loop indefinitely attempting to
1257 + conditionally acquire that sleeping lock.
1258 + The key point is that things like <tt>mutex_trylock()</tt>
1259 + either return with the mutex held, or return an error indication if
1260 + the mutex was not immediately available.
1261 + Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
1262 + <p>@@QQE@@
1263 +
1264 + <p>
1265 + It often comes as a surprise that many algorithms do not require a
1266 + consistent view of data, but many can function in that mode,
1267 + with network routing being the poster child.
1268 + Internet routing algorithms take significant time to propagate
1269 + updates, so that by the time an update arrives at a given system,
1270 + that system has been sending network traffic the wrong way for
1271 + a considerable length of time.
1272 + Having a few threads continue to send traffic the wrong way for a
1273 + few more milliseconds is clearly not a problem: In the worst case,
1274 + TCP retransmissions will eventually get the data where it needs to go.
1275 + In general, when tracking the state of the universe outside of the
1276 + computer, some level of inconsistency must be tolerated due to
1277 + speed-of-light delays if nothing else.
1278 +
1279 + <p>
1280 + Furthermore, uncertainty about external state is inherent in many cases.
1281 + For example, a pair of veterinarians might use heartbeat to determine
1282 + whether or not a given cat was alive.
1283 + But how long should they wait after the last heartbeat to decide that
1284 + the cat is in fact dead?
1285 + Waiting less than 400 milliseconds makes no sense because this would
1286 + mean that a relaxed cat would be considered to cycle between death
1287 + and life more than 100 times per minute.
1288 + Moreover, just as with human beings, a cat's heart might stop for
1289 + some period of time, so the exact wait period is a judgment call.
1290 + One of our pair of veterinarians might wait 30 seconds before pronouncing
1291 + the cat dead, while the other might insist on waiting a full minute.
1292 + The two veterinarians would then disagree on the state of the cat during
1293 + the final 30 seconds of the minute following the last heartbeat, as
1294 + fancifully illustrated below:
1295 +
1296 + <p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p>
1297 +
1298 + <p>
1299 + Interestingly enough, this same situation applies to hardware.
1300 + When push comes to shove, how do we tell whether or not some
1301 + external server has failed?
1302 + We send messages to it periodically, and declare it failed if we
1303 + don't receive a response within a given period of time.
1304 + Policy decisions can usually tolerate short
1305 + periods of inconsistency.
1306 + The policy was decided some time ago, and is only now being put into
1307 + effect, so a few milliseconds of delay is normally inconsequential.
1308 +
1309 + <p>
1310 + However, there are algorithms that absolutely must see consistent data.
1311 + For example, the translation from a user-level SystemV semaphore
1312 + ID to the corresponding in-kernel data structure is protected by RCU,
1313 + but it is absolutely forbidden to update a semaphore that has just been
1314 + removed.
1315 + In the Linux kernel, this need for consistency is accommodated by acquiring
1316 + spinlocks located in the in-kernel data structure from within
1317 + the RCU read-side critical section, and this is indicated by the
1318 + green box in the figure above.
1319 + Many other techniques may be used, and are in fact used within the
1320 + Linux kernel.
1321 +
1322 + <p>
1323 + In short, RCU is not required to maintain consistency, and other
1324 + mechanisms may be used in concert with RCU when consistency is required.
1325 + RCU's specialization allows it to do its job extremely well, and its
1326 + ability to interoperate with other synchronization mechanisms allows
1327 + the right mix of synchronization tools to be used for a given job.
1328 +
1329 + <h3><a name="Performance and Scalability">Performance and Scalability</a></h3>
1330 +
1331 + <p>
1332 + Energy efficiency is a critical component of performance today,
1333 + and Linux-kernel RCU implementations must therefore avoid unnecessarily
1334 + awakening idle CPUs.
1335 + I cannot claim that this requirement was premeditated.
1336 + In fact, I learned of it during a telephone conversation in which I
1337 + was given &ldquo;frank and open&rdquo; feedback on the importance
1338 + of energy efficiency in battery-powered systems and on specific
1339 + energy-efficiency shortcomings of the Linux-kernel RCU implementation.
1340 + In my experience, the battery-powered embedded community will consider
1341 + any unnecessary wakeups to be extremely unfriendly acts.
1342 + So much so that mere Linux-kernel-mailing-list posts are
1343 + insufficient to vent their ire.
1344 +
1345 + <p>
1346 + Memory consumption is not particularly important in most
1347 + situations, and has become decreasingly
1348 + so as memory sizes have expanded and memory
1349 + costs have plummeted.
1350 + However, as I learned from Matt Mackall's
1351 + <a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a>
1352 + efforts, memory footprint is critically important on single-CPU systems with
1353 + non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus
1354 + <a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a>
1355 + was born.
1356 + Josh Triplett has since taken over the small-memory banner with his
1357 + <a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a>
1358 + project, which resulted in
1359 + <a href="#Sleepable RCU">SRCU</a>
1360 + becoming optional for those kernels not needing it.
1361 +
1362 + <p>
1363 + The remaining performance requirements are, for the most part,
1364 + unsurprising.
1365 + For example, in keeping with RCU's read-side specialization, 1366 + <tt>rcu_dereference()</tt> should have negligible overhead (for 1367 + example, suppression of a few minor compiler optimizations). 1368 + Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and 1369 + <tt>rcu_read_unlock()</tt> should have exactly zero overhead. 1370 + 1371 + <p> 1372 + In preemptible environments, in the case where the RCU read-side 1373 + critical section was not preempted (as will be the case for the 1374 + highest-priority real-time process), <tt>rcu_read_lock()</tt> and 1375 + <tt>rcu_read_unlock()</tt> should have minimal overhead. 1376 + In particular, they should not contain atomic read-modify-write 1377 + operations, memory-barrier instructions, preemption disabling, 1378 + interrupt disabling, or backwards branches. 1379 + However, in the case where the RCU read-side critical section was preempted, 1380 + <tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts. 1381 + This is why it is better to nest an RCU read-side critical section 1382 + within a preempt-disable region than vice versa, at least in cases 1383 + where that critical section is short enough to avoid unduly degrading 1384 + real-time latencies. 1385 + 1386 + <p> 1387 + The <tt>synchronize_rcu()</tt> grace-period-wait primitive is 1388 + optimized for throughput. 1389 + It may therefore incur several milliseconds of latency in addition to 1390 + the duration of the longest RCU read-side critical section. 1391 + On the other hand, multiple concurrent invocations of 1392 + <tt>synchronize_rcu()</tt> are required to use batching optimizations 1393 + so that they can be satisfied by a single underlying grace-period-wait 1394 + operation. 
1395 + For example, in the Linux kernel, it is not unusual for a single 1396 + grace-period-wait operation to serve more than 1397 + <a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a> 1398 + of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation 1399 + overhead down to nearly zero. 1400 + However, the grace-period optimization is also required to avoid 1401 + measurable degradation of real-time scheduling and interrupt latencies. 1402 + 1403 + <p> 1404 + In some cases, the multi-millisecond <tt>synchronize_rcu()</tt> 1405 + latencies are unacceptable. 1406 + In these cases, <tt>synchronize_rcu_expedited()</tt> may be used 1407 + instead, reducing the grace-period latency down to a few tens of 1408 + microseconds on small systems, at least in cases where the RCU read-side 1409 + critical sections are short. 1410 + There are currently no special latency requirements for 1411 + <tt>synchronize_rcu_expedited()</tt> on large systems, but, 1412 + consistent with the empirical nature of the RCU specification, 1413 + that is subject to change. 1414 + However, there most definitely are scalability requirements: 1415 + A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096 1416 + CPUs should at least make reasonable forward progress. 1417 + In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> 1418 + is permitted to impose modest degradation of real-time latency 1419 + on non-idle online CPUs. 1420 + That said, it will likely be necessary to take further steps to reduce this 1421 + degradation, hopefully to roughly that of a scheduling-clock interrupt. 1422 + 1423 + <p> 1424 + There are a number of situations where even 1425 + <tt>synchronize_rcu_expedited()</tt>'s reduced grace-period 1426 + latency is unacceptable. 
1427 + In these situations, the asynchronous <tt>call_rcu()</tt> can be
1428 + used in place of <tt>synchronize_rcu()</tt> as follows:
1429 +
1430 + <blockquote>
1431 + <pre>
1432 + 1 struct foo {
1433 + 2 int a;
1434 + 3 int b;
1435 + 4 struct rcu_head rh;
1436 + 5 };
1437 + 6
1438 + 7 static void remove_gp_cb(struct rcu_head *rhp)
1439 + 8 {
1440 + 9 struct foo *p = container_of(rhp, struct foo, rh);
1441 + 10
1442 + 11 kfree(p);
1443 + 12 }
1444 + 13
1445 + 14 bool remove_gp_asynchronous(void)
1446 + 15 {
1447 + 16 struct foo *p;
1448 + 17
1449 + 18 spin_lock(&amp;gp_lock);
1450 + 19 p = rcu_access_pointer(gp);
1451 + 20 if (!p) {
1452 + 21 spin_unlock(&amp;gp_lock);
1453 + 22 return false;
1454 + 23 }
1455 + 24 rcu_assign_pointer(gp, NULL);
1456 + 25 call_rcu(&amp;p-&gt;rh, remove_gp_cb);
1457 + 26 spin_unlock(&amp;gp_lock);
1458 + 27 return true;
1459 + 28 }
1460 + </pre>
1461 + </blockquote>
1462 +
1463 + <p>
1464 + A definition of <tt>struct foo</tt> is finally needed, and appears
1465 + on lines&nbsp;1-5.
1466 + The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt>
1467 + on line&nbsp;25, and will be invoked after the end of a subsequent
1468 + grace period.
1469 + This gets the same effect as <tt>remove_gp_synchronous()</tt>,
1470 + but without forcing the updater to wait for a grace period to elapse.
1471 + The <tt>call_rcu()</tt> function may be used in a number of
1472 + situations where neither <tt>synchronize_rcu()</tt> nor
1473 + <tt>synchronize_rcu_expedited()</tt> would be legal,
1474 + including within preempt-disable code, <tt>local_bh_disable()</tt> code,
1475 + interrupt-disable code, and interrupt handlers.
1476 + However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
1477 + The callback function (<tt>remove_gp_cb()</tt> in this case) will be
1478 + executed within softirq (software interrupt) environment within the
1479 + Linux kernel,
1480 + either within a real softirq handler or under the protection
1481 + of <tt>local_bh_disable()</tt>.
1482 + In both the Linux kernel and in userspace, it is bad practice to
1483 + write an RCU callback function that takes too long.
1484 + Long-running operations should be relegated to separate threads or
1485 + (in the Linux kernel) workqueues.
1486 +
1487 + <p>@@QQ@@
1488 + Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
1489 + After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
1490 + structure, which would interact badly with concurrent insertions.
1491 + Doesn't this mean that <tt>rcu_dereference()</tt> is required?
1492 + <p>@@QQA@@
1493 + Presumably the <tt>-&gt;gp_lock</tt> acquired on line&nbsp;18 excludes
1494 + any changes, including any insertions that <tt>rcu_dereference()</tt>
1495 + would protect against.
1496 + Therefore, any insertions will be delayed until after <tt>-&gt;gp_lock</tt>
1497 + is released on line&nbsp;26, which in turn means that
1498 + <tt>rcu_access_pointer()</tt> suffices.
1499 + <p>@@QQE@@
1500 +
1501 + <p>
1502 + However, all that <tt>remove_gp_cb()</tt> is doing is
1503 + invoking <tt>kfree()</tt> on the data element.
1504 + This is a common idiom, and is supported by <tt>kfree_rcu()</tt>, 1505 + which allows &ldquo;fire and forget&rdquo; operation as shown below: 1506 + 1507 + <blockquote> 1508 + <pre> 1509 + 1 struct foo { 1510 + 2 int a; 1511 + 3 int b; 1512 + 4 struct rcu_head rh; 1513 + 5 }; 1514 + 6 1515 + 7 bool remove_gp_faf(void) 1516 + 8 { 1517 + 9 struct foo *p; 1518 + 10 1519 + 11 spin_lock(&amp;gp_lock); 1520 + 12 p = rcu_dereference(gp); 1521 + 13 if (!p) { 1522 + 14 spin_unlock(&amp;gp_lock); 1523 + 15 return false; 1524 + 16 } 1525 + 17 rcu_assign_pointer(gp, NULL); 1526 + 18 kfree_rcu(p, rh); 1527 + 19 spin_unlock(&amp;gp_lock); 1528 + 20 return true; 1529 + 21 } 1530 + </pre> 1531 + </blockquote> 1532 + 1533 + <p> 1534 + Note that <tt>remove_gp_faf()</tt> simply invokes 1535 + <tt>kfree_rcu()</tt> and proceeds, without any need to pay any 1536 + further attention to the subsequent grace period and <tt>kfree()</tt>. 1537 + It is permissible to invoke <tt>kfree_rcu()</tt> from the same 1538 + environments as for <tt>call_rcu()</tt>. 1539 + Interestingly enough, DYNIX/ptx had the equivalents of 1540 + <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not 1541 + <tt>synchronize_rcu()</tt>. 1542 + This was due to the fact that RCU was not heavily used within DYNIX/ptx, 1543 + so the very few places that needed something like 1544 + <tt>synchronize_rcu()</tt> simply open-coded it. 1545 + 1546 + <p>@@QQ@@ 1547 + Earlier it was claimed that <tt>call_rcu()</tt> and 1548 + <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked 1549 + by readers. 1550 + But how can that be correct, given that the invocation of the callback 1551 + and the freeing of the memory (respectively) must still wait for 1552 + a grace period to elapse? 
1553 + <p>@@QQA@@ 1554 + We could define things this way, but keep in mind that this sort of 1555 + definition would say that updates in garbage-collected languages 1556 + cannot complete until the next time the garbage collector runs, 1557 + which does not seem at all reasonable. 1558 + The key point is that in most cases, an updater using either 1559 + <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the 1560 + next update as soon as it has invoked <tt>call_rcu()</tt> or 1561 + <tt>kfree_rcu()</tt>, without having to wait for a subsequent 1562 + grace period. 1563 + <p>@@QQE@@ 1564 + 1565 + <p> 1566 + But what if the updater must wait for the completion of code to be 1567 + executed after the end of the grace period, but has other tasks 1568 + that can be carried out in the meantime? 1569 + The polling-style <tt>get_state_synchronize_rcu()</tt> and 1570 + <tt>cond_synchronize_rcu()</tt> functions may be used for this 1571 + purpose, as shown below: 1572 + 1573 + <blockquote> 1574 + <pre> 1575 + 1 bool remove_gp_poll(void) 1576 + 2 { 1577 + 3 struct foo *p; 1578 + 4 unsigned long s; 1579 + 5 1580 + 6 spin_lock(&amp;gp_lock); 1581 + 7 p = rcu_access_pointer(gp); 1582 + 8 if (!p) { 1583 + 9 spin_unlock(&amp;gp_lock); 1584 + 10 return false; 1585 + 11 } 1586 + 12 rcu_assign_pointer(gp, NULL); 1587 + 13 spin_unlock(&amp;gp_lock); 1588 + 14 s = get_state_synchronize_rcu(); 1589 + 15 do_something_while_waiting(); 1590 + 16 cond_synchronize_rcu(s); 1591 + 17 kfree(p); 1592 + 18 return true; 1593 + 19 } 1594 + </pre> 1595 + </blockquote> 1596 + 1597 + <p> 1598 + On line&nbsp;14, <tt>get_state_synchronize_rcu()</tt> obtains a 1599 + &ldquo;cookie&rdquo; from RCU, 1600 + then line&nbsp;15 carries out other tasks, 1601 + and finally, line&nbsp;16 returns immediately if a grace period has 1602 + elapsed in the meantime, but otherwise waits as required. 
1603 + The need for <tt>get_state_synchronize_rcu()</tt> and
1604 + <tt>cond_synchronize_rcu()</tt> has appeared quite recently,
1605 + so it is too early to tell whether they will stand the test of time.
1606 +
1607 + <p>
1608 + RCU thus provides a range of tools to allow updaters to strike the
1609 + required tradeoff between latency, flexibility and CPU overhead.
1610 +
1611 + <h3><a name="Composability">Composability</a></h3>
1612 +
1613 + <p>
1614 + Composability has received much attention in recent years, perhaps in part
1615 + due to the collision of multicore hardware with object-oriented techniques
1616 + designed in single-threaded environments for single-threaded use.
1617 + And in theory, RCU read-side critical sections may be composed, and in
1618 + fact may be nested arbitrarily deeply.
1619 + In practice, as with all real-world implementations of composable
1620 + constructs, there are limitations.
1621 +
1622 + <p>
1623 + Implementations of RCU for which <tt>rcu_read_lock()</tt>
1624 + and <tt>rcu_read_unlock()</tt> generate no code, such as
1625 + Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be
1626 + nested arbitrarily deeply.
1627 + After all, there is no overhead.
1628 + Except that if all these instances of <tt>rcu_read_lock()</tt>
1629 + and <tt>rcu_read_unlock()</tt> are visible to the compiler,
1630 + compilation will eventually fail due to exhausting memory,
1631 + mass storage, or user patience, whichever comes first.
1632 + If the nesting is not visible to the compiler, as is the case with
1633 + mutually recursive functions each in its own translation unit,
1634 + stack overflow will result.
1635 + If the nesting takes the form of loops, either the control variable
1636 + will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
1637 + Nevertheless, this class of RCU implementations is one
1638 + of the most composable constructs in existence.
1639 + 1640 + <p> 1641 + RCU implementations that explicitly track nesting depth 1642 + are limited by the nesting-depth counter. 1643 + For example, the Linux kernel's preemptible RCU limits nesting to 1644 + <tt>INT_MAX</tt>. 1645 + This should suffice for almost all practical purposes. 1646 + That said, a consecutive pair of RCU read-side critical sections 1647 + between which there is an operation that waits for a grace period 1648 + cannot be enclosed in another RCU read-side critical section. 1649 + This is because it is not legal to wait for a grace period within 1650 + an RCU read-side critical section: To do so would result either 1651 + in deadlock or 1652 + in RCU implicitly splitting the enclosing RCU read-side critical 1653 + section, neither of which is conducive to a long-lived and prosperous 1654 + kernel. 1655 + 1656 + <p> 1657 + In short, although RCU read-side critical sections are highly composable, 1658 + care is required in some situations, just as is the case for any other 1659 + composable synchronization mechanism. 1660 + 1661 + <h3><a name="Corner Cases">Corner Cases</a></h3> 1662 + 1663 + <p> 1664 + A given RCU workload might have an endless and intense stream of 1665 + RCU read-side critical sections, perhaps even so intense that there 1666 + was never a point in time during which there was not at least one 1667 + RCU read-side critical section in flight. 1668 + RCU cannot allow this situation to block grace periods: As long as 1669 + all the RCU read-side critical sections are finite, grace periods 1670 + must also be finite. 1671 + 1672 + <p> 1673 + That said, preemptible RCU implementations could potentially result 1674 + in RCU read-side critical sections being preempted for long durations, 1675 + which has the effect of creating a long-duration RCU read-side 1676 + critical section. 1677 + This situation can arise only in heavily loaded systems, but systems using 1678 + real-time priorities are of course more vulnerable. 
1679 + Therefore, RCU priority boosting is provided to help deal with this 1680 + case. 1681 + That said, the exact requirements on RCU priority boosting will likely 1682 + evolve as more experience accumulates. 1683 + 1684 + <p> 1685 + Other workloads might have very high update rates. 1686 + Although one can argue that such workloads should instead use 1687 + something other than RCU, the fact remains that RCU must 1688 + handle such workloads gracefully. 1689 + This requirement is another factor driving batching of grace periods, 1690 + but it is also the driving force behind the checks for large numbers 1691 + of queued RCU callbacks in the <tt>call_rcu()</tt> code path. 1692 + Finally, high update rates should not delay RCU read-side critical 1693 + sections, although some read-side delays can occur when using 1694 + <tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use 1695 + of <tt>try_stop_cpus()</tt>. 1696 + (In the future, <tt>synchronize_rcu_expedited()</tt> will be 1697 + converted to use lighter-weight inter-processor interrupts (IPIs), 1698 + but this will still disturb readers, though to a much smaller degree.) 1699 + 1700 + <p> 1701 + Although all three of these corner cases were understood in the early 1702 + 1990s, a simple user-level test consisting of <tt>close(open(path))</tt> 1703 + in a tight loop 1704 + in the early 2000s suddenly provided a much deeper appreciation of the 1705 + high-update-rate corner case. 1706 + This test also motivated addition of some RCU code to react to high update 1707 + rates, for example, if a given CPU finds itself with more than 10,000 1708 + RCU callbacks queued, it will cause RCU to take evasive action by 1709 + more aggressively starting grace periods and more aggressively forcing 1710 + completion of grace-period processing. 
1711 + This evasive action causes the grace period to complete more quickly,
1712 + but at the cost of restricting RCU's batching optimizations, thus
1713 + increasing the CPU overhead incurred by that grace period.
1714 +
1715 + <h2><a name="Software-Engineering Requirements">
1716 + Software-Engineering Requirements</a></h2>
1717 +
1718 + <p>
1719 + Between Murphy's Law and &ldquo;To err is human&rdquo;, it is necessary to
1720 + guard against mishaps and misuse:
1721 +
1722 + <ol>
1723 + <li> It is all too easy to forget to use <tt>rcu_read_lock()</tt>
1724 + everywhere that it is needed, so kernels built with
1725 + <tt>CONFIG_PROVE_RCU=y</tt> will splat if
1726 + <tt>rcu_dereference()</tt> is used outside of an
1727 + RCU read-side critical section.
1728 + Update-side code can use <tt>rcu_dereference_protected()</tt>,
1729 + which takes a
1730 + <a href="https://lwn.net/Articles/371986/">lockdep expression</a>
1731 + to indicate what is providing the protection.
1732 + If the indicated protection is not provided, a lockdep splat
1733 + is emitted.
1734 +
1735 + <p>
1736 + Code shared between readers and updaters can use
1737 + <tt>rcu_dereference_check()</tt>, which also takes a
1738 + lockdep expression, and emits a lockdep splat if neither
1739 + <tt>rcu_read_lock()</tt> nor the indicated protection
1740 + is in place.
1741 + In addition, <tt>rcu_dereference_raw()</tt> is used in those
1742 + (hopefully rare) cases where the required protection cannot
1743 + be easily described.
1744 + Finally, <tt>rcu_read_lock_held()</tt> is provided to
1745 + allow a function to verify that it has been invoked within
1746 + an RCU read-side critical section.
1747 + I was made aware of this set of requirements shortly after Thomas
1748 + Gleixner audited a number of RCU uses.
1749 + <li> A given function might wish to check for RCU-related preconditions
1750 + upon entry, before using any other RCU API.
1751 + The <tt>rcu_lockdep_assert()</tt> does this job, 1752 + asserting the expression in kernels having lockdep enabled 1753 + and doing nothing otherwise. 1754 + <li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt> 1755 + and <tt>rcu_dereference()</tt>, perhaps (incorrectly) 1756 + substituting a simple assignment. 1757 + To catch this sort of error, a given RCU-protected pointer may be 1758 + tagged with <tt>__rcu</tt>, after which running sparse 1759 + with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain 1760 + about simple-assignment accesses to that pointer. 1761 + Arnd Bergmann made me aware of this requirement, and also 1762 + supplied the needed 1763 + <a href="https://lwn.net/Articles/376011/">patch series</a>. 1764 + <li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt> 1765 + will splat if a data element is passed to <tt>call_rcu()</tt> 1766 + twice in a row, without a grace period in between. 1767 + (This error is similar to a double free.) 1768 + The corresponding <tt>rcu_head</tt> structures that are 1769 + dynamically allocated are automatically tracked, but 1770 + <tt>rcu_head</tt> structures allocated on the stack 1771 + must be initialized with <tt>init_rcu_head_on_stack()</tt> 1772 + and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>. 1773 + Similarly, statically allocated non-stack <tt>rcu_head</tt> 1774 + structures must be initialized with <tt>init_rcu_head()</tt> 1775 + and cleaned up with <tt>destroy_rcu_head()</tt>. 1776 + Mathieu Desnoyers made me aware of this requirement, and also 1777 + supplied the needed 1778 + <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>. 1779 + <li> An infinite loop in an RCU read-side critical section will 1780 + eventually trigger an RCU CPU stall warning splat. 1781 + However, RCU is not obligated to produce this splat 1782 + unless there is a grace period waiting on that particular 1783 + RCU read-side critical section. 
1784 + This requirement made itself known in the early 1990s, pretty
1785 + much the first time that it was necessary to debug a CPU stall.
1786 + <li> Although it would be very good to detect pointers leaking out
1787 + of RCU read-side critical sections, there is currently no
1788 + good way of doing this.
1789 + One complication is the need to distinguish between pointers
1790 + leaking and pointers that have been handed off from RCU to
1791 + some other synchronization mechanism, for example, reference
1792 + counting.
1793 + <li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
1794 + information is provided via both debugfs and event tracing.
1795 + <li> Open-coded use of <tt>rcu_assign_pointer()</tt> and
1796 + <tt>rcu_dereference()</tt> to create typical linked
1797 + data structures can be surprisingly error-prone.
1798 + Therefore, RCU-protected
1799 + <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a>
1800 + and, more recently, RCU-protected
1801 + <a href="https://lwn.net/Articles/612100/">hash tables</a>
1802 + are available.
1803 + Many other special-purpose RCU-protected data structures are
1804 + available in the Linux kernel and the userspace RCU library.
1805 + <li> Some linked structures are created at compile time, but still
1806 + require <tt>__rcu</tt> checking.
1807 + The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this
1808 + purpose.
1809 + <li> It is not necessary to use <tt>rcu_assign_pointer()</tt>
1810 + when creating linked structures that are to be published via
1811 + a single external pointer.
1812 + The <tt>RCU_INIT_POINTER()</tt> macro is provided for
1813 + this task and also for assigning <tt>NULL</tt> pointers
1814 + at runtime.
1815 + </ol>
1816 +
1817 + <p>
1818 + This is not a hard-and-fast list: RCU's diagnostic capabilities will
1819 + continue to be guided by the number and type of usage bugs found
1820 + in real-world RCU usage.
1821 + 1822 + <h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2> 1823 + 1824 + <p> 1825 + The Linux kernel provides an interesting environment for all kinds of 1826 + software, including RCU. 1827 + Some of the relevant points of interest are as follows: 1828 + 1829 + <ol> 1830 + <li> <a href="#Configuration">Configuration</a>. 1831 + <li> <a href="#Firmware Interface">Firmware Interface</a>. 1832 + <li> <a href="#Early Boot">Early Boot</a>. 1833 + <li> <a href="#Interrupts and NMIs"> 1834 + Interrupts and non-maskable interrupts (NMIs)</a>. 1835 + <li> <a href="#Loadable Modules">Loadable Modules</a>. 1836 + <li> <a href="#Hotplug CPU">Hotplug CPU</a>. 1837 + <li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. 1838 + <li> <a href="#Tracing and RCU">Tracing and RCU</a>. 1839 + <li> <a href="#Energy Efficiency">Energy Efficiency</a>. 1840 + <li> <a href="#Performance, Scalability, Response Time, and Reliability"> 1841 + Performance, Scalability, Response Time, and Reliability</a>. 1842 + </ol> 1843 + 1844 + <p> 1845 + This list is probably incomplete, but it does give a feel for the 1846 + most notable Linux-kernel complications. 1847 + Each of the following sections covers one of the above topics. 1848 + 1849 + <h3><a name="Configuration">Configuration</a></h3> 1850 + 1851 + <p> 1852 + RCU's goal is automatic configuration, so that almost nobody 1853 + needs to worry about RCU's <tt>Kconfig</tt> options. 1854 + And for almost all users, RCU does in fact work well 1855 + &ldquo;out of the box.&rdquo; 1856 + 1857 + <p> 1858 + However, there are specialized use cases that are handled by 1859 + kernel boot parameters and <tt>Kconfig</tt> options. 1860 + Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users 1861 + about new <tt>Kconfig</tt> options, which requires almost all of them 1862 + be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option. 
1863 +
1864 + <p>
1865 + This all should be quite obvious, but the fact remains that
1866 + Linus Torvalds recently had to
1867 + <a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a>
1868 + me of this requirement.
1869 +
1870 + <h3><a name="Firmware Interface">Firmware Interface</a></h3>
1871 +
1872 + <p>
1873 + In many cases, the kernel obtains information about the system from the
1874 + firmware, and sometimes things are lost in translation.
1875 + Or the translation is accurate, but the original message is bogus.
1876 +
1877 + <p>
1878 + For example, some systems' firmware overreports the number of CPUs,
1879 + sometimes by a large factor.
1880 + If RCU naively believed the firmware, as it used to do,
1881 + it would create too many per-CPU kthreads.
1882 + Although the resulting system will still run correctly, the extra
1883 + kthreads needlessly consume memory and can cause confusion
1884 + when they show up in <tt>ps</tt> listings.
1885 +
1886 + <p>
1887 + RCU must therefore wait for a given CPU to actually come online before
1888 + it can allow itself to believe that the CPU actually exists.
1889 + The resulting &ldquo;ghost CPUs&rdquo; (which are never going to
1890 + come online) cause a number of
1891 + <a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>.
1892 +
1893 + <h3><a name="Early Boot">Early Boot</a></h3>
1894 +
1895 + <p>
1896 + The Linux kernel's boot sequence is an interesting process,
1897 + and RCU is used early, even before <tt>rcu_init()</tt>
1898 + is invoked.
1899 + In fact, a number of RCU's primitives can be used as soon as the
1900 + initial task's <tt>task_struct</tt> is available and the
1901 + boot CPU's per-CPU variables are set up.
1902 + The read-side primitives (<tt>rcu_read_lock()</tt>, 1903 + <tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>, 1904 + and <tt>rcu_access_pointer()</tt>) will operate normally very early on, 1905 + as will <tt>rcu_assign_pointer()</tt>. 1906 + 1907 + <p> 1908 + Although <tt>call_rcu()</tt> may be invoked at any 1909 + time during boot, callbacks are not guaranteed to be invoked until after 1910 + the scheduler is fully up and running. 1911 + This delay in callback invocation is due to the fact that RCU does not 1912 + invoke callbacks until it is fully initialized, and this full initialization 1913 + cannot occur until after the scheduler has initialized itself to the 1914 + point where RCU can spawn and run its kthreads. 1915 + In theory, it would be possible to invoke callbacks earlier, 1916 + however, this is not a panacea because there would be severe restrictions 1917 + on what operations those callbacks could invoke. 1918 + 1919 + <p> 1920 + Perhaps surprisingly, <tt>synchronize_rcu()</tt>, 1921 + <a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> 1922 + (<a href="#Bottom-Half Flavor">discussed below</a>), 1923 + and 1924 + <a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> 1925 + will all operate normally 1926 + during very early boot, the reason being that there is only one CPU 1927 + and preemption is disabled. 1928 + This means that the call <tt>synchronize_rcu()</tt> (or friends) 1929 + itself is a quiescent 1930 + state and thus a grace period, so the early-boot implementation can 1931 + be a no-op. 1932 + 1933 + <p> 1934 + Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> 1935 + continue to operate normally through the remainder of boot, courtesy 1936 + of the fact that preemption is disabled across their RCU read-side 1937 + critical sections and also courtesy of the fact that there is still 1938 + only one CPU. 1939 + However, once the scheduler starts initializing, preemption is enabled. 
1940 + There is still only a single CPU, but the fact that preemption is enabled 1941 + means that the no-op implementation of <tt>synchronize_rcu()</tt> no 1942 + longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. 1943 + Therefore, as soon as the scheduler starts initializing, the early-boot 1944 + fastpath is disabled. 1945 + This means that <tt>synchronize_rcu()</tt> switches to its runtime 1946 + mode of operation where it posts callbacks, which in turn means that 1947 + any call to <tt>synchronize_rcu()</tt> will block until the corresponding 1948 + callback is invoked. 1949 + Unfortunately, the callback cannot be invoked until RCU's runtime 1950 + grace-period machinery is up and running, which cannot happen until 1951 + the scheduler has initialized itself sufficiently to allow RCU's 1952 + kthreads to be spawned. 1953 + Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler 1954 + initialization can result in deadlock. 1955 + 1956 + <p>@@QQ@@ 1957 + So what happens with <tt>synchronize_rcu()</tt> during 1958 + scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> 1959 + kernels? 1960 + <p>@@QQA@@ 1961 + In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt> 1962 + maps directly to <tt>synchronize_sched()</tt>. 1963 + Therefore, <tt>synchronize_rcu()</tt> works normally throughout 1964 + boot in <tt>CONFIG_PREEMPT=n</tt> kernels. 1965 + However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, 1966 + so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> 1967 + during scheduler initialization. 1968 + <p>@@QQE@@ 1969 + 1970 + <p> 1971 + I learned of these boot-time requirements as a result of a series of 1972 + system hangs. 
1973 + 1974 + <h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3> 1975 + 1976 + <p> 1977 + The Linux kernel has interrupts, and RCU read-side critical sections are 1978 + legal within interrupt handlers and within interrupt-disabled regions 1979 + of code, as are invocations of <tt>call_rcu()</tt>. 1980 + 1981 + <p> 1982 + Some Linux-kernel architectures can enter an interrupt handler from 1983 + non-idle process context, and then just never leave it, instead stealthily 1984 + transitioning back to process context. 1985 + This trick is sometimes used to invoke system calls from inside the kernel. 1986 + These &ldquo;half-interrupts&rdquo; mean that RCU has to be very careful 1987 + about how it counts interrupt nesting levels. 1988 + I learned of this requirement the hard way during a rewrite 1989 + of RCU's dyntick-idle code. 1990 + 1991 + <p> 1992 + The Linux kernel has non-maskable interrupts (NMIs), and 1993 + RCU read-side critical sections are legal within NMI handlers. 1994 + Thankfully, RCU update-side primitives, including 1995 + <tt>call_rcu()</tt>, are prohibited within NMI handlers. 1996 + 1997 + <p> 1998 + The name notwithstanding, some Linux-kernel architectures 1999 + can have nested NMIs, which RCU must handle correctly. 2000 + Andy Lutomirski 2001 + <a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a> 2002 + with this requirement; 2003 + he also kindly surprised me with 2004 + <a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a> 2005 + that meets this requirement. 2006 + 2007 + <h3><a name="Loadable Modules">Loadable Modules</a></h3> 2008 + 2009 + <p> 2010 + The Linux kernel has loadable modules, and these modules can 2011 + also be unloaded. 2012 + After a given module has been unloaded, any attempt to call 2013 + one of its functions results in a segmentation fault. 
2014 + The module-unload functions must therefore cancel any 2015 + delayed calls to loadable-module functions, for example, 2016 + any outstanding <tt>mod_timer()</tt> must be dealt with 2017 + via <tt>del_timer_sync()</tt> or similar. 2018 + 2019 + <p> 2020 + Unfortunately, there is no way to cancel an RCU callback; 2021 + once you invoke <tt>call_rcu()</tt>, the callback function is 2022 + going to eventually be invoked, unless the system goes down first. 2023 + Because it is normally considered socially irresponsible to crash the system 2024 + in response to a module unload request, we need some other way 2025 + to deal with in-flight RCU callbacks. 2026 + 2027 + <p> 2028 + RCU therefore provides 2029 + <tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>, 2030 + which waits until all in-flight RCU callbacks have been invoked. 2031 + If a module uses <tt>call_rcu()</tt>, its exit function should therefore 2032 + prevent any future invocation of <tt>call_rcu()</tt>, then invoke 2033 + <tt>rcu_barrier()</tt>. 2034 + In theory, the underlying module-unload code could invoke 2035 + <tt>rcu_barrier()</tt> unconditionally, but in practice this would 2036 + incur unacceptable latencies. 2037 + 2038 + <p> 2039 + Nikita Danilov noted this requirement for an analogous filesystem-unmount 2040 + situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. 2041 + The need for <tt>rcu_barrier()</tt> for module unloading became 2042 + apparent later. 2043 + 2044 + <h3><a name="Hotplug CPU">Hotplug CPU</a></h3> 2045 + 2046 + <p> 2047 + The Linux kernel supports CPU hotplug, which means that CPUs 2048 + can come and go. 2049 + It is of course illegal to use any RCU API member from an offline CPU. 
2050 + This requirement was present from day one in DYNIX/ptx, but 2051 + on the other hand, the Linux kernel's CPU-hotplug implementation 2052 + is &ldquo;interesting.&rdquo; 2053 + 2054 + <p> 2055 + The Linux-kernel CPU-hotplug implementation has notifiers that 2056 + are used to allow the various kernel subsystems (including RCU) 2057 + to respond appropriately to a given CPU-hotplug operation. 2058 + Most RCU operations may be invoked from CPU-hotplug notifiers, 2059 + including even normal synchronous grace-period operations 2060 + such as <tt>synchronize_rcu()</tt>. 2061 + However, expedited grace-period operations such as 2062 + <tt>synchronize_rcu_expedited()</tt> are not supported, 2063 + due to the fact that current implementations block CPU-hotplug 2064 + operations, which could result in deadlock. 2065 + 2066 + <p> 2067 + In addition, all-callback-wait operations such as 2068 + <tt>rcu_barrier()</tt> are also not supported, due to the 2069 + fact that there are phases of CPU-hotplug operations where 2070 + the outgoing CPU's callbacks will not be invoked until after 2071 + the CPU-hotplug operation ends, which could also result in deadlock. 2072 + 2073 + <h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> 2074 + 2075 + <p> 2076 + RCU depends on the scheduler, and the scheduler uses RCU to 2077 + protect some of its data structures. 2078 + This means the scheduler is forbidden from acquiring 2079 + the runqueue locks and the priority-inheritance locks 2080 + in the middle of an outermost RCU read-side critical section unless 2081 + it also releases them before exiting that same 2082 + RCU read-side critical section. 2083 + This same prohibition also applies to any lock that is acquired 2084 + while holding any lock to which this prohibition applies. 2085 + Violating this rule results in deadlock. 
2086 + 2087 + <p> 2088 + For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt> 2089 + implementation must be written carefully to avoid similar deadlocks. 2090 + In particular, <tt>rcu_read_unlock()</tt> must tolerate an 2091 + interrupt where the interrupt handler invokes both 2092 + <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. 2093 + This possibility requires <tt>rcu_read_unlock()</tt> to use 2094 + negative nesting levels to avoid destructive recursion via 2095 + interrupt handler's use of RCU. 2096 + 2097 + <p> 2098 + This pair of mutual scheduler-RCU requirements came as a 2099 + <a href="https://lwn.net/Articles/453002/">complete surprise</a>. 2100 + 2101 + <p> 2102 + As noted above, RCU makes use of kthreads, and it is necessary to 2103 + avoid excessive CPU-time accumulation by these kthreads. 2104 + This requirement was no surprise, but RCU's violation of it 2105 + when running context-switch-heavy workloads when built with 2106 + <tt>CONFIG_NO_HZ_FULL=y</tt> 2107 + <a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>. 2108 + RCU has made good progress towards meeting this requirement, even 2109 + for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads, 2110 + but there is room for further improvement. 2111 + 2112 + <h3><a name="Tracing and RCU">Tracing and RCU</a></h3> 2113 + 2114 + <p> 2115 + It is possible to use tracing on RCU code, but tracing itself 2116 + uses RCU. 2117 + For this reason, <tt>rcu_dereference_raw_notrace()</tt> 2118 + is provided for use by tracing, which avoids the destructive 2119 + recursion that could otherwise ensue. 2120 + This API is also used by virtualization in some architectures, 2121 + where RCU readers execute in environments in which tracing 2122 + cannot be used. 2123 + The tracing folks both located the requirement and provided the 2124 + needed fix, so this surprise requirement was relatively painless. 
2125 + 2126 + <h3><a name="Energy Efficiency">Energy Efficiency</a></h3> 2127 + 2128 + <p> 2129 + Interrupting idle CPUs is considered socially unacceptable, 2130 + especially by people with battery-powered embedded systems. 2131 + RCU therefore conserves energy by detecting which CPUs are 2132 + idle, including tracking CPUs that have been interrupted from idle. 2133 + This is a large part of the energy-efficiency requirement, 2134 + so I learned of this via an irate phone call. 2135 + 2136 + <p> 2137 + Because RCU avoids interrupting idle CPUs, it is illegal to 2138 + execute an RCU read-side critical section on an idle CPU. 2139 + (Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat 2140 + if you try it.) 2141 + The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt> 2142 + event tracing is provided to work around this restriction. 2143 + In addition, <tt>rcu_is_watching()</tt> may be used to 2144 + test whether or not it is currently legal to run RCU read-side 2145 + critical sections on this CPU. 2146 + I learned of the need for diagnostics on the one hand 2147 + and <tt>RCU_NONIDLE()</tt> on the other while inspecting 2148 + idle-loop code. 2149 + Steven Rostedt supplied <tt>_rcuidle</tt> event tracing, 2150 + which is used quite heavily in the idle loop. 2151 + 2152 + <p> 2153 + It is similarly socially unacceptable to interrupt an 2154 + <tt>nohz_full</tt> CPU running in userspace. 2155 + RCU must therefore track <tt>nohz_full</tt> userspace 2156 + execution. 2157 + And in 2158 + <a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> 2159 + kernels, RCU must separately track idle CPUs on the one hand and 2160 + CPUs that are either idle or executing in userspace on the other. 2161 + In both cases, RCU must be able to sample state at two points in 2162 + time, and be able to determine whether or not some other CPU spent 2163 + any time idle and/or executing in userspace. 
2164 + 2165 + <p> 2166 + These energy-efficiency requirements have proven quite difficult to 2167 + understand and to meet, for example, there have been more than five 2168 + clean-sheet rewrites of RCU's energy-efficiency code, the last of 2169 + which was finally able to demonstrate 2170 + <a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>. 2171 + As noted earlier, 2172 + I learned of many of these requirements via angry phone calls: 2173 + Flaming me on the Linux-kernel mailing list was apparently not 2174 + sufficient to fully vent their ire at RCU's energy-efficiency bugs! 2175 + 2176 + <h3><a name="Performance, Scalability, Response Time, and Reliability"> 2177 + Performance, Scalability, Response Time, and Reliability</a></h3> 2178 + 2179 + <p> 2180 + Expanding on the 2181 + <a href="#Performance and Scalability">earlier discussion</a>, 2182 + RCU is used heavily by hot code paths in performance-critical 2183 + portions of the Linux kernel's networking, security, virtualization, 2184 + and scheduling code paths. 2185 + RCU must therefore use efficient implementations, especially in its 2186 + read-side primitives. 2187 + To that end, it would be good if preemptible RCU's implementation 2188 + of <tt>rcu_read_lock()</tt> could be inlined, however, doing 2189 + this requires resolving <tt>#include</tt> issues with the 2190 + <tt>task_struct</tt> structure. 2191 + 2192 + <p> 2193 + The Linux kernel supports hardware configurations with up to 2194 + 4096 CPUs, which means that RCU must be extremely scalable. 2195 + Algorithms that involve frequent acquisitions of global locks or 2196 + frequent atomic operations on global variables simply cannot be 2197 + tolerated within the RCU implementation. 2198 + RCU therefore makes heavy use of a combining tree based on the 2199 + <tt>rcu_node</tt> structure. 
2200 + RCU is required to tolerate all CPUs continuously invoking any 2201 + combination of RCU's runtime primitives with minimal per-operation 2202 + overhead. 2203 + In fact, in many cases, increasing load must <i>decrease</i> the 2204 + per-operation overhead, witness the batching optimizations for 2205 + <tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>, 2206 + <tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>. 2207 + As a general rule, RCU must cheerfully accept whatever the 2208 + rest of the Linux kernel decides to throw at it. 2209 + 2210 + <p> 2211 + The Linux kernel is used for real-time workloads, especially 2212 + in conjunction with the 2213 + <a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>. 2214 + The real-time-latency response requirements are such that the 2215 + traditional approach of disabling preemption across RCU 2216 + read-side critical sections is inappropriate. 2217 + Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore 2218 + use an RCU implementation that allows RCU read-side critical 2219 + sections to be preempted. 2220 + This requirement made its presence known after users made it 2221 + clear that an earlier 2222 + <a href="https://lwn.net/Articles/107930/">real-time patch</a> 2223 + did not meet their needs, in conjunction with some 2224 + <a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a> 2225 + encountered by a very early version of the -rt patchset. 2226 + 2227 + <p> 2228 + In addition, RCU must make do with a sub-100-microsecond real-time latency 2229 + budget. 2230 + In fact, on smaller systems with the -rt patchset, the Linux kernel 2231 + provides sub-20-microsecond real-time latencies for the whole kernel, 2232 + including RCU. 2233 + RCU's scalability and latency must therefore be sufficient for 2234 + these sorts of configurations. 
2235 + To my surprise, the sub-100-microsecond real-time latency budget 2236 + <a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf"> 2237 + applies to even the largest systems [PDF]</a>, 2238 + up to and including systems with 4096 CPUs. 2239 + This real-time requirement motivated the grace-period kthread, which 2240 + also simplified handling of a number of race conditions. 2241 + 2242 + <p> 2243 + Finally, RCU's status as a synchronization primitive means that 2244 + any RCU failure can result in arbitrary memory corruption that can be 2245 + extremely difficult to debug. 2246 + This means that RCU must be extremely reliable, which in 2247 + practice also means that RCU must have an aggressive stress-test 2248 + suite. 2249 + This stress-test suite is called <tt>rcutorture</tt>. 2250 + 2251 + <p> 2252 + Although the need for <tt>rcutorture</tt> was no surprise, 2253 + the current immense popularity of the Linux kernel is posing 2254 + interesting&mdash;and perhaps unprecedented&mdash;validation 2255 + challenges. 2256 + To see this, keep in mind that there are well over one billion 2257 + instances of the Linux kernel running today, given Android 2258 + smartphones, Linux-powered televisions, and servers. 2259 + This number can be expected to increase sharply with the advent of 2260 + the celebrated Internet of Things. 2261 + 2262 + <p> 2263 + Suppose that RCU contains a race condition that manifests on average 2264 + once per million years of runtime. 2265 + This bug will be occurring about three times per <i>day</i> across 2266 + the installed base. 2267 + RCU could simply hide behind hardware error rates, given that no one 2268 + should really expect their smartphone to last for a million years. 
2269 + However, anyone taking too much comfort from this thought should 2270 + consider the fact that in most jurisdictions, a successful multi-year 2271 + test of a given mechanism, which might include a Linux kernel, 2272 + suffices for a number of types of safety-critical certifications. 2273 + In fact, rumor has it that the Linux kernel is already being used 2274 + in production for safety-critical applications. 2275 + I don't know about you, but I would feel quite bad if a bug in RCU 2276 + killed someone. 2277 + Which might explain my recent focus on validation and verification. 2278 + 2279 + <h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2> 2280 + 2281 + <p> 2282 + One of the more surprising things about RCU is that there are now 2283 + no fewer than five <i>flavors</i>, or API families. 2284 + In addition, the primary flavor that has been the sole focus up to 2285 + this point has two different implementations, non-preemptible and 2286 + preemptible. 2287 + The other four flavors are listed below, with requirements for each 2288 + described in a separate section. 2289 + 2290 + <ol> 2291 + <li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a> 2292 + <li> <a href="#Sched Flavor">Sched Flavor</a> 2293 + <li> <a href="#Sleepable RCU">Sleepable RCU</a> 2294 + <li> <a href="#Tasks RCU">Tasks RCU</a> 2295 + </ol> 2296 + 2297 + <h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> 2298 + 2299 + <p> 2300 + The softirq-disable (AKA &ldquo;bottom-half&rdquo;, 2301 + hence the &ldquo;_bh&rdquo; abbreviations) 2302 + flavor of RCU, or <i>RCU-bh</i>, was developed by 2303 + Dipankar Sarma to provide a flavor of RCU that could withstand the 2304 + network-based denial-of-service attacks researched by Robert 2305 + Olsson. 
2306 + These attacks placed so much networking load on the system 2307 + that some of the CPUs never exited softirq execution, 2308 + which in turn prevented those CPUs from ever executing a context switch, 2309 + which, in the RCU implementation of that time, prevented grace periods 2310 + from ever ending. 2311 + The result was an out-of-memory condition and a system hang. 2312 + 2313 + <p> 2314 + The solution was the creation of RCU-bh, which does 2315 + <tt>local_bh_disable()</tt> 2316 + across its read-side critical sections, and which uses the transition 2317 + from one type of softirq processing to another as a quiescent state 2318 + in addition to context switch, idle, user mode, and offline. 2319 + This means that RCU-bh grace periods can complete even when some of 2320 + the CPUs execute in softirq indefinitely, thus allowing algorithms 2321 + based on RCU-bh to withstand network-based denial-of-service attacks. 2322 + 2323 + <p> 2324 + Because 2325 + <tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt> 2326 + disable and re-enable softirq handlers, any attempt to start a softirq 2327 + handler during the 2328 + RCU-bh read-side critical section will be deferred. 2329 + In this case, <tt>rcu_read_unlock_bh()</tt> 2330 + will invoke softirq processing, which can take considerable time. 2331 + One can of course argue that this softirq overhead should be associated 2332 + with the code following the RCU-bh read-side critical section rather 2333 + than <tt>rcu_read_unlock_bh()</tt>, but the fact 2334 + is that most profiling tools cannot be expected to make this sort 2335 + of fine distinction. 2336 + For example, suppose that a three-millisecond-long RCU-bh read-side 2337 + critical section executes during a time of heavy networking load. 
2338 + There will very likely be an attempt to invoke at least one softirq 2339 + handler during that three milliseconds, but any such invocation will 2340 + be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>. 2341 + This can of course make it appear at first glance as if 2342 + <tt>rcu_read_unlock_bh()</tt> was executing very slowly. 2343 + 2344 + <p> 2345 + The 2346 + <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a> 2347 + includes 2348 + <tt>rcu_read_lock_bh()</tt>, 2349 + <tt>rcu_read_unlock_bh()</tt>, 2350 + <tt>rcu_dereference_bh()</tt>, 2351 + <tt>rcu_dereference_bh_check()</tt>, 2352 + <tt>synchronize_rcu_bh()</tt>, 2353 + <tt>synchronize_rcu_bh_expedited()</tt>, 2354 + <tt>call_rcu_bh()</tt>, 2355 + <tt>rcu_barrier_bh()</tt>, and 2356 + <tt>rcu_read_lock_bh_held()</tt>. 2357 + 2358 + <h3><a name="Sched Flavor">Sched Flavor</a></h3> 2359 + 2360 + <p> 2361 + Before preemptible RCU, waiting for an RCU grace period had the 2362 + side effect of also waiting for all pre-existing interrupt 2363 + and NMI handlers. 2364 + However, there are legitimate preemptible-RCU implementations that 2365 + do not have this property, given that any point in the code outside 2366 + of an RCU read-side critical section can be a quiescent state. 2367 + Therefore, <i>RCU-sched</i> was created, which follows &ldquo;classic&rdquo; 2368 + RCU in that an RCU-sched grace period waits for pre-existing 2369 + interrupt and NMI handlers. 2370 + In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched 2371 + APIs have identical implementations, while kernels built with 2372 + <tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each. 2373 + 2374 + <p> 2375 + Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels, 2376 + <tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt> 2377 + disable and re-enable preemption, respectively. 
2378 + This means that if there was a preemption attempt during the 2379 + RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt> 2380 + will enter the scheduler, with all the latency and overhead entailed. 2381 + Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look 2382 + as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly. 2383 + However, the highest-priority task won't be preempted, so that task 2384 + will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations. 2385 + 2386 + <p> 2387 + The 2388 + <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a> 2389 + includes 2390 + <tt>rcu_read_lock_sched()</tt>, 2391 + <tt>rcu_read_unlock_sched()</tt>, 2392 + <tt>rcu_read_lock_sched_notrace()</tt>, 2393 + <tt>rcu_read_unlock_sched_notrace()</tt>, 2394 + <tt>rcu_dereference_sched()</tt>, 2395 + <tt>rcu_dereference_sched_check()</tt>, 2396 + <tt>synchronize_sched()</tt>, 2397 + <tt>synchronize_rcu_sched_expedited()</tt>, 2398 + <tt>call_rcu_sched()</tt>, 2399 + <tt>rcu_barrier_sched()</tt>, and 2400 + <tt>rcu_read_lock_sched_held()</tt>. 2401 + However, anything that disables preemption also marks an RCU-sched 2402 + read-side critical section, including 2403 + <tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>, 2404 + <tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>, 2405 + and so on. 2406 + 2407 + <h3><a name="Sleepable RCU">Sleepable RCU</a></h3> 2408 + 2409 + <p> 2410 + For well over a decade, someone saying &ldquo;I need to block within 2411 + an RCU read-side critical section&rdquo; was a reliable indication 2412 + that this someone did not understand RCU. 2413 + After all, if you are always blocking in an RCU read-side critical 2414 + section, you can probably afford to use a higher-overhead synchronization 2415 + mechanism. 
2416 + However, that changed with the advent of the Linux kernel's notifiers, 2417 + whose RCU read-side critical 2418 + sections almost never sleep, but sometimes need to. 2419 + This resulted in the introduction of 2420 + <a href="https://lwn.net/Articles/202847/">sleepable RCU</a>, 2421 + or <i>SRCU</i>. 2422 + 2423 + <p> 2424 + SRCU allows different domains to be defined, with each such domain 2425 + defined by an instance of an <tt>srcu_struct</tt> structure. 2426 + A pointer to this structure must be passed in to each SRCU function, 2427 + for example, <tt>synchronize_srcu(&amp;ss)</tt>, where 2428 + <tt>ss</tt> is the <tt>srcu_struct</tt> structure. 2429 + The key benefit of these domains is that a slow SRCU reader in one 2430 + domain does not delay an SRCU grace period in some other domain. 2431 + That said, one consequence of these domains is that read-side code 2432 + must pass a &ldquo;cookie&rdquo; from <tt>srcu_read_lock()</tt> 2433 + to <tt>srcu_read_unlock()</tt>, for example, as follows: 2434 + 2435 + <blockquote> 2436 + <pre> 2437 + 1 int idx; 2438 + 2 2439 + 3 idx = srcu_read_lock(&amp;ss); 2440 + 4 do_something(); 2441 + 5 srcu_read_unlock(&amp;ss, idx); 2442 + </pre> 2443 + </blockquote> 2444 + 2445 + <p> 2446 + As noted above, it is legal to block within SRCU read-side critical sections, 2447 + however, with great power comes great responsibility. 2448 + If you block forever in one of a given domain's SRCU read-side critical 2449 + sections, then that domain's grace periods will also be blocked forever. 2450 + Of course, one good way to block forever is to deadlock, which can 2451 + happen if any operation in a given domain's SRCU read-side critical 2452 + section can block waiting, either directly or indirectly, for that domain's 2453 + grace period to elapse. 
2454 + For example, this results in a self-deadlock: 2455 + 2456 + <blockquote> 2457 + <pre> 2458 + 1 int idx; 2459 + 2 2460 + 3 idx = srcu_read_lock(&amp;ss); 2461 + 4 do_something(); 2462 + 5 synchronize_srcu(&amp;ss); 2463 + 6 srcu_read_unlock(&amp;ss, idx); 2464 + </pre> 2465 + </blockquote> 2466 + 2467 + <p> 2468 + However, if line&nbsp;5 acquired a mutex that was held across 2469 + a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>, 2470 + deadlock would still be possible. 2471 + Furthermore, if line&nbsp;5 acquired a mutex that was held across 2472 + a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>, 2473 + and if an <tt>ss1</tt>-domain SRCU read-side critical section 2474 + acquired another mutex that was held across an <tt>ss</tt>-domain 2475 + <tt>synchronize_srcu()</tt>, 2476 + deadlock would again be possible. 2477 + Such a deadlock cycle could extend across an arbitrarily large number 2478 + of different SRCU domains. 2479 + Again, with great power comes great responsibility. 2480 + 2481 + <p> 2482 + Unlike the other RCU flavors, SRCU read-side critical sections can 2483 + run on idle and even offline CPUs. 2484 + This ability requires that <tt>srcu_read_lock()</tt> and 2485 + <tt>srcu_read_unlock()</tt> contain memory barriers, which means 2486 + that SRCU readers will run a bit slower than would RCU readers. 2487 + It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt> 2488 + API, which, in combination with <tt>srcu_read_unlock()</tt>, 2489 + guarantees a full memory barrier. 
2490 + 2491 + <p> 2492 + The 2493 + <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> 2494 + includes 2495 + <tt>srcu_read_lock()</tt>, 2496 + <tt>srcu_read_unlock()</tt>, 2497 + <tt>srcu_dereference()</tt>, 2498 + <tt>srcu_dereference_check()</tt>, 2499 + <tt>synchronize_srcu()</tt>, 2500 + <tt>synchronize_srcu_expedited()</tt>, 2501 + <tt>call_srcu()</tt>, 2502 + <tt>srcu_barrier()</tt>, and 2503 + <tt>srcu_read_lock_held()</tt>. 2504 + It also includes 2505 + <tt>DEFINE_SRCU()</tt>, 2506 + <tt>DEFINE_STATIC_SRCU()</tt>, and 2507 + <tt>init_srcu_struct()</tt> 2508 + APIs for defining and initializing <tt>srcu_struct</tt> structures. 2509 + 2510 + <h3><a name="Tasks RCU">Tasks RCU</a></h3> 2511 + 2512 + <p> 2513 + Some forms of tracing use &ldquo;trampolines&rdquo; to handle the 2514 + binary rewriting required to install different types of probes. 2515 + It would be good to be able to free old trampolines, which sounds 2516 + like a job for some form of RCU. 2517 + However, because it is necessary to be able to install a trace 2518 + anywhere in the code, it is not possible to use read-side markers 2519 + such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. 2520 + In addition, it does not work to have these markers in the trampoline 2521 + itself, because there would need to be instructions following 2522 + <tt>rcu_read_unlock()</tt>. 2523 + Although <tt>synchronize_rcu()</tt> would guarantee that execution 2524 + reached the <tt>rcu_read_unlock()</tt>, it would not be able to 2525 + guarantee that execution had completely left the trampoline. 2526 + 2527 + <p> 2528 + The solution, in the form of 2529 + <a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>, 2530 + is to have implicit 2531 + read-side critical sections that are delimited by voluntary context 2532 + switches, that is, calls to <tt>schedule()</tt>, 2533 + <tt>cond_resched_rcu_qs()</tt>, and 2534 + <tt>synchronize_rcu_tasks()</tt>. 
2535 + In addition, transitions to and from userspace execution also delimit 2536 + tasks-RCU read-side critical sections. 2537 + 2538 + <p> 2539 + The tasks-RCU API is quite compact, consisting only of 2540 + <tt>call_rcu_tasks()</tt>, 2541 + <tt>synchronize_rcu_tasks()</tt>, and 2542 + <tt>rcu_barrier_tasks()</tt>. 2543 + 2544 + <h2><a name="Possible Future Changes">Possible Future Changes</a></h2> 2545 + 2546 + <p> 2547 + One of the tricks that RCU uses to attain update-side scalability is 2548 + to increase grace-period latency with increasing numbers of CPUs. 2549 + If this becomes a serious problem, it will be necessary to rework the 2550 + grace-period state machine so as to avoid the need for the additional 2551 + latency. 2552 + 2553 + <p> 2554 + Expedited grace periods scan the CPUs, so their latency and overhead 2555 + increases with increasing numbers of CPUs. 2556 + If this becomes a serious problem on large systems, it will be necessary 2557 + to do some redesign to avoid this scalability problem. 2558 + 2559 + <p> 2560 + RCU disables CPU hotplug in a few places, perhaps most notably in the 2561 + expedited grace-period and <tt>rcu_barrier()</tt> operations. 2562 + If there is a strong reason to use expedited grace periods in CPU-hotplug 2563 + notifiers, it will be necessary to avoid disabling CPU hotplug. 2564 + This would introduce some complexity, so there had better be a <i>very</i> 2565 + good reason. 2566 + 2567 + <p> 2568 + The tradeoff between grace-period latency on the one hand and interruptions 2569 + of other CPUs on the other hand may need to be re-examined. 2570 + The desire is of course for zero grace-period latency as well as zero 2571 + interprocessor interrupts undertaken during an expedited grace period 2572 + operation. 2573 + While this ideal is unlikely to be achievable, it is quite possible that 2574 + further improvements can be made. 
2575 + 2576 + <p> 2577 + The multiprocessor implementations of RCU use a combining tree that 2578 + groups CPUs so as to reduce lock contention and increase cache locality. 2579 + However, this combining tree does not spread its memory across NUMA 2580 + nodes nor does it align the CPU groups with hardware features such 2581 + as sockets or cores. 2582 + Such spreading and alignment is currently believed to be unnecessary 2583 + because the hotpath read-side primitives do not access the combining 2584 + tree, nor does <tt>call_rcu()</tt> in the common case. 2585 + If you believe that your architecture needs such spreading and alignment, 2586 + then your architecture should also benefit from the 2587 + <tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set 2588 + to the number of CPUs in a socket, NUMA node, or whatever. 2589 + If the number of CPUs is too large, use a fraction of the number of 2590 + CPUs. 2591 + If the number of CPUs is a large prime number, well, that certainly 2592 + is an &ldquo;interesting&rdquo; architectural choice! 2593 + More flexible arrangements might be considered, but only if 2594 + <tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only 2595 + if the inadequacy has been demonstrated by a carefully run and 2596 + realistic system-level workload. 2597 + 2598 + <p> 2599 + Please note that arrangements that require RCU to remap CPU numbers will 2600 + require extremely good demonstration of need and full exploration of 2601 + alternatives. 2602 + 2603 + <p> 2604 + There is an embarrassingly large number of flavors of RCU, and this 2605 + number has been increasing over time. 2606 + Perhaps it will be possible to combine some at some future date. 2607 + 2608 + <p> 2609 + RCU's various kthreads are reasonably recent additions. 2610 + It is quite likely that adjustments will be required to more gracefully 2611 + handle extreme loads. 
2612 + It might also be necessary to be able to relate CPU utilization by 2613 + RCU's kthreads and softirq handlers to the code that instigated this 2614 + CPU utilization. 2615 + For example, RCU callback overhead might be charged back to the 2616 + originating <tt>call_rcu()</tt> instance, though probably not 2617 + in production kernels. 2618 + 2619 + <h2><a name="Summary">Summary</a></h2> 2620 + 2621 + <p> 2622 + This document has presented more than two decades' worth of RCU 2623 + requirements. 2624 + Given that the requirements keep changing, this will not be the last 2625 + word on this subject, but at least it serves to get an important 2626 + subset of the requirements set forth. 2627 + 2628 + <h2><a name="Acknowledgments">Acknowledgments</a></h2> 2629 + 2630 + I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, 2631 + Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and 2632 + Andy Lutomirski for their help in rendering 2633 + this article human readable, and to Michelle Rankin for her support 2634 + of this effort. 2635 + Other contributions are acknowledged in the Linux kernel's git archive. 2636 + The cartoon is copyright (c) 2013 by Melissa Broussard, 2637 + and is provided 2638 + under the terms of the Creative Commons Attribution-Share Alike 3.0 2639 + United States license. 2640 + 2641 + <p>@@QQAL@@ 2642 + 2643 + </body></html>
+108
Documentation/RCU/Design/htmlqqz.sh
··· 1 + #!/bin/sh 2 + # 3 + # Usage: sh htmlqqz.sh file 4 + # 5 + # Extracts and converts quick quizzes in a proto-HTML document file.htmlx. 6 + # Commands, all of which must be on a line by themselves: 7 + # 8 + # "<p>@@QQ@@": Start of a quick quiz. 9 + # "<p>@@QQA@@": Start of a quick-quiz answer. 10 + # "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz. 11 + # "<p>@@QQAL@@": Place to put quick-quiz answer list. 12 + # 13 + # Places the result in file.html. 14 + # 15 + # This program is free software; you can redistribute it and/or modify 16 + # it under the terms of the GNU General Public License as published by 17 + # the Free Software Foundation; either version 2 of the License, or 18 + # (at your option) any later version. 19 + # 20 + # This program is distributed in the hope that it will be useful, 21 + # but WITHOUT ANY WARRANTY; without even the implied warranty of 22 + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 23 + # GNU General Public License for more details. 24 + # 25 + # You should have received a copy of the GNU General Public License 26 + # along with this program; if not, you can access it online at 27 + # http://www.gnu.org/licenses/gpl-2.0.html. 28 + # 29 + # Copyright (c) 2013 Paul E. McKenney, IBM Corporation. 30 + 31 + fn=$1 32 + if test ! -r $fn.htmlx 33 + then 34 + echo "Error: $fn.htmlx unreadable." 35 + exit 1 36 + fi 37 + 38 + echo "<!-- DO NOT HAND EDIT. -->" > $fn.html 39 + echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> $fn.html 40 + awk < $fn.htmlx >> $fn.html ' 41 + 42 + state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" { 43 + print $0; 44 + if ($0 ~ /^<p>@@QQ/) 45 + print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." 
> "/dev/stderr" 46 + next; 47 + } 48 + 49 + state == "" && $1 == "<p>@@QQ@@" { 50 + qqn++; 51 + qqlineno = NR; 52 + haveqq = 1; 53 + state = "qq"; 54 + print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>" 55 + next; 56 + } 57 + 58 + state == "qq" && $1 != "<p>@@QQA@@" { 59 + qq[qqn] = qq[qqn] $0 "\n"; 60 + print $0 61 + if ($0 ~ /^<p>@@QQ/) 62 + print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr" 63 + next; 64 + } 65 + 66 + state == "qq" && $1 == "<p>@@QQA@@" { 67 + state = "qqa"; 68 + print "<br><a href=\"#qq" qqn "answer\">Answer</a>" 69 + next; 70 + } 71 + 72 + state == "qqa" && $1 != "<p>@@QQE@@" { 73 + qqa[qqn] = qqa[qqn] $0 "\n"; 74 + if ($0 ~ /^<p>@@QQ/) 75 + print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr" 76 + next; 77 + } 78 + 79 + state == "qqa" && $1 == "<p>@@QQE@@" { 80 + state = ""; 81 + next; 82 + } 83 + 84 + state == "" && $1 == "<p>@@QQAL@@" { 85 + haveqq = ""; 86 + print "<h3><a name=\"Answers to Quick Quizzes\">" 87 + print "Answers to Quick Quizzes</a></h3>" 88 + print ""; 89 + for (i = 1; i <= qqn; i++) { 90 + print "<a name=\"qq" i "answer\"></a>" 91 + print "<p><b>Quick Quiz " i "</b>:" 92 + print qq[i]; 93 + print ""; 94 + print "</p><p><b>Answer</b>:" 95 + print qqa[i]; 96 + print ""; 97 + print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>" 98 + print ""; 99 + } 100 + next; 101 + } 102 + 103 + END { 104 + if (state != "") 105 + print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr" 106 + else if (haveqq) 107 + print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr" 108 + }'