
Merge branch 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block

* 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block: (113 commits)
cfq-iosched: Do not access cfqq after freeing it
block: include linux/err.h to use ERR_PTR
cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit
blkio: Allow CFQ group IO scheduling even when CFQ is a module
blkio: Implement dynamic io controlling policy registration
blkio: Export some symbols from blkio as its user CFQ can be a module
block: Fix io_context leak after failure of clone with CLONE_IO
block: Fix io_context leak after clone with CLONE_IO
cfq-iosched: make nonrot check logic consistent
io controller: quick fix for blk-cgroup and modular CFQ
cfq-iosched: move IO controller declarations to a header file
cfq-iosched: fix compile problem with !CONFIG_CGROUP
blkio: Documentation
blkio: Wait on sync-noidle queue even if rq_noidle = 1
blkio: Implement group_isolation tunable
blkio: Determine async workload length based on total number of queues
blkio: Wait for cfq queue to get backlogged if group is empty
blkio: Propagate cgroup weight updates to cfq groups
blkio: Drop the reference to queue once the task changes cgroup
blkio: Provide some isolation between groups
...

+24813 -2132
+588
Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
[New file: SVG sequence diagram, title "DRBD-8.3 data flow". Raw SVG markup omitted; the diagram shows three packet exchanges between two peers, each bracketed by rs_begin_io()/rs_complete_io() on both sides:
 - "Checksum based Resync, case not in sync": w_make_resync_request(), drbd_endio_read_sec(), w_e_send_csum() send CsumRSRequest; the peer runs receive_DataRequest(), drbd_endio_read_sec(), w_e_end_csum_rs_req() and answers with RSDataReply; the requester runs receive_RSDataReply(), drbd_endio_write_sec(), e_end_resync_block() and sends WriteAck, handled by got_BlockAck().
 - "Checksum based Resync, case in sync": the same CsumRSRequest path, but the peer answers with RSIsInSync, handled by got_IsInSync().
 - "Online verify": w_make_ov_request() sends OVRequest; the peer runs receive_OVRequest(), drbd_endio_read_sec(), w_e_end_ov_req() and answers with OVReply; the requester runs receive_OVReply(), drbd_endio_read_sec(), w_e_end_ov_reply() and sends OVResult, handled by got_OVResult().]
+459
Documentation/blockdev/drbd/DRBD-data-packets.svg
[New file: SVG sequence diagram, title "DRBD 8 data flow". Raw SVG markup omitted; the diagram shows three packet exchanges between two peers:
 - "Regular mirrored write, 512-32K": drbd_make_request(), w_send_dblock(), drbd_endio_write_pri() send Data (bracketed by al_begin_io()/al_complete_io()); the peer runs receive_Data(), drbd_endio_write_sec(), e_end_block() and answers with WriteAck, handled by got_BlockAck().
 - "Diskless read, 512-32K": drbd_make_request(), w_send_read_req() send DataRequest; the peer runs receive_DataRequest(), drbd_endio_read_sec(), w_e_end_data_req() and answers with DataReply, handled by receive_DataReply().
 - "Resync blocks, 4-32K": w_make_resync_request() sends RSDataRequest (bracketed by rs_begin_io()/rs_complete_io() on both sides); the peer runs receive_DataRequest(), drbd_endio_read_sec(), w_e_end_rsdata_req() and answers with RSDataReply; the requester runs receive_RSDataReply(), drbd_endio_write_sec(), e_end_resync_block() and sends WriteAck, handled by got_BlockAck().]
+16
Documentation/blockdev/drbd/README.txt
Description

  DRBD is a shared-nothing, synchronously replicated block device. It
  is designed to serve as a building block for high availability
  clusters and, in this context, is a "drop-in" replacement for shared
  storage. Simplistically, you could see it as a network RAID 1.

  Please visit http://www.drbd.org to find out more.

The files included here are intended to help understand the implementation:

DRBD-8.3-data-packets.svg, DRBD-data-packets.svg
  relate some of DRBD's functions to the packets sent on the wire.

conn-states-8.dot, disk-states-8.dot, node-states-8.dot
  The subgraphs of DRBD's state transitions.
+18
Documentation/blockdev/drbd/conn-states-8.dot
digraph conn_states {
	StandAlone     -> WFConnection   [ label = "ioctl_set_net()" ]
	WFConnection   -> Unconnected    [ label = "unable to bind()" ]
	WFConnection   -> WFReportParams [ label = "in connect() after accept" ]
	WFReportParams -> StandAlone     [ label = "checks in receive_param()" ]
	WFReportParams -> Connected      [ label = "in receive_param()" ]
	WFReportParams -> WFBitMapS      [ label = "sync_handshake()" ]
	WFReportParams -> WFBitMapT      [ label = "sync_handshake()" ]
	WFBitMapS      -> SyncSource     [ label = "receive_bitmap()" ]
	WFBitMapT      -> SyncTarget     [ label = "receive_bitmap()" ]
	SyncSource     -> Connected
	SyncTarget     -> Connected
	SyncSource     -> PausedSyncS
	SyncTarget     -> PausedSyncT
	PausedSyncS    -> SyncSource
	PausedSyncT    -> SyncTarget
	Connected      -> WFConnection   [ label = "* on network error" ]
}
+16
Documentation/blockdev/drbd/disk-states-8.dot
digraph disk_states {
	Diskless     -> Inconsistent [ label = "ioctl_set_disk()" ]
	Diskless     -> Consistent   [ label = "ioctl_set_disk()" ]
	Diskless     -> Outdated     [ label = "ioctl_set_disk()" ]
	Consistent   -> Outdated     [ label = "receive_param()" ]
	Consistent   -> UpToDate     [ label = "receive_param()" ]
	Consistent   -> Inconsistent [ label = "start resync" ]
	Outdated     -> Inconsistent [ label = "start resync" ]
	UpToDate     -> Inconsistent [ label = "ioctl_replicate" ]
	Inconsistent -> UpToDate     [ label = "resync completed" ]
	Consistent   -> Failed       [ label = "io completion error" ]
	Outdated     -> Failed       [ label = "io completion error" ]
	UpToDate     -> Failed       [ label = "io completion error" ]
	Inconsistent -> Failed       [ label = "io completion error" ]
	Failed       -> Diskless     [ label = "sending notify to peer" ]
}
+85
Documentation/blockdev/drbd/drbd-connection-state-overview.dot
// vim: set sw=2 sts=2 :
digraph {
  rankdir=BT
  bgcolor=white

  node [shape=plaintext]
  node [fontcolor=black]

  StandAlone [ style=filled,fillcolor=gray,label=StandAlone ]

  node [fontcolor=lightgray]

  Unconnected [ label=Unconnected ]

  CommTrouble [ shape=record,
    label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ]

  node [fontcolor=gray]

  subgraph cluster_try_connect {
    label="try to connect, handshake"
    rank=max
    WFConnection [ label=WFConnection ]
    WFReportParams [ label=WFReportParams ]
  }

  TearDown [ label=TearDown ]

  Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ]

  node [fontcolor=lightblue]

  StartingSyncS [ label=StartingSyncS ]
  StartingSyncT [ label=StartingSyncT ]

  subgraph cluster_bitmap_exchange {
    node [fontcolor=red]
    fontcolor=red
    label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged"

    WFBitMapT [ label=WFBitMapT ]
    WFSyncUUID [ label=WFSyncUUID ]
    WFBitMapS [ label=WFBitMapS ]
  }

  node [fontcolor=blue]

  cluster_resync [ shape=record,label="{<any>resynchronisation process running\l'concurrent' application requests allowed|{{<T>PausedSyncT\nSyncTarget}|{<S>PausedSyncS\nSyncSource}}}" ]

  node [shape=box,fontcolor=black]

  // drbdadm [label="drbdadm connect"]
  // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."]
  // comm_error [label="communication trouble"]

  //
  // edges
  // --------------------------------------

  StandAlone -> Unconnected [ label="drbdadm connect" ]
  Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ]
  Unconnected -> WFConnection [ label="receiver thread is started" ]
  WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ]

  WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ]
  WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ]

  WFReportParams -> WFBitMapS
  WFReportParams -> WFBitMapT
  WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false]

  WFBitMapS -> cluster_resync:S
  WFSyncUUID -> cluster_resync:T

  edge [color=green]
  cluster_resync:any -> Connected [ label="resync done",fontcolor=green ]

  edge [color=red]
  WFReportParams -> CommTrouble
  Connected -> CommTrouble
  cluster_resync:any -> CommTrouble
  edge [color=black]
  CommTrouble -> Unconnected [label="receiver thread is stopped" ]

}
+14
Documentation/blockdev/drbd/node-states-8.dot
digraph node_states {
	Secondary -> Primary   [ label = "ioctl_set_state()" ]
	Primary   -> Secondary [ label = "ioctl_set_state()" ]
}

digraph peer_states {
	Secondary -> Primary   [ label = "recv state packet" ]
	Primary   -> Secondary [ label = "recv state packet" ]
	Primary   -> Unknown   [ label = "connection lost" ]
	Secondary -> Unknown   [ label = "connection lost" ]
	Unknown   -> Primary   [ label = "connected" ]
	Unknown   -> Secondary [ label = "connected" ]
}
+135
Documentation/cgroups/blkio-controller.txt
Block IO Controller
===================
Overview
========
The cgroup subsystem "blkio" implements the block IO controller. There is a
need for various kinds of IO control policies (like proportional BW, max BW),
both at leaf nodes and at intermediate nodes of a storage hierarchy. The plan
is to use the same cgroup-based management interface for the blkio controller
and to switch IO policies in the background based on user options.

In the first phase, this patchset implements a proportional-weight,
time-based division of disk time. It is implemented in CFQ, so this policy
takes effect only on leaf nodes, and only when CFQ is in use.

HOWTO
=====
A very simple test is to run two dd threads in two different cgroups.
Here is what you can do:

- Enable group scheduling in CFQ
	CONFIG_CFQ_GROUP_IOSCHED=y

- Compile and boot into the kernel and mount the IO controller (blkio).

	mount -t cgroup -o blkio none /cgroup

- Create two cgroups
	mkdir -p /cgroup/test1/ /cgroup/test2

- Set the weights of groups test1 and test2
	echo 1000 > /cgroup/test1/blkio.weight
	echo 500 > /cgroup/test2/blkio.weight

- Create two files of the same size (say 512MB each) on the same disk and
  launch two dd threads in different cgroups to read those files.

	sync
	echo 3 > /proc/sys/vm/drop_caches

	dd if=/mnt/sdb/zerofile1 of=/dev/null &
	echo $! > /cgroup/test1/tasks
	cat /cgroup/test1/tasks

	dd if=/mnt/sdb/zerofile2 of=/dev/null &
	echo $! > /cgroup/test2/tasks
	cat /cgroup/test2/tasks

- At a macro level, the first dd should finish first. To get more precise
  data, keep looking (with the help of a script, such as the sketch below) at
  the blkio.time and blkio.sectors files of both the test1 and test2 groups.
  These tell how much disk time (in milliseconds) each group got and how many
  sectors each group dispatched to the disk. Fairness is provided in terms of
  disk time, so ideally the blkio.time of the cgroups should be in proportion
  to their weights.

Various user visible config options
===================================
CONFIG_CFQ_GROUP_IOSCHED
	- Enables group scheduling in CFQ. Currently only 1 level of group
	  creation is allowed.

CONFIG_DEBUG_CFQ_IOSCHED
	- Enables some debugging messages in blktrace. Also creates the extra
	  cgroup file blkio.dequeue.

Config options selected automatically
=====================================
These config options are not user visible and are selected/deselected
automatically based on IO scheduler configuration.

CONFIG_BLK_CGROUP
	- Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.

CONFIG_DEBUG_BLK_CGROUP
	- Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.

Details of cgroup files
=======================
- blkio.weight
	- Specifies the per-cgroup weight.

	  The currently allowed range of weights is 100 to 1000.

- blkio.time
	- Disk time allocated to the cgroup, per device, in milliseconds.
	  The first two fields specify the major and minor number of the
	  device, and the third field specifies the disk time allocated to
	  the group in milliseconds.

- blkio.sectors
	- Number of sectors transferred to/from disk by the group. The first
	  two fields specify the major and minor number of the device, and
	  the third field specifies the number of sectors transferred by the
	  group to/from the device.

- blkio.dequeue
	- Debugging aid, only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
	  gives statistics about how many times a group was dequeued from
	  the service tree of the device. The first two fields specify the
	  major and minor number of the device, and the third field
	  specifies the number of times the group was dequeued from that
	  device.

CFQ sysfs tunable
=================
/sys/block/<disk>/queue/iosched/group_isolation

If group_isolation=1, it provides stronger isolation between groups at the
expense of throughput. By default group_isolation is 0. In general this means
that with group_isolation=0 you can expect fairness for sequential workloads
only; set group_isolation=1 to see fairness for random IO workloads as well.

Generally CFQ puts a random, seeky workload in the sync-noidle category: it
disables idling on those queues individually and instead idles collectively
on the group of such queues. Generally these are slow-moving queues, and if
there is a sync-noidle service tree in each group, each group gets exclusive
access to the disk for a certain period. That brings throughput down if a
group does not have enough IO to drive deep queue depths and utilize disk
capacity to the fullest in the slice allocated to it. The flip side is that
even a random reader should get better latencies and overall throughput if
there are lots of sequential readers or sync-idle workloads running in the
system.

If group_isolation=0, CFQ automatically moves all the random, seeky queues
into the root group. That means there is no service differentiation for that
kind of workload, but it leads to better throughput, since we do collective
idling on the root group's sync-noidle tree.

By default one should run with group_isolation=0. If that is not sufficient
and stronger isolation between groups is wanted, set group_isolation=1, at
the cost of reduced throughput.

What works
==========
- Currently only sync IO queues are supported. All buffered writes are
  still system-wide and not per group, so there will be no service
  differentiation between the buffered writes of different groups.
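The HOWTO above mentions watching these files with the help of a script. As a
minimal illustrative sketch — assuming the /cgroup mount point and the
test1/test2 group names from the HOWTO, and not part of the patchset itself —
a small C poller could look like this:

	/* Illustrative sketch only: dump blkio.time for both test groups
	 * once a second. Each line of blkio.time carries the device major
	 * and minor numbers followed by the disk time in milliseconds. */
	#include <stdio.h>
	#include <unistd.h>

	static void dump(const char *path)
	{
		char line[128];
		FILE *f = fopen(path, "r");

		if (!f) {
			perror(path);
			return;
		}
		printf("%s:\n", path);
		while (fgets(line, sizeof(line), f))
			printf("  %s", line);
		fclose(f);
	}

	int main(void)
	{
		for (;;) {
			dump("/cgroup/test1/blkio.time");	/* weight 1000 */
			dump("/cgroup/test2/blkio.time");	/* weight 500 */
			sleep(1);
		}
		return 0;
	}

With the weights above, the accumulated disk time of test1 should settle at
roughly twice that of test2 while both dd readers are running.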
+13
MAINTAINERS
···
 F:	drivers/scsi/dpt*
 F:	drivers/scsi/dpt/
 
+DRBD DRIVER
+P:	Philipp Reisner
+P:	Lars Ellenberg
+M:	drbd-dev@lists.linbit.com
+L:	drbd-user@lists.linbit.com
+W:	http://www.drbd.org
+T:	git git://git.drbd.org/linux-2.6-drbd.git drbd
+T:	git git://git.drbd.org/drbd-8.3.git
+S:	Supported
+F:	drivers/block/drbd/
+F:	lib/lru_cache.c
+F:	Documentation/blockdev/drbd/
+
 DRIVER CORE, KOBJECTS, AND SYSFS
 M:	Greg Kroah-Hartman <gregkh@suse.de>
 T:	quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
+1
arch/alpha/include/asm/cacheflush.h
···
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE	0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
+1
arch/arm/include/asm/cacheflush.h
···
  * about to change to user space. This is the same method as used on SPARC64.
  * See update_mmu_cache for the user space part.
  */
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE	1
 extern void flush_dcache_page(struct page *);
 
 static inline void __flush_icache_all(void)
+1
arch/avr32/include/asm/cacheflush.h
···
  * do something here, but only for certain configurations. No such
  * configurations exist at this time.
  */
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE	0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(page)		do { } while (0)
 #define flush_dcache_mmap_unlock(page)		do { } while (0)
+2
arch/blackfin/include/asm/cacheflush.h
···
 #endif
 #if defined(CONFIG_BFIN_EXTMEM_WRITEBACK) || defined(CONFIG_BFIN_L2_WRITEBACK)
 # define flush_dcache_range(start,end)	blackfin_dcache_flush_range((start), (end))
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE	1
 # define flush_dcache_page(page)	blackfin_dflush_page(page_address(page))
 #else
 # define flush_dcache_range(start,end)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE	0
 # define flush_dcache_page(page)	do { } while (0)
 #endif
 
+1
arch/cris/include/asm/cacheflush.h
···
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE	0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
+1
arch/frv/include/asm/cacheflush.h
··· 47 47 } 48 48 49 49 /* dcache/icache coherency... */ 50 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 50 51 #ifdef CONFIG_MMU 51 52 extern void flush_dcache_page(struct page *page); 52 53 #else
+1
arch/h8300/include/asm/cacheflush.h
··· 15 15 #define flush_cache_dup_mm(mm) do { } while (0) 16 16 #define flush_cache_range(vma,a,b) 17 17 #define flush_cache_page(vma,p,pfn) 18 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 18 19 #define flush_dcache_page(page) 19 20 #define flush_dcache_mmap_lock(mapping) 20 21 #define flush_dcache_mmap_unlock(mapping)
+1
arch/ia64/include/asm/cacheflush.h
··· 25 25 #define flush_cache_vmap(start, end) do { } while (0) 26 26 #define flush_cache_vunmap(start, end) do { } while (0) 27 27 28 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 28 29 #define flush_dcache_page(page) \ 29 30 do { \ 30 31 clear_bit(PG_arch_1, &(page)->flags); \
+3
arch/m32r/include/asm/cacheflush.h
··· 12 12 #define flush_cache_dup_mm(mm) do { } while (0) 13 13 #define flush_cache_range(vma, start, end) do { } while (0) 14 14 #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 15 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 15 16 #define flush_dcache_page(page) do { } while (0) 16 17 #define flush_dcache_mmap_lock(mapping) do { } while (0) 17 18 #define flush_dcache_mmap_unlock(mapping) do { } while (0) ··· 34 33 #define flush_cache_dup_mm(mm) do { } while (0) 35 34 #define flush_cache_range(vma, start, end) do { } while (0) 36 35 #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 36 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 37 37 #define flush_dcache_page(page) do { } while (0) 38 38 #define flush_dcache_mmap_lock(mapping) do { } while (0) 39 39 #define flush_dcache_mmap_unlock(mapping) do { } while (0) ··· 48 46 #define flush_cache_dup_mm(mm) do { } while (0) 49 47 #define flush_cache_range(vma, start, end) do { } while (0) 50 48 #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 49 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 51 50 #define flush_dcache_page(page) do { } while (0) 52 51 #define flush_dcache_mmap_lock(mapping) do { } while (0) 53 52 #define flush_dcache_mmap_unlock(mapping) do { } while (0)
+1
arch/m68k/include/asm/cacheflush_mm.h
··· 128 128 } 129 129 } 130 130 131 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 131 132 #define flush_dcache_page(page) __flush_page_to_ram(page_address(page)) 132 133 #define flush_dcache_mmap_lock(mapping) do { } while (0) 133 134 #define flush_dcache_mmap_unlock(mapping) do { } while (0)
+1
arch/m68k/include/asm/cacheflush_no.h
··· 12 12 #define flush_cache_range(vma, start, end) __flush_cache_all() 13 13 #define flush_cache_page(vma, vmaddr) do { } while (0) 14 14 #define flush_dcache_range(start,len) __flush_cache_all() 15 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 15 16 #define flush_dcache_page(page) do { } while (0) 16 17 #define flush_dcache_mmap_lock(mapping) do { } while (0) 17 18 #define flush_dcache_mmap_unlock(mapping) do { } while (0)
+1
arch/microblaze/include/asm/cacheflush.h
··· 37 37 #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 38 38 39 39 #define flush_dcache_range(start, end) __invalidate_dcache_range(start, end) 40 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 40 41 #define flush_dcache_page(page) do { } while (0) 41 42 #define flush_dcache_mmap_lock(mapping) do { } while (0) 42 43 #define flush_dcache_mmap_unlock(mapping) do { } while (0)
+1
arch/mips/include/asm/cacheflush.h
··· 38 38 extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn); 39 39 extern void __flush_dcache_page(struct page *page); 40 40 41 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 41 42 static inline void flush_dcache_page(struct page *page) 42 43 { 43 44 if (cpu_has_dc_aliases || !cpu_has_ic_fills_f_dc)
+1
arch/mn10300/include/asm/cacheflush.h
··· 26 26 #define flush_cache_page(vma, vmaddr, pfn) do {} while (0) 27 27 #define flush_cache_vmap(start, end) do {} while (0) 28 28 #define flush_cache_vunmap(start, end) do {} while (0) 29 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 29 30 #define flush_dcache_page(page) do {} while (0) 30 31 #define flush_dcache_mmap_lock(mapping) do {} while (0) 31 32 #define flush_dcache_mmap_unlock(mapping) do {} while (0)
+1
arch/parisc/include/asm/cacheflush.h
··· 42 42 #define flush_cache_vmap(start, end) flush_cache_all() 43 43 #define flush_cache_vunmap(start, end) flush_cache_all() 44 44 45 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 45 46 extern void flush_dcache_page(struct page *page); 46 47 47 48 #define flush_dcache_mmap_lock(mapping) \
+1
arch/powerpc/include/asm/cacheflush.h
··· 25 25 #define flush_cache_vmap(start, end) do { } while (0) 26 26 #define flush_cache_vunmap(start, end) do { } while (0) 27 27 28 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 28 29 extern void flush_dcache_page(struct page *page); 29 30 #define flush_dcache_mmap_lock(mapping) do { } while (0) 30 31 #define flush_dcache_mmap_unlock(mapping) do { } while (0)
+1
arch/s390/include/asm/cacheflush.h
··· 10 10 #define flush_cache_dup_mm(mm) do { } while (0) 11 11 #define flush_cache_range(vma, start, end) do { } while (0) 12 12 #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 13 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 13 14 #define flush_dcache_page(page) do { } while (0) 14 15 #define flush_dcache_mmap_lock(mapping) do { } while (0) 15 16 #define flush_dcache_mmap_unlock(mapping) do { } while (0)
+1
arch/score/include/asm/cacheflush.h
··· 16 16 extern void flush_dcache_range(unsigned long start, unsigned long end); 17 17 18 18 #define flush_cache_dup_mm(mm) do {} while (0) 19 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 19 20 #define flush_dcache_page(page) do {} while (0) 20 21 #define flush_dcache_mmap_lock(mapping) do {} while (0) 21 22 #define flush_dcache_mmap_unlock(mapping) do {} while (0)
+1
arch/sh/include/asm/cacheflush.h
··· 42 42 unsigned long addr, unsigned long pfn); 43 43 extern void flush_cache_range(struct vm_area_struct *vma, 44 44 unsigned long start, unsigned long end); 45 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 45 46 extern void flush_dcache_page(struct page *page); 46 47 extern void flush_icache_range(unsigned long start, unsigned long end); 47 48 extern void flush_icache_page(struct vm_area_struct *vma,
+1
arch/sparc/include/asm/cacheflush_32.h
··· 75 75 76 76 extern void sparc_flush_page_to_ram(struct page *page); 77 77 78 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 78 79 #define flush_dcache_page(page) sparc_flush_page_to_ram(page) 79 80 #define flush_dcache_mmap_lock(mapping) do { } while (0) 80 81 #define flush_dcache_mmap_unlock(mapping) do { } while (0)
+1
arch/sparc/include/asm/cacheflush_64.h
··· 37 37 #endif 38 38 39 39 extern void __flush_dcache_range(unsigned long start, unsigned long end); 40 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 40 41 extern void flush_dcache_page(struct page *page); 41 42 42 43 #define flush_icache_page(vma, pg) do { } while(0)
+1
arch/x86/include/asm/cacheflush.h
··· 12 12 unsigned long start, unsigned long end) { } 13 13 static inline void flush_cache_page(struct vm_area_struct *vma, 14 14 unsigned long vmaddr, unsigned long pfn) { } 15 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 15 16 static inline void flush_dcache_page(struct page *page) { } 16 17 static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } 17 18 static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
+1
arch/xtensa/include/asm/cacheflush.h
··· 101 101 #define flush_cache_vmap(start,end) flush_cache_all() 102 102 #define flush_cache_vunmap(start,end) flush_cache_all() 103 103 104 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 104 105 extern void flush_dcache_page(struct page*); 105 106 extern void flush_cache_range(struct vm_area_struct*, ulong, ulong); 106 107 extern void flush_cache_page(struct vm_area_struct*, unsigned long, unsigned long);
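Every hunk in the sweep above answers the same compile-time question: does this architecture's flush_dcache_page() do real work (macro defined to 1) or is it a no-op (defined to 0)? A minimal sketch of how a consumer can exploit that; the helper name and its callers are hypothetical, not code from this merge:

    #include <linux/mm.h>	/* struct page, flush_dcache_page() */

    /*
     * Hypothetical helper: on architectures that define
     * ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE to 0, the preprocessor drops
     * the loop entirely, so callers pay nothing for cache maintenance.
     */
    static inline void flush_dcache_pages(struct page **pages, int nr)
    {
    #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
    	int i;

    	for (i = 0; i < nr; i++)
    		flush_dcache_page(pages[i]);	/* real work on aliasing D-caches */
    #endif
    }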
+22
block/Kconfig
··· 77 77 T10/SCSI Data Integrity Field or the T13/ATA External Path 78 78 Protection. If in doubt, say N. 79 79 80 + config BLK_CGROUP 81 + bool 82 + depends on CGROUPS 83 + default n 84 + ---help--- 85 + Generic block IO controller cgroup interface. This is the common 86 + cgroup interface, which should be used by the various IO-controlling 87 + policies. 88 + 89 + Currently, the CFQ IO scheduler uses it to recognize task groups and 90 + control disk bandwidth allocation (proportional time slice allocation) 91 + to such task groups. 92 + 93 + config DEBUG_BLK_CGROUP 94 + bool 95 + depends on BLK_CGROUP 96 + default n 97 + ---help--- 98 + Enable some debugging help. Currently it stores the cgroup path 99 + in the blk group, which CFQ can use for tracing various 100 + group-related activity. 101 + 80 102 endif # BLOCK 81 103 82 104 config BLOCK_COMPAT
+24 -19
block/Kconfig.iosched
··· 12 12 that do their own scheduling and require only minimal assistance from 13 13 the kernel. 14 14 15 - config IOSCHED_AS 16 - tristate "Anticipatory I/O scheduler" 17 - default y 18 - ---help--- 19 - The anticipatory I/O scheduler is generally a good choice for most 20 - environments, but is quite large and complex when compared to the 21 - deadline I/O scheduler, it can also be slower in some cases 22 - especially some database loads. 23 - 24 15 config IOSCHED_DEADLINE 25 16 tristate "Deadline I/O scheduler" 26 17 default y 27 18 ---help--- 28 - The deadline I/O scheduler is simple and compact, and is often as 29 - good as the anticipatory I/O scheduler, and in some database 30 - workloads, better. In the case of a single process performing I/O to 31 - a disk at any one time, its behaviour is almost identical to the 32 - anticipatory I/O scheduler and so is a good choice. 19 + The deadline I/O scheduler is simple and compact. It will provide 20 + CSCAN service with FIFO expiration of requests, switching to 21 + a new point in the service tree and doing a batch of IO from there 22 + in case of expiry. 33 23 34 24 config IOSCHED_CFQ 35 25 tristate "CFQ I/O scheduler" ··· 27 37 ---help--- 28 38 The CFQ I/O scheduler tries to distribute bandwidth equally 29 39 among all processes in the system. It should provide a fair 30 - working environment, suitable for desktop systems. 40 + and low latency working environment, suitable for both desktop 41 + and server systems. 42 + 31 43 This is the default I/O scheduler. 44 + 45 + config CFQ_GROUP_IOSCHED 46 + bool "CFQ Group Scheduling support" 47 + depends on IOSCHED_CFQ && CGROUPS 48 + select BLK_CGROUP 49 + default n 50 + ---help--- 51 + Enable group IO scheduling in CFQ. 52 + 53 + config DEBUG_CFQ_IOSCHED 54 + bool "Debug CFQ Scheduling" 55 + depends on CFQ_GROUP_IOSCHED 56 + select DEBUG_BLK_CGROUP 57 + default n 58 + ---help--- 59 + Enable IO scheduling debugging in CFQ. Currently it makes 60 + blktrace output more verbose. 32 61 33 62 choice 34 63 prompt "Default I/O scheduler" ··· 55 46 help 56 47 Select the I/O scheduler which will be used by default for all 57 48 block devices. 58 - 59 - config DEFAULT_AS 60 - bool "Anticipatory" if IOSCHED_AS=y 61 49 62 50 config DEFAULT_DEADLINE 63 51 bool "Deadline" if IOSCHED_DEADLINE=y ··· 69 63 70 64 config DEFAULT_IOSCHED 71 65 string 72 - default "anticipatory" if DEFAULT_AS 73 66 default "deadline" if DEFAULT_DEADLINE 74 67 default "cfq" if DEFAULT_CFQ 75 68 default "noop" if DEFAULT_NOOP
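Putting the dependencies above together: CFQ_GROUP_IOSCHED requires IOSCHED_CFQ and CGROUPS and pulls in BLK_CGROUP via select, while DEBUG_CFQ_IOSCHED pulls in DEBUG_BLK_CGROUP. A .config fragment enabling the new group scheduling would therefore look like:

    CONFIG_CGROUPS=y
    CONFIG_IOSCHED_CFQ=y
    CONFIG_CFQ_GROUP_IOSCHED=y
    CONFIG_BLK_CGROUP=y
    # Optional: verbose blktrace output plus the blkio.dequeue statistic.
    CONFIG_DEBUG_CFQ_IOSCHED=y
    CONFIG_DEBUG_BLK_CGROUP=y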
+1 -1
block/Makefile
··· 8 8 blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o 9 9 10 10 obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 11 + obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 11 12 obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o 12 - obj-$(CONFIG_IOSCHED_AS) += as-iosched.o 13 13 obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o 14 14 obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o 15 15
-1520
block/as-iosched.c
··· 1 - /* 2 - * Anticipatory & deadline i/o scheduler. 3 - * 4 - * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk> 5 - * Nick Piggin <nickpiggin@yahoo.com.au> 6 - * 7 - */ 8 - #include <linux/kernel.h> 9 - #include <linux/fs.h> 10 - #include <linux/blkdev.h> 11 - #include <linux/elevator.h> 12 - #include <linux/bio.h> 13 - #include <linux/module.h> 14 - #include <linux/slab.h> 15 - #include <linux/init.h> 16 - #include <linux/compiler.h> 17 - #include <linux/rbtree.h> 18 - #include <linux/interrupt.h> 19 - 20 - /* 21 - * See Documentation/block/as-iosched.txt 22 - */ 23 - 24 - /* 25 - * max time before a read is submitted. 26 - */ 27 - #define default_read_expire (HZ / 8) 28 - 29 - /* 30 - * ditto for writes, these limits are not hard, even 31 - * if the disk is capable of satisfying them. 32 - */ 33 - #define default_write_expire (HZ / 4) 34 - 35 - /* 36 - * read_batch_expire describes how long we will allow a stream of reads to 37 - * persist before looking to see whether it is time to switch over to writes. 38 - */ 39 - #define default_read_batch_expire (HZ / 2) 40 - 41 - /* 42 - * write_batch_expire describes how long we want a stream of writes to run for. 43 - * This is not a hard limit, but a target we set for the auto-tuning thingy. 44 - * See, the problem is: we can send a lot of writes to disk cache / TCQ in 45 - * a short amount of time... 46 - */ 47 - #define default_write_batch_expire (HZ / 8) 48 - 49 - /* 50 - * max time we may wait to anticipate a read (default around 6ms) 51 - */ 52 - #define default_antic_expire ((HZ / 150) ? HZ / 150 : 1) 53 - 54 - /* 55 - * Keep track of up to 20ms thinktimes. We can go as big as we like here, 56 - * however huge values tend to interfere and not decay fast enough. A program 57 - * might be in a non-io phase of operation. Waiting on user input for example, 58 - * or doing a lengthy computation. A small penalty can be justified there, and 59 - * will still catch out those processes that constantly have large thinktimes. 
60 - */ 61 - #define MAX_THINKTIME (HZ/50UL) 62 - 63 - /* Bits in as_io_context.state */ 64 - enum as_io_states { 65 - AS_TASK_RUNNING=0, /* Process has not exited */ 66 - AS_TASK_IOSTARTED, /* Process has started some IO */ 67 - AS_TASK_IORUNNING, /* Process has completed some IO */ 68 - }; 69 - 70 - enum anticipation_status { 71 - ANTIC_OFF=0, /* Not anticipating (normal operation) */ 72 - ANTIC_WAIT_REQ, /* The last read has not yet completed */ 73 - ANTIC_WAIT_NEXT, /* Currently anticipating a request vs 74 - last read (which has completed) */ 75 - ANTIC_FINISHED, /* Anticipating but have found a candidate 76 - * or timed out */ 77 - }; 78 - 79 - struct as_data { 80 - /* 81 - * run time data 82 - */ 83 - 84 - struct request_queue *q; /* the "owner" queue */ 85 - 86 - /* 87 - * requests (as_rq s) are present on both sort_list and fifo_list 88 - */ 89 - struct rb_root sort_list[2]; 90 - struct list_head fifo_list[2]; 91 - 92 - struct request *next_rq[2]; /* next in sort order */ 93 - sector_t last_sector[2]; /* last SYNC & ASYNC sectors */ 94 - 95 - unsigned long exit_prob; /* probability a task will exit while 96 - being waited on */ 97 - unsigned long exit_no_coop; /* probablility an exited task will 98 - not be part of a later cooperating 99 - request */ 100 - unsigned long new_ttime_total; /* mean thinktime on new proc */ 101 - unsigned long new_ttime_mean; 102 - u64 new_seek_total; /* mean seek on new proc */ 103 - sector_t new_seek_mean; 104 - 105 - unsigned long current_batch_expires; 106 - unsigned long last_check_fifo[2]; 107 - int changed_batch; /* 1: waiting for old batch to end */ 108 - int new_batch; /* 1: waiting on first read complete */ 109 - int batch_data_dir; /* current batch SYNC / ASYNC */ 110 - int write_batch_count; /* max # of reqs in a write batch */ 111 - int current_write_count; /* how many requests left this batch */ 112 - int write_batch_idled; /* has the write batch gone idle? */ 113 - 114 - enum anticipation_status antic_status; 115 - unsigned long antic_start; /* jiffies: when it started */ 116 - struct timer_list antic_timer; /* anticipatory scheduling timer */ 117 - struct work_struct antic_work; /* Deferred unplugging */ 118 - struct io_context *io_context; /* Identify the expected process */ 119 - int ioc_finished; /* IO associated with io_context is finished */ 120 - int nr_dispatched; 121 - 122 - /* 123 - * settings that change how the i/o scheduler behaves 124 - */ 125 - unsigned long fifo_expire[2]; 126 - unsigned long batch_expire[2]; 127 - unsigned long antic_expire; 128 - }; 129 - 130 - /* 131 - * per-request data. 132 - */ 133 - enum arq_state { 134 - AS_RQ_NEW=0, /* New - not referenced and not on any lists */ 135 - AS_RQ_QUEUED, /* In the request queue. It belongs to the 136 - scheduler */ 137 - AS_RQ_DISPATCHED, /* On the dispatch list. 
It belongs to the 138 - driver now */ 139 - AS_RQ_PRESCHED, /* Debug poisoning for requests being used */ 140 - AS_RQ_REMOVED, 141 - AS_RQ_MERGED, 142 - AS_RQ_POSTSCHED, /* when they shouldn't be */ 143 - }; 144 - 145 - #define RQ_IOC(rq) ((struct io_context *) (rq)->elevator_private) 146 - #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) 147 - #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) 148 - 149 - static DEFINE_PER_CPU(unsigned long, as_ioc_count); 150 - static struct completion *ioc_gone; 151 - static DEFINE_SPINLOCK(ioc_gone_lock); 152 - 153 - static void as_move_to_dispatch(struct as_data *ad, struct request *rq); 154 - static void as_antic_stop(struct as_data *ad); 155 - 156 - /* 157 - * IO Context helper functions 158 - */ 159 - 160 - /* Called to deallocate the as_io_context */ 161 - static void free_as_io_context(struct as_io_context *aic) 162 - { 163 - kfree(aic); 164 - elv_ioc_count_dec(as_ioc_count); 165 - if (ioc_gone) { 166 - /* 167 - * AS scheduler is exiting, grab exit lock and check 168 - * the pending io context count. If it hits zero, 169 - * complete ioc_gone and set it back to NULL. 170 - */ 171 - spin_lock(&ioc_gone_lock); 172 - if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { 173 - complete(ioc_gone); 174 - ioc_gone = NULL; 175 - } 176 - spin_unlock(&ioc_gone_lock); 177 - } 178 - } 179 - 180 - static void as_trim(struct io_context *ioc) 181 - { 182 - spin_lock_irq(&ioc->lock); 183 - if (ioc->aic) 184 - free_as_io_context(ioc->aic); 185 - ioc->aic = NULL; 186 - spin_unlock_irq(&ioc->lock); 187 - } 188 - 189 - /* Called when the task exits */ 190 - static void exit_as_io_context(struct as_io_context *aic) 191 - { 192 - WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state)); 193 - clear_bit(AS_TASK_RUNNING, &aic->state); 194 - } 195 - 196 - static struct as_io_context *alloc_as_io_context(void) 197 - { 198 - struct as_io_context *ret; 199 - 200 - ret = kmalloc(sizeof(*ret), GFP_ATOMIC); 201 - if (ret) { 202 - ret->dtor = free_as_io_context; 203 - ret->exit = exit_as_io_context; 204 - ret->state = 1 << AS_TASK_RUNNING; 205 - atomic_set(&ret->nr_queued, 0); 206 - atomic_set(&ret->nr_dispatched, 0); 207 - spin_lock_init(&ret->lock); 208 - ret->ttime_total = 0; 209 - ret->ttime_samples = 0; 210 - ret->ttime_mean = 0; 211 - ret->seek_total = 0; 212 - ret->seek_samples = 0; 213 - ret->seek_mean = 0; 214 - elv_ioc_count_inc(as_ioc_count); 215 - } 216 - 217 - return ret; 218 - } 219 - 220 - /* 221 - * If the current task has no AS IO context then create one and initialise it. 222 - * Then take a ref on the task's io context and return it. 
223 - */ 224 - static struct io_context *as_get_io_context(int node) 225 - { 226 - struct io_context *ioc = get_io_context(GFP_ATOMIC, node); 227 - if (ioc && !ioc->aic) { 228 - ioc->aic = alloc_as_io_context(); 229 - if (!ioc->aic) { 230 - put_io_context(ioc); 231 - ioc = NULL; 232 - } 233 - } 234 - return ioc; 235 - } 236 - 237 - static void as_put_io_context(struct request *rq) 238 - { 239 - struct as_io_context *aic; 240 - 241 - if (unlikely(!RQ_IOC(rq))) 242 - return; 243 - 244 - aic = RQ_IOC(rq)->aic; 245 - 246 - if (rq_is_sync(rq) && aic) { 247 - unsigned long flags; 248 - 249 - spin_lock_irqsave(&aic->lock, flags); 250 - set_bit(AS_TASK_IORUNNING, &aic->state); 251 - aic->last_end_request = jiffies; 252 - spin_unlock_irqrestore(&aic->lock, flags); 253 - } 254 - 255 - put_io_context(RQ_IOC(rq)); 256 - } 257 - 258 - /* 259 - * rb tree support functions 260 - */ 261 - #define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))]) 262 - 263 - static void as_add_rq_rb(struct as_data *ad, struct request *rq) 264 - { 265 - struct request *alias; 266 - 267 - while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) { 268 - as_move_to_dispatch(ad, alias); 269 - as_antic_stop(ad); 270 - } 271 - } 272 - 273 - static inline void as_del_rq_rb(struct as_data *ad, struct request *rq) 274 - { 275 - elv_rb_del(RQ_RB_ROOT(ad, rq), rq); 276 - } 277 - 278 - /* 279 - * IO Scheduler proper 280 - */ 281 - 282 - #define MAXBACK (1024 * 1024) /* 283 - * Maximum distance the disk will go backward 284 - * for a request. 285 - */ 286 - 287 - #define BACK_PENALTY 2 288 - 289 - /* 290 - * as_choose_req selects the preferred one of two requests of the same data_dir 291 - * ignoring time - eg. timeouts, which is the job of as_dispatch_request 292 - */ 293 - static struct request * 294 - as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2) 295 - { 296 - int data_dir; 297 - sector_t last, s1, s2, d1, d2; 298 - int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */ 299 - const sector_t maxback = MAXBACK; 300 - 301 - if (rq1 == NULL || rq1 == rq2) 302 - return rq2; 303 - if (rq2 == NULL) 304 - return rq1; 305 - 306 - data_dir = rq_is_sync(rq1); 307 - 308 - last = ad->last_sector[data_dir]; 309 - s1 = blk_rq_pos(rq1); 310 - s2 = blk_rq_pos(rq2); 311 - 312 - BUG_ON(data_dir != rq_is_sync(rq2)); 313 - 314 - /* 315 - * Strict one way elevator _except_ in the case where we allow 316 - * short backward seeks which are biased as twice the cost of a 317 - * similar forward seek. 318 - */ 319 - if (s1 >= last) 320 - d1 = s1 - last; 321 - else if (s1+maxback >= last) 322 - d1 = (last - s1)*BACK_PENALTY; 323 - else { 324 - r1_wrap = 1; 325 - d1 = 0; /* shut up, gcc */ 326 - } 327 - 328 - if (s2 >= last) 329 - d2 = s2 - last; 330 - else if (s2+maxback >= last) 331 - d2 = (last - s2)*BACK_PENALTY; 332 - else { 333 - r2_wrap = 1; 334 - d2 = 0; 335 - } 336 - 337 - /* Found required data */ 338 - if (!r1_wrap && r2_wrap) 339 - return rq1; 340 - else if (!r2_wrap && r1_wrap) 341 - return rq2; 342 - else if (r1_wrap && r2_wrap) { 343 - /* both behind the head */ 344 - if (s1 <= s2) 345 - return rq1; 346 - else 347 - return rq2; 348 - } 349 - 350 - /* Both requests in front of the head */ 351 - if (d1 < d2) 352 - return rq1; 353 - else if (d2 < d1) 354 - return rq2; 355 - else { 356 - if (s1 >= s2) 357 - return rq1; 358 - else 359 - return rq2; 360 - } 361 - } 362 - 363 - /* 364 - * as_find_next_rq finds the next request after @prev in elevator order. 
365 - * this with as_choose_req form the basis for how the scheduler chooses 366 - * what request to process next. Anticipation works on top of this. 367 - */ 368 - static struct request * 369 - as_find_next_rq(struct as_data *ad, struct request *last) 370 - { 371 - struct rb_node *rbnext = rb_next(&last->rb_node); 372 - struct rb_node *rbprev = rb_prev(&last->rb_node); 373 - struct request *next = NULL, *prev = NULL; 374 - 375 - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 376 - 377 - if (rbprev) 378 - prev = rb_entry_rq(rbprev); 379 - 380 - if (rbnext) 381 - next = rb_entry_rq(rbnext); 382 - else { 383 - const int data_dir = rq_is_sync(last); 384 - 385 - rbnext = rb_first(&ad->sort_list[data_dir]); 386 - if (rbnext && rbnext != &last->rb_node) 387 - next = rb_entry_rq(rbnext); 388 - } 389 - 390 - return as_choose_req(ad, next, prev); 391 - } 392 - 393 - /* 394 - * anticipatory scheduling functions follow 395 - */ 396 - 397 - /* 398 - * as_antic_expired tells us when we have anticipated too long. 399 - * The funny "absolute difference" math on the elapsed time is to handle 400 - * jiffy wraps, and disks which have been idle for 0x80000000 jiffies. 401 - */ 402 - static int as_antic_expired(struct as_data *ad) 403 - { 404 - long delta_jif; 405 - 406 - delta_jif = jiffies - ad->antic_start; 407 - if (unlikely(delta_jif < 0)) 408 - delta_jif = -delta_jif; 409 - if (delta_jif < ad->antic_expire) 410 - return 0; 411 - 412 - return 1; 413 - } 414 - 415 - /* 416 - * as_antic_waitnext starts anticipating that a nice request will soon be 417 - * submitted. See also as_antic_waitreq 418 - */ 419 - static void as_antic_waitnext(struct as_data *ad) 420 - { 421 - unsigned long timeout; 422 - 423 - BUG_ON(ad->antic_status != ANTIC_OFF 424 - && ad->antic_status != ANTIC_WAIT_REQ); 425 - 426 - timeout = ad->antic_start + ad->antic_expire; 427 - 428 - mod_timer(&ad->antic_timer, timeout); 429 - 430 - ad->antic_status = ANTIC_WAIT_NEXT; 431 - } 432 - 433 - /* 434 - * as_antic_waitreq starts anticipating. We don't start timing the anticipation 435 - * until the request that we're anticipating on has finished. This means we 436 - * are timing from when the candidate process wakes up hopefully. 437 - */ 438 - static void as_antic_waitreq(struct as_data *ad) 439 - { 440 - BUG_ON(ad->antic_status == ANTIC_FINISHED); 441 - if (ad->antic_status == ANTIC_OFF) { 442 - if (!ad->io_context || ad->ioc_finished) 443 - as_antic_waitnext(ad); 444 - else 445 - ad->antic_status = ANTIC_WAIT_REQ; 446 - } 447 - } 448 - 449 - /* 450 - * This is called directly by the functions in this file to stop anticipation. 451 - * We kill the timer and schedule a call to the request_fn asap. 452 - */ 453 - static void as_antic_stop(struct as_data *ad) 454 - { 455 - int status = ad->antic_status; 456 - 457 - if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) { 458 - if (status == ANTIC_WAIT_NEXT) 459 - del_timer(&ad->antic_timer); 460 - ad->antic_status = ANTIC_FINISHED; 461 - /* see as_work_handler */ 462 - kblockd_schedule_work(ad->q, &ad->antic_work); 463 - } 464 - } 465 - 466 - /* 467 - * as_antic_timeout is the timer function set by as_antic_waitnext. 
468 - */ 469 - static void as_antic_timeout(unsigned long data) 470 - { 471 - struct request_queue *q = (struct request_queue *)data; 472 - struct as_data *ad = q->elevator->elevator_data; 473 - unsigned long flags; 474 - 475 - spin_lock_irqsave(q->queue_lock, flags); 476 - if (ad->antic_status == ANTIC_WAIT_REQ 477 - || ad->antic_status == ANTIC_WAIT_NEXT) { 478 - struct as_io_context *aic; 479 - spin_lock(&ad->io_context->lock); 480 - aic = ad->io_context->aic; 481 - 482 - ad->antic_status = ANTIC_FINISHED; 483 - kblockd_schedule_work(q, &ad->antic_work); 484 - 485 - if (aic->ttime_samples == 0) { 486 - /* process anticipated on has exited or timed out*/ 487 - ad->exit_prob = (7*ad->exit_prob + 256)/8; 488 - } 489 - if (!test_bit(AS_TASK_RUNNING, &aic->state)) { 490 - /* process not "saved" by a cooperating request */ 491 - ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8; 492 - } 493 - spin_unlock(&ad->io_context->lock); 494 - } 495 - spin_unlock_irqrestore(q->queue_lock, flags); 496 - } 497 - 498 - static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, 499 - unsigned long ttime) 500 - { 501 - /* fixed point: 1.0 == 1<<8 */ 502 - if (aic->ttime_samples == 0) { 503 - ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8; 504 - ad->new_ttime_mean = ad->new_ttime_total / 256; 505 - 506 - ad->exit_prob = (7*ad->exit_prob)/8; 507 - } 508 - aic->ttime_samples = (7*aic->ttime_samples + 256) / 8; 509 - aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8; 510 - aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples; 511 - } 512 - 513 - static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, 514 - sector_t sdist) 515 - { 516 - u64 total; 517 - 518 - if (aic->seek_samples == 0) { 519 - ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8; 520 - ad->new_seek_mean = ad->new_seek_total / 256; 521 - } 522 - 523 - /* 524 - * Don't allow the seek distance to get too large from the 525 - * odd fragment, pagein, etc 526 - */ 527 - if (aic->seek_samples <= 60) /* second&third seek */ 528 - sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024); 529 - else 530 - sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64); 531 - 532 - aic->seek_samples = (7*aic->seek_samples + 256) / 8; 533 - aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8; 534 - total = aic->seek_total + (aic->seek_samples/2); 535 - do_div(total, aic->seek_samples); 536 - aic->seek_mean = (sector_t)total; 537 - } 538 - 539 - /* 540 - * as_update_iohist keeps a decaying histogram of IO thinktimes, and 541 - * updates @aic->ttime_mean based on that. It is called when a new 542 - * request is queued. 
543 - */ 544 - static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, 545 - struct request *rq) 546 - { 547 - int data_dir = rq_is_sync(rq); 548 - unsigned long thinktime = 0; 549 - sector_t seek_dist; 550 - 551 - if (aic == NULL) 552 - return; 553 - 554 - if (data_dir == BLK_RW_SYNC) { 555 - unsigned long in_flight = atomic_read(&aic->nr_queued) 556 - + atomic_read(&aic->nr_dispatched); 557 - spin_lock(&aic->lock); 558 - if (test_bit(AS_TASK_IORUNNING, &aic->state) || 559 - test_bit(AS_TASK_IOSTARTED, &aic->state)) { 560 - /* Calculate read -> read thinktime */ 561 - if (test_bit(AS_TASK_IORUNNING, &aic->state) 562 - && in_flight == 0) { 563 - thinktime = jiffies - aic->last_end_request; 564 - thinktime = min(thinktime, MAX_THINKTIME-1); 565 - } 566 - as_update_thinktime(ad, aic, thinktime); 567 - 568 - /* Calculate read -> read seek distance */ 569 - if (aic->last_request_pos < blk_rq_pos(rq)) 570 - seek_dist = blk_rq_pos(rq) - 571 - aic->last_request_pos; 572 - else 573 - seek_dist = aic->last_request_pos - 574 - blk_rq_pos(rq); 575 - as_update_seekdist(ad, aic, seek_dist); 576 - } 577 - aic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 578 - set_bit(AS_TASK_IOSTARTED, &aic->state); 579 - spin_unlock(&aic->lock); 580 - } 581 - } 582 - 583 - /* 584 - * as_close_req decides if one request is considered "close" to the 585 - * previous one issued. 586 - */ 587 - static int as_close_req(struct as_data *ad, struct as_io_context *aic, 588 - struct request *rq) 589 - { 590 - unsigned long delay; /* jiffies */ 591 - sector_t last = ad->last_sector[ad->batch_data_dir]; 592 - sector_t next = blk_rq_pos(rq); 593 - sector_t delta; /* acceptable close offset (in sectors) */ 594 - sector_t s; 595 - 596 - if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished) 597 - delay = 0; 598 - else 599 - delay = jiffies - ad->antic_start; 600 - 601 - if (delay == 0) 602 - delta = 8192; 603 - else if (delay <= (20 * HZ / 1000) && delay <= ad->antic_expire) 604 - delta = 8192 << delay; 605 - else 606 - return 1; 607 - 608 - if ((last <= next + (delta>>1)) && (next <= last + delta)) 609 - return 1; 610 - 611 - if (last < next) 612 - s = next - last; 613 - else 614 - s = last - next; 615 - 616 - if (aic->seek_samples == 0) { 617 - /* 618 - * Process has just started IO. Use past statistics to 619 - * gauge success possibility 620 - */ 621 - if (ad->new_seek_mean > s) { 622 - /* this request is better than what we're expecting */ 623 - return 1; 624 - } 625 - 626 - } else { 627 - if (aic->seek_mean > s) { 628 - /* this request is better than what we're expecting */ 629 - return 1; 630 - } 631 - } 632 - 633 - return 0; 634 - } 635 - 636 - /* 637 - * as_can_break_anticipation returns true if we have been anticipating this 638 - * request. 639 - * 640 - * It also returns true if the process against which we are anticipating 641 - * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to 642 - * dispatch it ASAP, because we know that application will not be submitting 643 - * any new reads. 644 - * 645 - * If the task which has submitted the request has exited, break anticipation. 646 - * 647 - * If this task has queued some other IO, do not enter enticipation. 
648 - */ 649 - static int as_can_break_anticipation(struct as_data *ad, struct request *rq) 650 - { 651 - struct io_context *ioc; 652 - struct as_io_context *aic; 653 - 654 - ioc = ad->io_context; 655 - BUG_ON(!ioc); 656 - spin_lock(&ioc->lock); 657 - 658 - if (rq && ioc == RQ_IOC(rq)) { 659 - /* request from same process */ 660 - spin_unlock(&ioc->lock); 661 - return 1; 662 - } 663 - 664 - if (ad->ioc_finished && as_antic_expired(ad)) { 665 - /* 666 - * In this situation status should really be FINISHED, 667 - * however the timer hasn't had the chance to run yet. 668 - */ 669 - spin_unlock(&ioc->lock); 670 - return 1; 671 - } 672 - 673 - aic = ioc->aic; 674 - if (!aic) { 675 - spin_unlock(&ioc->lock); 676 - return 0; 677 - } 678 - 679 - if (atomic_read(&aic->nr_queued) > 0) { 680 - /* process has more requests queued */ 681 - spin_unlock(&ioc->lock); 682 - return 1; 683 - } 684 - 685 - if (atomic_read(&aic->nr_dispatched) > 0) { 686 - /* process has more requests dispatched */ 687 - spin_unlock(&ioc->lock); 688 - return 1; 689 - } 690 - 691 - if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) { 692 - /* 693 - * Found a close request that is not one of ours. 694 - * 695 - * This makes close requests from another process update 696 - * our IO history. Is generally useful when there are 697 - * two or more cooperating processes working in the same 698 - * area. 699 - */ 700 - if (!test_bit(AS_TASK_RUNNING, &aic->state)) { 701 - if (aic->ttime_samples == 0) 702 - ad->exit_prob = (7*ad->exit_prob + 256)/8; 703 - 704 - ad->exit_no_coop = (7*ad->exit_no_coop)/8; 705 - } 706 - 707 - as_update_iohist(ad, aic, rq); 708 - spin_unlock(&ioc->lock); 709 - return 1; 710 - } 711 - 712 - if (!test_bit(AS_TASK_RUNNING, &aic->state)) { 713 - /* process anticipated on has exited */ 714 - if (aic->ttime_samples == 0) 715 - ad->exit_prob = (7*ad->exit_prob + 256)/8; 716 - 717 - if (ad->exit_no_coop > 128) { 718 - spin_unlock(&ioc->lock); 719 - return 1; 720 - } 721 - } 722 - 723 - if (aic->ttime_samples == 0) { 724 - if (ad->new_ttime_mean > ad->antic_expire) { 725 - spin_unlock(&ioc->lock); 726 - return 1; 727 - } 728 - if (ad->exit_prob * ad->exit_no_coop > 128*256) { 729 - spin_unlock(&ioc->lock); 730 - return 1; 731 - } 732 - } else if (aic->ttime_mean > ad->antic_expire) { 733 - /* the process thinks too much between requests */ 734 - spin_unlock(&ioc->lock); 735 - return 1; 736 - } 737 - spin_unlock(&ioc->lock); 738 - return 0; 739 - } 740 - 741 - /* 742 - * as_can_anticipate indicates whether we should either run rq 743 - * or keep anticipating a better request. 744 - */ 745 - static int as_can_anticipate(struct as_data *ad, struct request *rq) 746 - { 747 - #if 0 /* disable for now, we need to check tag level as well */ 748 - /* 749 - * SSD device without seek penalty, disable idling 750 - */ 751 - if (blk_queue_nonrot(ad->q)) axman 752 - return 0; 753 - #endif 754 - 755 - if (!ad->io_context) 756 - /* 757 - * Last request submitted was a write 758 - */ 759 - return 0; 760 - 761 - if (ad->antic_status == ANTIC_FINISHED) 762 - /* 763 - * Don't restart if we have just finished. Run the next request 764 - */ 765 - return 0; 766 - 767 - if (as_can_break_anticipation(ad, rq)) 768 - /* 769 - * This request is a good candidate. Don't keep anticipating, 770 - * run it. 771 - */ 772 - return 0; 773 - 774 - /* 775 - * OK from here, we haven't finished, and don't have a decent request! 
776 - * Status is either ANTIC_OFF so start waiting, 777 - * ANTIC_WAIT_REQ so continue waiting for request to finish 778 - * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request. 779 - */ 780 - 781 - return 1; 782 - } 783 - 784 - /* 785 - * as_update_rq must be called whenever a request (rq) is added to 786 - * the sort_list. This function keeps caches up to date, and checks if the 787 - * request might be one we are "anticipating" 788 - */ 789 - static void as_update_rq(struct as_data *ad, struct request *rq) 790 - { 791 - const int data_dir = rq_is_sync(rq); 792 - 793 - /* keep the next_rq cache up to date */ 794 - ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]); 795 - 796 - /* 797 - * have we been anticipating this request? 798 - * or does it come from the same process as the one we are anticipating 799 - * for? 800 - */ 801 - if (ad->antic_status == ANTIC_WAIT_REQ 802 - || ad->antic_status == ANTIC_WAIT_NEXT) { 803 - if (as_can_break_anticipation(ad, rq)) 804 - as_antic_stop(ad); 805 - } 806 - } 807 - 808 - /* 809 - * Gathers timings and resizes the write batch automatically 810 - */ 811 - static void update_write_batch(struct as_data *ad) 812 - { 813 - unsigned long batch = ad->batch_expire[BLK_RW_ASYNC]; 814 - long write_time; 815 - 816 - write_time = (jiffies - ad->current_batch_expires) + batch; 817 - if (write_time < 0) 818 - write_time = 0; 819 - 820 - if (write_time > batch && !ad->write_batch_idled) { 821 - if (write_time > batch * 3) 822 - ad->write_batch_count /= 2; 823 - else 824 - ad->write_batch_count--; 825 - } else if (write_time < batch && ad->current_write_count == 0) { 826 - if (batch > write_time * 3) 827 - ad->write_batch_count *= 2; 828 - else 829 - ad->write_batch_count++; 830 - } 831 - 832 - if (ad->write_batch_count < 1) 833 - ad->write_batch_count = 1; 834 - } 835 - 836 - /* 837 - * as_completed_request is to be called when a request has completed and 838 - * returned something to the requesting process, be it an error or data. 839 - */ 840 - static void as_completed_request(struct request_queue *q, struct request *rq) 841 - { 842 - struct as_data *ad = q->elevator->elevator_data; 843 - 844 - WARN_ON(!list_empty(&rq->queuelist)); 845 - 846 - if (RQ_STATE(rq) != AS_RQ_REMOVED) { 847 - WARN(1, "rq->state %d\n", RQ_STATE(rq)); 848 - goto out; 849 - } 850 - 851 - if (ad->changed_batch && ad->nr_dispatched == 1) { 852 - ad->current_batch_expires = jiffies + 853 - ad->batch_expire[ad->batch_data_dir]; 854 - kblockd_schedule_work(q, &ad->antic_work); 855 - ad->changed_batch = 0; 856 - 857 - if (ad->batch_data_dir == BLK_RW_SYNC) 858 - ad->new_batch = 1; 859 - } 860 - WARN_ON(ad->nr_dispatched == 0); 861 - ad->nr_dispatched--; 862 - 863 - /* 864 - * Start counting the batch from when a request of that direction is 865 - * actually serviced. 
This should help devices with big TCQ windows 866 - * and writeback caches 867 - */ 868 - if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { 869 - update_write_batch(ad); 870 - ad->current_batch_expires = jiffies + 871 - ad->batch_expire[BLK_RW_SYNC]; 872 - ad->new_batch = 0; 873 - } 874 - 875 - if (ad->io_context == RQ_IOC(rq) && ad->io_context) { 876 - ad->antic_start = jiffies; 877 - ad->ioc_finished = 1; 878 - if (ad->antic_status == ANTIC_WAIT_REQ) { 879 - /* 880 - * We were waiting on this request, now anticipate 881 - * the next one 882 - */ 883 - as_antic_waitnext(ad); 884 - } 885 - } 886 - 887 - as_put_io_context(rq); 888 - out: 889 - RQ_SET_STATE(rq, AS_RQ_POSTSCHED); 890 - } 891 - 892 - /* 893 - * as_remove_queued_request removes a request from the pre dispatch queue 894 - * without updating refcounts. It is expected the caller will drop the 895 - * reference unless it replaces the request at somepart of the elevator 896 - * (ie. the dispatch queue) 897 - */ 898 - static void as_remove_queued_request(struct request_queue *q, 899 - struct request *rq) 900 - { 901 - const int data_dir = rq_is_sync(rq); 902 - struct as_data *ad = q->elevator->elevator_data; 903 - struct io_context *ioc; 904 - 905 - WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); 906 - 907 - ioc = RQ_IOC(rq); 908 - if (ioc && ioc->aic) { 909 - BUG_ON(!atomic_read(&ioc->aic->nr_queued)); 910 - atomic_dec(&ioc->aic->nr_queued); 911 - } 912 - 913 - /* 914 - * Update the "next_rq" cache if we are about to remove its 915 - * entry 916 - */ 917 - if (ad->next_rq[data_dir] == rq) 918 - ad->next_rq[data_dir] = as_find_next_rq(ad, rq); 919 - 920 - rq_fifo_clear(rq); 921 - as_del_rq_rb(ad, rq); 922 - } 923 - 924 - /* 925 - * as_fifo_expired returns 0 if there are no expired requests on the fifo, 926 - * 1 otherwise. It is ratelimited so that we only perform the check once per 927 - * `fifo_expire' interval. Otherwise a large number of expired requests 928 - * would create a hopeless seekstorm. 929 - * 930 - * See as_antic_expired comment. 931 - */ 932 - static int as_fifo_expired(struct as_data *ad, int adir) 933 - { 934 - struct request *rq; 935 - long delta_jif; 936 - 937 - delta_jif = jiffies - ad->last_check_fifo[adir]; 938 - if (unlikely(delta_jif < 0)) 939 - delta_jif = -delta_jif; 940 - if (delta_jif < ad->fifo_expire[adir]) 941 - return 0; 942 - 943 - ad->last_check_fifo[adir] = jiffies; 944 - 945 - if (list_empty(&ad->fifo_list[adir])) 946 - return 0; 947 - 948 - rq = rq_entry_fifo(ad->fifo_list[adir].next); 949 - 950 - return time_after(jiffies, rq_fifo_time(rq)); 951 - } 952 - 953 - /* 954 - * as_batch_expired returns true if the current batch has expired. A batch 955 - * is a set of reads or a set of writes. 956 - */ 957 - static inline int as_batch_expired(struct as_data *ad) 958 - { 959 - if (ad->changed_batch || ad->new_batch) 960 - return 0; 961 - 962 - if (ad->batch_data_dir == BLK_RW_SYNC) 963 - /* TODO! add a check so a complete fifo gets written? 
*/ 964 - return time_after(jiffies, ad->current_batch_expires); 965 - 966 - return time_after(jiffies, ad->current_batch_expires) 967 - || ad->current_write_count == 0; 968 - } 969 - 970 - /* 971 - * move an entry to dispatch queue 972 - */ 973 - static void as_move_to_dispatch(struct as_data *ad, struct request *rq) 974 - { 975 - const int data_dir = rq_is_sync(rq); 976 - 977 - BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); 978 - 979 - as_antic_stop(ad); 980 - ad->antic_status = ANTIC_OFF; 981 - 982 - /* 983 - * This has to be set in order to be correctly updated by 984 - * as_find_next_rq 985 - */ 986 - ad->last_sector[data_dir] = blk_rq_pos(rq) + blk_rq_sectors(rq); 987 - 988 - if (data_dir == BLK_RW_SYNC) { 989 - struct io_context *ioc = RQ_IOC(rq); 990 - /* In case we have to anticipate after this */ 991 - copy_io_context(&ad->io_context, &ioc); 992 - } else { 993 - if (ad->io_context) { 994 - put_io_context(ad->io_context); 995 - ad->io_context = NULL; 996 - } 997 - 998 - if (ad->current_write_count != 0) 999 - ad->current_write_count--; 1000 - } 1001 - ad->ioc_finished = 0; 1002 - 1003 - ad->next_rq[data_dir] = as_find_next_rq(ad, rq); 1004 - 1005 - /* 1006 - * take it off the sort and fifo list, add to dispatch queue 1007 - */ 1008 - as_remove_queued_request(ad->q, rq); 1009 - WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); 1010 - 1011 - elv_dispatch_sort(ad->q, rq); 1012 - 1013 - RQ_SET_STATE(rq, AS_RQ_DISPATCHED); 1014 - if (RQ_IOC(rq) && RQ_IOC(rq)->aic) 1015 - atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); 1016 - ad->nr_dispatched++; 1017 - } 1018 - 1019 - /* 1020 - * as_dispatch_request selects the best request according to 1021 - * read/write expire, batch expire, etc, and moves it to the dispatch 1022 - * queue. Returns 1 if a request was found, 0 otherwise. 1023 - */ 1024 - static int as_dispatch_request(struct request_queue *q, int force) 1025 - { 1026 - struct as_data *ad = q->elevator->elevator_data; 1027 - const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]); 1028 - const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]); 1029 - struct request *rq; 1030 - 1031 - if (unlikely(force)) { 1032 - /* 1033 - * Forced dispatch, accounting is useless. Reset 1034 - * accounting states and dump fifo_lists. Note that 1035 - * batch_data_dir is reset to BLK_RW_SYNC to avoid 1036 - * screwing write batch accounting as write batch 1037 - * accounting occurs on W->R transition. 
1038 - */ 1039 - int dispatched = 0; 1040 - 1041 - ad->batch_data_dir = BLK_RW_SYNC; 1042 - ad->changed_batch = 0; 1043 - ad->new_batch = 0; 1044 - 1045 - while (ad->next_rq[BLK_RW_SYNC]) { 1046 - as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]); 1047 - dispatched++; 1048 - } 1049 - ad->last_check_fifo[BLK_RW_SYNC] = jiffies; 1050 - 1051 - while (ad->next_rq[BLK_RW_ASYNC]) { 1052 - as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]); 1053 - dispatched++; 1054 - } 1055 - ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; 1056 - 1057 - return dispatched; 1058 - } 1059 - 1060 - /* Signal that the write batch was uncontended, so we can't time it */ 1061 - if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) { 1062 - if (ad->current_write_count == 0 || !writes) 1063 - ad->write_batch_idled = 1; 1064 - } 1065 - 1066 - if (!(reads || writes) 1067 - || ad->antic_status == ANTIC_WAIT_REQ 1068 - || ad->antic_status == ANTIC_WAIT_NEXT 1069 - || ad->changed_batch) 1070 - return 0; 1071 - 1072 - if (!(reads && writes && as_batch_expired(ad))) { 1073 - /* 1074 - * batch is still running or no reads or no writes 1075 - */ 1076 - rq = ad->next_rq[ad->batch_data_dir]; 1077 - 1078 - if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) { 1079 - if (as_fifo_expired(ad, BLK_RW_SYNC)) 1080 - goto fifo_expired; 1081 - 1082 - if (as_can_anticipate(ad, rq)) { 1083 - as_antic_waitreq(ad); 1084 - return 0; 1085 - } 1086 - } 1087 - 1088 - if (rq) { 1089 - /* we have a "next request" */ 1090 - if (reads && !writes) 1091 - ad->current_batch_expires = 1092 - jiffies + ad->batch_expire[BLK_RW_SYNC]; 1093 - goto dispatch_request; 1094 - } 1095 - } 1096 - 1097 - /* 1098 - * at this point we are not running a batch. select the appropriate 1099 - * data direction (read / write) 1100 - */ 1101 - 1102 - if (reads) { 1103 - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC])); 1104 - 1105 - if (writes && ad->batch_data_dir == BLK_RW_SYNC) 1106 - /* 1107 - * Last batch was a read, switch to writes 1108 - */ 1109 - goto dispatch_writes; 1110 - 1111 - if (ad->batch_data_dir == BLK_RW_ASYNC) { 1112 - WARN_ON(ad->new_batch); 1113 - ad->changed_batch = 1; 1114 - } 1115 - ad->batch_data_dir = BLK_RW_SYNC; 1116 - rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next); 1117 - ad->last_check_fifo[ad->batch_data_dir] = jiffies; 1118 - goto dispatch_request; 1119 - } 1120 - 1121 - /* 1122 - * the last batch was a read 1123 - */ 1124 - 1125 - if (writes) { 1126 - dispatch_writes: 1127 - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC])); 1128 - 1129 - if (ad->batch_data_dir == BLK_RW_SYNC) { 1130 - ad->changed_batch = 1; 1131 - 1132 - /* 1133 - * new_batch might be 1 when the queue runs out of 1134 - * reads. A subsequent submission of a write might 1135 - * cause a change of batch before the read is finished. 1136 - */ 1137 - ad->new_batch = 0; 1138 - } 1139 - ad->batch_data_dir = BLK_RW_ASYNC; 1140 - ad->current_write_count = ad->write_batch_count; 1141 - ad->write_batch_idled = 0; 1142 - rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next); 1143 - ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; 1144 - goto dispatch_request; 1145 - } 1146 - 1147 - BUG(); 1148 - return 0; 1149 - 1150 - dispatch_request: 1151 - /* 1152 - * If a request has expired, service it. 
1153 - */ 1154 - 1155 - if (as_fifo_expired(ad, ad->batch_data_dir)) { 1156 - fifo_expired: 1157 - rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); 1158 - } 1159 - 1160 - if (ad->changed_batch) { 1161 - WARN_ON(ad->new_batch); 1162 - 1163 - if (ad->nr_dispatched) 1164 - return 0; 1165 - 1166 - if (ad->batch_data_dir == BLK_RW_ASYNC) 1167 - ad->current_batch_expires = jiffies + 1168 - ad->batch_expire[BLK_RW_ASYNC]; 1169 - else 1170 - ad->new_batch = 1; 1171 - 1172 - ad->changed_batch = 0; 1173 - } 1174 - 1175 - /* 1176 - * rq is the selected appropriate request. 1177 - */ 1178 - as_move_to_dispatch(ad, rq); 1179 - 1180 - return 1; 1181 - } 1182 - 1183 - /* 1184 - * add rq to rbtree and fifo 1185 - */ 1186 - static void as_add_request(struct request_queue *q, struct request *rq) 1187 - { 1188 - struct as_data *ad = q->elevator->elevator_data; 1189 - int data_dir; 1190 - 1191 - RQ_SET_STATE(rq, AS_RQ_NEW); 1192 - 1193 - data_dir = rq_is_sync(rq); 1194 - 1195 - rq->elevator_private = as_get_io_context(q->node); 1196 - 1197 - if (RQ_IOC(rq)) { 1198 - as_update_iohist(ad, RQ_IOC(rq)->aic, rq); 1199 - atomic_inc(&RQ_IOC(rq)->aic->nr_queued); 1200 - } 1201 - 1202 - as_add_rq_rb(ad, rq); 1203 - 1204 - /* 1205 - * set expire time and add to fifo list 1206 - */ 1207 - rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]); 1208 - list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]); 1209 - 1210 - as_update_rq(ad, rq); /* keep state machine up to date */ 1211 - RQ_SET_STATE(rq, AS_RQ_QUEUED); 1212 - } 1213 - 1214 - static void as_activate_request(struct request_queue *q, struct request *rq) 1215 - { 1216 - WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED); 1217 - RQ_SET_STATE(rq, AS_RQ_REMOVED); 1218 - if (RQ_IOC(rq) && RQ_IOC(rq)->aic) 1219 - atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched); 1220 - } 1221 - 1222 - static void as_deactivate_request(struct request_queue *q, struct request *rq) 1223 - { 1224 - WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED); 1225 - RQ_SET_STATE(rq, AS_RQ_DISPATCHED); 1226 - if (RQ_IOC(rq) && RQ_IOC(rq)->aic) 1227 - atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); 1228 - } 1229 - 1230 - /* 1231 - * as_queue_empty tells us if there are requests left in the device. 
It may 1232 - * not be the case that a driver can get the next request even if the queue 1233 - * is not empty - it is used in the block layer to check for plugging and 1234 - * merging opportunities 1235 - */ 1236 - static int as_queue_empty(struct request_queue *q) 1237 - { 1238 - struct as_data *ad = q->elevator->elevator_data; 1239 - 1240 - return list_empty(&ad->fifo_list[BLK_RW_ASYNC]) 1241 - && list_empty(&ad->fifo_list[BLK_RW_SYNC]); 1242 - } 1243 - 1244 - static int 1245 - as_merge(struct request_queue *q, struct request **req, struct bio *bio) 1246 - { 1247 - struct as_data *ad = q->elevator->elevator_data; 1248 - sector_t rb_key = bio->bi_sector + bio_sectors(bio); 1249 - struct request *__rq; 1250 - 1251 - /* 1252 - * check for front merge 1253 - */ 1254 - __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key); 1255 - if (__rq && elv_rq_merge_ok(__rq, bio)) { 1256 - *req = __rq; 1257 - return ELEVATOR_FRONT_MERGE; 1258 - } 1259 - 1260 - return ELEVATOR_NO_MERGE; 1261 - } 1262 - 1263 - static void as_merged_request(struct request_queue *q, struct request *req, 1264 - int type) 1265 - { 1266 - struct as_data *ad = q->elevator->elevator_data; 1267 - 1268 - /* 1269 - * if the merge was a front merge, we need to reposition request 1270 - */ 1271 - if (type == ELEVATOR_FRONT_MERGE) { 1272 - as_del_rq_rb(ad, req); 1273 - as_add_rq_rb(ad, req); 1274 - /* 1275 - * Note! At this stage of this and the next function, our next 1276 - * request may not be optimal - eg the request may have "grown" 1277 - * behind the disk head. We currently don't bother adjusting. 1278 - */ 1279 - } 1280 - } 1281 - 1282 - static void as_merged_requests(struct request_queue *q, struct request *req, 1283 - struct request *next) 1284 - { 1285 - /* 1286 - * if next expires before rq, assign its expire time to arq 1287 - * and move into next position (next will be deleted) in fifo 1288 - */ 1289 - if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { 1290 - if (time_before(rq_fifo_time(next), rq_fifo_time(req))) { 1291 - list_move(&req->queuelist, &next->queuelist); 1292 - rq_set_fifo_time(req, rq_fifo_time(next)); 1293 - } 1294 - } 1295 - 1296 - /* 1297 - * kill knowledge of next, this one is a goner 1298 - */ 1299 - as_remove_queued_request(q, next); 1300 - as_put_io_context(next); 1301 - 1302 - RQ_SET_STATE(next, AS_RQ_MERGED); 1303 - } 1304 - 1305 - /* 1306 - * This is executed in a "deferred" process context, by kblockd. It calls the 1307 - * driver's request_fn so the driver can submit that request. 1308 - * 1309 - * IMPORTANT! This guy will reenter the elevator, so set up all queue global 1310 - * state before calling, and don't rely on any state over calls. 1311 - * 1312 - * FIXME! dispatch queue is not a queue at all! 
1313 - */ 1314 - static void as_work_handler(struct work_struct *work) 1315 - { 1316 - struct as_data *ad = container_of(work, struct as_data, antic_work); 1317 - 1318 - blk_run_queue(ad->q); 1319 - } 1320 - 1321 - static int as_may_queue(struct request_queue *q, int rw) 1322 - { 1323 - int ret = ELV_MQUEUE_MAY; 1324 - struct as_data *ad = q->elevator->elevator_data; 1325 - struct io_context *ioc; 1326 - if (ad->antic_status == ANTIC_WAIT_REQ || 1327 - ad->antic_status == ANTIC_WAIT_NEXT) { 1328 - ioc = as_get_io_context(q->node); 1329 - if (ad->io_context == ioc) 1330 - ret = ELV_MQUEUE_MUST; 1331 - put_io_context(ioc); 1332 - } 1333 - 1334 - return ret; 1335 - } 1336 - 1337 - static void as_exit_queue(struct elevator_queue *e) 1338 - { 1339 - struct as_data *ad = e->elevator_data; 1340 - 1341 - del_timer_sync(&ad->antic_timer); 1342 - cancel_work_sync(&ad->antic_work); 1343 - 1344 - BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC])); 1345 - BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC])); 1346 - 1347 - put_io_context(ad->io_context); 1348 - kfree(ad); 1349 - } 1350 - 1351 - /* 1352 - * initialize elevator private data (as_data). 1353 - */ 1354 - static void *as_init_queue(struct request_queue *q) 1355 - { 1356 - struct as_data *ad; 1357 - 1358 - ad = kmalloc_node(sizeof(*ad), GFP_KERNEL | __GFP_ZERO, q->node); 1359 - if (!ad) 1360 - return NULL; 1361 - 1362 - ad->q = q; /* Identify what queue the data belongs to */ 1363 - 1364 - /* anticipatory scheduling helpers */ 1365 - ad->antic_timer.function = as_antic_timeout; 1366 - ad->antic_timer.data = (unsigned long)q; 1367 - init_timer(&ad->antic_timer); 1368 - INIT_WORK(&ad->antic_work, as_work_handler); 1369 - 1370 - INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]); 1371 - INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]); 1372 - ad->sort_list[BLK_RW_SYNC] = RB_ROOT; 1373 - ad->sort_list[BLK_RW_ASYNC] = RB_ROOT; 1374 - ad->fifo_expire[BLK_RW_SYNC] = default_read_expire; 1375 - ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire; 1376 - ad->antic_expire = default_antic_expire; 1377 - ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire; 1378 - ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire; 1379 - 1380 - ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC]; 1381 - ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10; 1382 - if (ad->write_batch_count < 2) 1383 - ad->write_batch_count = 2; 1384 - 1385 - return ad; 1386 - } 1387 - 1388 - /* 1389 - * sysfs parts below 1390 - */ 1391 - 1392 - static ssize_t 1393 - as_var_show(unsigned int var, char *page) 1394 - { 1395 - return sprintf(page, "%d\n", var); 1396 - } 1397 - 1398 - static ssize_t 1399 - as_var_store(unsigned long *var, const char *page, size_t count) 1400 - { 1401 - char *p = (char *) page; 1402 - 1403 - *var = simple_strtoul(p, &p, 10); 1404 - return count; 1405 - } 1406 - 1407 - static ssize_t est_time_show(struct elevator_queue *e, char *page) 1408 - { 1409 - struct as_data *ad = e->elevator_data; 1410 - int pos = 0; 1411 - 1412 - pos += sprintf(page+pos, "%lu %% exit probability\n", 1413 - 100*ad->exit_prob/256); 1414 - pos += sprintf(page+pos, "%lu %% probability of exiting without a " 1415 - "cooperating process submitting IO\n", 1416 - 100*ad->exit_no_coop/256); 1417 - pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean); 1418 - pos += sprintf(page+pos, "%llu sectors new seek distance\n", 1419 - (unsigned long long)ad->new_seek_mean); 1420 - 1421 - return pos; 1422 - } 1423 - 1424 - #define SHOW_FUNCTION(__FUNC, __VAR) \ 1425 - 
static ssize_t __FUNC(struct elevator_queue *e, char *page) \ 1426 - { \ 1427 - struct as_data *ad = e->elevator_data; \ 1428 - return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ 1429 - } 1430 - SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]); 1431 - SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]); 1432 - SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire); 1433 - SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]); 1434 - SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]); 1435 - #undef SHOW_FUNCTION 1436 - 1437 - #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ 1438 - static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ 1439 - { \ 1440 - struct as_data *ad = e->elevator_data; \ 1441 - int ret = as_var_store(__PTR, (page), count); \ 1442 - if (*(__PTR) < (MIN)) \ 1443 - *(__PTR) = (MIN); \ 1444 - else if (*(__PTR) > (MAX)) \ 1445 - *(__PTR) = (MAX); \ 1446 - *(__PTR) = msecs_to_jiffies(*(__PTR)); \ 1447 - return ret; \ 1448 - } 1449 - STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX); 1450 - STORE_FUNCTION(as_write_expire_store, 1451 - &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX); 1452 - STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX); 1453 - STORE_FUNCTION(as_read_batch_expire_store, 1454 - &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX); 1455 - STORE_FUNCTION(as_write_batch_expire_store, 1456 - &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX); 1457 - #undef STORE_FUNCTION 1458 - 1459 - #define AS_ATTR(name) \ 1460 - __ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store) 1461 - 1462 - static struct elv_fs_entry as_attrs[] = { 1463 - __ATTR_RO(est_time), 1464 - AS_ATTR(read_expire), 1465 - AS_ATTR(write_expire), 1466 - AS_ATTR(antic_expire), 1467 - AS_ATTR(read_batch_expire), 1468 - AS_ATTR(write_batch_expire), 1469 - __ATTR_NULL 1470 - }; 1471 - 1472 - static struct elevator_type iosched_as = { 1473 - .ops = { 1474 - .elevator_merge_fn = as_merge, 1475 - .elevator_merged_fn = as_merged_request, 1476 - .elevator_merge_req_fn = as_merged_requests, 1477 - .elevator_dispatch_fn = as_dispatch_request, 1478 - .elevator_add_req_fn = as_add_request, 1479 - .elevator_activate_req_fn = as_activate_request, 1480 - .elevator_deactivate_req_fn = as_deactivate_request, 1481 - .elevator_queue_empty_fn = as_queue_empty, 1482 - .elevator_completed_req_fn = as_completed_request, 1483 - .elevator_former_req_fn = elv_rb_former_request, 1484 - .elevator_latter_req_fn = elv_rb_latter_request, 1485 - .elevator_may_queue_fn = as_may_queue, 1486 - .elevator_init_fn = as_init_queue, 1487 - .elevator_exit_fn = as_exit_queue, 1488 - .trim = as_trim, 1489 - }, 1490 - 1491 - .elevator_attrs = as_attrs, 1492 - .elevator_name = "anticipatory", 1493 - .elevator_owner = THIS_MODULE, 1494 - }; 1495 - 1496 - static int __init as_init(void) 1497 - { 1498 - elv_register(&iosched_as); 1499 - 1500 - return 0; 1501 - } 1502 - 1503 - static void __exit as_exit(void) 1504 - { 1505 - DECLARE_COMPLETION_ONSTACK(all_gone); 1506 - elv_unregister(&iosched_as); 1507 - ioc_gone = &all_gone; 1508 - /* ioc_gone's update must be visible before reading ioc_count */ 1509 - smp_wmb(); 1510 - if (elv_ioc_count_read(as_ioc_count)) 1511 - wait_for_completion(&all_gone); 1512 - synchronize_rcu(); 1513 - } 1514 - 1515 - module_init(as_init); 1516 - module_exit(as_exit); 1517 - 1518 - MODULE_AUTHOR("Nick Piggin"); 1519 - MODULE_LICENSE("GPL"); 1520 - 
MODULE_DESCRIPTION("anticipatory IO scheduler");
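The SHOW_FUNCTION/STORE_FUNCTION macros in the deleted sysfs code stamp out one handler per tunable. Hand-expanding a single instance (shown for clarity only; this expansion is not part of the diff) makes the unit handling explicit — sysfs speaks milliseconds while the scheduler works in jiffies:

/* SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire) expands to: */
static ssize_t as_antic_expire_show(struct elevator_queue *e, char *page)
{
	struct as_data *ad = e->elevator_data;
	return as_var_show(jiffies_to_msecs((ad->antic_expire)), (page));
}

The matching STORE_FUNCTION expansion clamps the parsed value to [0, INT_MAX] and then converts it with msecs_to_jiffies() before storing it.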
+361
block/blk-cgroup.c
··· 1 + /* 2 + * Common Block IO controller cgroup interface 3 + * 4 + * Based on ideas and code from CFQ, CFS and BFQ: 5 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 6 + * 7 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> 8 + * Paolo Valente <paolo.valente@unimore.it> 9 + * 10 + * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> 11 + * Nauman Rafique <nauman@google.com> 12 + */ 13 + #include <linux/ioprio.h> 14 + #include <linux/seq_file.h> 15 + #include <linux/kdev_t.h> 16 + #include <linux/module.h> 17 + #include <linux/err.h> 18 + #include "blk-cgroup.h" 19 + 20 + static DEFINE_SPINLOCK(blkio_list_lock); 21 + static LIST_HEAD(blkio_list); 22 + 23 + struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; 24 + EXPORT_SYMBOL_GPL(blkio_root_cgroup); 25 + 26 + bool blkiocg_css_tryget(struct blkio_cgroup *blkcg) 27 + { 28 + if (!css_tryget(&blkcg->css)) 29 + return false; 30 + return true; 31 + } 32 + EXPORT_SYMBOL_GPL(blkiocg_css_tryget); 33 + 34 + void blkiocg_css_put(struct blkio_cgroup *blkcg) 35 + { 36 + css_put(&blkcg->css); 37 + } 38 + EXPORT_SYMBOL_GPL(blkiocg_css_put); 39 + 40 + struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) 41 + { 42 + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), 43 + struct blkio_cgroup, css); 44 + } 45 + EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); 46 + 47 + void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 48 + unsigned long time, unsigned long sectors) 49 + { 50 + blkg->time += time; 51 + blkg->sectors += sectors; 52 + } 53 + EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); 54 + 55 + void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 56 + struct blkio_group *blkg, void *key, dev_t dev) 57 + { 58 + unsigned long flags; 59 + 60 + spin_lock_irqsave(&blkcg->lock, flags); 61 + rcu_assign_pointer(blkg->key, key); 62 + blkg->blkcg_id = css_id(&blkcg->css); 63 + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 64 + spin_unlock_irqrestore(&blkcg->lock, flags); 65 + #ifdef CONFIG_DEBUG_BLK_CGROUP 66 + /* Need to take css reference ? */ 67 + cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); 68 + #endif 69 + blkg->dev = dev; 70 + } 71 + EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); 72 + 73 + static void __blkiocg_del_blkio_group(struct blkio_group *blkg) 74 + { 75 + hlist_del_init_rcu(&blkg->blkcg_node); 76 + blkg->blkcg_id = 0; 77 + } 78 + 79 + /* 80 + * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 81 + * indicating that blk_group was unhashed by the time we got to it. 82 + */ 83 + int blkiocg_del_blkio_group(struct blkio_group *blkg) 84 + { 85 + struct blkio_cgroup *blkcg; 86 + unsigned long flags; 87 + struct cgroup_subsys_state *css; 88 + int ret = 1; 89 + 90 + rcu_read_lock(); 91 + css = css_lookup(&blkio_subsys, blkg->blkcg_id); 92 + if (!css) 93 + goto out; 94 + 95 + blkcg = container_of(css, struct blkio_cgroup, css); 96 + spin_lock_irqsave(&blkcg->lock, flags); 97 + if (!hlist_unhashed(&blkg->blkcg_node)) { 98 + __blkiocg_del_blkio_group(blkg); 99 + ret = 0; 100 + } 101 + spin_unlock_irqrestore(&blkcg->lock, flags); 102 + out: 103 + rcu_read_unlock(); 104 + return ret; 105 + } 106 + EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); 107 + 108 + /* called under rcu_read_lock(). 
*/ 109 + struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) 110 + { 111 + struct blkio_group *blkg; 112 + struct hlist_node *n; 113 + void *__key; 114 + 115 + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { 116 + __key = blkg->key; 117 + if (__key == key) 118 + return blkg; 119 + } 120 + 121 + return NULL; 122 + } 123 + EXPORT_SYMBOL_GPL(blkiocg_lookup_group); 124 + 125 + #define SHOW_FUNCTION(__VAR) \ 126 + static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ 127 + struct cftype *cftype) \ 128 + { \ 129 + struct blkio_cgroup *blkcg; \ 130 + \ 131 + blkcg = cgroup_to_blkio_cgroup(cgroup); \ 132 + return (u64)blkcg->__VAR; \ 133 + } 134 + 135 + SHOW_FUNCTION(weight); 136 + #undef SHOW_FUNCTION 137 + 138 + static int 139 + blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) 140 + { 141 + struct blkio_cgroup *blkcg; 142 + struct blkio_group *blkg; 143 + struct hlist_node *n; 144 + struct blkio_policy_type *blkiop; 145 + 146 + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) 147 + return -EINVAL; 148 + 149 + blkcg = cgroup_to_blkio_cgroup(cgroup); 150 + spin_lock_irq(&blkcg->lock); 151 + blkcg->weight = (unsigned int)val; 152 + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 153 + spin_lock(&blkio_list_lock); 154 + list_for_each_entry(blkiop, &blkio_list, list) 155 + blkiop->ops.blkio_update_group_weight_fn(blkg, 156 + blkcg->weight); 157 + spin_unlock(&blkio_list_lock); 158 + } 159 + spin_unlock_irq(&blkcg->lock); 160 + return 0; 161 + } 162 + 163 + #define SHOW_FUNCTION_PER_GROUP(__VAR) \ 164 + static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ 165 + struct cftype *cftype, struct seq_file *m) \ 166 + { \ 167 + struct blkio_cgroup *blkcg; \ 168 + struct blkio_group *blkg; \ 169 + struct hlist_node *n; \ 170 + \ 171 + if (!cgroup_lock_live_group(cgroup)) \ 172 + return -ENODEV; \ 173 + \ 174 + blkcg = cgroup_to_blkio_cgroup(cgroup); \ 175 + rcu_read_lock(); \ 176 + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ 177 + if (blkg->dev) \ 178 + seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ 179 + MINOR(blkg->dev), blkg->__VAR); \ 180 + } \ 181 + rcu_read_unlock(); \ 182 + cgroup_unlock(); \ 183 + return 0; \ 184 + } 185 + 186 + SHOW_FUNCTION_PER_GROUP(time); 187 + SHOW_FUNCTION_PER_GROUP(sectors); 188 + #ifdef CONFIG_DEBUG_BLK_CGROUP 189 + SHOW_FUNCTION_PER_GROUP(dequeue); 190 + #endif 191 + #undef SHOW_FUNCTION_PER_GROUP 192 + 193 + #ifdef CONFIG_DEBUG_BLK_CGROUP 194 + void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, 195 + unsigned long dequeue) 196 + { 197 + blkg->dequeue += dequeue; 198 + } 199 + EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats); 200 + #endif 201 + 202 + struct cftype blkio_files[] = { 203 + { 204 + .name = "weight", 205 + .read_u64 = blkiocg_weight_read, 206 + .write_u64 = blkiocg_weight_write, 207 + }, 208 + { 209 + .name = "time", 210 + .read_seq_string = blkiocg_time_read, 211 + }, 212 + { 213 + .name = "sectors", 214 + .read_seq_string = blkiocg_sectors_read, 215 + }, 216 + #ifdef CONFIG_DEBUG_BLK_CGROUP 217 + { 218 + .name = "dequeue", 219 + .read_seq_string = blkiocg_dequeue_read, 220 + }, 221 + #endif 222 + }; 223 + 224 + static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) 225 + { 226 + return cgroup_add_files(cgroup, subsys, blkio_files, 227 + ARRAY_SIZE(blkio_files)); 228 + } 229 + 230 + static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) 231 + { 232 + 
struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); 233 + unsigned long flags; 234 + struct blkio_group *blkg; 235 + void *key; 236 + struct blkio_policy_type *blkiop; 237 + 238 + rcu_read_lock(); 239 + remove_entry: 240 + spin_lock_irqsave(&blkcg->lock, flags); 241 + 242 + if (hlist_empty(&blkcg->blkg_list)) { 243 + spin_unlock_irqrestore(&blkcg->lock, flags); 244 + goto done; 245 + } 246 + 247 + blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, 248 + blkcg_node); 249 + key = rcu_dereference(blkg->key); 250 + __blkiocg_del_blkio_group(blkg); 251 + 252 + spin_unlock_irqrestore(&blkcg->lock, flags); 253 + 254 + /* 255 + * This blkio_group is being unlinked as the associated cgroup is going 256 + * away. Let all the IO controlling policies know about this event. 257 + * 258 + * Currently this is a static call to one io controlling policy. Once 259 + * we have more policies in place, we need some dynamic registration 260 + * of callback functions. 261 + */ 262 + spin_lock(&blkio_list_lock); 263 + list_for_each_entry(blkiop, &blkio_list, list) 264 + blkiop->ops.blkio_unlink_group_fn(key, blkg); 265 + spin_unlock(&blkio_list_lock); 266 + goto remove_entry; 267 + done: 268 + free_css_id(&blkio_subsys, &blkcg->css); 269 + rcu_read_unlock(); 270 + kfree(blkcg); 271 + } 272 + 273 + static struct cgroup_subsys_state * 274 + blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) 275 + { 276 + struct blkio_cgroup *blkcg, *parent_blkcg; 277 + 278 + if (!cgroup->parent) { 279 + blkcg = &blkio_root_cgroup; 280 + goto done; 281 + } 282 + 283 + /* Currently we do not support a hierarchy deeper than two levels (0,1) */ 284 + parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); 285 + if (css_depth(&parent_blkcg->css) > 0) 286 + return ERR_PTR(-EINVAL); 287 + 288 + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 289 + if (!blkcg) 290 + return ERR_PTR(-ENOMEM); 291 + 292 + blkcg->weight = BLKIO_WEIGHT_DEFAULT; 293 + done: 294 + spin_lock_init(&blkcg->lock); 295 + INIT_HLIST_HEAD(&blkcg->blkg_list); 296 + 297 + return &blkcg->css; 298 + } 299 + 300 + /* 301 + * We cannot support shared io contexts, as we have no means to support 302 + * two tasks with the same ioc in two different groups without major rework 303 + * of the main cic data structures. For now we allow a task to change 304 + * its cgroup only if it's the only owner of its ioc.
305 + */ 306 + static int blkiocg_can_attach(struct cgroup_subsys *subsys, 307 + struct cgroup *cgroup, struct task_struct *tsk, 308 + bool threadgroup) 309 + { 310 + struct io_context *ioc; 311 + int ret = 0; 312 + 313 + /* task_lock() is needed to avoid races with exit_io_context() */ 314 + task_lock(tsk); 315 + ioc = tsk->io_context; 316 + if (ioc && atomic_read(&ioc->nr_tasks) > 1) 317 + ret = -EINVAL; 318 + task_unlock(tsk); 319 + 320 + return ret; 321 + } 322 + 323 + static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, 324 + struct cgroup *prev, struct task_struct *tsk, 325 + bool threadgroup) 326 + { 327 + struct io_context *ioc; 328 + 329 + task_lock(tsk); 330 + ioc = tsk->io_context; 331 + if (ioc) 332 + ioc->cgroup_changed = 1; 333 + task_unlock(tsk); 334 + } 335 + 336 + struct cgroup_subsys blkio_subsys = { 337 + .name = "blkio", 338 + .create = blkiocg_create, 339 + .can_attach = blkiocg_can_attach, 340 + .attach = blkiocg_attach, 341 + .destroy = blkiocg_destroy, 342 + .populate = blkiocg_populate, 343 + .subsys_id = blkio_subsys_id, 344 + .use_id = 1, 345 + }; 346 + 347 + void blkio_policy_register(struct blkio_policy_type *blkiop) 348 + { 349 + spin_lock(&blkio_list_lock); 350 + list_add_tail(&blkiop->list, &blkio_list); 351 + spin_unlock(&blkio_list_lock); 352 + } 353 + EXPORT_SYMBOL_GPL(blkio_policy_register); 354 + 355 + void blkio_policy_unregister(struct blkio_policy_type *blkiop) 356 + { 357 + spin_lock(&blkio_list_lock); 358 + list_del_init(&blkiop->list); 359 + spin_unlock(&blkio_list_lock); 360 + } 361 + EXPORT_SYMBOL_GPL(blkio_policy_unregister);
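For orientation, a minimal sketch of a policy module built on the registration interface above; the foo_* names are invented, and only blkio_policy_register()/blkio_policy_unregister() and the two ops declared in blk-cgroup.h are assumed. The EXPORT_SYMBOL_GPL markings above are what allow such a policy to live in a module:

#include <linux/module.h>
#include "blk-cgroup.h"

static void foo_unlink_group(void *key, struct blkio_group *blkg)
{
	/* drop this group from the policy's own bookkeeping */
}

static void foo_update_group_weight(struct blkio_group *blkg,
					unsigned int weight)
{
	/* propagate the new cgroup weight into the scheduler */
}

static struct blkio_policy_type blkio_policy_foo = {
	.ops = {
		.blkio_unlink_group_fn		= foo_unlink_group,
		.blkio_update_group_weight_fn	= foo_update_group_weight,
	},
};

static int __init foo_init(void)
{
	blkio_policy_register(&blkio_policy_foo);
	return 0;
}

static void __exit foo_exit(void)
{
	blkio_policy_unregister(&blkio_policy_foo);
}

module_init(foo_init);
module_exit(foo_exit);

Registration just links the policy into blkio_list under blkio_list_lock; blkiocg_weight_write() and blkiocg_destroy() then fan out to every registered policy through these two callbacks.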
+127
block/blk-cgroup.h
··· 1 + #ifndef _BLK_CGROUP_H 2 + #define _BLK_CGROUP_H 3 + /* 4 + * Common Block IO controller cgroup interface 5 + * 6 + * Based on ideas and code from CFQ, CFS and BFQ: 7 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 8 + * 9 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> 10 + * Paolo Valente <paolo.valente@unimore.it> 11 + * 12 + * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> 13 + * Nauman Rafique <nauman@google.com> 14 + */ 15 + 16 + #include <linux/cgroup.h> 17 + 18 + #ifdef CONFIG_BLK_CGROUP 19 + 20 + struct blkio_cgroup { 21 + struct cgroup_subsys_state css; 22 + unsigned int weight; 23 + spinlock_t lock; 24 + struct hlist_head blkg_list; 25 + }; 26 + 27 + struct blkio_group { 28 + /* An rcu protected unique identifier for the group */ 29 + void *key; 30 + struct hlist_node blkcg_node; 31 + unsigned short blkcg_id; 32 + #ifdef CONFIG_DEBUG_BLK_CGROUP 33 + /* Store cgroup path */ 34 + char path[128]; 35 + /* How many times this group has been removed from service tree */ 36 + unsigned long dequeue; 37 + #endif 38 + /* The device MKDEV(major, minor), this group has been created for */ 39 + dev_t dev; 40 + 41 + /* total disk time and nr sectors dispatched by this group */ 42 + unsigned long time; 43 + unsigned long sectors; 44 + }; 45 + 46 + extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg); 47 + extern void blkiocg_css_put(struct blkio_cgroup *blkcg); 48 + 49 + typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 50 + typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, 51 + unsigned int weight); 52 + 53 + struct blkio_policy_ops { 54 + blkio_unlink_group_fn *blkio_unlink_group_fn; 55 + blkio_update_group_weight_fn *blkio_update_group_weight_fn; 56 + }; 57 + 58 + struct blkio_policy_type { 59 + struct list_head list; 60 + struct blkio_policy_ops ops; 61 + }; 62 + 63 + /* Blkio controller policy registration */ 64 + extern void blkio_policy_register(struct blkio_policy_type *); 65 + extern void blkio_policy_unregister(struct blkio_policy_type *); 66 + 67 + #else 68 + 69 + struct blkio_group { 70 + }; 71 + 72 + struct blkio_policy_type { 73 + }; 74 + 75 + static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } 76 + static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } 77 + 78 + #endif 79 + 80 + #define BLKIO_WEIGHT_MIN 100 81 + #define BLKIO_WEIGHT_MAX 1000 82 + #define BLKIO_WEIGHT_DEFAULT 500 83 + 84 + #ifdef CONFIG_DEBUG_BLK_CGROUP 85 + static inline char *blkg_path(struct blkio_group *blkg) 86 + { 87 + return blkg->path; 88 + } 89 + void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, 90 + unsigned long dequeue); 91 + #else 92 + static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } 93 + static inline void blkiocg_update_blkio_group_dequeue_stats( 94 + struct blkio_group *blkg, unsigned long dequeue) {} 95 + #endif 96 + 97 + #ifdef CONFIG_BLK_CGROUP 98 + extern struct blkio_cgroup blkio_root_cgroup; 99 + extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); 100 + extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 101 + struct blkio_group *blkg, void *key, dev_t dev); 102 + extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 103 + extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 104 + void *key); 105 + void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 106 + unsigned long time, unsigned long sectors); 107 + #else 108 + struct cgroup; 109 + 
static inline struct blkio_cgroup * 110 + cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } 111 + 112 + static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 113 + struct blkio_group *blkg, void *key, dev_t dev) 114 + { 115 + } 116 + 117 + static inline int 118 + blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } 119 + 120 + static inline struct blkio_group * 121 + blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } 122 + static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 123 + unsigned long time, unsigned long sectors) 124 + { 125 + } 126 + #endif 127 + #endif /* _BLK_CGROUP_H */
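The lookup side of this header is RCU-based: blkiocg_lookup_group() walks blkcg->blkg_list under rcu_read_lock(), with the opaque key identifying the per-device owner. A minimal caller sketch, assuming only the declarations above (blkcg, key, time and sectors stand in for the caller's values):

struct blkio_group *blkg;

rcu_read_lock();
blkg = blkiocg_lookup_group(blkcg, key);	/* key: per-device pointer */
if (blkg)
	blkiocg_update_blkio_group_stats(blkg, time, sectors);
rcu_read_unlock();

The returned blkg is only guaranteed to stay valid inside the RCU read section.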
+19
block/blk-core.c
··· 2358 2358 rq->rq_disk = bio->bi_bdev->bd_disk; 2359 2359 } 2360 2360 2361 + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 2362 + /** 2363 + * rq_flush_dcache_pages - Helper function to flush all pages in a request 2364 + * @rq: the request to be flushed 2365 + * 2366 + * Description: 2367 + * Flush all pages in @rq. 2368 + */ 2369 + void rq_flush_dcache_pages(struct request *rq) 2370 + { 2371 + struct req_iterator iter; 2372 + struct bio_vec *bvec; 2373 + 2374 + rq_for_each_segment(bvec, rq, iter) 2375 + flush_dcache_page(bvec->bv_page); 2376 + } 2377 + EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); 2378 + #endif 2379 + 2361 2380 /** 2362 2381 * blk_lld_busy - Check if underlying low-level drivers of a device are busy 2363 2382 * @q : the queue of the device being checked
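A hedged sketch of the intended caller (the mydrv_* name is invented; only rq_flush_dcache_pages() and __blk_end_request_all() are existing interfaces): a PIO-style driver that has just filled the request's pages with data read from the device, on an architecture where ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is set:

static void mydrv_read_done(struct request *rq, int error)
{
	/*
	 * The driver wrote the data through the kernel mapping; flush
	 * the D-cache so user-space mappings of these pages see it too.
	 */
	rq_flush_dcache_pages(rq);
	__blk_end_request_all(rq, error);
}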
+6 -6
block/blk-ioc.c
··· 66 66 } 67 67 68 68 /* Called by the exiting task */ 69 - void exit_io_context(void) 69 + void exit_io_context(struct task_struct *task) 70 70 { 71 71 struct io_context *ioc; 72 72 73 - task_lock(current); 74 - ioc = current->io_context; 75 - current->io_context = NULL; 76 - task_unlock(current); 73 + task_lock(task); 74 + ioc = task->io_context; 75 + task->io_context = NULL; 76 + task_unlock(task); 77 77 78 78 if (atomic_dec_and_test(&ioc->nr_tasks)) { 79 79 if (ioc->aic && ioc->aic->exit) 80 80 ioc->aic->exit(ioc->aic); 81 81 cfq_exit(ioc); 82 82 83 - put_io_context(ioc); 84 83 } 84 + put_io_context(ioc); 85 85 } 86 86 87 87 struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
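Two things change in this hunk: the exiting task is passed in explicitly rather than read from current, and put_io_context() moves out of the conditional so the function's own reference is dropped on every path, not only when nr_tasks reaches zero. A sketch of the new calling convention (the wrapper is invented for illustration and is not part of this diff):

static void release_task_io(struct task_struct *tsk)
{
	if (tsk->io_context)
		exit_io_context(tsk);
}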
+40 -11
block/blk-settings.c
··· 8 8 #include <linux/blkdev.h> 9 9 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 10 10 #include <linux/gcd.h> 11 + #include <linux/jiffies.h> 11 12 12 13 #include "blk.h" 13 14 ··· 97 96 lim->max_segment_size = MAX_SEGMENT_SIZE; 98 97 lim->max_sectors = BLK_DEF_MAX_SECTORS; 99 98 lim->max_hw_sectors = INT_MAX; 100 - lim->max_discard_sectors = SAFE_MAX_SECTORS; 99 + lim->max_discard_sectors = 0; 100 + lim->discard_granularity = 0; 101 + lim->discard_alignment = 0; 102 + lim->discard_misaligned = 0; 103 + lim->discard_zeroes_data = -1; 101 104 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 102 105 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 103 106 lim->alignment_offset = 0; ··· 146 141 q->nr_batching = BLK_BATCH_REQ; 147 142 148 143 q->unplug_thresh = 4; /* hmm */ 149 - q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 144 + q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ 150 145 if (q->unplug_delay == 0) 151 146 q->unplug_delay = 1; 152 147 ··· 493 488 } 494 489 EXPORT_SYMBOL(blk_queue_stack_limits); 495 490 491 + static unsigned int lcm(unsigned int a, unsigned int b) 492 + { 493 + if (a && b) 494 + return (a * b) / gcd(a, b); 495 + else if (b) 496 + return b; 497 + 498 + return a; 499 + } 500 + 496 501 /** 497 502 * blk_stack_limits - adjust queue_limits for stacked devices 498 503 * @t: the stacking driver limits (top) ··· 517 502 int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, 518 503 sector_t offset) 519 504 { 505 + int ret; 506 + 507 + ret = 0; 508 + 520 509 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); 521 510 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); 522 511 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); ··· 545 526 546 527 t->io_min = max(t->io_min, b->io_min); 547 528 t->no_cluster |= b->no_cluster; 529 + t->discard_zeroes_data &= b->discard_zeroes_data; 548 530 549 531 /* Bottom device offset aligned? */ 550 532 if (offset && 551 533 (offset & (b->physical_block_size - 1)) != b->alignment_offset) { 552 534 t->misaligned = 1; 553 - return -1; 535 + ret = -1; 536 + } 537 + 538 + if (offset && 539 + (offset & (b->discard_granularity - 1)) != b->discard_alignment) { 540 + t->discard_misaligned = 1; 541 + ret = -1; 554 542 } 555 543 556 544 /* If top has no alignment offset, inherit from bottom */ ··· 565 539 t->alignment_offset = 566 540 b->alignment_offset & (b->physical_block_size - 1); 567 541 542 + if (!t->discard_alignment) 543 + t->discard_alignment = 544 + b->discard_alignment & (b->discard_granularity - 1); 545 + 568 546 /* Top device aligned on logical block boundary? */ 569 547 if (t->alignment_offset & (t->logical_block_size - 1)) { 570 548 t->misaligned = 1; 571 - return -1; 549 + ret = -1; 572 550 } 573 551 574 - /* Find lcm() of optimal I/O size */ 575 - if (t->io_opt && b->io_opt) 576 - t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt); 577 - else if (b->io_opt) 578 - t->io_opt = b->io_opt; 552 + /* Find lcm() of optimal I/O size and granularity */ 553 + t->io_opt = lcm(t->io_opt, b->io_opt); 554 + t->discard_granularity = lcm(t->discard_granularity, 555 + b->discard_granularity); 579 556 580 557 /* Verify that optimal I/O size is a multiple of io_min */ 581 558 if (t->io_min && t->io_opt % t->io_min) 582 - return -1; 559 + ret = -1; 583 560 584 - return 0; 561 + return ret; 585 562 } 586 563 EXPORT_SYMBOL(blk_stack_limits); 587 564
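A worked example of the new lcm() helper, with values chosen for illustration. Stacking io_opt limits of 64 KiB and 96 KiB: gcd(65536, 98304) = 32768, so lcm = 65536 * 98304 / 32768 = 196608 bytes (192 KiB). The zero handling is what makes it usable for stacking, since an unset limit simply inherits the other side:

unsigned int opt = lcm(65536, 98304);	/* 196608: aligned to both devices */
unsigned int inh = lcm(0, 4096);	/* 4096: unset limit inherits */

The same helper now stacks discard_granularity alongside io_opt.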
+33
block/blk-sysfs.c
··· 126 126 return queue_var_show(queue_io_opt(q), page); 127 127 } 128 128 129 + static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page) 130 + { 131 + return queue_var_show(q->limits.discard_granularity, page); 132 + } 133 + 134 + static ssize_t queue_discard_max_show(struct request_queue *q, char *page) 135 + { 136 + return queue_var_show(q->limits.max_discard_sectors << 9, page); 137 + } 138 + 139 + static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) 140 + { 141 + return queue_var_show(queue_discard_zeroes_data(q), page); 142 + } 143 + 129 144 static ssize_t 130 145 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 131 146 { ··· 308 293 .show = queue_io_opt_show, 309 294 }; 310 295 296 + static struct queue_sysfs_entry queue_discard_granularity_entry = { 297 + .attr = {.name = "discard_granularity", .mode = S_IRUGO }, 298 + .show = queue_discard_granularity_show, 299 + }; 300 + 301 + static struct queue_sysfs_entry queue_discard_max_entry = { 302 + .attr = {.name = "discard_max_bytes", .mode = S_IRUGO }, 303 + .show = queue_discard_max_show, 304 + }; 305 + 306 + static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { 307 + .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO }, 308 + .show = queue_discard_zeroes_data_show, 309 + }; 310 + 311 311 static struct queue_sysfs_entry queue_nonrot_entry = { 312 312 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, 313 313 .show = queue_nonrot_show, ··· 358 328 &queue_physical_block_size_entry.attr, 359 329 &queue_io_min_entry.attr, 360 330 &queue_io_opt_entry.attr, 331 + &queue_discard_granularity_entry.attr, 332 + &queue_discard_max_entry.attr, 333 + &queue_discard_zeroes_data_entry.attr, 361 334 &queue_nonrot_entry.attr, 362 335 &queue_nomerges_entry.attr, 363 336 &queue_rq_affinity_entry.attr,
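In queue_discard_max_show(), the << 9 converts the limit from 512-byte sectors into the bytes the sysfs file reports. A small worked example (the sector count is illustrative):

unsigned int max_discard_sectors = 65535;	/* illustrative value */
unsigned long long discard_max_bytes =
	(unsigned long long)max_discard_sectors << 9;	/* 33553920 bytes */

All three new files are read-only (S_IRUGO), like the io_opt attribute above.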
+2 -1
block/bsg.c
··· 15 15 #include <linux/blkdev.h> 16 16 #include <linux/poll.h> 17 17 #include <linux/cdev.h> 18 + #include <linux/jiffies.h> 18 19 #include <linux/percpu.h> 19 20 #include <linux/uio.h> 20 21 #include <linux/idr.h> ··· 198 197 rq->cmd_len = hdr->request_len; 199 198 rq->cmd_type = REQ_TYPE_BLOCK_PC; 200 199 201 - rq->timeout = (hdr->timeout * HZ) / 1000; 200 + rq->timeout = msecs_to_jiffies(hdr->timeout); 202 201 if (!rq->timeout) 203 202 rq->timeout = q->sg_timeout; 204 203 if (!rq->timeout)
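The switch to msecs_to_jiffies() is more than style: the open-coded (ms * HZ) / 1000 truncates, while msecs_to_jiffies() rounds up to the next jiffy. With HZ = 100 (one jiffy = 10 ms) and a 3 ms timeout from user space:

unsigned long t_old = (3 * 100) / 1000;		/* 0: timeout truncated away */
unsigned long t_new = msecs_to_jiffies(3);	/* 1: rounded up to one jiffy */

so small nonzero timeouts no longer collapse to zero and silently fall back to q->sg_timeout.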
+1338 -155
block/cfq-iosched.c
··· 9 9 #include <linux/module.h> 10 10 #include <linux/blkdev.h> 11 11 #include <linux/elevator.h> 12 + #include <linux/jiffies.h> 12 13 #include <linux/rbtree.h> 13 14 #include <linux/ioprio.h> 14 15 #include <linux/blktrace_api.h> 16 + #include "blk-cgroup.h" 15 17 16 18 /* 17 19 * tunables ··· 29 27 static int cfq_slice_async = HZ / 25; 30 28 static const int cfq_slice_async_rq = 2; 31 29 static int cfq_slice_idle = HZ / 125; 30 + static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ 31 + static const int cfq_hist_divisor = 4; 32 32 33 33 /* 34 34 * offset from end of service tree ··· 42 38 */ 43 39 #define CFQ_MIN_TT (2) 44 40 41 + /* 42 + * Allow merged cfqqs to perform this amount of seeky I/O before 43 + * deciding to break the queues up again. 44 + */ 45 + #define CFQQ_COOP_TOUT (HZ) 46 + 45 47 #define CFQ_SLICE_SCALE (5) 46 48 #define CFQ_HW_QUEUE_MIN (5) 49 + #define CFQ_SERVICE_SHIFT 12 47 50 48 51 #define RQ_CIC(rq) \ 49 52 ((struct cfq_io_context *) (rq)->elevator_private) ··· 68 57 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 69 58 70 59 #define sample_valid(samples) ((samples) > 80) 60 + #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 71 61 72 62 /* 73 63 * Most of our rbtree usage is for sorting with min extraction, so ··· 79 67 struct cfq_rb_root { 80 68 struct rb_root rb; 81 69 struct rb_node *left; 70 + unsigned count; 71 + u64 min_vdisktime; 72 + struct rb_node *active; 73 + unsigned total_weight; 82 74 }; 83 - #define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } 75 + #define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } 84 76 85 77 /* 86 78 * Per process-grouping structure ··· 115 99 /* fifo list of requests in sort_list */ 116 100 struct list_head fifo; 117 101 102 + /* time when queue got scheduled in to dispatch first request. */ 103 + unsigned long dispatch_start; 104 + unsigned int allocated_slice; 105 + /* time when first request from queue completed and slice started. */ 106 + unsigned long slice_start; 118 107 unsigned long slice_end; 119 108 long slice_resid; 120 109 unsigned int slice_dispatch; ··· 133 112 unsigned short ioprio, org_ioprio; 134 113 unsigned short ioprio_class, org_ioprio_class; 135 114 115 + unsigned int seek_samples; 116 + u64 seek_total; 117 + sector_t seek_mean; 118 + sector_t last_request_pos; 119 + unsigned long seeky_start; 120 + 136 121 pid_t pid; 122 + 123 + struct cfq_rb_root *service_tree; 124 + struct cfq_queue *new_cfqq; 125 + struct cfq_group *cfqg; 126 + struct cfq_group *orig_cfqg; 127 + /* Sectors dispatched in current dispatch round */ 128 + unsigned long nr_sectors; 129 + }; 130 + 131 + /* 132 + * First index in the service_trees. 133 + * IDLE is handled separately, so it has its own service tree 134 + */ 135 + enum wl_prio_t { 136 + BE_WORKLOAD = 0, 137 + RT_WORKLOAD = 1, 138 + IDLE_WORKLOAD = 2, 139 + }; 140 + 141 + /* 142 + * Second index in the service_trees. 143 + */ 144 + enum wl_type_t { 145 + ASYNC_WORKLOAD = 0, 146 + SYNC_NOIDLE_WORKLOAD = 1, 147 + SYNC_WORKLOAD = 2 148 + }; 149 + 150 + /* This is the per-cgroup, per-device grouping structure */ 151 + struct cfq_group { 152 + /* group service_tree member */ 153 + struct rb_node rb_node; 154 + 155 + /* group service_tree key */ 156 + u64 vdisktime; 157 + unsigned int weight; 158 + bool on_st; 159 + 160 + /* number of cfqq currently on this group */ 161 + int nr_cfqq; 162 + 163 + /* Per group busy queues average. Useful for workload slice calc.
*/ 164 + unsigned int busy_queues_avg[2]; 165 + /* 166 + * rr lists of queues with requests, one rr for each priority class. 167 + * Counts are embedded in the cfq_rb_root 168 + */ 169 + struct cfq_rb_root service_trees[2][3]; 170 + struct cfq_rb_root service_tree_idle; 171 + 172 + unsigned long saved_workload_slice; 173 + enum wl_type_t saved_workload; 174 + enum wl_prio_t saved_serving_prio; 175 + struct blkio_group blkg; 176 + #ifdef CONFIG_CFQ_GROUP_IOSCHED 177 + struct hlist_node cfqd_node; 178 + atomic_t ref; 179 + #endif 137 180 }; 138 181 139 182 /* ··· 205 120 */ 206 121 struct cfq_data { 207 122 struct request_queue *queue; 123 + /* Root service tree for cfq_groups */ 124 + struct cfq_rb_root grp_service_tree; 125 + struct cfq_group root_group; 126 + /* Number of active cfq groups on group service tree */ 127 + int nr_groups; 208 128 209 129 /* 210 - * rr list of queues with requests and the count of them 130 + * The priority currently being served 211 131 */ 212 - struct cfq_rb_root service_tree; 132 + enum wl_prio_t serving_prio; 133 + enum wl_type_t serving_type; 134 + unsigned long workload_expires; 135 + struct cfq_group *serving_group; 136 + bool noidle_tree_requires_idle; 213 137 214 138 /* 215 139 * Each priority tree is sorted by next_request position. These ··· 237 143 */ 238 144 int rq_queued; 239 145 int hw_tag; 240 - int hw_tag_samples; 241 - int rq_in_driver_peak; 146 + /* 147 + * hw_tag can be 148 + * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection) 149 + * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) 150 + * 0 => no NCQ 151 + */ 152 + int hw_tag_est_depth; 153 + unsigned int hw_tag_samples; 242 154 243 155 /* 244 156 * idle window management ··· 274 174 unsigned int cfq_slice_async_rq; 275 175 unsigned int cfq_slice_idle; 276 176 unsigned int cfq_latency; 177 + unsigned int cfq_group_isolation; 277 178 278 179 struct list_head cic_list; 279 180 ··· 284 183 struct cfq_queue oom_cfqq; 285 184 286 185 unsigned long last_end_sync_rq; 186 + 187 + /* List of cfq groups being managed on this device */ 188 + struct hlist_head cfqg_list; 189 + struct rcu_head rcu; 287 190 }; 191 + 192 + static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 193 + 194 + static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, 195 + enum wl_prio_t prio, 196 + enum wl_type_t type, 197 + struct cfq_data *cfqd) 198 + { 199 + if (!cfqg) 200 + return NULL; 201 + 202 + if (prio == IDLE_WORKLOAD) 203 + return &cfqg->service_tree_idle; 204 + 205 + return &cfqg->service_trees[prio][type]; 206 + } 288 207 289 208 enum cfqq_state_flags { 290 209 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ ··· 316 195 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 317 196 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 318 197 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 319 - CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ 320 - CFQ_CFQQ_FLAG_coop_preempt, /* coop preempt */ 198 + CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ 199 + CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ 200 + CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ 201 + CFQ_CFQQ_FLAG_wait_busy_done, /* Got new request.
Expire the queue */ 321 202 }; 322 203 323 204 #define CFQ_CFQQ_FNS(name) \ ··· 346 223 CFQ_CFQQ_FNS(slice_new); 347 224 CFQ_CFQQ_FNS(sync); 348 225 CFQ_CFQQ_FNS(coop); 349 - CFQ_CFQQ_FNS(coop_preempt); 226 + CFQ_CFQQ_FNS(deep); 227 + CFQ_CFQQ_FNS(wait_busy); 228 + CFQ_CFQQ_FNS(wait_busy_done); 350 229 #undef CFQ_CFQQ_FNS 351 230 231 + #ifdef CONFIG_DEBUG_CFQ_IOSCHED 232 + #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 233 + blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 234 + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 235 + blkg_path(&(cfqq)->cfqg->blkg), ##args); 236 + 237 + #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 238 + blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 239 + blkg_path(&(cfqg)->blkg), ##args); \ 240 + 241 + #else 352 242 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 353 243 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 244 + #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); 245 + #endif 354 246 #define cfq_log(cfqd, fmt, args...) \ 355 247 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 248 + 249 + /* Traverses through cfq group service trees */ 250 + #define for_each_cfqg_st(cfqg, i, j, st) \ 251 + for (i = 0; i <= IDLE_WORKLOAD; i++) \ 252 + for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ 253 + : &cfqg->service_tree_idle; \ 254 + (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ 255 + (i == IDLE_WORKLOAD && j == 0); \ 256 + j++, st = i < IDLE_WORKLOAD ? \ 257 + &cfqg->service_trees[i][j]: NULL) \ 258 + 259 + 260 + static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 261 + { 262 + if (cfq_class_idle(cfqq)) 263 + return IDLE_WORKLOAD; 264 + if (cfq_class_rt(cfqq)) 265 + return RT_WORKLOAD; 266 + return BE_WORKLOAD; 267 + } 268 + 269 + 270 + static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) 271 + { 272 + if (!cfq_cfqq_sync(cfqq)) 273 + return ASYNC_WORKLOAD; 274 + if (!cfq_cfqq_idle_window(cfqq)) 275 + return SYNC_NOIDLE_WORKLOAD; 276 + return SYNC_WORKLOAD; 277 + } 278 + 279 + static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, 280 + struct cfq_data *cfqd, 281 + struct cfq_group *cfqg) 282 + { 283 + if (wl == IDLE_WORKLOAD) 284 + return cfqg->service_tree_idle.count; 285 + 286 + return cfqg->service_trees[wl][ASYNC_WORKLOAD].count 287 + + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count 288 + + cfqg->service_trees[wl][SYNC_WORKLOAD].count; 289 + } 290 + 291 + static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 292 + struct cfq_group *cfqg) 293 + { 294 + return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count 295 + + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 296 + } 356 297 357 298 static void cfq_dispatch_insert(struct request_queue *, struct request *); 358 299 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, ··· 466 279 { 467 280 struct cfq_data *cfqd = q->elevator->elevator_data; 468 281 469 - return !cfqd->busy_queues; 282 + return !cfqd->rq_queued; 470 283 } 471 284 472 285 /* ··· 490 303 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 491 304 } 492 305 306 + static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) 307 + { 308 + u64 d = delta << CFQ_SERVICE_SHIFT; 309 + 310 + d = d * BLKIO_WEIGHT_DEFAULT; 311 + do_div(d, cfqg->weight); 312 + return d; 313 + } 314 + 315 + static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 316 + { 317 + s64 delta = (s64)(vdisktime - min_vdisktime); 318 + if (delta > 0) 319 + min_vdisktime = vdisktime; 320 + 321 + return 
min_vdisktime; 322 + } 323 + 324 + static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) 325 + { 326 + s64 delta = (s64)(vdisktime - min_vdisktime); 327 + if (delta < 0) 328 + min_vdisktime = vdisktime; 329 + 330 + return min_vdisktime; 331 + } 332 + 333 + static void update_min_vdisktime(struct cfq_rb_root *st) 334 + { 335 + u64 vdisktime = st->min_vdisktime; 336 + struct cfq_group *cfqg; 337 + 338 + if (st->active) { 339 + cfqg = rb_entry_cfqg(st->active); 340 + vdisktime = cfqg->vdisktime; 341 + } 342 + 343 + if (st->left) { 344 + cfqg = rb_entry_cfqg(st->left); 345 + vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 346 + } 347 + 348 + st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); 349 + } 350 + 351 + /* 352 + * get averaged number of queues of RT/BE priority. 353 + * average is updated, with a formula that gives more weight to higher numbers, 354 + * to quickly follow sudden increases and decrease slowly 355 + */ 356 + 357 + static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, 358 + struct cfq_group *cfqg, bool rt) 359 + { 360 + unsigned min_q, max_q; 361 + unsigned mult = cfq_hist_divisor - 1; 362 + unsigned round = cfq_hist_divisor / 2; 363 + unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); 364 + 365 + min_q = min(cfqg->busy_queues_avg[rt], busy); 366 + max_q = max(cfqg->busy_queues_avg[rt], busy); 367 + cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / 368 + cfq_hist_divisor; 369 + return cfqg->busy_queues_avg[rt]; 370 + } 371 + 372 + static inline unsigned 373 + cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) 374 + { 375 + struct cfq_rb_root *st = &cfqd->grp_service_tree; 376 + 377 + return cfq_target_latency * cfqg->weight / st->total_weight; 378 + } 379 + 493 380 static inline void 494 381 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 495 382 { 496 - cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; 383 + unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 384 + if (cfqd->cfq_latency) { 385 + /* 386 + * interested queues (we consider only the ones with the same 387 + * priority class in the cfq group) 388 + */ 389 + unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, 390 + cfq_class_rt(cfqq)); 391 + unsigned sync_slice = cfqd->cfq_slice[1]; 392 + unsigned expect_latency = sync_slice * iq; 393 + unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); 394 + 395 + if (expect_latency > group_slice) { 396 + unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; 397 + /* scale low_slice according to IO priority 398 + * and sync vs async */ 399 + unsigned low_slice = 400 + min(slice, base_low_slice * slice / sync_slice); 401 + /* the adapted slice value is scaled to fit all iqs 402 + * into the target latency */ 403 + slice = max(slice * group_slice / expect_latency, 404 + low_slice); 405 + } 406 + } 407 + cfqq->slice_start = jiffies; 408 + cfqq->slice_end = jiffies + slice; 409 + cfqq->allocated_slice = slice; 497 410 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 498 411 } 499 412 ··· 618 331 * behind the head is penalized and only allowed to a certain extent.
619 332 */ 620 333 static struct request * 621 - cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) 334 + cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) 622 335 { 623 - sector_t last, s1, s2, d1 = 0, d2 = 0; 336 + sector_t s1, s2, d1 = 0, d2 = 0; 624 337 unsigned long back_max; 625 338 #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 626 339 #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ ··· 642 355 643 356 s1 = blk_rq_pos(rq1); 644 357 s2 = blk_rq_pos(rq2); 645 - 646 - last = cfqd->last_position; 647 358 648 359 /* 649 360 * by definition, 1KiB is 2 sectors ··· 710 425 */ 711 426 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 712 427 { 428 + /* Service tree is empty */ 429 + if (!root->count) 430 + return NULL; 431 + 713 432 if (!root->left) 714 433 root->left = rb_first(&root->rb); 715 434 716 435 if (root->left) 717 436 return rb_entry(root->left, struct cfq_queue, rb_node); 437 + 438 + return NULL; 439 + } 440 + 441 + static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) 442 + { 443 + if (!root->left) 444 + root->left = rb_first(&root->rb); 445 + 446 + if (root->left) 447 + return rb_entry_cfqg(root->left); 718 448 719 449 return NULL; 720 450 } ··· 745 445 if (root->left == n) 746 446 root->left = NULL; 747 447 rb_erase_init(n, &root->rb); 448 + --root->count; 748 449 } 749 450 750 451 /* ··· 772 471 next = rb_entry_rq(rbnext); 773 472 } 774 473 775 - return cfq_choose_req(cfqd, next, prev); 474 + return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); 776 475 } 777 476 778 477 static unsigned long cfq_slice_offset(struct cfq_data *cfqd, ··· 781 480 /* 782 481 * just an approximation, should be ok. 783 482 */ 784 - return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - 483 + return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - 785 484 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 786 485 } 787 486 487 + static inline s64 488 + cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) 489 + { 490 + return cfqg->vdisktime - st->min_vdisktime; 491 + } 492 + 493 + static void 494 + __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 495 + { 496 + struct rb_node **node = &st->rb.rb_node; 497 + struct rb_node *parent = NULL; 498 + struct cfq_group *__cfqg; 499 + s64 key = cfqg_key(st, cfqg); 500 + int left = 1; 501 + 502 + while (*node != NULL) { 503 + parent = *node; 504 + __cfqg = rb_entry_cfqg(parent); 505 + 506 + if (key < cfqg_key(st, __cfqg)) 507 + node = &parent->rb_left; 508 + else { 509 + node = &parent->rb_right; 510 + left = 0; 511 + } 512 + } 513 + 514 + if (left) 515 + st->left = &cfqg->rb_node; 516 + 517 + rb_link_node(&cfqg->rb_node, parent, node); 518 + rb_insert_color(&cfqg->rb_node, &st->rb); 519 + } 520 + 521 + static void 522 + cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 523 + { 524 + struct cfq_rb_root *st = &cfqd->grp_service_tree; 525 + struct cfq_group *__cfqg; 526 + struct rb_node *n; 527 + 528 + cfqg->nr_cfqq++; 529 + if (cfqg->on_st) 530 + return; 531 + 532 + /* 533 + * Currently put the group at the end. Later implement something 534 + * so that groups get lesser vtime based on their weights, so that 535 + * a group does not lose everything if it was not continuously backlogged.
536 + */ 537 + n = rb_last(&st->rb); 538 + if (n) { 539 + __cfqg = rb_entry_cfqg(n); 540 + cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 541 + } else 542 + cfqg->vdisktime = st->min_vdisktime; 543 + 544 + __cfq_group_service_tree_add(st, cfqg); 545 + cfqg->on_st = true; 546 + cfqd->nr_groups++; 547 + st->total_weight += cfqg->weight; 548 + } 549 + 550 + static void 551 + cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 552 + { 553 + struct cfq_rb_root *st = &cfqd->grp_service_tree; 554 + 555 + if (st->active == &cfqg->rb_node) 556 + st->active = NULL; 557 + 558 + BUG_ON(cfqg->nr_cfqq < 1); 559 + cfqg->nr_cfqq--; 560 + 561 + /* If there are other cfq queues under this group, don't delete it */ 562 + if (cfqg->nr_cfqq) 563 + return; 564 + 565 + cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 566 + cfqg->on_st = false; 567 + cfqd->nr_groups--; 568 + st->total_weight -= cfqg->weight; 569 + if (!RB_EMPTY_NODE(&cfqg->rb_node)) 570 + cfq_rb_erase(&cfqg->rb_node, st); 571 + cfqg->saved_workload_slice = 0; 572 + blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); 573 + } 574 + 575 + static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 576 + { 577 + unsigned int slice_used; 578 + 579 + /* 580 + * Queue got expired before even a single request completed or 581 + * got expired immediately after first request completion. 582 + */ 583 + if (!cfqq->slice_start || cfqq->slice_start == jiffies) { 584 + /* 585 + * Also charge the seek time incurred to the group, otherwise 586 + * if there are multiple queues in the group, each can dispatch 587 + * a single request on seeky media and cause lots of seek time 588 + * and the group will never know it. 589 + */ 590 + slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), 591 + 1); 592 + } else { 593 + slice_used = jiffies - cfqq->slice_start; 594 + if (slice_used > cfqq->allocated_slice) 595 + slice_used = cfqq->allocated_slice; 596 + } 597 + 598 + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, 599 + cfqq->nr_sectors); 600 + return slice_used; 601 + } 602 + 603 + static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, 604 + struct cfq_queue *cfqq) 605 + { 606 + struct cfq_rb_root *st = &cfqd->grp_service_tree; 607 + unsigned int used_sl, charge_sl; 608 + int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 609 + - cfqg->service_tree_idle.count; 610 + 611 + BUG_ON(nr_sync < 0); 612 + used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); 613 + 614 + if (!cfq_cfqq_sync(cfqq) && !nr_sync) 615 + charge_sl = cfqq->allocated_slice; 616 + 617 + /* Can't update vdisktime while group is on service tree */ 618 + cfq_rb_erase(&cfqg->rb_node, st); 619 + cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); 620 + __cfq_group_service_tree_add(st, cfqg); 621 + 622 + /* This group is being expired.
Save the context */ 623 + if (time_after(cfqd->workload_expires, jiffies)) { 624 + cfqg->saved_workload_slice = cfqd->workload_expires 625 + - jiffies; 626 + cfqg->saved_workload = cfqd->serving_type; 627 + cfqg->saved_serving_prio = cfqd->serving_prio; 628 + } else 629 + cfqg->saved_workload_slice = 0; 630 + 631 + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 632 + st->min_vdisktime); 633 + blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, 634 + cfqq->nr_sectors); 635 + } 636 + 637 + #ifdef CONFIG_CFQ_GROUP_IOSCHED 638 + static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) 639 + { 640 + if (blkg) 641 + return container_of(blkg, struct cfq_group, blkg); 642 + return NULL; 643 + } 644 + 645 + void 646 + cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 647 + { 648 + cfqg_of_blkg(blkg)->weight = weight; 649 + } 650 + 651 + static struct cfq_group * 652 + cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) 653 + { 654 + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); 655 + struct cfq_group *cfqg = NULL; 656 + void *key = cfqd; 657 + int i, j; 658 + struct cfq_rb_root *st; 659 + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; 660 + unsigned int major, minor; 661 + 662 + /* Do we need to take this reference? */ 663 + if (!blkiocg_css_tryget(blkcg)) 664 + return NULL; 665 + 666 + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); 667 + if (cfqg || !create) 668 + goto done; 669 + 670 + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); 671 + if (!cfqg) 672 + goto done; 673 + 674 + cfqg->weight = blkcg->weight; 675 + for_each_cfqg_st(cfqg, i, j, st) 676 + *st = CFQ_RB_ROOT; 677 + RB_CLEAR_NODE(&cfqg->rb_node); 678 + 679 + /* 680 + * Take the initial reference that will be released on destroy. 681 + * This can be thought of as a joint reference by cgroup and 682 + * elevator, which will be dropped by either elevator exit 683 + * or the cgroup deletion path, depending on who exits first. 684 + */ 685 + atomic_set(&cfqg->ref, 1); 686 + 687 + /* Add group onto cgroup list */ 688 + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 689 + blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 690 + MKDEV(major, minor)); 691 + 692 + /* Add group on cfqd list */ 693 + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 694 + 695 + done: 696 + blkiocg_css_put(blkcg); 697 + return cfqg; 698 + } 699 + 788 700 /* 789 - * The cfqd->service_tree holds all pending cfq_queue's that have 701 + * Search for the cfq group the current task belongs to. If create = 1, then also 702 + * create the cfq group if it does not exist. request_queue lock must be held.
703 + */ 704 + static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 705 + { 706 + struct cgroup *cgroup; 707 + struct cfq_group *cfqg = NULL; 708 + 709 + rcu_read_lock(); 710 + cgroup = task_cgroup(current, blkio_subsys_id); 711 + cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); 712 + if (!cfqg && create) 713 + cfqg = &cfqd->root_group; 714 + rcu_read_unlock(); 715 + return cfqg; 716 + } 717 + 718 + static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 719 + { 720 + /* Currently, all async queues are mapped to root group */ 721 + if (!cfq_cfqq_sync(cfqq)) 722 + cfqg = &cfqq->cfqd->root_group; 723 + 724 + cfqq->cfqg = cfqg; 725 + /* cfqq reference on cfqg */ 726 + atomic_inc(&cfqq->cfqg->ref); 727 + } 728 + 729 + static void cfq_put_cfqg(struct cfq_group *cfqg) 730 + { 731 + struct cfq_rb_root *st; 732 + int i, j; 733 + 734 + BUG_ON(atomic_read(&cfqg->ref) <= 0); 735 + if (!atomic_dec_and_test(&cfqg->ref)) 736 + return; 737 + for_each_cfqg_st(cfqg, i, j, st) 738 + BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); 739 + kfree(cfqg); 740 + } 741 + 742 + static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) 743 + { 744 + /* Something wrong if we are trying to remove same group twice */ 745 + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); 746 + 747 + hlist_del_init(&cfqg->cfqd_node); 748 + 749 + /* 750 + * Put the reference taken at the time of creation so that when all 751 + * queues are gone, group can be destroyed. 752 + */ 753 + cfq_put_cfqg(cfqg); 754 + } 755 + 756 + static void cfq_release_cfq_groups(struct cfq_data *cfqd) 757 + { 758 + struct hlist_node *pos, *n; 759 + struct cfq_group *cfqg; 760 + 761 + hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { 762 + /* 763 + * If cgroup removal path got to blk_group first and removed 764 + * it from cgroup list, then it will take care of destroying 765 + * cfqg also. 766 + */ 767 + if (!blkiocg_del_blkio_group(&cfqg->blkg)) 768 + cfq_destroy_cfqg(cfqd, cfqg); 769 + } 770 + } 771 + 772 + /* 773 + * Blk cgroup controller notification saying that blkio_group object is being 774 + * delinked as the associated cgroup object is going away. That also means that 775 + * no new IO will come into this group. So get rid of this group as soon as 776 + * any pending IO in the group is finished. 777 + * 778 + * This function is called under rcu_read_lock(). key is the rcu protected 779 + * pointer. That means "key" is a valid cfq_data pointer as long as we hold the 780 + * rcu read lock. 781 + * 782 + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means 783 + * it should not be NULL as even if the elevator was exiting, the cgroup 784 + * deletion path got to it first.
785 + */ 786 + void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) 787 + { 788 + unsigned long flags; 789 + struct cfq_data *cfqd = key; 790 + 791 + spin_lock_irqsave(cfqd->queue->queue_lock, flags); 792 + cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); 793 + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 794 + } 795 + 796 + #else /* GROUP_IOSCHED */ 797 + static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 798 + { 799 + return &cfqd->root_group; 800 + } 801 + static inline void 802 + cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 803 + cfqq->cfqg = cfqg; 804 + } 805 + 806 + static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} 807 + static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} 808 + 809 + #endif /* GROUP_IOSCHED */ 810 + 811 + /* 812 + * The cfqd->service_trees holds all pending cfq_queue's that have 790 813 * requests waiting to be processed. It is sorted in the order that 791 814 * we will service the queues. 792 815 */ ··· 1120 495 struct rb_node **p, *parent; 1121 496 struct cfq_queue *__cfqq; 1122 497 unsigned long rb_key; 498 + struct cfq_rb_root *service_tree; 1123 499 int left; 500 + int new_cfqq = 1; 501 + int group_changed = 0; 1124 502 503 + #ifdef CONFIG_CFQ_GROUP_IOSCHED 504 + if (!cfqd->cfq_group_isolation 505 + && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD 506 + && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { 507 + /* Move this cfq to root group */ 508 + cfq_log_cfqq(cfqd, cfqq, "moving to root group"); 509 + if (!RB_EMPTY_NODE(&cfqq->rb_node)) 510 + cfq_group_service_tree_del(cfqd, cfqq->cfqg); 511 + cfqq->orig_cfqg = cfqq->cfqg; 512 + cfqq->cfqg = &cfqd->root_group; 513 + atomic_inc(&cfqd->root_group.ref); 514 + group_changed = 1; 515 + } else if (!cfqd->cfq_group_isolation 516 + && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { 517 + /* cfqq is sequential now needs to go to its original group */ 518 + BUG_ON(cfqq->cfqg != &cfqd->root_group); 519 + if (!RB_EMPTY_NODE(&cfqq->rb_node)) 520 + cfq_group_service_tree_del(cfqd, cfqq->cfqg); 521 + cfq_put_cfqg(cfqq->cfqg); 522 + cfqq->cfqg = cfqq->orig_cfqg; 523 + cfqq->orig_cfqg = NULL; 524 + group_changed = 1; 525 + cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); 526 + } 527 + #endif 528 + 529 + service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 530 + cfqq_type(cfqq), cfqd); 1125 531 if (cfq_class_idle(cfqq)) { 1126 532 rb_key = CFQ_IDLE_DELAY; 1127 - parent = rb_last(&cfqd->service_tree.rb); 533 + parent = rb_last(&service_tree->rb); 1128 534 if (parent && parent != &cfqq->rb_node) { 1129 535 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1130 536 rb_key += __cfqq->rb_key; ··· 1173 517 cfqq->slice_resid = 0; 1174 518 } else { 1175 519 rb_key = -HZ; 1176 - __cfqq = cfq_rb_first(&cfqd->service_tree); 520 + __cfqq = cfq_rb_first(service_tree); 1177 521 rb_key += __cfqq ? 
__cfqq->rb_key : jiffies; 1178 522 } 1179 523 1180 524 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 525 + new_cfqq = 0; 1181 526 /* 1182 527 * same position, nothing more to do 1183 528 */ 1184 - if (rb_key == cfqq->rb_key) 529 + if (rb_key == cfqq->rb_key && 530 + cfqq->service_tree == service_tree) 1185 531 return; 1186 532 1187 - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 533 + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 534 + cfqq->service_tree = NULL; 1188 535 } 1189 536 1190 537 left = 1; 1191 538 parent = NULL; 1192 - p = &cfqd->service_tree.rb.rb_node; 539 + cfqq->service_tree = service_tree; 540 + p = &service_tree->rb.rb_node; 1193 541 while (*p) { 1194 542 struct rb_node **n; 1195 543 ··· 1201 541 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1202 542 1203 543 /* 1204 - * sort RT queues first, we always want to give 1205 - * preference to them. IDLE queues goes to the back. 1206 - * after that, sort on the next service time. 544 + * sort by key, that represents service time. 1207 545 */ 1208 - if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) 546 + if (time_before(rb_key, __cfqq->rb_key)) 1209 547 n = &(*p)->rb_left; 1210 - else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) 548 + else { 1211 549 n = &(*p)->rb_right; 1212 - else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) 1213 - n = &(*p)->rb_left; 1214 - else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) 1215 - n = &(*p)->rb_right; 1216 - else if (time_before(rb_key, __cfqq->rb_key)) 1217 - n = &(*p)->rb_left; 1218 - else 1219 - n = &(*p)->rb_right; 1220 - 1221 - if (n == &(*p)->rb_right) 1222 550 left = 0; 551 + } 1223 552 1224 553 p = n; 1225 554 } 1226 555 1227 556 if (left) 1228 - cfqd->service_tree.left = &cfqq->rb_node; 557 + service_tree->left = &cfqq->rb_node; 1229 558 1230 559 cfqq->rb_key = rb_key; 1231 560 rb_link_node(&cfqq->rb_node, parent, p); 1232 - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); 561 + rb_insert_color(&cfqq->rb_node, &service_tree->rb); 562 + service_tree->count++; 563 + if ((add_front || !new_cfqq) && !group_changed) 564 + return; 565 + cfq_group_service_tree_add(cfqd, cfqq->cfqg); 1233 566 } 1234 567 1235 568 static struct cfq_queue * ··· 1324 671 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 1325 672 cfq_clear_cfqq_on_rr(cfqq); 1326 673 1327 - if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1328 - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 674 + if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 675 + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 676 + cfqq->service_tree = NULL; 677 + } 1329 678 if (cfqq->p_root) { 1330 679 rb_erase(&cfqq->p_node, cfqq->p_root); 1331 680 cfqq->p_root = NULL; 1332 681 } 1333 682 683 + cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1334 684 BUG_ON(!cfqd->busy_queues); 1335 685 cfqd->busy_queues--; 1336 686 } ··· 1344 688 static void cfq_del_rq_rb(struct request *rq) 1345 689 { 1346 690 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1347 - struct cfq_data *cfqd = cfqq->cfqd; 1348 691 const int sync = rq_is_sync(rq); 1349 692 1350 693 BUG_ON(!cfqq->queued[sync]); ··· 1351 696 1352 697 elv_rb_del(&cfqq->sort_list, rq); 1353 698 1354 - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 1355 - cfq_del_cfqq_rr(cfqd, cfqq); 699 + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { 700 + /* 701 + * Queue will be deleted from service tree when we actually 702 + * expire it later. Right now just remove it from prio tree 703 + * as it is empty. 
704 + */ 705 + if (cfqq->p_root) { 706 + rb_erase(&cfqq->p_node, cfqq->p_root); 707 + cfqq->p_root = NULL; 708 + } 709 + } 1356 710 } 1357 711 1358 712 static void cfq_add_rq_rb(struct request *rq) ··· 1386 722 * check if this request is a better next-serve candidate 1387 723 */ 1388 724 prev = cfqq->next_rq; 1389 - cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 725 + cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); 1390 726 1391 727 /* 1392 728 * adjust priority tree position, if ->next_rq changes ··· 1493 829 cfq_merged_requests(struct request_queue *q, struct request *rq, 1494 830 struct request *next) 1495 831 { 832 + struct cfq_queue *cfqq = RQ_CFQQ(rq); 1496 833 /* 1497 834 * reposition in fifo if next is older than rq 1498 835 */ ··· 1503 838 rq_set_fifo_time(rq, rq_fifo_time(next)); 1504 839 } 1505 840 841 + if (cfqq->next_rq == next) 842 + cfqq->next_rq = rq; 1506 843 cfq_remove_request(next); 1507 844 } 1508 845 ··· 1515 848 struct cfq_io_context *cic; 1516 849 struct cfq_queue *cfqq; 1517 850 851 + /* Deny merge if bio and rq don't belong to same cfq group */ 852 + if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0)) 853 + return false; 1518 854 /* 1519 855 * Disallow merge of a sync bio into an async request. 1520 856 */ ··· 1541 871 { 1542 872 if (cfqq) { 1543 873 cfq_log_cfqq(cfqd, cfqq, "set_active"); 874 + cfqq->slice_start = 0; 875 + cfqq->dispatch_start = jiffies; 876 + cfqq->allocated_slice = 0; 1544 877 cfqq->slice_end = 0; 1545 878 cfqq->slice_dispatch = 0; 879 + cfqq->nr_sectors = 0; 1546 880 1547 881 cfq_clear_cfqq_wait_request(cfqq); 1548 882 cfq_clear_cfqq_must_dispatch(cfqq); ··· 1573 899 del_timer(&cfqd->idle_slice_timer); 1574 900 1575 901 cfq_clear_cfqq_wait_request(cfqq); 902 + cfq_clear_cfqq_wait_busy(cfqq); 903 + cfq_clear_cfqq_wait_busy_done(cfqq); 1576 904 1577 905 /* 1578 906 * store what was left of this slice, if the queue idled/timed out ··· 1584 908 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1585 909 } 1586 910 911 + cfq_group_served(cfqd, cfqq->cfqg, cfqq); 912 + 913 + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 914 + cfq_del_cfqq_rr(cfqd, cfqq); 915 + 1587 916 cfq_resort_rr_list(cfqd, cfqq); 1588 917 1589 918 if (cfqq == cfqd->active_queue) 1590 919 cfqd->active_queue = NULL; 920 + 921 + if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) 922 + cfqd->grp_service_tree.active = NULL; 1591 923 1592 924 if (cfqd->active_cic) { 1593 925 put_io_context(cfqd->active_cic->ioc); ··· 1617 933 */ 1618 934 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 1619 935 { 1620 - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) 936 + struct cfq_rb_root *service_tree = 937 + service_tree_for(cfqd->serving_group, cfqd->serving_prio, 938 + cfqd->serving_type, cfqd); 939 + 940 + if (!cfqd->rq_queued) 1621 941 return NULL; 1622 942 1623 - return cfq_rb_first(&cfqd->service_tree); 943 + /* There is nothing to dispatch */ 944 + if (!service_tree) 945 + return NULL; 946 + if (RB_EMPTY_ROOT(&service_tree->rb)) 947 + return NULL; 948 + return cfq_rb_first(service_tree); 949 + } 950 + 951 + static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) 952 + { 953 + struct cfq_group *cfqg; 954 + struct cfq_queue *cfqq; 955 + int i, j; 956 + struct cfq_rb_root *st; 957 + 958 + if (!cfqd->rq_queued) 959 + return NULL; 960 + 961 + cfqg = cfq_get_next_cfqg(cfqd); 962 + if (!cfqg) 963 + return NULL; 964 + 965 + for_each_cfqg_st(cfqg, i, j, st) 966 + if ((cfqq = cfq_rb_first(st)) != NULL) 
967 + return cfqq; 968 + return NULL; 1624 969 } 1625 970 1626 971 /* ··· 1658 945 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 1659 946 struct cfq_queue *cfqq) 1660 947 { 1661 - if (!cfqq) { 948 + if (!cfqq) 1662 949 cfqq = cfq_get_next_queue(cfqd); 1663 - if (cfqq && !cfq_cfqq_coop_preempt(cfqq)) 1664 - cfq_clear_cfqq_coop(cfqq); 1665 - } 1666 - 1667 - if (cfqq) 1668 - cfq_clear_cfqq_coop_preempt(cfqq); 1669 950 1670 951 __cfq_set_active_queue(cfqd, cfqq); 1671 952 return cfqq; ··· 1674 967 return cfqd->last_position - blk_rq_pos(rq); 1675 968 } 1676 969 1677 - #define CIC_SEEK_THR 8 * 1024 1678 - #define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR) 970 + #define CFQQ_SEEK_THR 8 * 1024 971 + #define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) 1679 972 1680 - static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) 973 + static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, 974 + struct request *rq) 1681 975 { 1682 - struct cfq_io_context *cic = cfqd->active_cic; 1683 - sector_t sdist = cic->seek_mean; 976 + sector_t sdist = cfqq->seek_mean; 1684 977 1685 - if (!sample_valid(cic->seek_samples)) 1686 - sdist = CIC_SEEK_THR; 978 + if (!sample_valid(cfqq->seek_samples)) 979 + sdist = CFQQ_SEEK_THR; 1687 980 1688 981 return cfq_dist_from_last(cfqd, rq) <= sdist; 1689 982 } ··· 1712 1005 * will contain the closest sector. 1713 1006 */ 1714 1007 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 1715 - if (cfq_rq_close(cfqd, __cfqq->next_rq)) 1008 + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1716 1009 return __cfqq; 1717 1010 1718 1011 if (blk_rq_pos(__cfqq->next_rq) < sector) ··· 1723 1016 return NULL; 1724 1017 1725 1018 __cfqq = rb_entry(node, struct cfq_queue, p_node); 1726 - if (cfq_rq_close(cfqd, __cfqq->next_rq)) 1019 + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1727 1020 return __cfqq; 1728 1021 1729 1022 return NULL; ··· 1740 1033 * assumption. 1741 1034 */ 1742 1035 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 1743 - struct cfq_queue *cur_cfqq, 1744 - bool probe) 1036 + struct cfq_queue *cur_cfqq) 1745 1037 { 1746 1038 struct cfq_queue *cfqq; 1747 1039 1748 - /* 1749 - * A valid cfq_io_context is necessary to compare requests against 1750 - * the seek_mean of the current cfqq. 1751 - */ 1752 - if (!cfqd->active_cic) 1040 + if (!cfq_cfqq_sync(cur_cfqq)) 1041 + return NULL; 1042 + if (CFQQ_SEEKY(cur_cfqq)) 1753 1043 return NULL; 1754 1044 1755 1045 /* ··· 1758 1054 if (!cfqq) 1759 1055 return NULL; 1760 1056 1761 - if (cfq_cfqq_coop(cfqq)) 1057 + /* If new queue belongs to different cfq_group, don't choose it */ 1058 + if (cur_cfqq->cfqg != cfqq->cfqg) 1762 1059 return NULL; 1763 1060 1764 - if (!probe) 1765 - cfq_mark_cfqq_coop(cfqq); 1061 + /* 1062 + * It only makes sense to merge sync queues. 1063 + */ 1064 + if (!cfq_cfqq_sync(cfqq)) 1065 + return NULL; 1066 + if (CFQQ_SEEKY(cfqq)) 1067 + return NULL; 1068 + 1069 + /* 1070 + * Do not merge queues of different priority classes 1071 + */ 1072 + if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) 1073 + return NULL; 1074 + 1766 1075 return cfqq; 1076 + } 1077 + 1078 + /* 1079 + * Determine whether we should enforce idle window for this queue. 
1080 + */ 1081 + 1082 + static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1083 + { 1084 + enum wl_prio_t prio = cfqq_prio(cfqq); 1085 + struct cfq_rb_root *service_tree = cfqq->service_tree; 1086 + 1087 + BUG_ON(!service_tree); 1088 + BUG_ON(!service_tree->count); 1089 + 1090 + /* We never do for idle class queues. */ 1091 + if (prio == IDLE_WORKLOAD) 1092 + return false; 1093 + 1094 + /* We do for queues that were marked with idle window flag. */ 1095 + if (cfq_cfqq_idle_window(cfqq) && 1096 + !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) 1097 + return true; 1098 + 1099 + /* 1100 + * Otherwise, we do only if they are the last ones 1101 + * in their service tree. 1102 + */ 1103 + return service_tree->count == 1; 1767 1104 } 1768 1105 1769 1106 static void cfq_arm_slice_timer(struct cfq_data *cfqd) ··· 1827 1082 /* 1828 1083 * idle is disabled, either manually or by past process history 1829 1084 */ 1830 - if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) 1085 + if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) 1831 1086 return; 1832 1087 1833 1088 /* 1834 - * still requests with the driver, don't idle 1089 + * still active requests from this queue, don't idle 1835 1090 */ 1836 - if (rq_in_driver(cfqd)) 1091 + if (cfqq->dispatched) 1837 1092 return; 1838 1093 1839 1094 /* ··· 1854 1109 1855 1110 cfq_mark_cfqq_wait_request(cfqq); 1856 1111 1857 - /* 1858 - * we don't want to idle for seeks, but we do want to allow 1859 - * fair distribution of slice time for a process doing back-to-back 1860 - * seeks. so allow a little bit of time for him to submit a new rq 1861 - */ 1862 1112 sl = cfqd->cfq_slice_idle; 1863 - if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) 1864 - sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); 1865 1113 1866 1114 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1867 1115 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); ··· 1877 1139 1878 1140 if (cfq_cfqq_sync(cfqq)) 1879 1141 cfqd->sync_flight++; 1142 + cfqq->nr_sectors += blk_rq_sectors(rq); 1880 1143 } 1881 1144 1882 1145 /* ··· 1914 1175 } 1915 1176 1916 1177 /* 1178 + * Must be called with the queue_lock held. 1179 + */ 1180 + static int cfqq_process_refs(struct cfq_queue *cfqq) 1181 + { 1182 + int process_refs, io_refs; 1183 + 1184 + io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 1185 + process_refs = atomic_read(&cfqq->ref) - io_refs; 1186 + BUG_ON(process_refs < 0); 1187 + return process_refs; 1188 + } 1189 + 1190 + static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) 1191 + { 1192 + int process_refs, new_process_refs; 1193 + struct cfq_queue *__cfqq; 1194 + 1195 + /* Avoid a circular list and skip interim queue merges */ 1196 + while ((__cfqq = new_cfqq->new_cfqq)) { 1197 + if (__cfqq == cfqq) 1198 + return; 1199 + new_cfqq = __cfqq; 1200 + } 1201 + 1202 + process_refs = cfqq_process_refs(cfqq); 1203 + /* 1204 + * If the process for the cfqq has gone away, there is no 1205 + * sense in merging the queues. 1206 + */ 1207 + if (process_refs == 0) 1208 + return; 1209 + 1210 + /* 1211 + * Merge in the direction of the lesser amount of work. 
1212 + */ 1213 + new_process_refs = cfqq_process_refs(new_cfqq); 1214 + if (new_process_refs >= process_refs) { 1215 + cfqq->new_cfqq = new_cfqq; 1216 + atomic_add(process_refs, &new_cfqq->ref); 1217 + } else { 1218 + new_cfqq->new_cfqq = cfqq; 1219 + atomic_add(new_process_refs, &cfqq->ref); 1220 + } 1221 + } 1222 + 1223 + static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, 1224 + struct cfq_group *cfqg, enum wl_prio_t prio, 1225 + bool prio_changed) 1226 + { 1227 + struct cfq_queue *queue; 1228 + int i; 1229 + bool key_valid = false; 1230 + unsigned long lowest_key = 0; 1231 + enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; 1232 + 1233 + if (prio_changed) { 1234 + /* 1235 + * When priorities switched, we prefer starting 1236 + * from SYNC_NOIDLE (first choice), or just SYNC 1237 + * over ASYNC 1238 + */ 1239 + if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) 1240 + return cur_best; 1241 + cur_best = SYNC_WORKLOAD; 1242 + if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) 1243 + return cur_best; 1244 + 1245 + return ASYNC_WORKLOAD; 1246 + } 1247 + 1248 + for (i = 0; i < 3; ++i) { 1249 + /* otherwise, select the one with lowest rb_key */ 1250 + queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd)); 1251 + if (queue && 1252 + (!key_valid || time_before(queue->rb_key, lowest_key))) { 1253 + lowest_key = queue->rb_key; 1254 + cur_best = i; 1255 + key_valid = true; 1256 + } 1257 + } 1258 + 1259 + return cur_best; 1260 + } 1261 + 1262 + static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) 1263 + { 1264 + enum wl_prio_t previous_prio = cfqd->serving_prio; 1265 + bool prio_changed; 1266 + unsigned slice; 1267 + unsigned count; 1268 + struct cfq_rb_root *st; 1269 + unsigned group_slice; 1270 + 1271 + if (!cfqg) { 1272 + cfqd->serving_prio = IDLE_WORKLOAD; 1273 + cfqd->workload_expires = jiffies + 1; 1274 + return; 1275 + } 1276 + 1277 + /* Choose next priority. 
RT > BE > IDLE */ 1278 + if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 1279 + cfqd->serving_prio = RT_WORKLOAD; 1280 + else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) 1281 + cfqd->serving_prio = BE_WORKLOAD; 1282 + else { 1283 + cfqd->serving_prio = IDLE_WORKLOAD; 1284 + cfqd->workload_expires = jiffies + 1; 1285 + return; 1286 + } 1287 + 1288 + /* 1289 + * For RT and BE, we have to choose also the type 1290 + * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 1291 + * expiration time 1292 + */ 1293 + prio_changed = (cfqd->serving_prio != previous_prio); 1294 + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, 1295 + cfqd); 1296 + count = st->count; 1297 + 1298 + /* 1299 + * If priority didn't change, check workload expiration, 1300 + * and that we still have other queues ready 1301 + */ 1302 + if (!prio_changed && count && 1303 + !time_after(jiffies, cfqd->workload_expires)) 1304 + return; 1305 + 1306 + /* otherwise select new workload type */ 1307 + cfqd->serving_type = 1308 + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed); 1309 + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, 1310 + cfqd); 1311 + count = st->count; 1312 + 1313 + /* 1314 + * the workload slice is computed as a fraction of target latency 1315 + * proportional to the number of queues in that workload, over 1316 + * all the queues in the same priority class 1317 + */ 1318 + group_slice = cfq_group_slice(cfqd, cfqg); 1319 + 1320 + slice = group_slice * count / 1321 + max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], 1322 + cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); 1323 + 1324 + if (cfqd->serving_type == ASYNC_WORKLOAD) { 1325 + unsigned int tmp; 1326 + 1327 + /* 1328 + * Async queues are currently system wide. Just taking 1329 + * proportion of queues with-in same group will lead to higher 1330 + * async ratio system wide as generally root group is going 1331 + * to have higher weight. A more accurate thing would be to 1332 + * calculate system wide asnc/sync ratio. 1333 + */ 1334 + tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); 1335 + tmp = tmp/cfqd->busy_queues; 1336 + slice = min_t(unsigned, slice, tmp); 1337 + 1338 + /* async workload slice is scaled down according to 1339 + * the sync/async slice ratio. 
*/ 1340 + slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; 1341 + } else 1342 + /* sync workload slice is at least 2 * cfq_slice_idle */ 1343 + slice = max(slice, 2 * cfqd->cfq_slice_idle); 1344 + 1345 + slice = max_t(unsigned, slice, CFQ_MIN_TT); 1346 + cfqd->workload_expires = jiffies + slice; 1347 + cfqd->noidle_tree_requires_idle = false; 1348 + } 1349 + 1350 + static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 1351 + { 1352 + struct cfq_rb_root *st = &cfqd->grp_service_tree; 1353 + struct cfq_group *cfqg; 1354 + 1355 + if (RB_EMPTY_ROOT(&st->rb)) 1356 + return NULL; 1357 + cfqg = cfq_rb_first_group(st); 1358 + st->active = &cfqg->rb_node; 1359 + update_min_vdisktime(st); 1360 + return cfqg; 1361 + } 1362 + 1363 + static void cfq_choose_cfqg(struct cfq_data *cfqd) 1364 + { 1365 + struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); 1366 + 1367 + cfqd->serving_group = cfqg; 1368 + 1369 + /* Restore the workload type data */ 1370 + if (cfqg->saved_workload_slice) { 1371 + cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; 1372 + cfqd->serving_type = cfqg->saved_workload; 1373 + cfqd->serving_prio = cfqg->saved_serving_prio; 1374 + } 1375 + choose_service_tree(cfqd, cfqg); 1376 + } 1377 + 1378 + /* 1917 1379 * Select a queue for service. If we have a current active queue, 1918 1380 * check whether to continue servicing it, or retrieve and set a new one. 1919 1381 */ ··· 2126 1186 if (!cfqq) 2127 1187 goto new_queue; 2128 1188 1189 + if (!cfqd->rq_queued) 1190 + return NULL; 2129 1191 /* 2130 1192 * The active queue has run out of time, expire it and select new. 2131 1193 */ 2132 - if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) 1194 + if ((cfq_slice_used(cfqq) || cfq_cfqq_wait_busy_done(cfqq)) 1195 + && !cfq_cfqq_must_dispatch(cfqq)) 2133 1196 goto expire; 2134 1197 2135 1198 /* ··· 2146 1203 * If another queue has a request waiting within our mean seek 2147 1204 * distance, let it run. The expire code will check for close 2148 1205 * cooperators and put the close queue at the front of the service 2149 - * tree. 1206 + * tree. If possible, merge the expiring queue with the new cfqq. 2150 1207 */ 2151 - new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); 2152 - if (new_cfqq) 1208 + new_cfqq = cfq_close_cooperator(cfqd, cfqq); 1209 + if (new_cfqq) { 1210 + if (!cfqq->new_cfqq) 1211 + cfq_setup_merge(cfqq, new_cfqq); 2153 1212 goto expire; 1213 + } 2154 1214 2155 1215 /* 2156 1216 * No requests pending. If the active queue still has requests in ··· 2161 1215 * conditions to happen (or time out) before selecting a new queue. 2162 1216 */ 2163 1217 if (timer_pending(&cfqd->idle_slice_timer) || 2164 - (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { 1218 + (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { 2165 1219 cfqq = NULL; 2166 1220 goto keep_queue; 2167 1221 } ··· 2169 1223 expire: 2170 1224 cfq_slice_expired(cfqd, 0); 2171 1225 new_queue: 1226 + /* 1227 + * Current queue expired. Check if we have to switch to a new 1228 + * service tree 1229 + */ 1230 + if (!new_cfqq) 1231 + cfq_choose_cfqg(cfqd); 1232 + 2172 1233 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 2173 1234 keep_queue: 2174 1235 return cfqq; ··· 2191 1238 } 2192 1239 2193 1240 BUG_ON(!list_empty(&cfqq->fifo)); 1241 + 1242 + /* By default cfqq is not expired if it is empty. 
Do it explicitly */ 1243 + __cfq_slice_expired(cfqq->cfqd, cfqq, 0); 2194 1244 return dispatched; 2195 1245 } 2196 1246 ··· 2206 1250 struct cfq_queue *cfqq; 2207 1251 int dispatched = 0; 2208 1252 2209 - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) 1253 + while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) 2210 1254 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 2211 1255 2212 1256 cfq_slice_expired(cfqd, 0); 2213 - 2214 1257 BUG_ON(cfqd->busy_queues); 2215 1258 2216 1259 cfq_log(cfqd, "forced_dispatch=%d", dispatched); ··· 2223 1268 /* 2224 1269 * Drain async requests before we start sync IO 2225 1270 */ 2226 - if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) 1271 + if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) 2227 1272 return false; 2228 1273 2229 1274 /* ··· 2253 1298 return false; 2254 1299 2255 1300 /* 2256 - * Sole queue user, allow bigger slice 1301 + * Sole queue user, no limit 2257 1302 */ 2258 - max_dispatch *= 4; 1303 + max_dispatch = -1; 2259 1304 } 2260 1305 2261 1306 /* ··· 2362 1407 * task holds one reference to the queue, dropped when task exits. each rq 2363 1408 * in-flight on this queue also holds a reference, dropped when rq is freed. 2364 1409 * 1410 + * Each cfq queue took a reference on the parent group. Drop it now. 2365 1411 * queue lock must be held here. 2366 1412 */ 2367 1413 static void cfq_put_queue(struct cfq_queue *cfqq) 2368 1414 { 2369 1415 struct cfq_data *cfqd = cfqq->cfqd; 1416 + struct cfq_group *cfqg, *orig_cfqg; 2370 1417 2371 1418 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2372 1419 ··· 2378 1421 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2379 1422 BUG_ON(rb_first(&cfqq->sort_list)); 2380 1423 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2381 - BUG_ON(cfq_cfqq_on_rr(cfqq)); 1424 + cfqg = cfqq->cfqg; 1425 + orig_cfqg = cfqq->orig_cfqg; 2382 1426 2383 1427 if (unlikely(cfqd->active_queue == cfqq)) { 2384 1428 __cfq_slice_expired(cfqd, cfqq, 0); 2385 1429 cfq_schedule_dispatch(cfqd); 2386 1430 } 2387 1431 1432 + BUG_ON(cfq_cfqq_on_rr(cfqq)); 2388 1433 kmem_cache_free(cfq_pool, cfqq); 1434 + cfq_put_cfqg(cfqg); 1435 + if (orig_cfqg) 1436 + cfq_put_cfqg(orig_cfqg); 2389 1437 } 2390 1438 2391 1439 /* ··· 2480 1518 2481 1519 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2482 1520 { 1521 + struct cfq_queue *__cfqq, *next; 1522 + 2483 1523 if (unlikely(cfqq == cfqd->active_queue)) { 2484 1524 __cfq_slice_expired(cfqd, cfqq, 0); 2485 1525 cfq_schedule_dispatch(cfqd); 1526 + } 1527 + 1528 + /* 1529 + * If this queue was scheduled to merge with another queue, be 1530 + * sure to drop the reference taken on that queue (and others in 1531 + * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. 
1532 + */ 1533 + __cfqq = cfqq->new_cfqq; 1534 + while (__cfqq) { 1535 + if (__cfqq == cfqq) { 1536 + WARN(1, "cfqq->new_cfqq loop detected\n"); 1537 + break; 1538 + } 1539 + next = __cfqq->new_cfqq; 1540 + cfq_put_queue(__cfqq); 1541 + __cfqq = next; 2486 1542 } 2487 1543 2488 1544 cfq_put_queue(cfqq); ··· 2683 1703 cfqq->pid = pid; 2684 1704 } 2685 1705 1706 + #ifdef CONFIG_CFQ_GROUP_IOSCHED 1707 + static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) 1708 + { 1709 + struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); 1710 + struct cfq_data *cfqd = cic->key; 1711 + unsigned long flags; 1712 + struct request_queue *q; 1713 + 1714 + if (unlikely(!cfqd)) 1715 + return; 1716 + 1717 + q = cfqd->queue; 1718 + 1719 + spin_lock_irqsave(q->queue_lock, flags); 1720 + 1721 + if (sync_cfqq) { 1722 + /* 1723 + * Drop reference to sync queue. A new sync queue will be 1724 + * assigned in new group upon arrival of a fresh request. 1725 + */ 1726 + cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); 1727 + cic_set_cfqq(cic, NULL, 1); 1728 + cfq_put_queue(sync_cfqq); 1729 + } 1730 + 1731 + spin_unlock_irqrestore(q->queue_lock, flags); 1732 + } 1733 + 1734 + static void cfq_ioc_set_cgroup(struct io_context *ioc) 1735 + { 1736 + call_for_each_cic(ioc, changed_cgroup); 1737 + ioc->cgroup_changed = 0; 1738 + } 1739 + #endif /* CONFIG_CFQ_GROUP_IOSCHED */ 1740 + 2686 1741 static struct cfq_queue * 2687 1742 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 2688 1743 struct io_context *ioc, gfp_t gfp_mask) 2689 1744 { 2690 1745 struct cfq_queue *cfqq, *new_cfqq = NULL; 2691 1746 struct cfq_io_context *cic; 1747 + struct cfq_group *cfqg; 2692 1748 2693 1749 retry: 1750 + cfqg = cfq_get_cfqg(cfqd, 1); 2694 1751 cic = cfq_cic_lookup(cfqd, ioc); 2695 1752 /* cic always exists here */ 2696 1753 cfqq = cic_to_cfqq(cic, is_sync); ··· 2758 1741 if (cfqq) { 2759 1742 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 2760 1743 cfq_init_prio_data(cfqq, ioc); 1744 + cfq_link_cfqq_cfqg(cfqq, cfqg); 2761 1745 cfq_log_cfqq(cfqd, cfqq, "alloced"); 2762 1746 } else 2763 1747 cfqq = &cfqd->oom_cfqq; ··· 2950 1932 if (unlikely(ioc->ioprio_changed)) 2951 1933 cfq_ioc_set_ioprio(ioc); 2952 1934 1935 + #ifdef CONFIG_CFQ_GROUP_IOSCHED 1936 + if (unlikely(ioc->cgroup_changed)) 1937 + cfq_ioc_set_cgroup(ioc); 1938 + #endif 2953 1939 return cic; 2954 1940 err_free: 2955 1941 cfq_cic_free(cic); ··· 2974 1952 } 2975 1953 2976 1954 static void 2977 - cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, 1955 + cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2978 1956 struct request *rq) 2979 1957 { 2980 1958 sector_t sdist; 2981 1959 u64 total; 2982 1960 2983 - if (!cic->last_request_pos) 1961 + if (!cfqq->last_request_pos) 2984 1962 sdist = 0; 2985 - else if (cic->last_request_pos < blk_rq_pos(rq)) 2986 - sdist = blk_rq_pos(rq) - cic->last_request_pos; 1963 + else if (cfqq->last_request_pos < blk_rq_pos(rq)) 1964 + sdist = blk_rq_pos(rq) - cfqq->last_request_pos; 2987 1965 else 2988 - sdist = cic->last_request_pos - blk_rq_pos(rq); 1966 + sdist = cfqq->last_request_pos - blk_rq_pos(rq); 2989 1967 2990 1968 /* 2991 1969 * Don't allow the seek distance to get too large from the 2992 1970 * odd fragment, pagein, etc 2993 1971 */ 2994 - if (cic->seek_samples <= 60) /* second&third seek */ 2995 - sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); 1972 + if (cfqq->seek_samples <= 60) /* second&third seek */ 1973 + sdist = min(sdist, (cfqq->seek_mean * 4) + 
2*1024*1024); 2996 1974 else 2997 - sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); 1975 + sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64); 2998 1976 2999 - cic->seek_samples = (7*cic->seek_samples + 256) / 8; 3000 - cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; 3001 - total = cic->seek_total + (cic->seek_samples/2); 3002 - do_div(total, cic->seek_samples); 3003 - cic->seek_mean = (sector_t)total; 1977 + cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8; 1978 + cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8; 1979 + total = cfqq->seek_total + (cfqq->seek_samples/2); 1980 + do_div(total, cfqq->seek_samples); 1981 + cfqq->seek_mean = (sector_t)total; 1982 + 1983 + /* 1984 + * If this cfqq is shared between multiple processes, check to 1985 + * make sure that those processes are still issuing I/Os within 1986 + * the mean seek distance. If not, it may be time to break the 1987 + * queues apart again. 1988 + */ 1989 + if (cfq_cfqq_coop(cfqq)) { 1990 + if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start) 1991 + cfqq->seeky_start = jiffies; 1992 + else if (!CFQQ_SEEKY(cfqq)) 1993 + cfqq->seeky_start = 0; 1994 + } 3004 1995 } 3005 1996 3006 1997 /* ··· 3034 1999 3035 2000 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3036 2001 2002 + if (cfqq->queued[0] + cfqq->queued[1] >= 4) 2003 + cfq_mark_cfqq_deep(cfqq); 2004 + 3037 2005 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3038 - (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic))) 2006 + (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples) 2007 + && CFQQ_SEEKY(cfqq))) 3039 2008 enable_idle = 0; 3040 2009 else if (sample_valid(cic->ttime_samples)) { 3041 - unsigned int slice_idle = cfqd->cfq_slice_idle; 3042 - if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) 3043 - slice_idle = msecs_to_jiffies(CFQ_MIN_TT); 3044 - if (cic->ttime_mean > slice_idle) 2010 + if (cic->ttime_mean > cfqd->cfq_slice_idle) 3045 2011 enable_idle = 0; 3046 2012 else 3047 2013 enable_idle = 1; ··· 3071 2035 if (!cfqq) 3072 2036 return false; 3073 2037 3074 - if (cfq_slice_used(cfqq)) 3075 - return true; 3076 - 3077 2038 if (cfq_class_idle(new_cfqq)) 3078 2039 return false; 3079 2040 ··· 3082 2049 * not, let the sync request have priority. 
3083 2050 */ 3084 2051 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 2052 + return true; 2053 + 2054 + if (new_cfqq->cfqg != cfqq->cfqg) 2055 + return false; 2056 + 2057 + if (cfq_slice_used(cfqq)) 2058 + return true; 2059 + 2060 + /* Allow preemption only if we are idling on sync-noidle tree */ 2061 + if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && 2062 + cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 2063 + new_cfqq->service_tree->count == 2 && 2064 + RB_EMPTY_ROOT(&cfqq->sort_list)) 3085 2065 return true; 3086 2066 3087 2067 /* ··· 3117 2071 * if this request is as-good as one we would expect from the 3118 2072 * current cfqq, let it preempt 3119 2073 */ 3120 - if (cfq_rq_close(cfqd, rq) && (!cfq_cfqq_coop(new_cfqq) || 3121 - cfqd->busy_queues == 1)) { 3122 - /* 3123 - * Mark new queue coop_preempt, so its coop flag will not be 3124 - * cleared when new queue gets scheduled at the very first time 3125 - */ 3126 - cfq_mark_cfqq_coop_preempt(new_cfqq); 3127 - cfq_mark_cfqq_coop(new_cfqq); 2074 + if (cfq_rq_close(cfqd, cfqq, rq)) 3128 2075 return true; 3129 - } 3130 2076 3131 2077 return false; 3132 2078 } ··· 3159 2121 cfqq->meta_pending++; 3160 2122 3161 2123 cfq_update_io_thinktime(cfqd, cic); 3162 - cfq_update_io_seektime(cfqd, cic, rq); 2124 + cfq_update_io_seektime(cfqd, cfqq, rq); 3163 2125 cfq_update_idle_window(cfqd, cfqq, cic); 3164 2126 3165 - cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 2127 + cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 3166 2128 3167 2129 if (cfqq == cfqd->active_queue) { 2130 + if (cfq_cfqq_wait_busy(cfqq)) { 2131 + cfq_clear_cfqq_wait_busy(cfqq); 2132 + cfq_mark_cfqq_wait_busy_done(cfqq); 2133 + } 3168 2134 /* 3169 2135 * Remember that we saw a request from this process, but 3170 2136 * don't start queuing just yet. Otherwise we risk seeing lots ··· 3183 2141 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3184 2142 cfqd->busy_queues > 1) { 3185 2143 del_timer(&cfqd->idle_slice_timer); 3186 - __blk_run_queue(cfqd->queue); 3187 - } 3188 - cfq_mark_cfqq_must_dispatch(cfqq); 2144 + __blk_run_queue(cfqd->queue); 2145 + } else 2146 + cfq_mark_cfqq_must_dispatch(cfqq); 3189 2147 } 3190 2148 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3191 2149 /* ··· 3207 2165 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3208 2166 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3209 2167 3210 - cfq_add_rq_rb(rq); 3211 - 3212 2168 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3213 2169 list_add_tail(&rq->queuelist, &cfqq->fifo); 2170 + cfq_add_rq_rb(rq); 3214 2171 3215 2172 cfq_rq_enqueued(cfqd, cfqq, rq); 3216 2173 } ··· 3220 2179 */ 3221 2180 static void cfq_update_hw_tag(struct cfq_data *cfqd) 3222 2181 { 3223 - if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) 3224 - cfqd->rq_in_driver_peak = rq_in_driver(cfqd); 2182 + struct cfq_queue *cfqq = cfqd->active_queue; 2183 + 2184 + if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth) 2185 + cfqd->hw_tag_est_depth = rq_in_driver(cfqd); 2186 + 2187 + if (cfqd->hw_tag == 1) 2188 + return; 3225 2189 3226 2190 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 3227 2191 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) 3228 2192 return; 3229 2193 2194 + /* 2195 + * If active queue hasn't enough requests and can idle, cfq might not 2196 + * dispatch sufficient requests to hardware. 
Don't zero hw_tag in this 2197 + * case 2198 + */ 2199 + if (cfqq && cfq_cfqq_idle_window(cfqq) && 2200 + cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < 2201 + CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN) 2202 + return; 2203 + 3230 2204 if (cfqd->hw_tag_samples++ < 50) 3231 2205 return; 3232 2206 3233 - if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN) 2207 + if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) 3234 2208 cfqd->hw_tag = 1; 3235 2209 else 3236 2210 cfqd->hw_tag = 0; 3237 - 3238 - cfqd->hw_tag_samples = 0; 3239 - cfqd->rq_in_driver_peak = 0; 3240 2211 } 3241 2212 3242 2213 static void cfq_completed_request(struct request_queue *q, struct request *rq) ··· 3259 2206 unsigned long now; 3260 2207 3261 2208 now = jiffies; 3262 - cfq_log_cfqq(cfqd, cfqq, "complete"); 2209 + cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq)); 3263 2210 3264 2211 cfq_update_hw_tag(cfqd); 3265 2212 ··· 3287 2234 cfq_set_prio_slice(cfqd, cfqq); 3288 2235 cfq_clear_cfqq_slice_new(cfqq); 3289 2236 } 2237 + 3290 2238 /* 3291 - * If there are no requests waiting in this queue, and 3292 - * there are other queues ready to issue requests, AND 3293 - * those other queues are issuing requests within our 3294 - * mean seek distance, give them a chance to run instead 3295 - * of idling. 2239 + * If this queue consumed its slice and this is last queue 2240 + * in the group, wait for next request before we expire 2241 + * the queue 2242 + */ 2243 + if (cfq_slice_used(cfqq) && cfqq->cfqg->nr_cfqq == 1) { 2244 + cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; 2245 + cfq_mark_cfqq_wait_busy(cfqq); 2246 + } 2247 + 2248 + /* 2249 + * Idling is not enabled on: 2250 + * - expired queues 2251 + * - idle-priority queues 2252 + * - async queues 2253 + * - queues with still some requests queued 2254 + * - when there is a close cooperator 3296 2255 */ 3297 2256 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3298 2257 cfq_slice_expired(cfqd, 1); 3299 - else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && 3300 - sync && !rq_noidle(rq)) 3301 - cfq_arm_slice_timer(cfqd); 2258 + else if (sync && cfqq_empty && 2259 + !cfq_close_cooperator(cfqd, cfqq)) { 2260 + cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); 2261 + /* 2262 + * Idling is enabled for SYNC_WORKLOAD. 
2263 + * SYNC_NOIDLE_WORKLOAD idles at the end of the tree 2264 + * only if we processed at least one !rq_noidle request 2265 + */ 2266 + if (cfqd->serving_type == SYNC_WORKLOAD 2267 + || cfqd->noidle_tree_requires_idle 2268 + || cfqq->cfqg->nr_cfqq == 1) 2269 + cfq_arm_slice_timer(cfqd); 2270 + } 3302 2271 } 3303 2272 3304 2273 if (!rq_in_driver(cfqd)) ··· 3344 2269 cfqq->ioprio = IOPRIO_NORM; 3345 2270 } else { 3346 2271 /* 3347 - * check if we need to unboost the queue 2272 + * unboost the queue (if needed) 3348 2273 */ 3349 - if (cfqq->ioprio_class != cfqq->org_ioprio_class) 3350 - cfqq->ioprio_class = cfqq->org_ioprio_class; 3351 - if (cfqq->ioprio != cfqq->org_ioprio) 3352 - cfqq->ioprio = cfqq->org_ioprio; 2274 + cfqq->ioprio_class = cfqq->org_ioprio_class; 2275 + cfqq->ioprio = cfqq->org_ioprio; 3353 2276 } 3354 2277 } 3355 2278 ··· 3411 2338 } 3412 2339 } 3413 2340 2341 + static struct cfq_queue * 2342 + cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, 2343 + struct cfq_queue *cfqq) 2344 + { 2345 + cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 2346 + cic_set_cfqq(cic, cfqq->new_cfqq, 1); 2347 + cfq_mark_cfqq_coop(cfqq->new_cfqq); 2348 + cfq_put_queue(cfqq); 2349 + return cic_to_cfqq(cic, 1); 2350 + } 2351 + 2352 + static int should_split_cfqq(struct cfq_queue *cfqq) 2353 + { 2354 + if (cfqq->seeky_start && 2355 + time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT)) 2356 + return 1; 2357 + return 0; 2358 + } 2359 + 2360 + /* 2361 + * Returns NULL if a new cfqq should be allocated, or the old cfqq if this 2362 + * was the last process referring to said cfqq. 2363 + */ 2364 + static struct cfq_queue * 2365 + split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) 2366 + { 2367 + if (cfqq_process_refs(cfqq) == 1) { 2368 + cfqq->seeky_start = 0; 2369 + cfqq->pid = current->pid; 2370 + cfq_clear_cfqq_coop(cfqq); 2371 + return cfqq; 2372 + } 2373 + 2374 + cic_set_cfqq(cic, NULL, 1); 2375 + cfq_put_queue(cfqq); 2376 + return NULL; 2377 + } 3414 2378 /* 3415 2379 * Allocate cfq data structures associated with this request. 3416 2380 */ ··· 3470 2360 if (!cic) 3471 2361 goto queue_fail; 3472 2362 2363 + new_queue: 3473 2364 cfqq = cic_to_cfqq(cic, is_sync); 3474 2365 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3475 2366 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3476 2367 cic_set_cfqq(cic, cfqq, is_sync); 2368 + } else { 2369 + /* 2370 + * If the queue was seeky for too long, break it apart. 2371 + */ 2372 + if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) { 2373 + cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); 2374 + cfqq = split_cfqq(cic, cfqq); 2375 + if (!cfqq) 2376 + goto new_queue; 2377 + } 2378 + 2379 + /* 2380 + * Check to see if this queue is scheduled to merge with 2381 + * another, closely cooperating queue. The merging of 2382 + * queues happens here as it must be done in process context. 2383 + * The reference on new_cfqq was taken in merge_cfqqs. 
2384 + */ 2385 + if (cfqq->new_cfqq) 2386 + cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); 3477 2387 } 3478 2388 3479 2389 cfqq->allocated[rw]++; ··· 3568 2438 */ 3569 2439 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3570 2440 goto out_kick; 2441 + 2442 + /* 2443 + * Queue depth flag is reset only when the idle didn't succeed 2444 + */ 2445 + cfq_clear_cfqq_deep(cfqq); 3571 2446 } 3572 2447 expire: 3573 2448 cfq_slice_expired(cfqd, timed_out); ··· 3603 2468 cfq_put_queue(cfqd->async_idle_cfqq); 3604 2469 } 3605 2470 2471 + static void cfq_cfqd_free(struct rcu_head *head) 2472 + { 2473 + kfree(container_of(head, struct cfq_data, rcu)); 2474 + } 2475 + 3606 2476 static void cfq_exit_queue(struct elevator_queue *e) 3607 2477 { 3608 2478 struct cfq_data *cfqd = e->elevator_data; ··· 3629 2489 } 3630 2490 3631 2491 cfq_put_async_queues(cfqd); 2492 + cfq_release_cfq_groups(cfqd); 2493 + blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3632 2494 3633 2495 spin_unlock_irq(q->queue_lock); 3634 2496 3635 2497 cfq_shutdown_timer_wq(cfqd); 3636 2498 3637 - kfree(cfqd); 2499 + /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 2500 + call_rcu(&cfqd->rcu, cfq_cfqd_free); 3638 2501 } 3639 2502 3640 2503 static void *cfq_init_queue(struct request_queue *q) 3641 2504 { 3642 2505 struct cfq_data *cfqd; 3643 - int i; 2506 + int i, j; 2507 + struct cfq_group *cfqg; 2508 + struct cfq_rb_root *st; 3644 2509 3645 2510 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3646 2511 if (!cfqd) 3647 2512 return NULL; 3648 2513 3649 - cfqd->service_tree = CFQ_RB_ROOT; 2514 + /* Init root service tree */ 2515 + cfqd->grp_service_tree = CFQ_RB_ROOT; 3650 2516 2517 + /* Init root group */ 2518 + cfqg = &cfqd->root_group; 2519 + for_each_cfqg_st(cfqg, i, j, st) 2520 + *st = CFQ_RB_ROOT; 2521 + RB_CLEAR_NODE(&cfqg->rb_node); 2522 + 2523 + /* Give preference to root group over other groups */ 2524 + cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; 2525 + 2526 + #ifdef CONFIG_CFQ_GROUP_IOSCHED 2527 + /* 2528 + * Take a reference to root group which we never drop. 
This is just 2529 + * to make sure that cfq_put_cfqg() does not try to kfree root group 2530 + */ 2531 + atomic_set(&cfqg->ref, 1); 2532 + blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 2533 + 0); 2534 + #endif 3651 2535 /* 3652 2536 * Not strictly needed (since RB_ROOT just clears the node and we 3653 2537 * zeroed cfqd on alloc), but better be safe in case someone decides ··· 3687 2523 */ 3688 2524 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3689 2525 atomic_inc(&cfqd->oom_cfqq.ref); 2526 + cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3690 2527 3691 2528 INIT_LIST_HEAD(&cfqd->cic_list); 3692 2529 ··· 3709 2544 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3710 2545 cfqd->cfq_slice_idle = cfq_slice_idle; 3711 2546 cfqd->cfq_latency = 1; 3712 - cfqd->hw_tag = 1; 2547 + cfqd->cfq_group_isolation = 0; 2548 + cfqd->hw_tag = -1; 3713 2549 cfqd->last_end_sync_rq = jiffies; 2550 + INIT_RCU_HEAD(&cfqd->rcu); 3714 2551 return cfqd; 3715 2552 } 3716 2553 ··· 3781 2614 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 3782 2615 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 3783 2616 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 2617 + SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); 3784 2618 #undef SHOW_FUNCTION 3785 2619 3786 2620 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ··· 3814 2646 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 3815 2647 UINT_MAX, 0); 3816 2648 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 2649 + STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); 3817 2650 #undef STORE_FUNCTION 3818 2651 3819 2652 #define CFQ_ATTR(name) \ ··· 3831 2662 CFQ_ATTR(slice_async_rq), 3832 2663 CFQ_ATTR(slice_idle), 3833 2664 CFQ_ATTR(low_latency), 2665 + CFQ_ATTR(group_isolation), 3834 2666 __ATTR_NULL 3835 2667 }; 3836 2668 ··· 3861 2691 .elevator_owner = THIS_MODULE, 3862 2692 }; 3863 2693 2694 + #ifdef CONFIG_CFQ_GROUP_IOSCHED 2695 + static struct blkio_policy_type blkio_policy_cfq = { 2696 + .ops = { 2697 + .blkio_unlink_group_fn = cfq_unlink_blkio_group, 2698 + .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 2699 + }, 2700 + }; 2701 + #else 2702 + static struct blkio_policy_type blkio_policy_cfq; 2703 + #endif 2704 + 3864 2705 static int __init cfq_init(void) 3865 2706 { 3866 2707 /* ··· 3886 2705 return -ENOMEM; 3887 2706 3888 2707 elv_register(&iosched_cfq); 2708 + blkio_policy_register(&blkio_policy_cfq); 3889 2709 3890 2710 return 0; 3891 2711 } ··· 3894 2712 static void __exit cfq_exit(void) 3895 2713 { 3896 2714 DECLARE_COMPLETION_ONSTACK(all_gone); 2715 + blkio_policy_unregister(&blkio_policy_cfq); 3897 2716 elv_unregister(&iosched_cfq); 3898 2717 ioc_gone = &all_gone; 3899 2718 /* ioc_gone's update must be visible before reading ioc_count */
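
Note on the seek tracking above: the statistics these cfq-iosched hunks move from the io_context to the cfq_queue reduce to a fixed-point exponentially weighted average. Each new seek distance is clamped, folded in with a 7/8 decay, and the resulting mean is compared against CFQQ_SEEK_THR to classify the queue as seeky. The standalone sketch below replays that arithmetic outside the kernel; the struct and function names are illustrative, only the constants come from the hunks above.

    /* Standalone model of the cfqq seek-mean bookkeeping: an
     * exponentially weighted average over roughly the last eight
     * samples, in x256 fixed point, with outliers clamped. */
    #include <stdio.h>
    #include <stdint.h>

    #define SEEK_THR (8 * 1024)  /* sectors, as CFQQ_SEEK_THR above */
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    struct seek_stats {
        uint64_t last_pos;      /* end sector of the previous request */
        uint64_t seek_total;    /* decayed distance sum, scaled by 256 */
        uint32_t seek_samples;  /* decayed sample count, scaled by 256 */
        uint64_t seek_mean;
    };

    static void seek_sample(struct seek_stats *s, uint64_t pos, uint64_t len)
    {
        uint64_t sdist = s->last_pos ?
            (pos > s->last_pos ? pos - s->last_pos : s->last_pos - pos) : 0;

        /* Clamp the odd fragment/pagein, exactly as the hunk does:
         * a loose bound while samples are few, a tight one later. */
        if (s->seek_samples <= 60)
            sdist = MIN(sdist, s->seek_mean * 4 + 2*1024*1024);
        else
            sdist = MIN(sdist, s->seek_mean * 4 + 2*1024*64);

        /* 7/8 decay keeps the average responsive to recent requests. */
        s->seek_samples = (7 * s->seek_samples + 256) / 8;
        s->seek_total   = (7 * s->seek_total + 256 * sdist) / 8;
        s->seek_mean    = (s->seek_total + s->seek_samples / 2)
                            / s->seek_samples;
        s->last_pos     = pos + len;
    }

    int main(void)
    {
        struct seek_stats s = { 0 };
        uint64_t pos = 0;
        int i;

        /* sequential 8-sector reads: mean stays at zero, not seeky */
        for (i = 0; i < 32; i++, pos += 8)
            seek_sample(&s, pos, 8);
        printf("sequential mean %llu seeky %d\n",
               (unsigned long long)s.seek_mean, s.seek_mean > SEEK_THR);

        /* 64k-sector jumps: the mean climbs past the threshold and
         * the queue would be classified CFQQ_SEEKY */
        for (i = 0; i < 32; i++, pos += 64 * 1024)
            seek_sample(&s, pos, 8);
        printf("random mean %llu seeky %d\n",
               (unsigned long long)s.seek_mean, s.seek_mean > SEEK_THR);
        return 0;
    }
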
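Note on queue merging: cfq_setup_merge() above chains cooperating queues through new_cfqq and merges "in the direction of the lesser amount of work", where work is approximated by process references — total refs minus those held by in-flight requests. A minimal model of that decision and its cycle guard follows, with simplified types (no atomics or queue_lock, unlike the original).

    /* Simplified cfq_setup_merge()/cfqq_process_refs() logic: the
     * queue with fewer process refs is redirected at the busier one,
     * and the chain walk skips interim merges while bailing out on
     * a cycle. Types here are illustrative, not kernel API. */
    #include <stdio.h>

    struct queue {
        const char *name;
        int refs;                /* total reference count */
        int allocated;           /* refs owned by in-flight requests */
        struct queue *new_queue; /* merge target, if any */
    };

    static int process_refs(struct queue *q)
    {
        return q->refs - q->allocated;
    }

    static void setup_merge(struct queue *q, struct queue *target)
    {
        struct queue *t;

        /* Follow an existing merge chain; avoid creating a cycle. */
        while ((t = target->new_queue)) {
            if (t == q)
                return;
            target = t;
        }

        /* Owning processes are gone; merging makes no sense. */
        if (process_refs(q) == 0)
            return;

        /* Merge in the direction of the lesser amount of work,
         * transferring the lighter queue's process refs. */
        if (process_refs(target) >= process_refs(q)) {
            q->new_queue = target;
            target->refs += process_refs(q);
        } else {
            target->new_queue = q;
            q->refs += process_refs(target);
        }
    }

    int main(void)
    {
        struct queue a = { "a", 3, 1, NULL };  /* 2 process refs */
        struct queue b = { "b", 5, 1, NULL };  /* 4 process refs */

        setup_merge(&a, &b);
        printf("a merges into %s\n", a.new_queue ? a.new_queue->name : "-");
        return 0;
    }
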
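Note on slice sizing: choose_service_tree() above computes each workload's slice as group_slice * count / busy_queues, scales async slices down by the async:sync slice ratio, and gives sync workloads a floor of two idle windows. The arithmetic sketch below reproduces that shape with assumed default tunables (100 ms group share, 40/100 ms async/sync slices, 8 ms slice_idle, 2 ms minimum); it omits the system-wide async cap that the real code derives from cfq_target_latency, and it collapses the busy_queues_avg smoothing into a single queue count.

    /* Back-of-envelope model of the workload slice computation;
     * numbers are jiffies treated as milliseconds and the defaults
     * are illustrative. */
    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    static unsigned workload_slice(unsigned group_slice,
                                   unsigned queues_in_workload,
                                   unsigned busy_queues_in_prio,
                                   int is_async,
                                   unsigned slice_async, unsigned slice_sync,
                                   unsigned slice_idle, unsigned min_tt)
    {
        /* fraction of the group slice, proportional to queue count */
        unsigned slice = group_slice * queues_in_workload
                        / MAX(busy_queues_in_prio, 1u);

        if (is_async)
            /* async workloads shrink by the async:sync slice ratio */
            slice = slice * slice_async / slice_sync;
        else
            /* sync workloads get at least two idle windows */
            slice = MAX(slice, 2 * slice_idle);

        return MAX(slice, min_tt);
    }

    int main(void)
    {
        /* 3 sync-noidle queues of 5 busy BE queues, 100 ms group share */
        printf("sync-noidle: %u ms\n",
               workload_slice(100, 3, 5, 0, 40, 100, 8, 2));
        /* 1 async queue out of the same 5 busy queues */
        printf("async:       %u ms\n",
               workload_slice(100, 1, 5, 1, 40, 100, 8, 2));
        return 0;
    }
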
+2
block/compat_ioctl.c
··· 747 747 return compat_put_uint(arg, bdev_io_opt(bdev)); 748 748 case BLKALIGNOFF: 749 749 return compat_put_int(arg, bdev_alignment_offset(bdev)); 750 + case BLKDISCARDZEROES: 751 + return compat_put_uint(arg, bdev_discard_zeroes_data(bdev)); 750 752 case BLKFLSBUF: 751 753 case BLKROSET: 752 754 case BLKDISCARD:
+2 -8
block/elevator.c
··· 154 154 155 155 spin_unlock(&elv_list_lock); 156 156 157 - if (!strcmp(name, "anticipatory")) 158 - sprintf(elv, "as-iosched"); 159 - else 160 - sprintf(elv, "%s-iosched", name); 157 + sprintf(elv, "%s-iosched", name); 161 158 162 159 request_module("%s", elv); 163 160 spin_lock(&elv_list_lock); ··· 190 193 * Be backwards-compatible with previous kernels, so users 191 194 * won't get the wrong elevator. 192 195 */ 193 - if (!strcmp(str, "as")) 194 - strcpy(chosen_elevator, "anticipatory"); 195 - else 196 - strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); 196 + strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); 197 197 return 1; 198 198 } 199 199
+12
block/genhd.c
··· 861 861 return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); 862 862 } 863 863 864 + static ssize_t disk_discard_alignment_show(struct device *dev, 865 + struct device_attribute *attr, 866 + char *buf) 867 + { 868 + struct gendisk *disk = dev_to_disk(dev); 869 + 870 + return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue)); 871 + } 872 + 864 873 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); 865 874 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); 866 875 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); 867 876 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); 868 877 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 869 878 static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); 879 + static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, 880 + NULL); 870 881 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); 871 882 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 872 883 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); ··· 898 887 &dev_attr_ro.attr, 899 888 &dev_attr_size.attr, 900 889 &dev_attr_alignment_offset.attr, 890 + &dev_attr_discard_alignment.attr, 901 891 &dev_attr_capability.attr, 902 892 &dev_attr_stat.attr, 903 893 &dev_attr_inflight.attr,
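
Note: the new discard_alignment attribute sits beside alignment_offset under /sys/block/<disk>/. A minimal userspace reader, with the disk name assumed for illustration:

    /* Read the new per-disk discard_alignment sysfs attribute; the
     * path layout follows the existing alignment_offset attribute
     * and "sda" is just an example device. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int align;
        FILE *f = fopen("/sys/block/sda/discard_alignment", "r");

        if (!f) {
            perror("open discard_alignment");
            return 1;
        }
        if (fscanf(f, "%u", &align) == 1)
            printf("discard alignment: %u\n", align);
        fclose(f);
        return 0;
    }
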
+2
block/ioctl.c
··· 280 280 return put_uint(arg, bdev_io_opt(bdev)); 281 281 case BLKALIGNOFF: 282 282 return put_int(arg, bdev_alignment_offset(bdev)); 283 + case BLKDISCARDZEROES: 284 + return put_uint(arg, bdev_discard_zeroes_data(bdev)); 283 285 case BLKSECTGET: 284 286 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); 285 287 case BLKRASET:
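
Note: both the compat path earlier and the native path here gain BLKDISCARDZEROES, which hands userspace an unsigned int saying whether discarded sectors are guaranteed to read back as zeroes. A minimal caller, assuming this kernel's <linux/fs.h> supplies the ioctl number and using an example device path:

    /* Query whether discarded blocks on a device read back as
     * zeroes. BLKDISCARDZEROES comes from <linux/fs.h> built from
     * these headers; /dev/sda is just an example. */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    int main(void)
    {
        unsigned int zeroes = 0;
        int fd = open("/dev/sda", O_RDONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (ioctl(fd, BLKDISCARDZEROES, &zeroes) < 0) {
            perror("BLKDISCARDZEROES");
            close(fd);
            return 1;
        }
        printf("discard zeroes data: %u\n", zeroes);
        close(fd);
        return 0;
    }
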
+4 -2
block/scsi_ioctl.c
··· 35 35 struct blk_cmd_filter { 36 36 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; 37 37 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; 38 - } blk_default_cmd_filter; 38 + }; 39 + 40 + static struct blk_cmd_filter blk_default_cmd_filter; 39 41 40 42 /* Command group 3 is reserved and should never be used. */ 41 43 const unsigned char scsi_command_size_tbl[8] = ··· 677 675 } 678 676 EXPORT_SYMBOL(scsi_cmd_ioctl); 679 677 680 - int __init blk_scsi_ioctl_init(void) 678 + static int __init blk_scsi_ioctl_init(void) 681 679 { 682 680 blk_set_cmd_filter_defaults(&blk_default_cmd_filter); 683 681 return 0;
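
Note: making blk_default_cmd_filter static confines the opcode filter to this file; the structure itself is just two 256-bit maps, one per permission class, indexed by the first CDB byte. A userspace-style model of how such a filter is consulted, mirroring the shape of blk_verify_command() with illustrative helper names:

    /* One bit per SCSI opcode in a read-allowed and a write-allowed
     * map: opcodes on read_ok pass for any opener, opcodes on
     * write_ok additionally require write permission. */
    #include <stdio.h>
    #include <string.h>
    #include <errno.h>

    #define CMDS_PER_LONG (8 * sizeof(unsigned long))

    struct cmd_filter {
        unsigned long read_ok[256 / CMDS_PER_LONG];
        unsigned long write_ok[256 / CMDS_PER_LONG];
    };

    static void allow(unsigned long *map, unsigned char op)
    {
        map[op / CMDS_PER_LONG] |= 1UL << (op % CMDS_PER_LONG);
    }

    static int allowed(const unsigned long *map, unsigned char op)
    {
        return (map[op / CMDS_PER_LONG] >> (op % CMDS_PER_LONG)) & 1;
    }

    static int verify_command(struct cmd_filter *f, unsigned char op,
                              int has_write_perm)
    {
        if (allowed(f->read_ok, op))
            return 0;
        if (has_write_perm && allowed(f->write_ok, op))
            return 0;
        return -EPERM;
    }

    int main(void)
    {
        struct cmd_filter f;

        memset(&f, 0, sizeof(f));
        allow(f.read_ok, 0x12);   /* INQUIRY: safe for any opener */
        allow(f.write_ok, 0x2a);  /* WRITE(10): needs write permission */

        printf("INQUIRY, read-only fd: %d\n", verify_command(&f, 0x12, 0));
        printf("WRITE10, read-only fd: %d\n", verify_command(&f, 0x2a, 0));
        printf("WRITE10, writable fd:  %d\n", verify_command(&f, 0x2a, 1));
        return 0;
    }
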
+2
drivers/block/Kconfig
··· 271 271 instead, which can be configured to be on-disk compatible with the 272 272 cryptoloop device. 273 273 274 + source "drivers/block/drbd/Kconfig" 275 + 274 276 config BLK_DEV_NBD 275 277 tristate "Network block device support" 276 278 depends on NET
+1
drivers/block/Makefile
··· 36 36 obj-$(CONFIG_BLK_DEV_HD) += hd.o 37 37 38 38 obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 39 + obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ 39 40 40 41 swim_mod-objs := swim.o swim_asm.o
+261 -283
drivers/block/cciss.c
··· 179 179 static int deregister_disk(ctlr_info_t *h, int drv_index, 180 180 int clear_all, int via_ioctl); 181 181 182 - static void cciss_read_capacity(int ctlr, int logvol, int withirq, 182 + static void cciss_read_capacity(int ctlr, int logvol, 183 183 sector_t *total_size, unsigned int *block_size); 184 - static void cciss_read_capacity_16(int ctlr, int logvol, int withirq, 184 + static void cciss_read_capacity_16(int ctlr, int logvol, 185 185 sector_t *total_size, unsigned int *block_size); 186 186 static void cciss_geometry_inquiry(int ctlr, int logvol, 187 - int withirq, sector_t total_size, 187 + sector_t total_size, 188 188 unsigned int block_size, InquiryData_struct *inq_buff, 189 189 drive_info_struct *drv); 190 190 static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *, 191 191 __u32); 192 192 static void start_io(ctlr_info_t *h); 193 - static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, 194 - __u8 page_code, unsigned char *scsi3addr, int cmd_type); 195 193 static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size, 196 194 __u8 page_code, unsigned char scsi3addr[], 197 195 int cmd_type); ··· 422 424 if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) { 423 425 struct seq_file *seq = file->private_data; 424 426 ctlr_info_t *h = seq->private; 425 - int rc; 426 427 427 - rc = cciss_engage_scsi(h->ctlr); 428 - if (rc != 0) 429 - err = -rc; 430 - else 428 + err = cciss_engage_scsi(h->ctlr); 429 + if (err == 0) 431 430 err = length; 432 431 } else 433 432 #endif /* CONFIG_CISS_SCSI_TAPE */ ··· 1652 1657 { 1653 1658 CommandList_struct *cmd = rq->completion_data; 1654 1659 ctlr_info_t *h = hba[cmd->ctlr]; 1660 + SGDescriptor_struct *curr_sg = cmd->SG; 1655 1661 unsigned long flags; 1656 1662 u64bit temp64; 1657 1663 int i, ddir; 1664 + int sg_index = 0; 1658 1665 1659 1666 if (cmd->Request.Type.Direction == XFER_READ) 1660 1667 ddir = PCI_DMA_FROMDEVICE; ··· 1666 1669 /* command did not need to be retried */ 1667 1670 /* unmap the DMA mapping for all the scatter gather elements */ 1668 1671 for (i = 0; i < cmd->Header.SGList; i++) { 1669 - temp64.val32.lower = cmd->SG[i].Addr.lower; 1670 - temp64.val32.upper = cmd->SG[i].Addr.upper; 1671 - pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir); 1672 + if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) { 1673 + temp64.val32.lower = cmd->SG[i].Addr.lower; 1674 + temp64.val32.upper = cmd->SG[i].Addr.upper; 1675 + pci_dma_sync_single_for_cpu(h->pdev, temp64.val, 1676 + cmd->SG[i].Len, ddir); 1677 + pci_unmap_single(h->pdev, temp64.val, 1678 + cmd->SG[i].Len, ddir); 1679 + /* Point to the next block */ 1680 + curr_sg = h->cmd_sg_list[cmd->cmdindex]->sgchain; 1681 + sg_index = 0; 1682 + } 1683 + temp64.val32.lower = curr_sg[sg_index].Addr.lower; 1684 + temp64.val32.upper = curr_sg[sg_index].Addr.upper; 1685 + pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len, 1686 + ddir); 1687 + ++sg_index; 1672 1688 } 1673 1689 1674 1690 #ifdef CCISS_DEBUG ··· 1711 1701 * via the inquiry page 0. Model, vendor, and rev are set to empty strings if 1712 1702 * they cannot be read. 
1713 1703 */ 1714 - static void cciss_get_device_descr(int ctlr, int logvol, int withirq, 1704 + static void cciss_get_device_descr(int ctlr, int logvol, 1715 1705 char *vendor, char *model, char *rev) 1716 1706 { 1717 1707 int rc; ··· 1727 1717 return; 1728 1718 1729 1719 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 1730 - if (withirq) 1731 - rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, 1732 - sizeof(InquiryData_struct), 0, 1733 - scsi3addr, TYPE_CMD); 1734 - else 1735 - rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf, 1736 - sizeof(InquiryData_struct), 0, 1737 - scsi3addr, TYPE_CMD); 1720 + rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, sizeof(*inq_buf), 0, 1721 + scsi3addr, TYPE_CMD); 1738 1722 if (rc == IO_OK) { 1739 1723 memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN); 1740 1724 vendor[VENDOR_LEN] = '\0'; ··· 1747 1743 * number cannot be had, for whatever reason, 16 bytes of 0xff 1748 1744 * are returned instead. 1749 1745 */ 1750 - static void cciss_get_serial_no(int ctlr, int logvol, int withirq, 1746 + static void cciss_get_serial_no(int ctlr, int logvol, 1751 1747 unsigned char *serial_no, int buflen) 1752 1748 { 1753 1749 #define PAGE_83_INQ_BYTES 64 ··· 1763 1759 return; 1764 1760 memset(serial_no, 0, buflen); 1765 1761 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 1766 - if (withirq) 1767 - rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf, 1768 - PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); 1769 - else 1770 - rc = sendcmd(CISS_INQUIRY, ctlr, buf, 1771 - PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); 1762 + rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf, 1763 + PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); 1772 1764 if (rc == IO_OK) 1773 1765 memcpy(serial_no, &buf[8], buflen); 1774 1766 kfree(buf); ··· 1793 1793 blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); 1794 1794 1795 1795 /* This is a hardware imposed limit. */ 1796 - blk_queue_max_hw_segments(disk->queue, MAXSGENTRIES); 1796 + blk_queue_max_hw_segments(disk->queue, h->maxsgentries); 1797 1797 1798 1798 /* This is a limit in the driver and could be eliminated. 
*/ 1799 - blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES); 1799 + blk_queue_max_phys_segments(disk->queue, h->maxsgentries); 1800 1800 1801 1801 blk_queue_max_sectors(disk->queue, h->cciss_max_sectors); 1802 1802 ··· 1852 1852 1853 1853 /* testing to see if 16-byte CDBs are already being used */ 1854 1854 if (h->cciss_read == CCISS_READ_16) { 1855 - cciss_read_capacity_16(h->ctlr, drv_index, 1, 1855 + cciss_read_capacity_16(h->ctlr, drv_index, 1856 1856 &total_size, &block_size); 1857 1857 1858 1858 } else { 1859 - cciss_read_capacity(ctlr, drv_index, 1, 1860 - &total_size, &block_size); 1861 - 1859 + cciss_read_capacity(ctlr, drv_index, &total_size, &block_size); 1862 1860 /* if read_capacity returns all F's this volume is >2TB */ 1863 1861 /* in size so we switch to 16-byte CDB's for all */ 1864 1862 /* read/write ops */ 1865 1863 if (total_size == 0xFFFFFFFFULL) { 1866 - cciss_read_capacity_16(ctlr, drv_index, 1, 1864 + cciss_read_capacity_16(ctlr, drv_index, 1867 1865 &total_size, &block_size); 1868 1866 h->cciss_read = CCISS_READ_16; 1869 1867 h->cciss_write = CCISS_WRITE_16; ··· 1871 1873 } 1872 1874 } 1873 1875 1874 - cciss_geometry_inquiry(ctlr, drv_index, 1, total_size, block_size, 1876 + cciss_geometry_inquiry(ctlr, drv_index, total_size, block_size, 1875 1877 inq_buff, drvinfo); 1876 1878 drvinfo->block_size = block_size; 1877 1879 drvinfo->nr_blocks = total_size + 1; 1878 1880 1879 - cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor, 1881 + cciss_get_device_descr(ctlr, drv_index, drvinfo->vendor, 1880 1882 drvinfo->model, drvinfo->rev); 1881 - cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no, 1883 + cciss_get_serial_no(ctlr, drv_index, drvinfo->serial_no, 1882 1884 sizeof(drvinfo->serial_no)); 1883 1885 /* Save the lunid in case we deregister the disk, below. 
*/ 1884 1886 memcpy(drvinfo->LunID, h->drv[drv_index]->LunID, ··· 2529 2531 case 0: return IO_OK; /* no sense */ 2530 2532 case 1: return IO_OK; /* recovered error */ 2531 2533 default: 2534 + if (check_for_unit_attention(h, c)) 2535 + return IO_NEEDS_RETRY; 2532 2536 printk(KERN_WARNING "cciss%d: cmd 0x%02x " 2533 2537 "check condition, sense key = 0x%02x\n", 2534 2538 h->ctlr, c->Request.CDB[0], ··· 2672 2672 } 2673 2673 2674 2674 static void cciss_geometry_inquiry(int ctlr, int logvol, 2675 - int withirq, sector_t total_size, 2675 + sector_t total_size, 2676 2676 unsigned int block_size, 2677 2677 InquiryData_struct *inq_buff, 2678 2678 drive_info_struct *drv) ··· 2683 2683 2684 2684 memset(inq_buff, 0, sizeof(InquiryData_struct)); 2685 2685 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 2686 - if (withirq) 2687 - return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, 2688 - inq_buff, sizeof(*inq_buff), 2689 - 0xC1, scsi3addr, TYPE_CMD); 2690 - else 2691 - return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff, 2692 - sizeof(*inq_buff), 0xC1, scsi3addr, 2693 - TYPE_CMD); 2686 + return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buff, 2687 + sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD); 2694 2688 if (return_code == IO_OK) { 2695 2689 if (inq_buff->data_byte[8] == 0xFF) { 2696 2690 printk(KERN_WARNING ··· 2717 2723 } 2718 2724 2719 2725 static void 2720 - cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, 2726 + cciss_read_capacity(int ctlr, int logvol, sector_t *total_size, 2721 2727 unsigned int *block_size) 2722 2728 { 2723 2729 ReadCapdata_struct *buf; ··· 2731 2737 } 2732 2738 2733 2739 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 2734 - if (withirq) 2735 - return_code = sendcmd_withirq(CCISS_READ_CAPACITY, 2736 - ctlr, buf, sizeof(ReadCapdata_struct), 2737 - 0, scsi3addr, TYPE_CMD); 2738 - else 2739 - return_code = sendcmd(CCISS_READ_CAPACITY, 2740 - ctlr, buf, sizeof(ReadCapdata_struct), 2741 - 0, scsi3addr, TYPE_CMD); 2740 + return_code = sendcmd_withirq(CCISS_READ_CAPACITY, ctlr, buf, 2741 + sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD); 2742 2742 if (return_code == IO_OK) { 2743 2743 *total_size = be32_to_cpu(*(__be32 *) buf->total_size); 2744 2744 *block_size = be32_to_cpu(*(__be32 *) buf->block_size); ··· 2744 2756 kfree(buf); 2745 2757 } 2746 2758 2747 - static void 2748 - cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, unsigned int *block_size) 2759 + static void cciss_read_capacity_16(int ctlr, int logvol, 2760 + sector_t *total_size, unsigned int *block_size) 2749 2761 { 2750 2762 ReadCapdata_struct_16 *buf; 2751 2763 int return_code; ··· 2758 2770 } 2759 2771 2760 2772 log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); 2761 - if (withirq) { 2762 - return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16, 2763 - ctlr, buf, sizeof(ReadCapdata_struct_16), 2764 - 0, scsi3addr, TYPE_CMD); 2765 - } 2766 - else { 2767 - return_code = sendcmd(CCISS_READ_CAPACITY_16, 2768 - ctlr, buf, sizeof(ReadCapdata_struct_16), 2769 - 0, scsi3addr, TYPE_CMD); 2770 - } 2773 + return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16, 2774 + ctlr, buf, sizeof(ReadCapdata_struct_16), 2775 + 0, scsi3addr, TYPE_CMD); 2771 2776 if (return_code == IO_OK) { 2772 2777 *total_size = be64_to_cpu(*(__be64 *) buf->total_size); 2773 2778 *block_size = be32_to_cpu(*(__be32 *) buf->block_size); ··· 2801 2820 return 1; 2802 2821 } 2803 2822 if (h->cciss_read == CCISS_READ_10) { 2804 - cciss_read_capacity(h->ctlr, logvol, 1, 2823 + 
cciss_read_capacity(h->ctlr, logvol, 2805 2824 &total_size, &block_size); 2806 2825 } else { 2807 - cciss_read_capacity_16(h->ctlr, logvol, 1, 2826 + cciss_read_capacity_16(h->ctlr, logvol, 2808 2827 &total_size, &block_size); 2809 2828 } 2810 - cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size, 2829 + cciss_geometry_inquiry(h->ctlr, logvol, total_size, block_size, 2811 2830 inq_buff, drv); 2812 2831 2813 2832 blk_queue_logical_block_size(drv->queue, drv->block_size); ··· 2815 2834 2816 2835 kfree(inq_buff); 2817 2836 return 0; 2818 - } 2819 - 2820 - /* 2821 - * Wait polling for a command to complete. 2822 - * The memory mapped FIFO is polled for the completion. 2823 - * Used only at init time, interrupts from the HBA are disabled. 2824 - */ 2825 - static unsigned long pollcomplete(int ctlr) 2826 - { 2827 - unsigned long done; 2828 - int i; 2829 - 2830 - /* Wait (up to 20 seconds) for a command to complete */ 2831 - 2832 - for (i = 20 * HZ; i > 0; i--) { 2833 - done = hba[ctlr]->access.command_completed(hba[ctlr]); 2834 - if (done == FIFO_EMPTY) 2835 - schedule_timeout_uninterruptible(1); 2836 - else 2837 - return done; 2838 - } 2839 - /* Invalid address to tell caller we ran out of time */ 2840 - return 1; 2841 - } 2842 - 2843 - /* Send command c to controller h and poll for it to complete. 2844 - * Turns interrupts off on the board. Used at driver init time 2845 - * and during SCSI error recovery. 2846 - */ 2847 - static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c) 2848 - { 2849 - int i; 2850 - unsigned long complete; 2851 - int status = IO_ERROR; 2852 - u64bit buff_dma_handle; 2853 - 2854 - resend_cmd1: 2855 - 2856 - /* Disable interrupt on the board. */ 2857 - h->access.set_intr_mask(h, CCISS_INTR_OFF); 2858 - 2859 - /* Make sure there is room in the command FIFO */ 2860 - /* Actually it should be completely empty at this time */ 2861 - /* unless we are in here doing error handling for the scsi */ 2862 - /* tape side of the driver. */ 2863 - for (i = 200000; i > 0; i--) { 2864 - /* if fifo isn't full go */ 2865 - if (!(h->access.fifo_full(h))) 2866 - break; 2867 - udelay(10); 2868 - printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full," 2869 - " waiting!\n", h->ctlr); 2870 - } 2871 - h->access.submit_command(h, c); /* Send the cmd */ 2872 - do { 2873 - complete = pollcomplete(h->ctlr); 2874 - 2875 - #ifdef CCISS_DEBUG 2876 - printk(KERN_DEBUG "cciss: command completed\n"); 2877 - #endif /* CCISS_DEBUG */ 2878 - 2879 - if (complete == 1) { 2880 - printk(KERN_WARNING 2881 - "cciss cciss%d: SendCmd Timeout out, " 2882 - "No command list address returned!\n", h->ctlr); 2883 - status = IO_ERROR; 2884 - break; 2885 - } 2886 - 2887 - /* Make sure it's the command we're expecting. */ 2888 - if ((complete & ~CISS_ERROR_BIT) != c->busaddr) { 2889 - printk(KERN_WARNING "cciss%d: Unexpected command " 2890 - "completion.\n", h->ctlr); 2891 - continue; 2892 - } 2893 - 2894 - /* It is our command. If no error, we're done. */ 2895 - if (!(complete & CISS_ERROR_BIT)) { 2896 - status = IO_OK; 2897 - break; 2898 - } 2899 - 2900 - /* There is an error... 
*/ 2901 - 2902 - /* if data overrun or underun on Report command ignore it */ 2903 - if (((c->Request.CDB[0] == CISS_REPORT_LOG) || 2904 - (c->Request.CDB[0] == CISS_REPORT_PHYS) || 2905 - (c->Request.CDB[0] == CISS_INQUIRY)) && 2906 - ((c->err_info->CommandStatus == CMD_DATA_OVERRUN) || 2907 - (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) { 2908 - complete = c->busaddr; 2909 - status = IO_OK; 2910 - break; 2911 - } 2912 - 2913 - if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) { 2914 - printk(KERN_WARNING "cciss%d: unsolicited abort %p\n", 2915 - h->ctlr, c); 2916 - if (c->retry_count < MAX_CMD_RETRIES) { 2917 - printk(KERN_WARNING "cciss%d: retrying %p\n", 2918 - h->ctlr, c); 2919 - c->retry_count++; 2920 - /* erase the old error information */ 2921 - memset(c->err_info, 0, sizeof(c->err_info)); 2922 - goto resend_cmd1; 2923 - } 2924 - printk(KERN_WARNING "cciss%d: retried %p too many " 2925 - "times\n", h->ctlr, c); 2926 - status = IO_ERROR; 2927 - break; 2928 - } 2929 - 2930 - if (c->err_info->CommandStatus == CMD_UNABORTABLE) { 2931 - printk(KERN_WARNING "cciss%d: command could not be " 2932 - "aborted.\n", h->ctlr); 2933 - status = IO_ERROR; 2934 - break; 2935 - } 2936 - 2937 - if (c->err_info->CommandStatus == CMD_TARGET_STATUS) { 2938 - status = check_target_status(h, c); 2939 - break; 2940 - } 2941 - 2942 - printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr); 2943 - printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n", 2944 - c->Request.CDB[0], c->err_info->CommandStatus); 2945 - status = IO_ERROR; 2946 - break; 2947 - 2948 - } while (1); 2949 - 2950 - /* unlock the data buffer from DMA */ 2951 - buff_dma_handle.val32.lower = c->SG[0].Addr.lower; 2952 - buff_dma_handle.val32.upper = c->SG[0].Addr.upper; 2953 - pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val, 2954 - c->SG[0].Len, PCI_DMA_BIDIRECTIONAL); 2955 - return status; 2956 - } 2957 - 2958 - /* 2959 - * Send a command to the controller, and wait for it to complete. 2960 - * Used at init time, and during SCSI error recovery. 2961 - */ 2962 - static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, 2963 - __u8 page_code, unsigned char *scsi3addr, int cmd_type) 2964 - { 2965 - CommandList_struct *c; 2966 - int status; 2967 - 2968 - c = cmd_alloc(hba[ctlr], 1); 2969 - if (!c) { 2970 - printk(KERN_WARNING "cciss: unable to get memory"); 2971 - return IO_ERROR; 2972 - } 2973 - status = fill_cmd(c, cmd, ctlr, buff, size, page_code, 2974 - scsi3addr, cmd_type); 2975 - if (status == IO_OK) 2976 - status = sendcmd_core(hba[ctlr], c); 2977 - cmd_free(hba[ctlr], c, 1); 2978 - return status; 2979 2837 } 2980 2838 2981 2839 /* ··· 3075 3255 int seg; 3076 3256 struct request *creq; 3077 3257 u64bit temp64; 3078 - struct scatterlist tmp_sg[MAXSGENTRIES]; 3258 + struct scatterlist *tmp_sg; 3259 + SGDescriptor_struct *curr_sg; 3079 3260 drive_info_struct *drv; 3080 3261 int i, dir; 3262 + int nseg = 0; 3263 + int sg_index = 0; 3264 + int chained = 0; 3081 3265 3082 3266 /* We call start_io here in case there is a command waiting on the 3083 3267 * queue that has not been sent. 
··· 3094 3270 if (!creq) 3095 3271 goto startio; 3096 3272 3097 - BUG_ON(creq->nr_phys_segments > MAXSGENTRIES); 3273 + BUG_ON(creq->nr_phys_segments > h->maxsgentries); 3098 3274 3099 3275 if ((c = cmd_alloc(h, 1)) == NULL) 3100 3276 goto full; 3101 3277 3102 3278 blk_start_request(creq); 3103 3279 3280 + tmp_sg = h->scatter_list[c->cmdindex]; 3104 3281 spin_unlock_irq(q->queue_lock); 3105 3282 3106 3283 c->cmd_type = CMD_RWREQ; ··· 3130 3305 (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq)); 3131 3306 #endif /* CCISS_DEBUG */ 3132 3307 3133 - sg_init_table(tmp_sg, MAXSGENTRIES); 3308 + sg_init_table(tmp_sg, h->maxsgentries); 3134 3309 seg = blk_rq_map_sg(q, creq, tmp_sg); 3135 3310 3136 3311 /* get the DMA records for the setup */ ··· 3139 3314 else 3140 3315 dir = PCI_DMA_TODEVICE; 3141 3316 3317 + curr_sg = c->SG; 3318 + sg_index = 0; 3319 + chained = 0; 3320 + 3142 3321 for (i = 0; i < seg; i++) { 3143 - c->SG[i].Len = tmp_sg[i].length; 3322 + if (((sg_index+1) == (h->max_cmd_sgentries)) && 3323 + !chained && ((seg - i) > 1)) { 3324 + nseg = seg - i; 3325 + curr_sg[sg_index].Len = (nseg) * 3326 + sizeof(SGDescriptor_struct); 3327 + curr_sg[sg_index].Ext = CCISS_SG_CHAIN; 3328 + 3329 + /* Point to next chain block. */ 3330 + curr_sg = h->cmd_sg_list[c->cmdindex]->sgchain; 3331 + sg_index = 0; 3332 + chained = 1; 3333 + } 3334 + curr_sg[sg_index].Len = tmp_sg[i].length; 3144 3335 temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]), 3145 - tmp_sg[i].offset, 3146 - tmp_sg[i].length, dir); 3147 - c->SG[i].Addr.lower = temp64.val32.lower; 3148 - c->SG[i].Addr.upper = temp64.val32.upper; 3149 - c->SG[i].Ext = 0; // we are not chaining 3336 + tmp_sg[i].offset, 3337 + tmp_sg[i].length, dir); 3338 + curr_sg[sg_index].Addr.lower = temp64.val32.lower; 3339 + curr_sg[sg_index].Addr.upper = temp64.val32.upper; 3340 + curr_sg[sg_index].Ext = 0; /* we are not chaining */ 3341 + 3342 + ++sg_index; 3150 3343 } 3344 + 3345 + if (chained) { 3346 + int len; 3347 + curr_sg = c->SG; 3348 + sg_index = h->max_cmd_sgentries - 1; 3349 + len = curr_sg[sg_index].Len; 3350 + /* Setup pointer to next chain block. 3351 + * Fill out last element in current chain 3352 + * block with address of next chain block. 3353 + */ 3354 + temp64.val = pci_map_single(h->pdev, 3355 + h->cmd_sg_list[c->cmdindex]->sgchain, 3356 + len, dir); 3357 + 3358 + h->cmd_sg_list[c->cmdindex]->sg_chain_dma = temp64.val; 3359 + curr_sg[sg_index].Addr.lower = temp64.val32.lower; 3360 + curr_sg[sg_index].Addr.upper = temp64.val32.upper; 3361 + 3362 + pci_dma_sync_single_for_device(h->pdev, 3363 + h->cmd_sg_list[c->cmdindex]->sg_chain_dma, 3364 + len, dir); 3365 + } 3366 + 3151 3367 /* track how many SG entries we are using */ 3152 3368 if (seg > h->maxSG) 3153 3369 h->maxSG = seg; 3154 3370 3155 3371 #ifdef CCISS_DEBUG 3156 - printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n", 3157 - blk_rq_sectors(creq), seg); 3372 + printk(KERN_DEBUG "cciss: Submitting %ld sectors in %d segments " 3373 + "chained[%d]\n", 3374 + blk_rq_sectors(creq), seg, chained); 3158 3375 #endif /* CCISS_DEBUG */ 3159 3376 3160 - c->Header.SGList = c->Header.SGTotal = seg; 3377 + c->Header.SGList = c->Header.SGTotal = seg + chained; 3378 + if (seg > h->max_cmd_sgentries) 3379 + c->Header.SGList = h->max_cmd_sgentries; 3380 + 3161 3381 if (likely(blk_fs_request(creq))) { 3162 3382 if(h->cciss_read == CCISS_READ_10) { 3163 3383 c->Request.CDB[1] = 0; ··· 3383 3513 * @h: Pointer to the controller. 
3384 3514 * 3385 3515 * Removes the controller from the rescan queue if present. Blocks if 3386 - * the controller is currently conducting a rescan. 3516 + * the controller is currently conducting a rescan. The controller 3517 + * can be in one of three states: 3518 + * 1. Doesn't need a scan 3519 + * 2. On the scan list, but not scanning yet (we remove it) 3520 + * 3. Busy scanning (and not on the list). In this case we want to wait for 3521 + * the scan to complete to make sure the scanning thread for this 3522 + * controller is completely idle. 3387 3523 **/ 3388 3524 static void remove_from_scan_list(struct ctlr_info *h) 3389 3525 { 3390 3526 struct ctlr_info *test_h, *tmp_h; 3391 - int scanning = 0; 3392 3527 3393 3528 mutex_lock(&scan_mutex); 3394 3529 list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) { 3395 - if (test_h == h) { 3530 + if (test_h == h) { /* state 2. */ 3396 3531 list_del(&h->scan_list); 3397 3532 complete_all(&h->scan_wait); 3398 3533 mutex_unlock(&scan_mutex); 3399 3534 return; 3400 3535 } 3401 3536 } 3402 - if (&h->busy_scanning) 3403 - scanning = 0; 3404 - mutex_unlock(&scan_mutex); 3405 - 3406 - if (scanning) 3537 + if (h->busy_scanning) { /* state 3. */ 3538 + mutex_unlock(&scan_mutex); 3407 3539 wait_for_completion(&h->scan_wait); 3540 + } else { /* state 1, nothing to do. */ 3541 + mutex_unlock(&scan_mutex); 3542 + } 3408 3543 } 3409 3544 3410 3545 /** ··· 3448 3573 h->busy_scanning = 1; 3449 3574 mutex_unlock(&scan_mutex); 3450 3575 3451 - if (h) { 3452 - rebuild_lun_table(h, 0, 0); 3453 - complete_all(&h->scan_wait); 3454 - mutex_lock(&scan_mutex); 3455 - h->busy_scanning = 0; 3456 - mutex_unlock(&scan_mutex); 3457 - } 3576 + rebuild_lun_table(h, 0, 0); 3577 + complete_all(&h->scan_wait); 3578 + mutex_lock(&scan_mutex); 3579 + h->busy_scanning = 0; 3580 + mutex_unlock(&scan_mutex); 3458 3581 } 3459 3582 } 3460 3583 ··· 3478 3605 case REPORT_LUNS_CHANGED: 3479 3606 printk(KERN_WARNING "cciss%d: report LUN data " 3480 3607 "changed\n", h->ctlr); 3481 - add_to_scan_list(h); 3482 - wake_up_process(cciss_scan_thread); 3608 + /* 3609 + * Here, we could call add_to_scan_list and wake up the scan thread, 3610 + * except that it's quite likely that we will get more than one 3611 + * REPORT_LUNS_CHANGED condition in quick succession, which means 3612 + * that those which occur after the first one will likely happen 3613 + * *during* the scan_thread's rescan. And the rescan code is not 3614 + * robust enough to restart in the middle, undoing what it has already 3615 + * done, and it's not clear that it's even possible to do this, since 3616 + * part of what it does is notify the block layer, which starts 3617 + * doing its own i/o to read partition tables and so on, and the 3618 + * driver doesn't have visibility to know what might need undoing. 3619 + * In any event, even if it were possible, it would be horribly 3620 + * complicated to get right, so we just don't do it for now. 3621 + * 3622 + * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012. 3623 + */ 3483 3624 return 1; 3484 3625 break; 3485 3626 case POWER_OR_RESET: ··· 3775 3888 * leave a little room for ioctl calls. 3776 3889 */ 3777 3890 c->max_commands = readl(&(c->cfgtable->CmdsOutMax)); 3891 + c->maxsgentries = readl(&(c->cfgtable->MaxSGElements)); 3892 + 3893 + /* 3894 + * Limit native command to 32 s/g elements to save dma'able memory.
3895 + * However, the spec says if 0, use 31 3896 + */ 3897 + 3898 + c->max_cmd_sgentries = 31; 3899 + if (c->maxsgentries > 512) { 3900 + c->max_cmd_sgentries = 32; 3901 + c->chainsize = c->maxsgentries - c->max_cmd_sgentries + 1; 3902 + c->maxsgentries -= 1; /* account for chain pointer */ 3903 + } else { 3904 + c->maxsgentries = 31; /* Default to traditional value */ 3905 + c->chainsize = 0; /* traditional */ 3906 + } 3907 + 3778 3908 c->product_name = products[prod_index].product_name; 3779 3909 c->access = *(products[prod_index].access); 3780 3910 c->nr_cmds = c->max_commands - 4; ··· 4118 4214 { 4119 4215 int i; 4120 4216 int j = 0; 4217 + int k = 0; 4121 4218 int rc; 4122 4219 int dac, return_code; 4123 4220 InquiryData_struct *inq_buff; ··· 4222 4317 printk(KERN_ERR "cciss: out of memory"); 4223 4318 goto clean4; 4224 4319 } 4320 + 4321 + /* Need space for temp scatter list */ 4322 + hba[i]->scatter_list = kmalloc(hba[i]->max_commands * 4323 + sizeof(struct scatterlist *), 4324 + GFP_KERNEL); 4325 + for (k = 0; k < hba[i]->nr_cmds; k++) { 4326 + hba[i]->scatter_list[k] = kmalloc(sizeof(struct scatterlist) * 4327 + hba[i]->maxsgentries, 4328 + GFP_KERNEL); 4329 + if (hba[i]->scatter_list[k] == NULL) { 4330 + printk(KERN_ERR "cciss%d: could not allocate " 4331 + "s/g lists\n", i); 4332 + goto clean4; 4333 + } 4334 + } 4335 + hba[i]->cmd_sg_list = kmalloc(sizeof(struct Cmd_sg_list *) * 4336 + hba[i]->nr_cmds, 4337 + GFP_KERNEL); 4338 + if (!hba[i]->cmd_sg_list) { 4339 + printk(KERN_ERR "cciss%d: Cannot get memory for " 4340 + "s/g chaining.\n", i); 4341 + goto clean4; 4342 + } 4343 + /* Build up chain blocks for each command */ 4344 + if (hba[i]->chainsize > 0) { 4345 + for (j = 0; j < hba[i]->nr_cmds; j++) { 4346 + hba[i]->cmd_sg_list[j] = 4347 + kmalloc(sizeof(struct Cmd_sg_list), 4348 + GFP_KERNEL); 4349 + if (!hba[i]->cmd_sg_list[j]) { 4350 + printk(KERN_ERR "cciss%d: Cannot get memory " 4351 + "for chain block.\n", i); 4352 + goto clean4; 4353 + } 4354 + /* Need a block of chainsized s/g elements. */ 4355 + hba[i]->cmd_sg_list[j]->sgchain = 4356 + kmalloc((hba[i]->chainsize * 4357 + sizeof(SGDescriptor_struct)), 4358 + GFP_KERNEL); 4359 + if (!hba[i]->cmd_sg_list[j]->sgchain) { 4360 + printk(KERN_ERR "cciss%d: Cannot get memory " 4361 + "for s/g chains\n", i); 4362 + goto clean4; 4363 + } 4364 + } 4365 + } 4366 + 4225 4367 spin_lock_init(&hba[i]->lock); 4226 4368 4227 4369 /* Initialize the pdev driver private data.
··· 4314 4362 4315 4363 cciss_procinit(i); 4316 4364 4317 - hba[i]->cciss_max_sectors = 2048; 4365 + hba[i]->cciss_max_sectors = 8192; 4318 4366 4319 4367 rebuild_lun_table(hba[i], 1, 0); 4320 4368 hba[i]->busy_initializing = 0; ··· 4322 4370 4323 4371 clean4: 4324 4372 kfree(hba[i]->cmd_pool_bits); 4373 + /* Free up sg elements */ 4374 + for (k = 0; k < hba[i]->nr_cmds; k++) 4375 + kfree(hba[i]->scatter_list[k]); 4376 + kfree(hba[i]->scatter_list); 4377 + /* Only free up extra s/g lists if controller supports them */ 4378 + if (hba[i]->chainsize > 0) { 4379 + for (j = 0; j < hba[i]->nr_cmds; j++) { 4380 + if (hba[i]->cmd_sg_list[j]) { 4381 + kfree(hba[i]->cmd_sg_list[j]->sgchain); 4382 + kfree(hba[i]->cmd_sg_list[j]); 4383 + } 4384 + } 4385 + kfree(hba[i]->cmd_sg_list); 4386 + } 4325 4387 if (hba[i]->cmd_pool) 4326 4388 pci_free_consistent(hba[i]->pdev, 4327 4389 hba[i]->nr_cmds * sizeof(CommandList_struct), ··· 4366 4400 4367 4401 static void cciss_shutdown(struct pci_dev *pdev) 4368 4402 { 4369 - ctlr_info_t *tmp_ptr; 4370 - int i; 4371 - char flush_buf[4]; 4403 + ctlr_info_t *h; 4404 + char *flush_buf; 4372 4405 int return_code; 4373 4406 4374 - tmp_ptr = pci_get_drvdata(pdev); 4375 - if (tmp_ptr == NULL) 4407 + h = pci_get_drvdata(pdev); 4408 + flush_buf = kzalloc(4, GFP_KERNEL); 4409 + if (!flush_buf) { 4410 + printk(KERN_WARNING 4411 + "cciss:%d cache not flushed, out of memory.\n", 4412 + h->ctlr); 4376 4413 return; 4377 - i = tmp_ptr->ctlr; 4378 - if (hba[i] == NULL) 4379 - return; 4380 - 4381 - /* Turn board interrupts off and send the flush cache command */ 4382 - /* sendcmd will turn off interrupt, and send the flush... 4383 - * To write all data in the battery backed cache to disks */ 4384 - memset(flush_buf, 0, 4); 4385 - return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0, 4386 - CTLR_LUNID, TYPE_CMD); 4387 - if (return_code == IO_OK) { 4388 - printk(KERN_INFO "Completed flushing cache on controller %d\n", i); 4389 - } else { 4390 - printk(KERN_WARNING "Error flushing cache on controller %d\n", i); 4391 4414 } 4392 - free_irq(hba[i]->intr[2], hba[i]); 4415 + /* write all data in the battery backed cache to disk */ 4416 + memset(flush_buf, 0, 4); 4417 + return_code = sendcmd_withirq(CCISS_CACHE_FLUSH, h->ctlr, flush_buf, 4418 + 4, 0, CTLR_LUNID, TYPE_CMD); 4419 + kfree(flush_buf); 4420 + if (return_code != IO_OK) 4421 + printk(KERN_WARNING "cciss%d: Error flushing cache\n", 4422 + h->ctlr); 4423 + h->access.set_intr_mask(h, CCISS_INTR_OFF); 4424 + free_irq(h->intr[2], h); 4393 4425 } 4394 4426 4395 4427 static void __devexit cciss_remove_one(struct pci_dev *pdev) ··· 4449 4485 pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct), 4450 4486 hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle); 4451 4487 kfree(hba[i]->cmd_pool_bits); 4488 + /* Free up sg elements */ 4489 + for (j = 0; j < hba[i]->nr_cmds; j++) 4490 + kfree(hba[i]->scatter_list[j]); 4491 + kfree(hba[i]->scatter_list); 4492 + /* Only free up extra s/g lists if controller supports them */ 4493 + if (hba[i]->chainsize > 0) { 4494 + for (j = 0; j < hba[i]->nr_cmds; j++) { 4495 + if (hba[i]->cmd_sg_list[j]) { 4496 + kfree(hba[i]->cmd_sg_list[j]->sgchain); 4497 + kfree(hba[i]->cmd_sg_list[j]); 4498 + } 4499 + } 4500 + kfree(hba[i]->cmd_sg_list); 4501 + } 4452 4502 /* 4453 4503 * Deliberately omit pci_disable_device(): it does something nasty to 4454 4504 * Smart Array controllers that pci_enable_device does not undo
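The do_cciss_request() hunk above is the heart of this series: when a request maps to more scatter/gather segments than fit in the command block (h->max_cmd_sgentries), the last embedded descriptor is converted into a pointer to an external chain block, flagged with CCISS_SG_CHAIN. Below is a minimal, hedged user-space sketch of that packing scheme; sg_entry, MAX_EMBEDDED, SG_CHAIN_FLAG and pack_sg are illustrative stand-ins for the driver's SGDescriptor_struct, max_cmd_sgentries, CCISS_SG_CHAIN and the loop in do_cciss_request, and plain pointers stand in for the pci_map_single()'d DMA addresses the controller actually sees.

#include <stdio.h>

#define MAX_EMBEDDED   32           /* slots in the command itself */
#define SG_CHAIN_FLAG  0x80000000u  /* marks a slot as a chain pointer */

struct sg_entry {
	unsigned long addr;   /* stand-in for a DMA address */
	unsigned int len;
	unsigned int ext;     /* 0 = data, SG_CHAIN_FLAG = chain pointer */
};

/* Pack nseg data descriptors; if they do not all fit in the embedded
 * list, the last embedded slot describes the external chain block and
 * the remaining descriptors continue there. Returns 1 if chained. */
static int pack_sg(struct sg_entry *embedded, struct sg_entry *chain,
                   const struct sg_entry *data, int nseg)
{
	struct sg_entry *cur = embedded;
	int slot = 0, chained = 0, i;

	for (i = 0; i < nseg; i++) {
		if (slot + 1 == MAX_EMBEDDED && !chained && (nseg - i) > 1) {
			cur[slot].addr = (unsigned long)chain;
			cur[slot].len = (nseg - i) * sizeof(struct sg_entry);
			cur[slot].ext = SG_CHAIN_FLAG;
			cur = chain;          /* keep filling the chain block */
			slot = 0;
			chained = 1;
		}
		cur[slot++] = data[i];
	}
	return chained;
}

int main(void)
{
	struct sg_entry data[40], embedded[MAX_EMBEDDED], chain[40];
	int i;

	for (i = 0; i < 40; i++) {
		data[i].addr = 0x1000ul * i;
		data[i].len = 4096;
		data[i].ext = 0;
	}
	/* 40 segments > 32 slots: slot 31 becomes the chain pointer. */
	printf("chained = %d\n", pack_sg(embedded, chain, data, 40));
	return 0;
}

This also explains the Header.SGList/SGTotal accounting in the hunk: SGTotal counts the data descriptors plus the chain entry (seg + chained), while SGList is clamped to the embedded capacity.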
+17 -1
drivers/block/cciss.h
··· 55 55 char device_initialized; /* indicates whether dev is initialized */ 56 56 } drive_info_struct; 57 57 58 - struct ctlr_info 58 + struct Cmd_sg_list { 59 + SGDescriptor_struct *sgchain; 60 + dma_addr_t sg_chain_dma; 61 + int chain_block_size; 62 + }; 63 + 64 + struct ctlr_info 59 65 { 60 66 int ctlr; 61 67 char devname[8]; ··· 81 75 int num_luns; 82 76 int highest_lun; 83 77 int usage_count; /* number of opens on all minor devices */ 78 + /* Need space for temp sg list 79 + * number of scatter/gathers supported 80 + * number of scatter/gathers in chained block 81 + */ 82 + struct scatterlist **scatter_list; 83 + int maxsgentries; 84 + int chainsize; 85 + int max_cmd_sgentries; 86 + struct Cmd_sg_list **cmd_sg_list; 87 + 84 88 # define DOORBELL_INT 0 85 89 # define PERF_MODE_INT 1 86 90 # define SIMPLE_MODE_INT 2
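The three limits declared above (maxsgentries, chainsize, max_cmd_sgentries) are derived from the controller's config table in the cciss_pci_init() hunk earlier in this diff. A hedged sketch of that sizing rule, pulled out as a standalone helper (the function name and out-parameters are ours, not the driver's):

#include <stdio.h>

/* reported: MaxSGElements from the controller's config table. */
static void size_sg_limits(unsigned int reported, int *max_cmd_sg,
                           int *max_sg, int *chainsize)
{
	if (reported > 512) {
		*max_cmd_sg = 32;                    /* slots embedded in the command */
		*chainsize = (int)reported - 32 + 1; /* entries in the chain block */
		*max_sg = (int)reported - 1;         /* one slot pays for the pointer */
	} else {
		*max_cmd_sg = 31;                    /* traditional controllers */
		*max_sg = 31;
		*chainsize = 0;                      /* no chaining */
	}
}

int main(void)
{
	int mc, ms, cs;

	size_sg_limits(528, &mc, &ms, &cs);
	printf("max_cmd_sg=%d max_sg=%d chainsize=%d\n", mc, ms, cs);
	/* prints: max_cmd_sg=32 max_sg=527 chainsize=497 */
	return 0;
}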
+6 -1
drivers/block/cciss_cmd.h
··· 7 7 8 8 //general boundary definitions 9 9 #define SENSEINFOBYTES 32//note that this value may vary between host implementations 10 - #define MAXSGENTRIES 31 10 + #define MAXSGENTRIES 32 11 + #define CCISS_SG_CHAIN 0x80000000 11 12 #define MAXREPLYQS 256 12 13 13 14 //Command Status value ··· 320 319 BYTE ServerName[16]; 321 320 DWORD HeartBeat; 322 321 DWORD SCSI_Prefetch; 322 + DWORD MaxSGElements; 323 + DWORD MaxLogicalUnits; 324 + DWORD MaxPhysicalDrives; 325 + DWORD MaxPhysicalDrivesPerLogicalUnit; 323 326 } CfgTable_struct; 324 327 #pragma pack() 325 328 #endif // CCISS_CMD_H
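CCISS_SG_CHAIN lives in the Ext word of a descriptor, so a walker of a command's s/g list can recognize the single allowed chain hop. The controller follows the chain in hardware; the fragment below is only a hedged illustration of the layout (sg_total_bytes, the chain parameter and ndata are ours, and sg_desc merely mirrors the shape of SGDescriptor_struct):

#define SG_CHAIN 0x80000000u

struct sg_desc {                    /* shape of SGDescriptor_struct */
	unsigned int addr_lo, addr_hi;
	unsigned int len;
	unsigned int ext;           /* SG_CHAIN marks a chain pointer */
};

/* Sum the data bytes of a list holding at most one chain pointer;
 * ndata counts data descriptors only, not the chain entry. */
static unsigned int sg_total_bytes(const struct sg_desc *embedded,
                                   const struct sg_desc *chain,
                                   unsigned int ndata)
{
	const struct sg_desc *cur = embedded;
	unsigned int counted = 0, slot = 0, bytes = 0;

	while (counted < ndata) {
		if (cur[slot].ext & SG_CHAIN) {
			cur = chain;   /* the chain slot itself carries no data */
			slot = 0;
			continue;
		}
		bytes += cur[slot].len;
		counted++;
		slot++;
	}
	return bytes;
}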
+2 -2
drivers/block/cciss_scsi.c
··· 755 755 cp, 756 756 ei->ScsiStatus); 757 757 #endif 758 - cmd->result |= (ei->ScsiStatus < 1); 758 + cmd->result |= (ei->ScsiStatus << 1); 759 759 } 760 760 else { /* scsi status is zero??? How??? */ 761 761 ··· 1547 1547 if (sa->registered) { 1548 1548 printk("cciss%d: SCSI subsystem already engaged.\n", ctlr); 1549 1549 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); 1550 - return ENXIO; 1550 + return -ENXIO; 1551 1551 } 1552 1552 sa->registered = 1; 1553 1553 spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
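Both cciss_scsi.c hunks are one-character bugfixes. The first had a comparison where a shift was intended, so for any nonzero SCSI status the expression (ei->ScsiStatus < 1) evaluated to 0 and the status never reached the midlayer; the second makes the failure return negative, matching the kernel's negative-errno convention. A tiny demonstration of the first:

#include <stdio.h>

int main(void)
{
	unsigned char scsi_status = 0x02;  /* some nonzero SAM status */

	/* Old code: a comparison; 0 for every nonzero status. */
	printf("status <  1 : %#x\n", scsi_status < 1);    /* 0 */
	/* Fixed code: a shift; the status bits are preserved. */
	printf("status << 1 : %#x\n", scsi_status << 1);   /* 0x4 */
	return 0;
}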
+71
drivers/block/drbd/Kconfig
··· 1 + # 2 + # DRBD device driver configuration 3 + # 4 + 5 + comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected" 6 + depends on !PROC_FS || !INET || !CONNECTOR 7 + 8 + config BLK_DEV_DRBD 9 + tristate "DRBD Distributed Replicated Block Device support" 10 + depends on PROC_FS && INET && CONNECTOR 11 + select LRU_CACHE 12 + default n 13 + help 14 + 15 + NOTE: In order to authenticate connections you have to select 16 + CRYPTO_HMAC and a hash function as well. 17 + 18 + DRBD is a shared-nothing, synchronously replicated block device. It 19 + is designed to serve as a building block for high availability 20 + clusters and in this context, is a "drop-in" replacement for shared 21 + storage. Simplistically, you could see it as a network RAID 1. 22 + 23 + Each minor device has a role, which can be 'primary' or 'secondary'. 24 + On the node with the primary device the application is supposed to 25 + run and to access the device (/dev/drbdX). Every write is sent to 26 + the local 'lower level block device' and, across the network, to the 27 + node with the device in 'secondary' state. The secondary device 28 + simply writes the data to its lower level block device. 29 + 30 + DRBD can also be used in dual-Primary mode (device writable on both 31 + nodes), which means it can exhibit shared disk semantics in a 32 + shared-nothing cluster. Needless to say, running dual-Primary 33 + DRBD requires a cluster file system on top of it to maintain 34 + cache coherency. 35 + 36 + For automatic failover you need a cluster manager (e.g. heartbeat). 37 + See also: http://www.drbd.org/, http://www.linux-ha.org 38 + 39 + If unsure, say N. 40 + 41 + config DRBD_FAULT_INJECTION 42 + bool "DRBD fault injection" 43 + depends on BLK_DEV_DRBD 44 + help 45 + 46 + Say Y here if you want to simulate IO errors, in order to test DRBD's 47 + behavior. 48 + 49 + The actual simulation of IO errors is done by writing 3 values to 50 + /sys/module/drbd/parameters/ 51 + 52 + enable_faults: bitmask of... 53 + 1 meta data write 54 + 2 meta data read 55 + 4 resync data write 56 + 8 resync data read 57 + 16 data write 58 + 32 data read 59 + 64 read ahead 60 + 128 kmalloc of bitmap 61 + 256 allocation of EE (epoch_entries) 62 + 63 + fault_devs: bitmask of minor numbers 64 + fault_rate: frequency in percent 65 + 66 + Example: Simulate data write errors on /dev/drbd0 with a probability of 5%. 67 + echo 16 > /sys/module/drbd/parameters/enable_faults 68 + echo 1 > /sys/module/drbd/parameters/fault_devs 69 + echo 5 > /sys/module/drbd/parameters/fault_rate 70 + 71 + If unsure, say N.
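The three parameters combine into a simple gate that is evaluated at each potential fault point. A hedged user-space sketch of the logic the help text describes (fault_active and FAULT_DATA_WRITE are our names, not the driver's; in the driver this corresponds to the FAULT_ACTIVE() checks, and to the best of our reading fault_devs == 0 is treated as "all devices"):

#include <stdlib.h>
#include <stdio.h>

static unsigned int enable_faults;  /* bitmask of fault types */
static unsigned int fault_devs;     /* bitmask of minors; 0 = all */
static unsigned int fault_rate;     /* percentage of requests to fail */

#define FAULT_DATA_WRITE 16u        /* the "16 data write" row above */

static int fault_active(unsigned int minor, unsigned int type_mask)
{
	if (!fault_rate || !(enable_faults & type_mask))
		return 0;
	if (fault_devs && !(fault_devs & (1u << minor)))
		return 0;
	return (unsigned int)(rand() % 100) < fault_rate;
}

int main(void)
{
	int i, hits = 0;

	/* The Kconfig example: data writes on minor 0, 5% of the time. */
	enable_faults = FAULT_DATA_WRITE;
	fault_devs = 1;
	fault_rate = 5;
	for (i = 0; i < 100000; i++)
		hits += fault_active(0, FAULT_DATA_WRITE);
	printf("injected %d faults in 100000 writes (~5%%)\n", hits);
	return 0;
}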
+5
drivers/block/drbd/Makefile
··· 1 + drbd-y := drbd_bitmap.o drbd_proc.o 2 + drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o 3 + drbd-y += drbd_main.o drbd_strings.o drbd_nl.o 4 + 5 + obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
+1424
drivers/block/drbd/drbd_actlog.c
··· 1 + /* 2 + drbd_actlog.c 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + 24 + */ 25 + 26 + #include <linux/slab.h> 27 + #include <linux/drbd.h> 28 + #include "drbd_int.h" 29 + #include "drbd_wrappers.h" 30 + 31 + /* We maintain a trivial checksum in our on disk activity log. 32 + * With that we can ensure correct operation even when the storage 33 + * device might do a partial (last) sector write while losing power. 34 + */ 35 + struct __packed al_transaction { 36 + u32 magic; 37 + u32 tr_number; 38 + struct __packed { 39 + u32 pos; 40 + u32 extent; } updates[1 + AL_EXTENTS_PT]; 41 + u32 xor_sum; 42 + }; 43 + 44 + struct update_odbm_work { 45 + struct drbd_work w; 46 + unsigned int enr; 47 + }; 48 + 49 + struct update_al_work { 50 + struct drbd_work w; 51 + struct lc_element *al_ext; 52 + struct completion event; 53 + unsigned int enr; 54 + /* if old_enr != LC_FREE, write corresponding bitmap sector, too */ 55 + unsigned int old_enr; 56 + }; 57 + 58 + struct drbd_atodb_wait { 59 + atomic_t count; 60 + struct completion io_done; 61 + struct drbd_conf *mdev; 62 + int error; 63 + }; 64 + 65 + 66 + int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); 67 + 68 + static int _drbd_md_sync_page_io(struct drbd_conf *mdev, 69 + struct drbd_backing_dev *bdev, 70 + struct page *page, sector_t sector, 71 + int rw, int size) 72 + { 73 + struct bio *bio; 74 + struct drbd_md_io md_io; 75 + int ok; 76 + 77 + md_io.mdev = mdev; 78 + init_completion(&md_io.event); 79 + md_io.error = 0; 80 + 81 + if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) 82 + rw |= (1 << BIO_RW_BARRIER); 83 + rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO)); 84 + 85 + retry: 86 + bio = bio_alloc(GFP_NOIO, 1); 87 + bio->bi_bdev = bdev->md_bdev; 88 + bio->bi_sector = sector; 89 + ok = (bio_add_page(bio, page, size, 0) == size); 90 + if (!ok) 91 + goto out; 92 + bio->bi_private = &md_io; 93 + bio->bi_end_io = drbd_md_io_complete; 94 + bio->bi_rw = rw; 95 + 96 + if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) 97 + bio_endio(bio, -EIO); 98 + else 99 + submit_bio(rw, bio); 100 + wait_for_completion(&md_io.event); 101 + ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; 102 + 103 + /* check for unsupported barrier op. 104 + * would rather check on EOPNOTSUPP, but that is not reliable.
105 + * don't try again for ANY return value != 0 */ 106 + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) { 107 + /* Try again with no barrier */ 108 + dev_warn(DEV, "Barriers not supported on meta data device - disabling\n"); 109 + set_bit(MD_NO_BARRIER, &mdev->flags); 110 + rw &= ~(1 << BIO_RW_BARRIER); 111 + bio_put(bio); 112 + goto retry; 113 + } 114 + out: 115 + bio_put(bio); 116 + return ok; 117 + } 118 + 119 + int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, 120 + sector_t sector, int rw) 121 + { 122 + int logical_block_size, mask, ok; 123 + int offset = 0; 124 + struct page *iop = mdev->md_io_page; 125 + 126 + D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); 127 + 128 + BUG_ON(!bdev->md_bdev); 129 + 130 + logical_block_size = bdev_logical_block_size(bdev->md_bdev); 131 + if (logical_block_size == 0) 132 + logical_block_size = MD_SECTOR_SIZE; 133 + 134 + /* in case logical_block_size != 512 [ s390 only? ] */ 135 + if (logical_block_size != MD_SECTOR_SIZE) { 136 + mask = (logical_block_size / MD_SECTOR_SIZE) - 1; 137 + D_ASSERT(mask == 1 || mask == 3 || mask == 7); 138 + D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE); 139 + offset = sector & mask; 140 + sector = sector & ~mask; 141 + iop = mdev->md_io_tmpp; 142 + 143 + if (rw & WRITE) { 144 + /* these are GFP_KERNEL pages, pre-allocated 145 + * on device initialization */ 146 + void *p = page_address(mdev->md_io_page); 147 + void *hp = page_address(mdev->md_io_tmpp); 148 + 149 + ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, 150 + READ, logical_block_size); 151 + 152 + if (unlikely(!ok)) { 153 + dev_err(DEV, "drbd_md_sync_page_io(,%llus," 154 + "READ [logical_block_size!=512]) failed!\n", 155 + (unsigned long long)sector); 156 + return 0; 157 + } 158 + 159 + memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE); 160 + } 161 + } 162 + 163 + if (sector < drbd_md_first_sector(bdev) || 164 + sector > drbd_md_last_sector(bdev)) 165 + dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", 166 + current->comm, current->pid, __func__, 167 + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); 168 + 169 + ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size); 170 + if (unlikely(!ok)) { 171 + dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n", 172 + (unsigned long long)sector, (rw & WRITE) ? 
"WRITE" : "READ"); 173 + return 0; 174 + } 175 + 176 + if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) { 177 + void *p = page_address(mdev->md_io_page); 178 + void *hp = page_address(mdev->md_io_tmpp); 179 + 180 + memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE); 181 + } 182 + 183 + return ok; 184 + } 185 + 186 + static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) 187 + { 188 + struct lc_element *al_ext; 189 + struct lc_element *tmp; 190 + unsigned long al_flags = 0; 191 + 192 + spin_lock_irq(&mdev->al_lock); 193 + tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); 194 + if (unlikely(tmp != NULL)) { 195 + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 196 + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { 197 + spin_unlock_irq(&mdev->al_lock); 198 + return NULL; 199 + } 200 + } 201 + al_ext = lc_get(mdev->act_log, enr); 202 + al_flags = mdev->act_log->flags; 203 + spin_unlock_irq(&mdev->al_lock); 204 + 205 + /* 206 + if (!al_ext) { 207 + if (al_flags & LC_STARVING) 208 + dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n"); 209 + if (al_flags & LC_DIRTY) 210 + dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n"); 211 + } 212 + */ 213 + 214 + return al_ext; 215 + } 216 + 217 + void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) 218 + { 219 + unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); 220 + struct lc_element *al_ext; 221 + struct update_al_work al_work; 222 + 223 + D_ASSERT(atomic_read(&mdev->local_cnt) > 0); 224 + 225 + wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); 226 + 227 + if (al_ext->lc_number != enr) { 228 + /* drbd_al_write_transaction(mdev,al_ext,enr); 229 + * recurses into generic_make_request(), which 230 + * disallows recursion, bios being serialized on the 231 + * current->bio_tail list now. 232 + * we have to delegate updates to the activity log 233 + * to the worker thread. 
*/ 234 + init_completion(&al_work.event); 235 + al_work.al_ext = al_ext; 236 + al_work.enr = enr; 237 + al_work.old_enr = al_ext->lc_number; 238 + al_work.w.cb = w_al_write_transaction; 239 + drbd_queue_work_front(&mdev->data.work, &al_work.w); 240 + wait_for_completion(&al_work.event); 241 + 242 + mdev->al_writ_cnt++; 243 + 244 + spin_lock_irq(&mdev->al_lock); 245 + lc_changed(mdev->act_log, al_ext); 246 + spin_unlock_irq(&mdev->al_lock); 247 + wake_up(&mdev->al_wait); 248 + } 249 + } 250 + 251 + void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) 252 + { 253 + unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); 254 + struct lc_element *extent; 255 + unsigned long flags; 256 + 257 + spin_lock_irqsave(&mdev->al_lock, flags); 258 + 259 + extent = lc_find(mdev->act_log, enr); 260 + 261 + if (!extent) { 262 + spin_unlock_irqrestore(&mdev->al_lock, flags); 263 + dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); 264 + return; 265 + } 266 + 267 + if (lc_put(mdev->act_log, extent) == 0) 268 + wake_up(&mdev->al_wait); 269 + 270 + spin_unlock_irqrestore(&mdev->al_lock, flags); 271 + } 272 + 273 + int 274 + w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) 275 + { 276 + struct update_al_work *aw = container_of(w, struct update_al_work, w); 277 + struct lc_element *updated = aw->al_ext; 278 + const unsigned int new_enr = aw->enr; 279 + const unsigned int evicted = aw->old_enr; 280 + struct al_transaction *buffer; 281 + sector_t sector; 282 + int i, n, mx; 283 + unsigned int extent_nr; 284 + u32 xor_sum = 0; 285 + 286 + if (!get_ldev(mdev)) { 287 + dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n"); 288 + complete(&((struct update_al_work *)w)->event); 289 + return 1; 290 + } 291 + /* do we have to do a bitmap write, first? 292 + * TODO reduce maximum latency: 293 + * submit both bios, then wait for both, 294 + * instead of doing two synchronous sector writes. */ 295 + if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) 296 + drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); 297 + 298 + mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... 
*/ 299 + buffer = (struct al_transaction *)page_address(mdev->md_io_page); 300 + 301 + buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); 302 + buffer->tr_number = cpu_to_be32(mdev->al_tr_number); 303 + 304 + n = lc_index_of(mdev->act_log, updated); 305 + 306 + buffer->updates[0].pos = cpu_to_be32(n); 307 + buffer->updates[0].extent = cpu_to_be32(new_enr); 308 + 309 + xor_sum ^= new_enr; 310 + 311 + mx = min_t(int, AL_EXTENTS_PT, 312 + mdev->act_log->nr_elements - mdev->al_tr_cycle); 313 + for (i = 0; i < mx; i++) { 314 + unsigned idx = mdev->al_tr_cycle + i; 315 + extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; 316 + buffer->updates[i+1].pos = cpu_to_be32(idx); 317 + buffer->updates[i+1].extent = cpu_to_be32(extent_nr); 318 + xor_sum ^= extent_nr; 319 + } 320 + for (; i < AL_EXTENTS_PT; i++) { 321 + buffer->updates[i+1].pos = __constant_cpu_to_be32(-1); 322 + buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); 323 + xor_sum ^= LC_FREE; 324 + } 325 + mdev->al_tr_cycle += AL_EXTENTS_PT; 326 + if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) 327 + mdev->al_tr_cycle = 0; 328 + 329 + buffer->xor_sum = cpu_to_be32(xor_sum); 330 + 331 + sector = mdev->ldev->md.md_offset 332 + + mdev->ldev->md.al_offset + mdev->al_tr_pos; 333 + 334 + if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) 335 + drbd_chk_io_error(mdev, 1, TRUE); 336 + 337 + if (++mdev->al_tr_pos > 338 + div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) 339 + mdev->al_tr_pos = 0; 340 + 341 + D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); 342 + mdev->al_tr_number++; 343 + 344 + mutex_unlock(&mdev->md_io_mutex); 345 + 346 + complete(&((struct update_al_work *)w)->event); 347 + put_ldev(mdev); 348 + 349 + return 1; 350 + } 351 + 352 + /** 353 + * drbd_al_read_tr() - Read a single transaction from the on disk activity log 354 + * @mdev: DRBD device. 355 + * @bdev: Block device to read from. 356 + * @b: pointer to an al_transaction. 357 + * @index: On disk slot of the transaction to read. 358 + * 359 + * Returns -1 on IO error, 0 on checksum error and 1 upon success. 360 + */ 361 + static int drbd_al_read_tr(struct drbd_conf *mdev, 362 + struct drbd_backing_dev *bdev, 363 + struct al_transaction *b, 364 + int index) 365 + { 366 + sector_t sector; 367 + int rv, i; 368 + u32 xor_sum = 0; 369 + 370 + sector = bdev->md.md_offset + bdev->md.al_offset + index; 371 + 372 + /* Don't process errors normally, 373 + * as this is done before disk is attached! */ 374 + if (!drbd_md_sync_page_io(mdev, bdev, sector, READ)) 375 + return -1; 376 + 377 + rv = (be32_to_cpu(b->magic) == DRBD_MAGIC); 378 + 379 + for (i = 0; i < AL_EXTENTS_PT + 1; i++) 380 + xor_sum ^= be32_to_cpu(b->updates[i].extent); 381 + rv &= (xor_sum == be32_to_cpu(b->xor_sum)); 382 + 383 + return rv; 384 + } 385 + 386 + /** 387 + * drbd_al_read_log() - Restores the activity log from its on disk representation. 388 + * @mdev: DRBD device. 389 + * @bdev: Block device to read from. 390 + * 391 + * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
392 + */ 393 + int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) 394 + { 395 + struct al_transaction *buffer; 396 + int i; 397 + int rv; 398 + int mx; 399 + int active_extents = 0; 400 + int transactions = 0; 401 + int found_valid = 0; 402 + int from = 0; 403 + int to = 0; 404 + u32 from_tnr = 0; 405 + u32 to_tnr = 0; 406 + u32 cnr; 407 + 408 + mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT); 409 + 410 + /* lock out all other meta data io for now, 411 + * and make sure the page is mapped. 412 + */ 413 + mutex_lock(&mdev->md_io_mutex); 414 + buffer = page_address(mdev->md_io_page); 415 + 416 + /* Find the valid transaction in the log */ 417 + for (i = 0; i <= mx; i++) { 418 + rv = drbd_al_read_tr(mdev, bdev, buffer, i); 419 + if (rv == 0) 420 + continue; 421 + if (rv == -1) { 422 + mutex_unlock(&mdev->md_io_mutex); 423 + return 0; 424 + } 425 + cnr = be32_to_cpu(buffer->tr_number); 426 + 427 + if (++found_valid == 1) { 428 + from = i; 429 + to = i; 430 + from_tnr = cnr; 431 + to_tnr = cnr; 432 + continue; 433 + } 434 + if ((int)cnr - (int)from_tnr < 0) { 435 + D_ASSERT(from_tnr - cnr + i - from == mx+1); 436 + from = i; 437 + from_tnr = cnr; 438 + } 439 + if ((int)cnr - (int)to_tnr > 0) { 440 + D_ASSERT(cnr - to_tnr == i - to); 441 + to = i; 442 + to_tnr = cnr; 443 + } 444 + } 445 + 446 + if (!found_valid) { 447 + dev_warn(DEV, "No usable activity log found.\n"); 448 + mutex_unlock(&mdev->md_io_mutex); 449 + return 1; 450 + } 451 + 452 + /* Read the valid transactions. 453 + * dev_info(DEV, "Reading from %d to %d.\n",from,to); */ 454 + i = from; 455 + while (1) { 456 + int j, pos; 457 + unsigned int extent_nr; 458 + unsigned int trn; 459 + 460 + rv = drbd_al_read_tr(mdev, bdev, buffer, i); 461 + ERR_IF(rv == 0) goto cancel; 462 + if (rv == -1) { 463 + mutex_unlock(&mdev->md_io_mutex); 464 + return 0; 465 + } 466 + 467 + trn = be32_to_cpu(buffer->tr_number); 468 + 469 + spin_lock_irq(&mdev->al_lock); 470 + 471 + /* This loop runs backwards because in the cyclic 472 + elements there might be an old version of the 473 + updated element (in slot 0). So the element in slot 0 474 + can overwrite old versions. */ 475 + for (j = AL_EXTENTS_PT; j >= 0; j--) { 476 + pos = be32_to_cpu(buffer->updates[j].pos); 477 + extent_nr = be32_to_cpu(buffer->updates[j].extent); 478 + 479 + if (extent_nr == LC_FREE) 480 + continue; 481 + 482 + lc_set(mdev->act_log, extent_nr, pos); 483 + active_extents++; 484 + } 485 + spin_unlock_irq(&mdev->al_lock); 486 + 487 + transactions++; 488 + 489 + cancel: 490 + if (i == to) 491 + break; 492 + i++; 493 + if (i > mx) 494 + i = 0; 495 + } 496 + 497 + mdev->al_tr_number = to_tnr+1; 498 + mdev->al_tr_pos = to; 499 + if (++mdev->al_tr_pos > 500 + div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) 501 + mdev->al_tr_pos = 0; 502 + 503 + /* ok, we are done with it */ 504 + mutex_unlock(&mdev->md_io_mutex); 505 + 506 + dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", 507 + transactions, active_extents); 508 + 509 + return 1; 510 + } 511 + 512 + static void atodb_endio(struct bio *bio, int error) 513 + { 514 + struct drbd_atodb_wait *wc = bio->bi_private; 515 + struct drbd_conf *mdev = wc->mdev; 516 + struct page *page; 517 + int uptodate = bio_flagged(bio, BIO_UPTODATE); 518 + 519 + /* strange behavior of some lower level drivers... 520 + * fail the request by clearing the uptodate flag, 521 + * but do not return any error?! 
*/ 522 + if (!error && !uptodate) 523 + error = -EIO; 524 + 525 + drbd_chk_io_error(mdev, error, TRUE); 526 + if (error && wc->error == 0) 527 + wc->error = error; 528 + 529 + if (atomic_dec_and_test(&wc->count)) 530 + complete(&wc->io_done); 531 + 532 + page = bio->bi_io_vec[0].bv_page; 533 + put_page(page); 534 + bio_put(bio); 535 + mdev->bm_writ_cnt++; 536 + put_ldev(mdev); 537 + } 538 + 539 + #define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) 540 + /* activity log to on disk bitmap -- prepare bio unless that sector 541 + * is already covered by previously prepared bios */ 542 + static int atodb_prepare_unless_covered(struct drbd_conf *mdev, 543 + struct bio **bios, 544 + unsigned int enr, 545 + struct drbd_atodb_wait *wc) __must_hold(local) 546 + { 547 + struct bio *bio; 548 + struct page *page; 549 + sector_t on_disk_sector = enr + mdev->ldev->md.md_offset 550 + + mdev->ldev->md.bm_offset; 551 + unsigned int page_offset = PAGE_SIZE; 552 + int offset; 553 + int i = 0; 554 + int err = -ENOMEM; 555 + 556 + /* Check if that enr is already covered by an already created bio. 557 + * Caution, bios[] is not NULL terminated, 558 + * but only initialized to all NULL. 559 + * For completely scattered activity log, 560 + * the last invocation iterates over all bios, 561 + * and finds the last NULL entry. 562 + */ 563 + while ((bio = bios[i])) { 564 + if (bio->bi_sector == on_disk_sector) 565 + return 0; 566 + i++; 567 + } 568 + /* bios[i] == NULL, the next not yet used slot */ 569 + 570 + /* GFP_KERNEL, we are not in the write-out path */ 571 + bio = bio_alloc(GFP_KERNEL, 1); 572 + if (bio == NULL) 573 + return -ENOMEM; 574 + 575 + if (i > 0) { 576 + const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec; 577 + page_offset = prev_bv->bv_offset + prev_bv->bv_len; 578 + page = prev_bv->bv_page; 579 + } 580 + if (page_offset == PAGE_SIZE) { 581 + page = alloc_page(__GFP_HIGHMEM); 582 + if (page == NULL) 583 + goto out_bio_put; 584 + page_offset = 0; 585 + } else { 586 + get_page(page); 587 + } 588 + 589 + offset = S2W(enr); 590 + drbd_bm_get_lel(mdev, offset, 591 + min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset), 592 + kmap(page) + page_offset); 593 + kunmap(page); 594 + 595 + bio->bi_private = wc; 596 + bio->bi_end_io = atodb_endio; 597 + bio->bi_bdev = mdev->ldev->md_bdev; 598 + bio->bi_sector = on_disk_sector; 599 + 600 + if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE) 601 + goto out_put_page; 602 + 603 + atomic_inc(&wc->count); 604 + /* we already know that we may do this... 605 + * get_ldev_if_state(mdev,D_ATTACHING); 606 + * just get the extra reference, so that the local_cnt reflects 607 + * the number of pending IO requests DRBD has at its backing device. 608 + */ 609 + atomic_inc(&mdev->local_cnt); 610 + 611 + bios[i] = bio; 612 + 613 + return 0; 614 + 615 + out_put_page: 616 + err = -EINVAL; 617 + put_page(page); 618 + out_bio_put: 619 + bio_put(bio); 620 + return err; 621 + } 622 + 623 + /** 624 + * drbd_al_to_on_disk_bm() - Writes bitmap parts covered by active AL extents 625 + * @mdev: DRBD device. 626 + * 627 + * Called when we detach (unconfigure) local storage, 628 + * or when we go from R_PRIMARY to R_SECONDARY role. 629 + */ 630 + void drbd_al_to_on_disk_bm(struct drbd_conf *mdev) 631 + { 632 + int i, nr_elements; 633 + unsigned int enr; 634 + struct bio **bios; 635 + struct drbd_atodb_wait wc; 636 + 637 + ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING)) 638 + return; /* sorry, I don't have any act_log etc...
*/ 639 + 640 + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); 641 + 642 + nr_elements = mdev->act_log->nr_elements; 643 + 644 + /* GFP_KERNEL, we are not in anyone's write-out path */ 645 + bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL); 646 + if (!bios) 647 + goto submit_one_by_one; 648 + 649 + atomic_set(&wc.count, 0); 650 + init_completion(&wc.io_done); 651 + wc.mdev = mdev; 652 + wc.error = 0; 653 + 654 + for (i = 0; i < nr_elements; i++) { 655 + enr = lc_element_by_index(mdev->act_log, i)->lc_number; 656 + if (enr == LC_FREE) 657 + continue; 658 + /* next statement also does atomic_inc wc.count and local_cnt */ 659 + if (atodb_prepare_unless_covered(mdev, bios, 660 + enr/AL_EXT_PER_BM_SECT, 661 + &wc)) 662 + goto free_bios_submit_one_by_one; 663 + } 664 + 665 + /* unnecessary optimization? */ 666 + lc_unlock(mdev->act_log); 667 + wake_up(&mdev->al_wait); 668 + 669 + /* all prepared, submit them */ 670 + for (i = 0; i < nr_elements; i++) { 671 + if (bios[i] == NULL) 672 + break; 673 + if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) { 674 + bios[i]->bi_rw = WRITE; 675 + bio_endio(bios[i], -EIO); 676 + } else { 677 + submit_bio(WRITE, bios[i]); 678 + } 679 + } 680 + 681 + drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); 682 + 683 + /* always (try to) flush bitmap to stable storage */ 684 + drbd_md_flush(mdev); 685 + 686 + /* In case we did not submit a single IO, do not wait for 687 + * them to complete. (Because we would wait forever here.) 688 + * 689 + * In case we had IOs and they are already complete, there 690 + * is no point in waiting anyway. 691 + * Therefore this if () ... */ 692 + if (atomic_read(&wc.count)) 693 + wait_for_completion(&wc.io_done); 694 + 695 + put_ldev(mdev); 696 + 697 + kfree(bios); 698 + return; 699 + 700 + free_bios_submit_one_by_one: 701 + /* free everything by calling the endio callback directly. */ 702 + for (i = 0; i < nr_elements && bios[i]; i++) 703 + bio_endio(bios[i], 0); 704 + 705 + kfree(bios); 706 + 707 + submit_one_by_one: 708 + dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n"); 709 + 710 + for (i = 0; i < mdev->act_log->nr_elements; i++) { 711 + enr = lc_element_by_index(mdev->act_log, i)->lc_number; 712 + if (enr == LC_FREE) 713 + continue; 714 + /* Really slow: if we have al-extents 16..19 active, 715 + * sector 4 will be written four times! Synchronous! */ 716 + drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT); 717 + } 718 + 719 + lc_unlock(mdev->act_log); 720 + wake_up(&mdev->al_wait); 721 + put_ldev(mdev); 722 + } 723 + 724 + /** 725 + * drbd_al_apply_to_bm() - Sets the bitmap to dirty (1) where covered by active AL extents 726 + * @mdev: DRBD device.
727 + */ 728 + void drbd_al_apply_to_bm(struct drbd_conf *mdev) 729 + { 730 + unsigned int enr; 731 + unsigned long add = 0; 732 + char ppb[10]; 733 + int i; 734 + 735 + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); 736 + 737 + for (i = 0; i < mdev->act_log->nr_elements; i++) { 738 + enr = lc_element_by_index(mdev->act_log, i)->lc_number; 739 + if (enr == LC_FREE) 740 + continue; 741 + add += drbd_bm_ALe_set_all(mdev, enr); 742 + } 743 + 744 + lc_unlock(mdev->act_log); 745 + wake_up(&mdev->al_wait); 746 + 747 + dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n", 748 + ppsize(ppb, Bit2KB(add))); 749 + } 750 + 751 + static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) 752 + { 753 + int rv; 754 + 755 + spin_lock_irq(&mdev->al_lock); 756 + rv = (al_ext->refcnt == 0); 757 + if (likely(rv)) 758 + lc_del(mdev->act_log, al_ext); 759 + spin_unlock_irq(&mdev->al_lock); 760 + 761 + return rv; 762 + } 763 + 764 + /** 765 + * drbd_al_shrink() - Removes all active extents from the activity log 766 + * @mdev: DRBD device. 767 + * 768 + * Removes all active extents from the activity log, waiting until 769 + * the reference count of each entry has dropped to 0 first, of course. 770 + * 771 + * You need to lock mdev->act_log with lc_try_lock() / lc_unlock() 772 + */ 773 + void drbd_al_shrink(struct drbd_conf *mdev) 774 + { 775 + struct lc_element *al_ext; 776 + int i; 777 + 778 + D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags)); 779 + 780 + for (i = 0; i < mdev->act_log->nr_elements; i++) { 781 + al_ext = lc_element_by_index(mdev->act_log, i); 782 + if (al_ext->lc_number == LC_FREE) 783 + continue; 784 + wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext)); 785 + } 786 + 787 + wake_up(&mdev->al_wait); 788 + } 789 + 790 + static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused) 791 + { 792 + struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); 793 + 794 + if (!get_ldev(mdev)) { 795 + if (__ratelimit(&drbd_ratelimit_state)) 796 + dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); 797 + kfree(udw); 798 + return 1; 799 + } 800 + 801 + drbd_bm_write_sect(mdev, udw->enr); 802 + put_ldev(mdev); 803 + 804 + kfree(udw); 805 + 806 + if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) { 807 + switch (mdev->state.conn) { 808 + case C_SYNC_SOURCE: case C_SYNC_TARGET: 809 + case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T: 810 + drbd_resync_finished(mdev); 811 + default: 812 + /* nothing to do */ 813 + break; 814 + } 815 + } 816 + drbd_bcast_sync_progress(mdev); 817 + 818 + return 1; 819 + } 820 + 821 + 822 + /* ATTENTION. The AL's extents are 4MB each, while the extents in the 823 + * resync LRU-cache are 16MB each. 824 + * The caller of this function has to hold a get_ldev() reference. 825 + * 826 + * TODO will be obsoleted once we have a caching lru of the on disk bitmap 827 + */ 828 + static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, 829 + int count, int success) 830 + { 831 + struct lc_element *e; 832 + struct update_odbm_work *udw; 833 + 834 + unsigned int enr; 835 + 836 + D_ASSERT(atomic_read(&mdev->local_cnt)); 837 + 838 + /* I simply assume that a sector/size pair never crosses 839 + * a 16 MB extent border. (Currently this is true...)
*/ 840 + enr = BM_SECT_TO_EXT(sector); 841 + 842 + e = lc_get(mdev->resync, enr); 843 + if (e) { 844 + struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); 845 + if (ext->lce.lc_number == enr) { 846 + if (success) 847 + ext->rs_left -= count; 848 + else 849 + ext->rs_failed += count; 850 + if (ext->rs_left < ext->rs_failed) { 851 + dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " 852 + "rs_failed=%d count=%d\n", 853 + (unsigned long long)sector, 854 + ext->lce.lc_number, ext->rs_left, 855 + ext->rs_failed, count); 856 + dump_stack(); 857 + 858 + lc_put(mdev->resync, &ext->lce); 859 + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 860 + return; 861 + } 862 + } else { 863 + /* Normally this element should be in the cache, 864 + * since drbd_rs_begin_io() pulled it already in. 865 + * 866 + * But maybe an application write finished, and we set 867 + * something outside the resync lru_cache in sync. 868 + */ 869 + int rs_left = drbd_bm_e_weight(mdev, enr); 870 + if (ext->flags != 0) { 871 + dev_warn(DEV, "changing resync lce: %d[%u;%02lx]" 872 + " -> %d[%u;00]\n", 873 + ext->lce.lc_number, ext->rs_left, 874 + ext->flags, enr, rs_left); 875 + ext->flags = 0; 876 + } 877 + if (ext->rs_failed) { 878 + dev_warn(DEV, "Kicking resync_lru element enr=%u " 879 + "out with rs_failed=%d\n", 880 + ext->lce.lc_number, ext->rs_failed); 881 + set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); 882 + } 883 + ext->rs_left = rs_left; 884 + ext->rs_failed = success ? 0 : count; 885 + lc_changed(mdev->resync, &ext->lce); 886 + } 887 + lc_put(mdev->resync, &ext->lce); 888 + /* no race, we are within the al_lock! */ 889 + 890 + if (ext->rs_left == ext->rs_failed) { 891 + ext->rs_failed = 0; 892 + 893 + udw = kmalloc(sizeof(*udw), GFP_ATOMIC); 894 + if (udw) { 895 + udw->enr = ext->lce.lc_number; 896 + udw->w.cb = w_update_odbm; 897 + drbd_queue_work_front(&mdev->data.work, &udw->w); 898 + } else { 899 + dev_warn(DEV, "Could not kmalloc an udw\n"); 900 + set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); 901 + } 902 + } 903 + } else { 904 + dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n", 905 + mdev->resync_locked, 906 + mdev->resync->nr_elements, 907 + mdev->resync->flags); 908 + } 909 + } 910 + 911 + /* clear the bit corresponding to the piece of storage in question: 912 + * size bytes of data starting from sector. Only clear bits of the affected 913 + * one or more _aligned_ BM_BLOCK_SIZE blocks. 914 + * 915 + * called by worker on C_SYNC_TARGET and receiver on SyncSource. 916 + * 917 + */ 918 + void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, 919 + const char *file, const unsigned int line) 920 + { 921 + /* Is called from worker and receiver context _only_ */ 922 + unsigned long sbnr, ebnr, lbnr; 923 + unsigned long count = 0; 924 + sector_t esector, nr_sectors; 925 + int wake_up = 0; 926 + unsigned long flags; 927 + 928 + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 929 + dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", 930 + (unsigned long long)sector, size); 931 + return; 932 + } 933 + nr_sectors = drbd_get_capacity(mdev->this_bdev); 934 + esector = sector + (size >> 9) - 1; 935 + 936 + ERR_IF(sector >= nr_sectors) return; 937 + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); 938 + 939 + lbnr = BM_SECT_TO_BIT(nr_sectors-1); 940 + 941 + /* we clear it (in sync). 942 + * round up start sector, round down end sector.
we make sure we only 943 + * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ 944 + if (unlikely(esector < BM_SECT_PER_BIT-1)) 945 + return; 946 + if (unlikely(esector == (nr_sectors-1))) 947 + ebnr = lbnr; 948 + else 949 + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); 950 + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); 951 + 952 + if (sbnr > ebnr) 953 + return; 954 + 955 + /* 956 + * ok, (capacity & 7) != 0 sometimes, but who cares... 957 + * we count rs_{total,left} in bits, not sectors. 958 + */ 959 + spin_lock_irqsave(&mdev->al_lock, flags); 960 + count = drbd_bm_clear_bits(mdev, sbnr, ebnr); 961 + if (count) { 962 + /* we need the lock for drbd_try_clear_on_disk_bm */ 963 + if (jiffies - mdev->rs_mark_time > HZ*10) { 964 + /* should be rolling marks, 965 + * but we estimate only anyway. */ 966 + if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && 967 + mdev->state.conn != C_PAUSED_SYNC_T && 968 + mdev->state.conn != C_PAUSED_SYNC_S) { 969 + mdev->rs_mark_time = jiffies; 970 + mdev->rs_mark_left = drbd_bm_total_weight(mdev); 971 + } 972 + } 973 + if (get_ldev(mdev)) { 974 + drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); 975 + put_ldev(mdev); 976 + } 977 + /* just wake_up unconditionally now; various lc_changed(), 978 + * lc_put() in drbd_try_clear_on_disk_bm(). */ 979 + wake_up = 1; 980 + } 981 + spin_unlock_irqrestore(&mdev->al_lock, flags); 982 + if (wake_up) 983 + wake_up(&mdev->al_wait); 984 + } 985 + 986 + /* 987 + * this is intended to set one request worth of data out of sync. 988 + * affects at least 1 bit, 989 + * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. 990 + * 991 + * called by tl_clear and drbd_send_dblock (==drbd_make_request). 992 + * so this can be _any_ process. 993 + */ 994 + void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, 995 + const char *file, const unsigned int line) 996 + { 997 + unsigned long sbnr, ebnr, lbnr, flags; 998 + sector_t esector, nr_sectors; 999 + unsigned int enr, count; 1000 + struct lc_element *e; 1001 + 1002 + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 1003 + dev_err(DEV, "sector: %llus, size: %d\n", 1004 + (unsigned long long)sector, size); 1005 + return; 1006 + } 1007 + 1008 + if (!get_ldev(mdev)) 1009 + return; /* no disk, no metadata, no bitmap to set bits in */ 1010 + 1011 + nr_sectors = drbd_get_capacity(mdev->this_bdev); 1012 + esector = sector + (size >> 9) - 1; 1013 + 1014 + ERR_IF(sector >= nr_sectors) 1015 + goto out; 1016 + ERR_IF(esector >= nr_sectors) 1017 + esector = (nr_sectors-1); 1018 + 1019 + lbnr = BM_SECT_TO_BIT(nr_sectors-1); 1020 + 1021 + /* we set it out of sync, 1022 + * we do not need to round anything here */ 1023 + sbnr = BM_SECT_TO_BIT(sector); 1024 + ebnr = BM_SECT_TO_BIT(esector); 1025 + 1026 + /* ok, (capacity & 7) != 0 sometimes, but who cares... 1027 + * we count rs_{total,left} in bits, not sectors.
*/ 1028 + spin_lock_irqsave(&mdev->al_lock, flags); 1029 + count = drbd_bm_set_bits(mdev, sbnr, ebnr); 1030 + 1031 + enr = BM_SECT_TO_EXT(sector); 1032 + e = lc_find(mdev->resync, enr); 1033 + if (e) 1034 + lc_entry(e, struct bm_extent, lce)->rs_left += count; 1035 + spin_unlock_irqrestore(&mdev->al_lock, flags); 1036 + 1037 + out: 1038 + put_ldev(mdev); 1039 + } 1040 + 1041 + static 1042 + struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) 1043 + { 1044 + struct lc_element *e; 1045 + struct bm_extent *bm_ext; 1046 + int wakeup = 0; 1047 + unsigned long rs_flags; 1048 + 1049 + spin_lock_irq(&mdev->al_lock); 1050 + if (mdev->resync_locked > mdev->resync->nr_elements/2) { 1051 + spin_unlock_irq(&mdev->al_lock); 1052 + return NULL; 1053 + } 1054 + e = lc_get(mdev->resync, enr); 1055 + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1056 + if (bm_ext) { 1057 + if (bm_ext->lce.lc_number != enr) { 1058 + bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); 1059 + bm_ext->rs_failed = 0; 1060 + lc_changed(mdev->resync, &bm_ext->lce); 1061 + wakeup = 1; 1062 + } 1063 + if (bm_ext->lce.refcnt == 1) 1064 + mdev->resync_locked++; 1065 + set_bit(BME_NO_WRITES, &bm_ext->flags); 1066 + } 1067 + rs_flags = mdev->resync->flags; 1068 + spin_unlock_irq(&mdev->al_lock); 1069 + if (wakeup) 1070 + wake_up(&mdev->al_wait); 1071 + 1072 + if (!bm_ext) { 1073 + if (rs_flags & LC_STARVING) 1074 + dev_warn(DEV, "Have to wait for element" 1075 + " (resync LRU too small?)\n"); 1076 + BUG_ON(rs_flags & LC_DIRTY); 1077 + } 1078 + 1079 + return bm_ext; 1080 + } 1081 + 1082 + static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) 1083 + { 1084 + struct lc_element *al_ext; 1085 + int rv = 0; 1086 + 1087 + spin_lock_irq(&mdev->al_lock); 1088 + if (unlikely(enr == mdev->act_log->new_number)) 1089 + rv = 1; 1090 + else { 1091 + al_ext = lc_find(mdev->act_log, enr); 1092 + if (al_ext) { 1093 + if (al_ext->refcnt) 1094 + rv = 1; 1095 + } 1096 + } 1097 + spin_unlock_irq(&mdev->al_lock); 1098 + 1099 + /* 1100 + if (unlikely(rv)) { 1101 + dev_info(DEV, "Delaying sync read until app's write is done\n"); 1102 + } 1103 + */ 1104 + return rv; 1105 + } 1106 + 1107 + /** 1108 + * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED 1109 + * @mdev: DRBD device. 1110 + * @sector: The sector number. 1111 + * 1112 + * This function sleeps on al_wait. Returns 1 on success, 0 if interrupted. 1113 + */ 1114 + int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) 1115 + { 1116 + unsigned int enr = BM_SECT_TO_EXT(sector); 1117 + struct bm_extent *bm_ext; 1118 + int i, sig; 1119 + 1120 + sig = wait_event_interruptible(mdev->al_wait, 1121 + (bm_ext = _bme_get(mdev, enr))); 1122 + if (sig) 1123 + return 0; 1124 + 1125 + if (test_bit(BME_LOCKED, &bm_ext->flags)) 1126 + return 1; 1127 + 1128 + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 1129 + sig = wait_event_interruptible(mdev->al_wait, 1130 + !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i)); 1131 + if (sig) { 1132 + spin_lock_irq(&mdev->al_lock); 1133 + if (lc_put(mdev->resync, &bm_ext->lce) == 0) { 1134 + clear_bit(BME_NO_WRITES, &bm_ext->flags); 1135 + mdev->resync_locked--; 1136 + wake_up(&mdev->al_wait); 1137 + } 1138 + spin_unlock_irq(&mdev->al_lock); 1139 + return 0; 1140 + } 1141 + } 1142 + 1143 + set_bit(BME_LOCKED, &bm_ext->flags); 1144 + 1145 + return 1; 1146 + } 1147 + 1148 + /** 1149 + * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep 1150 + * @mdev: DRBD device.
1151 + * @sector: The sector number. 1152 + * 1153 + * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then 1154 + * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN 1155 + * if there is still application IO going on in this area. 1156 + */ 1157 + int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) 1158 + { 1159 + unsigned int enr = BM_SECT_TO_EXT(sector); 1160 + const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; 1161 + struct lc_element *e; 1162 + struct bm_extent *bm_ext; 1163 + int i; 1164 + 1165 + spin_lock_irq(&mdev->al_lock); 1166 + if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) { 1167 + /* in case you have very heavy scattered io, it may 1168 + * stall the syncer for an undefined time if we give up the ref count 1169 + * when we try again and requeue. 1170 + * 1171 + * if we don't give up the refcount, but the next time 1172 + * we are scheduled this extent has been "synced" by new 1173 + * application writes, we'd miss the lc_put on the 1174 + * extent we keep the refcount on. 1175 + * so we remember which extent we had to try again, and 1176 + * if the next requested one is something else, we do 1177 + * the lc_put here... 1178 + * we also have to wake_up 1179 + */ 1180 + e = lc_find(mdev->resync, mdev->resync_wenr); 1181 + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1182 + if (bm_ext) { 1183 + D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); 1184 + D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); 1185 + clear_bit(BME_NO_WRITES, &bm_ext->flags); 1186 + mdev->resync_wenr = LC_FREE; 1187 + if (lc_put(mdev->resync, &bm_ext->lce) == 0) 1188 + mdev->resync_locked--; 1189 + wake_up(&mdev->al_wait); 1190 + } else { 1191 + dev_alert(DEV, "LOGIC BUG\n"); 1192 + } 1193 + } 1194 + /* TRY. */ 1195 + e = lc_try_get(mdev->resync, enr); 1196 + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1197 + if (bm_ext) { 1198 + if (test_bit(BME_LOCKED, &bm_ext->flags)) 1199 + goto proceed; 1200 + if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) { 1201 + mdev->resync_locked++; 1202 + } else { 1203 + /* we did set the BME_NO_WRITES, 1204 + * but then could not set BME_LOCKED, 1205 + * so we tried again. 1206 + * drop the extra reference. */ 1207 + bm_ext->lce.refcnt--; 1208 + D_ASSERT(bm_ext->lce.refcnt > 0); 1209 + } 1210 + goto check_al; 1211 + } else { 1212 + /* do we rather want to try later? */ 1213 + if (mdev->resync_locked > mdev->resync->nr_elements-3) 1214 + goto try_again; 1215 + /* Do or do not. There is no try. -- Yoda */ 1216 + e = lc_get(mdev->resync, enr); 1217 + bm_ext = e ?
lc_entry(e, struct bm_extent, lce) : NULL; 1218 + if (!bm_ext) { 1219 + const unsigned long rs_flags = mdev->resync->flags; 1220 + if (rs_flags & LC_STARVING) 1221 + dev_warn(DEV, "Have to wait for element" 1222 + " (resync LRU too small?)\n"); 1223 + BUG_ON(rs_flags & LC_DIRTY); 1224 + goto try_again; 1225 + } 1226 + if (bm_ext->lce.lc_number != enr) { 1227 + bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); 1228 + bm_ext->rs_failed = 0; 1229 + lc_changed(mdev->resync, &bm_ext->lce); 1230 + wake_up(&mdev->al_wait); 1231 + D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); 1232 + } 1233 + set_bit(BME_NO_WRITES, &bm_ext->flags); 1234 + D_ASSERT(bm_ext->lce.refcnt == 1); 1235 + mdev->resync_locked++; 1236 + goto check_al; 1237 + } 1238 + check_al: 1239 + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 1240 + if (unlikely(al_enr+i == mdev->act_log->new_number)) 1241 + goto try_again; 1242 + if (lc_is_used(mdev->act_log, al_enr+i)) 1243 + goto try_again; 1244 + } 1245 + set_bit(BME_LOCKED, &bm_ext->flags); 1246 + proceed: 1247 + mdev->resync_wenr = LC_FREE; 1248 + spin_unlock_irq(&mdev->al_lock); 1249 + return 0; 1250 + 1251 + try_again: 1252 + if (bm_ext) 1253 + mdev->resync_wenr = enr; 1254 + spin_unlock_irq(&mdev->al_lock); 1255 + return -EAGAIN; 1256 + } 1257 + 1258 + void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector) 1259 + { 1260 + unsigned int enr = BM_SECT_TO_EXT(sector); 1261 + struct lc_element *e; 1262 + struct bm_extent *bm_ext; 1263 + unsigned long flags; 1264 + 1265 + spin_lock_irqsave(&mdev->al_lock, flags); 1266 + e = lc_find(mdev->resync, enr); 1267 + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1268 + if (!bm_ext) { 1269 + spin_unlock_irqrestore(&mdev->al_lock, flags); 1270 + if (__ratelimit(&drbd_ratelimit_state)) 1271 + dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n"); 1272 + return; 1273 + } 1274 + 1275 + if (bm_ext->lce.refcnt == 0) { 1276 + spin_unlock_irqrestore(&mdev->al_lock, flags); 1277 + dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, " 1278 + "but refcnt is 0!?\n", 1279 + (unsigned long long)sector, enr); 1280 + return; 1281 + } 1282 + 1283 + if (lc_put(mdev->resync, &bm_ext->lce) == 0) { 1284 + clear_bit(BME_LOCKED, &bm_ext->flags); 1285 + clear_bit(BME_NO_WRITES, &bm_ext->flags); 1286 + mdev->resync_locked--; 1287 + wake_up(&mdev->al_wait); 1288 + } 1289 + 1290 + spin_unlock_irqrestore(&mdev->al_lock, flags); 1291 + } 1292 + 1293 + /** 1294 + * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) 1295 + * @mdev: DRBD device. 1296 + */ 1297 + void drbd_rs_cancel_all(struct drbd_conf *mdev) 1298 + { 1299 + spin_lock_irq(&mdev->al_lock); 1300 + 1301 + if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */ 1302 + lc_reset(mdev->resync); 1303 + put_ldev(mdev); 1304 + } 1305 + mdev->resync_locked = 0; 1306 + mdev->resync_wenr = LC_FREE; 1307 + spin_unlock_irq(&mdev->al_lock); 1308 + wake_up(&mdev->al_wait); 1309 + } 1310 + 1311 + /** 1312 + * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU 1313 + * @mdev: DRBD device. 1314 + * 1315 + * Returns 0 upon success, -EAGAIN if at least one reference count was 1316 + * not zero. 1317 + */ 1318 + int drbd_rs_del_all(struct drbd_conf *mdev) 1319 + { 1320 + struct lc_element *e; 1321 + struct bm_extent *bm_ext; 1322 + int i; 1323 + 1324 + spin_lock_irq(&mdev->al_lock); 1325 + 1326 + if (get_ldev_if_state(mdev, D_FAILED)) { 1327 + /* ok, ->resync is there. 
*/ 1328 + for (i = 0; i < mdev->resync->nr_elements; i++) { 1329 + e = lc_element_by_index(mdev->resync, i); 1330 + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1331 + if (bm_ext->lce.lc_number == LC_FREE) 1332 + continue; 1333 + if (bm_ext->lce.lc_number == mdev->resync_wenr) { 1334 + dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently" 1335 + " got 'synced' by application io\n", 1336 + mdev->resync_wenr); 1337 + D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); 1338 + D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); 1339 + clear_bit(BME_NO_WRITES, &bm_ext->flags); 1340 + mdev->resync_wenr = LC_FREE; 1341 + lc_put(mdev->resync, &bm_ext->lce); 1342 + } 1343 + if (bm_ext->lce.refcnt != 0) { 1344 + dev_info(DEV, "Retrying drbd_rs_del_all() later. " 1345 + "refcnt=%d\n", bm_ext->lce.refcnt); 1346 + put_ldev(mdev); 1347 + spin_unlock_irq(&mdev->al_lock); 1348 + return -EAGAIN; 1349 + } 1350 + D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); 1351 + D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags)); 1352 + lc_del(mdev->resync, &bm_ext->lce); 1353 + } 1354 + D_ASSERT(mdev->resync->used == 0); 1355 + put_ldev(mdev); 1356 + } 1357 + spin_unlock_irq(&mdev->al_lock); 1358 + 1359 + return 0; 1360 + } 1361 + 1362 + /** 1363 + * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks 1364 + * @mdev: DRBD device. 1365 + * @sector: The sector number. 1366 + * @size: Size of failed IO operation, in bytes. 1367 + */ 1368 + void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) 1369 + { 1370 + /* Is called from worker and receiver context _only_ */ 1371 + unsigned long sbnr, ebnr, lbnr; 1372 + unsigned long count; 1373 + sector_t esector, nr_sectors; 1374 + int wake_up = 0; 1375 + 1376 + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 1377 + dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", 1378 + (unsigned long long)sector, size); 1379 + return; 1380 + } 1381 + nr_sectors = drbd_get_capacity(mdev->this_bdev); 1382 + esector = sector + (size >> 9) - 1; 1383 + 1384 + ERR_IF(sector >= nr_sectors) return; 1385 + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); 1386 + 1387 + lbnr = BM_SECT_TO_BIT(nr_sectors-1); 1388 + 1389 + /* 1390 + * round up start sector, round down end sector. we make sure we only 1391 + * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */ 1392 + if (unlikely(esector < BM_SECT_PER_BIT-1)) 1393 + return; 1394 + if (unlikely(esector == (nr_sectors-1))) 1395 + ebnr = lbnr; 1396 + else 1397 + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); 1398 + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); 1399 + 1400 + if (sbnr > ebnr) 1401 + return; 1402 + 1403 + /* 1404 + * ok, (capacity & 7) != 0 sometimes, but who cares... 1405 + * we count rs_{total,left} in bits, not sectors. 1406 + */ 1407 + spin_lock_irq(&mdev->al_lock); 1408 + count = drbd_bm_count_bits(mdev, sbnr, ebnr); 1409 + if (count) { 1410 + mdev->rs_failed += count; 1411 + 1412 + if (get_ldev(mdev)) { 1413 + drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE); 1414 + put_ldev(mdev); 1415 + } 1416 + 1417 + /* just wake_up unconditionally now, due to the various lc_changed(), 1418 + * lc_put() calls in drbd_try_clear_on_disk_bm(). */ 1419 + wake_up = 1; 1420 + } 1421 + spin_unlock_irq(&mdev->al_lock); 1422 + if (wake_up) 1423 + wake_up(&mdev->al_wait); 1424 + }
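The rounding in drbd_rs_failed_io() above is the subtle part: the start sector is rounded up and the end sector rounded down, so only bitmap bits whose full 4K block is covered by the failed request get counted. A rough standalone sketch of that arithmetic (assuming the constants this code uses, BM_BLOCK_SIZE = 4K over 512-byte sectors, hence BM_SECT_PER_BIT = 8, and omitting the capacity clamping; the helper below is illustrative, not part of DRBD):

    #include <stdio.h>

    #define SECT_PER_BIT 8UL	/* assumed: BM_BLOCK_SIZE (4K) / 512-byte sectors */

    /* mirrors the sbnr/ebnr computation: round the start up and the
     * end down to full 4K blocks; returns 0 if no full block is covered */
    static int failed_range_to_bits(unsigned long sector, int size_bytes,
                                    unsigned long *sbnr, unsigned long *ebnr)
    {
        unsigned long esector = sector + (size_bytes >> 9) - 1;

        if (esector < SECT_PER_BIT - 1)
            return 0;
        *ebnr = (esector - (SECT_PER_BIT - 1)) / SECT_PER_BIT;
        *sbnr = (sector + SECT_PER_BIT - 1) / SECT_PER_BIT;
        return *sbnr <= *ebnr;
    }

    int main(void)
    {
        unsigned long s, e;

        /* an aligned 8K failure at sector 16 covers bits 2..3 */
        if (failed_range_to_bits(16, 8192, &s, &e))
            printf("bits %lu..%lu\n", s, e);
        /* a 512-byte failure covers no full 4K block and is ignored */
        if (!failed_range_to_bits(1, 512, &s, &e))
            printf("no full block covered\n");
        return 0;
    }

This matches the early returns in drbd_rs_failed_io(): partially covered blocks are simply not marked failed, since rs_failed is accounted in bitmap bits, not sectors.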
+1327
drivers/block/drbd/drbd_bitmap.c
··· 1 + /* 2 + drbd_bitmap.c 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2004-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + */ 24 + 25 + #include <linux/bitops.h> 26 + #include <linux/vmalloc.h> 27 + #include <linux/string.h> 28 + #include <linux/drbd.h> 29 + #include <asm/kmap_types.h> 30 + #include "drbd_int.h" 31 + 32 + /* OPAQUE outside this file! 33 + * interface defined in drbd_int.h 34 + 35 + * convention: 36 + * function name drbd_bm_... => used elsewhere, "public". 37 + * function name bm_... => internal to implementation, "private". 38 + 39 + * Note that since find_first_bit returns int, at the current granularity of 40 + * the bitmap (4KB per bit), this implementation "only" supports up to 41 + * 1<<(32+12) == 16 TB... 42 + */ 43 + 44 + /* 45 + * NOTE 46 + * Access to the *bm_pages is protected by bm_lock. 47 + * It is safe to read the other members within the lock. 48 + * 49 + * drbd_bm_set_bits is called from bio_endio callbacks, 50 + * so we may be called with irq already disabled, 51 + * so we need spin_lock_irqsave(). 52 + * And we need the kmap_atomic. 53 + */ 54 + struct drbd_bitmap { 55 + struct page **bm_pages; 56 + spinlock_t bm_lock; 57 + /* WARNING unsigned long bm_*: 58 + * 32bit number of bit offset is just enough for 512 MB bitmap. 59 + * it will blow up if we make the bitmap bigger... 60 + * not that it makes much sense to have a bitmap that large, 61 + * rather change the granularity to 16k or 64k or something. 62 + * (that implies other problems, however...) 63 + */ 64 + unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ 65 + unsigned long bm_bits; 66 + size_t bm_words; 67 + size_t bm_number_of_pages; 68 + sector_t bm_dev_capacity; 69 + struct semaphore bm_change; /* serializes resize operations */ 70 + 71 + atomic_t bm_async_io; 72 + wait_queue_head_t bm_io_wait; 73 + 74 + unsigned long bm_flags; 75 + 76 + /* debugging aid, in case we are still racy somewhere */ 77 + char *bm_why; 78 + struct task_struct *bm_task; 79 + }; 80 + 81 + /* definition of bits in bm_flags */ 82 + #define BM_LOCKED 0 83 + #define BM_MD_IO_ERROR 1 84 + #define BM_P_VMALLOCED 2 85 + 86 + static int bm_is_locked(struct drbd_bitmap *b) 87 + { 88 + return test_bit(BM_LOCKED, &b->bm_flags); 89 + } 90 + 91 + #define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) 92 + static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) 93 + { 94 + struct drbd_bitmap *b = mdev->bitmap; 95 + if (!__ratelimit(&drbd_ratelimit_state)) 96 + return; 97 + dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", 98 + current == mdev->receiver.task ?
"receiver" : 99 + current == mdev->asender.task ? "asender" : 100 + current == mdev->worker.task ? "worker" : current->comm, 101 + func, b->bm_why ?: "?", 102 + b->bm_task == mdev->receiver.task ? "receiver" : 103 + b->bm_task == mdev->asender.task ? "asender" : 104 + b->bm_task == mdev->worker.task ? "worker" : "?"); 105 + } 106 + 107 + void drbd_bm_lock(struct drbd_conf *mdev, char *why) 108 + { 109 + struct drbd_bitmap *b = mdev->bitmap; 110 + int trylock_failed; 111 + 112 + if (!b) { 113 + dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n"); 114 + return; 115 + } 116 + 117 + trylock_failed = down_trylock(&b->bm_change); 118 + 119 + if (trylock_failed) { 120 + dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", 121 + current == mdev->receiver.task ? "receiver" : 122 + current == mdev->asender.task ? "asender" : 123 + current == mdev->worker.task ? "worker" : current->comm, 124 + why, b->bm_why ?: "?", 125 + b->bm_task == mdev->receiver.task ? "receiver" : 126 + b->bm_task == mdev->asender.task ? "asender" : 127 + b->bm_task == mdev->worker.task ? "worker" : "?"); 128 + down(&b->bm_change); 129 + } 130 + if (__test_and_set_bit(BM_LOCKED, &b->bm_flags)) 131 + dev_err(DEV, "FIXME bitmap already locked in bm_lock\n"); 132 + 133 + b->bm_why = why; 134 + b->bm_task = current; 135 + } 136 + 137 + void drbd_bm_unlock(struct drbd_conf *mdev) 138 + { 139 + struct drbd_bitmap *b = mdev->bitmap; 140 + if (!b) { 141 + dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n"); 142 + return; 143 + } 144 + 145 + if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags)) 146 + dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n"); 147 + 148 + b->bm_why = NULL; 149 + b->bm_task = NULL; 150 + up(&b->bm_change); 151 + } 152 + 153 + /* word offset to long pointer */ 154 + static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km) 155 + { 156 + struct page *page; 157 + unsigned long page_nr; 158 + 159 + /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */ 160 + page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); 161 + BUG_ON(page_nr >= b->bm_number_of_pages); 162 + page = b->bm_pages[page_nr]; 163 + 164 + return (unsigned long *) kmap_atomic(page, km); 165 + } 166 + 167 + static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset) 168 + { 169 + return __bm_map_paddr(b, offset, KM_IRQ1); 170 + } 171 + 172 + static void __bm_unmap(unsigned long *p_addr, const enum km_type km) 173 + { 174 + kunmap_atomic(p_addr, km); 175 + }; 176 + 177 + static void bm_unmap(unsigned long *p_addr) 178 + { 179 + return __bm_unmap(p_addr, KM_IRQ1); 180 + } 181 + 182 + /* long word offset of _bitmap_ sector */ 183 + #define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) 184 + /* word offset from start of bitmap to word number _in_page_ 185 + * modulo longs per page 186 + #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) 187 + hm, well, Philipp thinks gcc might not optimze the % into & (... - 1) 188 + so do it explicitly: 189 + */ 190 + #define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) 191 + 192 + /* Long words per page */ 193 + #define LWPP (PAGE_SIZE/sizeof(long)) 194 + 195 + /* 196 + * actually most functions herein should take a struct drbd_bitmap*, not a 197 + * struct drbd_conf*, but for the debug macros I like to have the mdev around 198 + * to be able to report device specific. 
199 + */ 200 + 201 + static void bm_free_pages(struct page **pages, unsigned long number) 202 + { 203 + unsigned long i; 204 + if (!pages) 205 + return; 206 + 207 + for (i = 0; i < number; i++) { 208 + if (!pages[i]) { 209 + printk(KERN_ALERT "drbd: bm_free_pages tried to free " 210 + "a NULL pointer; i=%lu n=%lu\n", 211 + i, number); 212 + continue; 213 + } 214 + __free_page(pages[i]); 215 + pages[i] = NULL; 216 + } 217 + } 218 + 219 + static void bm_vk_free(void *ptr, int v) 220 + { 221 + if (v) 222 + vfree(ptr); 223 + else 224 + kfree(ptr); 225 + } 226 + 227 + /* 228 + * "have" and "want" are NUMBER OF PAGES. 229 + */ 230 + static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) 231 + { 232 + struct page **old_pages = b->bm_pages; 233 + struct page **new_pages, *page; 234 + unsigned int i, bytes, vmalloced = 0; 235 + unsigned long have = b->bm_number_of_pages; 236 + 237 + BUG_ON(have == 0 && old_pages != NULL); 238 + BUG_ON(have != 0 && old_pages == NULL); 239 + 240 + if (have == want) 241 + return old_pages; 242 + 243 + /* Trying kmalloc first, falling back to vmalloc. 244 + * GFP_KERNEL is ok, as this is done when a lower level disk is 245 + * "attached" to the drbd. Context is receiver thread or cqueue 246 + * thread. As we have no disk yet, we are not in the IO path, 247 + * not even the IO path of the peer. */ 248 + bytes = sizeof(struct page *)*want; 249 + new_pages = kmalloc(bytes, GFP_KERNEL); 250 + if (!new_pages) { 251 + new_pages = vmalloc(bytes); 252 + if (!new_pages) 253 + return NULL; 254 + vmalloced = 1; 255 + } 256 + 257 + memset(new_pages, 0, bytes); 258 + if (want >= have) { 259 + for (i = 0; i < have; i++) 260 + new_pages[i] = old_pages[i]; 261 + for (; i < want; i++) { 262 + page = alloc_page(GFP_HIGHUSER); 263 + if (!page) { 264 + bm_free_pages(new_pages + have, i - have); 265 + bm_vk_free(new_pages, vmalloced); 266 + return NULL; 267 + } 268 + new_pages[i] = page; 269 + } 270 + } else { 271 + for (i = 0; i < want; i++) 272 + new_pages[i] = old_pages[i]; 273 + /* NOT HERE, we are outside the spinlock! 274 + bm_free_pages(old_pages + want, have - want); 275 + */ 276 + } 277 + 278 + if (vmalloced) 279 + set_bit(BM_P_VMALLOCED, &b->bm_flags); 280 + else 281 + clear_bit(BM_P_VMALLOCED, &b->bm_flags); 282 + 283 + return new_pages; 284 + } 285 + 286 + /* 287 + * called on driver init only. TODO call when a device is created. 288 + * allocates the drbd_bitmap, and stores it in mdev->bitmap. 289 + */ 290 + int drbd_bm_init(struct drbd_conf *mdev) 291 + { 292 + struct drbd_bitmap *b = mdev->bitmap; 293 + WARN_ON(b != NULL); 294 + b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL); 295 + if (!b) 296 + return -ENOMEM; 297 + spin_lock_init(&b->bm_lock); 298 + init_MUTEX(&b->bm_change); 299 + init_waitqueue_head(&b->bm_io_wait); 300 + 301 + mdev->bitmap = b; 302 + 303 + return 0; 304 + } 305 + 306 + sector_t drbd_bm_capacity(struct drbd_conf *mdev) 307 + { 308 + ERR_IF(!mdev->bitmap) return 0; 309 + return mdev->bitmap->bm_dev_capacity; 310 + } 311 + 312 + /* called on driver unload. TODO: call when a device is destroyed. 
313 + */ 314 + void drbd_bm_cleanup(struct drbd_conf *mdev) 315 + { 316 + ERR_IF (!mdev->bitmap) return; 317 + bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); 318 + bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags)); 319 + kfree(mdev->bitmap); 320 + mdev->bitmap = NULL; 321 + } 322 + 323 + /* 324 + * since (b->bm_bits % BITS_PER_LONG) != 0, 325 + * this masks out the remaining bits. 326 + * Returns the number of bits cleared. 327 + */ 328 + static int bm_clear_surplus(struct drbd_bitmap *b) 329 + { 330 + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; 331 + size_t w = b->bm_bits >> LN2_BPL; 332 + int cleared = 0; 333 + unsigned long *p_addr, *bm; 334 + 335 + p_addr = bm_map_paddr(b, w); 336 + bm = p_addr + MLPP(w); 337 + if (w < b->bm_words) { 338 + cleared = hweight_long(*bm & ~mask); 339 + *bm &= mask; 340 + w++; bm++; 341 + } 342 + 343 + if (w < b->bm_words) { 344 + cleared += hweight_long(*bm); 345 + *bm = 0; 346 + } 347 + bm_unmap(p_addr); 348 + return cleared; 349 + } 350 + 351 + static void bm_set_surplus(struct drbd_bitmap *b) 352 + { 353 + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; 354 + size_t w = b->bm_bits >> LN2_BPL; 355 + unsigned long *p_addr, *bm; 356 + 357 + p_addr = bm_map_paddr(b, w); 358 + bm = p_addr + MLPP(w); 359 + if (w < b->bm_words) { 360 + *bm |= ~mask; 361 + bm++; w++; 362 + } 363 + 364 + if (w < b->bm_words) { 365 + *bm = ~(0UL); 366 + } 367 + bm_unmap(p_addr); 368 + } 369 + 370 + static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian) 371 + { 372 + unsigned long *p_addr, *bm, offset = 0; 373 + unsigned long bits = 0; 374 + unsigned long i, do_now; 375 + 376 + while (offset < b->bm_words) { 377 + i = do_now = min_t(size_t, b->bm_words-offset, LWPP); 378 + p_addr = __bm_map_paddr(b, offset, KM_USER0); 379 + bm = p_addr + MLPP(offset); 380 + while (i--) { 381 + #ifndef __LITTLE_ENDIAN 382 + if (swap_endian) 383 + *bm = lel_to_cpu(*bm); 384 + #endif 385 + bits += hweight_long(*bm++); 386 + } 387 + __bm_unmap(p_addr, KM_USER0); 388 + offset += do_now; 389 + cond_resched(); 390 + } 391 + 392 + return bits; 393 + } 394 + 395 + static unsigned long bm_count_bits(struct drbd_bitmap *b) 396 + { 397 + return __bm_count_bits(b, 0); 398 + } 399 + 400 + static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b) 401 + { 402 + return __bm_count_bits(b, 1); 403 + } 404 + 405 + /* offset and len in long words.*/ 406 + static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) 407 + { 408 + unsigned long *p_addr, *bm; 409 + size_t do_now, end; 410 + 411 + #define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512) 412 + 413 + end = offset + len; 414 + 415 + if (end > b->bm_words) { 416 + printk(KERN_ALERT "drbd: bm_memset end > bm_words\n"); 417 + return; 418 + } 419 + 420 + while (offset < end) { 421 + do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; 422 + p_addr = bm_map_paddr(b, offset); 423 + bm = p_addr + MLPP(offset); 424 + if (bm+do_now > p_addr + LWPP) { 425 + printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", 426 + p_addr, bm, (int)do_now); 427 + break; /* breaks to after catch_oob_access_end() only! */ 428 + } 429 + memset(bm, c, do_now * sizeof(long)); 430 + bm_unmap(p_addr); 431 + offset += do_now; 432 + } 433 + } 434 + 435 + /* 436 + * make sure the bitmap has enough room for the attached storage, 437 + * if necessary, resize. 
438 + * called whenever we may have changed the device size. 439 + * returns -ENOMEM if we could not allocate enough memory, 0 on success. 440 + * In case this is actually a resize, we copy the old bitmap into the new one. 441 + * Otherwise, the bitmap is initialized to all bits set. 442 + */ 443 + int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) 444 + { 445 + struct drbd_bitmap *b = mdev->bitmap; 446 + unsigned long bits, words, owords, obits, *p_addr, *bm; 447 + unsigned long want, have, onpages; /* number of pages */ 448 + struct page **npages, **opages = NULL; 449 + int err = 0, growing; 450 + int opages_vmalloced; 451 + 452 + ERR_IF(!b) return -ENOMEM; 453 + 454 + drbd_bm_lock(mdev, "resize"); 455 + 456 + dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n", 457 + (unsigned long long)capacity); 458 + 459 + if (capacity == b->bm_dev_capacity) 460 + goto out; 461 + 462 + opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags); 463 + 464 + if (capacity == 0) { 465 + spin_lock_irq(&b->bm_lock); 466 + opages = b->bm_pages; 467 + onpages = b->bm_number_of_pages; 468 + owords = b->bm_words; 469 + b->bm_pages = NULL; 470 + b->bm_number_of_pages = 471 + b->bm_set = 472 + b->bm_bits = 473 + b->bm_words = 474 + b->bm_dev_capacity = 0; 475 + spin_unlock_irq(&b->bm_lock); 476 + bm_free_pages(opages, onpages); 477 + bm_vk_free(opages, opages_vmalloced); 478 + goto out; 479 + } 480 + bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); 481 + 482 + /* if we would use 483 + words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; 484 + a 32bit host could present the wrong number of words 485 + to a 64bit host. 486 + */ 487 + words = ALIGN(bits, 64) >> LN2_BPL; 488 + 489 + if (get_ldev(mdev)) { 490 + D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12)); 491 + put_ldev(mdev); 492 + } 493 + 494 + /* one extra long to catch off by one errors */ 495 + want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; 496 + have = b->bm_number_of_pages; 497 + if (want == have) { 498 + D_ASSERT(b->bm_pages != NULL); 499 + npages = b->bm_pages; 500 + } else { 501 + if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC)) 502 + npages = NULL; 503 + else 504 + npages = bm_realloc_pages(b, want); 505 + } 506 + 507 + if (!npages) { 508 + err = -ENOMEM; 509 + goto out; 510 + } 511 + 512 + spin_lock_irq(&b->bm_lock); 513 + opages = b->bm_pages; 514 + owords = b->bm_words; 515 + obits = b->bm_bits; 516 + 517 + growing = bits > obits; 518 + if (opages) 519 + bm_set_surplus(b); 520 + 521 + b->bm_pages = npages; 522 + b->bm_number_of_pages = want; 523 + b->bm_bits = bits; 524 + b->bm_words = words; 525 + b->bm_dev_capacity = capacity; 526 + 527 + if (growing) { 528 + bm_memset(b, owords, 0xff, words-owords); 529 + b->bm_set += bits - obits; 530 + } 531 + 532 + if (want < have) { 533 + /* implicit: (opages != NULL) && (opages != npages) */ 534 + bm_free_pages(opages + want, have - want); 535 + } 536 + 537 + p_addr = bm_map_paddr(b, words); 538 + bm = p_addr + MLPP(words); 539 + *bm = DRBD_MAGIC; 540 + bm_unmap(p_addr); 541 + 542 + (void)bm_clear_surplus(b); 543 + 544 + spin_unlock_irq(&b->bm_lock); 545 + if (opages != npages) 546 + bm_vk_free(opages, opages_vmalloced); 547 + if (!growing) 548 + b->bm_set = bm_count_bits(b); 549 + dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words); 550 + 551 + out: 552 + drbd_bm_unlock(mdev); 553 + return err; 554 + } 555 + 556 + /* inherently racy: 557 + * if not protected by other means, return value may be out of date when 558 + * leaving this 
function... 559 + * we still need to lock it, since it is important that this returns 560 + * bm_set == 0 precisely. 561 + * 562 + * maybe bm_set should be atomic_t ? 563 + */ 564 + static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) 565 + { 566 + struct drbd_bitmap *b = mdev->bitmap; 567 + unsigned long s; 568 + unsigned long flags; 569 + 570 + ERR_IF(!b) return 0; 571 + ERR_IF(!b->bm_pages) return 0; 572 + 573 + spin_lock_irqsave(&b->bm_lock, flags); 574 + s = b->bm_set; 575 + spin_unlock_irqrestore(&b->bm_lock, flags); 576 + 577 + return s; 578 + } 579 + 580 + unsigned long drbd_bm_total_weight(struct drbd_conf *mdev) 581 + { 582 + unsigned long s; 583 + /* if I don't have a disk, I don't know about out-of-sync status */ 584 + if (!get_ldev_if_state(mdev, D_NEGOTIATING)) 585 + return 0; 586 + s = _drbd_bm_total_weight(mdev); 587 + put_ldev(mdev); 588 + return s; 589 + } 590 + 591 + size_t drbd_bm_words(struct drbd_conf *mdev) 592 + { 593 + struct drbd_bitmap *b = mdev->bitmap; 594 + ERR_IF(!b) return 0; 595 + ERR_IF(!b->bm_pages) return 0; 596 + 597 + return b->bm_words; 598 + } 599 + 600 + unsigned long drbd_bm_bits(struct drbd_conf *mdev) 601 + { 602 + struct drbd_bitmap *b = mdev->bitmap; 603 + ERR_IF(!b) return 0; 604 + 605 + return b->bm_bits; 606 + } 607 + 608 + /* merge number words from buffer into the bitmap starting at offset. 609 + * buffer[i] is expected to be little endian unsigned long. 610 + * bitmap must be locked by drbd_bm_lock. 611 + * currently only used from receive_bitmap. 612 + */ 613 + void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number, 614 + unsigned long *buffer) 615 + { 616 + struct drbd_bitmap *b = mdev->bitmap; 617 + unsigned long *p_addr, *bm; 618 + unsigned long word, bits; 619 + size_t end, do_now; 620 + 621 + end = offset + number; 622 + 623 + ERR_IF(!b) return; 624 + ERR_IF(!b->bm_pages) return; 625 + if (number == 0) 626 + return; 627 + WARN_ON(offset >= b->bm_words); 628 + WARN_ON(end > b->bm_words); 629 + 630 + spin_lock_irq(&b->bm_lock); 631 + while (offset < end) { 632 + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; 633 + p_addr = bm_map_paddr(b, offset); 634 + bm = p_addr + MLPP(offset); 635 + offset += do_now; 636 + while (do_now--) { 637 + bits = hweight_long(*bm); 638 + word = *bm | lel_to_cpu(*buffer++); 639 + *bm++ = word; 640 + b->bm_set += hweight_long(word) - bits; 641 + } 642 + bm_unmap(p_addr); 643 + } 644 + /* with 32bit <-> 64bit cross-platform connect 645 + * this is only correct for current usage, 646 + * where we _know_ that we are 64 bit aligned, 647 + * and know that this function is used in this way, too... 648 + */ 649 + if (end == b->bm_words) 650 + b->bm_set -= bm_clear_surplus(b); 651 + 652 + spin_unlock_irq(&b->bm_lock); 653 + } 654 + 655 + /* copy number words from the bitmap starting at offset into the buffer. 656 + * buffer[i] will be little endian unsigned long. 
657 + */ 658 + void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, 659 + unsigned long *buffer) 660 + { 661 + struct drbd_bitmap *b = mdev->bitmap; 662 + unsigned long *p_addr, *bm; 663 + size_t end, do_now; 664 + 665 + end = offset + number; 666 + 667 + ERR_IF(!b) return; 668 + ERR_IF(!b->bm_pages) return; 669 + 670 + spin_lock_irq(&b->bm_lock); 671 + if ((offset >= b->bm_words) || 672 + (end > b->bm_words) || 673 + (number <= 0)) 674 + dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n", 675 + (unsigned long) offset, 676 + (unsigned long) number, 677 + (unsigned long) b->bm_words); 678 + else { 679 + while (offset < end) { 680 + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; 681 + p_addr = bm_map_paddr(b, offset); 682 + bm = p_addr + MLPP(offset); 683 + offset += do_now; 684 + while (do_now--) 685 + *buffer++ = cpu_to_lel(*bm++); 686 + bm_unmap(p_addr); 687 + } 688 + } 689 + spin_unlock_irq(&b->bm_lock); 690 + } 691 + 692 + /* set all bits in the bitmap */ 693 + void drbd_bm_set_all(struct drbd_conf *mdev) 694 + { 695 + struct drbd_bitmap *b = mdev->bitmap; 696 + ERR_IF(!b) return; 697 + ERR_IF(!b->bm_pages) return; 698 + 699 + spin_lock_irq(&b->bm_lock); 700 + bm_memset(b, 0, 0xff, b->bm_words); 701 + (void)bm_clear_surplus(b); 702 + b->bm_set = b->bm_bits; 703 + spin_unlock_irq(&b->bm_lock); 704 + } 705 + 706 + /* clear all bits in the bitmap */ 707 + void drbd_bm_clear_all(struct drbd_conf *mdev) 708 + { 709 + struct drbd_bitmap *b = mdev->bitmap; 710 + ERR_IF(!b) return; 711 + ERR_IF(!b->bm_pages) return; 712 + 713 + spin_lock_irq(&b->bm_lock); 714 + bm_memset(b, 0, 0, b->bm_words); 715 + b->bm_set = 0; 716 + spin_unlock_irq(&b->bm_lock); 717 + } 718 + 719 + static void bm_async_io_complete(struct bio *bio, int error) 720 + { 721 + struct drbd_bitmap *b = bio->bi_private; 722 + int uptodate = bio_flagged(bio, BIO_UPTODATE); 723 + 724 + 725 + /* strange behavior of some lower level drivers... 726 + * fail the request by clearing the uptodate flag, 727 + * but do not return any error?! 728 + * do we want to WARN() on this? */ 729 + if (!error && !uptodate) 730 + error = -EIO; 731 + 732 + if (error) { 733 + /* doh. what now? 734 + * for now, set all bits, and flag MD_IO_ERROR */ 735 + __set_bit(BM_MD_IO_ERROR, &b->bm_flags); 736 + } 737 + if (atomic_dec_and_test(&b->bm_async_io)) 738 + wake_up(&b->bm_io_wait); 739 + 740 + bio_put(bio); 741 + } 742 + 743 + static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local) 744 + { 745 + /* we are process context. we always get a bio */ 746 + struct bio *bio = bio_alloc(GFP_KERNEL, 1); 747 + unsigned int len; 748 + sector_t on_disk_sector = 749 + mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset; 750 + on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); 751 + 752 + /* this might happen with very small 753 + * flexible external meta data device */ 754 + len = min_t(unsigned int, PAGE_SIZE, 755 + (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9); 756 + 757 + bio->bi_bdev = mdev->ldev->md_bdev; 758 + bio->bi_sector = on_disk_sector; 759 + bio_add_page(bio, b->bm_pages[page_nr], len, 0); 760 + bio->bi_private = b; 761 + bio->bi_end_io = bm_async_io_complete; 762 + 763 + if (FAULT_ACTIVE(mdev, (rw & WRITE) ? 
DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { 764 + bio->bi_rw |= rw; 765 + bio_endio(bio, -EIO); 766 + } else { 767 + submit_bio(rw, bio); 768 + } 769 + } 770 + 771 + # if defined(__LITTLE_ENDIAN) 772 + /* nothing to do, on disk == in memory */ 773 + # define bm_cpu_to_lel(x) ((void)0) 774 + # else 775 + void bm_cpu_to_lel(struct drbd_bitmap *b) 776 + { 777 + /* need to cpu_to_lel all the pages ... 778 + * this may be optimized by using 779 + * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0; 780 + * the following is still not optimal, but better than nothing */ 781 + unsigned int i; 782 + unsigned long *p_addr, *bm; 783 + if (b->bm_set == 0) { 784 + /* no page at all; avoid swap if all is 0 */ 785 + i = b->bm_number_of_pages; 786 + } else if (b->bm_set == b->bm_bits) { 787 + /* only the last page */ 788 + i = b->bm_number_of_pages - 1; 789 + } else { 790 + /* all pages */ 791 + i = 0; 792 + } 793 + for (; i < b->bm_number_of_pages; i++) { 794 + p_addr = kmap_atomic(b->bm_pages[i], KM_USER0); 795 + for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++) 796 + *bm = cpu_to_lel(*bm); 797 + kunmap_atomic(p_addr, KM_USER0); 798 + } 799 + } 800 + # endif 801 + /* lel_to_cpu == cpu_to_lel */ 802 + # define bm_lel_to_cpu(x) bm_cpu_to_lel(x) 803 + 804 + /* 805 + * bm_rw: read/write the whole bitmap from/to its on disk location. 806 + */ 807 + static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local) 808 + { 809 + struct drbd_bitmap *b = mdev->bitmap; 810 + /* sector_t sector; */ 811 + int bm_words, num_pages, i; 812 + unsigned long now; 813 + char ppb[10]; 814 + int err = 0; 815 + 816 + WARN_ON(!bm_is_locked(b)); 817 + 818 + /* no spinlock here, the drbd_bm_lock should be enough! */ 819 + 820 + bm_words = drbd_bm_words(mdev); 821 + num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT; 822 + 823 + /* on disk bitmap is little endian */ 824 + if (rw == WRITE) 825 + bm_cpu_to_lel(b); 826 + 827 + now = jiffies; 828 + atomic_set(&b->bm_async_io, num_pages); 829 + __clear_bit(BM_MD_IO_ERROR, &b->bm_flags); 830 + 831 + /* let the layers below us try to merge these bios... */ 832 + for (i = 0; i < num_pages; i++) 833 + bm_page_io_async(mdev, b, i, rw); 834 + 835 + drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); 836 + wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0); 837 + 838 + if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) { 839 + dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); 840 + drbd_chk_io_error(mdev, 1, TRUE); 841 + err = -EIO; 842 + } 843 + 844 + now = jiffies; 845 + if (rw == WRITE) { 846 + /* swap back endianness */ 847 + bm_lel_to_cpu(b); 848 + /* flush bitmap to stable storage */ 849 + drbd_md_flush(mdev); 850 + } else /* rw == READ */ { 851 + /* just read, if necessary adjust endianness */ 852 + b->bm_set = bm_count_bits_swap_endian(b); 853 + dev_info(DEV, "recounting of set bits took additional %lu jiffies\n", 854 + jiffies - now); 855 + } 856 + now = b->bm_set; 857 + 858 + dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", 859 + ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); 860 + 861 + return err; 862 + } 863 + 864 + /** 865 + * drbd_bm_read() - Read the whole bitmap from its on disk location. 866 + * @mdev: DRBD device. 867 + */ 868 + int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) 869 + { 870 + return bm_rw(mdev, READ); 871 + } 872 + 873 + /** 874 + * drbd_bm_write() - Write the whole bitmap to its on disk location. 875 + * @mdev: DRBD device. 
876 + */ 877 + int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) 878 + { 879 + return bm_rw(mdev, WRITE); 880 + } 881 + 882 + /** 883 + * drbd_bm_write_sect() - Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap 884 + * @mdev: DRBD device. 885 + * @enr: Extent number in the resync lru (happens to be sector offset) 886 + * 887 + * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered 888 + * by a single sector write. Therefore enr == sector offset from the 889 + * start of the bitmap. 890 + */ 891 + int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local) 892 + { 893 + sector_t on_disk_sector = enr + mdev->ldev->md.md_offset 894 + + mdev->ldev->md.bm_offset; 895 + int bm_words, num_words, offset; 896 + int err = 0; 897 + 898 + mutex_lock(&mdev->md_io_mutex); 899 + bm_words = drbd_bm_words(mdev); 900 + offset = S2W(enr); /* word offset into bitmap */ 901 + num_words = min(S2W(1), bm_words - offset); 902 + if (num_words < S2W(1)) 903 + memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE); 904 + drbd_bm_get_lel(mdev, offset, num_words, 905 + page_address(mdev->md_io_page)); 906 + if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) { 907 + int i; 908 + err = -EIO; 909 + dev_err(DEV, "IO ERROR writing bitmap sector %lu " 910 + "(meta-disk sector %llus)\n", 911 + enr, (unsigned long long)on_disk_sector); 912 + drbd_chk_io_error(mdev, 1, TRUE); 913 + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) 914 + drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i); 915 + } 916 + mdev->bm_writ_cnt++; 917 + mutex_unlock(&mdev->md_io_mutex); 918 + return err; 919 + } 920 + 921 + /* NOTE 922 + * find_first_bit returns int, we return unsigned long. 923 + * should not make much difference anyways, but ... 924 + * 925 + * this returns a bit number, NOT a sector! 926 + */ 927 + #define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1) 928 + static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, 929 + const int find_zero_bit, const enum km_type km) 930 + { 931 + struct drbd_bitmap *b = mdev->bitmap; 932 + unsigned long i = -1UL; 933 + unsigned long *p_addr; 934 + unsigned long bit_offset; /* bit offset of the mapped page.
*/ 935 + 936 + if (bm_fo > b->bm_bits) { 937 + dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); 938 + } else { 939 + while (bm_fo < b->bm_bits) { 940 + unsigned long offset; 941 + bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */ 942 + offset = bit_offset >> LN2_BPL; /* word offset of the page */ 943 + p_addr = __bm_map_paddr(b, offset, km); 944 + 945 + if (find_zero_bit) 946 + i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); 947 + else 948 + i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); 949 + 950 + __bm_unmap(p_addr, km); 951 + if (i < PAGE_SIZE*8) { 952 + i = bit_offset + i; 953 + if (i >= b->bm_bits) 954 + break; 955 + goto found; 956 + } 957 + bm_fo = bit_offset + PAGE_SIZE*8; 958 + } 959 + i = -1UL; 960 + } 961 + found: 962 + return i; 963 + } 964 + 965 + static unsigned long bm_find_next(struct drbd_conf *mdev, 966 + unsigned long bm_fo, const int find_zero_bit) 967 + { 968 + struct drbd_bitmap *b = mdev->bitmap; 969 + unsigned long i = -1UL; 970 + 971 + ERR_IF(!b) return i; 972 + ERR_IF(!b->bm_pages) return i; 973 + 974 + spin_lock_irq(&b->bm_lock); 975 + if (bm_is_locked(b)) 976 + bm_print_lock_info(mdev); 977 + 978 + i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1); 979 + 980 + spin_unlock_irq(&b->bm_lock); 981 + return i; 982 + } 983 + 984 + unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) 985 + { 986 + return bm_find_next(mdev, bm_fo, 0); 987 + } 988 + 989 + #if 0 990 + /* not yet needed for anything. */ 991 + unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) 992 + { 993 + return bm_find_next(mdev, bm_fo, 1); 994 + } 995 + #endif 996 + 997 + /* does not spin_lock_irqsave. 998 + * you must take drbd_bm_lock() first */ 999 + unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) 1000 + { 1001 + /* WARN_ON(!bm_is_locked(mdev)); */ 1002 + return __bm_find_next(mdev, bm_fo, 0, KM_USER1); 1003 + } 1004 + 1005 + unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) 1006 + { 1007 + /* WARN_ON(!bm_is_locked(mdev)); */ 1008 + return __bm_find_next(mdev, bm_fo, 1, KM_USER1); 1009 + } 1010 + 1011 + /* returns number of bits actually changed. 1012 + * for val != 0, we change 0 -> 1, return code positive 1013 + * for val == 0, we change 1 -> 0, return code negative 1014 + * wants bitnr, not sector. 1015 + * expected to be called for only a few bits (e - s about BITS_PER_LONG). 1016 + * Must hold bitmap lock already. */ 1017 + int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, 1018 + unsigned long e, int val, const enum km_type km) 1019 + { 1020 + struct drbd_bitmap *b = mdev->bitmap; 1021 + unsigned long *p_addr = NULL; 1022 + unsigned long bitnr; 1023 + unsigned long last_page_nr = -1UL; 1024 + int c = 0; 1025 + 1026 + if (e >= b->bm_bits) { 1027 + dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", 1028 + s, e, b->bm_bits); 1029 + e = b->bm_bits ? 
b->bm_bits -1 : 0; 1030 + } 1031 + for (bitnr = s; bitnr <= e; bitnr++) { 1032 + unsigned long offset = bitnr>>LN2_BPL; 1033 + unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); 1034 + if (page_nr != last_page_nr) { 1035 + if (p_addr) 1036 + __bm_unmap(p_addr, km); 1037 + p_addr = __bm_map_paddr(b, offset, km); 1038 + last_page_nr = page_nr; 1039 + } 1040 + if (val) 1041 + c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr)); 1042 + else 1043 + c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr)); 1044 + } 1045 + if (p_addr) 1046 + __bm_unmap(p_addr, km); 1047 + b->bm_set += c; 1048 + return c; 1049 + } 1050 + 1051 + /* returns number of bits actually changed. 1052 + * for val != 0, we change 0 -> 1, return code positive 1053 + * for val == 0, we change 1 -> 0, return code negative 1054 + * wants bitnr, not sector */ 1055 + int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, 1056 + const unsigned long e, int val) 1057 + { 1058 + unsigned long flags; 1059 + struct drbd_bitmap *b = mdev->bitmap; 1060 + int c = 0; 1061 + 1062 + ERR_IF(!b) return 1; 1063 + ERR_IF(!b->bm_pages) return 0; 1064 + 1065 + spin_lock_irqsave(&b->bm_lock, flags); 1066 + if (bm_is_locked(b)) 1067 + bm_print_lock_info(mdev); 1068 + 1069 + c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1); 1070 + 1071 + spin_unlock_irqrestore(&b->bm_lock, flags); 1072 + return c; 1073 + } 1074 + 1075 + /* returns number of bits changed 0 -> 1 */ 1076 + int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) 1077 + { 1078 + return bm_change_bits_to(mdev, s, e, 1); 1079 + } 1080 + 1081 + /* returns number of bits changed 1 -> 0 */ 1082 + int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) 1083 + { 1084 + return -bm_change_bits_to(mdev, s, e, 0); 1085 + } 1086 + 1087 + /* sets all bits in full words, 1088 + * from first_word up to, but not including, last_word */ 1089 + static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, 1090 + int page_nr, int first_word, int last_word) 1091 + { 1092 + int i; 1093 + int bits; 1094 + unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0); 1095 + for (i = first_word; i < last_word; i++) { 1096 + bits = hweight_long(paddr[i]); 1097 + paddr[i] = ~0UL; 1098 + b->bm_set += BITS_PER_LONG - bits; 1099 + } 1100 + kunmap_atomic(paddr, KM_USER0); 1101 + } 1102 + 1103 + /* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave. 1104 + * You must first drbd_bm_lock(). 1105 + * Can be called to set the whole bitmap in one go. 1106 + * Sets bits from s to e _inclusive_. */ 1107 + void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) 1108 + { 1109 + /* First set_bit from the first bit (s) 1110 + * up to the next long boundary (sl), 1111 + * then assign full words up to the last long boundary (el), 1112 + * then set_bit up to and including the last bit (e). 1113 + * 1114 + * Do not use memset, because we must account for changes, 1115 + * so we need to loop over the words with hweight() anyways. 1116 + */ 1117 + unsigned long sl = ALIGN(s,BITS_PER_LONG); 1118 + unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); 1119 + int first_page; 1120 + int last_page; 1121 + int page_nr; 1122 + int first_word; 1123 + int last_word; 1124 + 1125 + if (e - s <= 3*BITS_PER_LONG) { 1126 + /* don't bother; el and sl may even be wrong. 
*/ 1127 + __bm_change_bits_to(mdev, s, e, 1, KM_USER0); 1128 + return; 1129 + } 1130 + 1131 + /* difference is large enough that we can trust sl and el */ 1132 + 1133 + /* bits filling the current long */ 1134 + if (sl) 1135 + __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0); 1136 + 1137 + first_page = sl >> (3 + PAGE_SHIFT); 1138 + last_page = el >> (3 + PAGE_SHIFT); 1139 + 1140 + /* MLPP: modulo longs per page */ 1141 + /* LWPP: long words per page */ 1142 + first_word = MLPP(sl >> LN2_BPL); 1143 + last_word = LWPP; 1144 + 1145 + /* first and full pages, unless first page == last page */ 1146 + for (page_nr = first_page; page_nr < last_page; page_nr++) { 1147 + bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word); 1148 + cond_resched(); 1149 + first_word = 0; 1150 + } 1151 + 1152 + /* last page (respectively only page, for first page == last page) */ 1153 + last_word = MLPP(el >> LN2_BPL); 1154 + bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); 1155 + 1156 + /* possibly trailing bits. 1157 + * example: (e & 63) == 63, el will be e+1. 1158 + * if that even was the very last bit, 1159 + * it would trigger an assert in __bm_change_bits_to() 1160 + */ 1161 + if (el <= e) 1162 + __bm_change_bits_to(mdev, el, e, 1, KM_USER0); 1163 + } 1164 + 1165 + /* returns bit state 1166 + * wants bitnr, NOT sector. 1167 + * inherently racy... area needs to be locked by means of {al,rs}_lru 1168 + * 1 ... bit set 1169 + * 0 ... bit not set 1170 + * -1 ... first out of bounds access, stop testing for bits! 1171 + */ 1172 + int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr) 1173 + { 1174 + unsigned long flags; 1175 + struct drbd_bitmap *b = mdev->bitmap; 1176 + unsigned long *p_addr; 1177 + int i; 1178 + 1179 + ERR_IF(!b) return 0; 1180 + ERR_IF(!b->bm_pages) return 0; 1181 + 1182 + spin_lock_irqsave(&b->bm_lock, flags); 1183 + if (bm_is_locked(b)) 1184 + bm_print_lock_info(mdev); 1185 + if (bitnr < b->bm_bits) { 1186 + unsigned long offset = bitnr>>LN2_BPL; 1187 + p_addr = bm_map_paddr(b, offset); 1188 + i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0; 1189 + bm_unmap(p_addr); 1190 + } else if (bitnr == b->bm_bits) { 1191 + i = -1; 1192 + } else { /* (bitnr > b->bm_bits) */ 1193 + dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits); 1194 + i = 0; 1195 + } 1196 + 1197 + spin_unlock_irqrestore(&b->bm_lock, flags); 1198 + return i; 1199 + } 1200 + 1201 + /* returns number of bits set in the range [s, e] */ 1202 + int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) 1203 + { 1204 + unsigned long flags; 1205 + struct drbd_bitmap *b = mdev->bitmap; 1206 + unsigned long *p_addr = NULL, page_nr = -1; 1207 + unsigned long bitnr; 1208 + int c = 0; 1209 + size_t w; 1210 + 1211 + /* If this is called without a bitmap, that is a bug. 
But just to be 1212 + * robust in case we screwed up elsewhere, pretend there 1213 + * was one dirty bit in the requested area, so we won't try to do a 1214 + * local read there (no bitmap probably implies no disk) */ 1215 + ERR_IF(!b) return 1; 1216 + ERR_IF(!b->bm_pages) return 1; 1217 + 1218 + spin_lock_irqsave(&b->bm_lock, flags); 1219 + if (bm_is_locked(b)) 1220 + bm_print_lock_info(mdev); 1221 + for (bitnr = s; bitnr <= e; bitnr++) { 1222 + w = bitnr >> LN2_BPL; 1223 + if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) { 1224 + page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3); 1225 + if (p_addr) 1226 + bm_unmap(p_addr); 1227 + p_addr = bm_map_paddr(b, w); 1228 + } 1229 + ERR_IF (bitnr >= b->bm_bits) { 1230 + dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); 1231 + } else { 1232 + c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); 1233 + } 1234 + } 1235 + if (p_addr) 1236 + bm_unmap(p_addr); 1237 + spin_unlock_irqrestore(&b->bm_lock, flags); 1238 + return c; 1239 + } 1240 + 1241 + 1242 + /* inherently racy... 1243 + * the return value may already be out-of-date when this function returns. 1244 + * but the general usage is that this is only used during a cstate when bits are 1245 + * only cleared, not set; we typically only care about the case when the return 1246 + * value is zero, or we already "locked" this "bitmap extent" by other means. 1247 + * 1248 + * enr is bm-extent number, since we chose to name one sector (512 bytes) 1249 + * worth of the bitmap a "bitmap extent". 1250 + * 1251 + * TODO 1252 + * I think since we use it like a reference count, we should use the real 1253 + * reference count of some bitmap extent element from some lru instead... 1254 + * 1255 + */ 1256 + int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) 1257 + { 1258 + struct drbd_bitmap *b = mdev->bitmap; 1259 + int count, s, e; 1260 + unsigned long flags; 1261 + unsigned long *p_addr, *bm; 1262 + 1263 + ERR_IF(!b) return 0; 1264 + ERR_IF(!b->bm_pages) return 0; 1265 + 1266 + spin_lock_irqsave(&b->bm_lock, flags); 1267 + if (bm_is_locked(b)) 1268 + bm_print_lock_info(mdev); 1269 + 1270 + s = S2W(enr); 1271 + e = min((size_t)S2W(enr+1), b->bm_words); 1272 + count = 0; 1273 + if (s < b->bm_words) { 1274 + int n = e-s; 1275 + p_addr = bm_map_paddr(b, s); 1276 + bm = p_addr + MLPP(s); 1277 + while (n--) 1278 + count += hweight_long(*bm++); 1279 + bm_unmap(p_addr); 1280 + } else { 1281 + dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s); 1282 + } 1283 + spin_unlock_irqrestore(&b->bm_lock, flags); 1284 + return count; 1285 + } 1286 + 1287 + /* set all bits covered by the AL-extent al_enr */ 1288 + unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) 1289 + { 1290 + struct drbd_bitmap *b = mdev->bitmap; 1291 + unsigned long *p_addr, *bm; 1292 + unsigned long weight; 1293 + int count, s, e, i, do_now; 1294 + ERR_IF(!b) return 0; 1295 + ERR_IF(!b->bm_pages) return 0; 1296 + 1297 + spin_lock_irq(&b->bm_lock); 1298 + if (bm_is_locked(b)) 1299 + bm_print_lock_info(mdev); 1300 + weight = b->bm_set; 1301 + 1302 + s = al_enr * BM_WORDS_PER_AL_EXT; 1303 + e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); 1304 + /* assert that s and e are on the same page */ 1305 + D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3) 1306 + == s >> (PAGE_SHIFT - LN2_BPL + 3)); 1307 + count = 0; 1308 + if (s < b->bm_words) { 1309 + i = do_now = e-s; 1310 + p_addr = bm_map_paddr(b, s); 1311 + bm = p_addr + MLPP(s); 1312 + while (i--) { 1313 + count +=
hweight_long(*bm); 1314 + *bm = -1UL; 1315 + bm++; 1316 + } 1317 + bm_unmap(p_addr); 1318 + b->bm_set += do_now*BITS_PER_LONG - count; 1319 + if (e == b->bm_words) 1320 + b->bm_set -= bm_clear_surplus(b); 1321 + } else { 1322 + dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s); 1323 + } 1324 + weight = b->bm_set - weight; 1325 + spin_unlock_irq(&b->bm_lock); 1326 + return weight; 1327 + }
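A recurring pattern in drbd_bitmap.c above is splitting a long-word offset into a page number and a word index within that page: __bm_map_paddr() shifts by (PAGE_SHIFT - LN2_BPL + 3), which equals log2(PAGE_SIZE / sizeof(long)), and MLPP() masks with LWPP - 1. A minimal userspace illustration of that split (assuming 4KiB pages and 64-bit longs, i.e. PAGE_SHIFT = 12 and LN2_BPL = 6 as on x86_64; these are re-declared constants for the sketch, not the kernel's):

    #include <stdio.h>

    #define PAGE_SHIFT 12	/* assumed: 4KiB pages */
    #define LN2_BPL    6	/* assumed: log2(BITS_PER_LONG) on a 64-bit host */
    #define LWPP       (1UL << (PAGE_SHIFT - LN2_BPL + 3))	/* longs per page: 512 */

    int main(void)
    {
        unsigned long word_offset = 1234;	/* offset into the bitmap, in longs */

        /* same shift as __bm_map_paddr(): which page holds this long word */
        unsigned long page_nr = word_offset >> (PAGE_SHIFT - LN2_BPL + 3);
        /* same mask as MLPP(): index of the word within that page */
        unsigned long in_page = word_offset & (LWPP - 1);

        /* 1234 = 2*512 + 210, so: page 2, word 210 of 512 */
        printf("page %lu, word %lu of %lu\n", page_nr, in_page, LWPP);
        return 0;
    }

The "+ 3" converts LN2_BPL from a count of bits to a count of bytes (log2 of the 8 bits per byte), so on a 32-bit host (LN2_BPL = 5) the same formula yields 1024 longs per page.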
+2252
drivers/block/drbd/drbd_int.h
··· 1 + /* 2 + drbd_int.h 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + 24 + */ 25 + 26 + #ifndef _DRBD_INT_H 27 + #define _DRBD_INT_H 28 + 29 + #include <linux/compiler.h> 30 + #include <linux/types.h> 31 + #include <linux/version.h> 32 + #include <linux/list.h> 33 + #include <linux/sched.h> 34 + #include <linux/bitops.h> 35 + #include <linux/slab.h> 36 + #include <linux/crypto.h> 37 + #include <linux/ratelimit.h> 38 + #include <linux/tcp.h> 39 + #include <linux/mutex.h> 40 + #include <linux/major.h> 41 + #include <linux/blkdev.h> 42 + #include <linux/genhd.h> 43 + #include <net/tcp.h> 44 + #include <linux/lru_cache.h> 45 + 46 + #ifdef __CHECKER__ 47 + # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) 48 + # define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) 49 + # define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) 50 + # define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) 51 + #else 52 + # define __protected_by(x) 53 + # define __protected_read_by(x) 54 + # define __protected_write_by(x) 55 + # define __must_hold(x) 56 + #endif 57 + 58 + #define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0) 59 + 60 + /* module parameters, defined in drbd_main.c */ 61 + extern unsigned int minor_count; 62 + extern int disable_sendpage; 63 + extern int allow_oos; 64 + extern unsigned int cn_idx; 65 + 66 + #ifdef CONFIG_DRBD_FAULT_INJECTION 67 + extern int enable_faults; 68 + extern int fault_rate; 69 + extern int fault_devs; 70 + #endif 71 + 72 + extern char usermode_helper[]; 73 + 74 + 75 + #ifndef TRUE 76 + #define TRUE 1 77 + #endif 78 + #ifndef FALSE 79 + #define FALSE 0 80 + #endif 81 + 82 + /* I don't remember why XCPU ... 83 + * This is used to wake the asender, 84 + * and to interrupt the sending task 85 + * on disconnect. 86 + */ 87 + #define DRBD_SIG SIGXCPU 88 + 89 + /* This is used to stop/restart our threads. 90 + * Cannot use SIGTERM or SIGKILL, since these 91 + * are sent out by init on runlevel changes. 92 + * I choose SIGHUP for now.
93 + */ 94 + #define DRBD_SIGKILL SIGHUP 95 + 96 + /* All EEs on the free list should have ID_VACANT (== 0) 97 + * freshly allocated EEs get !ID_VACANT (== 1) 98 + * so if it says "cannot dereference null pointer at address 0x00000001", 99 + * it is most likely one of these :( */ 100 + 101 + #define ID_IN_SYNC (4711ULL) 102 + #define ID_OUT_OF_SYNC (4712ULL) 103 + 104 + #define ID_SYNCER (-1ULL) 105 + #define ID_VACANT 0 106 + #define is_syncer_block_id(id) ((id) == ID_SYNCER) 107 + 108 + struct drbd_conf; 109 + 110 + 111 + /* to shorten dev_warn(DEV, "msg"); and related statements */ 112 + #define DEV (disk_to_dev(mdev->vdisk)) 113 + 114 + #define D_ASSERT(exp) if (!(exp)) \ 115 + dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) 116 + 117 + #define ERR_IF(exp) if (({ \ 118 + int _b = (exp) != 0; \ 119 + if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ 120 + __func__, #exp, __FILE__, __LINE__); \ 121 + _b; \ 122 + })) 123 + 124 + /* Defines to control fault insertion */ 125 + enum { 126 + DRBD_FAULT_MD_WR = 0, /* meta data write */ 127 + DRBD_FAULT_MD_RD = 1, /* read */ 128 + DRBD_FAULT_RS_WR = 2, /* resync */ 129 + DRBD_FAULT_RS_RD = 3, 130 + DRBD_FAULT_DT_WR = 4, /* data */ 131 + DRBD_FAULT_DT_RD = 5, 132 + DRBD_FAULT_DT_RA = 6, /* data read ahead */ 133 + DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ 134 + DRBD_FAULT_AL_EE = 8, /* alloc ee */ 135 + 136 + DRBD_FAULT_MAX, 137 + }; 138 + 139 + #ifdef CONFIG_DRBD_FAULT_INJECTION 140 + extern unsigned int 141 + _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type); 142 + static inline int 143 + drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { 144 + return fault_rate && 145 + (enable_faults & (1<<type)) && 146 + _drbd_insert_fault(mdev, type); 147 + } 148 + #define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t))) 149 + 150 + #else 151 + #define FAULT_ACTIVE(_m, _t) (0) 152 + #endif 153 + 154 + /* integer division, round _UP_ to the next integer */ 155 + #define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0)) 156 + /* usual integer division */ 157 + #define div_floor(A, B) ((A)/(B)) 158 + 159 + /* drbd_meta-data.c (still in drbd_main.c) */ 160 + /* 4th incarnation of the disk layout. */ 161 + #define DRBD_MD_MAGIC (DRBD_MAGIC+4) 162 + 163 + extern struct drbd_conf **minor_table; 164 + extern struct ratelimit_state drbd_ratelimit_state; 165 + 166 + /* on the wire */ 167 + enum drbd_packets { 168 + /* receiver (data socket) */ 169 + P_DATA = 0x00, 170 + P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ 171 + P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */ 172 + P_BARRIER = 0x03, 173 + P_BITMAP = 0x04, 174 + P_BECOME_SYNC_TARGET = 0x05, 175 + P_BECOME_SYNC_SOURCE = 0x06, 176 + P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */ 177 + P_DATA_REQUEST = 0x08, /* Used to ask for a data block */ 178 + P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */ 179 + P_SYNC_PARAM = 0x0a, 180 + P_PROTOCOL = 0x0b, 181 + P_UUIDS = 0x0c, 182 + P_SIZES = 0x0d, 183 + P_STATE = 0x0e, 184 + P_SYNC_UUID = 0x0f, 185 + P_AUTH_CHALLENGE = 0x10, 186 + P_AUTH_RESPONSE = 0x11, 187 + P_STATE_CHG_REQ = 0x12, 188 + 189 + /* asender (meta socket) */ 190 + P_PING = 0x13, 191 + P_PING_ACK = 0x14, 192 + P_RECV_ACK = 0x15, /* Used in protocol B */ 193 + P_WRITE_ACK = 0x16, /* Used in protocol C */ 194 + P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync().
*/ 195 + P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */ 196 + P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ 197 + P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ 198 + P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ 199 + P_BARRIER_ACK = 0x1c, 200 + P_STATE_CHG_REPLY = 0x1d, 201 + 202 + /* "new" commands, no longer fitting into the ordering scheme above */ 203 + 204 + P_OV_REQUEST = 0x1e, /* data socket */ 205 + P_OV_REPLY = 0x1f, 206 + P_OV_RESULT = 0x20, /* meta socket */ 207 + P_CSUM_RS_REQUEST = 0x21, /* data socket */ 208 + P_RS_IS_IN_SYNC = 0x22, /* meta socket */ 209 + P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ 210 + P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ 211 + 212 + P_MAX_CMD = 0x25, 213 + P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ 214 + P_MAX_OPT_CMD = 0x101, 215 + 216 + /* special command ids for handshake */ 217 + 218 + P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */ 219 + P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */ 220 + 221 + P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */ 222 + }; 223 + 224 + static inline const char *cmdname(enum drbd_packets cmd) 225 + { 226 + /* THINK may need to become several global tables 227 + * when we want to support more than 228 + * one PRO_VERSION */ 229 + static const char *cmdnames[] = { 230 + [P_DATA] = "Data", 231 + [P_DATA_REPLY] = "DataReply", 232 + [P_RS_DATA_REPLY] = "RSDataReply", 233 + [P_BARRIER] = "Barrier", 234 + [P_BITMAP] = "ReportBitMap", 235 + [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", 236 + [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", 237 + [P_UNPLUG_REMOTE] = "UnplugRemote", 238 + [P_DATA_REQUEST] = "DataRequest", 239 + [P_RS_DATA_REQUEST] = "RSDataRequest", 240 + [P_SYNC_PARAM] = "SyncParam", 241 + [P_SYNC_PARAM89] = "SyncParam89", 242 + [P_PROTOCOL] = "ReportProtocol", 243 + [P_UUIDS] = "ReportUUIDs", 244 + [P_SIZES] = "ReportSizes", 245 + [P_STATE] = "ReportState", 246 + [P_SYNC_UUID] = "ReportSyncUUID", 247 + [P_AUTH_CHALLENGE] = "AuthChallenge", 248 + [P_AUTH_RESPONSE] = "AuthResponse", 249 + [P_PING] = "Ping", 250 + [P_PING_ACK] = "PingAck", 251 + [P_RECV_ACK] = "RecvAck", 252 + [P_WRITE_ACK] = "WriteAck", 253 + [P_RS_WRITE_ACK] = "RSWriteAck", 254 + [P_DISCARD_ACK] = "DiscardAck", 255 + [P_NEG_ACK] = "NegAck", 256 + [P_NEG_DREPLY] = "NegDReply", 257 + [P_NEG_RS_DREPLY] = "NegRSDReply", 258 + [P_BARRIER_ACK] = "BarrierAck", 259 + [P_STATE_CHG_REQ] = "StateChgRequest", 260 + [P_STATE_CHG_REPLY] = "StateChgReply", 261 + [P_OV_REQUEST] = "OVRequest", 262 + [P_OV_REPLY] = "OVReply", 263 + [P_OV_RESULT] = "OVResult", 264 + [P_MAX_CMD] = NULL, 265 + }; 266 + 267 + if (cmd == P_HAND_SHAKE_M) 268 + return "HandShakeM"; 269 + if (cmd == P_HAND_SHAKE_S) 270 + return "HandShakeS"; 271 + if (cmd == P_HAND_SHAKE) 272 + return "HandShake"; 273 + if (cmd >= P_MAX_CMD) 274 + return "Unknown"; 275 + return cmdnames[cmd]; 276 + } 277 + 278 + /* for sending/receiving the bitmap, 279 + * possibly in some encoding scheme */ 280 + struct bm_xfer_ctx { 281 + /* "const" 282 + * stores total bits and long words 283 + * of the bitmap, so we don't need to 284 + * call the accessor functions over and again. 
*/ 285 + unsigned long bm_bits; 286 + unsigned long bm_words; 287 + /* during xfer, current position within the bitmap */ 288 + unsigned long bit_offset; 289 + unsigned long word_offset; 290 + 291 + /* statistics; index: (h->command == P_BITMAP) */ 292 + unsigned packets[2]; 293 + unsigned bytes[2]; 294 + }; 295 + 296 + extern void INFO_bm_xfer_stats(struct drbd_conf *mdev, 297 + const char *direction, struct bm_xfer_ctx *c); 298 + 299 + static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) 300 + { 301 + /* word_offset counts "native long words" (32 or 64 bit), 302 + * aligned at 64 bit. 303 + * Encoded packet may end at an unaligned bit offset. 304 + * In case a fallback clear text packet is transmitted in 305 + * between, we adjust this offset back to the last 64bit 306 + * aligned "native long word", which makes coding and decoding 307 + * the plain text bitmap much more convenient. */ 308 + #if BITS_PER_LONG == 64 309 + c->word_offset = c->bit_offset >> 6; 310 + #elif BITS_PER_LONG == 32 311 + c->word_offset = c->bit_offset >> 5; 312 + c->word_offset &= ~(1UL); 313 + #else 314 + # error "unsupported BITS_PER_LONG" 315 + #endif 316 + } 317 + 318 + #ifndef __packed 319 + #define __packed __attribute__((packed)) 320 + #endif 321 + 322 + /* This is the layout for a packet on the wire. 323 + * The byteorder is the network byte order. 324 + * (except block_id and barrier fields. 325 + * these are pointers to local structs 326 + * and have no relevance for the partner, 327 + * which just echoes them as received.) 328 + * 329 + * NOTE that the payload starts at a long aligned offset, 330 + * regardless of 32 or 64 bit arch! 331 + */ 332 + struct p_header { 333 + u32 magic; 334 + u16 command; 335 + u16 length; /* bytes of data after this header */ 336 + u8 payload[0]; 337 + } __packed; 338 + /* 8 bytes. packet FIXED for the next century! 
*/ 339 + 340 + /* 341 + * short commands, packets without payload, plain p_header: 342 + * P_PING 343 + * P_PING_ACK 344 + * P_BECOME_SYNC_TARGET 345 + * P_BECOME_SYNC_SOURCE 346 + * P_UNPLUG_REMOTE 347 + */ 348 + 349 + /* 350 + * commands with out-of-struct payload: 351 + * P_BITMAP (no additional fields) 352 + * P_DATA, P_DATA_REPLY (see p_data) 353 + * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) 354 + */ 355 + 356 + /* these defines must not be changed without changing the protocol version */ 357 + #define DP_HARDBARRIER 1 358 + #define DP_RW_SYNC 2 359 + #define DP_MAY_SET_IN_SYNC 4 360 + 361 + struct p_data { 362 + struct p_header head; 363 + u64 sector; /* 64 bits sector number */ 364 + u64 block_id; /* to identify the request in protocol B&C */ 365 + u32 seq_num; 366 + u32 dp_flags; 367 + } __packed; 368 + 369 + /* 370 + * commands which share a struct: 371 + * p_block_ack: 372 + * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), 373 + * P_DISCARD_ACK (proto C, two-primaries conflict detection) 374 + * p_block_req: 375 + * P_DATA_REQUEST, P_RS_DATA_REQUEST 376 + */ 377 + struct p_block_ack { 378 + struct p_header head; 379 + u64 sector; 380 + u64 block_id; 381 + u32 blksize; 382 + u32 seq_num; 383 + } __packed; 384 + 385 + 386 + struct p_block_req { 387 + struct p_header head; 388 + u64 sector; 389 + u64 block_id; 390 + u32 blksize; 391 + u32 pad; /* to multiple of 8 Byte */ 392 + } __packed; 393 + 394 + /* 395 + * commands with their own struct for additional fields: 396 + * P_HAND_SHAKE 397 + * P_BARRIER 398 + * P_BARRIER_ACK 399 + * P_SYNC_PARAM 400 + * ReportParams 401 + */ 402 + 403 + struct p_handshake { 404 + struct p_header head; /* 8 bytes */ 405 + u32 protocol_min; 406 + u32 feature_flags; 407 + u32 protocol_max; 408 + 409 + /* should be more than enough for future enhancements 410 + * for now, feature_flags and the reserverd array shall be zero. 411 + */ 412 + 413 + u32 _pad; 414 + u64 reserverd[7]; 415 + } __packed; 416 + /* 80 bytes, FIXED for the next century */ 417 + 418 + struct p_barrier { 419 + struct p_header head; 420 + u32 barrier; /* barrier number _handle_ only */ 421 + u32 pad; /* to multiple of 8 Byte */ 422 + } __packed; 423 + 424 + struct p_barrier_ack { 425 + struct p_header head; 426 + u32 barrier; 427 + u32 set_size; 428 + } __packed; 429 + 430 + struct p_rs_param { 431 + struct p_header head; 432 + u32 rate; 433 + 434 + /* Since protocol version 88 and higher. */ 435 + char verify_alg[0]; 436 + } __packed; 437 + 438 + struct p_rs_param_89 { 439 + struct p_header head; 440 + u32 rate; 441 + /* protocol version 89: */ 442 + char verify_alg[SHARED_SECRET_MAX]; 443 + char csums_alg[SHARED_SECRET_MAX]; 444 + } __packed; 445 + 446 + struct p_protocol { 447 + struct p_header head; 448 + u32 protocol; 449 + u32 after_sb_0p; 450 + u32 after_sb_1p; 451 + u32 after_sb_2p; 452 + u32 want_lose; 453 + u32 two_primaries; 454 + 455 + /* Since protocol version 87 and higher. 
 */
	char integrity_alg[0];

} __packed;

struct p_uuids {
	struct p_header head;
	u64 uuid[UI_EXTENDED_SIZE];
} __packed;

struct p_rs_uuid {
	struct p_header head;
	u64	    uuid;
} __packed;

struct p_sizes {
	struct p_header head;
	u64	    d_size;  /* size of disk */
	u64	    u_size;  /* user requested size */
	u64	    c_size;  /* current exported size */
	u32	    max_segment_size;  /* Maximal size of a BIO */
	u32	    queue_order_type;
} __packed;

struct p_state {
	struct p_header head;
	u32	    state;
} __packed;

struct p_req_state {
	struct p_header head;
	u32	    mask;
	u32	    val;
} __packed;

struct p_req_state_reply {
	struct p_header head;
	u32	    retcode;
} __packed;

struct p_drbd06_param {
	u64	  size;
	u32	  state;
	u32	  blksize;
	u32	  protocol;
	u32	  version;
	u32	  gen_cnt[5];
	u32	  bit_map_gen[5];
} __packed;

struct p_discard {
	struct p_header head;
	u64	    block_id;
	u32	    seq_num;
	u32	    pad;
} __packed;

/* Valid values for the encoding field.
 * Bump proto version when changing this. */
enum drbd_bitmap_code {
	/* RLE_VLI_Bytes = 0,
	 * and other bit variants had been defined during
	 * algorithm evaluation. */
	RLE_VLI_Bits = 2,
};

struct p_compressed_bm {
	struct p_header head;
	/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
	 * (encoding & 0x80): polarity (set/unset) of first runlength
	 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
	 * used to pad up to head.length bytes
	 */
	u8 encoding;

	u8 code[0];
} __packed;

/* DCBP: Drbd Compressed Bitmap Packet ... */
static inline enum drbd_bitmap_code
DCBP_get_code(struct p_compressed_bm *p)
{
	return (enum drbd_bitmap_code)(p->encoding & 0x0f);
}

static inline void
DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
{
	BUG_ON(code & ~0xf);
	p->encoding = (p->encoding & ~0xf) | code;
}

static inline int
DCBP_get_start(struct p_compressed_bm *p)
{
	return (p->encoding & 0x80) != 0;
}

static inline void
DCBP_set_start(struct p_compressed_bm *p, int set)
{
	p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
}

static inline int
DCBP_get_pad_bits(struct p_compressed_bm *p)
{
	return (p->encoding >> 4) & 0x7;
}

static inline void
DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
{
	BUG_ON(n & ~0x7);
	p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
}

/* one bitmap packet, including the p_header,
 * should fit within one _architecture independent_ page.
 * so we need to use the fixed size 4KiB page size
 * most architectures have used for a long time.
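Before the packet-size constants that follow, a worked example of the encoding byte those DCBP helpers pack and unpack; the function is illustrative only and not part of the driver:

/* illustrative only: pack "RLE_VLI_Bits, first run counts set bits,
 * 3 trailing pad bits" into the encoding byte, then read part of it back */
static inline u8 example_dcbp_byte(void)
{
	struct p_compressed_bm p;

	p.encoding = 0;
	DCBP_set_code(&p, RLE_VLI_Bits);	/* low nibble  -> 0x02 */
	DCBP_set_start(&p, 1);			/* bit 7       -> 0x80 */
	DCBP_set_pad_bits(&p, 3);		/* bits 4..6   -> 0x30 */
	BUG_ON(DCBP_get_pad_bits(&p) != 3);
	return p.encoding;			/* 0x80|0x30|0x02 == 0xb2 */
}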
576 + */ 577 + #define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) 578 + #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) 579 + #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) 580 + #if (PAGE_SIZE < 4096) 581 + /* drbd_send_bitmap / receive_bitmap would break horribly */ 582 + #error "PAGE_SIZE too small" 583 + #endif 584 + 585 + union p_polymorph { 586 + struct p_header header; 587 + struct p_handshake handshake; 588 + struct p_data data; 589 + struct p_block_ack block_ack; 590 + struct p_barrier barrier; 591 + struct p_barrier_ack barrier_ack; 592 + struct p_rs_param_89 rs_param_89; 593 + struct p_protocol protocol; 594 + struct p_sizes sizes; 595 + struct p_uuids uuids; 596 + struct p_state state; 597 + struct p_req_state req_state; 598 + struct p_req_state_reply req_state_reply; 599 + struct p_block_req block_req; 600 + } __packed; 601 + 602 + /**********************************************************************/ 603 + enum drbd_thread_state { 604 + None, 605 + Running, 606 + Exiting, 607 + Restarting 608 + }; 609 + 610 + struct drbd_thread { 611 + spinlock_t t_lock; 612 + struct task_struct *task; 613 + struct completion stop; 614 + enum drbd_thread_state t_state; 615 + int (*function) (struct drbd_thread *); 616 + struct drbd_conf *mdev; 617 + int reset_cpu_mask; 618 + }; 619 + 620 + static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) 621 + { 622 + /* THINK testing the t_state seems to be uncritical in all cases 623 + * (but thread_{start,stop}), so we can read it *without* the lock. 624 + * --lge */ 625 + 626 + smp_rmb(); 627 + return thi->t_state; 628 + } 629 + 630 + 631 + /* 632 + * Having this as the first member of a struct provides sort of "inheritance". 633 + * "derived" structs can be "drbd_queue_work()"ed. 634 + * The callback should know and cast back to the descendant struct. 635 + * drbd_request and drbd_epoch_entry are descendants of drbd_work. 636 + */ 637 + struct drbd_work; 638 + typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); 639 + struct drbd_work { 640 + struct list_head list; 641 + drbd_work_cb cb; 642 + }; 643 + 644 + struct drbd_tl_epoch; 645 + struct drbd_request { 646 + struct drbd_work w; 647 + struct drbd_conf *mdev; 648 + 649 + /* if local IO is not allowed, will be NULL. 650 + * if local IO _is_ allowed, holds the locally submitted bio clone, 651 + * or, after local IO completion, the ERR_PTR(error). 652 + * see drbd_endio_pri(). */ 653 + struct bio *private_bio; 654 + 655 + struct hlist_node colision; 656 + sector_t sector; 657 + unsigned int size; 658 + unsigned int epoch; /* barrier_nr */ 659 + 660 + /* barrier_nr: used to check on "completion" whether this req was in 661 + * the current epoch, and we therefore have to close it, 662 + * starting a new epoch... 663 + */ 664 + 665 + /* up to here, the struct layout is identical to drbd_epoch_entry; 666 + * we might be able to use that to our advantage... */ 667 + 668 + struct list_head tl_requests; /* ring list in the transfer log */ 669 + struct bio *master_bio; /* master bio pointer */ 670 + unsigned long rq_state; /* see comments above _req_mod() */ 671 + int seq_num; 672 + unsigned long start_time; 673 + }; 674 + 675 + struct drbd_tl_epoch { 676 + struct drbd_work w; 677 + struct list_head requests; /* requests before */ 678 + struct drbd_tl_epoch *next; /* pointer to the next barrier */ 679 + unsigned int br_number; /* the barriers identifier. 
*/ 680 + int n_req; /* number of requests attached before this barrier */ 681 + }; 682 + 683 + struct drbd_request; 684 + 685 + /* These Tl_epoch_entries may be in one of 6 lists: 686 + active_ee .. data packet being written 687 + sync_ee .. syncer block being written 688 + done_ee .. block written, need to send P_WRITE_ACK 689 + read_ee .. [RS]P_DATA_REQUEST being read 690 + */ 691 + 692 + struct drbd_epoch { 693 + struct list_head list; 694 + unsigned int barrier_nr; 695 + atomic_t epoch_size; /* increased on every request added. */ 696 + atomic_t active; /* increased on every req. added, and dec on every finished. */ 697 + unsigned long flags; 698 + }; 699 + 700 + /* drbd_epoch flag bits */ 701 + enum { 702 + DE_BARRIER_IN_NEXT_EPOCH_ISSUED, 703 + DE_BARRIER_IN_NEXT_EPOCH_DONE, 704 + DE_CONTAINS_A_BARRIER, 705 + DE_HAVE_BARRIER_NUMBER, 706 + DE_IS_FINISHING, 707 + }; 708 + 709 + enum epoch_event { 710 + EV_PUT, 711 + EV_GOT_BARRIER_NR, 712 + EV_BARRIER_DONE, 713 + EV_BECAME_LAST, 714 + EV_CLEANUP = 32, /* used as flag */ 715 + }; 716 + 717 + struct drbd_epoch_entry { 718 + struct drbd_work w; 719 + struct drbd_conf *mdev; 720 + struct bio *private_bio; 721 + struct hlist_node colision; 722 + sector_t sector; 723 + unsigned int size; 724 + struct drbd_epoch *epoch; 725 + 726 + /* up to here, the struct layout is identical to drbd_request; 727 + * we might be able to use that to our advantage... */ 728 + 729 + unsigned int flags; 730 + u64 block_id; 731 + }; 732 + 733 + struct drbd_wq_barrier { 734 + struct drbd_work w; 735 + struct completion done; 736 + }; 737 + 738 + struct digest_info { 739 + int digest_size; 740 + void *digest; 741 + }; 742 + 743 + /* ee flag bits */ 744 + enum { 745 + __EE_CALL_AL_COMPLETE_IO, 746 + __EE_CONFLICT_PENDING, 747 + __EE_MAY_SET_IN_SYNC, 748 + __EE_IS_BARRIER, 749 + }; 750 + #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 751 + #define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) 752 + #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) 753 + #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) 754 + 755 + /* global flag bits */ 756 + enum { 757 + CREATE_BARRIER, /* next P_DATA is preceeded by a P_BARRIER */ 758 + SIGNAL_ASENDER, /* whether asender wants to be interrupted */ 759 + SEND_PING, /* whether asender should send a ping asap */ 760 + 761 + STOP_SYNC_TIMER, /* tell timer to cancel itself */ 762 + UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ 763 + UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ 764 + MD_DIRTY, /* current uuids and flags not yet on disk */ 765 + DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ 766 + USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ 767 + CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ 768 + CL_ST_CHG_SUCCESS, 769 + CL_ST_CHG_FAIL, 770 + CRASHED_PRIMARY, /* This node was a crashed primary. 771 + * Gets cleared when the state.conn 772 + * goes into C_CONNECTED state. */ 773 + WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ 774 + NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ 775 + CONSIDER_RESYNC, 776 + 777 + MD_NO_BARRIER, /* meta data device does not support barriers, 778 + so don't even try */ 779 + SUSPEND_IO, /* suspend application io */ 780 + BITMAP_IO, /* suspend application io; 781 + once no more io in flight, start bitmap io */ 782 + BITMAP_IO_QUEUED, /* Started bitmap IO */ 783 + RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. 
*/ 784 + NET_CONGESTED, /* The data socket is congested */ 785 + 786 + CONFIG_PENDING, /* serialization of (re)configuration requests. 787 + * if set, also prevents the device from dying */ 788 + DEVICE_DYING, /* device became unconfigured, 789 + * but worker thread is still handling the cleanup. 790 + * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed, 791 + * while this is set. */ 792 + RESIZE_PENDING, /* Size change detected locally, waiting for the response from 793 + * the peer, if it changed there as well. */ 794 + }; 795 + 796 + struct drbd_bitmap; /* opaque for drbd_conf */ 797 + 798 + /* TODO sort members for performance 799 + * MAYBE group them further */ 800 + 801 + /* THINK maybe we actually want to use the default "event/%s" worker threads 802 + * or similar in linux 2.6, which uses per cpu data and threads. 803 + * 804 + * To be general, this might need a spin_lock member. 805 + * For now, please use the mdev->req_lock to protect list_head, 806 + * see drbd_queue_work below. 807 + */ 808 + struct drbd_work_queue { 809 + struct list_head q; 810 + struct semaphore s; /* producers up it, worker down()s it */ 811 + spinlock_t q_lock; /* to protect the list. */ 812 + }; 813 + 814 + struct drbd_socket { 815 + struct drbd_work_queue work; 816 + struct mutex mutex; 817 + struct socket *socket; 818 + /* this way we get our 819 + * send/receive buffers off the stack */ 820 + union p_polymorph sbuf; 821 + union p_polymorph rbuf; 822 + }; 823 + 824 + struct drbd_md { 825 + u64 md_offset; /* sector offset to 'super' block */ 826 + 827 + u64 la_size_sect; /* last agreed size, unit sectors */ 828 + u64 uuid[UI_SIZE]; 829 + u64 device_uuid; 830 + u32 flags; 831 + u32 md_size_sect; 832 + 833 + s32 al_offset; /* signed relative sector offset to al area */ 834 + s32 bm_offset; /* signed relative sector offset to bitmap */ 835 + 836 + /* u32 al_nr_extents; important for restoring the AL 837 + * is stored into sync_conf.al_extents, which in turn 838 + * gets applied to act_log->nr_elements 839 + */ 840 + }; 841 + 842 + /* for sync_conf and other types... */ 843 + #define NL_PACKET(name, number, fields) struct name { fields }; 844 + #define NL_INTEGER(pn,pr,member) int member; 845 + #define NL_INT64(pn,pr,member) __u64 member; 846 + #define NL_BIT(pn,pr,member) unsigned member:1; 847 + #define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; 848 + #include "linux/drbd_nl.h" 849 + 850 + struct drbd_backing_dev { 851 + struct block_device *backing_bdev; 852 + struct block_device *md_bdev; 853 + struct file *lo_file; 854 + struct file *md_file; 855 + struct drbd_md md; 856 + struct disk_conf dc; /* The user provided config... 
*/ 857 + sector_t known_size; /* last known size of that backing device */ 858 + }; 859 + 860 + struct drbd_md_io { 861 + struct drbd_conf *mdev; 862 + struct completion event; 863 + int error; 864 + }; 865 + 866 + struct bm_io_work { 867 + struct drbd_work w; 868 + char *why; 869 + int (*io_fn)(struct drbd_conf *mdev); 870 + void (*done)(struct drbd_conf *mdev, int rv); 871 + }; 872 + 873 + enum write_ordering_e { 874 + WO_none, 875 + WO_drain_io, 876 + WO_bdev_flush, 877 + WO_bio_barrier 878 + }; 879 + 880 + struct drbd_conf { 881 + /* things that are stored as / read from meta data on disk */ 882 + unsigned long flags; 883 + 884 + /* configured by drbdsetup */ 885 + struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ 886 + struct syncer_conf sync_conf; 887 + struct drbd_backing_dev *ldev __protected_by(local); 888 + 889 + sector_t p_size; /* partner's disk size */ 890 + struct request_queue *rq_queue; 891 + struct block_device *this_bdev; 892 + struct gendisk *vdisk; 893 + 894 + struct drbd_socket data; /* data/barrier/cstate/parameter packets */ 895 + struct drbd_socket meta; /* ping/ack (metadata) packets */ 896 + int agreed_pro_version; /* actually used protocol version */ 897 + unsigned long last_received; /* in jiffies, either socket */ 898 + unsigned int ko_count; 899 + struct drbd_work resync_work, 900 + unplug_work, 901 + md_sync_work; 902 + struct timer_list resync_timer; 903 + struct timer_list md_sync_timer; 904 + 905 + /* Used after attach while negotiating new disk state. */ 906 + union drbd_state new_state_tmp; 907 + 908 + union drbd_state state; 909 + wait_queue_head_t misc_wait; 910 + wait_queue_head_t state_wait; /* upon each state change. */ 911 + unsigned int send_cnt; 912 + unsigned int recv_cnt; 913 + unsigned int read_cnt; 914 + unsigned int writ_cnt; 915 + unsigned int al_writ_cnt; 916 + unsigned int bm_writ_cnt; 917 + atomic_t ap_bio_cnt; /* Requests we need to complete */ 918 + atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ 919 + atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ 920 + atomic_t unacked_cnt; /* Need to send replys for */ 921 + atomic_t local_cnt; /* Waiting for local completion */ 922 + atomic_t net_cnt; /* Users of net_conf */ 923 + spinlock_t req_lock; 924 + struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ 925 + struct drbd_tl_epoch *newest_tle; 926 + struct drbd_tl_epoch *oldest_tle; 927 + struct list_head out_of_sequence_requests; 928 + struct hlist_head *tl_hash; 929 + unsigned int tl_hash_s; 930 + 931 + /* blocks to sync in this run [unit BM_BLOCK_SIZE] */ 932 + unsigned long rs_total; 933 + /* number of sync IOs that failed in this run */ 934 + unsigned long rs_failed; 935 + /* Syncer's start time [unit jiffies] */ 936 + unsigned long rs_start; 937 + /* cumulated time in PausedSyncX state [unit jiffies] */ 938 + unsigned long rs_paused; 939 + /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ 940 + unsigned long rs_mark_left; 941 + /* marks's time [unit jiffies] */ 942 + unsigned long rs_mark_time; 943 + /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */ 944 + unsigned long rs_same_csum; 945 + 946 + /* where does the admin want us to start? (sector) */ 947 + sector_t ov_start_sector; 948 + /* where are we now? (sector) */ 949 + sector_t ov_position; 950 + /* Start sector of out of sync range (to merge printk reporting). */ 951 + sector_t ov_last_oos_start; 952 + /* size of out-of-sync range in sectors. 
 */
	sector_t ov_last_oos_size;
	unsigned long ov_left; /* in bits */
	struct crypto_hash *csums_tfm;
	struct crypto_hash *verify_tfm;

	struct drbd_thread receiver;
	struct drbd_thread worker;
	struct drbd_thread asender;
	struct drbd_bitmap *bitmap;
	unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */

	/* Used to track operations of resync... */
	struct lru_cache *resync;
	/* Number of locked elements in resync LRU */
	unsigned int resync_locked;
	/* resync extent number waiting for application requests */
	unsigned int resync_wenr;

	int open_cnt;
	u64 *p_uuid;
	struct drbd_epoch *current_epoch;
	spinlock_t epoch_lock;
	unsigned int epochs;
	enum write_ordering_e write_ordering;
	struct list_head active_ee; /* IO in progress */
	struct list_head sync_ee;   /* IO in progress */
	struct list_head done_ee;   /* send ack */
	struct list_head read_ee;   /* IO in progress */
	struct list_head net_ee;    /* zero-copy network send in progress */
	struct hlist_head *ee_hash; /* is protected by req_lock! */
	unsigned int ee_hash_s;

	/* this one is protected by ee_lock, single thread */
	struct drbd_epoch_entry *last_write_w_barrier;

	int next_barrier_nr;
	struct hlist_head *app_reads_hash; /* is protected by req_lock */
	struct list_head resync_reads;
	atomic_t pp_in_use;
	wait_queue_head_t ee_wait;
	struct page *md_io_page;	/* one page buffer for md_io */
	struct page *md_io_tmpp;	/* for logical_block_size != 512 */
	struct mutex md_io_mutex;	/* protects the md_io_buffer */
	spinlock_t al_lock;
	wait_queue_head_t al_wait;
	struct lru_cache *act_log;	/* activity log */
	unsigned int al_tr_number;
	int al_tr_cycle;
	int al_tr_pos;   /* position of the next transaction in the journal */
	struct crypto_hash *cram_hmac_tfm;
	struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
	struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
	void *int_dig_out;
	void *int_dig_in;
	void *int_dig_vv;
	wait_queue_head_t seq_wait;
	atomic_t packet_seq;
	unsigned int peer_seq;
	spinlock_t peer_seq_lock;
	unsigned int minor;
	unsigned long comm_bm_set; /* communicated number of set bits. */
	cpumask_var_t cpu_mask;
	struct bm_io_work bm_io_work;
	u64 ed_uuid; /* UUID of the exposed data */
	struct mutex state_mutex;
	char congestion_reason;  /* Why we were congested... */
};

static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
{
	struct drbd_conf *mdev;

	mdev = minor < minor_count ? minor_table[minor] : NULL;

	return mdev;
}

static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
{
	return mdev->minor;
}

/* returns 1 if it was successful,
 * returns 0 if there was no data socket.
 * so wherever you are going to use the data.socket, e.g. do
 * if (!drbd_get_data_sock(mdev))
 *	return 0;
 *	CODE();
 * drbd_put_data_sock(mdev);
 */
static inline int drbd_get_data_sock(struct drbd_conf *mdev)
{
	mutex_lock(&mdev->data.mutex);
	/* drbd_disconnect() could have called drbd_free_sock()
	 * while we were waiting in down()...
*/ 1048 + if (unlikely(mdev->data.socket == NULL)) { 1049 + mutex_unlock(&mdev->data.mutex); 1050 + return 0; 1051 + } 1052 + return 1; 1053 + } 1054 + 1055 + static inline void drbd_put_data_sock(struct drbd_conf *mdev) 1056 + { 1057 + mutex_unlock(&mdev->data.mutex); 1058 + } 1059 + 1060 + /* 1061 + * function declarations 1062 + *************************/ 1063 + 1064 + /* drbd_main.c */ 1065 + 1066 + enum chg_state_flags { 1067 + CS_HARD = 1, 1068 + CS_VERBOSE = 2, 1069 + CS_WAIT_COMPLETE = 4, 1070 + CS_SERIALIZE = 8, 1071 + CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, 1072 + }; 1073 + 1074 + extern void drbd_init_set_defaults(struct drbd_conf *mdev); 1075 + extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, 1076 + union drbd_state mask, union drbd_state val); 1077 + extern void drbd_force_state(struct drbd_conf *, union drbd_state, 1078 + union drbd_state); 1079 + extern int _drbd_request_state(struct drbd_conf *, union drbd_state, 1080 + union drbd_state, enum chg_state_flags); 1081 + extern int __drbd_set_state(struct drbd_conf *, union drbd_state, 1082 + enum chg_state_flags, struct completion *done); 1083 + extern void print_st_err(struct drbd_conf *, union drbd_state, 1084 + union drbd_state, int); 1085 + extern int drbd_thread_start(struct drbd_thread *thi); 1086 + extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); 1087 + #ifdef CONFIG_SMP 1088 + extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); 1089 + extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); 1090 + #else 1091 + #define drbd_thread_current_set_cpu(A) ({}) 1092 + #define drbd_calc_cpu_mask(A) ({}) 1093 + #endif 1094 + extern void drbd_free_resources(struct drbd_conf *mdev); 1095 + extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, 1096 + unsigned int set_size); 1097 + extern void tl_clear(struct drbd_conf *mdev); 1098 + extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); 1099 + extern void drbd_free_sock(struct drbd_conf *mdev); 1100 + extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, 1101 + void *buf, size_t size, unsigned msg_flags); 1102 + extern int drbd_send_protocol(struct drbd_conf *mdev); 1103 + extern int drbd_send_uuids(struct drbd_conf *mdev); 1104 + extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); 1105 + extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); 1106 + extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); 1107 + extern int _drbd_send_state(struct drbd_conf *mdev); 1108 + extern int drbd_send_state(struct drbd_conf *mdev); 1109 + extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, 1110 + enum drbd_packets cmd, struct p_header *h, 1111 + size_t size, unsigned msg_flags); 1112 + #define USE_DATA_SOCKET 1 1113 + #define USE_META_SOCKET 0 1114 + extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, 1115 + enum drbd_packets cmd, struct p_header *h, 1116 + size_t size); 1117 + extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, 1118 + char *data, size_t size); 1119 + extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); 1120 + extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, 1121 + u32 set_size); 1122 + extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, 1123 + struct drbd_epoch_entry *e); 1124 + extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, 1125 + struct p_block_req *rp); 1126 
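The drbd_get_data_sock()/drbd_put_data_sock() pair above brackets any use of the data socket. A fleshed-out version of the usage sketched in its comment, built on drbd_send() as declared above; the wrapper itself is made up for illustration:

/* illustrative caller, following the documented pattern:
 * take the mutex-protected reference, use the socket, release it */
static int example_send_raw(struct drbd_conf *mdev, void *buf, size_t size)
{
	int sent;

	if (!drbd_get_data_sock(mdev))
		return 0;	/* socket already torn down, nothing sent */
	sent = drbd_send(mdev, mdev->data.socket, buf, size, 0);
	drbd_put_data_sock(mdev);
	return sent;
}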
extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
			    struct p_data *dp);
extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
			    sector_t sector, int blksize, u64 block_id);
extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
			   struct drbd_epoch_entry *e);
extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
extern int _drbd_send_barrier(struct drbd_conf *mdev,
			      struct drbd_tl_epoch *barrier);
extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
			      sector_t sector, int size, u64 block_id);
extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
				   sector_t sector, int size,
				   void *digest, int digest_size,
				   enum drbd_packets cmd);
extern int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size);

extern int drbd_send_bitmap(struct drbd_conf *mdev);
extern int _drbd_send_bitmap(struct drbd_conf *mdev);
extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode);
extern void drbd_free_bc(struct drbd_backing_dev *ldev);
extern void drbd_mdev_cleanup(struct drbd_conf *mdev);

/* drbd_meta-data.c (still in drbd_main.c) */
extern void drbd_md_sync(struct drbd_conf *mdev);
extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
/* maybe define them below as inline? */
extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
				 int (*io_fn)(struct drbd_conf *),
				 void (*done)(struct drbd_conf *, int),
				 char *why);
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);


/* Meta data layout
   We reserve a 128MB Block (4k aligned)
   * either at the end of the backing device
   * or on a separate meta data device.
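To put numbers on the reserved area described above, a worked sketch using the constants defined just below:

/* worked example (sketch), derived from the defines that follow:
 *   MD_RESERVED_SECT = 128 << 11      = 262144 sectors = 128 MiB
 *   activity log:  sectors 8 .. 71    (MD_AL_OFFSET = 8, MD_AL_MAX_SIZE = 64)
 *   bitmap:        starts at sector 72 (MD_BM_OFFSET = 8 + 64)
 * i.e. the AL occupies 32 KiB, starting 4 KiB into the reserved area. */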
*/ 1175 + 1176 + #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ 1177 + /* The following numbers are sectors */ 1178 + #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ 1179 + #define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */ 1180 + /* Allows up to about 3.8TB */ 1181 + #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) 1182 + 1183 + /* Since the smalles IO unit is usually 512 byte */ 1184 + #define MD_SECTOR_SHIFT 9 1185 + #define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT) 1186 + 1187 + /* activity log */ 1188 + #define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */ 1189 + #define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */ 1190 + #define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT) 1191 + 1192 + #if BITS_PER_LONG == 32 1193 + #define LN2_BPL 5 1194 + #define cpu_to_lel(A) cpu_to_le32(A) 1195 + #define lel_to_cpu(A) le32_to_cpu(A) 1196 + #elif BITS_PER_LONG == 64 1197 + #define LN2_BPL 6 1198 + #define cpu_to_lel(A) cpu_to_le64(A) 1199 + #define lel_to_cpu(A) le64_to_cpu(A) 1200 + #else 1201 + #error "LN2 of BITS_PER_LONG unknown!" 1202 + #endif 1203 + 1204 + /* resync bitmap */ 1205 + /* 16MB sized 'bitmap extent' to track syncer usage */ 1206 + struct bm_extent { 1207 + int rs_left; /* number of bits set (out of sync) in this extent. */ 1208 + int rs_failed; /* number of failed resync requests in this extent. */ 1209 + unsigned long flags; 1210 + struct lc_element lce; 1211 + }; 1212 + 1213 + #define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */ 1214 + #define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */ 1215 + 1216 + /* drbd_bitmap.c */ 1217 + /* 1218 + * We need to store one bit for a block. 1219 + * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap. 1220 + * Bit 0 ==> local node thinks this block is binary identical on both nodes 1221 + * Bit 1 ==> local node thinks this block needs to be synced. 1222 + */ 1223 + 1224 + #define BM_BLOCK_SHIFT 12 /* 4k per bit */ 1225 + #define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) 1226 + /* (9+3) : 512 bytes @ 8 bits; representing 16M storage 1227 + * per sector of on disk bitmap */ 1228 + #define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */ 1229 + #define BM_EXT_SIZE (1<<BM_EXT_SHIFT) 1230 + 1231 + #if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) 1232 + #error "HAVE YOU FIXED drbdmeta AS WELL??" 1233 + #endif 1234 + 1235 + /* thus many _storage_ sectors are described by one bit */ 1236 + #define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9)) 1237 + #define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9)) 1238 + #define BM_SECT_PER_BIT BM_BIT_TO_SECT(1) 1239 + 1240 + /* bit to represented kilo byte conversion */ 1241 + #define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10)) 1242 + 1243 + /* in which _bitmap_ extent (resp. sector) the bit for a certain 1244 + * _storage_ sector is located in */ 1245 + #define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) 1246 + 1247 + /* how much _storage_ sectors we have per bitmap sector */ 1248 + #define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) 1249 + #define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) 1250 + 1251 + /* in one sector of the bitmap, we have this many activity_log extents. 
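A small sketch of the conversion arithmetic, using the macros just defined; the rounding helper is invented for illustration:

/* how many bitmap bits a device of `capacity` sectors needs:
 * one bit covers BM_SECT_PER_BIT = 8 sectors (4 KiB), so round up.
 * For the 1 GB example quoted above: 2097152 sectors >> 3 = 262144 bits,
 * which is 32 KB of bitmap, matching the comment. */
static inline unsigned long example_bits_needed(sector_t capacity)
{
	return BM_SECT_TO_BIT(capacity + BM_SECT_PER_BIT - 1);
}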
*/ 1252 + #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) 1253 + #define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) 1254 + 1255 + #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) 1256 + #define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) 1257 + 1258 + /* the extent in "PER_EXTENT" below is an activity log extent 1259 + * we need that many (long words/bytes) to store the bitmap 1260 + * of one AL_EXTENT_SIZE chunk of storage. 1261 + * we can store the bitmap for that many AL_EXTENTS within 1262 + * one sector of the _on_disk_ bitmap: 1263 + * bit 0 bit 37 bit 38 bit (512*8)-1 1264 + * ...|........|........|.. // ..|........| 1265 + * sect. 0 `296 `304 ^(512*8*8)-1 1266 + * 1267 + #define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG ) 1268 + #define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128 1269 + #define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4 1270 + */ 1271 + 1272 + #define DRBD_MAX_SECTORS_32 (0xffffffffLU) 1273 + #define DRBD_MAX_SECTORS_BM \ 1274 + ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9))) 1275 + #if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 1276 + #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM 1277 + #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM 1278 + #elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32 1279 + #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 1280 + #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 1281 + #else 1282 + #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM 1283 + /* 16 TB in units of sectors */ 1284 + #if BITS_PER_LONG == 32 1285 + /* adjust by one page worth of bitmap, 1286 + * so we won't wrap around in drbd_bm_find_next_bit. 1287 + * you should use 64bit OS for that much storage, anyways. */ 1288 + #define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff) 1289 + #else 1290 + #define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32) 1291 + #endif 1292 + #endif 1293 + 1294 + /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. 1295 + * With a value of 6 all IO in one 32K block make it to the same slot of the 1296 + * hash table. 
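The driver's real hash functions live elsewhere in the tree; this sketch only spells out the arithmetic the comment above describes, using the HT_SHIFT constant defined just below:

/* sketch of the slotting described above: with HT_SHIFT == 6, all
 * sectors of one 32 KiB block share the hash input (sector >> 6),
 * so concurrent requests to that block land in the same bucket.
 * `slots` would be tl_hash_s or ee_hash_s in the real tables. */
static inline unsigned int example_ht_slot(sector_t sector, unsigned int slots)
{
	return (unsigned int)(sector >> HT_SHIFT) % slots;
}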
*/ 1297 + #define HT_SHIFT 6 1298 + #define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT)) 1299 + 1300 + /* Number of elements in the app_reads_hash */ 1301 + #define APP_R_HSIZE 15 1302 + 1303 + extern int drbd_bm_init(struct drbd_conf *mdev); 1304 + extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors); 1305 + extern void drbd_bm_cleanup(struct drbd_conf *mdev); 1306 + extern void drbd_bm_set_all(struct drbd_conf *mdev); 1307 + extern void drbd_bm_clear_all(struct drbd_conf *mdev); 1308 + extern int drbd_bm_set_bits( 1309 + struct drbd_conf *mdev, unsigned long s, unsigned long e); 1310 + extern int drbd_bm_clear_bits( 1311 + struct drbd_conf *mdev, unsigned long s, unsigned long e); 1312 + /* bm_set_bits variant for use while holding drbd_bm_lock */ 1313 + extern void _drbd_bm_set_bits(struct drbd_conf *mdev, 1314 + const unsigned long s, const unsigned long e); 1315 + extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); 1316 + extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); 1317 + extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local); 1318 + extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); 1319 + extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); 1320 + extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, 1321 + unsigned long al_enr); 1322 + extern size_t drbd_bm_words(struct drbd_conf *mdev); 1323 + extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); 1324 + extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); 1325 + extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); 1326 + /* bm_find_next variants for use while you hold drbd_bm_lock() */ 1327 + extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); 1328 + extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo); 1329 + extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev); 1330 + extern int drbd_bm_rs_done(struct drbd_conf *mdev); 1331 + /* for receive_bitmap */ 1332 + extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, 1333 + size_t number, unsigned long *buffer); 1334 + /* for _drbd_send_bitmap and drbd_bm_write_sect */ 1335 + extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, 1336 + size_t number, unsigned long *buffer); 1337 + 1338 + extern void drbd_bm_lock(struct drbd_conf *mdev, char *why); 1339 + extern void drbd_bm_unlock(struct drbd_conf *mdev); 1340 + 1341 + extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e); 1342 + /* drbd_main.c */ 1343 + 1344 + extern struct kmem_cache *drbd_request_cache; 1345 + extern struct kmem_cache *drbd_ee_cache; /* epoch entries */ 1346 + extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ 1347 + extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ 1348 + extern mempool_t *drbd_request_mempool; 1349 + extern mempool_t *drbd_ee_mempool; 1350 + 1351 + extern struct page *drbd_pp_pool; /* drbd's page pool */ 1352 + extern spinlock_t drbd_pp_lock; 1353 + extern int drbd_pp_vacant; 1354 + extern wait_queue_head_t drbd_pp_wait; 1355 + 1356 + extern rwlock_t global_state_lock; 1357 + 1358 + extern struct drbd_conf *drbd_new_device(unsigned int minor); 1359 + extern void drbd_free_mdev(struct drbd_conf *mdev); 1360 + 1361 + extern int proc_details; 1362 + 1363 + /* drbd_req */ 1364 + extern int drbd_make_request_26(struct 
request_queue *q, struct bio *bio); 1365 + extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); 1366 + extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); 1367 + extern int is_valid_ar_handle(struct drbd_request *, sector_t); 1368 + 1369 + 1370 + /* drbd_nl.c */ 1371 + extern void drbd_suspend_io(struct drbd_conf *mdev); 1372 + extern void drbd_resume_io(struct drbd_conf *mdev); 1373 + extern char *ppsize(char *buf, unsigned long long size); 1374 + extern sector_t drbd_new_dev_size(struct drbd_conf *, 1375 + struct drbd_backing_dev *); 1376 + enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; 1377 + extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local); 1378 + extern void resync_after_online_grow(struct drbd_conf *); 1379 + extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); 1380 + extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, 1381 + int force); 1382 + enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); 1383 + extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); 1384 + 1385 + /* drbd_worker.c */ 1386 + extern int drbd_worker(struct drbd_thread *thi); 1387 + extern int drbd_alter_sa(struct drbd_conf *mdev, int na); 1388 + extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); 1389 + extern void resume_next_sg(struct drbd_conf *mdev); 1390 + extern void suspend_other_sg(struct drbd_conf *mdev); 1391 + extern int drbd_resync_finished(struct drbd_conf *mdev); 1392 + /* maybe rather drbd_main.c ? */ 1393 + extern int drbd_md_sync_page_io(struct drbd_conf *mdev, 1394 + struct drbd_backing_dev *bdev, sector_t sector, int rw); 1395 + extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); 1396 + 1397 + static inline void ov_oos_print(struct drbd_conf *mdev) 1398 + { 1399 + if (mdev->ov_last_oos_size) { 1400 + dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", 1401 + (unsigned long long)mdev->ov_last_oos_start, 1402 + (unsigned long)mdev->ov_last_oos_size); 1403 + } 1404 + mdev->ov_last_oos_size=0; 1405 + } 1406 + 1407 + 1408 + extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); 1409 + /* worker callbacks */ 1410 + extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); 1411 + extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); 1412 + extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int); 1413 + extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int); 1414 + extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int); 1415 + extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); 1416 + extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); 1417 + extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); 1418 + extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int); 1419 + extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); 1420 + extern int w_io_error(struct drbd_conf *, struct drbd_work *, int); 1421 + extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); 1422 + extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int); 1423 + extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); 1424 + extern int w_send_barrier(struct drbd_conf *, 
struct drbd_work *, int); 1425 + extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); 1426 + extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); 1427 + extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); 1428 + 1429 + extern void resync_timer_fn(unsigned long data); 1430 + 1431 + /* drbd_receiver.c */ 1432 + extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); 1433 + extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, 1434 + u64 id, 1435 + sector_t sector, 1436 + unsigned int data_size, 1437 + gfp_t gfp_mask) __must_hold(local); 1438 + extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); 1439 + extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, 1440 + struct list_head *head); 1441 + extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, 1442 + struct list_head *head); 1443 + extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); 1444 + extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); 1445 + extern void drbd_flush_workqueue(struct drbd_conf *mdev); 1446 + 1447 + /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to 1448 + * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ 1449 + static inline int drbd_setsockopt(struct socket *sock, int level, int optname, 1450 + char __user *optval, int optlen) 1451 + { 1452 + int err; 1453 + if (level == SOL_SOCKET) 1454 + err = sock_setsockopt(sock, level, optname, optval, optlen); 1455 + else 1456 + err = sock->ops->setsockopt(sock, level, optname, optval, 1457 + optlen); 1458 + return err; 1459 + } 1460 + 1461 + static inline void drbd_tcp_cork(struct socket *sock) 1462 + { 1463 + int __user val = 1; 1464 + (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, 1465 + (char __user *)&val, sizeof(val)); 1466 + } 1467 + 1468 + static inline void drbd_tcp_uncork(struct socket *sock) 1469 + { 1470 + int __user val = 0; 1471 + (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, 1472 + (char __user *)&val, sizeof(val)); 1473 + } 1474 + 1475 + static inline void drbd_tcp_nodelay(struct socket *sock) 1476 + { 1477 + int __user val = 1; 1478 + (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, 1479 + (char __user *)&val, sizeof(val)); 1480 + } 1481 + 1482 + static inline void drbd_tcp_quickack(struct socket *sock) 1483 + { 1484 + int __user val = 1; 1485 + (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, 1486 + (char __user *)&val, sizeof(val)); 1487 + } 1488 + 1489 + void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo); 1490 + 1491 + /* drbd_proc.c */ 1492 + extern struct proc_dir_entry *drbd_proc; 1493 + extern struct file_operations drbd_proc_fops; 1494 + extern const char *drbd_conn_str(enum drbd_conns s); 1495 + extern const char *drbd_role_str(enum drbd_role s); 1496 + 1497 + /* drbd_actlog.c */ 1498 + extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector); 1499 + extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector); 1500 + extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); 1501 + extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); 1502 + extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); 1503 + extern void drbd_rs_cancel_all(struct drbd_conf *mdev); 1504 + extern int drbd_rs_del_all(struct drbd_conf *mdev); 1505 + extern void drbd_rs_failed_io(struct drbd_conf *mdev, 1506 + sector_t sector, int size); 1507 + 
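The TCP cork wrappers above exist so that several small control packets can leave the host as one segment; a hedged sketch of the intended batching pattern follows (the function itself is made up, and locking is elided):

/* illustrative batching: cork, queue several acks, uncork to flush.
 * Assumes the meta socket is connected. */
static void example_send_acks_batched(struct drbd_conf *mdev,
				      struct drbd_epoch_entry *e1,
				      struct drbd_epoch_entry *e2)
{
	drbd_tcp_cork(mdev->meta.socket);	/* hold back partial frames */
	drbd_send_ack(mdev, P_WRITE_ACK, e1);
	drbd_send_ack(mdev, P_WRITE_ACK, e2);
	drbd_tcp_uncork(mdev->meta.socket);	/* push out as one segment */
}

With TCP_CORK set, the kernel holds back partial frames until the option is cleared (or a timeout expires), which is why the uncork must not be forgotten on any return path.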
extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); 1508 + extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, 1509 + int size, const char *file, const unsigned int line); 1510 + #define drbd_set_in_sync(mdev, sector, size) \ 1511 + __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__) 1512 + extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, 1513 + int size, const char *file, const unsigned int line); 1514 + #define drbd_set_out_of_sync(mdev, sector, size) \ 1515 + __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) 1516 + extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); 1517 + extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev); 1518 + extern void drbd_al_shrink(struct drbd_conf *mdev); 1519 + 1520 + 1521 + /* drbd_nl.c */ 1522 + 1523 + void drbd_nl_cleanup(void); 1524 + int __init drbd_nl_init(void); 1525 + void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state); 1526 + void drbd_bcast_sync_progress(struct drbd_conf *mdev); 1527 + void drbd_bcast_ee(struct drbd_conf *mdev, 1528 + const char *reason, const int dgs, 1529 + const char* seen_hash, const char* calc_hash, 1530 + const struct drbd_epoch_entry* e); 1531 + 1532 + 1533 + /** 1534 + * DOC: DRBD State macros 1535 + * 1536 + * These macros are used to express state changes in easily readable form. 1537 + * 1538 + * The NS macros expand to a mask and a value, that can be bit ored onto the 1539 + * current state as soon as the spinlock (req_lock) was taken. 1540 + * 1541 + * The _NS macros are used for state functions that get called with the 1542 + * spinlock. These macros expand directly to the new state value. 1543 + * 1544 + * Besides the basic forms NS() and _NS() additional _?NS[23] are defined 1545 + * to express state changes that affect more than one aspect of the state. 1546 + * 1547 + * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) 1548 + * Means that the network connection was established and that the peer 1549 + * is in secondary role. 
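For illustration, a state request built from these macros; drbd_request_state() is the inline defined a few lines further down, and the wrapper function here is invented:

/* sketch: ask the state machine for "connected, peer is secondary".
 * NS2() expands to the mask/val pair that drbd_request_state() expects. */
static int example_connect(struct drbd_conf *mdev)
{
	return drbd_request_state(mdev,
				  NS2(conn, C_CONNECTED, peer, R_SECONDARY));
}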
 */
#define role_MASK R_MASK
#define peer_MASK R_MASK
#define disk_MASK D_MASK
#define pdsk_MASK D_MASK
#define conn_MASK C_MASK
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1

#define NS(T, S) \
	({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
	({ union drbd_state val; val.i = 0; val.T = (S); val; })
#define NS2(T1, S1, T2, S2) \
	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
	  mask.T2 = T2##_MASK; mask; }), \
	({ union drbd_state val; val.i = 0; val.T1 = (S1); \
	  val.T2 = (S2); val; })
#define NS3(T1, S1, T2, S2, T3, S3) \
	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
	  mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
	({ union drbd_state val; val.i = 0; val.T1 = (S1); \
	  val.T2 = (S2); val.T3 = (S3); val; })

#define _NS(D, T, S) \
	D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
#define _NS2(D, T1, S1, T2, S2) \
	D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
	__ns.T2 = (S2); __ns; })
#define _NS3(D, T1, S1, T2, S2, T3, S3) \
	D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
	__ns.T2 = (S2); __ns.T3 = (S3); __ns; })

/*
 * inline helper functions
 *************************/

static inline void drbd_state_lock(struct drbd_conf *mdev)
{
	wait_event(mdev->misc_wait,
		   !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags));
}

static inline void drbd_state_unlock(struct drbd_conf *mdev)
{
	clear_bit(CLUSTER_ST_CHANGE, &mdev->flags);
	wake_up(&mdev->misc_wait);
}

static inline int _drbd_set_state(struct drbd_conf *mdev,
				  union drbd_state ns, enum chg_state_flags flags,
				  struct completion *done)
{
	int rv;

	read_lock(&global_state_lock);
	rv = __drbd_set_state(mdev, ns, flags, done);
	read_unlock(&global_state_lock);

	return rv;
}

/**
 * drbd_request_state() - Request a state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 *
 * This is the most graceful way of requesting a state change. It is quite
 * verbose in case the state change is not possible, and all those state
 * changes are globally serialized.
 */
static inline int drbd_request_state(struct drbd_conf *mdev,
				     union drbd_state mask,
				     union drbd_state val)
{
	return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
}

#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
{
	switch (mdev->ldev->dc.on_io_error) {
	case EP_PASS_ON:
		if (!forcedetach) {
			if (printk_ratelimit())
				dev_err(DEV, "Local IO failed in %s."
					"Passing error on...\n", where);
			break;
		}
		/* NOTE fall through to detach case if forcedetach set */
	case EP_DETACH:
	case EP_CALL_HELPER:
		if (mdev->state.disk > D_FAILED) {
			_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
			dev_err(DEV, "Local IO failed in %s."
1646 + "Detaching...\n", where); 1647 + } 1648 + break; 1649 + } 1650 + } 1651 + 1652 + /** 1653 + * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers 1654 + * @mdev: DRBD device. 1655 + * @error: Error code passed to the IO completion callback 1656 + * @forcedetach: Force detach. I.e. the error happened while accessing the meta data 1657 + * 1658 + * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED) 1659 + */ 1660 + #define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) 1661 + static inline void drbd_chk_io_error_(struct drbd_conf *mdev, 1662 + int error, int forcedetach, const char *where) 1663 + { 1664 + if (error) { 1665 + unsigned long flags; 1666 + spin_lock_irqsave(&mdev->req_lock, flags); 1667 + __drbd_chk_io_error_(mdev, forcedetach, where); 1668 + spin_unlock_irqrestore(&mdev->req_lock, flags); 1669 + } 1670 + } 1671 + 1672 + 1673 + /** 1674 + * drbd_md_first_sector() - Returns the first sector number of the meta data area 1675 + * @bdev: Meta data block device. 1676 + * 1677 + * BTW, for internal meta data, this happens to be the maximum capacity 1678 + * we could agree upon with our peer node. 1679 + */ 1680 + static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) 1681 + { 1682 + switch (bdev->dc.meta_dev_idx) { 1683 + case DRBD_MD_INDEX_INTERNAL: 1684 + case DRBD_MD_INDEX_FLEX_INT: 1685 + return bdev->md.md_offset + bdev->md.bm_offset; 1686 + case DRBD_MD_INDEX_FLEX_EXT: 1687 + default: 1688 + return bdev->md.md_offset; 1689 + } 1690 + } 1691 + 1692 + /** 1693 + * drbd_md_last_sector() - Return the last sector number of the meta data area 1694 + * @bdev: Meta data block device. 1695 + */ 1696 + static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) 1697 + { 1698 + switch (bdev->dc.meta_dev_idx) { 1699 + case DRBD_MD_INDEX_INTERNAL: 1700 + case DRBD_MD_INDEX_FLEX_INT: 1701 + return bdev->md.md_offset + MD_AL_OFFSET - 1; 1702 + case DRBD_MD_INDEX_FLEX_EXT: 1703 + default: 1704 + return bdev->md.md_offset + bdev->md.md_size_sect; 1705 + } 1706 + } 1707 + 1708 + /* Returns the number of 512 byte sectors of the device */ 1709 + static inline sector_t drbd_get_capacity(struct block_device *bdev) 1710 + { 1711 + /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ 1712 + return bdev ? bdev->bd_inode->i_size >> 9 : 0; 1713 + } 1714 + 1715 + /** 1716 + * drbd_get_max_capacity() - Returns the capacity we announce to out peer 1717 + * @bdev: Meta data block device. 1718 + * 1719 + * returns the capacity we announce to out peer. we clip ourselves at the 1720 + * various MAX_SECTORS, because if we don't, current implementation will 1721 + * oops sooner or later 1722 + */ 1723 + static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) 1724 + { 1725 + sector_t s; 1726 + switch (bdev->dc.meta_dev_idx) { 1727 + case DRBD_MD_INDEX_INTERNAL: 1728 + case DRBD_MD_INDEX_FLEX_INT: 1729 + s = drbd_get_capacity(bdev->backing_bdev) 1730 + ? 
min_t(sector_t, DRBD_MAX_SECTORS_FLEX, 1731 + drbd_md_first_sector(bdev)) 1732 + : 0; 1733 + break; 1734 + case DRBD_MD_INDEX_FLEX_EXT: 1735 + s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX, 1736 + drbd_get_capacity(bdev->backing_bdev)); 1737 + /* clip at maximum size the meta device can support */ 1738 + s = min_t(sector_t, s, 1739 + BM_EXT_TO_SECT(bdev->md.md_size_sect 1740 + - bdev->md.bm_offset)); 1741 + break; 1742 + default: 1743 + s = min_t(sector_t, DRBD_MAX_SECTORS, 1744 + drbd_get_capacity(bdev->backing_bdev)); 1745 + } 1746 + return s; 1747 + } 1748 + 1749 + /** 1750 + * drbd_md_ss__() - Return the sector number of our meta data super block 1751 + * @mdev: DRBD device. 1752 + * @bdev: Meta data block device. 1753 + */ 1754 + static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, 1755 + struct drbd_backing_dev *bdev) 1756 + { 1757 + switch (bdev->dc.meta_dev_idx) { 1758 + default: /* external, some index */ 1759 + return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; 1760 + case DRBD_MD_INDEX_INTERNAL: 1761 + /* with drbd08, internal meta data is always "flexible" */ 1762 + case DRBD_MD_INDEX_FLEX_INT: 1763 + /* sizeof(struct md_on_disk_07) == 4k 1764 + * position: last 4k aligned block of 4k size */ 1765 + if (!bdev->backing_bdev) { 1766 + if (__ratelimit(&drbd_ratelimit_state)) { 1767 + dev_err(DEV, "bdev->backing_bdev==NULL\n"); 1768 + dump_stack(); 1769 + } 1770 + return 0; 1771 + } 1772 + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) 1773 + - MD_AL_OFFSET; 1774 + case DRBD_MD_INDEX_FLEX_EXT: 1775 + return 0; 1776 + } 1777 + } 1778 + 1779 + static inline void 1780 + _drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) 1781 + { 1782 + list_add_tail(&w->list, &q->q); 1783 + up(&q->s); 1784 + } 1785 + 1786 + static inline void 1787 + drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) 1788 + { 1789 + unsigned long flags; 1790 + spin_lock_irqsave(&q->q_lock, flags); 1791 + list_add(&w->list, &q->q); 1792 + up(&q->s); /* within the spinlock, 1793 + see comment near end of drbd_worker() */ 1794 + spin_unlock_irqrestore(&q->q_lock, flags); 1795 + } 1796 + 1797 + static inline void 1798 + drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) 1799 + { 1800 + unsigned long flags; 1801 + spin_lock_irqsave(&q->q_lock, flags); 1802 + list_add_tail(&w->list, &q->q); 1803 + up(&q->s); /* within the spinlock, 1804 + see comment near end of drbd_worker() */ 1805 + spin_unlock_irqrestore(&q->q_lock, flags); 1806 + } 1807 + 1808 + static inline void wake_asender(struct drbd_conf *mdev) 1809 + { 1810 + if (test_bit(SIGNAL_ASENDER, &mdev->flags)) 1811 + force_sig(DRBD_SIG, mdev->asender.task); 1812 + } 1813 + 1814 + static inline void request_ping(struct drbd_conf *mdev) 1815 + { 1816 + set_bit(SEND_PING, &mdev->flags); 1817 + wake_asender(mdev); 1818 + } 1819 + 1820 + static inline int drbd_send_short_cmd(struct drbd_conf *mdev, 1821 + enum drbd_packets cmd) 1822 + { 1823 + struct p_header h; 1824 + return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); 1825 + } 1826 + 1827 + static inline int drbd_send_ping(struct drbd_conf *mdev) 1828 + { 1829 + struct p_header h; 1830 + return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); 1831 + } 1832 + 1833 + static inline int drbd_send_ping_ack(struct drbd_conf *mdev) 1834 + { 1835 + struct p_header h; 1836 + return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); 1837 + } 1838 + 1839 + static inline void drbd_thread_stop(struct drbd_thread *thi) 1840 + { 1841 + 
_drbd_thread_stop(thi, FALSE, TRUE); 1842 + } 1843 + 1844 + static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) 1845 + { 1846 + _drbd_thread_stop(thi, FALSE, FALSE); 1847 + } 1848 + 1849 + static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) 1850 + { 1851 + _drbd_thread_stop(thi, TRUE, FALSE); 1852 + } 1853 + 1854 + /* counts how many answer packets we expect from our peer, 1855 + * for either explicit application requests, 1856 + * or implicit barrier packets as necessary. 1857 + * increased: 1858 + * w_send_barrier 1859 + * _req_mod(req, queue_for_net_write or queue_for_net_read); 1860 + * it is much easier and equally valid to count what we queue for the 1861 + * worker, even before it actually was queued or sent. 1862 + * (drbd_make_request_common; recovery path on read io-error) 1863 + * decreased: 1864 + * got_BarrierAck (respective tl_clear, tl_clear_barrier) 1865 + * _req_mod(req, data_received) 1866 + * [from receive_DataReply] 1867 + * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) 1868 + * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] 1869 + * for some reason it is NOT decreased in got_NegAck, 1870 + * but in the resulting cleanup code from report_params. 1871 + * we should try to remember the reason for that... 1872 + * _req_mod(req, send_failed or send_canceled) 1873 + * _req_mod(req, connection_lost_while_pending) 1874 + * [from tl_clear_barrier] 1875 + */ 1876 + static inline void inc_ap_pending(struct drbd_conf *mdev) 1877 + { 1878 + atomic_inc(&mdev->ap_pending_cnt); 1879 + } 1880 + 1881 + #define ERR_IF_CNT_IS_NEGATIVE(which) \ 1882 + if (atomic_read(&mdev->which) < 0) \ 1883 + dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ 1884 + __func__ , __LINE__ , \ 1885 + atomic_read(&mdev->which)) 1886 + 1887 + #define dec_ap_pending(mdev) do { \ 1888 + typecheck(struct drbd_conf *, mdev); \ 1889 + if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \ 1890 + wake_up(&mdev->misc_wait); \ 1891 + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0) 1892 + 1893 + /* counts how many resync-related answers we still expect from the peer 1894 + * increase decrease 1895 + * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY) 1896 + * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER) 1897 + * (or P_NEG_ACK with ID_SYNCER) 1898 + */ 1899 + static inline void inc_rs_pending(struct drbd_conf *mdev) 1900 + { 1901 + atomic_inc(&mdev->rs_pending_cnt); 1902 + } 1903 + 1904 + #define dec_rs_pending(mdev) do { \ 1905 + typecheck(struct drbd_conf *, mdev); \ 1906 + atomic_dec(&mdev->rs_pending_cnt); \ 1907 + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0) 1908 + 1909 + /* counts how many answers we still need to send to the peer.
1910 + * increased on 1911 + * receive_Data unless protocol A; 1912 + * we need to send a P_RECV_ACK (proto B) 1913 + * or P_WRITE_ACK (proto C) 1914 + * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK 1915 + * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA 1916 + * receive_Barrier_* we need to send a P_BARRIER_ACK 1917 + */ 1918 + static inline void inc_unacked(struct drbd_conf *mdev) 1919 + { 1920 + atomic_inc(&mdev->unacked_cnt); 1921 + } 1922 + 1923 + #define dec_unacked(mdev) do { \ 1924 + typecheck(struct drbd_conf *, mdev); \ 1925 + atomic_dec(&mdev->unacked_cnt); \ 1926 + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) 1927 + 1928 + #define sub_unacked(mdev, n) do { \ 1929 + typecheck(struct drbd_conf *, mdev); \ 1930 + atomic_sub(n, &mdev->unacked_cnt); \ 1931 + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) 1932 + 1933 + 1934 + static inline void put_net_conf(struct drbd_conf *mdev) 1935 + { 1936 + if (atomic_dec_and_test(&mdev->net_cnt)) 1937 + wake_up(&mdev->misc_wait); 1938 + } 1939 + 1940 + /** 1941 + * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there 1942 + * @mdev: DRBD device. 1943 + * 1944 + * You have to call put_net_conf() when finished working with mdev->net_conf. 1945 + */ 1946 + static inline int get_net_conf(struct drbd_conf *mdev) 1947 + { 1948 + int have_net_conf; 1949 + 1950 + atomic_inc(&mdev->net_cnt); 1951 + have_net_conf = mdev->state.conn >= C_UNCONNECTED; 1952 + if (!have_net_conf) 1953 + put_net_conf(mdev); 1954 + return have_net_conf; 1955 + } 1956 + 1957 + /** 1958 + * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev 1959 + * @M: DRBD device. 1960 + * 1961 + * You have to call put_ldev() when finished working with mdev->ldev. 1962 + */ 1963 + #define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT)) 1964 + #define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS)) 1965 + 1966 + static inline void put_ldev(struct drbd_conf *mdev) 1967 + { 1968 + __release(local); 1969 + if (atomic_dec_and_test(&mdev->local_cnt)) 1970 + wake_up(&mdev->misc_wait); 1971 + D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); 1972 + } 1973 + 1974 + #ifndef __CHECKER__ 1975 + static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) 1976 + { 1977 + int io_allowed; 1978 + 1979 + atomic_inc(&mdev->local_cnt); 1980 + io_allowed = (mdev->state.disk >= mins); 1981 + if (!io_allowed) 1982 + put_ldev(mdev); 1983 + return io_allowed; 1984 + } 1985 + #else 1986 + extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins); 1987 + #endif 1988 + 1989 + /* you must have a "get_ldev" reference */ 1990 + static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, 1991 + unsigned long *bits_left, unsigned int *per_mil_done) 1992 + { 1993 + /* 1994 + * this is to break it at compile time when we change that 1995 + * (we may feel 4TB maximum storage per drbd is not enough) 1996 + */ 1997 + typecheck(unsigned long, mdev->rs_total); 1998 + 1999 + /* note: both rs_total and rs_left are in bits, i.e. in 2000 + * units of BM_BLOCK_SIZE. 2001 + * for the percentage, we don't care. */ 2002 + 2003 + *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; 2004 + /* >> 10 to prevent overflow, 2005 + * +1 to prevent division by zero */ 2006 + if (*bits_left > mdev->rs_total) { 2007 + /* doh. maybe a logic bug somewhere.
2008 + * may also be just a race condition 2009 + * between this and a disconnect during sync. 2010 + * for now, just prevent in-kernel buffer overflow. 2011 + */ 2012 + smp_rmb(); 2013 + dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", 2014 + drbd_conn_str(mdev->state.conn), 2015 + *bits_left, mdev->rs_total, mdev->rs_failed); 2016 + *per_mil_done = 0; 2017 + } else { 2018 + /* make sure the calculation happens in long context */ 2019 + unsigned long tmp = 1000UL - 2020 + (*bits_left >> 10)*1000UL 2021 + / ((mdev->rs_total >> 10) + 1UL); 2022 + *per_mil_done = tmp; 2023 + } 2024 + } 2025 + 2026 + 2027 + /* this throttles on-the-fly application requests 2028 + * according to max_buffers settings; 2029 + * maybe re-implement using semaphores? */ 2030 + static inline int drbd_get_max_buffers(struct drbd_conf *mdev) 2031 + { 2032 + int mxb = 1000000; /* arbitrary limit on open requests */ 2033 + if (get_net_conf(mdev)) { 2034 + mxb = mdev->net_conf->max_buffers; 2035 + put_net_conf(mdev); 2036 + } 2037 + return mxb; 2038 + } 2039 + 2040 + static inline int drbd_state_is_stable(union drbd_state s) 2041 + { 2042 + 2043 + /* DO NOT add a default clause, we want the compiler to warn us 2044 + * for any newly introduced state we may have forgotten to add here */ 2045 + 2046 + switch ((enum drbd_conns)s.conn) { 2047 + /* new io only accepted when there is no connection, ... */ 2048 + case C_STANDALONE: 2049 + case C_WF_CONNECTION: 2050 + /* ... or there is a well established connection. */ 2051 + case C_CONNECTED: 2052 + case C_SYNC_SOURCE: 2053 + case C_SYNC_TARGET: 2054 + case C_VERIFY_S: 2055 + case C_VERIFY_T: 2056 + case C_PAUSED_SYNC_S: 2057 + case C_PAUSED_SYNC_T: 2058 + /* maybe stable, look at the disk state */ 2059 + break; 2060 + 2061 + /* no new io accepted during transitional states 2062 + * like handshake or teardown */ 2063 + case C_DISCONNECTING: 2064 + case C_UNCONNECTED: 2065 + case C_TIMEOUT: 2066 + case C_BROKEN_PIPE: 2067 + case C_NETWORK_FAILURE: 2068 + case C_PROTOCOL_ERROR: 2069 + case C_TEAR_DOWN: 2070 + case C_WF_REPORT_PARAMS: 2071 + case C_STARTING_SYNC_S: 2072 + case C_STARTING_SYNC_T: 2073 + case C_WF_BITMAP_S: 2074 + case C_WF_BITMAP_T: 2075 + case C_WF_SYNC_UUID: 2076 + case C_MASK: 2077 + /* not "stable" */ 2078 + return 0; 2079 + } 2080 + 2081 + switch ((enum drbd_disk_state)s.disk) { 2082 + case D_DISKLESS: 2083 + case D_INCONSISTENT: 2084 + case D_OUTDATED: 2085 + case D_CONSISTENT: 2086 + case D_UP_TO_DATE: 2087 + /* disk state is stable as well. */ 2088 + break; 2089 + 2090 + /* no new io accepted during transitional states */ 2091 + case D_ATTACHING: 2092 + case D_FAILED: 2093 + case D_NEGOTIATING: 2094 + case D_UNKNOWN: 2095 + case D_MASK: 2096 + /* not "stable" */ 2097 + return 0; 2098 + } 2099 + 2100 + return 1; 2101 + } 2102 + 2103 + static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) 2104 + { 2105 + int mxb = drbd_get_max_buffers(mdev); 2106 + 2107 + if (mdev->state.susp) 2108 + return 0; 2109 + if (test_bit(SUSPEND_IO, &mdev->flags)) 2110 + return 0; 2111 + 2112 + /* to avoid potential deadlock or bitmap corruption, 2113 + * in various places, we only allow new application io 2114 + * to start during "stable" states. */ 2115 + 2116 + /* no new io accepted when attaching or detaching the disk */ 2117 + if (!drbd_state_is_stable(mdev->state)) 2118 + return 0; 2119 + 2120 + /* since some older kernels don't have atomic_add_unless, 2121 + * and we are within the spinlock anyway, we have this workaround.
*/ 2122 + if (atomic_read(&mdev->ap_bio_cnt) > mxb) 2123 + return 0; 2124 + if (test_bit(BITMAP_IO, &mdev->flags)) 2125 + return 0; 2126 + return 1; 2127 + } 2128 + 2129 + /* I'd like to use wait_event_lock_irq, 2130 + * but I'm not sure when it got introduced, 2131 + * and not sure when it has 3 or 4 arguments */ 2132 + static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) 2133 + { 2134 + /* compare with after_state_ch, 2135 + * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ 2136 + DEFINE_WAIT(wait); 2137 + 2138 + /* we wait here 2139 + * as long as the device is suspended 2140 + * until the bitmap is no longer on the fly during connection 2141 + * handshake as long as we would exceed the max_buffer limit. 2142 + * 2143 + * to avoid races with the reconnect code, 2144 + * we need to atomic_inc within the spinlock. */ 2145 + 2146 + spin_lock_irq(&mdev->req_lock); 2147 + while (!__inc_ap_bio_cond(mdev)) { 2148 + prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); 2149 + spin_unlock_irq(&mdev->req_lock); 2150 + schedule(); 2151 + finish_wait(&mdev->misc_wait, &wait); 2152 + spin_lock_irq(&mdev->req_lock); 2153 + } 2154 + atomic_add(one_or_two, &mdev->ap_bio_cnt); 2155 + spin_unlock_irq(&mdev->req_lock); 2156 + } 2157 + 2158 + static inline void dec_ap_bio(struct drbd_conf *mdev) 2159 + { 2160 + int mxb = drbd_get_max_buffers(mdev); 2161 + int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt); 2162 + 2163 + D_ASSERT(ap_bio >= 0); 2164 + /* this currently does wake_up for every dec_ap_bio! 2165 + * maybe rather introduce some type of hysteresis? 2166 + * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ 2167 + if (ap_bio < mxb) 2168 + wake_up(&mdev->misc_wait); 2169 + if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { 2170 + if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) 2171 + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); 2172 + } 2173 + } 2174 + 2175 + static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) 2176 + { 2177 + mdev->ed_uuid = val; 2178 + } 2179 + 2180 + static inline int seq_cmp(u32 a, u32 b) 2181 + { 2182 + /* we assume wrap around at 32bit. 2183 + * for wrap around at 24bit (old atomic_t), 2184 + * we'd have to 2185 + * a <<= 8; b <<= 8; 2186 + */ 2187 + return (s32)(a) - (s32)(b); 2188 + } 2189 + #define seq_lt(a, b) (seq_cmp((a), (b)) < 0) 2190 + #define seq_gt(a, b) (seq_cmp((a), (b)) > 0) 2191 + #define seq_ge(a, b) (seq_cmp((a), (b)) >= 0) 2192 + #define seq_le(a, b) (seq_cmp((a), (b)) <= 0) 2193 + /* CAUTION: please no side effects in arguments! */ 2194 + #define seq_max(a, b) ((u32)(seq_gt((a), (b)) ?
(a) : (b))) 2195 + 2196 + static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq) 2197 + { 2198 + unsigned int m; 2199 + spin_lock(&mdev->peer_seq_lock); 2200 + m = seq_max(mdev->peer_seq, new_seq); 2201 + mdev->peer_seq = m; 2202 + spin_unlock(&mdev->peer_seq_lock); 2203 + if (m == new_seq) 2204 + wake_up(&mdev->seq_wait); 2205 + } 2206 + 2207 + static inline void drbd_update_congested(struct drbd_conf *mdev) 2208 + { 2209 + struct sock *sk = mdev->data.socket->sk; 2210 + if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) 2211 + set_bit(NET_CONGESTED, &mdev->flags); 2212 + } 2213 + 2214 + static inline int drbd_queue_order_type(struct drbd_conf *mdev) 2215 + { 2216 + /* sorry, we currently have no working implementation 2217 + * of distributed TCQ stuff */ 2218 + #ifndef QUEUE_ORDERED_NONE 2219 + #define QUEUE_ORDERED_NONE 0 2220 + #endif 2221 + return QUEUE_ORDERED_NONE; 2222 + } 2223 + 2224 + static inline void drbd_blk_run_queue(struct request_queue *q) 2225 + { 2226 + if (q && q->unplug_fn) 2227 + q->unplug_fn(q); 2228 + } 2229 + 2230 + static inline void drbd_kick_lo(struct drbd_conf *mdev) 2231 + { 2232 + if (get_ldev(mdev)) { 2233 + drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev)); 2234 + put_ldev(mdev); 2235 + } 2236 + } 2237 + 2238 + static inline void drbd_md_flush(struct drbd_conf *mdev) 2239 + { 2240 + int r; 2241 + 2242 + if (test_bit(MD_NO_BARRIER, &mdev->flags)) 2243 + return; 2244 + 2245 + r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); 2246 + if (r) { 2247 + set_bit(MD_NO_BARRIER, &mdev->flags); 2248 + dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); 2249 + } 2250 + } 2251 + 2252 + #endif
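An aside on the seq_cmp()/seq_max() helpers just above: because the subtraction is done in signed 32-bit arithmetic, two sequence numbers compare correctly even across the unsigned wrap-around, as long as they are less than 2^31 apart. A small stand-alone sketch (plain user-space C written for this note, not part of the driver) illustrates the trick:

#include <stdint.h>
#include <stdio.h>

/* same arithmetic as the seq_cmp() inline above: reinterpret the
 * unsigned values as signed before subtracting, so ordering survives
 * the 32-bit wrap as long as the two values are < 2^31 apart */
static int seq_cmp(uint32_t a, uint32_t b)
{
	return (int32_t)a - (int32_t)b;
}

int main(void)
{
	uint32_t before_wrap = 0xfffffffau; /* 6 short of the wrap */
	uint32_t after_wrap  = 5u;          /* 11 steps later, past it */

	/* prints "newer": (int32_t)5 - (int32_t)0xfffffffa = 5 - (-6) = 11 */
	printf("%s\n", seq_cmp(after_wrap, before_wrap) > 0 ? "newer" : "older");
	return 0;
}

Even though the unsigned value has wrapped, after_wrap still ranks as the newer sequence number, which is exactly the property update_peer_seq() relies on when it takes seq_max() of the stored and the received peer sequence number.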
+3699
drivers/block/drbd/drbd_main.c
··· 1 + /* 2 + drbd.c 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev 11 + from Logicworks, Inc. for making SDP replication support possible. 12 + 13 + drbd is free software; you can redistribute it and/or modify 14 + it under the terms of the GNU General Public License as published by 15 + the Free Software Foundation; either version 2, or (at your option) 16 + any later version. 17 + 18 + drbd is distributed in the hope that it will be useful, 19 + but WITHOUT ANY WARRANTY; without even the implied warranty of 20 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 + GNU General Public License for more details. 22 + 23 + You should have received a copy of the GNU General Public License 24 + along with drbd; see the file COPYING. If not, write to 25 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 26 + 27 + */ 28 + 29 + #include <linux/module.h> 30 + #include <linux/version.h> 31 + #include <linux/drbd.h> 32 + #include <asm/uaccess.h> 33 + #include <asm/types.h> 34 + #include <net/sock.h> 35 + #include <linux/ctype.h> 36 + #include <linux/smp_lock.h> 37 + #include <linux/fs.h> 38 + #include <linux/file.h> 39 + #include <linux/proc_fs.h> 40 + #include <linux/init.h> 41 + #include <linux/mm.h> 42 + #include <linux/memcontrol.h> 43 + #include <linux/mm_inline.h> 44 + #include <linux/slab.h> 45 + #include <linux/random.h> 46 + #include <linux/reboot.h> 47 + #include <linux/notifier.h> 48 + #include <linux/kthread.h> 49 + 50 + #define __KERNEL_SYSCALLS__ 51 + #include <linux/unistd.h> 52 + #include <linux/vmalloc.h> 53 + 54 + #include <linux/drbd_limits.h> 55 + #include "drbd_int.h" 56 + #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ 57 + 58 + #include "drbd_vli.h" 59 + 60 + struct after_state_chg_work { 61 + struct drbd_work w; 62 + union drbd_state os; 63 + union drbd_state ns; 64 + enum chg_state_flags flags; 65 + struct completion *done; 66 + }; 67 + 68 + int drbdd_init(struct drbd_thread *); 69 + int drbd_worker(struct drbd_thread *); 70 + int drbd_asender(struct drbd_thread *); 71 + 72 + int drbd_init(void); 73 + static int drbd_open(struct block_device *bdev, fmode_t mode); 74 + static int drbd_release(struct gendisk *gd, fmode_t mode); 75 + static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused); 76 + static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, 77 + union drbd_state ns, enum chg_state_flags flags); 78 + static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); 79 + static void md_sync_timer_fn(unsigned long data); 80 + static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); 81 + 82 + MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " 83 + "Lars Ellenberg <lars@linbit.com>"); 84 + MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); 85 + MODULE_VERSION(REL_VERSION); 86 + MODULE_LICENSE("GPL"); 87 + MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); 88 + MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); 89 + 90 + #include <linux/moduleparam.h> 91 + /* allow_open_on_secondary */ 92 + MODULE_PARM_DESC(allow_oos, "DONT USE!"); 93 + /* thanks to these macros, if compiled into the 
kernel (not-module), 94 + * this becomes the boot parameter drbd.minor_count */ 95 + module_param(minor_count, uint, 0444); 96 + module_param(disable_sendpage, bool, 0644); 97 + module_param(allow_oos, bool, 0); 98 + module_param(cn_idx, uint, 0444); 99 + module_param(proc_details, int, 0644); 100 + 101 + #ifdef CONFIG_DRBD_FAULT_INJECTION 102 + int enable_faults; 103 + int fault_rate; 104 + static int fault_count; 105 + int fault_devs; 106 + /* bitmap of enabled faults */ 107 + module_param(enable_faults, int, 0664); 108 + /* fault rate % value - applies to all enabled faults */ 109 + module_param(fault_rate, int, 0664); 110 + /* count of faults inserted */ 111 + module_param(fault_count, int, 0664); 112 + /* bitmap of devices to insert faults on */ 113 + module_param(fault_devs, int, 0644); 114 + #endif 115 + 116 + /* module parameter, defined */ 117 + unsigned int minor_count = 32; 118 + int disable_sendpage; 119 + int allow_oos; 120 + unsigned int cn_idx = CN_IDX_DRBD; 121 + int proc_details; /* Detail level in proc drbd */ 122 + 123 + /* Module parameter for setting the user mode helper program 124 + * to run. Default is /sbin/drbdadm */ 125 + char usermode_helper[80] = "/sbin/drbdadm"; 126 + 127 + module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644); 128 + 129 + /* in 2.6.x, our device mapping and config info contains our virtual gendisks 130 + * as member "struct gendisk *vdisk;" 131 + */ 132 + struct drbd_conf **minor_table; 133 + 134 + struct kmem_cache *drbd_request_cache; 135 + struct kmem_cache *drbd_ee_cache; /* epoch entries */ 136 + struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ 137 + struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ 138 + mempool_t *drbd_request_mempool; 139 + mempool_t *drbd_ee_mempool; 140 + 141 + /* I do not use a standard mempool, because: 142 + 1) I want to hand out the pre-allocated objects first. 143 + 2) I want to be able to interrupt sleeping allocation with a signal. 144 + Note: This is a singly linked list, the next pointer is the private 145 + member of struct page. 146 + */ 147 + struct page *drbd_pp_pool; 148 + spinlock_t drbd_pp_lock; 149 + int drbd_pp_vacant; 150 + wait_queue_head_t drbd_pp_wait; 151 + 152 + DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); 153 + 154 + static struct block_device_operations drbd_ops = { 155 + .owner = THIS_MODULE, 156 + .open = drbd_open, 157 + .release = drbd_release, 158 + }; 159 + 160 + #define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) 161 + 162 + #ifdef __CHECKER__ 163 + /* When checking with sparse, and this is an inline function, sparse will 164 + give tons of false positives. When this is a real function, sparse works. 165 + */ 166 + int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) 167 + { 168 + int io_allowed; 169 + 170 + atomic_inc(&mdev->local_cnt); 171 + io_allowed = (mdev->state.disk >= mins); 172 + if (!io_allowed) { 173 + if (atomic_dec_and_test(&mdev->local_cnt)) 174 + wake_up(&mdev->misc_wait); 175 + } 176 + return io_allowed; 177 + } 178 + 179 + #endif 180 + 181 + /** 182 + * DOC: The transfer log 183 + * 184 + * The transfer log is a singly linked list of &struct drbd_tl_epoch objects. 185 + * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail 186 + * of the list. There is always at least one &struct drbd_tl_epoch object. 187 + * 188 + * Each &struct drbd_tl_epoch has a circular doubly linked list of requests 189 + * attached.
190 + */ 191 + static int tl_init(struct drbd_conf *mdev) 192 + { 193 + struct drbd_tl_epoch *b; 194 + 195 + /* during device minor initialization, we may well use GFP_KERNEL */ 196 + b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL); 197 + if (!b) 198 + return 0; 199 + INIT_LIST_HEAD(&b->requests); 200 + INIT_LIST_HEAD(&b->w.list); 201 + b->next = NULL; 202 + b->br_number = 4711; 203 + b->n_req = 0; 204 + b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ 205 + 206 + mdev->oldest_tle = b; 207 + mdev->newest_tle = b; 208 + INIT_LIST_HEAD(&mdev->out_of_sequence_requests); 209 + 210 + mdev->tl_hash = NULL; 211 + mdev->tl_hash_s = 0; 212 + 213 + return 1; 214 + } 215 + 216 + static void tl_cleanup(struct drbd_conf *mdev) 217 + { 218 + D_ASSERT(mdev->oldest_tle == mdev->newest_tle); 219 + D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); 220 + kfree(mdev->oldest_tle); 221 + mdev->oldest_tle = NULL; 222 + kfree(mdev->unused_spare_tle); 223 + mdev->unused_spare_tle = NULL; 224 + kfree(mdev->tl_hash); 225 + mdev->tl_hash = NULL; 226 + mdev->tl_hash_s = 0; 227 + } 228 + 229 + /** 230 + * _tl_add_barrier() - Adds a barrier to the transfer log 231 + * @mdev: DRBD device. 232 + * @new: Barrier to be added before the current head of the TL. 233 + * 234 + * The caller must hold the req_lock. 235 + */ 236 + void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) 237 + { 238 + struct drbd_tl_epoch *newest_before; 239 + 240 + INIT_LIST_HEAD(&new->requests); 241 + INIT_LIST_HEAD(&new->w.list); 242 + new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ 243 + new->next = NULL; 244 + new->n_req = 0; 245 + 246 + newest_before = mdev->newest_tle; 247 + /* never send a barrier number == 0, because that is special-cased 248 + * when using TCQ for our write ordering code */ 249 + new->br_number = (newest_before->br_number+1) ?: 1; 250 + if (mdev->newest_tle != new) { 251 + mdev->newest_tle->next = new; 252 + mdev->newest_tle = new; 253 + } 254 + } 255 + 256 + /** 257 + * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL 258 + * @mdev: DRBD device. 259 + * @barrier_nr: Expected identifier of the DRBD write barrier packet. 260 + * @set_size: Expected number of requests before that barrier. 261 + * 262 + * In case the passed barrier_nr or set_size does not match the oldest 263 + * &struct drbd_tl_epoch object, this function will cause a termination 264 + * of the connection. 265 + */ 266 + void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, 267 + unsigned int set_size) 268 + { 269 + struct drbd_tl_epoch *b, *nob; /* next old barrier */ 270 + struct list_head *le, *tle; 271 + struct drbd_request *r; 272 + 273 + spin_lock_irq(&mdev->req_lock); 274 + 275 + b = mdev->oldest_tle; 276 + 277 + /* first some paranoia code */ 278 + if (b == NULL) { 279 + dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", 280 + barrier_nr); 281 + goto bail; 282 + } 283 + if (b->br_number != barrier_nr) { 284 + dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", 285 + barrier_nr, b->br_number); 286 + goto bail; 287 + } 288 + if (b->n_req != set_size) { 289 + dev_err(DEV, "BAD!
BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", 290 + barrier_nr, set_size, b->n_req); 291 + goto bail; 292 + } 293 + 294 + /* Clean up list of requests processed during current epoch */ 295 + list_for_each_safe(le, tle, &b->requests) { 296 + r = list_entry(le, struct drbd_request, tl_requests); 297 + _req_mod(r, barrier_acked); 298 + } 299 + /* There could be requests on the list waiting for completion 300 + of the write to the local disk. To avoid corruption of 301 + slab's data structures we have to remove the list's head. 302 + 303 + Also there could have been a barrier ack out of sequence, overtaking 304 + the write acks - which would be a bug and would violate write ordering. 305 + To not deadlock in case we lose connection while such requests are 306 + still pending, we need some way to find them for the 307 + _req_mod(connection_lost_while_pending). 308 + 309 + These have been list_move'd to the out_of_sequence_requests list in 310 + _req_mod(, barrier_acked) above. 311 + */ 312 + list_del_init(&b->requests); 313 + 314 + nob = b->next; 315 + if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { 316 + _tl_add_barrier(mdev, b); 317 + if (nob) 318 + mdev->oldest_tle = nob; 319 + /* if nob == NULL b was the only barrier, and becomes the new 320 + barrier. Therefore mdev->oldest_tle points already to b */ 321 + } else { 322 + D_ASSERT(nob != NULL); 323 + mdev->oldest_tle = nob; 324 + kfree(b); 325 + } 326 + 327 + spin_unlock_irq(&mdev->req_lock); 328 + dec_ap_pending(mdev); 329 + 330 + return; 331 + 332 + bail: 333 + spin_unlock_irq(&mdev->req_lock); 334 + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); 335 + } 336 + 337 + 338 + /** 339 + * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL 340 + * @mdev: DRBD device. 341 + * 342 + * This is called after the connection to the peer was lost. The storage covered 343 + * by the requests on the transfer log gets marked as out of sync. Called from the 344 + * receiver thread and the worker thread. 345 + */ 346 + void tl_clear(struct drbd_conf *mdev) 347 + { 348 + struct drbd_tl_epoch *b, *tmp; 349 + struct list_head *le, *tle; 350 + struct drbd_request *r; 351 + int new_initial_bnr = net_random(); 352 + 353 + spin_lock_irq(&mdev->req_lock); 354 + 355 + b = mdev->oldest_tle; 356 + while (b) { 357 + list_for_each_safe(le, tle, &b->requests) { 358 + r = list_entry(le, struct drbd_request, tl_requests); 359 + /* It would be nice to complete outside of spinlock. 360 + * But this is easier for now. */ 361 + _req_mod(r, connection_lost_while_pending); 362 + } 363 + tmp = b->next; 364 + 365 + /* there could still be requests on that ring list, 366 + * in case local io is still pending */ 367 + list_del(&b->requests); 368 + 369 + /* dec_ap_pending corresponding to queue_barrier. 370 + * the newest barrier may not have been queued yet, 371 + * in which case w.cb is still NULL. */ 372 + if (b->w.cb != NULL) 373 + dec_ap_pending(mdev); 374 + 375 + if (b == mdev->newest_tle) { 376 + /* recycle, but reinit! */ 377 + D_ASSERT(tmp == NULL); 378 + INIT_LIST_HEAD(&b->requests); 379 + INIT_LIST_HEAD(&b->w.list); 380 + b->w.cb = NULL; 381 + b->br_number = new_initial_bnr; 382 + b->n_req = 0; 383 + 384 + mdev->oldest_tle = b; 385 + break; 386 + } 387 + kfree(b); 388 + b = tmp; 389 + } 390 + 391 + /* we expect this list to be empty. */ 392 + D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); 393 + 394 + /* but just in case, clean it up anyway!
*/ 395 + list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { 396 + r = list_entry(le, struct drbd_request, tl_requests); 397 + /* It would be nice to complete outside of spinlock. 398 + * But this is easier for now. */ 399 + _req_mod(r, connection_lost_while_pending); 400 + } 401 + 402 + /* ensure bit indicating barrier is required is clear */ 403 + clear_bit(CREATE_BARRIER, &mdev->flags); 404 + 405 + spin_unlock_irq(&mdev->req_lock); 406 + } 407 + 408 + /** 409 + * cl_wide_st_chg() - TRUE if the state change is a cluster wide one 410 + * @mdev: DRBD device. 411 + * @os: old (current) state. 412 + * @ns: new (wanted) state. 413 + */ 414 + static int cl_wide_st_chg(struct drbd_conf *mdev, 415 + union drbd_state os, union drbd_state ns) 416 + { 417 + return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && 418 + ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || 419 + (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || 420 + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || 421 + (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || 422 + (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || 423 + (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); 424 + } 425 + 426 + int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, 427 + union drbd_state mask, union drbd_state val) 428 + { 429 + unsigned long flags; 430 + union drbd_state os, ns; 431 + int rv; 432 + 433 + spin_lock_irqsave(&mdev->req_lock, flags); 434 + os = mdev->state; 435 + ns.i = (os.i & ~mask.i) | val.i; 436 + rv = _drbd_set_state(mdev, ns, f, NULL); 437 + ns = mdev->state; 438 + spin_unlock_irqrestore(&mdev->req_lock, flags); 439 + 440 + return rv; 441 + } 442 + 443 + /** 444 + * drbd_force_state() - Impose a change which happens outside our control on our state 445 + * @mdev: DRBD device. 446 + * @mask: mask of state bits to change. 447 + * @val: value of new state bits. 448 + */ 449 + void drbd_force_state(struct drbd_conf *mdev, 450 + union drbd_state mask, union drbd_state val) 451 + { 452 + drbd_change_state(mdev, CS_HARD, mask, val); 453 + } 454 + 455 + static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns); 456 + static int is_valid_state_transition(struct drbd_conf *, 457 + union drbd_state, union drbd_state); 458 + static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, 459 + union drbd_state ns, int *warn_sync_abort); 460 + int drbd_send_state_req(struct drbd_conf *, 461 + union drbd_state, union drbd_state); 462 + 463 + static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev, 464 + union drbd_state mask, union drbd_state val) 465 + { 466 + union drbd_state os, ns; 467 + unsigned long flags; 468 + int rv; 469 + 470 + if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) 471 + return SS_CW_SUCCESS; 472 + 473 + if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) 474 + return SS_CW_FAILED_BY_PEER; 475 + 476 + rv = 0; 477 + spin_lock_irqsave(&mdev->req_lock, flags); 478 + os = mdev->state; 479 + ns.i = (os.i & ~mask.i) | val.i; 480 + ns = sanitize_state(mdev, os, ns, NULL); 481 + 482 + if (!cl_wide_st_chg(mdev, os, ns)) 483 + rv = SS_CW_NO_NEED; 484 + if (!rv) { 485 + rv = is_valid_state(mdev, ns); 486 + if (rv == SS_SUCCESS) { 487 + rv = is_valid_state_transition(mdev, ns, os); 488 + if (rv == SS_SUCCESS) 489 + rv = 0; /* cont waiting, otherwise fail. 
*/ 490 + } 491 + } 492 + spin_unlock_irqrestore(&mdev->req_lock, flags); 493 + 494 + return rv; 495 + } 496 + 497 + /** 498 + * drbd_req_state() - Perform a possibly cluster-wide state change 499 + * @mdev: DRBD device. 500 + * @mask: mask of state bits to change. 501 + * @val: value of new state bits. 502 + * @f: flags 503 + * 504 + * Should not be called directly, use drbd_request_state() or 505 + * _drbd_request_state(). 506 + */ 507 + static int drbd_req_state(struct drbd_conf *mdev, 508 + union drbd_state mask, union drbd_state val, 509 + enum chg_state_flags f) 510 + { 511 + struct completion done; 512 + unsigned long flags; 513 + union drbd_state os, ns; 514 + int rv; 515 + 516 + init_completion(&done); 517 + 518 + if (f & CS_SERIALIZE) 519 + mutex_lock(&mdev->state_mutex); 520 + 521 + spin_lock_irqsave(&mdev->req_lock, flags); 522 + os = mdev->state; 523 + ns.i = (os.i & ~mask.i) | val.i; 524 + ns = sanitize_state(mdev, os, ns, NULL); 525 + 526 + if (cl_wide_st_chg(mdev, os, ns)) { 527 + rv = is_valid_state(mdev, ns); 528 + if (rv == SS_SUCCESS) 529 + rv = is_valid_state_transition(mdev, ns, os); 530 + spin_unlock_irqrestore(&mdev->req_lock, flags); 531 + 532 + if (rv < SS_SUCCESS) { 533 + if (f & CS_VERBOSE) 534 + print_st_err(mdev, os, ns, rv); 535 + goto abort; 536 + } 537 + 538 + drbd_state_lock(mdev); 539 + if (!drbd_send_state_req(mdev, mask, val)) { 540 + drbd_state_unlock(mdev); 541 + rv = SS_CW_FAILED_BY_PEER; 542 + if (f & CS_VERBOSE) 543 + print_st_err(mdev, os, ns, rv); 544 + goto abort; 545 + } 546 + 547 + wait_event(mdev->state_wait, 548 + (rv = _req_st_cond(mdev, mask, val))); 549 + 550 + if (rv < SS_SUCCESS) { 551 + drbd_state_unlock(mdev); 552 + if (f & CS_VERBOSE) 553 + print_st_err(mdev, os, ns, rv); 554 + goto abort; 555 + } 556 + spin_lock_irqsave(&mdev->req_lock, flags); 557 + os = mdev->state; 558 + ns.i = (os.i & ~mask.i) | val.i; 559 + rv = _drbd_set_state(mdev, ns, f, &done); 560 + drbd_state_unlock(mdev); 561 + } else { 562 + rv = _drbd_set_state(mdev, ns, f, &done); 563 + } 564 + 565 + spin_unlock_irqrestore(&mdev->req_lock, flags); 566 + 567 + if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { 568 + D_ASSERT(current != mdev->worker.task); 569 + wait_for_completion(&done); 570 + } 571 + 572 + abort: 573 + if (f & CS_SERIALIZE) 574 + mutex_unlock(&mdev->state_mutex); 575 + 576 + return rv; 577 + } 578 + 579 + /** 580 + * _drbd_request_state() - Request a state change (with flags) 581 + * @mdev: DRBD device. 582 + * @mask: mask of state bits to change. 583 + * @val: value of new state bits. 584 + * @f: flags 585 + * 586 + * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE 587 + * flag, or when logging of failed state change requests is not desired. 588 + */ 589 + int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, 590 + union drbd_state val, enum chg_state_flags f) 591 + { 592 + int rv; 593 + 594 + wait_event(mdev->state_wait, 595 + (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); 596 + 597 + return rv; 598 + } 599 + 600 + static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) 601 + { 602 + dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", 603 + name, 604 + drbd_conn_str(ns.conn), 605 + drbd_role_str(ns.role), 606 + drbd_role_str(ns.peer), 607 + drbd_disk_str(ns.disk), 608 + drbd_disk_str(ns.pdsk), 609 + ns.susp ? 's' : 'r', 610 + ns.aftr_isp ? 'a' : '-', 611 + ns.peer_isp ? 'p' : '-', 612 + ns.user_isp ?
'u' : '-' 613 + ); 614 + } 615 + 616 + void print_st_err(struct drbd_conf *mdev, 617 + union drbd_state os, union drbd_state ns, int err) 618 + { 619 + if (err == SS_IN_TRANSIENT_STATE) 620 + return; 621 + dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); 622 + print_st(mdev, " state", os); 623 + print_st(mdev, "wanted", ns); 624 + } 625 + 626 + 627 + #define drbd_peer_str drbd_role_str 628 + #define drbd_pdsk_str drbd_disk_str 629 + 630 + #define drbd_susp_str(A) ((A) ? "1" : "0") 631 + #define drbd_aftr_isp_str(A) ((A) ? "1" : "0") 632 + #define drbd_peer_isp_str(A) ((A) ? "1" : "0") 633 + #define drbd_user_isp_str(A) ((A) ? "1" : "0") 634 + 635 + #define PSC(A) \ 636 + ({ if (ns.A != os.A) { \ 637 + pbp += sprintf(pbp, #A "( %s -> %s ) ", \ 638 + drbd_##A##_str(os.A), \ 639 + drbd_##A##_str(ns.A)); \ 640 + } }) 641 + 642 + /** 643 + * is_valid_state() - Returns an SS_ error code if ns is not valid 644 + * @mdev: DRBD device. 645 + * @ns: State to consider. 646 + */ 647 + static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) 648 + { 649 + /* See drbd_state_sw_errors in drbd_strings.c */ 650 + 651 + enum drbd_fencing_p fp; 652 + int rv = SS_SUCCESS; 653 + 654 + fp = FP_DONT_CARE; 655 + if (get_ldev(mdev)) { 656 + fp = mdev->ldev->dc.fencing; 657 + put_ldev(mdev); 658 + } 659 + 660 + if (get_net_conf(mdev)) { 661 + if (!mdev->net_conf->two_primaries && 662 + ns.role == R_PRIMARY && ns.peer == R_PRIMARY) 663 + rv = SS_TWO_PRIMARIES; 664 + put_net_conf(mdev); 665 + } 666 + 667 + if (rv <= 0) 668 + /* already found a reason to abort */; 669 + else if (ns.role == R_SECONDARY && mdev->open_cnt) 670 + rv = SS_DEVICE_IN_USE; 671 + 672 + else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) 673 + rv = SS_NO_UP_TO_DATE_DISK; 674 + 675 + else if (fp >= FP_RESOURCE && 676 + ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) 677 + rv = SS_PRIMARY_NOP; 678 + 679 + else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) 680 + rv = SS_NO_UP_TO_DATE_DISK; 681 + 682 + else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) 683 + rv = SS_NO_LOCAL_DISK; 684 + 685 + else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) 686 + rv = SS_NO_REMOTE_DISK; 687 + 688 + else if ((ns.conn == C_CONNECTED || 689 + ns.conn == C_WF_BITMAP_S || 690 + ns.conn == C_SYNC_SOURCE || 691 + ns.conn == C_PAUSED_SYNC_S) && 692 + ns.disk == D_OUTDATED) 693 + rv = SS_CONNECTED_OUTDATES; 694 + 695 + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && 696 + (mdev->sync_conf.verify_alg[0] == 0)) 697 + rv = SS_NO_VERIFY_ALG; 698 + 699 + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && 700 + mdev->agreed_pro_version < 88) 701 + rv = SS_NOT_SUPPORTED; 702 + 703 + return rv; 704 + } 705 + 706 + /** 707 + * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible 708 + * @mdev: DRBD device. 709 + * @ns: new state. 710 + * @os: old state. 
711 + */ 712 + static int is_valid_state_transition(struct drbd_conf *mdev, 713 + union drbd_state ns, union drbd_state os) 714 + { 715 + int rv = SS_SUCCESS; 716 + 717 + if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && 718 + os.conn > C_CONNECTED) 719 + rv = SS_RESYNC_RUNNING; 720 + 721 + if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) 722 + rv = SS_ALREADY_STANDALONE; 723 + 724 + if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) 725 + rv = SS_IS_DISKLESS; 726 + 727 + if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) 728 + rv = SS_NO_NET_CONFIG; 729 + 730 + if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) 731 + rv = SS_LOWER_THAN_OUTDATED; 732 + 733 + if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) 734 + rv = SS_IN_TRANSIENT_STATE; 735 + 736 + if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) 737 + rv = SS_IN_TRANSIENT_STATE; 738 + 739 + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) 740 + rv = SS_NEED_CONNECTION; 741 + 742 + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && 743 + ns.conn != os.conn && os.conn > C_CONNECTED) 744 + rv = SS_RESYNC_RUNNING; 745 + 746 + if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && 747 + os.conn < C_CONNECTED) 748 + rv = SS_NEED_CONNECTION; 749 + 750 + return rv; 751 + } 752 + 753 + /** 754 + * sanitize_state() - Resolves implicitly necessary additional changes to a state transition 755 + * @mdev: DRBD device. 756 + * @os: old state. 757 + * @ns: new state. 758 + * @warn_sync_abort: 759 + * 760 + * When we lose connection, we have to set the state of the peer's disk (pdsk) 761 + * to D_UNKNOWN. This rule and many more along those lines are in this function. 762 + */ 763 + static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, 764 + union drbd_state ns, int *warn_sync_abort) 765 + { 766 + enum drbd_fencing_p fp; 767 + 768 + fp = FP_DONT_CARE; 769 + if (get_ldev(mdev)) { 770 + fp = mdev->ldev->dc.fencing; 771 + put_ldev(mdev); 772 + } 773 + 774 + /* Disallow Network errors to configure a device's network part */ 775 + if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && 776 + os.conn <= C_DISCONNECTING) 777 + ns.conn = os.conn; 778 + 779 + /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */ 780 + if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && 781 + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING) 782 + ns.conn = os.conn; 783 + 784 + /* After C_DISCONNECTING only C_STANDALONE may follow */ 785 + if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) 786 + ns.conn = os.conn; 787 + 788 + if (ns.conn < C_CONNECTED) { 789 + ns.peer_isp = 0; 790 + ns.peer = R_UNKNOWN; 791 + if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) 792 + ns.pdsk = D_UNKNOWN; 793 + } 794 + 795 + /* Clear the aftr_isp when becoming unconfigured */ 796 + if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) 797 + ns.aftr_isp = 0; 798 + 799 + if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS) 800 + ns.pdsk = D_UNKNOWN; 801 + 802 + /* Abort resync if a disk fails/detaches */ 803 + if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && 804 + (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { 805 + if (warn_sync_abort) 806 + *warn_sync_abort = 1; 807 + ns.conn = C_CONNECTED; 808 + } 809 + 810 + if (ns.conn >= C_CONNECTED && 811 + ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) || 812 + (ns.disk == D_NEGOTIATING && ns.conn ==
C_WF_BITMAP_T))) { 813 + switch (ns.conn) { 814 + case C_WF_BITMAP_T: 815 + case C_PAUSED_SYNC_T: 816 + ns.disk = D_OUTDATED; 817 + break; 818 + case C_CONNECTED: 819 + case C_WF_BITMAP_S: 820 + case C_SYNC_SOURCE: 821 + case C_PAUSED_SYNC_S: 822 + ns.disk = D_UP_TO_DATE; 823 + break; 824 + case C_SYNC_TARGET: 825 + ns.disk = D_INCONSISTENT; 826 + dev_warn(DEV, "Implicitly set disk state Inconsistent!\n"); 827 + break; 828 + } 829 + if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE) 830 + dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n"); 831 + } 832 + 833 + if (ns.conn >= C_CONNECTED && 834 + (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) { 835 + switch (ns.conn) { 836 + case C_CONNECTED: 837 + case C_WF_BITMAP_T: 838 + case C_PAUSED_SYNC_T: 839 + case C_SYNC_TARGET: 840 + ns.pdsk = D_UP_TO_DATE; 841 + break; 842 + case C_WF_BITMAP_S: 843 + case C_PAUSED_SYNC_S: 844 + ns.pdsk = D_OUTDATED; 845 + break; 846 + case C_SYNC_SOURCE: 847 + ns.pdsk = D_INCONSISTENT; 848 + dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n"); 849 + break; 850 + } 851 + if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE) 852 + dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n"); 853 + } 854 + 855 + /* Connection breaks down before we finished "Negotiating" */ 856 + if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && 857 + get_ldev_if_state(mdev, D_NEGOTIATING)) { 858 + if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { 859 + ns.disk = mdev->new_state_tmp.disk; 860 + ns.pdsk = mdev->new_state_tmp.pdsk; 861 + } else { 862 + dev_alert(DEV, "Connection lost while negotiating, no data!\n"); 863 + ns.disk = D_DISKLESS; 864 + ns.pdsk = D_UNKNOWN; 865 + } 866 + put_ldev(mdev); 867 + } 868 + 869 + if (fp == FP_STONITH && 870 + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && 871 + !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) 872 + ns.susp = 1; 873 + 874 + if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { 875 + if (ns.conn == C_SYNC_SOURCE) 876 + ns.conn = C_PAUSED_SYNC_S; 877 + if (ns.conn == C_SYNC_TARGET) 878 + ns.conn = C_PAUSED_SYNC_T; 879 + } else { 880 + if (ns.conn == C_PAUSED_SYNC_S) 881 + ns.conn = C_SYNC_SOURCE; 882 + if (ns.conn == C_PAUSED_SYNC_T) 883 + ns.conn = C_SYNC_TARGET; 884 + } 885 + 886 + return ns; 887 + } 888 + 889 + /* helper for __drbd_set_state */ 890 + static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) 891 + { 892 + if (cs == C_VERIFY_T) { 893 + /* starting online verify from an arbitrary position 894 + * does not fit well into the existing protocol. 895 + * on C_VERIFY_T, we initialize ov_left and friends 896 + * implicitly in receive_DataRequest once the 897 + * first P_OV_REQUEST is received */ 898 + mdev->ov_start_sector = ~(sector_t)0; 899 + } else { 900 + unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); 901 + if (bit >= mdev->rs_total) 902 + mdev->ov_start_sector = 903 + BM_BIT_TO_SECT(mdev->rs_total - 1); 904 + mdev->ov_position = mdev->ov_start_sector; 905 + } 906 + } 907 + 908 + /** 909 + * __drbd_set_state() - Set a new DRBD state 910 + * @mdev: DRBD device. 911 + * @ns: new state. 912 + * @flags: Flags 913 + * @done: Optional completion that will get completed after after_state_ch() has finished 914 + * 915 + * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
916 + */ 917 + int __drbd_set_state(struct drbd_conf *mdev, 918 + union drbd_state ns, enum chg_state_flags flags, 919 + struct completion *done) 920 + { 921 + union drbd_state os; 922 + int rv = SS_SUCCESS; 923 + int warn_sync_abort = 0; 924 + struct after_state_chg_work *ascw; 925 + 926 + os = mdev->state; 927 + 928 + ns = sanitize_state(mdev, os, ns, &warn_sync_abort); 929 + 930 + if (ns.i == os.i) 931 + return SS_NOTHING_TO_DO; 932 + 933 + if (!(flags & CS_HARD)) { 934 + /* pre-state-change checks; only look at ns */ 935 + /* See drbd_state_sw_errors in drbd_strings.c */ 936 + 937 + rv = is_valid_state(mdev, ns); 938 + if (rv < SS_SUCCESS) { 939 + /* If the old state was illegal as well, then let 940 + this happen...*/ 941 + 942 + if (is_valid_state(mdev, os) == rv) { 943 + dev_err(DEV, "Considering state change from bad state. " 944 + "Error would be: '%s'\n", 945 + drbd_set_st_err_str(rv)); 946 + print_st(mdev, "old", os); 947 + print_st(mdev, "new", ns); 948 + rv = is_valid_state_transition(mdev, ns, os); 949 + } 950 + } else 951 + rv = is_valid_state_transition(mdev, ns, os); 952 + } 953 + 954 + if (rv < SS_SUCCESS) { 955 + if (flags & CS_VERBOSE) 956 + print_st_err(mdev, os, ns, rv); 957 + return rv; 958 + } 959 + 960 + if (warn_sync_abort) 961 + dev_warn(DEV, "Resync aborted.\n"); 962 + 963 + { 964 + char *pbp, pb[300]; 965 + pbp = pb; 966 + *pbp = 0; 967 + PSC(role); 968 + PSC(peer); 969 + PSC(conn); 970 + PSC(disk); 971 + PSC(pdsk); 972 + PSC(susp); 973 + PSC(aftr_isp); 974 + PSC(peer_isp); 975 + PSC(user_isp); 976 + dev_info(DEV, "%s\n", pb); 977 + } 978 + 979 + /* solve the race between becoming unconfigured, 980 + * worker doing the cleanup, and 981 + * admin reconfiguring us: 982 + * on (re)configure, first set CONFIG_PENDING, 983 + * then wait for a potentially exiting worker, 984 + * start the worker, and schedule one no_op. 985 + * then proceed with configuration. 986 + */ 987 + if (ns.disk == D_DISKLESS && 988 + ns.conn == C_STANDALONE && 989 + ns.role == R_SECONDARY && 990 + !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) 991 + set_bit(DEVICE_DYING, &mdev->flags); 992 + 993 + mdev->state.i = ns.i; 994 + wake_up(&mdev->misc_wait); 995 + wake_up(&mdev->state_wait); 996 + 997 + /* post-state-change actions */ 998 + if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) { 999 + set_bit(STOP_SYNC_TIMER, &mdev->flags); 1000 + mod_timer(&mdev->resync_timer, jiffies); 1001 + } 1002 + 1003 + /* aborted verify run. log the last position */ 1004 + if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && 1005 + ns.conn < C_CONNECTED) { 1006 + mdev->ov_start_sector = 1007 + BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left); 1008 + dev_info(DEV, "Online Verify reached sector %llu\n", 1009 + (unsigned long long)mdev->ov_start_sector); 1010 + } 1011 + 1012 + if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && 1013 + (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { 1014 + dev_info(DEV, "Syncer continues.\n"); 1015 + mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; 1016 + if (ns.conn == C_SYNC_TARGET) { 1017 + if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) 1018 + mod_timer(&mdev->resync_timer, jiffies); 1019 + /* This if (!test_bit) is only needed for the case 1020 + that a device that has ceased to use its timer, 1021 + i.e. is already in drbd_resync_finished(), gets 1022 + paused and resumed.
*/ 1023 + } 1024 + } 1025 + 1026 + if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && 1027 + (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { 1028 + dev_info(DEV, "Resync suspended\n"); 1029 + mdev->rs_mark_time = jiffies; 1030 + if (ns.conn == C_PAUSED_SYNC_T) 1031 + set_bit(STOP_SYNC_TIMER, &mdev->flags); 1032 + } 1033 + 1034 + if (os.conn == C_CONNECTED && 1035 + (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { 1036 + mdev->ov_position = 0; 1037 + mdev->rs_total = 1038 + mdev->rs_mark_left = drbd_bm_bits(mdev); 1039 + if (mdev->agreed_pro_version >= 90) 1040 + set_ov_position(mdev, ns.conn); 1041 + else 1042 + mdev->ov_start_sector = 0; 1043 + mdev->ov_left = mdev->rs_total 1044 + - BM_SECT_TO_BIT(mdev->ov_position); 1045 + mdev->rs_start = 1046 + mdev->rs_mark_time = jiffies; 1047 + mdev->ov_last_oos_size = 0; 1048 + mdev->ov_last_oos_start = 0; 1049 + 1050 + if (ns.conn == C_VERIFY_S) { 1051 + dev_info(DEV, "Starting Online Verify from sector %llu\n", 1052 + (unsigned long long)mdev->ov_position); 1053 + mod_timer(&mdev->resync_timer, jiffies); 1054 + } 1055 + } 1056 + 1057 + if (get_ldev(mdev)) { 1058 + u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| 1059 + MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| 1060 + MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); 1061 + 1062 + if (test_bit(CRASHED_PRIMARY, &mdev->flags)) 1063 + mdf |= MDF_CRASHED_PRIMARY; 1064 + if (mdev->state.role == R_PRIMARY || 1065 + (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) 1066 + mdf |= MDF_PRIMARY_IND; 1067 + if (mdev->state.conn > C_WF_REPORT_PARAMS) 1068 + mdf |= MDF_CONNECTED_IND; 1069 + if (mdev->state.disk > D_INCONSISTENT) 1070 + mdf |= MDF_CONSISTENT; 1071 + if (mdev->state.disk > D_OUTDATED) 1072 + mdf |= MDF_WAS_UP_TO_DATE; 1073 + if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) 1074 + mdf |= MDF_PEER_OUT_DATED; 1075 + if (mdf != mdev->ldev->md.flags) { 1076 + mdev->ldev->md.flags = mdf; 1077 + drbd_md_mark_dirty(mdev); 1078 + } 1079 + if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) 1080 + drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); 1081 + put_ldev(mdev); 1082 + } 1083 + 1084 + /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resync */ 1085 + if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && 1086 + os.peer == R_SECONDARY && ns.peer == R_PRIMARY) 1087 + set_bit(CONSIDER_RESYNC, &mdev->flags); 1088 + 1089 + /* Receiver should clean up itself */ 1090 + if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) 1091 + drbd_thread_stop_nowait(&mdev->receiver); 1092 + 1093 + /* Now that the receiver has finished cleaning up, it should die */ 1094 + if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) 1095 + drbd_thread_stop_nowait(&mdev->receiver); 1096 + 1097 + /* Upon network failure, we need to restart the receiver.
*/ 1098 + if (os.conn > C_TEAR_DOWN && 1099 + ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) 1100 + drbd_thread_restart_nowait(&mdev->receiver); 1101 + 1102 + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); 1103 + if (ascw) { 1104 + ascw->os = os; 1105 + ascw->ns = ns; 1106 + ascw->flags = flags; 1107 + ascw->w.cb = w_after_state_ch; 1108 + ascw->done = done; 1109 + drbd_queue_work(&mdev->data.work, &ascw->w); 1110 + } else { 1111 + dev_warn(DEV, "Could not kmalloc an ascw\n"); 1112 + } 1113 + 1114 + return rv; 1115 + } 1116 + 1117 + static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) 1118 + { 1119 + struct after_state_chg_work *ascw = 1120 + container_of(w, struct after_state_chg_work, w); 1121 + after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); 1122 + if (ascw->flags & CS_WAIT_COMPLETE) { 1123 + D_ASSERT(ascw->done != NULL); 1124 + complete(ascw->done); 1125 + } 1126 + kfree(ascw); 1127 + 1128 + return 1; 1129 + } 1130 + 1131 + static void abw_start_sync(struct drbd_conf *mdev, int rv) 1132 + { 1133 + if (rv) { 1134 + dev_err(DEV, "Writing the bitmap failed, not starting resync.\n"); 1135 + _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); 1136 + return; 1137 + } 1138 + 1139 + switch (mdev->state.conn) { 1140 + case C_STARTING_SYNC_T: 1141 + _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); 1142 + break; 1143 + case C_STARTING_SYNC_S: 1144 + drbd_start_resync(mdev, C_SYNC_SOURCE); 1145 + break; 1146 + } 1147 + } 1148 + 1149 + /** 1150 + * after_state_ch() - Perform after-state-change actions that may sleep 1151 + * @mdev: DRBD device. 1152 + * @os: old state. 1153 + * @ns: new state. 1154 + * @flags: Flags 1155 + */ 1156 + static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, 1157 + union drbd_state ns, enum chg_state_flags flags) 1158 + { 1159 + enum drbd_fencing_p fp; 1160 + 1161 + if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { 1162 + clear_bit(CRASHED_PRIMARY, &mdev->flags); 1163 + if (mdev->p_uuid) 1164 + mdev->p_uuid[UI_FLAGS] &= ~((u64)2); 1165 + } 1166 + 1167 + fp = FP_DONT_CARE; 1168 + if (get_ldev(mdev)) { 1169 + fp = mdev->ldev->dc.fencing; 1170 + put_ldev(mdev); 1171 + } 1172 + 1173 + /* Inform userspace about the change... */ 1174 + drbd_bcast_state(mdev, ns); 1175 + 1176 + if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && 1177 + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) 1178 + drbd_khelper(mdev, "pri-on-incon-degr"); 1179 + 1180 + /* Here we have the actions that are performed after a 1181 + state change. This function might sleep */ 1182 + 1183 + if (fp == FP_STONITH && ns.susp) { 1184 + /* case1: The outdate peer handler is successful: 1185 + * case2: The connection was established again: */ 1186 + if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) || 1187 + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) { 1188 + tl_clear(mdev); 1189 + spin_lock_irq(&mdev->req_lock); 1190 + _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); 1191 + spin_unlock_irq(&mdev->req_lock); 1192 + } 1193 + } 1194 + /* Do not change the order of the if above and the two below...
*/ 1195 + if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ 1196 + drbd_send_uuids(mdev); 1197 + drbd_send_state(mdev); 1198 + } 1199 + if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S) 1200 + drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)"); 1201 + 1202 + /* Lost contact to peer's copy of the data */ 1203 + if ((os.pdsk >= D_INCONSISTENT && 1204 + os.pdsk != D_UNKNOWN && 1205 + os.pdsk != D_OUTDATED) 1206 + && (ns.pdsk < D_INCONSISTENT || 1207 + ns.pdsk == D_UNKNOWN || 1208 + ns.pdsk == D_OUTDATED)) { 1209 + kfree(mdev->p_uuid); 1210 + mdev->p_uuid = NULL; 1211 + if (get_ldev(mdev)) { 1212 + if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && 1213 + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1214 + drbd_uuid_new_current(mdev); 1215 + drbd_send_uuids(mdev); 1216 + } 1217 + put_ldev(mdev); 1218 + } 1219 + } 1220 + 1221 + if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { 1222 + if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) 1223 + drbd_uuid_new_current(mdev); 1224 + 1225 + /* D_DISKLESS Peer becomes secondary */ 1226 + if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) 1227 + drbd_al_to_on_disk_bm(mdev); 1228 + put_ldev(mdev); 1229 + } 1230 + 1231 + /* Last part of the attaching process ... */ 1232 + if (ns.conn >= C_CONNECTED && 1233 + os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { 1234 + kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ 1235 + mdev->p_uuid = NULL; /* ...to not use the old ones in the meantime */ 1236 + drbd_send_sizes(mdev, 0); /* to start sync... */ 1237 + drbd_send_uuids(mdev); 1238 + drbd_send_state(mdev); 1239 + } 1240 + 1241 + /* We want to pause/continue resync, tell peer. */ 1242 + if (ns.conn >= C_CONNECTED && 1243 + ((os.aftr_isp != ns.aftr_isp) || 1244 + (os.user_isp != ns.user_isp))) 1245 + drbd_send_state(mdev); 1246 + 1247 + /* In case one of the isp bits got set, suspend other devices. */ 1248 + if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && 1249 + (ns.aftr_isp || ns.peer_isp || ns.user_isp)) 1250 + suspend_other_sg(mdev); 1251 + 1252 + /* Make sure the peer gets informed about any state 1253 + changes (ISP bits) that happened while we were in WFReportParams. */ 1254 + if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) 1255 + drbd_send_state(mdev); 1256 + 1257 + /* We are in the process of starting a full sync... */ 1258 + if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || 1259 + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) 1260 + drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync"); 1261 + 1262 + /* We are invalidating ourselves... */ 1263 + if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && 1264 + os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) 1265 + drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); 1266 + 1267 + if (os.disk > D_FAILED && ns.disk == D_FAILED) { 1268 + enum drbd_io_error_p eh; 1269 + 1270 + eh = EP_PASS_ON; 1271 + if (get_ldev_if_state(mdev, D_FAILED)) { 1272 + eh = mdev->ldev->dc.on_io_error; 1273 + put_ldev(mdev); 1274 + } 1275 + 1276 + drbd_rs_cancel_all(mdev); 1277 + /* since get_ldev() only works as long as disk>=D_INCONSISTENT, 1278 + and it is D_DISKLESS here, local_cnt can only go down, it can 1279 + not increase...
It will reach zero */ 1280 + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); 1281 + mdev->rs_total = 0; 1282 + mdev->rs_failed = 0; 1283 + atomic_set(&mdev->rs_pending_cnt, 0); 1284 + 1285 + spin_lock_irq(&mdev->req_lock); 1286 + _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL); 1287 + spin_unlock_irq(&mdev->req_lock); 1288 + 1289 + if (eh == EP_CALL_HELPER) 1290 + drbd_khelper(mdev, "local-io-error"); 1291 + } 1292 + 1293 + if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { 1294 + 1295 + if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ { 1296 + if (drbd_send_state(mdev)) 1297 + dev_warn(DEV, "Notified peer that my disk is broken.\n"); 1298 + else 1299 + dev_err(DEV, "Sending state in drbd_io_error() failed\n"); 1300 + } 1301 + 1302 + lc_destroy(mdev->resync); 1303 + mdev->resync = NULL; 1304 + lc_destroy(mdev->act_log); 1305 + mdev->act_log = NULL; 1306 + __no_warn(local, 1307 + drbd_free_bc(mdev->ldev); 1308 + mdev->ldev = NULL;); 1309 + 1310 + if (mdev->md_io_tmpp) 1311 + __free_page(mdev->md_io_tmpp); 1312 + } 1313 + 1314 + /* Disks got bigger while they were detached */ 1315 + if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && 1316 + test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { 1317 + if (ns.conn == C_CONNECTED) 1318 + resync_after_online_grow(mdev); 1319 + } 1320 + 1321 + /* A resync finished or aborted, wake paused devices... */ 1322 + if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || 1323 + (os.peer_isp && !ns.peer_isp) || 1324 + (os.user_isp && !ns.user_isp)) 1325 + resume_next_sg(mdev); 1326 + 1327 + /* Upon network connection, we need to start the receiver */ 1328 + if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) 1329 + drbd_thread_start(&mdev->receiver); 1330 + 1331 + /* Terminate worker thread if we are unconfigured - it will be 1332 + restarted as needed... */ 1333 + if (ns.disk == D_DISKLESS && 1334 + ns.conn == C_STANDALONE && 1335 + ns.role == R_SECONDARY) { 1336 + if (os.aftr_isp != ns.aftr_isp) 1337 + resume_next_sg(mdev); 1338 + /* set in __drbd_set_state, unless CONFIG_PENDING was set */ 1339 + if (test_bit(DEVICE_DYING, &mdev->flags)) 1340 + drbd_thread_stop_nowait(&mdev->worker); 1341 + } 1342 + 1343 + drbd_md_sync(mdev); 1344 + } 1345 + 1346 + 1347 + static int drbd_thread_setup(void *arg) 1348 + { 1349 + struct drbd_thread *thi = (struct drbd_thread *) arg; 1350 + struct drbd_conf *mdev = thi->mdev; 1351 + unsigned long flags; 1352 + int retval; 1353 + 1354 + restart: 1355 + retval = thi->function(thi); 1356 + 1357 + spin_lock_irqsave(&thi->t_lock, flags); 1358 + 1359 + /* if the receiver has been "Exiting", the last thing it did 1360 + * was set the conn state to "StandAlone", 1361 + * if now a re-connect request comes in, conn state goes C_UNCONNECTED, 1362 + * and the receiver thread will be "started". 1363 + * drbd_thread_start needs to set "Restarting" in that case. 1364 + * t_state check and assignment need to be within the same spinlock, 1365 + * so either thread_start sees Exiting, and can remap to Restarting, 1366 + * or thread_start sees None, and can proceed as normal. 1367 + */
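The comment above describes the core of the thread lifecycle protocol: the exiting thread and drbd_thread_start() race, and because the t_state check and assignment happen under one spinlock, a start request that arrives during exit remaps Exiting to Restarting instead of spawning a second thread. A compact userspace model of just this hand-off, with a pthread mutex standing in for the spinlock (names are hypothetical, this is an illustration, not the kernel code):

#include <pthread.h>
#include <stdio.h>

enum thread_state { NONE, RUNNING, EXITING, RESTARTING };

struct thr {
	pthread_mutex_t t_lock;
	enum thread_state t_state;
};

/* the exit path of the thread function: decide under the lock whether to loop */
static int should_restart(struct thr *t)
{
	int restart;

	pthread_mutex_lock(&t->t_lock);
	restart = (t->t_state == RESTARTING);
	t->t_state = restart ? RUNNING : NONE;
	pthread_mutex_unlock(&t->t_lock);
	return restart;
}

/* the starter's view: remap Exiting to Restarting under the same lock,
 * so the exiting thread re-runs instead of a second one being spawned */
static void request_start(struct thr *t)
{
	pthread_mutex_lock(&t->t_lock);
	if (t->t_state == EXITING)
		t->t_state = RESTARTING;
	else if (t->t_state == NONE)
		t->t_state = RUNNING;   /* a fresh thread would be created here */
	pthread_mutex_unlock(&t->t_lock);
}

int main(void)
{
	struct thr t = { PTHREAD_MUTEX_INITIALIZER, EXITING };

	request_start(&t);          /* a re-connect request races with the exit */
	printf("thread %s\n", should_restart(&t) ? "restarts" : "terminates");
	return 0;
}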
1368 + 1369 + if (thi->t_state == Restarting) { 1370 + dev_info(DEV, "Restarting %s\n", current->comm); 1371 + thi->t_state = Running; 1372 + spin_unlock_irqrestore(&thi->t_lock, flags); 1373 + goto restart; 1374 + } 1375 + 1376 + thi->task = NULL; 1377 + thi->t_state = None; 1378 + smp_mb(); 1379 + complete(&thi->stop); 1380 + spin_unlock_irqrestore(&thi->t_lock, flags); 1381 + 1382 + dev_info(DEV, "Terminating %s\n", current->comm); 1383 + 1384 + /* Release mod reference taken when thread was started */ 1385 + module_put(THIS_MODULE); 1386 + return retval; 1387 + } 1388 + 1389 + static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi, 1390 + int (*func) (struct drbd_thread *)) 1391 + { 1392 + spin_lock_init(&thi->t_lock); 1393 + thi->task = NULL; 1394 + thi->t_state = None; 1395 + thi->function = func; 1396 + thi->mdev = mdev; 1397 + } 1398 + 1399 + int drbd_thread_start(struct drbd_thread *thi) 1400 + { 1401 + struct drbd_conf *mdev = thi->mdev; 1402 + struct task_struct *nt; 1403 + unsigned long flags; 1404 + 1405 + const char *me = 1406 + thi == &mdev->receiver ? "receiver" : 1407 + thi == &mdev->asender ? "asender" : 1408 + thi == &mdev->worker ? "worker" : "NONSENSE"; 1409 + 1410 + /* is used from state engine doing drbd_thread_stop_nowait, 1411 + * while holding the req lock irqsave */ 1412 + spin_lock_irqsave(&thi->t_lock, flags); 1413 + 1414 + switch (thi->t_state) { 1415 + case None: 1416 + dev_info(DEV, "Starting %s thread (from %s [%d])\n", 1417 + me, current->comm, current->pid); 1418 + 1419 + /* Get ref on module for thread - this is released when thread exits */ 1420 + if (!try_module_get(THIS_MODULE)) { 1421 + dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); 1422 + spin_unlock_irqrestore(&thi->t_lock, flags); 1423 + return FALSE; 1424 + } 1425 + 1426 + init_completion(&thi->stop); 1427 + D_ASSERT(thi->task == NULL); 1428 + thi->reset_cpu_mask = 1; 1429 + thi->t_state = Running; 1430 + spin_unlock_irqrestore(&thi->t_lock, flags); 1431 + flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ 1432 + 1433 + nt = kthread_create(drbd_thread_setup, (void *) thi, 1434 + "drbd%d_%s", mdev_to_minor(mdev), me); 1435 + 1436 + if (IS_ERR(nt)) { 1437 + dev_err(DEV, "Couldn't start thread\n"); 1438 + 1439 + module_put(THIS_MODULE); 1440 + return FALSE; 1441 + } 1442 + spin_lock_irqsave(&thi->t_lock, flags); 1443 + thi->task = nt; 1444 + thi->t_state = Running; 1445 + spin_unlock_irqrestore(&thi->t_lock, flags); 1446 + wake_up_process(nt); 1447 + break; 1448 + case Exiting: 1449 + thi->t_state = Restarting; 1450 + dev_info(DEV, "Restarting %s thread (from %s [%d])\n", 1451 + me, current->comm, current->pid); 1452 + /* fall through */ 1453 + case Running: 1454 + case Restarting: 1455 + default: 1456 + spin_unlock_irqrestore(&thi->t_lock, flags); 1457 + break; 1458 + } 1459 + 1460 + return TRUE; 1461 + } 1462 + 1463 + 1464 + void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) 1465 + { 1466 + unsigned long flags; 1467 + 1468 + enum drbd_thread_state ns = restart ? 
Restarting : Exiting; 1469 + 1470 + /* may be called from state engine, holding the req lock irqsave */ 1471 + spin_lock_irqsave(&thi->t_lock, flags); 1472 + 1473 + if (thi->t_state == None) { 1474 + spin_unlock_irqrestore(&thi->t_lock, flags); 1475 + if (restart) 1476 + drbd_thread_start(thi); 1477 + return; 1478 + } 1479 + 1480 + if (thi->t_state != ns) { 1481 + if (thi->task == NULL) { 1482 + spin_unlock_irqrestore(&thi->t_lock, flags); 1483 + return; 1484 + } 1485 + 1486 + thi->t_state = ns; 1487 + smp_mb(); 1488 + init_completion(&thi->stop); 1489 + if (thi->task != current) 1490 + force_sig(DRBD_SIGKILL, thi->task); 1491 + 1492 + } 1493 + 1494 + spin_unlock_irqrestore(&thi->t_lock, flags); 1495 + 1496 + if (wait) 1497 + wait_for_completion(&thi->stop); 1498 + } 1499 + 1500 + #ifdef CONFIG_SMP 1501 + /** 1502 + * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs 1503 + * @mdev: DRBD device. 1504 + * 1505 + * Forces all threads of a device onto the same CPU. This is beneficial for 1506 + * DRBD's performance. May be overwritten by user's configuration. 1507 + */ 1508 + void drbd_calc_cpu_mask(struct drbd_conf *mdev) 1509 + { 1510 + int ord, cpu; 1511 + 1512 + /* user override. */ 1513 + if (cpumask_weight(mdev->cpu_mask)) 1514 + return; 1515 + 1516 + ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask); 1517 + for_each_online_cpu(cpu) { 1518 + if (ord-- == 0) { 1519 + cpumask_set_cpu(cpu, mdev->cpu_mask); 1520 + return; 1521 + } 1522 + } 1523 + /* should not be reached */ 1524 + cpumask_setall(mdev->cpu_mask); 1525 + } 1526 + 1527 + /** 1528 + * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread 1529 + * @mdev: DRBD device. 1530 + * 1531 + * call in the "main loop" of _all_ threads, no need for any mutex, current won't die 1532 + * prematurely. 1533 + */ 1534 + void drbd_thread_current_set_cpu(struct drbd_conf *mdev) 1535 + { 1536 + struct task_struct *p = current; 1537 + struct drbd_thread *thi = 1538 + p == mdev->asender.task ? &mdev->asender : 1539 + p == mdev->receiver.task ? &mdev->receiver : 1540 + p == mdev->worker.task ? &mdev->worker : 1541 + NULL; 1542 + ERR_IF(thi == NULL) 1543 + return; 1544 + if (!thi->reset_cpu_mask) 1545 + return; 1546 + thi->reset_cpu_mask = 0; 1547 + set_cpus_allowed_ptr(p, mdev->cpu_mask); 1548 + } 1549 + #endif 1550 + 1551 + /* the appropriate socket mutex must be held already */ 1552 + int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, 1553 + enum drbd_packets cmd, struct p_header *h, 1554 + size_t size, unsigned msg_flags) 1555 + { 1556 + int sent, ok; 1557 + 1558 + ERR_IF(!h) return FALSE; 1559 + ERR_IF(!size) return FALSE; 1560 + 1561 + h->magic = BE_DRBD_MAGIC; 1562 + h->command = cpu_to_be16(cmd); 1563 + h->length = cpu_to_be16(size-sizeof(struct p_header)); 1564 + 1565 + sent = drbd_send(mdev, sock, h, size, msg_flags); 1566 + 1567 + ok = (sent == size); 1568 + if (!ok) 1569 + dev_err(DEV, "short sent %s size=%d sent=%d\n", 1570 + cmdname(cmd), (int)size, sent); 1571 + return ok; 1572 + } 1573 + 1574 + /* don't pass the socket. we may only look at it 1575 + * when we hold the appropriate socket mutex. 
1576 + */ 1577 + int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, 1578 + enum drbd_packets cmd, struct p_header *h, size_t size) 1579 + { 1580 + int ok = 0; 1581 + struct socket *sock; 1582 + 1583 + if (use_data_socket) { 1584 + mutex_lock(&mdev->data.mutex); 1585 + sock = mdev->data.socket; 1586 + } else { 1587 + mutex_lock(&mdev->meta.mutex); 1588 + sock = mdev->meta.socket; 1589 + } 1590 + 1591 + /* drbd_disconnect() could have called drbd_free_sock() 1592 + * while we were waiting in down()... */ 1593 + if (likely(sock != NULL)) 1594 + ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); 1595 + 1596 + if (use_data_socket) 1597 + mutex_unlock(&mdev->data.mutex); 1598 + else 1599 + mutex_unlock(&mdev->meta.mutex); 1600 + return ok; 1601 + } 1602 + 1603 + int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, 1604 + size_t size) 1605 + { 1606 + struct p_header h; 1607 + int ok; 1608 + 1609 + h.magic = BE_DRBD_MAGIC; 1610 + h.command = cpu_to_be16(cmd); 1611 + h.length = cpu_to_be16(size); 1612 + 1613 + if (!drbd_get_data_sock(mdev)) 1614 + return 0; 1615 + 1616 + ok = (sizeof(h) == 1617 + drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); 1618 + ok = ok && (size == 1619 + drbd_send(mdev, mdev->data.socket, data, size, 0)); 1620 + 1621 + drbd_put_data_sock(mdev); 1622 + 1623 + return ok; 1624 + } 1625 + 1626 + int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) 1627 + { 1628 + struct p_rs_param_89 *p; 1629 + struct socket *sock; 1630 + int size, rv; 1631 + const int apv = mdev->agreed_pro_version; 1632 + 1633 + size = apv <= 87 ? sizeof(struct p_rs_param) 1634 + : apv == 88 ? sizeof(struct p_rs_param) 1635 + + strlen(mdev->sync_conf.verify_alg) + 1 1636 + : /* 89 */ sizeof(struct p_rs_param_89); 1637 + 1638 + /* used from admin command context and receiver/worker context. 1639 + * to avoid kmalloc, grab the socket right here, 1640 + * then use the pre-allocated sbuf there */ 1641 + mutex_lock(&mdev->data.mutex); 1642 + sock = mdev->data.socket; 1643 + 1644 + if (likely(sock != NULL)) { 1645 + enum drbd_packets cmd = apv >= 89 ? 
P_SYNC_PARAM89 : P_SYNC_PARAM; 1646 + 1647 + p = &mdev->data.sbuf.rs_param_89; 1648 + 1649 + /* initialize verify_alg and csums_alg */ 1650 + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); 1651 + 1652 + p->rate = cpu_to_be32(sc->rate); 1653 + 1654 + if (apv >= 88) 1655 + strcpy(p->verify_alg, mdev->sync_conf.verify_alg); 1656 + if (apv >= 89) 1657 + strcpy(p->csums_alg, mdev->sync_conf.csums_alg); 1658 + 1659 + rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0); 1660 + } else 1661 + rv = 0; /* not ok */ 1662 + 1663 + mutex_unlock(&mdev->data.mutex); 1664 + 1665 + return rv; 1666 + } 1667 + 1668 + int drbd_send_protocol(struct drbd_conf *mdev) 1669 + { 1670 + struct p_protocol *p; 1671 + int size, rv; 1672 + 1673 + size = sizeof(struct p_protocol); 1674 + 1675 + if (mdev->agreed_pro_version >= 87) 1676 + size += strlen(mdev->net_conf->integrity_alg) + 1; 1677 + 1678 + /* we must not recurse into our own queue, 1679 + * as that is blocked during handshake */ 1680 + p = kmalloc(size, GFP_NOIO); 1681 + if (p == NULL) 1682 + return 0; 1683 + 1684 + p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol); 1685 + p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); 1686 + p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); 1687 + p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); 1688 + p->want_lose = cpu_to_be32(mdev->net_conf->want_lose); 1689 + p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); 1690 + 1691 + if (mdev->agreed_pro_version >= 87) 1692 + strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); 1693 + 1694 + rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, 1695 + (struct p_header *)p, size); 1696 + kfree(p); 1697 + return rv; 1698 + } 1699 + 1700 + int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) 1701 + { 1702 + struct p_uuids p; 1703 + int i; 1704 + 1705 + if (!get_ldev_if_state(mdev, D_NEGOTIATING)) 1706 + return 1; 1707 + 1708 + for (i = UI_CURRENT; i < UI_SIZE; i++) 1709 + p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; 1710 + 1711 + mdev->comm_bm_set = drbd_bm_total_weight(mdev); 1712 + p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); 1713 + uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; 1714 + uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; 1715 + uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 
4 : 0; 1716 + p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); 1717 + 1718 + put_ldev(mdev); 1719 + 1720 + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, 1721 + (struct p_header *)&p, sizeof(p)); 1722 + } 1723 + 1724 + int drbd_send_uuids(struct drbd_conf *mdev) 1725 + { 1726 + return _drbd_send_uuids(mdev, 0); 1727 + } 1728 + 1729 + int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev) 1730 + { 1731 + return _drbd_send_uuids(mdev, 8); 1732 + } 1733 + 1734 + 1735 + int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) 1736 + { 1737 + struct p_rs_uuid p; 1738 + 1739 + p.uuid = cpu_to_be64(val); 1740 + 1741 + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, 1742 + (struct p_header *)&p, sizeof(p)); 1743 + } 1744 + 1745 + int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) 1746 + { 1747 + struct p_sizes p; 1748 + sector_t d_size, u_size; 1749 + int q_order_type; 1750 + int ok; 1751 + 1752 + if (get_ldev_if_state(mdev, D_NEGOTIATING)) { 1753 + D_ASSERT(mdev->ldev->backing_bdev); 1754 + d_size = drbd_get_max_capacity(mdev->ldev); 1755 + u_size = mdev->ldev->dc.disk_size; 1756 + q_order_type = drbd_queue_order_type(mdev); 1757 + p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); 1758 + put_ldev(mdev); 1759 + } else { 1760 + d_size = 0; 1761 + u_size = 0; 1762 + q_order_type = QUEUE_ORDERED_NONE; 1763 + } 1764 + 1765 + p.d_size = cpu_to_be64(d_size); 1766 + p.u_size = cpu_to_be64(u_size); 1767 + p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); 1768 + p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); 1769 + p.queue_order_type = cpu_to_be32(q_order_type); 1770 + 1771 + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, 1772 + (struct p_header *)&p, sizeof(p)); 1773 + return ok; 1774 + } 1775 + 1776 + /** 1777 + * drbd_send_state() - Sends the drbd state to the peer 1778 + * @mdev: DRBD device. 
1779 + */ 1780 + int drbd_send_state(struct drbd_conf *mdev) 1781 + { 1782 + struct socket *sock; 1783 + struct p_state p; 1784 + int ok = 0; 1785 + 1786 + /* Grab state lock so we won't send state if we're in the middle 1787 + * of a cluster-wide state change on another thread */ 1788 + drbd_state_lock(mdev); 1789 + 1790 + mutex_lock(&mdev->data.mutex); 1791 + 1792 + p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ 1793 + sock = mdev->data.socket; 1794 + 1795 + if (likely(sock != NULL)) { 1796 + ok = _drbd_send_cmd(mdev, sock, P_STATE, 1797 + (struct p_header *)&p, sizeof(p), 0); 1798 + } 1799 + 1800 + mutex_unlock(&mdev->data.mutex); 1801 + 1802 + drbd_state_unlock(mdev); 1803 + return ok; 1804 + } 1805 + 1806 + int drbd_send_state_req(struct drbd_conf *mdev, 1807 + union drbd_state mask, union drbd_state val) 1808 + { 1809 + struct p_req_state p; 1810 + 1811 + p.mask = cpu_to_be32(mask.i); 1812 + p.val = cpu_to_be32(val.i); 1813 + 1814 + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, 1815 + (struct p_header *)&p, sizeof(p)); 1816 + } 1817 + 1818 + int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) 1819 + { 1820 + struct p_req_state_reply p; 1821 + 1822 + p.retcode = cpu_to_be32(retcode); 1823 + 1824 + return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, 1825 + (struct p_header *)&p, sizeof(p)); 1826 + } 1827 + 1828 + int fill_bitmap_rle_bits(struct drbd_conf *mdev, 1829 + struct p_compressed_bm *p, 1830 + struct bm_xfer_ctx *c) 1831 + { 1832 + struct bitstream bs; 1833 + unsigned long plain_bits; 1834 + unsigned long tmp; 1835 + unsigned long rl; 1836 + unsigned len; 1837 + unsigned toggle; 1838 + int bits; 1839 + 1840 + /* may we use this feature? */ 1841 + if ((mdev->sync_conf.use_rle == 0) || 1842 + (mdev->agreed_pro_version < 90)) 1843 + return 0; 1844 + 1845 + if (c->bit_offset >= c->bm_bits) 1846 + return 0; /* nothing to do. */ 1847 + 1848 + /* use at most this many bytes */ 1849 + bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0); 1850 + memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX); 1851 + /* plain bits covered in this code string */ 1852 + plain_bits = 0; 1853 + 1854 + /* p->encoding & 0x80 stores whether the first run length is set. 1855 + * bit offset is implicit. 1856 + * start with toggle == 2 to be able to tell the first iteration */ 1857 + toggle = 2; 1858 + 1859 + /* see how many plain bits we can stuff into one packet 1860 + * using RLE and VLI. */ 1861 + do { 1862 + tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset) 1863 + : _drbd_bm_find_next(mdev, c->bit_offset); 1864 + if (tmp == -1UL) 1865 + tmp = c->bm_bits; 1866 + rl = tmp - c->bit_offset; 1867 + 1868 + if (toggle == 2) { /* first iteration */ 1869 + if (rl == 0) { 1870 + /* the first checked bit was set, 1871 + * store start value, */ 1872 + DCBP_set_start(p, 1); 1873 + /* but skip encoding of zero run length */ 1874 + toggle = !toggle; 1875 + continue; 1876 + } 1877 + DCBP_set_start(p, 0); 1878 + } 1879 + 1880 + /* paranoia: catch zero runlength. 1881 + * can only happen if bitmap is modified while we scan it. */
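The do-loop above walks the bitmap as alternating runs of set and cleared bits: DCBP_set_start() records the polarity of the first run, and from then on only run lengths are emitted, VLI-packed into the packet by vli_encode_bits(). A stripped-down, runnable model of that run-length walk, without the VLI bit-packing or the zero-length-first-run special case (standalone illustration, not the kernel API):

#include <stdio.h>

int main(void)
{
	const unsigned char bits[] = { 0, 0, 0, 1, 1, 0, 0, 0, 0, 1 };
	const unsigned n = sizeof(bits);
	unsigned off = 0;

	printf("start flag: %u\n", bits[0]);       /* like DCBP_set_start() */
	while (off < n) {
		unsigned char cur = bits[off];
		unsigned long rl = 0;

		while (off < n && bits[off] == cur) {  /* like _drbd_bm_find_next*() */
			rl++;
			off++;
		}
		printf("run of %lu\n", rl);            /* like vli_encode_bits(&bs, rl) */
	}
	return 0;
}

For the sample bitmap this prints runs of 3, 2, 4 and 1; only these lengths plus the one-bit start flag go on the wire, which is why long resync bitmaps with few dirty regions compress so well.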
1882 + if (rl == 0) { 1883 + dev_err(DEV, "unexpected zero runlength while encoding bitmap " 1884 + "t:%u bo:%lu\n", toggle, c->bit_offset); 1885 + return -1; 1886 + } 1887 + 1888 + bits = vli_encode_bits(&bs, rl); 1889 + if (bits == -ENOBUFS) /* buffer full */ 1890 + break; 1891 + if (bits <= 0) { 1892 + dev_err(DEV, "error while encoding bitmap: %d\n", bits); 1893 + return 0; 1894 + } 1895 + 1896 + toggle = !toggle; 1897 + plain_bits += rl; 1898 + c->bit_offset = tmp; 1899 + } while (c->bit_offset < c->bm_bits); 1900 + 1901 + len = bs.cur.b - p->code + !!bs.cur.bit; 1902 + 1903 + if (plain_bits < (len << 3)) { 1904 + /* incompressible with this method. 1905 + * we need to rewind both word and bit position. */ 1906 + c->bit_offset -= plain_bits; 1907 + bm_xfer_ctx_bit_to_word_offset(c); 1908 + c->bit_offset = c->word_offset * BITS_PER_LONG; 1909 + return 0; 1910 + } 1911 + 1912 + /* RLE + VLI was able to compress it just fine. 1913 + * update c->word_offset. */ 1914 + bm_xfer_ctx_bit_to_word_offset(c); 1915 + 1916 + /* store pad_bits */ 1917 + DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); 1918 + 1919 + return len; 1920 + } 1921 + 1922 + enum { OK, FAILED, DONE } 1923 + send_bitmap_rle_or_plain(struct drbd_conf *mdev, 1924 + struct p_header *h, struct bm_xfer_ctx *c) 1925 + { 1926 + struct p_compressed_bm *p = (void*)h; 1927 + unsigned long num_words; 1928 + int len; 1929 + int ok; 1930 + 1931 + len = fill_bitmap_rle_bits(mdev, p, c); 1932 + 1933 + if (len < 0) 1934 + return FAILED; 1935 + 1936 + if (len) { 1937 + DCBP_set_code(p, RLE_VLI_Bits); 1938 + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h, 1939 + sizeof(*p) + len, 0); 1940 + 1941 + c->packets[0]++; 1942 + c->bytes[0] += sizeof(*p) + len; 1943 + 1944 + if (c->bit_offset >= c->bm_bits) 1945 + len = 0; /* DONE */ 1946 + } else { 1947 + /* was not compressible. 1948 + * send a buffer full of plain text bits instead. */ 1949 + num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); 1950 + len = num_words * sizeof(long); 1951 + if (len) 1952 + drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); 1953 + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, 1954 + h, sizeof(struct p_header) + len, 0); 1955 + c->word_offset += num_words; 1956 + c->bit_offset = c->word_offset * BITS_PER_LONG; 1957 + 1958 + c->packets[1]++; 1959 + c->bytes[1] += sizeof(struct p_header) + len; 1960 + 1961 + if (c->bit_offset > c->bm_bits) 1962 + c->bit_offset = c->bm_bits; 1963 + } 1964 + ok = ok ? ((len == 0) ? DONE : OK) : FAILED; 1965 + 1966 + if (ok == DONE) 1967 + INFO_bm_xfer_stats(mdev, "send", c); 1968 + return ok; 1969 + } 1970 + 1971 + /* See the comment at receive_bitmap() */ 1972 + int _drbd_send_bitmap(struct drbd_conf *mdev) 1973 + { 1974 + struct bm_xfer_ctx c; 1975 + struct p_header *p; 1976 + int ret; 1977 + 1978 + ERR_IF(!mdev->bitmap) return FALSE; 1979 + 1980 + /* maybe we should use some per thread scratch page, 1981 + * and allocate that during initial device creation? */ 1982 + p = (struct p_header *) __get_free_page(GFP_NOIO); 1983 + if (!p) { 1984 + dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); 1985 + return FALSE; 1986 + } 1987 + 1988 + if (get_ldev(mdev)) { 1989 + if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { 1990 + dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n"); 1991 + drbd_bm_set_all(mdev); 1992 + if (drbd_bm_write(mdev)) { 1993 + /* write_bm did fail! 
Leave full sync flag set in Meta P_DATA 1994 + * but otherwise process as per normal - need to tell other 1995 + * side that a full resync is required! */ 1996 + dev_err(DEV, "Failed to write bitmap to disk!\n"); 1997 + } else { 1998 + drbd_md_clear_flag(mdev, MDF_FULL_SYNC); 1999 + drbd_md_sync(mdev); 2000 + } 2001 + } 2002 + put_ldev(mdev); 2003 + } 2004 + 2005 + c = (struct bm_xfer_ctx) { 2006 + .bm_bits = drbd_bm_bits(mdev), 2007 + .bm_words = drbd_bm_words(mdev), 2008 + }; 2009 + 2010 + do { 2011 + ret = send_bitmap_rle_or_plain(mdev, p, &c); 2012 + } while (ret == OK); 2013 + 2014 + free_page((unsigned long) p); 2015 + return (ret == DONE); 2016 + } 2017 + 2018 + int drbd_send_bitmap(struct drbd_conf *mdev) 2019 + { 2020 + int err; 2021 + 2022 + if (!drbd_get_data_sock(mdev)) 2023 + return -1; 2024 + err = !_drbd_send_bitmap(mdev); 2025 + drbd_put_data_sock(mdev); 2026 + return err; 2027 + } 2028 + 2029 + int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) 2030 + { 2031 + int ok; 2032 + struct p_barrier_ack p; 2033 + 2034 + p.barrier = barrier_nr; 2035 + p.set_size = cpu_to_be32(set_size); 2036 + 2037 + if (mdev->state.conn < C_CONNECTED) 2038 + return FALSE; 2039 + ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, 2040 + (struct p_header *)&p, sizeof(p)); 2041 + return ok; 2042 + } 2043 + 2044 + /** 2045 + * _drbd_send_ack() - Sends an ack packet 2046 + * @mdev: DRBD device. 2047 + * @cmd: Packet command code. 2048 + * @sector: sector, needs to be in big endian byte order 2049 + * @blksize: size in bytes, needs to be in big endian byte order 2050 + * @block_id: Id, big endian byte order 2051 + */ 2052 + static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, 2053 + u64 sector, 2054 + u32 blksize, 2055 + u64 block_id) 2056 + { 2057 + int ok; 2058 + struct p_block_ack p; 2059 + 2060 + p.sector = sector; 2061 + p.block_id = block_id; 2062 + p.blksize = blksize; 2063 + p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); 2064 + 2065 + if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) 2066 + return FALSE; 2067 + ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, 2068 + (struct p_header *)&p, sizeof(p)); 2069 + return ok; 2070 + } 2071 + 2072 + int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, 2073 + struct p_data *dp) 2074 + { 2075 + const int header_size = sizeof(struct p_data) 2076 + - sizeof(struct p_header); 2077 + int data_size = ((struct p_header *)dp)->length - header_size; 2078 + 2079 + return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), 2080 + dp->block_id); 2081 + } 2082 + 2083 + int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, 2084 + struct p_block_req *rp) 2085 + { 2086 + return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); 2087 + } 2088 + 2089 + /** 2090 + * drbd_send_ack() - Sends an ack packet 2091 + * @mdev: DRBD device. 2092 + * @cmd: Packet command code. 2093 + * @e: Epoch entry. 2094 + */ 2095 + int drbd_send_ack(struct drbd_conf *mdev, 2096 + enum drbd_packets cmd, struct drbd_epoch_entry *e) 2097 + { 2098 + return _drbd_send_ack(mdev, cmd, 2099 + cpu_to_be64(e->sector), 2100 + cpu_to_be32(e->size), 2101 + e->block_id); 2102 + } 2103 + 2104 + /* This function misuses the block_id field to signal if the blocks 2105 + * are in sync or not. */
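These ack helpers keep a strict endianness discipline: wire headers are fixed big-endian, and every integer field is converted exactly once. _drbd_send_ack() therefore documents that it expects already-converted values, drbd_send_ack() and the drbd_send_ack_ex() variant below convert at the call site, and drbd_send_ack_rp() forwards fields that already arrived in big-endian form. A small userspace sketch of that convert-once rule, with htonl()/htons() standing in for cpu_to_be32()/cpu_to_be16(); the struct layout and magic value here are illustrative only, not DRBD's actual wire format:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct wire_hdr {
	uint32_t magic;      /* stored big-endian on the wire */
	uint16_t command;
	uint16_t length;
};

int main(void)
{
	struct wire_hdr h;

	h.magic   = htonl(0xDEADBEEF);   /* placeholder magic, converted once */
	h.command = htons(27);           /* like cpu_to_be16(cmd) */
	h.length  = htons(8);            /* payload size after the header */

	/* 0xde comes out first regardless of host endianness */
	printf("first byte on wire: 0x%02x\n", ((unsigned char *)&h)[0]);
	return 0;
}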
2106 + int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, 2107 + sector_t sector, int blksize, u64 block_id) 2108 + { 2109 + return _drbd_send_ack(mdev, cmd, 2110 + cpu_to_be64(sector), 2111 + cpu_to_be32(blksize), 2112 + cpu_to_be64(block_id)); 2113 + } 2114 + 2115 + int drbd_send_drequest(struct drbd_conf *mdev, int cmd, 2116 + sector_t sector, int size, u64 block_id) 2117 + { 2118 + int ok; 2119 + struct p_block_req p; 2120 + 2121 + p.sector = cpu_to_be64(sector); 2122 + p.block_id = block_id; 2123 + p.blksize = cpu_to_be32(size); 2124 + 2125 + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, 2126 + (struct p_header *)&p, sizeof(p)); 2127 + return ok; 2128 + } 2129 + 2130 + int drbd_send_drequest_csum(struct drbd_conf *mdev, 2131 + sector_t sector, int size, 2132 + void *digest, int digest_size, 2133 + enum drbd_packets cmd) 2134 + { 2135 + int ok; 2136 + struct p_block_req p; 2137 + 2138 + p.sector = cpu_to_be64(sector); 2139 + p.block_id = BE_DRBD_MAGIC + 0xbeef; 2140 + p.blksize = cpu_to_be32(size); 2141 + 2142 + p.head.magic = BE_DRBD_MAGIC; 2143 + p.head.command = cpu_to_be16(cmd); 2144 + p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size); 2145 + 2146 + mutex_lock(&mdev->data.mutex); 2147 + 2148 + ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0)); 2149 + ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0)); 2150 + 2151 + mutex_unlock(&mdev->data.mutex); 2152 + 2153 + return ok; 2154 + } 2155 + 2156 + int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) 2157 + { 2158 + int ok; 2159 + struct p_block_req p; 2160 + 2161 + p.sector = cpu_to_be64(sector); 2162 + p.block_id = BE_DRBD_MAGIC + 0xbabe; 2163 + p.blksize = cpu_to_be32(size); 2164 + 2165 + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, 2166 + (struct p_header *)&p, sizeof(p)); 2167 + return ok; 2168 + } 2169 + 2170 + /* called on sndtimeo 2171 + * returns FALSE if we should retry, 2172 + * TRUE if we think connection is dead 2173 + */ 2174 + static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) 2175 + { 2176 + int drop_it; 2177 + /* long elapsed = (long)(jiffies - mdev->last_received); */ 2178 + 2179 + drop_it = mdev->meta.socket == sock 2180 + || !mdev->asender.task 2181 + || get_t_state(&mdev->asender) != Running 2182 + || mdev->state.conn < C_CONNECTED; 2183 + 2184 + if (drop_it) 2185 + return TRUE; 2186 + 2187 + drop_it = !--mdev->ko_count; 2188 + if (!drop_it) { 2189 + dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n", 2190 + current->comm, current->pid, mdev->ko_count); 2191 + request_ping(mdev); 2192 + } 2193 + 2194 + return drop_it; /* && (mdev->state == R_PRIMARY) */; 2195 + } 2196 + 2197 + /* The idea of sendpage seems to be to put some kind of reference 2198 + * to the page into the skb, and to hand it over to the NIC. In 2199 + * this process get_page() gets called. 2200 + * 2201 + * As soon as the page was really sent over the network put_page() 2202 + * gets called by some part of the network layer. [ NIC driver? ] 2203 + * 2204 + * [ get_page() / put_page() increment/decrement the count. If count 2205 + * reaches 0 the page will be freed. ] 2206 + * 2207 + * This works nicely with pages from FSs. 2208 + * But this means that in protocol A we might signal IO completion too early! 
2209 + * 2210 + * In order not to corrupt data during a resync we must make sure 2211 + * that we do not reuse our own buffer pages (EEs) too early, therefore 2212 + * we have the net_ee list. 2213 + * 2214 + * XFS seems to have problems, still, it submits pages with page_count == 0! 2215 + * As a workaround, we disable sendpage on pages 2216 + * with page_count == 0 or PageSlab. 2217 + */ 2218 + static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, 2219 + int offset, size_t size) 2220 + { 2221 + int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); 2222 + kunmap(page); 2223 + if (sent == size) 2224 + mdev->send_cnt += size>>9; 2225 + return sent == size; 2226 + } 2227 + 2228 + static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, 2229 + int offset, size_t size) 2230 + { 2231 + mm_segment_t oldfs = get_fs(); 2232 + int sent, ok; 2233 + int len = size; 2234 + 2235 + /* e.g. XFS meta- & log-data is in slab pages, which have a 2236 + * page_count of 0 and/or have PageSlab() set. 2237 + * we cannot use send_page for those, as that does get_page(); 2238 + * put_page(); and would cause either a VM_BUG directly, or 2239 + * __page_cache_release a page that would actually still be referenced 2240 + * by someone, leading to some obscure delayed Oops somewhere else. */ 2241 + if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) 2242 + return _drbd_no_send_page(mdev, page, offset, size); 2243 + 2244 + drbd_update_congested(mdev); 2245 + set_fs(KERNEL_DS); 2246 + do { 2247 + sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, 2248 + offset, len, 2249 + MSG_NOSIGNAL); 2250 + if (sent == -EAGAIN) { 2251 + if (we_should_drop_the_connection(mdev, 2252 + mdev->data.socket)) 2253 + break; 2254 + else 2255 + continue; 2256 + } 2257 + if (sent <= 0) { 2258 + dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", 2259 + __func__, (int)size, len, sent); 2260 + break; 2261 + } 2262 + len -= sent; 2263 + offset += sent; 2264 + } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); 2265 + set_fs(oldfs); 2266 + clear_bit(NET_CONGESTED, &mdev->flags); 2267 + 2268 + ok = (len == 0); 2269 + if (likely(ok)) 2270 + mdev->send_cnt += size>>9; 2271 + return ok; 2272 + } 2273 + 2274 + static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) 2275 + { 2276 + struct bio_vec *bvec; 2277 + int i; 2278 + __bio_for_each_segment(bvec, bio, i, 0) { 2279 + if (!_drbd_no_send_page(mdev, bvec->bv_page, 2280 + bvec->bv_offset, bvec->bv_len)) 2281 + return 0; 2282 + } 2283 + return 1; 2284 + } 2285 + 2286 + static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) 2287 + { 2288 + struct bio_vec *bvec; 2289 + int i; 2290 + __bio_for_each_segment(bvec, bio, i, 0) { 2291 + if (!_drbd_send_page(mdev, bvec->bv_page, 2292 + bvec->bv_offset, bvec->bv_len)) 2293 + return 0; 2294 + } 2295 + 2296 + return 1; 2297 + } 2298 + 2299 + /* Used to send write requests 2300 + * R_PRIMARY -> Peer (P_DATA) 2301 + */ 2302 + int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) 2303 + { 2304 + int ok = 1; 2305 + struct p_data p; 2306 + unsigned int dp_flags = 0; 2307 + void *dgb; 2308 + int dgs; 2309 + 2310 + if (!drbd_get_data_sock(mdev)) 2311 + return 0; 2312 + 2313 + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? 
2314 + crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; 2315 + 2316 + p.head.magic = BE_DRBD_MAGIC; 2317 + p.head.command = cpu_to_be16(P_DATA); 2318 + p.head.length = 2319 + cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size); 2320 + 2321 + p.sector = cpu_to_be64(req->sector); 2322 + p.block_id = (unsigned long)req; 2323 + p.seq_num = cpu_to_be32(req->seq_num = 2324 + atomic_add_return(1, &mdev->packet_seq)); 2325 + dp_flags = 0; 2326 + 2327 + /* NOTE: no need to check if barriers supported here as we would 2328 + * not pass the test in make_request_common in that case 2329 + */ 2330 + if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) { 2331 + dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n"); 2332 + /* dp_flags |= DP_HARDBARRIER; */ 2333 + } 2334 + if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO)) 2335 + dp_flags |= DP_RW_SYNC; 2336 + /* for now handle SYNCIO and UNPLUG 2337 + * as if they still were one and the same flag */ 2338 + if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG)) 2339 + dp_flags |= DP_RW_SYNC; 2340 + if (mdev->state.conn >= C_SYNC_SOURCE && 2341 + mdev->state.conn <= C_PAUSED_SYNC_T) 2342 + dp_flags |= DP_MAY_SET_IN_SYNC; 2343 + 2344 + p.dp_flags = cpu_to_be32(dp_flags); 2345 + set_bit(UNPLUG_REMOTE, &mdev->flags); 2346 + ok = (sizeof(p) == 2347 + drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); 2348 + if (ok && dgs) { 2349 + dgb = mdev->int_dig_out; 2350 + drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); 2351 + ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); 2352 + } 2353 + if (ok) { 2354 + if (mdev->net_conf->wire_protocol == DRBD_PROT_A) 2355 + ok = _drbd_send_bio(mdev, req->master_bio); 2356 + else 2357 + ok = _drbd_send_zc_bio(mdev, req->master_bio); 2358 + } 2359 + 2360 + drbd_put_data_sock(mdev); 2361 + return ok; 2362 + } 2363 + 2364 + /* answer packet, used to send data back for read requests: 2365 + * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) 2366 + * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) 2367 + */ 2368 + int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, 2369 + struct drbd_epoch_entry *e) 2370 + { 2371 + int ok; 2372 + struct p_data p; 2373 + void *dgb; 2374 + int dgs; 2375 + 2376 + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? 2377 + crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; 2378 + 2379 + p.head.magic = BE_DRBD_MAGIC; 2380 + p.head.command = cpu_to_be16(cmd); 2381 + p.head.length = 2382 + cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size); 2383 + 2384 + p.sector = cpu_to_be64(e->sector); 2385 + p.block_id = e->block_id; 2386 + /* p.seq_num = 0; No sequence numbers here.. */ 2387 + 2388 + /* Only called by our kernel thread. 2389 + * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL 2390 + * in response to admin command or module unload. 
2391 + */ 2392 + if (!drbd_get_data_sock(mdev)) 2393 + return 0; 2394 + 2395 + ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, 2396 + sizeof(p), MSG_MORE); 2397 + if (ok && dgs) { 2398 + dgb = mdev->int_dig_out; 2399 + drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); 2400 + ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); 2401 + } 2402 + if (ok) 2403 + ok = _drbd_send_zc_bio(mdev, e->private_bio); 2404 + 2405 + drbd_put_data_sock(mdev); 2406 + return ok; 2407 + } 2408 + 2409 + /* 2410 + drbd_send distinguishes two cases: 2411 + 2412 + Packets sent via the data socket "sock" 2413 + and packets sent via the meta data socket "msock" 2414 + 2415 + sock msock 2416 + -----------------+-------------------------+------------------------------ 2417 + timeout conf.timeout / 2 conf.timeout / 2 2418 + timeout action send a ping via msock Abort communication 2419 + and close all sockets 2420 + */ 2421 + 2422 + /* 2423 + * you must have down()ed the appropriate [m]sock_mutex elsewhere! 2424 + */ 2425 + int drbd_send(struct drbd_conf *mdev, struct socket *sock, 2426 + void *buf, size_t size, unsigned msg_flags) 2427 + { 2428 + struct kvec iov; 2429 + struct msghdr msg; 2430 + int rv, sent = 0; 2431 + 2432 + if (!sock) 2433 + return -1000; 2434 + 2435 + /* THINK if (signal_pending) return ... ? */ 2436 + 2437 + iov.iov_base = buf; 2438 + iov.iov_len = size; 2439 + 2440 + msg.msg_name = NULL; 2441 + msg.msg_namelen = 0; 2442 + msg.msg_control = NULL; 2443 + msg.msg_controllen = 0; 2444 + msg.msg_flags = msg_flags | MSG_NOSIGNAL; 2445 + 2446 + if (sock == mdev->data.socket) { 2447 + mdev->ko_count = mdev->net_conf->ko_count; 2448 + drbd_update_congested(mdev); 2449 + } 2450 + do { 2451 + /* STRANGE 2452 + * tcp_sendmsg does _not_ use its size parameter at all ? 2453 + * 2454 + * -EAGAIN on timeout, -EINTR on signal. 2455 + */ 2456 + /* THINK 2457 + * do we need to block DRBD_SIG if sock == &meta.socket ?? 2458 + * otherwise wake_asender() might interrupt some send_*Ack ! 2459 + */ 2460 + rv = kernel_sendmsg(sock, &msg, &iov, 1, size); 2461 + if (rv == -EAGAIN) { 2462 + if (we_should_drop_the_connection(mdev, sock)) 2463 + break; 2464 + else 2465 + continue; 2466 + } 2467 + D_ASSERT(rv != 0); 2468 + if (rv == -EINTR) { 2469 + flush_signals(current); 2470 + rv = 0; 2471 + } 2472 + if (rv < 0) 2473 + break; 2474 + sent += rv; 2475 + iov.iov_base += rv; 2476 + iov.iov_len -= rv; 2477 + } while (sent < size); 2478 + 2479 + if (sock == mdev->data.socket) 2480 + clear_bit(NET_CONGESTED, &mdev->flags); 2481 + 2482 + if (rv <= 0) { 2483 + if (rv != -EAGAIN) { 2484 + dev_err(DEV, "%s_sendmsg returned %d\n", 2485 + sock == mdev->meta.socket ? 
"msock" : "sock", 2486 + rv); 2487 + drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); 2488 + } else 2489 + drbd_force_state(mdev, NS(conn, C_TIMEOUT)); 2490 + } 2491 + 2492 + return sent; 2493 + } 2494 + 2495 + static int drbd_open(struct block_device *bdev, fmode_t mode) 2496 + { 2497 + struct drbd_conf *mdev = bdev->bd_disk->private_data; 2498 + unsigned long flags; 2499 + int rv = 0; 2500 + 2501 + spin_lock_irqsave(&mdev->req_lock, flags); 2502 + /* to have a stable mdev->state.role 2503 + * and no race with updating open_cnt */ 2504 + 2505 + if (mdev->state.role != R_PRIMARY) { 2506 + if (mode & FMODE_WRITE) 2507 + rv = -EROFS; 2508 + else if (!allow_oos) 2509 + rv = -EMEDIUMTYPE; 2510 + } 2511 + 2512 + if (!rv) 2513 + mdev->open_cnt++; 2514 + spin_unlock_irqrestore(&mdev->req_lock, flags); 2515 + 2516 + return rv; 2517 + } 2518 + 2519 + static int drbd_release(struct gendisk *gd, fmode_t mode) 2520 + { 2521 + struct drbd_conf *mdev = gd->private_data; 2522 + mdev->open_cnt--; 2523 + return 0; 2524 + } 2525 + 2526 + static void drbd_unplug_fn(struct request_queue *q) 2527 + { 2528 + struct drbd_conf *mdev = q->queuedata; 2529 + 2530 + /* unplug FIRST */ 2531 + spin_lock_irq(q->queue_lock); 2532 + blk_remove_plug(q); 2533 + spin_unlock_irq(q->queue_lock); 2534 + 2535 + /* only if connected */ 2536 + spin_lock_irq(&mdev->req_lock); 2537 + if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) { 2538 + D_ASSERT(mdev->state.role == R_PRIMARY); 2539 + if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) { 2540 + /* add to the data.work queue, 2541 + * unless already queued. 2542 + * XXX this might be a good addition to drbd_queue_work 2543 + * anyways, to detect "double queuing" ... */ 2544 + if (list_empty(&mdev->unplug_work.list)) 2545 + drbd_queue_work(&mdev->data.work, 2546 + &mdev->unplug_work); 2547 + } 2548 + } 2549 + spin_unlock_irq(&mdev->req_lock); 2550 + 2551 + if (mdev->state.disk >= D_INCONSISTENT) 2552 + drbd_kick_lo(mdev); 2553 + } 2554 + 2555 + static void drbd_set_defaults(struct drbd_conf *mdev) 2556 + { 2557 + mdev->sync_conf.after = DRBD_AFTER_DEF; 2558 + mdev->sync_conf.rate = DRBD_RATE_DEF; 2559 + mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF; 2560 + mdev->state = (union drbd_state) { 2561 + { .role = R_SECONDARY, 2562 + .peer = R_UNKNOWN, 2563 + .conn = C_STANDALONE, 2564 + .disk = D_DISKLESS, 2565 + .pdsk = D_UNKNOWN, 2566 + .susp = 0 2567 + } }; 2568 + } 2569 + 2570 + void drbd_init_set_defaults(struct drbd_conf *mdev) 2571 + { 2572 + /* the memset(,0,) did most of this. 
2573 + * note: only assignments, no allocation in here */ 2574 + 2575 + drbd_set_defaults(mdev); 2576 + 2577 + /* for now, we do NOT yet support it, 2578 + * even though we start some framework 2579 + * to eventually support barriers */ 2580 + set_bit(NO_BARRIER_SUPP, &mdev->flags); 2581 + 2582 + atomic_set(&mdev->ap_bio_cnt, 0); 2583 + atomic_set(&mdev->ap_pending_cnt, 0); 2584 + atomic_set(&mdev->rs_pending_cnt, 0); 2585 + atomic_set(&mdev->unacked_cnt, 0); 2586 + atomic_set(&mdev->local_cnt, 0); 2587 + atomic_set(&mdev->net_cnt, 0); 2588 + atomic_set(&mdev->packet_seq, 0); 2589 + atomic_set(&mdev->pp_in_use, 0); 2590 + 2591 + mutex_init(&mdev->md_io_mutex); 2592 + mutex_init(&mdev->data.mutex); 2593 + mutex_init(&mdev->meta.mutex); 2594 + sema_init(&mdev->data.work.s, 0); 2595 + sema_init(&mdev->meta.work.s, 0); 2596 + mutex_init(&mdev->state_mutex); 2597 + 2598 + spin_lock_init(&mdev->data.work.q_lock); 2599 + spin_lock_init(&mdev->meta.work.q_lock); 2600 + 2601 + spin_lock_init(&mdev->al_lock); 2602 + spin_lock_init(&mdev->req_lock); 2603 + spin_lock_init(&mdev->peer_seq_lock); 2604 + spin_lock_init(&mdev->epoch_lock); 2605 + 2606 + INIT_LIST_HEAD(&mdev->active_ee); 2607 + INIT_LIST_HEAD(&mdev->sync_ee); 2608 + INIT_LIST_HEAD(&mdev->done_ee); 2609 + INIT_LIST_HEAD(&mdev->read_ee); 2610 + INIT_LIST_HEAD(&mdev->net_ee); 2611 + INIT_LIST_HEAD(&mdev->resync_reads); 2612 + INIT_LIST_HEAD(&mdev->data.work.q); 2613 + INIT_LIST_HEAD(&mdev->meta.work.q); 2614 + INIT_LIST_HEAD(&mdev->resync_work.list); 2615 + INIT_LIST_HEAD(&mdev->unplug_work.list); 2616 + INIT_LIST_HEAD(&mdev->md_sync_work.list); 2617 + INIT_LIST_HEAD(&mdev->bm_io_work.w.list); 2618 + mdev->resync_work.cb = w_resync_inactive; 2619 + mdev->unplug_work.cb = w_send_write_hint; 2620 + mdev->md_sync_work.cb = w_md_sync; 2621 + mdev->bm_io_work.w.cb = w_bitmap_io; 2622 + init_timer(&mdev->resync_timer); 2623 + init_timer(&mdev->md_sync_timer); 2624 + mdev->resync_timer.function = resync_timer_fn; 2625 + mdev->resync_timer.data = (unsigned long) mdev; 2626 + mdev->md_sync_timer.function = md_sync_timer_fn; 2627 + mdev->md_sync_timer.data = (unsigned long) mdev; 2628 + 2629 + init_waitqueue_head(&mdev->misc_wait); 2630 + init_waitqueue_head(&mdev->state_wait); 2631 + init_waitqueue_head(&mdev->ee_wait); 2632 + init_waitqueue_head(&mdev->al_wait); 2633 + init_waitqueue_head(&mdev->seq_wait); 2634 + 2635 + drbd_thread_init(mdev, &mdev->receiver, drbdd_init); 2636 + drbd_thread_init(mdev, &mdev->worker, drbd_worker); 2637 + drbd_thread_init(mdev, &mdev->asender, drbd_asender); 2638 + 2639 + mdev->agreed_pro_version = PRO_VERSION_MAX; 2640 + mdev->write_ordering = WO_bio_barrier; 2641 + mdev->resync_wenr = LC_FREE; 2642 + } 2643 + 2644 + void drbd_mdev_cleanup(struct drbd_conf *mdev) 2645 + { 2646 + if (mdev->receiver.t_state != None) 2647 + dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", 2648 + mdev->receiver.t_state); 2649 + 2650 + /* no need to lock it, I'm the only thread alive */ 2651 + if (atomic_read(&mdev->current_epoch->epoch_size) != 0) 2652 + dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size)); 2653 + mdev->al_writ_cnt = 2654 + mdev->bm_writ_cnt = 2655 + mdev->read_cnt = 2656 + mdev->recv_cnt = 2657 + mdev->send_cnt = 2658 + mdev->writ_cnt = 2659 + mdev->p_size = 2660 + mdev->rs_start = 2661 + mdev->rs_total = 2662 + mdev->rs_failed = 2663 + mdev->rs_mark_left = 2664 + mdev->rs_mark_time = 0; 2665 + D_ASSERT(mdev->net_conf == NULL); 2666 + 2667 + drbd_set_my_capacity(mdev, 0); 2668 
+ if (mdev->bitmap) { 2669 + /* maybe never allocated. */ 2670 + drbd_bm_resize(mdev, 0); 2671 + drbd_bm_cleanup(mdev); 2672 + } 2673 + 2674 + drbd_free_resources(mdev); 2675 + 2676 + /* 2677 + * currently we drbd_init_ee only on module load, so 2678 + * we may do drbd_release_ee only on module unload! 2679 + */ 2680 + D_ASSERT(list_empty(&mdev->active_ee)); 2681 + D_ASSERT(list_empty(&mdev->sync_ee)); 2682 + D_ASSERT(list_empty(&mdev->done_ee)); 2683 + D_ASSERT(list_empty(&mdev->read_ee)); 2684 + D_ASSERT(list_empty(&mdev->net_ee)); 2685 + D_ASSERT(list_empty(&mdev->resync_reads)); 2686 + D_ASSERT(list_empty(&mdev->data.work.q)); 2687 + D_ASSERT(list_empty(&mdev->meta.work.q)); 2688 + D_ASSERT(list_empty(&mdev->resync_work.list)); 2689 + D_ASSERT(list_empty(&mdev->unplug_work.list)); 2690 + 2691 + } 2692 + 2693 + 2694 + static void drbd_destroy_mempools(void) 2695 + { 2696 + struct page *page; 2697 + 2698 + while (drbd_pp_pool) { 2699 + page = drbd_pp_pool; 2700 + drbd_pp_pool = (struct page *)page_private(page); 2701 + __free_page(page); 2702 + drbd_pp_vacant--; 2703 + } 2704 + 2705 + /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ 2706 + 2707 + if (drbd_ee_mempool) 2708 + mempool_destroy(drbd_ee_mempool); 2709 + if (drbd_request_mempool) 2710 + mempool_destroy(drbd_request_mempool); 2711 + if (drbd_ee_cache) 2712 + kmem_cache_destroy(drbd_ee_cache); 2713 + if (drbd_request_cache) 2714 + kmem_cache_destroy(drbd_request_cache); 2715 + if (drbd_bm_ext_cache) 2716 + kmem_cache_destroy(drbd_bm_ext_cache); 2717 + if (drbd_al_ext_cache) 2718 + kmem_cache_destroy(drbd_al_ext_cache); 2719 + 2720 + drbd_ee_mempool = NULL; 2721 + drbd_request_mempool = NULL; 2722 + drbd_ee_cache = NULL; 2723 + drbd_request_cache = NULL; 2724 + drbd_bm_ext_cache = NULL; 2725 + drbd_al_ext_cache = NULL; 2726 + 2727 + return; 2728 + } 2729 + 2730 + static int drbd_create_mempools(void) 2731 + { 2732 + struct page *page; 2733 + const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count; 2734 + int i; 2735 + 2736 + /* prepare our caches and mempools */ 2737 + drbd_request_mempool = NULL; 2738 + drbd_ee_cache = NULL; 2739 + drbd_request_cache = NULL; 2740 + drbd_bm_ext_cache = NULL; 2741 + drbd_al_ext_cache = NULL; 2742 + drbd_pp_pool = NULL; 2743 + 2744 + /* caches */ 2745 + drbd_request_cache = kmem_cache_create( 2746 + "drbd_req", sizeof(struct drbd_request), 0, 0, NULL); 2747 + if (drbd_request_cache == NULL) 2748 + goto Enomem; 2749 + 2750 + drbd_ee_cache = kmem_cache_create( 2751 + "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL); 2752 + if (drbd_ee_cache == NULL) 2753 + goto Enomem; 2754 + 2755 + drbd_bm_ext_cache = kmem_cache_create( 2756 + "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL); 2757 + if (drbd_bm_ext_cache == NULL) 2758 + goto Enomem; 2759 + 2760 + drbd_al_ext_cache = kmem_cache_create( 2761 + "drbd_al", sizeof(struct lc_element), 0, 0, NULL); 2762 + if (drbd_al_ext_cache == NULL) 2763 + goto Enomem; 2764 + 2765 + /* mempools */ 2766 + drbd_request_mempool = mempool_create(number, 2767 + mempool_alloc_slab, mempool_free_slab, drbd_request_cache); 2768 + if (drbd_request_mempool == NULL) 2769 + goto Enomem; 2770 + 2771 + drbd_ee_mempool = mempool_create(number, 2772 + mempool_alloc_slab, mempool_free_slab, drbd_ee_cache); 2773 + if (drbd_ee_mempool == NULL) 2774 + goto Enomem; 2775 + 2776 + /* drbd's page pool */ 2777 + spin_lock_init(&drbd_pp_lock); 2778 + 2779 + for (i = 0; i < number; i++) { 2780 + page = alloc_page(GFP_HIGHUSER); 2781 + if (!page) 2782 + goto Enomem; 2783 + 
set_page_private(page, (unsigned long)drbd_pp_pool); 2784 + drbd_pp_pool = page; 2785 + } 2786 + drbd_pp_vacant = number; 2787 + 2788 + return 0; 2789 + 2790 + Enomem: 2791 + drbd_destroy_mempools(); /* in case we allocated some */ 2792 + return -ENOMEM; 2793 + } 2794 + 2795 + static int drbd_notify_sys(struct notifier_block *this, unsigned long code, 2796 + void *unused) 2797 + { 2798 + /* just so we have it. you never know what interesting things we 2799 + * might want to do here some day... 2800 + */ 2801 + 2802 + return NOTIFY_DONE; 2803 + } 2804 + 2805 + static struct notifier_block drbd_notifier = { 2806 + .notifier_call = drbd_notify_sys, 2807 + }; 2808 + 2809 + static void drbd_release_ee_lists(struct drbd_conf *mdev) 2810 + { 2811 + int rr; 2812 + 2813 + rr = drbd_release_ee(mdev, &mdev->active_ee); 2814 + if (rr) 2815 + dev_err(DEV, "%d EEs in active list found!\n", rr); 2816 + 2817 + rr = drbd_release_ee(mdev, &mdev->sync_ee); 2818 + if (rr) 2819 + dev_err(DEV, "%d EEs in sync list found!\n", rr); 2820 + 2821 + rr = drbd_release_ee(mdev, &mdev->read_ee); 2822 + if (rr) 2823 + dev_err(DEV, "%d EEs in read list found!\n", rr); 2824 + 2825 + rr = drbd_release_ee(mdev, &mdev->done_ee); 2826 + if (rr) 2827 + dev_err(DEV, "%d EEs in done list found!\n", rr); 2828 + 2829 + rr = drbd_release_ee(mdev, &mdev->net_ee); 2830 + if (rr) 2831 + dev_err(DEV, "%d EEs in net list found!\n", rr); 2832 + } 2833 + 2834 + /* caution. no locking. 2835 + * currently only used from module cleanup code. */ 2836 + static void drbd_delete_device(unsigned int minor) 2837 + { 2838 + struct drbd_conf *mdev = minor_to_mdev(minor); 2839 + 2840 + if (!mdev) 2841 + return; 2842 + 2843 + /* paranoia asserts */ 2844 + if (mdev->open_cnt != 0) 2845 + dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, 2846 + __FILE__ , __LINE__); 2847 + 2848 + ERR_IF (!list_empty(&mdev->data.work.q)) { 2849 + struct list_head *lp; 2850 + list_for_each(lp, &mdev->data.work.q) { 2851 + dev_err(DEV, "lp = %p\n", lp); 2852 + } 2853 + }; 2854 + /* end paranoia asserts */ 2855 + 2856 + del_gendisk(mdev->vdisk); 2857 + 2858 + /* cleanup stuff that may have been allocated during 2859 + * device (re-)configuration or state changes */ 2860 + 2861 + if (mdev->this_bdev) 2862 + bdput(mdev->this_bdev); 2863 + 2864 + drbd_free_resources(mdev); 2865 + 2866 + drbd_release_ee_lists(mdev); 2867 + 2868 + /* should be free'd on disconnect? 
*/ 2869 + kfree(mdev->ee_hash); 2870 + /* 2871 + mdev->ee_hash_s = 0; 2872 + mdev->ee_hash = NULL; 2873 + */ 2874 + 2875 + lc_destroy(mdev->act_log); 2876 + lc_destroy(mdev->resync); 2877 + 2878 + kfree(mdev->p_uuid); 2879 + /* mdev->p_uuid = NULL; */ 2880 + 2881 + kfree(mdev->int_dig_out); 2882 + kfree(mdev->int_dig_in); 2883 + kfree(mdev->int_dig_vv); 2884 + 2885 + /* cleanup the rest that has been 2886 + * allocated from drbd_new_device 2887 + * and actually free the mdev itself */ 2888 + drbd_free_mdev(mdev); 2889 + } 2890 + 2891 + static void drbd_cleanup(void) 2892 + { 2893 + unsigned int i; 2894 + 2895 + unregister_reboot_notifier(&drbd_notifier); 2896 + 2897 + drbd_nl_cleanup(); 2898 + 2899 + if (minor_table) { 2900 + if (drbd_proc) 2901 + remove_proc_entry("drbd", NULL); 2902 + i = minor_count; 2903 + while (i--) 2904 + drbd_delete_device(i); 2905 + drbd_destroy_mempools(); 2906 + } 2907 + 2908 + kfree(minor_table); 2909 + 2910 + unregister_blkdev(DRBD_MAJOR, "drbd"); 2911 + 2912 + printk(KERN_INFO "drbd: module cleanup done.\n"); 2913 + } 2914 + 2915 + /** 2916 + * drbd_congested() - Callback for pdflush 2917 + * @congested_data: User data 2918 + * @bdi_bits: Bits pdflush is currently interested in 2919 + * 2920 + * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested. 2921 + */ 2922 + static int drbd_congested(void *congested_data, int bdi_bits) 2923 + { 2924 + struct drbd_conf *mdev = congested_data; 2925 + struct request_queue *q; 2926 + char reason = '-'; 2927 + int r = 0; 2928 + 2929 + if (!__inc_ap_bio_cond(mdev)) { 2930 + /* DRBD has frozen IO */ 2931 + r = bdi_bits; 2932 + reason = 'd'; 2933 + goto out; 2934 + } 2935 + 2936 + if (get_ldev(mdev)) { 2937 + q = bdev_get_queue(mdev->ldev->backing_bdev); 2938 + r = bdi_congested(&q->backing_dev_info, bdi_bits); 2939 + put_ldev(mdev); 2940 + if (r) 2941 + reason = 'b'; 2942 + } 2943 + 2944 + if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) { 2945 + r |= (1 << BDI_async_congested); 2946 + reason = reason == 'b' ? 'a' : 'n'; 2947 + } 2948 + 2949 + out: 2950 + mdev->congestion_reason = reason; 2951 + return r; 2952 + } 2953 + 2954 + struct drbd_conf *drbd_new_device(unsigned int minor) 2955 + { 2956 + struct drbd_conf *mdev; 2957 + struct gendisk *disk; 2958 + struct request_queue *q; 2959 + 2960 + /* GFP_KERNEL, we are outside of all write-out paths */ 2961 + mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); 2962 + if (!mdev) 2963 + return NULL; 2964 + if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL)) 2965 + goto out_no_cpumask; 2966 + 2967 + mdev->minor = minor; 2968 + 2969 + drbd_init_set_defaults(mdev); 2970 + 2971 + q = blk_alloc_queue(GFP_KERNEL); 2972 + if (!q) 2973 + goto out_no_q; 2974 + mdev->rq_queue = q; 2975 + q->queuedata = mdev; 2976 + blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE); 2977 + 2978 + disk = alloc_disk(1); 2979 + if (!disk) 2980 + goto out_no_disk; 2981 + mdev->vdisk = disk; 2982 + 2983 + set_disk_ro(disk, TRUE); 2984 + 2985 + disk->queue = q; 2986 + disk->major = DRBD_MAJOR; 2987 + disk->first_minor = minor; 2988 + disk->fops = &drbd_ops; 2989 + sprintf(disk->disk_name, "drbd%d", minor); 2990 + disk->private_data = mdev; 2991 + 2992 + mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor)); 2993 + /* we have no partitions. we contain only ourselves. 
*/ 2994 + mdev->this_bdev->bd_contains = mdev->this_bdev; 2995 + 2996 + q->backing_dev_info.congested_fn = drbd_congested; 2997 + q->backing_dev_info.congested_data = mdev; 2998 + 2999 + blk_queue_make_request(q, drbd_make_request_26); 3000 + blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); 3001 + blk_queue_merge_bvec(q, drbd_merge_bvec); 3002 + q->queue_lock = &mdev->req_lock; /* needed since we use */ 3003 + /* plugging on a queue, that actually has no requests! */ 3004 + q->unplug_fn = drbd_unplug_fn; 3005 + 3006 + mdev->md_io_page = alloc_page(GFP_KERNEL); 3007 + if (!mdev->md_io_page) 3008 + goto out_no_io_page; 3009 + 3010 + if (drbd_bm_init(mdev)) 3011 + goto out_no_bitmap; 3012 + /* no need to lock access, we are still initializing this minor device. */ 3013 + if (!tl_init(mdev)) 3014 + goto out_no_tl; 3015 + 3016 + mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL); 3017 + if (!mdev->app_reads_hash) 3018 + goto out_no_app_reads; 3019 + 3020 + mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); 3021 + if (!mdev->current_epoch) 3022 + goto out_no_epoch; 3023 + 3024 + INIT_LIST_HEAD(&mdev->current_epoch->list); 3025 + mdev->epochs = 1; 3026 + 3027 + return mdev; 3028 + 3029 + /* out_whatever_else: 3030 + kfree(mdev->current_epoch); */ 3031 + out_no_epoch: 3032 + kfree(mdev->app_reads_hash); 3033 + out_no_app_reads: 3034 + tl_cleanup(mdev); 3035 + out_no_tl: 3036 + drbd_bm_cleanup(mdev); 3037 + out_no_bitmap: 3038 + __free_page(mdev->md_io_page); 3039 + out_no_io_page: 3040 + put_disk(disk); 3041 + out_no_disk: 3042 + blk_cleanup_queue(q); 3043 + out_no_q: 3044 + free_cpumask_var(mdev->cpu_mask); 3045 + out_no_cpumask: 3046 + kfree(mdev); 3047 + return NULL; 3048 + } 3049 + 3050 + /* counterpart of drbd_new_device. 3051 + * last part of drbd_delete_device. */ 3052 + void drbd_free_mdev(struct drbd_conf *mdev) 3053 + { 3054 + kfree(mdev->current_epoch); 3055 + kfree(mdev->app_reads_hash); 3056 + tl_cleanup(mdev); 3057 + if (mdev->bitmap) /* should no longer be there. 
*/ 3058 + drbd_bm_cleanup(mdev); 3059 + __free_page(mdev->md_io_page); 3060 + put_disk(mdev->vdisk); 3061 + blk_cleanup_queue(mdev->rq_queue); 3062 + free_cpumask_var(mdev->cpu_mask); 3063 + kfree(mdev); 3064 + } 3065 + 3066 + 3067 + int __init drbd_init(void) 3068 + { 3069 + int err; 3070 + 3071 + if (sizeof(struct p_handshake) != 80) { 3072 + printk(KERN_ERR 3073 + "drbd: never change the size or layout " 3074 + "of the HandShake packet.\n"); 3075 + return -EINVAL; 3076 + } 3077 + 3078 + if (1 > minor_count || minor_count > 255) { 3079 + printk(KERN_ERR 3080 + "drbd: invalid minor_count (%d)\n", minor_count); 3081 + #ifdef MODULE 3082 + return -EINVAL; 3083 + #else 3084 + minor_count = 8; 3085 + #endif 3086 + } 3087 + 3088 + err = drbd_nl_init(); 3089 + if (err) 3090 + return err; 3091 + 3092 + err = register_blkdev(DRBD_MAJOR, "drbd"); 3093 + if (err) { 3094 + printk(KERN_ERR 3095 + "drbd: unable to register block device major %d\n", 3096 + DRBD_MAJOR); 3097 + return err; 3098 + } 3099 + 3100 + register_reboot_notifier(&drbd_notifier); 3101 + 3102 + /* 3103 + * allocate all necessary structs 3104 + */ 3105 + err = -ENOMEM; 3106 + 3107 + init_waitqueue_head(&drbd_pp_wait); 3108 + 3109 + drbd_proc = NULL; /* play safe for drbd_cleanup */ 3110 + minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count, 3111 + GFP_KERNEL); 3112 + if (!minor_table) 3113 + goto Enomem; 3114 + 3115 + err = drbd_create_mempools(); 3116 + if (err) 3117 + goto Enomem; 3118 + 3119 + drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); 3120 + if (!drbd_proc) { 3121 + printk(KERN_ERR "drbd: unable to register proc file\n"); 3122 + goto Enomem; 3123 + } 3124 + 3125 + rwlock_init(&global_state_lock); 3126 + 3127 + printk(KERN_INFO "drbd: initialized. " 3128 + "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", 3129 + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); 3130 + printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); 3131 + printk(KERN_INFO "drbd: registered as block device major %d\n", 3132 + DRBD_MAJOR); 3133 + printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table); 3134 + 3135 + return 0; /* Success! 
*/ 3136 + 3137 + Enomem: 3138 + drbd_cleanup(); 3139 + if (err == -ENOMEM) 3140 + /* currently always the case */ 3141 + printk(KERN_ERR "drbd: ran out of memory\n"); 3142 + else 3143 + printk(KERN_ERR "drbd: initialization failure\n"); 3144 + return err; 3145 + } 3146 + 3147 + void drbd_free_bc(struct drbd_backing_dev *ldev) 3148 + { 3149 + if (ldev == NULL) 3150 + return; 3151 + 3152 + bd_release(ldev->backing_bdev); 3153 + bd_release(ldev->md_bdev); 3154 + 3155 + fput(ldev->lo_file); 3156 + fput(ldev->md_file); 3157 + 3158 + kfree(ldev); 3159 + } 3160 + 3161 + void drbd_free_sock(struct drbd_conf *mdev) 3162 + { 3163 + if (mdev->data.socket) { 3164 + kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); 3165 + sock_release(mdev->data.socket); 3166 + mdev->data.socket = NULL; 3167 + } 3168 + if (mdev->meta.socket) { 3169 + kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); 3170 + sock_release(mdev->meta.socket); 3171 + mdev->meta.socket = NULL; 3172 + } 3173 + } 3174 + 3175 + 3176 + void drbd_free_resources(struct drbd_conf *mdev) 3177 + { 3178 + crypto_free_hash(mdev->csums_tfm); 3179 + mdev->csums_tfm = NULL; 3180 + crypto_free_hash(mdev->verify_tfm); 3181 + mdev->verify_tfm = NULL; 3182 + crypto_free_hash(mdev->cram_hmac_tfm); 3183 + mdev->cram_hmac_tfm = NULL; 3184 + crypto_free_hash(mdev->integrity_w_tfm); 3185 + mdev->integrity_w_tfm = NULL; 3186 + crypto_free_hash(mdev->integrity_r_tfm); 3187 + mdev->integrity_r_tfm = NULL; 3188 + 3189 + drbd_free_sock(mdev); 3190 + 3191 + __no_warn(local, 3192 + drbd_free_bc(mdev->ldev); 3193 + mdev->ldev = NULL;); 3194 + } 3195 + 3196 + /* meta data management */ 3197 + 3198 + struct meta_data_on_disk { 3199 + u64 la_size; /* last agreed size. */ 3200 + u64 uuid[UI_SIZE]; /* UUIDs. */ 3201 + u64 device_uuid; 3202 + u64 reserved_u64_1; 3203 + u32 flags; /* MDF */ 3204 + u32 magic; 3205 + u32 md_size_sect; 3206 + u32 al_offset; /* offset to this block */ 3207 + u32 al_nr_extents; /* important for restoring the AL */ 3208 + /* `-- act_log->nr_elements <-- sync_conf.al_extents */ 3209 + u32 bm_offset; /* offset to the bitmap, from here */ 3210 + u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ 3211 + u32 reserved_u32[4]; 3212 + 3213 + } __packed; 3214 + 3215 + /** 3216 + * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set 3217 + * @mdev: DRBD device. 3218 + */ 3219 + void drbd_md_sync(struct drbd_conf *mdev) 3220 + { 3221 + struct meta_data_on_disk *buffer; 3222 + sector_t sector; 3223 + int i; 3224 + 3225 + if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) 3226 + return; 3227 + del_timer(&mdev->md_sync_timer); 3228 + 3229 + /* We use here D_FAILED and not D_ATTACHING because we try to write 3230 + * metadata even if we detach due to a disk failure! 
*/ 3231 + if (!get_ldev_if_state(mdev, D_FAILED)) 3232 + return; 3233 + 3234 + mutex_lock(&mdev->md_io_mutex); 3235 + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); 3236 + memset(buffer, 0, 512); 3237 + 3238 + buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); 3239 + for (i = UI_CURRENT; i < UI_SIZE; i++) 3240 + buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); 3241 + buffer->flags = cpu_to_be32(mdev->ldev->md.flags); 3242 + buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); 3243 + 3244 + buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); 3245 + buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); 3246 + buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements); 3247 + buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE); 3248 + buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); 3249 + 3250 + buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); 3251 + 3252 + D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); 3253 + sector = mdev->ldev->md.md_offset; 3254 + 3255 + if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { 3256 + clear_bit(MD_DIRTY, &mdev->flags); 3257 + } else { 3258 + /* this was a try anyways ... */ 3259 + dev_err(DEV, "meta data update failed!\n"); 3260 + 3261 + drbd_chk_io_error(mdev, 1, TRUE); 3262 + } 3263 + 3264 + /* Update mdev->ldev->md.la_size_sect, 3265 + * since we updated it on metadata. */ 3266 + mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); 3267 + 3268 + mutex_unlock(&mdev->md_io_mutex); 3269 + put_ldev(mdev); 3270 + } 3271 + 3272 + /** 3273 + * drbd_md_read() - Reads in the meta data super block 3274 + * @mdev: DRBD device. 3275 + * @bdev: Device from which the meta data should be read in. 3276 + * 3277 + * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case 3278 + * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. 
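drbd_md_sync() above stores every multi-byte field of the super block big-endian (cpu_to_be32/cpu_to_be64), and drbd_md_read() below trusts nothing until the magic matches, then cross-checks the offsets. A standalone userspace sketch of that round trip; the demo_* names are invented and DEMO_MD_MAGIC is a placeholder, not the real DRBD_MD_MAGIC:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_MD_MAGIC 0x12345678u

struct demo_md_on_disk {
	uint32_t magic;      /* stored big-endian, like buffer->magic above */
	uint32_t al_offset;  /* stored big-endian, like buffer->al_offset */
};

static void demo_md_encode(struct demo_md_on_disk *buf, uint32_t al_offset)
{
	memset(buf, 0, sizeof(*buf));
	buf->magic = htobe32(DEMO_MD_MAGIC);
	buf->al_offset = htobe32(al_offset);
}

static int demo_md_decode(const struct demo_md_on_disk *buf, uint32_t *al_offset)
{
	if (be32toh(buf->magic) != DEMO_MD_MAGIC)
		return -1;  /* corresponds to ERR_MD_INVALID */
	*al_offset = be32toh(buf->al_offset);
	return 0;
}

int main(void)
{
	struct demo_md_on_disk buf;
	uint32_t al;

	demo_md_encode(&buf, 8);
	if (demo_md_decode(&buf, &al) == 0)
		printf("al_offset = %u\n", al);
	return 0;
}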
3279 + */ 3280 + int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) 3281 + { 3282 + struct meta_data_on_disk *buffer; 3283 + int i, rv = NO_ERROR; 3284 + 3285 + if (!get_ldev_if_state(mdev, D_ATTACHING)) 3286 + return ERR_IO_MD_DISK; 3287 + 3288 + mutex_lock(&mdev->md_io_mutex); 3289 + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); 3290 + 3291 + if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { 3292 + /* NOTE: cant do normal error processing here as this is 3293 + called BEFORE disk is attached */ 3294 + dev_err(DEV, "Error while reading metadata.\n"); 3295 + rv = ERR_IO_MD_DISK; 3296 + goto err; 3297 + } 3298 + 3299 + if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { 3300 + dev_err(DEV, "Error while reading metadata, magic not found.\n"); 3301 + rv = ERR_MD_INVALID; 3302 + goto err; 3303 + } 3304 + if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { 3305 + dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", 3306 + be32_to_cpu(buffer->al_offset), bdev->md.al_offset); 3307 + rv = ERR_MD_INVALID; 3308 + goto err; 3309 + } 3310 + if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { 3311 + dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", 3312 + be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); 3313 + rv = ERR_MD_INVALID; 3314 + goto err; 3315 + } 3316 + if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { 3317 + dev_err(DEV, "unexpected md_size: %u (expected %u)\n", 3318 + be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); 3319 + rv = ERR_MD_INVALID; 3320 + goto err; 3321 + } 3322 + 3323 + if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { 3324 + dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", 3325 + be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); 3326 + rv = ERR_MD_INVALID; 3327 + goto err; 3328 + } 3329 + 3330 + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); 3331 + for (i = UI_CURRENT; i < UI_SIZE; i++) 3332 + bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); 3333 + bdev->md.flags = be32_to_cpu(buffer->flags); 3334 + mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); 3335 + bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); 3336 + 3337 + if (mdev->sync_conf.al_extents < 7) 3338 + mdev->sync_conf.al_extents = 127; 3339 + 3340 + err: 3341 + mutex_unlock(&mdev->md_io_mutex); 3342 + put_ldev(mdev); 3343 + 3344 + return rv; 3345 + } 3346 + 3347 + /** 3348 + * drbd_md_mark_dirty() - Mark meta data super block as dirty 3349 + * @mdev: DRBD device. 3350 + * 3351 + * Call this function if you change anything that should be written to 3352 + * the meta-data super block. This function sets MD_DIRTY, and starts a 3353 + * timer that ensures that within five seconds you have to call drbd_md_sync(). 
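drbd_md_mark_dirty() below implements a lazy write-back contract: a change only sets a flag and arms a deadline, and drbd_md_sync() is a cheap no-op whenever nothing is dirty. A userspace model of that contract, with wall-clock time standing in for jiffies and invented demo_* names:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static bool md_dirty;
static time_t md_deadline;

static void demo_md_mark_dirty(void)
{
	md_dirty = true;
	md_deadline = time(NULL) + 5;   /* mirrors mod_timer(..., jiffies + 5*HZ) */
}

static void demo_md_sync(void)
{
	if (!md_dirty)                  /* mirrors test_and_clear_bit(MD_DIRTY, ...) */
		return;
	md_dirty = false;
	printf("writing meta data super block before %ld\n", (long)md_deadline);
}

int main(void)
{
	demo_md_mark_dirty();
	demo_md_sync();   /* writes */
	demo_md_sync();   /* no-op: nothing dirty */
	return 0;
}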
3354 + */ 3355 + void drbd_md_mark_dirty(struct drbd_conf *mdev) 3356 + { 3357 + set_bit(MD_DIRTY, &mdev->flags); 3358 + mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); 3359 + } 3360 + 3361 + 3362 + static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) 3363 + { 3364 + int i; 3365 + 3366 + for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) 3367 + mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; 3368 + } 3369 + 3370 + void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) 3371 + { 3372 + if (idx == UI_CURRENT) { 3373 + if (mdev->state.role == R_PRIMARY) 3374 + val |= 1; 3375 + else 3376 + val &= ~((u64)1); 3377 + 3378 + drbd_set_ed_uuid(mdev, val); 3379 + } 3380 + 3381 + mdev->ldev->md.uuid[idx] = val; 3382 + drbd_md_mark_dirty(mdev); 3383 + } 3384 + 3385 + 3386 + void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) 3387 + { 3388 + if (mdev->ldev->md.uuid[idx]) { 3389 + drbd_uuid_move_history(mdev); 3390 + mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; 3391 + } 3392 + _drbd_uuid_set(mdev, idx, val); 3393 + } 3394 + 3395 + /** 3396 + * drbd_uuid_new_current() - Creates a new current UUID 3397 + * @mdev: DRBD device. 3398 + * 3399 + * Creates a new current UUID, and rotates the old current UUID into 3400 + * the bitmap slot. Causes an incremental resync upon next connect. 3401 + */ 3402 + void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) 3403 + { 3404 + u64 val; 3405 + 3406 + dev_info(DEV, "Creating new current UUID\n"); 3407 + D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0); 3408 + mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; 3409 + 3410 + get_random_bytes(&val, sizeof(u64)); 3411 + _drbd_uuid_set(mdev, UI_CURRENT, val); 3412 + } 3413 + 3414 + void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) 3415 + { 3416 + if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) 3417 + return; 3418 + 3419 + if (val == 0) { 3420 + drbd_uuid_move_history(mdev); 3421 + mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; 3422 + mdev->ldev->md.uuid[UI_BITMAP] = 0; 3423 + } else { 3424 + if (mdev->ldev->md.uuid[UI_BITMAP]) 3425 + dev_warn(DEV, "bm UUID already set"); 3426 + 3427 + mdev->ldev->md.uuid[UI_BITMAP] = val; 3428 + mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1); 3429 + 3430 + } 3431 + drbd_md_mark_dirty(mdev); 3432 + } 3433 + 3434 + /** 3435 + * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() 3436 + * @mdev: DRBD device. 3437 + * 3438 + * Sets all bits in the bitmap and writes the whole bitmap to stable storage. 3439 + */ 3440 + int drbd_bmio_set_n_write(struct drbd_conf *mdev) 3441 + { 3442 + int rv = -EIO; 3443 + 3444 + if (get_ldev_if_state(mdev, D_ATTACHING)) { 3445 + drbd_md_set_flag(mdev, MDF_FULL_SYNC); 3446 + drbd_md_sync(mdev); 3447 + drbd_bm_set_all(mdev); 3448 + 3449 + rv = drbd_bm_write(mdev); 3450 + 3451 + if (!rv) { 3452 + drbd_md_clear_flag(mdev, MDF_FULL_SYNC); 3453 + drbd_md_sync(mdev); 3454 + } 3455 + 3456 + put_ldev(mdev); 3457 + } 3458 + 3459 + return rv; 3460 + } 3461 + 3462 + /** 3463 + * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() 3464 + * @mdev: DRBD device. 3465 + * 3466 + * Clears all bits in the bitmap and writes the whole bitmap to stable storage. 
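The UUID helpers above maintain a small ring: drbd_uuid_set() pushes the old value into history, drbd_uuid_new_current() parks the current UUID in the bitmap slot, and the lowest bit of the current UUID encodes the Primary role. A compact userspace model of that rotation; the demo_* names are invented and the slot layout mirrors the UI_* indices used above:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

enum { UI_CURRENT, UI_BITMAP, UI_HISTORY_START, UI_HISTORY_END, UI_SIZE };

static uint64_t uuid[UI_SIZE];

/* Oldest history entry falls off the end, like drbd_uuid_move_history(). */
static void demo_uuid_move_history(void)
{
	for (int i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
		uuid[i + 1] = uuid[i];
}

/* Like drbd_uuid_new_current(): park the old current UUID in the bitmap
 * slot, install a fresh one, and encode the role in the lowest bit. */
static void demo_uuid_new_current(uint64_t fresh, int primary)
{
	uuid[UI_BITMAP] = uuid[UI_CURRENT];
	uuid[UI_CURRENT] = primary ? (fresh | 1) : (fresh & ~(uint64_t)1);
}

int main(void)
{
	uuid[UI_CURRENT] = 0x1000;
	demo_uuid_move_history();               /* no-op here: history is empty */
	demo_uuid_new_current(0x2000, 1);
	printf("current=%" PRIx64 " bitmap=%" PRIx64 "\n",
	       uuid[UI_CURRENT], uuid[UI_BITMAP]);
	return 0;
}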
3467 + */ 3468 + int drbd_bmio_clear_n_write(struct drbd_conf *mdev) 3469 + { 3470 + int rv = -EIO; 3471 + 3472 + if (get_ldev_if_state(mdev, D_ATTACHING)) { 3473 + drbd_bm_clear_all(mdev); 3474 + rv = drbd_bm_write(mdev); 3475 + put_ldev(mdev); 3476 + } 3477 + 3478 + return rv; 3479 + } 3480 + 3481 + static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) 3482 + { 3483 + struct bm_io_work *work = container_of(w, struct bm_io_work, w); 3484 + int rv; 3485 + 3486 + D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); 3487 + 3488 + drbd_bm_lock(mdev, work->why); 3489 + rv = work->io_fn(mdev); 3490 + drbd_bm_unlock(mdev); 3491 + 3492 + clear_bit(BITMAP_IO, &mdev->flags); 3493 + wake_up(&mdev->misc_wait); 3494 + 3495 + if (work->done) 3496 + work->done(mdev, rv); 3497 + 3498 + clear_bit(BITMAP_IO_QUEUED, &mdev->flags); 3499 + work->why = NULL; 3500 + 3501 + return 1; 3502 + } 3503 + 3504 + /** 3505 + * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap 3506 + * @mdev: DRBD device. 3507 + * @io_fn: IO callback to be called when bitmap IO is possible 3508 + * @done: callback to be called after the bitmap IO was performed 3509 + * @why: Descriptive text of the reason for doing the IO 3510 + * 3511 + * While IO on the bitmap happens we freeze application IO thus we ensure 3512 + * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be 3513 + * called from worker context. It MUST NOT be used while a previous such 3514 + * work is still pending! 3515 + */ 3516 + void drbd_queue_bitmap_io(struct drbd_conf *mdev, 3517 + int (*io_fn)(struct drbd_conf *), 3518 + void (*done)(struct drbd_conf *, int), 3519 + char *why) 3520 + { 3521 + D_ASSERT(current == mdev->worker.task); 3522 + 3523 + D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); 3524 + D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); 3525 + D_ASSERT(list_empty(&mdev->bm_io_work.w.list)); 3526 + if (mdev->bm_io_work.why) 3527 + dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n", 3528 + why, mdev->bm_io_work.why); 3529 + 3530 + mdev->bm_io_work.io_fn = io_fn; 3531 + mdev->bm_io_work.done = done; 3532 + mdev->bm_io_work.why = why; 3533 + 3534 + set_bit(BITMAP_IO, &mdev->flags); 3535 + if (atomic_read(&mdev->ap_bio_cnt) == 0) { 3536 + if (list_empty(&mdev->bm_io_work.w.list)) { 3537 + set_bit(BITMAP_IO_QUEUED, &mdev->flags); 3538 + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); 3539 + } else 3540 + dev_err(DEV, "FIXME avoided double queuing bm_io_work\n"); 3541 + } 3542 + } 3543 + 3544 + /** 3545 + * drbd_bitmap_io() - Does an IO operation on the whole bitmap 3546 + * @mdev: DRBD device. 3547 + * @io_fn: IO callback to be called when bitmap IO is possible 3548 + * @why: Descriptive text of the reason for doing the IO 3549 + * 3550 + * freezes application IO while that the actual IO operations runs. This 3551 + * functions MAY NOT be called from worker context. 
3552 + */ 3553 + int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why) 3554 + { 3555 + int rv; 3556 + 3557 + D_ASSERT(current != mdev->worker.task); 3558 + 3559 + drbd_suspend_io(mdev); 3560 + 3561 + drbd_bm_lock(mdev, why); 3562 + rv = io_fn(mdev); 3563 + drbd_bm_unlock(mdev); 3564 + 3565 + drbd_resume_io(mdev); 3566 + 3567 + return rv; 3568 + } 3569 + 3570 + void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local) 3571 + { 3572 + if ((mdev->ldev->md.flags & flag) != flag) { 3573 + drbd_md_mark_dirty(mdev); 3574 + mdev->ldev->md.flags |= flag; 3575 + } 3576 + } 3577 + 3578 + void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local) 3579 + { 3580 + if ((mdev->ldev->md.flags & flag) != 0) { 3581 + drbd_md_mark_dirty(mdev); 3582 + mdev->ldev->md.flags &= ~flag; 3583 + } 3584 + } 3585 + int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) 3586 + { 3587 + return (bdev->md.flags & flag) != 0; 3588 + } 3589 + 3590 + static void md_sync_timer_fn(unsigned long data) 3591 + { 3592 + struct drbd_conf *mdev = (struct drbd_conf *) data; 3593 + 3594 + drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work); 3595 + } 3596 + 3597 + static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) 3598 + { 3599 + dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); 3600 + drbd_md_sync(mdev); 3601 + 3602 + return 1; 3603 + } 3604 + 3605 + #ifdef CONFIG_DRBD_FAULT_INJECTION 3606 + /* Fault insertion support including random number generator shamelessly 3607 + * stolen from kernel/rcutorture.c */ 3608 + struct fault_random_state { 3609 + unsigned long state; 3610 + unsigned long count; 3611 + }; 3612 + 3613 + #define FAULT_RANDOM_MULT 39916801 /* prime */ 3614 + #define FAULT_RANDOM_ADD 479001701 /* prime */ 3615 + #define FAULT_RANDOM_REFRESH 10000 3616 + 3617 + /* 3618 + * Crude but fast random-number generator. Uses a linear congruential 3619 + * generator, with occasional help from get_random_bytes(). 3620 + */ 3621 + static unsigned long 3622 + _drbd_fault_random(struct fault_random_state *rsp) 3623 + { 3624 + long refresh; 3625 + 3626 + if (--rsp->count < 0) { 3627 + get_random_bytes(&refresh, sizeof(refresh)); 3628 + rsp->state += refresh; 3629 + rsp->count = FAULT_RANDOM_REFRESH; 3630 + } 3631 + rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD; 3632 + return swahw32(rsp->state); 3633 + } 3634 + 3635 + static char * 3636 + _drbd_fault_str(unsigned int type) { 3637 + static char *_faults[] = { 3638 + [DRBD_FAULT_MD_WR] = "Meta-data write", 3639 + [DRBD_FAULT_MD_RD] = "Meta-data read", 3640 + [DRBD_FAULT_RS_WR] = "Resync write", 3641 + [DRBD_FAULT_RS_RD] = "Resync read", 3642 + [DRBD_FAULT_DT_WR] = "Data write", 3643 + [DRBD_FAULT_DT_RD] = "Data read", 3644 + [DRBD_FAULT_DT_RA] = "Data read ahead", 3645 + [DRBD_FAULT_BM_ALLOC] = "BM allocation", 3646 + [DRBD_FAULT_AL_EE] = "EE allocation" 3647 + }; 3648 + 3649 + return (type < DRBD_FAULT_MAX) ? 
_faults[type] : "**Unknown**"; 3650 + } 3651 + 3652 + unsigned int 3653 + _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) 3654 + { 3655 + static struct fault_random_state rrs = {0, 0}; 3656 + 3657 + unsigned int ret = ( 3658 + (fault_devs == 0 || 3659 + ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) && 3660 + (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate)); 3661 + 3662 + if (ret) { 3663 + fault_count++; 3664 + 3665 + if (printk_ratelimit()) 3666 + dev_warn(DEV, "***Simulating %s failure\n", 3667 + _drbd_fault_str(type)); 3668 + } 3669 + 3670 + return ret; 3671 + } 3672 + #endif 3673 + 3674 + const char *drbd_buildtag(void) 3675 + { 3676 + /* DRBD built from external sources has here a reference to the 3677 + git hash of the source code. */ 3678 + 3679 + static char buildtag[38] = "\0uilt-in"; 3680 + 3681 + if (buildtag[0] == 0) { 3682 + #ifdef CONFIG_MODULES 3683 + if (THIS_MODULE != NULL) 3684 + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); 3685 + else 3686 + #endif 3687 + buildtag[0] = 'b'; 3688 + } 3689 + 3690 + return buildtag; 3691 + } 3692 + 3693 + module_init(drbd_init) 3694 + module_exit(drbd_cleanup) 3695 + 3696 + EXPORT_SYMBOL(drbd_conn_str); 3697 + EXPORT_SYMBOL(drbd_role_str); 3698 + EXPORT_SYMBOL(drbd_disk_str); 3699 + EXPORT_SYMBOL(drbd_set_st_err_str);
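The fault-injection path above draws from a linear congruential generator and fires when the device passes the fault_devs mask and a percentage roll passes fault_rate. A userspace model reusing the same primes; it omits the kernel's periodic get_random_bytes() reseed and the swahw32() word swap, and the demo_* names are invented:

#include <stdio.h>

#define FAULT_RANDOM_MULT 39916801UL    /* same primes as above */
#define FAULT_RANDOM_ADD  479001701UL

static unsigned long lcg_state = 1;     /* the kernel reseeds this from
                                         * get_random_bytes() now and then */

static unsigned long demo_fault_random(void)
{
	lcg_state = lcg_state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
	return lcg_state;
}

/* Model of the _drbd_insert_fault() decision: fault_devs is a bitmask of
 * minors allowed to fault (0 = all), fault_rate a percentage. */
static int demo_insert_fault(unsigned int minor, unsigned long fault_devs,
			     unsigned int fault_rate)
{
	return (fault_devs == 0 || ((1UL << minor) & fault_devs)) &&
	       (demo_fault_random() % 100 + 1 <= fault_rate);
}

int main(void)
{
	int hits = 0;

	for (int i = 0; i < 1000; i++)
		hits += demo_insert_fault(0, 0, 10);   /* ~10% expected */
	printf("simulated faults: %d/1000\n", hits);
	return 0;
}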
+2364
drivers/block/drbd/drbd_nl.c
··· 1 + /* 2 + drbd_nl.c 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + 24 + */ 25 + 26 + #include <linux/module.h> 27 + #include <linux/drbd.h> 28 + #include <linux/in.h> 29 + #include <linux/fs.h> 30 + #include <linux/file.h> 31 + #include <linux/slab.h> 32 + #include <linux/connector.h> 33 + #include <linux/blkpg.h> 34 + #include <linux/cpumask.h> 35 + #include "drbd_int.h" 36 + #include "drbd_wrappers.h" 37 + #include <asm/unaligned.h> 38 + #include <linux/drbd_tag_magic.h> 39 + #include <linux/drbd_limits.h> 40 + 41 + static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); 42 + static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); 43 + static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *); 44 + 45 + /* see get_sb_bdev and bd_claim */ 46 + static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; 47 + 48 + /* Generate the tag_list to struct functions */ 49 + #define NL_PACKET(name, number, fields) \ 50 + static int name ## _from_tags(struct drbd_conf *mdev, \ 51 + unsigned short *tags, struct name *arg) __attribute__ ((unused)); \ 52 + static int name ## _from_tags(struct drbd_conf *mdev, \ 53 + unsigned short *tags, struct name *arg) \ 54 + { \ 55 + int tag; \ 56 + int dlen; \ 57 + \ 58 + while ((tag = get_unaligned(tags++)) != TT_END) { \ 59 + dlen = get_unaligned(tags++); \ 60 + switch (tag_number(tag)) { \ 61 + fields \ 62 + default: \ 63 + if (tag & T_MANDATORY) { \ 64 + dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \ 65 + return 0; \ 66 + } \ 67 + } \ 68 + tags = (unsigned short *)((char *)tags + dlen); \ 69 + } \ 70 + return 1; \ 71 + } 72 + #define NL_INTEGER(pn, pr, member) \ 73 + case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ 74 + arg->member = get_unaligned((int *)(tags)); \ 75 + break; 76 + #define NL_INT64(pn, pr, member) \ 77 + case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ 78 + arg->member = get_unaligned((u64 *)(tags)); \ 79 + break; 80 + #define NL_BIT(pn, pr, member) \ 81 + case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ 82 + arg->member = *(char *)(tags) ? 
1 : 0; \ 83 + break; 84 + #define NL_STRING(pn, pr, member, len) \ 85 + case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ 86 + if (dlen > len) { \ 87 + dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \ 88 + #member, dlen, (unsigned int)len); \ 89 + return 0; \ 90 + } \ 91 + arg->member ## _len = dlen; \ 92 + memcpy(arg->member, tags, min_t(size_t, dlen, len)); \ 93 + break; 94 + #include "linux/drbd_nl.h" 95 + 96 + /* Generate the struct to tag_list functions */ 97 + #define NL_PACKET(name, number, fields) \ 98 + static unsigned short* \ 99 + name ## _to_tags(struct drbd_conf *mdev, \ 100 + struct name *arg, unsigned short *tags) __attribute__ ((unused)); \ 101 + static unsigned short* \ 102 + name ## _to_tags(struct drbd_conf *mdev, \ 103 + struct name *arg, unsigned short *tags) \ 104 + { \ 105 + fields \ 106 + return tags; \ 107 + } 108 + 109 + #define NL_INTEGER(pn, pr, member) \ 110 + put_unaligned(pn | pr | TT_INTEGER, tags++); \ 111 + put_unaligned(sizeof(int), tags++); \ 112 + put_unaligned(arg->member, (int *)tags); \ 113 + tags = (unsigned short *)((char *)tags+sizeof(int)); 114 + #define NL_INT64(pn, pr, member) \ 115 + put_unaligned(pn | pr | TT_INT64, tags++); \ 116 + put_unaligned(sizeof(u64), tags++); \ 117 + put_unaligned(arg->member, (u64 *)tags); \ 118 + tags = (unsigned short *)((char *)tags+sizeof(u64)); 119 + #define NL_BIT(pn, pr, member) \ 120 + put_unaligned(pn | pr | TT_BIT, tags++); \ 121 + put_unaligned(sizeof(char), tags++); \ 122 + *(char *)tags = arg->member; \ 123 + tags = (unsigned short *)((char *)tags+sizeof(char)); 124 + #define NL_STRING(pn, pr, member, len) \ 125 + put_unaligned(pn | pr | TT_STRING, tags++); \ 126 + put_unaligned(arg->member ## _len, tags++); \ 127 + memcpy(tags, arg->member, arg->member ## _len); \ 128 + tags = (unsigned short *)((char *)tags + arg->member ## _len); 129 + #include "linux/drbd_nl.h" 130 + 131 + void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); 132 + void drbd_nl_send_reply(struct cn_msg *, int); 133 + 134 + int drbd_khelper(struct drbd_conf *mdev, char *cmd) 135 + { 136 + char *envp[] = { "HOME=/", 137 + "TERM=linux", 138 + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", 139 + NULL, /* Will be set to address family */ 140 + NULL, /* Will be set to address */ 141 + NULL }; 142 + 143 + char mb[12], af[20], ad[60], *afs; 144 + char *argv[] = {usermode_helper, cmd, mb, NULL }; 145 + int ret; 146 + 147 + snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); 148 + 149 + if (get_net_conf(mdev)) { 150 + switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) { 151 + case AF_INET6: 152 + afs = "ipv6"; 153 + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6", 154 + &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr); 155 + break; 156 + case AF_INET: 157 + afs = "ipv4"; 158 + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", 159 + &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); 160 + break; 161 + default: 162 + afs = "ssocks"; 163 + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", 164 + &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); 165 + } 166 + snprintf(af, 20, "DRBD_PEER_AF=%s", afs); 167 + envp[3]=af; 168 + envp[4]=ad; 169 + put_net_conf(mdev); 170 + } 171 + 172 + dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); 173 + 174 + drbd_bcast_ev_helper(mdev, cmd); 175 + ret = call_usermodehelper(usermode_helper, argv, envp, 1); 176 + if (ret) 177 + dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", 178 + usermode_helper, cmd, mb, 179 + 
(ret >> 8) & 0xff, ret); 180 + else 181 + dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", 182 + usermode_helper, cmd, mb, 183 + (ret >> 8) & 0xff, ret); 184 + 185 + if (ret < 0) /* Ignore any ERRNOs we got. */ 186 + ret = 0; 187 + 188 + return ret; 189 + } 190 + 191 + enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) 192 + { 193 + char *ex_to_string; 194 + int r; 195 + enum drbd_disk_state nps; 196 + enum drbd_fencing_p fp; 197 + 198 + D_ASSERT(mdev->state.pdsk == D_UNKNOWN); 199 + 200 + if (get_ldev_if_state(mdev, D_CONSISTENT)) { 201 + fp = mdev->ldev->dc.fencing; 202 + put_ldev(mdev); 203 + } else { 204 + dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); 205 + return mdev->state.pdsk; 206 + } 207 + 208 + if (fp == FP_STONITH) 209 + _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE); 210 + 211 + r = drbd_khelper(mdev, "fence-peer"); 212 + 213 + switch ((r>>8) & 0xff) { 214 + case 3: /* peer is inconsistent */ 215 + ex_to_string = "peer is inconsistent or worse"; 216 + nps = D_INCONSISTENT; 217 + break; 218 + case 4: /* peer got outdated, or was already outdated */ 219 + ex_to_string = "peer was fenced"; 220 + nps = D_OUTDATED; 221 + break; 222 + case 5: /* peer was down */ 223 + if (mdev->state.disk == D_UP_TO_DATE) { 224 + /* we will(have) create(d) a new UUID anyways... */ 225 + ex_to_string = "peer is unreachable, assumed to be dead"; 226 + nps = D_OUTDATED; 227 + } else { 228 + ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; 229 + nps = mdev->state.pdsk; 230 + } 231 + break; 232 + case 6: /* Peer is primary, voluntarily outdate myself. 233 + * This is useful when an unconnected R_SECONDARY is asked to 234 + * become R_PRIMARY, but finds the other peer being active. */ 235 + ex_to_string = "peer is active"; 236 + dev_warn(DEV, "Peer is primary, outdating myself.\n"); 237 + nps = D_UNKNOWN; 238 + _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE); 239 + break; 240 + case 7: 241 + if (fp != FP_STONITH) 242 + dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n"); 243 + ex_to_string = "peer was stonithed"; 244 + nps = D_OUTDATED; 245 + break; 246 + default: 247 + /* The script is broken ... 
*/ 248 + nps = D_UNKNOWN; 249 + dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); 250 + return nps; 251 + } 252 + 253 + dev_info(DEV, "fence-peer helper returned %d (%s)\n", 254 + (r>>8) & 0xff, ex_to_string); 255 + return nps; 256 + } 257 + 258 + 259 + int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) 260 + { 261 + const int max_tries = 4; 262 + int r = 0; 263 + int try = 0; 264 + int forced = 0; 265 + union drbd_state mask, val; 266 + enum drbd_disk_state nps; 267 + 268 + if (new_role == R_PRIMARY) 269 + request_ping(mdev); /* Detect a dead peer ASAP */ 270 + 271 + mutex_lock(&mdev->state_mutex); 272 + 273 + mask.i = 0; mask.role = R_MASK; 274 + val.i = 0; val.role = new_role; 275 + 276 + while (try++ < max_tries) { 277 + r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE); 278 + 279 + /* in case we first succeeded to outdate, 280 + * but now suddenly could establish a connection */ 281 + if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { 282 + val.pdsk = 0; 283 + mask.pdsk = 0; 284 + continue; 285 + } 286 + 287 + if (r == SS_NO_UP_TO_DATE_DISK && force && 288 + (mdev->state.disk == D_INCONSISTENT || 289 + mdev->state.disk == D_OUTDATED)) { 290 + mask.disk = D_MASK; 291 + val.disk = D_UP_TO_DATE; 292 + forced = 1; 293 + continue; 294 + } 295 + 296 + if (r == SS_NO_UP_TO_DATE_DISK && 297 + mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { 298 + D_ASSERT(mdev->state.pdsk == D_UNKNOWN); 299 + nps = drbd_try_outdate_peer(mdev); 300 + 301 + if (nps == D_OUTDATED || nps == D_INCONSISTENT) { 302 + val.disk = D_UP_TO_DATE; 303 + mask.disk = D_MASK; 304 + } 305 + 306 + val.pdsk = nps; 307 + mask.pdsk = D_MASK; 308 + 309 + continue; 310 + } 311 + 312 + if (r == SS_NOTHING_TO_DO) 313 + goto fail; 314 + if (r == SS_PRIMARY_NOP && mask.pdsk == 0) { 315 + nps = drbd_try_outdate_peer(mdev); 316 + 317 + if (force && nps > D_OUTDATED) { 318 + dev_warn(DEV, "Forced into split brain situation!\n"); 319 + nps = D_OUTDATED; 320 + } 321 + 322 + mask.pdsk = D_MASK; 323 + val.pdsk = nps; 324 + 325 + continue; 326 + } 327 + if (r == SS_TWO_PRIMARIES) { 328 + /* Maybe the peer is detected as dead very soon... 329 + retry at most once more in this case. 
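The retry loop here leans on drbd_try_outdate_peer() above, which boils the fence-peer helper down to its exit-status byte. A minimal userspace decoder of that convention, mirroring the switch above; the demo_* name is invented:

#include <stdio.h>

/* Model of how drbd_try_outdate_peer() reads the fence-peer helper's
 * result: the interesting exit code lives in bits 8..15 of the status. */
static const char *demo_fence_result(int status)
{
	switch ((status >> 8) & 0xff) {
	case 3: return "peer is inconsistent or worse";
	case 4: return "peer was fenced";
	case 5: return "peer was down";
	case 6: return "peer is active";
	case 7: return "peer was stonithed";
	default: return "fence-peer helper broken";
	}
}

int main(void)
{
	printf("%s\n", demo_fence_result(4 << 8));  /* "peer was fenced" */
	return 0;
}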
*/ 330 + __set_current_state(TASK_INTERRUPTIBLE); 331 + schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10); 332 + if (try < max_tries) 333 + try = max_tries - 1; 334 + continue; 335 + } 336 + if (r < SS_SUCCESS) { 337 + r = _drbd_request_state(mdev, mask, val, 338 + CS_VERBOSE + CS_WAIT_COMPLETE); 339 + if (r < SS_SUCCESS) 340 + goto fail; 341 + } 342 + break; 343 + } 344 + 345 + if (r < SS_SUCCESS) 346 + goto fail; 347 + 348 + if (forced) 349 + dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); 350 + 351 + /* Wait until nothing is on the fly :) */ 352 + wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); 353 + 354 + if (new_role == R_SECONDARY) { 355 + set_disk_ro(mdev->vdisk, TRUE); 356 + if (get_ldev(mdev)) { 357 + mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; 358 + put_ldev(mdev); 359 + } 360 + } else { 361 + if (get_net_conf(mdev)) { 362 + mdev->net_conf->want_lose = 0; 363 + put_net_conf(mdev); 364 + } 365 + set_disk_ro(mdev->vdisk, FALSE); 366 + if (get_ldev(mdev)) { 367 + if (((mdev->state.conn < C_CONNECTED || 368 + mdev->state.pdsk <= D_FAILED) 369 + && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced) 370 + drbd_uuid_new_current(mdev); 371 + 372 + mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; 373 + put_ldev(mdev); 374 + } 375 + } 376 + 377 + if ((new_role == R_SECONDARY) && get_ldev(mdev)) { 378 + drbd_al_to_on_disk_bm(mdev); 379 + put_ldev(mdev); 380 + } 381 + 382 + if (mdev->state.conn >= C_WF_REPORT_PARAMS) { 383 + /* if this was forced, we should consider sync */ 384 + if (forced) 385 + drbd_send_uuids(mdev); 386 + drbd_send_state(mdev); 387 + } 388 + 389 + drbd_md_sync(mdev); 390 + 391 + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); 392 + fail: 393 + mutex_unlock(&mdev->state_mutex); 394 + return r; 395 + } 396 + 397 + 398 + static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 399 + struct drbd_nl_cfg_reply *reply) 400 + { 401 + struct primary primary_args; 402 + 403 + memset(&primary_args, 0, sizeof(struct primary)); 404 + if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) { 405 + reply->ret_code = ERR_MANDATORY_TAG; 406 + return 0; 407 + } 408 + 409 + reply->ret_code = 410 + drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer); 411 + 412 + return 0; 413 + } 414 + 415 + static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 416 + struct drbd_nl_cfg_reply *reply) 417 + { 418 + reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0); 419 + 420 + return 0; 421 + } 422 + 423 + /* initializes the md.*_offset members, so we are able to find 424 + * the on disk meta data */ 425 + static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, 426 + struct drbd_backing_dev *bdev) 427 + { 428 + sector_t md_size_sect = 0; 429 + switch (bdev->dc.meta_dev_idx) { 430 + default: 431 + /* v07 style fixed size indexed meta data */ 432 + bdev->md.md_size_sect = MD_RESERVED_SECT; 433 + bdev->md.md_offset = drbd_md_ss__(mdev, bdev); 434 + bdev->md.al_offset = MD_AL_OFFSET; 435 + bdev->md.bm_offset = MD_BM_OFFSET; 436 + break; 437 + case DRBD_MD_INDEX_FLEX_EXT: 438 + /* just occupy the full device; unit: sectors */ 439 + bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); 440 + bdev->md.md_offset = 0; 441 + bdev->md.al_offset = MD_AL_OFFSET; 442 + bdev->md.bm_offset = MD_BM_OFFSET; 443 + break; 444 + case DRBD_MD_INDEX_INTERNAL: 445 + case DRBD_MD_INDEX_FLEX_INT: 446 + bdev->md.md_offset = drbd_md_ss__(mdev, bdev); 447 + /* al size is still fixed */ 448 + bdev->md.al_offset = 
-MD_AL_MAX_SIZE; 449 + /* we need (slightly less than) ~ this much bitmap sectors: */ 450 + md_size_sect = drbd_get_capacity(bdev->backing_bdev); 451 + md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); 452 + md_size_sect = BM_SECT_TO_EXT(md_size_sect); 453 + md_size_sect = ALIGN(md_size_sect, 8); 454 + 455 + /* plus the "drbd meta data super block", 456 + * and the activity log; */ 457 + md_size_sect += MD_BM_OFFSET; 458 + 459 + bdev->md.md_size_sect = md_size_sect; 460 + /* bitmap offset is adjusted by 'super' block size */ 461 + bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; 462 + break; 463 + } 464 + } 465 + 466 + char *ppsize(char *buf, unsigned long long size) 467 + { 468 + /* Needs 9 bytes at max. */ 469 + static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' }; 470 + int base = 0; 471 + while (size >= 10000) { 472 + /* shift + round */ 473 + size = (size >> 10) + !!(size & (1<<9)); 474 + base++; 475 + } 476 + sprintf(buf, "%lu %cB", (long)size, units[base]); 477 + 478 + return buf; 479 + } 480 + 481 + /* there is still a theoretical deadlock when called from receiver 482 + * on an D_INCONSISTENT R_PRIMARY: 483 + * remote READ does inc_ap_bio, receiver would need to receive answer 484 + * packet from remote to dec_ap_bio again. 485 + * receiver receive_sizes(), comes here, 486 + * waits for ap_bio_cnt == 0. -> deadlock. 487 + * but this cannot happen, actually, because: 488 + * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable 489 + * (not connected, or bad/no disk on peer): 490 + * see drbd_fail_request_early, ap_bio_cnt is zero. 491 + * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: 492 + * peer may not initiate a resize. 493 + */ 494 + void drbd_suspend_io(struct drbd_conf *mdev) 495 + { 496 + set_bit(SUSPEND_IO, &mdev->flags); 497 + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); 498 + } 499 + 500 + void drbd_resume_io(struct drbd_conf *mdev) 501 + { 502 + clear_bit(SUSPEND_IO, &mdev->flags); 503 + wake_up(&mdev->misc_wait); 504 + } 505 + 506 + /** 507 + * drbd_determine_dev_size() - Sets the right device size obeying all constraints 508 + * @mdev: DRBD device. 509 + * 510 + * Returns 0 on success, negative return values indicate errors. 511 + * You should call drbd_md_sync() after calling this function. 512 + */ 513 + enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local) 514 + { 515 + sector_t prev_first_sect, prev_size; /* previous meta location */ 516 + sector_t la_size; 517 + sector_t size; 518 + char ppb[10]; 519 + 520 + int md_moved, la_size_changed; 521 + enum determine_dev_size rv = unchanged; 522 + 523 + /* race: 524 + * application request passes inc_ap_bio, 525 + * but then cannot get an AL-reference. 526 + * this function later may wait on ap_bio_cnt == 0. -> deadlock. 527 + * 528 + * to avoid that: 529 + * Suspend IO right here. 530 + * still lock the act_log to not trigger ASSERTs there. 531 + */ 532 + drbd_suspend_io(mdev); 533 + 534 + /* no wait necessary anymore, actually we could assert that */ 535 + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); 536 + 537 + prev_first_sect = drbd_md_first_sector(mdev->ldev); 538 + prev_size = mdev->ldev->md.md_size_sect; 539 + la_size = mdev->ldev->md.la_size_sect; 540 + 541 + /* TODO: should only be some assert here, not (re)init... 
*/ 542 + drbd_md_set_sector_offsets(mdev, mdev->ldev); 543 + 544 + size = drbd_new_dev_size(mdev, mdev->ldev); 545 + 546 + if (drbd_get_capacity(mdev->this_bdev) != size || 547 + drbd_bm_capacity(mdev) != size) { 548 + int err; 549 + err = drbd_bm_resize(mdev, size); 550 + if (unlikely(err)) { 551 + /* currently there is only one error: ENOMEM! */ 552 + size = drbd_bm_capacity(mdev)>>1; 553 + if (size == 0) { 554 + dev_err(DEV, "OUT OF MEMORY! " 555 + "Could not allocate bitmap!\n"); 556 + } else { 557 + dev_err(DEV, "BM resizing failed. " 558 + "Leaving size unchanged at size = %lu KB\n", 559 + (unsigned long)size); 560 + } 561 + rv = dev_size_error; 562 + } 563 + /* racy, see comments above. */ 564 + drbd_set_my_capacity(mdev, size); 565 + mdev->ldev->md.la_size_sect = size; 566 + dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), 567 + (unsigned long long)size>>1); 568 + } 569 + if (rv == dev_size_error) 570 + goto out; 571 + 572 + la_size_changed = (la_size != mdev->ldev->md.la_size_sect); 573 + 574 + md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) 575 + || prev_size != mdev->ldev->md.md_size_sect; 576 + 577 + if (la_size_changed || md_moved) { 578 + drbd_al_shrink(mdev); /* All extents inactive. */ 579 + dev_info(DEV, "Writing the whole bitmap, %s\n", 580 + la_size_changed && md_moved ? "size changed and md moved" : 581 + la_size_changed ? "size changed" : "md moved"); 582 + rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */ 583 + drbd_md_mark_dirty(mdev); 584 + } 585 + 586 + if (size > la_size) 587 + rv = grew; 588 + if (size < la_size) 589 + rv = shrunk; 590 + out: 591 + lc_unlock(mdev->act_log); 592 + wake_up(&mdev->al_wait); 593 + drbd_resume_io(mdev); 594 + 595 + return rv; 596 + } 597 + 598 + sector_t 599 + drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) 600 + { 601 + sector_t p_size = mdev->p_size; /* partner's disk size. */ 602 + sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ 603 + sector_t m_size; /* my size */ 604 + sector_t u_size = bdev->dc.disk_size; /* size requested by user. */ 605 + sector_t size = 0; 606 + 607 + m_size = drbd_get_max_capacity(bdev); 608 + 609 + if (p_size && m_size) { 610 + size = min_t(sector_t, p_size, m_size); 611 + } else { 612 + if (la_size) { 613 + size = la_size; 614 + if (m_size && m_size < size) 615 + size = m_size; 616 + if (p_size && p_size < size) 617 + size = p_size; 618 + } else { 619 + if (m_size) 620 + size = m_size; 621 + if (p_size) 622 + size = p_size; 623 + } 624 + } 625 + 626 + if (size == 0) 627 + dev_err(DEV, "Both nodes diskless!\n"); 628 + 629 + if (u_size) { 630 + if (u_size > size) 631 + dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n", 632 + (unsigned long)u_size>>1, (unsigned long)size>>1); 633 + else 634 + size = u_size; 635 + } 636 + 637 + return size; 638 + } 639 + 640 + /** 641 + * drbd_check_al_size() - Ensures that the AL is of the right size 642 + * @mdev: DRBD device. 643 + * 644 + * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation 645 + * failed, and 0 on success. You should call drbd_md_sync() after you called 646 + * this function. 
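drbd_new_dev_size() above is pure arithmetic, so it is easy to model in isolation: with both disk sizes known, take the minimum; otherwise fall back to the last agreed size; and an explicit user setting may only ever shrink the result. A standalone sketch of that negotiation, with invented demo_* names:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

static sector_t demo_new_dev_size(sector_t p_size, sector_t m_size,
				  sector_t la_size, sector_t u_size)
{
	sector_t size;

	if (p_size && m_size) {
		size = p_size < m_size ? p_size : m_size;
	} else if (la_size) {
		size = la_size;
		if (m_size && m_size < size)
			size = m_size;
		if (p_size && p_size < size)
			size = p_size;
	} else {
		size = m_size ? m_size : p_size;
	}

	if (u_size && u_size <= size)   /* a bigger request is rejected above */
		size = u_size;
	return size;
}

int main(void)
{
	printf("%" PRIu64 "\n", demo_new_dev_size(2048, 4096, 0, 0)); /* 2048 */
	return 0;
}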
647 + */ 648 + static int drbd_check_al_size(struct drbd_conf *mdev) 649 + { 650 + struct lru_cache *n, *t; 651 + struct lc_element *e; 652 + unsigned int in_use; 653 + int i; 654 + 655 + ERR_IF(mdev->sync_conf.al_extents < 7) 656 + mdev->sync_conf.al_extents = 127; 657 + 658 + if (mdev->act_log && 659 + mdev->act_log->nr_elements == mdev->sync_conf.al_extents) 660 + return 0; 661 + 662 + in_use = 0; 663 + t = mdev->act_log; 664 + n = lc_create("act_log", drbd_al_ext_cache, 665 + mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); 666 + 667 + if (n == NULL) { 668 + dev_err(DEV, "Cannot allocate act_log lru!\n"); 669 + return -ENOMEM; 670 + } 671 + spin_lock_irq(&mdev->al_lock); 672 + if (t) { 673 + for (i = 0; i < t->nr_elements; i++) { 674 + e = lc_element_by_index(t, i); 675 + if (e->refcnt) 676 + dev_err(DEV, "refcnt(%d)==%d\n", 677 + e->lc_number, e->refcnt); 678 + in_use += e->refcnt; 679 + } 680 + } 681 + if (!in_use) 682 + mdev->act_log = n; 683 + spin_unlock_irq(&mdev->al_lock); 684 + if (in_use) { 685 + dev_err(DEV, "Activity log still in use!\n"); 686 + lc_destroy(n); 687 + return -EBUSY; 688 + } else { 689 + if (t) 690 + lc_destroy(t); 691 + } 692 + drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */ 693 + return 0; 694 + } 695 + 696 + void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local) 697 + { 698 + struct request_queue * const q = mdev->rq_queue; 699 + struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; 700 + int max_segments = mdev->ldev->dc.max_bio_bvecs; 701 + 702 + if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv) 703 + max_seg_s = PAGE_SIZE; 704 + 705 + max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); 706 + 707 + blk_queue_max_sectors(q, max_seg_s >> 9); 708 + blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS); 709 + blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS); 710 + blk_queue_max_segment_size(q, max_seg_s); 711 + blk_queue_logical_block_size(q, 512); 712 + blk_queue_segment_boundary(q, PAGE_SIZE-1); 713 + blk_stack_limits(&q->limits, &b->limits, 0); 714 + 715 + if (b->merge_bvec_fn) 716 + dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n", 717 + b->merge_bvec_fn); 718 + dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); 719 + 720 + if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { 721 + dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", 722 + q->backing_dev_info.ra_pages, 723 + b->backing_dev_info.ra_pages); 724 + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; 725 + } 726 + } 727 + 728 + /* serialize deconfig (worker exiting, doing cleanup) 729 + * and reconfig (drbdsetup disk, drbdsetup net) 730 + * 731 + * wait for a potentially exiting worker, then restart it, 732 + * or start a new one. 733 + */ 734 + static void drbd_reconfig_start(struct drbd_conf *mdev) 735 + { 736 + wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); 737 + wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); 738 + drbd_thread_start(&mdev->worker); 739 + } 740 + 741 + /* if still unconfigured, stops worker again. 742 + * if configured now, clears CONFIG_PENDING. 
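The resize in drbd_check_al_size() above follows a build-then-swap discipline: the replacement cache is allocated up front, and under the lock it is either installed or the whole operation fails with -EBUSY because an extent is still referenced, so nothing is ever pulled out from under a user. A condensed userspace model of just the swap decision; the demo_* names are invented:

#include <stdio.h>

struct demo_lru { unsigned int nr_elements; unsigned int pinned; };

static int demo_swap_lru(struct demo_lru **cur, struct demo_lru *replacement)
{
	struct demo_lru *old = *cur;

	if (old && old->pinned)
		return -1;        /* -EBUSY: caller destroys 'replacement' */
	*cur = replacement;       /* caller destroys 'old' */
	return 0;
}

int main(void)
{
	struct demo_lru old_al = { 127, 0 }, new_al = { 257, 0 };
	struct demo_lru *act_log = &old_al;

	if (demo_swap_lru(&act_log, &new_al) == 0)
		printf("act_log now has %u extents\n", act_log->nr_elements);
	return 0;
}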
743 + * wakes potential waiters */ 744 + static void drbd_reconfig_done(struct drbd_conf *mdev) 745 + { 746 + spin_lock_irq(&mdev->req_lock); 747 + if (mdev->state.disk == D_DISKLESS && 748 + mdev->state.conn == C_STANDALONE && 749 + mdev->state.role == R_SECONDARY) { 750 + set_bit(DEVICE_DYING, &mdev->flags); 751 + drbd_thread_stop_nowait(&mdev->worker); 752 + } else 753 + clear_bit(CONFIG_PENDING, &mdev->flags); 754 + spin_unlock_irq(&mdev->req_lock); 755 + wake_up(&mdev->state_wait); 756 + } 757 + 758 + /* does always return 0; 759 + * interesting return code is in reply->ret_code */ 760 + static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 761 + struct drbd_nl_cfg_reply *reply) 762 + { 763 + enum drbd_ret_codes retcode; 764 + enum determine_dev_size dd; 765 + sector_t max_possible_sectors; 766 + sector_t min_md_device_sectors; 767 + struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ 768 + struct inode *inode, *inode2; 769 + struct lru_cache *resync_lru = NULL; 770 + union drbd_state ns, os; 771 + int rv; 772 + int cp_discovered = 0; 773 + int logical_block_size; 774 + 775 + drbd_reconfig_start(mdev); 776 + 777 + /* if you want to reconfigure, please tear down first */ 778 + if (mdev->state.disk > D_DISKLESS) { 779 + retcode = ERR_DISK_CONFIGURED; 780 + goto fail; 781 + } 782 + 783 + /* allocation not in the IO path, cqueue thread context */ 784 + nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); 785 + if (!nbc) { 786 + retcode = ERR_NOMEM; 787 + goto fail; 788 + } 789 + 790 + nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; 791 + nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; 792 + nbc->dc.fencing = DRBD_FENCING_DEF; 793 + nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF; 794 + 795 + if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) { 796 + retcode = ERR_MANDATORY_TAG; 797 + goto fail; 798 + } 799 + 800 + if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { 801 + retcode = ERR_MD_IDX_INVALID; 802 + goto fail; 803 + } 804 + 805 + nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); 806 + if (IS_ERR(nbc->lo_file)) { 807 + dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, 808 + PTR_ERR(nbc->lo_file)); 809 + nbc->lo_file = NULL; 810 + retcode = ERR_OPEN_DISK; 811 + goto fail; 812 + } 813 + 814 + inode = nbc->lo_file->f_dentry->d_inode; 815 + 816 + if (!S_ISBLK(inode->i_mode)) { 817 + retcode = ERR_DISK_NOT_BDEV; 818 + goto fail; 819 + } 820 + 821 + nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0); 822 + if (IS_ERR(nbc->md_file)) { 823 + dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, 824 + PTR_ERR(nbc->md_file)); 825 + nbc->md_file = NULL; 826 + retcode = ERR_OPEN_MD_DISK; 827 + goto fail; 828 + } 829 + 830 + inode2 = nbc->md_file->f_dentry->d_inode; 831 + 832 + if (!S_ISBLK(inode2->i_mode)) { 833 + retcode = ERR_MD_NOT_BDEV; 834 + goto fail; 835 + } 836 + 837 + nbc->backing_bdev = inode->i_bdev; 838 + if (bd_claim(nbc->backing_bdev, mdev)) { 839 + printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", 840 + nbc->backing_bdev, mdev, 841 + nbc->backing_bdev->bd_holder, 842 + nbc->backing_bdev->bd_contains->bd_holder, 843 + nbc->backing_bdev->bd_holders); 844 + retcode = ERR_BDCLAIM_DISK; 845 + goto fail; 846 + } 847 + 848 + resync_lru = lc_create("resync", drbd_bm_ext_cache, 849 + 61, sizeof(struct bm_extent), 850 + offsetof(struct bm_extent, lce)); 851 + if (!resync_lru) { 852 + retcode = ERR_NOMEM; 853 + goto release_bdev_fail; 854 + } 855 + 856 + /* meta_dev_idx >= 0: external fixed size, 
857 + * possibly multiple drbd sharing one meta device. 858 + * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is 859 + * not yet used by some other drbd minor! 860 + * (if you use drbd.conf + drbdadm, 861 + * that should check it for you already; but if you don't, or someone 862 + * fooled it, we need to double check here) */ 863 + nbc->md_bdev = inode2->i_bdev; 864 + if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev 865 + : (void *) drbd_m_holder)) { 866 + retcode = ERR_BDCLAIM_MD_DISK; 867 + goto release_bdev_fail; 868 + } 869 + 870 + if ((nbc->backing_bdev == nbc->md_bdev) != 871 + (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || 872 + nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { 873 + retcode = ERR_MD_IDX_INVALID; 874 + goto release_bdev2_fail; 875 + } 876 + 877 + /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ 878 + drbd_md_set_sector_offsets(mdev, nbc); 879 + 880 + if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { 881 + dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", 882 + (unsigned long long) drbd_get_max_capacity(nbc), 883 + (unsigned long long) nbc->dc.disk_size); 884 + retcode = ERR_DISK_TO_SMALL; 885 + goto release_bdev2_fail; 886 + } 887 + 888 + if (nbc->dc.meta_dev_idx < 0) { 889 + max_possible_sectors = DRBD_MAX_SECTORS_FLEX; 890 + /* at least one MB, otherwise it does not make sense */ 891 + min_md_device_sectors = (2<<10); 892 + } else { 893 + max_possible_sectors = DRBD_MAX_SECTORS; 894 + min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); 895 + } 896 + 897 + if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { 898 + retcode = ERR_MD_DISK_TO_SMALL; 899 + dev_warn(DEV, "refusing attach: md-device too small, " 900 + "at least %llu sectors needed for this meta-disk type\n", 901 + (unsigned long long) min_md_device_sectors); 902 + goto release_bdev2_fail; 903 + } 904 + 905 + /* Make sure the new disk is big enough 906 + * (we may currently be R_PRIMARY with no local disk...) */ 907 + if (drbd_get_max_capacity(nbc) < 908 + drbd_get_capacity(mdev->this_bdev)) { 909 + retcode = ERR_DISK_TO_SMALL; 910 + goto release_bdev2_fail; 911 + } 912 + 913 + nbc->known_size = drbd_get_capacity(nbc->backing_bdev); 914 + 915 + if (nbc->known_size > max_possible_sectors) { 916 + dev_warn(DEV, "==> truncating very big lower level device " 917 + "to currently maximum possible %llu sectors <==\n", 918 + (unsigned long long) max_possible_sectors); 919 + if (nbc->dc.meta_dev_idx >= 0) 920 + dev_warn(DEV, "==>> using internal or flexible " 921 + "meta data may help <<==\n"); 922 + } 923 + 924 + drbd_suspend_io(mdev); 925 + /* also wait for the last barrier ack. 
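The md-device size check above works out as follows: flexible or internal meta data needs at least 1 MB, while external indexed meta data reserves one fixed-size slot per index, so index N requires N+1 slots. A sketch of that arithmetic; DEMO_MD_RESERVED_SECT assumes the customary 128 MB slot, the real MD_RESERVED_SECT lives in the DRBD headers:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

#define DEMO_MD_RESERVED_SECT (128ULL << 11)   /* 128 MB of 512-byte sectors */

static sector_t demo_min_md_sectors(int meta_dev_idx)
{
	if (meta_dev_idx < 0)
		return 2 << 10;                 /* 2048 sectors = 1 MB */
	return DEMO_MD_RESERVED_SECT * (meta_dev_idx + 1);
}

int main(void)
{
	printf("index 3 needs %" PRIu64 " sectors\n", demo_min_md_sectors(3));
	return 0;
}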
*/ 926 + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); 927 + /* and for any other previously queued work */ 928 + drbd_flush_workqueue(mdev); 929 + 930 + retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); 931 + drbd_resume_io(mdev); 932 + if (retcode < SS_SUCCESS) 933 + goto release_bdev2_fail; 934 + 935 + if (!get_ldev_if_state(mdev, D_ATTACHING)) 936 + goto force_diskless; 937 + 938 + drbd_md_set_sector_offsets(mdev, nbc); 939 + 940 + if (!mdev->bitmap) { 941 + if (drbd_bm_init(mdev)) { 942 + retcode = ERR_NOMEM; 943 + goto force_diskless_dec; 944 + } 945 + } 946 + 947 + retcode = drbd_md_read(mdev, nbc); 948 + if (retcode != NO_ERROR) 949 + goto force_diskless_dec; 950 + 951 + if (mdev->state.conn < C_CONNECTED && 952 + mdev->state.role == R_PRIMARY && 953 + (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { 954 + dev_err(DEV, "Can only attach to data with current UUID=%016llX\n", 955 + (unsigned long long)mdev->ed_uuid); 956 + retcode = ERR_DATA_NOT_CURRENT; 957 + goto force_diskless_dec; 958 + } 959 + 960 + /* Since we are diskless, fix the activity log first... */ 961 + if (drbd_check_al_size(mdev)) { 962 + retcode = ERR_NOMEM; 963 + goto force_diskless_dec; 964 + } 965 + 966 + /* Prevent shrinking of consistent devices ! */ 967 + if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && 968 + drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) { 969 + dev_warn(DEV, "refusing to truncate a consistent device\n"); 970 + retcode = ERR_DISK_TO_SMALL; 971 + goto force_diskless_dec; 972 + } 973 + 974 + if (!drbd_al_read_log(mdev, nbc)) { 975 + retcode = ERR_IO_MD_DISK; 976 + goto force_diskless_dec; 977 + } 978 + 979 + /* allocate a second IO page if logical_block_size != 512 */ 980 + logical_block_size = bdev_logical_block_size(nbc->md_bdev); 981 + if (logical_block_size == 0) 982 + logical_block_size = MD_SECTOR_SIZE; 983 + 984 + if (logical_block_size != MD_SECTOR_SIZE) { 985 + if (!mdev->md_io_tmpp) { 986 + struct page *page = alloc_page(GFP_NOIO); 987 + if (!page) 988 + goto force_diskless_dec; 989 + 990 + dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", 991 + logical_block_size, MD_SECTOR_SIZE); 992 + dev_warn(DEV, "Workaround engaged (has performance impact).\n"); 993 + 994 + mdev->md_io_tmpp = page; 995 + } 996 + } 997 + 998 + /* Reset the "barriers don't work" bits here, then force meta data to 999 + * be written, to ensure we determine if barriers are supported. */ 1000 + if (nbc->dc.no_md_flush) 1001 + set_bit(MD_NO_BARRIER, &mdev->flags); 1002 + else 1003 + clear_bit(MD_NO_BARRIER, &mdev->flags); 1004 + 1005 + /* Point of no return reached. 1006 + * Devices and memory are no longer released by error cleanup below. 1007 + * now mdev takes over responsibility, and the state engine should 1008 + * clean it up somewhere. 
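The logical_block_size workaround above sets aside one scratch page when the meta-data device cannot do 512-byte IO, turning every super-block update into a read-modify-write through it, hence the performance warning. A userspace model of the allocate-once decision, with malloc() standing in for alloc_page(GFP_NOIO) and invented demo_* names:

#include <stdio.h>
#include <stdlib.h>

#define DEMO_MD_SECTOR_SIZE 512

static void *demo_md_bounce(int logical_block_size, void **tmpp)
{
	if (logical_block_size == DEMO_MD_SECTOR_SIZE)
		return NULL;                      /* direct 512-byte IO is fine */
	if (!*tmpp)
		*tmpp = malloc(logical_block_size);
	return *tmpp;
}

int main(void)
{
	void *tmpp = NULL;

	if (demo_md_bounce(4096, &tmpp))
		puts("bounce buffer engaged (4k meta-data device)");
	free(tmpp);
	return 0;
}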
*/ 1009 + D_ASSERT(mdev->ldev == NULL); 1010 + mdev->ldev = nbc; 1011 + mdev->resync = resync_lru; 1012 + nbc = NULL; 1013 + resync_lru = NULL; 1014 + 1015 + mdev->write_ordering = WO_bio_barrier; 1016 + drbd_bump_write_ordering(mdev, WO_bio_barrier); 1017 + 1018 + if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) 1019 + set_bit(CRASHED_PRIMARY, &mdev->flags); 1020 + else 1021 + clear_bit(CRASHED_PRIMARY, &mdev->flags); 1022 + 1023 + if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) { 1024 + set_bit(CRASHED_PRIMARY, &mdev->flags); 1025 + cp_discovered = 1; 1026 + } 1027 + 1028 + mdev->send_cnt = 0; 1029 + mdev->recv_cnt = 0; 1030 + mdev->read_cnt = 0; 1031 + mdev->writ_cnt = 0; 1032 + 1033 + drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); 1034 + 1035 + /* If I am currently not R_PRIMARY, 1036 + * but meta data primary indicator is set, 1037 + * I just now recover from a hard crash, 1038 + * and have been R_PRIMARY before that crash. 1039 + * 1040 + * Now, if I had no connection before that crash 1041 + * (have been degraded R_PRIMARY), chances are that 1042 + * I won't find my peer now either. 1043 + * 1044 + * In that case, and _only_ in that case, 1045 + * we use the degr-wfc-timeout instead of the default, 1046 + * so we can automatically recover from a crash of a 1047 + * degraded but active "cluster" after a certain timeout. 1048 + */ 1049 + clear_bit(USE_DEGR_WFC_T, &mdev->flags); 1050 + if (mdev->state.role != R_PRIMARY && 1051 + drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && 1052 + !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) 1053 + set_bit(USE_DEGR_WFC_T, &mdev->flags); 1054 + 1055 + dd = drbd_determin_dev_size(mdev); 1056 + if (dd == dev_size_error) { 1057 + retcode = ERR_NOMEM_BITMAP; 1058 + goto force_diskless_dec; 1059 + } else if (dd == grew) 1060 + set_bit(RESYNC_AFTER_NEG, &mdev->flags); 1061 + 1062 + if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { 1063 + dev_info(DEV, "Assuming that all blocks are out of sync " 1064 + "(aka FullSync)\n"); 1065 + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) { 1066 + retcode = ERR_IO_MD_DISK; 1067 + goto force_diskless_dec; 1068 + } 1069 + } else { 1070 + if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) { 1071 + retcode = ERR_IO_MD_DISK; 1072 + goto force_diskless_dec; 1073 + } 1074 + } 1075 + 1076 + if (cp_discovered) { 1077 + drbd_al_apply_to_bm(mdev); 1078 + drbd_al_to_on_disk_bm(mdev); 1079 + } 1080 + 1081 + spin_lock_irq(&mdev->req_lock); 1082 + os = mdev->state; 1083 + ns.i = os.i; 1084 + /* If MDF_CONSISTENT is not set go into inconsistent state, 1085 + otherwise investigate MDF_WasUpToDate... 1086 + If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, 1087 + otherwise into D_CONSISTENT state. 1088 + */ 1089 + if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) { 1090 + if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE)) 1091 + ns.disk = D_CONSISTENT; 1092 + else 1093 + ns.disk = D_OUTDATED; 1094 + } else { 1095 + ns.disk = D_INCONSISTENT; 1096 + } 1097 + 1098 + if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) 1099 + ns.pdsk = D_OUTDATED; 1100 + 1101 + if ( ns.disk == D_CONSISTENT && 1102 + (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE)) 1103 + ns.disk = D_UP_TO_DATE; 1104 + 1105 + /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, 1106 + MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before 1107 + this point, because drbd_request_state() modifies these 1108 + flags. 
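The disk-state decision above reduces to three meta-data flags: without MDF_CONSISTENT the disk is Inconsistent; with it, MDF_WAS_UP_TO_DATE picks Consistent over Outdated; and Consistent is promoted to UpToDate once the peer is known to be outdated or fencing is dont-care. A standalone model of that logic; the bit positions and demo_* names here are illustrative, the real MDF_* values live in the DRBD headers:

#include <stdio.h>

enum demo_disk { D_INCONSISTENT, D_OUTDATED, D_CONSISTENT, D_UP_TO_DATE };

#define DEMO_MDF_CONSISTENT     (1 << 0)
#define DEMO_MDF_WAS_UP_TO_DATE (1 << 1)
#define DEMO_MDF_PEER_OUT_DATED (1 << 2)

static enum demo_disk demo_disk_state(unsigned int flags, int fencing_dont_care)
{
	enum demo_disk disk;

	if (!(flags & DEMO_MDF_CONSISTENT))
		return D_INCONSISTENT;
	disk = (flags & DEMO_MDF_WAS_UP_TO_DATE) ? D_CONSISTENT : D_OUTDATED;
	if (disk == D_CONSISTENT &&
	    ((flags & DEMO_MDF_PEER_OUT_DATED) || fencing_dont_care))
		disk = D_UP_TO_DATE;
	return disk;
}

int main(void)
{
	printf("%d\n", demo_disk_state(DEMO_MDF_CONSISTENT |
				       DEMO_MDF_WAS_UP_TO_DATE |
				       DEMO_MDF_PEER_OUT_DATED, 0)); /* 3 = UpToDate */
	return 0;
}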
*/ 1109 + 1110 + /* In case we are C_CONNECTED postpone any decision on the new disk 1111 + state after the negotiation phase. */ 1112 + if (mdev->state.conn == C_CONNECTED) { 1113 + mdev->new_state_tmp.i = ns.i; 1114 + ns.i = os.i; 1115 + ns.disk = D_NEGOTIATING; 1116 + } 1117 + 1118 + rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); 1119 + ns = mdev->state; 1120 + spin_unlock_irq(&mdev->req_lock); 1121 + 1122 + if (rv < SS_SUCCESS) 1123 + goto force_diskless_dec; 1124 + 1125 + if (mdev->state.role == R_PRIMARY) 1126 + mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; 1127 + else 1128 + mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; 1129 + 1130 + drbd_md_mark_dirty(mdev); 1131 + drbd_md_sync(mdev); 1132 + 1133 + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); 1134 + put_ldev(mdev); 1135 + reply->ret_code = retcode; 1136 + drbd_reconfig_done(mdev); 1137 + return 0; 1138 + 1139 + force_diskless_dec: 1140 + put_ldev(mdev); 1141 + force_diskless: 1142 + drbd_force_state(mdev, NS(disk, D_DISKLESS)); 1143 + drbd_md_sync(mdev); 1144 + release_bdev2_fail: 1145 + if (nbc) 1146 + bd_release(nbc->md_bdev); 1147 + release_bdev_fail: 1148 + if (nbc) 1149 + bd_release(nbc->backing_bdev); 1150 + fail: 1151 + if (nbc) { 1152 + if (nbc->lo_file) 1153 + fput(nbc->lo_file); 1154 + if (nbc->md_file) 1155 + fput(nbc->md_file); 1156 + kfree(nbc); 1157 + } 1158 + lc_destroy(resync_lru); 1159 + 1160 + reply->ret_code = retcode; 1161 + drbd_reconfig_done(mdev); 1162 + return 0; 1163 + } 1164 + 1165 + static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1166 + struct drbd_nl_cfg_reply *reply) 1167 + { 1168 + reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); 1169 + return 0; 1170 + } 1171 + 1172 + static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1173 + struct drbd_nl_cfg_reply *reply) 1174 + { 1175 + int i, ns; 1176 + enum drbd_ret_codes retcode; 1177 + struct net_conf *new_conf = NULL; 1178 + struct crypto_hash *tfm = NULL; 1179 + struct crypto_hash *integrity_w_tfm = NULL; 1180 + struct crypto_hash *integrity_r_tfm = NULL; 1181 + struct hlist_head *new_tl_hash = NULL; 1182 + struct hlist_head *new_ee_hash = NULL; 1183 + struct drbd_conf *odev; 1184 + char hmac_name[CRYPTO_MAX_ALG_NAME]; 1185 + void *int_dig_out = NULL; 1186 + void *int_dig_in = NULL; 1187 + void *int_dig_vv = NULL; 1188 + struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; 1189 + 1190 + drbd_reconfig_start(mdev); 1191 + 1192 + if (mdev->state.conn > C_STANDALONE) { 1193 + retcode = ERR_NET_CONFIGURED; 1194 + goto fail; 1195 + } 1196 + 1197 + /* allocation not in the IO path, cqueue thread context */ 1198 + new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); 1199 + if (!new_conf) { 1200 + retcode = ERR_NOMEM; 1201 + goto fail; 1202 + } 1203 + 1204 + memset(new_conf, 0, sizeof(struct net_conf)); 1205 + new_conf->timeout = DRBD_TIMEOUT_DEF; 1206 + new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; 1207 + new_conf->ping_int = DRBD_PING_INT_DEF; 1208 + new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; 1209 + new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; 1210 + new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; 1211 + new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; 1212 + new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; 1213 + new_conf->ko_count = DRBD_KO_COUNT_DEF; 1214 + new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; 1215 + new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; 1216 + new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; 1217 + new_conf->want_lose = 0; 1218 + 
new_conf->two_primaries = 0; 1219 + new_conf->wire_protocol = DRBD_PROT_C; 1220 + new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; 1221 + new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; 1222 + 1223 + if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { 1224 + retcode = ERR_MANDATORY_TAG; 1225 + goto fail; 1226 + } 1227 + 1228 + if (new_conf->two_primaries 1229 + && (new_conf->wire_protocol != DRBD_PROT_C)) { 1230 + retcode = ERR_NOT_PROTO_C; 1231 + goto fail; 1232 + }; 1233 + 1234 + if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { 1235 + retcode = ERR_DISCARD; 1236 + goto fail; 1237 + } 1238 + 1239 + retcode = NO_ERROR; 1240 + 1241 + new_my_addr = (struct sockaddr *)&new_conf->my_addr; 1242 + new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; 1243 + for (i = 0; i < minor_count; i++) { 1244 + odev = minor_to_mdev(i); 1245 + if (!odev || odev == mdev) 1246 + continue; 1247 + if (get_net_conf(odev)) { 1248 + taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; 1249 + if (new_conf->my_addr_len == odev->net_conf->my_addr_len && 1250 + !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) 1251 + retcode = ERR_LOCAL_ADDR; 1252 + 1253 + taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; 1254 + if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && 1255 + !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) 1256 + retcode = ERR_PEER_ADDR; 1257 + 1258 + put_net_conf(odev); 1259 + if (retcode != NO_ERROR) 1260 + goto fail; 1261 + } 1262 + } 1263 + 1264 + if (new_conf->cram_hmac_alg[0] != 0) { 1265 + snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", 1266 + new_conf->cram_hmac_alg); 1267 + tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); 1268 + if (IS_ERR(tfm)) { 1269 + tfm = NULL; 1270 + retcode = ERR_AUTH_ALG; 1271 + goto fail; 1272 + } 1273 + 1274 + if (crypto_tfm_alg_type(crypto_hash_tfm(tfm)) 1275 + != CRYPTO_ALG_TYPE_HASH) { 1276 + retcode = ERR_AUTH_ALG_ND; 1277 + goto fail; 1278 + } 1279 + } 1280 + 1281 + if (new_conf->integrity_alg[0]) { 1282 + integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); 1283 + if (IS_ERR(integrity_w_tfm)) { 1284 + integrity_w_tfm = NULL; 1285 + retcode=ERR_INTEGRITY_ALG; 1286 + goto fail; 1287 + } 1288 + 1289 + if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { 1290 + retcode=ERR_INTEGRITY_ALG_ND; 1291 + goto fail; 1292 + } 1293 + 1294 + integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); 1295 + if (IS_ERR(integrity_r_tfm)) { 1296 + integrity_r_tfm = NULL; 1297 + retcode=ERR_INTEGRITY_ALG; 1298 + goto fail; 1299 + } 1300 + } 1301 + 1302 + ns = new_conf->max_epoch_size/8; 1303 + if (mdev->tl_hash_s != ns) { 1304 + new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); 1305 + if (!new_tl_hash) { 1306 + retcode = ERR_NOMEM; 1307 + goto fail; 1308 + } 1309 + } 1310 + 1311 + ns = new_conf->max_buffers/8; 1312 + if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { 1313 + new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); 1314 + if (!new_ee_hash) { 1315 + retcode = ERR_NOMEM; 1316 + goto fail; 1317 + } 1318 + } 1319 + 1320 + ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; 1321 + 1322 + if (integrity_w_tfm) { 1323 + i = crypto_hash_digestsize(integrity_w_tfm); 1324 + int_dig_out = kmalloc(i, GFP_KERNEL); 1325 + if (!int_dig_out) { 1326 + retcode = ERR_NOMEM; 1327 + goto fail; 1328 + } 1329 + int_dig_in = kmalloc(i, GFP_KERNEL); 1330 + if (!int_dig_in) { 1331 + retcode = ERR_NOMEM; 1332 + goto fail; 1333 + } 1334 + int_dig_vv = 
kmalloc(i, GFP_KERNEL); 1335 + if (!int_dig_vv) { 1336 + retcode = ERR_NOMEM; 1337 + goto fail; 1338 + } 1339 + } 1340 + 1341 + if (!mdev->bitmap) { 1342 + if(drbd_bm_init(mdev)) { 1343 + retcode = ERR_NOMEM; 1344 + goto fail; 1345 + } 1346 + } 1347 + 1348 + spin_lock_irq(&mdev->req_lock); 1349 + if (mdev->net_conf != NULL) { 1350 + retcode = ERR_NET_CONFIGURED; 1351 + spin_unlock_irq(&mdev->req_lock); 1352 + goto fail; 1353 + } 1354 + mdev->net_conf = new_conf; 1355 + 1356 + mdev->send_cnt = 0; 1357 + mdev->recv_cnt = 0; 1358 + 1359 + if (new_tl_hash) { 1360 + kfree(mdev->tl_hash); 1361 + mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; 1362 + mdev->tl_hash = new_tl_hash; 1363 + } 1364 + 1365 + if (new_ee_hash) { 1366 + kfree(mdev->ee_hash); 1367 + mdev->ee_hash_s = mdev->net_conf->max_buffers/8; 1368 + mdev->ee_hash = new_ee_hash; 1369 + } 1370 + 1371 + crypto_free_hash(mdev->cram_hmac_tfm); 1372 + mdev->cram_hmac_tfm = tfm; 1373 + 1374 + crypto_free_hash(mdev->integrity_w_tfm); 1375 + mdev->integrity_w_tfm = integrity_w_tfm; 1376 + 1377 + crypto_free_hash(mdev->integrity_r_tfm); 1378 + mdev->integrity_r_tfm = integrity_r_tfm; 1379 + 1380 + kfree(mdev->int_dig_out); 1381 + kfree(mdev->int_dig_in); 1382 + kfree(mdev->int_dig_vv); 1383 + mdev->int_dig_out=int_dig_out; 1384 + mdev->int_dig_in=int_dig_in; 1385 + mdev->int_dig_vv=int_dig_vv; 1386 + spin_unlock_irq(&mdev->req_lock); 1387 + 1388 + retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE); 1389 + 1390 + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); 1391 + reply->ret_code = retcode; 1392 + drbd_reconfig_done(mdev); 1393 + return 0; 1394 + 1395 + fail: 1396 + kfree(int_dig_out); 1397 + kfree(int_dig_in); 1398 + kfree(int_dig_vv); 1399 + crypto_free_hash(tfm); 1400 + crypto_free_hash(integrity_w_tfm); 1401 + crypto_free_hash(integrity_r_tfm); 1402 + kfree(new_tl_hash); 1403 + kfree(new_ee_hash); 1404 + kfree(new_conf); 1405 + 1406 + reply->ret_code = retcode; 1407 + drbd_reconfig_done(mdev); 1408 + return 0; 1409 + } 1410 + 1411 + static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1412 + struct drbd_nl_cfg_reply *reply) 1413 + { 1414 + int retcode; 1415 + 1416 + retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); 1417 + 1418 + if (retcode == SS_NOTHING_TO_DO) 1419 + goto done; 1420 + else if (retcode == SS_ALREADY_STANDALONE) 1421 + goto done; 1422 + else if (retcode == SS_PRIMARY_NOP) { 1423 + /* Our state checking code wants to see the peer outdated. */ 1424 + retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, 1425 + pdsk, D_OUTDATED)); 1426 + } else if (retcode == SS_CW_FAILED_BY_PEER) { 1427 + /* The peer probably wants to see us outdated. */ 1428 + retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, 1429 + disk, D_OUTDATED), 1430 + CS_ORDERED); 1431 + if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) { 1432 + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 1433 + retcode = SS_SUCCESS; 1434 + } 1435 + } 1436 + 1437 + if (retcode < SS_SUCCESS) 1438 + goto fail; 1439 + 1440 + if (wait_event_interruptible(mdev->state_wait, 1441 + mdev->state.conn != C_DISCONNECTING)) { 1442 + /* Do not test for mdev->state.conn == C_STANDALONE, since 1443 + someone else might connect us in the mean time!
*/ 1444 + retcode = ERR_INTR; 1445 + goto fail; 1446 + } 1447 + 1448 + done: 1449 + retcode = NO_ERROR; 1450 + fail: 1451 + drbd_md_sync(mdev); 1452 + reply->ret_code = retcode; 1453 + return 0; 1454 + } 1455 + 1456 + void resync_after_online_grow(struct drbd_conf *mdev) 1457 + { 1458 + int iass; /* I am sync source */ 1459 + 1460 + dev_info(DEV, "Resync of new storage after online grow\n"); 1461 + if (mdev->state.role != mdev->state.peer) 1462 + iass = (mdev->state.role == R_PRIMARY); 1463 + else 1464 + iass = test_bit(DISCARD_CONCURRENT, &mdev->flags); 1465 + 1466 + if (iass) 1467 + drbd_start_resync(mdev, C_SYNC_SOURCE); 1468 + else 1469 + _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); 1470 + } 1471 + 1472 + static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1473 + struct drbd_nl_cfg_reply *reply) 1474 + { 1475 + struct resize rs; 1476 + int retcode = NO_ERROR; 1477 + int ldsc = 0; /* local disk size changed */ 1478 + enum determine_dev_size dd; 1479 + 1480 + memset(&rs, 0, sizeof(struct resize)); 1481 + if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { 1482 + retcode = ERR_MANDATORY_TAG; 1483 + goto fail; 1484 + } 1485 + 1486 + if (mdev->state.conn > C_CONNECTED) { 1487 + retcode = ERR_RESIZE_RESYNC; 1488 + goto fail; 1489 + } 1490 + 1491 + if (mdev->state.role == R_SECONDARY && 1492 + mdev->state.peer == R_SECONDARY) { 1493 + retcode = ERR_NO_PRIMARY; 1494 + goto fail; 1495 + } 1496 + 1497 + if (!get_ldev(mdev)) { 1498 + retcode = ERR_NO_DISK; 1499 + goto fail; 1500 + } 1501 + 1502 + if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { 1503 + mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); 1504 + ldsc = 1; 1505 + } 1506 + 1507 + mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; 1508 + dd = drbd_determin_dev_size(mdev); 1509 + drbd_md_sync(mdev); 1510 + put_ldev(mdev); 1511 + if (dd == dev_size_error) { 1512 + retcode = ERR_NOMEM_BITMAP; 1513 + goto fail; 1514 + } 1515 + 1516 + if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { 1517 + if (dd == grew) 1518 + set_bit(RESIZE_PENDING, &mdev->flags); 1519 + 1520 + drbd_send_uuids(mdev); 1521 + drbd_send_sizes(mdev, 1); 1522 + } 1523 + 1524 + fail: 1525 + reply->ret_code = retcode; 1526 + return 0; 1527 + } 1528 + 1529 + static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1530 + struct drbd_nl_cfg_reply *reply) 1531 + { 1532 + int retcode = NO_ERROR; 1533 + int err; 1534 + int ovr; /* online verify running */ 1535 + int rsr; /* re-sync running */ 1536 + struct crypto_hash *verify_tfm = NULL; 1537 + struct crypto_hash *csums_tfm = NULL; 1538 + struct syncer_conf sc; 1539 + cpumask_var_t new_cpu_mask; 1540 + 1541 + if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { 1542 + retcode = ERR_NOMEM; 1543 + goto fail; 1544 + } 1545 + 1546 + if (nlp->flags & DRBD_NL_SET_DEFAULTS) { 1547 + memset(&sc, 0, sizeof(struct syncer_conf)); 1548 + sc.rate = DRBD_RATE_DEF; 1549 + sc.after = DRBD_AFTER_DEF; 1550 + sc.al_extents = DRBD_AL_EXTENTS_DEF; 1551 + } else 1552 + memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); 1553 + 1554 + if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) { 1555 + retcode = ERR_MANDATORY_TAG; 1556 + goto fail; 1557 + } 1558 + 1559 + /* re-sync running */ 1560 + rsr = ( mdev->state.conn == C_SYNC_SOURCE || 1561 + mdev->state.conn == C_SYNC_TARGET || 1562 + mdev->state.conn == C_PAUSED_SYNC_S || 1563 + mdev->state.conn == C_PAUSED_SYNC_T ); 1564 + 1565 + if (rsr && 
strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) { 1566 + retcode = ERR_CSUMS_RESYNC_RUNNING; 1567 + goto fail; 1568 + } 1569 + 1570 + if (!rsr && sc.csums_alg[0]) { 1571 + csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC); 1572 + if (IS_ERR(csums_tfm)) { 1573 + csums_tfm = NULL; 1574 + retcode = ERR_CSUMS_ALG; 1575 + goto fail; 1576 + } 1577 + 1578 + if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) { 1579 + retcode = ERR_CSUMS_ALG_ND; 1580 + goto fail; 1581 + } 1582 + } 1583 + 1584 + /* online verify running */ 1585 + ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T); 1586 + 1587 + if (ovr) { 1588 + if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) { 1589 + retcode = ERR_VERIFY_RUNNING; 1590 + goto fail; 1591 + } 1592 + } 1593 + 1594 + if (!ovr && sc.verify_alg[0]) { 1595 + verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC); 1596 + if (IS_ERR(verify_tfm)) { 1597 + verify_tfm = NULL; 1598 + retcode = ERR_VERIFY_ALG; 1599 + goto fail; 1600 + } 1601 + 1602 + if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) { 1603 + retcode = ERR_VERIFY_ALG_ND; 1604 + goto fail; 1605 + } 1606 + } 1607 + 1608 + /* silently ignore cpu mask on UP kernel */ 1609 + if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) { 1610 + err = __bitmap_parse(sc.cpu_mask, 32, 0, 1611 + cpumask_bits(new_cpu_mask), nr_cpu_ids); 1612 + if (err) { 1613 + dev_warn(DEV, "__bitmap_parse() failed with %d\n", err); 1614 + retcode = ERR_CPU_MASK_PARSE; 1615 + goto fail; 1616 + } 1617 + } 1618 + 1619 + ERR_IF (sc.rate < 1) sc.rate = 1; 1620 + ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */ 1621 + #define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) 1622 + if (sc.al_extents > AL_MAX) { 1623 + dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); 1624 + sc.al_extents = AL_MAX; 1625 + } 1626 + #undef AL_MAX 1627 + 1628 + /* most sanity checks done, try to assign the new sync-after 1629 + * dependency. need to hold the global lock in there, 1630 + * to avoid a race in the dependency loop check. */ 1631 + retcode = drbd_alter_sa(mdev, sc.after); 1632 + if (retcode != NO_ERROR) 1633 + goto fail; 1634 + 1635 + /* ok, assign the rest of it as well. 
1636 + * lock against receive_SyncParam() */ 1637 + spin_lock(&mdev->peer_seq_lock); 1638 + mdev->sync_conf = sc; 1639 + 1640 + if (!rsr) { 1641 + crypto_free_hash(mdev->csums_tfm); 1642 + mdev->csums_tfm = csums_tfm; 1643 + csums_tfm = NULL; 1644 + } 1645 + 1646 + if (!ovr) { 1647 + crypto_free_hash(mdev->verify_tfm); 1648 + mdev->verify_tfm = verify_tfm; 1649 + verify_tfm = NULL; 1650 + } 1651 + spin_unlock(&mdev->peer_seq_lock); 1652 + 1653 + if (get_ldev(mdev)) { 1654 + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); 1655 + drbd_al_shrink(mdev); 1656 + err = drbd_check_al_size(mdev); 1657 + lc_unlock(mdev->act_log); 1658 + wake_up(&mdev->al_wait); 1659 + 1660 + put_ldev(mdev); 1661 + drbd_md_sync(mdev); 1662 + 1663 + if (err) { 1664 + retcode = ERR_NOMEM; 1665 + goto fail; 1666 + } 1667 + } 1668 + 1669 + if (mdev->state.conn >= C_CONNECTED) 1670 + drbd_send_sync_param(mdev, &sc); 1671 + 1672 + if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) { 1673 + cpumask_copy(mdev->cpu_mask, new_cpu_mask); 1674 + drbd_calc_cpu_mask(mdev); 1675 + mdev->receiver.reset_cpu_mask = 1; 1676 + mdev->asender.reset_cpu_mask = 1; 1677 + mdev->worker.reset_cpu_mask = 1; 1678 + } 1679 + 1680 + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); 1681 + fail: 1682 + free_cpumask_var(new_cpu_mask); 1683 + crypto_free_hash(csums_tfm); 1684 + crypto_free_hash(verify_tfm); 1685 + reply->ret_code = retcode; 1686 + return 0; 1687 + } 1688 + 1689 + static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1690 + struct drbd_nl_cfg_reply *reply) 1691 + { 1692 + int retcode; 1693 + 1694 + retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); 1695 + 1696 + if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) 1697 + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); 1698 + 1699 + while (retcode == SS_NEED_CONNECTION) { 1700 + spin_lock_irq(&mdev->req_lock); 1701 + if (mdev->state.conn < C_CONNECTED) 1702 + retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); 1703 + spin_unlock_irq(&mdev->req_lock); 1704 + 1705 + if (retcode != SS_NEED_CONNECTION) 1706 + break; 1707 + 1708 + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); 1709 + } 1710 + 1711 + reply->ret_code = retcode; 1712 + return 0; 1713 + } 1714 + 1715 + static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1716 + struct drbd_nl_cfg_reply *reply) 1717 + { 1718 + 1719 + reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); 1720 + 1721 + return 0; 1722 + } 1723 + 1724 + static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1725 + struct drbd_nl_cfg_reply *reply) 1726 + { 1727 + int retcode = NO_ERROR; 1728 + 1729 + if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) 1730 + retcode = ERR_PAUSE_IS_SET; 1731 + 1732 + reply->ret_code = retcode; 1733 + return 0; 1734 + } 1735 + 1736 + static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1737 + struct drbd_nl_cfg_reply *reply) 1738 + { 1739 + int retcode = NO_ERROR; 1740 + 1741 + if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) 1742 + retcode = ERR_PAUSE_IS_CLEAR; 1743 + 1744 + reply->ret_code = retcode; 1745 + return 0; 1746 + } 1747 + 1748 + static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1749 + struct drbd_nl_cfg_reply *reply) 1750 + { 1751 + reply->ret_code = drbd_request_state(mdev, NS(susp, 1)); 1752 + 1753 + return 
0; 1754 + } 1755 + 1756 + static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1757 + struct drbd_nl_cfg_reply *reply) 1758 + { 1759 + reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); 1760 + return 0; 1761 + } 1762 + 1763 + static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1764 + struct drbd_nl_cfg_reply *reply) 1765 + { 1766 + reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); 1767 + return 0; 1768 + } 1769 + 1770 + static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1771 + struct drbd_nl_cfg_reply *reply) 1772 + { 1773 + unsigned short *tl; 1774 + 1775 + tl = reply->tag_list; 1776 + 1777 + if (get_ldev(mdev)) { 1778 + tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); 1779 + put_ldev(mdev); 1780 + } 1781 + 1782 + if (get_net_conf(mdev)) { 1783 + tl = net_conf_to_tags(mdev, mdev->net_conf, tl); 1784 + put_net_conf(mdev); 1785 + } 1786 + tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); 1787 + 1788 + put_unaligned(TT_END, tl++); /* Close the tag list */ 1789 + 1790 + return (int)((char *)tl - (char *)reply->tag_list); 1791 + } 1792 + 1793 + static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1794 + struct drbd_nl_cfg_reply *reply) 1795 + { 1796 + unsigned short *tl = reply->tag_list; 1797 + union drbd_state s = mdev->state; 1798 + unsigned long rs_left; 1799 + unsigned int res; 1800 + 1801 + tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); 1802 + 1803 + /* no local ref, no bitmap, no syncer progress. */ 1804 + if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { 1805 + if (get_ldev(mdev)) { 1806 + drbd_get_syncer_progress(mdev, &rs_left, &res); 1807 + tl = tl_add_int(tl, T_sync_progress, &res); 1808 + put_ldev(mdev); 1809 + } 1810 + } 1811 + put_unaligned(TT_END, tl++); /* Close the tag list */ 1812 + 1813 + return (int)((char *)tl - (char *)reply->tag_list); 1814 + } 1815 + 1816 + static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1817 + struct drbd_nl_cfg_reply *reply) 1818 + { 1819 + unsigned short *tl; 1820 + 1821 + tl = reply->tag_list; 1822 + 1823 + if (get_ldev(mdev)) { 1824 + tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); 1825 + tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); 1826 + put_ldev(mdev); 1827 + } 1828 + put_unaligned(TT_END, tl++); /* Close the tag list */ 1829 + 1830 + return (int)((char *)tl - (char *)reply->tag_list); 1831 + } 1832 + 1833 + /** 1834 + * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use 1835 + * @mdev: DRBD device. 1836 + * @nlp: Netlink/connector packet from drbdsetup 1837 + * @reply: Reply packet for drbdsetup 1838 + */ 1839 + static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1840 + struct drbd_nl_cfg_reply *reply) 1841 + { 1842 + unsigned short *tl; 1843 + char rv; 1844 + 1845 + tl = reply->tag_list; 1846 + 1847 + rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : 1848 + test_bit(USE_DEGR_WFC_T, &mdev->flags) ? 
UT_DEGRADED : UT_DEFAULT; 1849 + 1850 + tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); 1851 + put_unaligned(TT_END, tl++); /* Close the tag list */ 1852 + 1853 + return (int)((char *)tl - (char *)reply->tag_list); 1854 + } 1855 + 1856 + static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1857 + struct drbd_nl_cfg_reply *reply) 1858 + { 1859 + /* default to resume from last known position, if possible */ 1860 + struct start_ov args = 1861 + { .start_sector = mdev->ov_start_sector }; 1862 + 1863 + if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { 1864 + reply->ret_code = ERR_MANDATORY_TAG; 1865 + return 0; 1866 + } 1867 + /* w_make_ov_request expects position to be aligned */ 1868 + mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; 1869 + reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); 1870 + return 0; 1871 + } 1872 + 1873 + 1874 + static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1875 + struct drbd_nl_cfg_reply *reply) 1876 + { 1877 + int retcode = NO_ERROR; 1878 + int skip_initial_sync = 0; 1879 + int err; 1880 + 1881 + struct new_c_uuid args; 1882 + 1883 + memset(&args, 0, sizeof(struct new_c_uuid)); 1884 + if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) { 1885 + reply->ret_code = ERR_MANDATORY_TAG; 1886 + return 0; 1887 + } 1888 + 1889 + mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */ 1890 + 1891 + if (!get_ldev(mdev)) { 1892 + retcode = ERR_NO_DISK; 1893 + goto out; 1894 + } 1895 + 1896 + /* this is "skip initial sync", assume to be clean */ 1897 + if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 && 1898 + mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { 1899 + dev_info(DEV, "Preparing to skip initial sync\n"); 1900 + skip_initial_sync = 1; 1901 + } else if (mdev->state.conn != C_STANDALONE) { 1902 + retcode = ERR_CONNECTED; 1903 + goto out_dec; 1904 + } 1905 + 1906 + drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... 
*/ 1907 + drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */ 1908 + 1909 + if (args.clear_bm) { 1910 + err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid"); 1911 + if (err) { 1912 + dev_err(DEV, "Writing bitmap failed with %d\n",err); 1913 + retcode = ERR_IO_MD_DISK; 1914 + } 1915 + if (skip_initial_sync) { 1916 + drbd_send_uuids_skip_initial_sync(mdev); 1917 + _drbd_uuid_set(mdev, UI_BITMAP, 0); 1918 + spin_lock_irq(&mdev->req_lock); 1919 + _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), 1920 + CS_VERBOSE, NULL); 1921 + spin_unlock_irq(&mdev->req_lock); 1922 + } 1923 + } 1924 + 1925 + drbd_md_sync(mdev); 1926 + out_dec: 1927 + put_ldev(mdev); 1928 + out: 1929 + mutex_unlock(&mdev->state_mutex); 1930 + 1931 + reply->ret_code = retcode; 1932 + return 0; 1933 + } 1934 + 1935 + static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp) 1936 + { 1937 + struct drbd_conf *mdev; 1938 + 1939 + if (nlp->drbd_minor >= minor_count) 1940 + return NULL; 1941 + 1942 + mdev = minor_to_mdev(nlp->drbd_minor); 1943 + 1944 + if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) { 1945 + struct gendisk *disk = NULL; 1946 + mdev = drbd_new_device(nlp->drbd_minor); 1947 + 1948 + spin_lock_irq(&drbd_pp_lock); 1949 + if (minor_table[nlp->drbd_minor] == NULL) { 1950 + minor_table[nlp->drbd_minor] = mdev; 1951 + disk = mdev->vdisk; 1952 + mdev = NULL; 1953 + } /* else: we lost the race */ 1954 + spin_unlock_irq(&drbd_pp_lock); 1955 + 1956 + if (disk) /* we won the race above */ 1957 + /* in case we ever add a drbd_delete_device(), 1958 + * don't forget the del_gendisk! */ 1959 + add_disk(disk); 1960 + else /* we lost the race above */ 1961 + drbd_free_mdev(mdev); 1962 + 1963 + mdev = minor_to_mdev(nlp->drbd_minor); 1964 + } 1965 + 1966 + return mdev; 1967 + } 1968 + 1969 + struct cn_handler_struct { 1970 + int (*function)(struct drbd_conf *, 1971 + struct drbd_nl_cfg_req *, 1972 + struct drbd_nl_cfg_reply *); 1973 + int reply_body_size; 1974 + }; 1975 + 1976 + static struct cn_handler_struct cnd_table[] = { 1977 + [ P_primary ] = { &drbd_nl_primary, 0 }, 1978 + [ P_secondary ] = { &drbd_nl_secondary, 0 }, 1979 + [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, 1980 + [ P_detach ] = { &drbd_nl_detach, 0 }, 1981 + [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, 1982 + [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, 1983 + [ P_resize ] = { &drbd_nl_resize, 0 }, 1984 + [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, 1985 + [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, 1986 + [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, 1987 + [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, 1988 + [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, 1989 + [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, 1990 + [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, 1991 + [ P_outdate ] = { &drbd_nl_outdate, 0 }, 1992 + [ P_get_config ] = { &drbd_nl_get_config, 1993 + sizeof(struct syncer_conf_tag_len_struct) + 1994 + sizeof(struct disk_conf_tag_len_struct) + 1995 + sizeof(struct net_conf_tag_len_struct) }, 1996 + [ P_get_state ] = { &drbd_nl_get_state, 1997 + sizeof(struct get_state_tag_len_struct) + 1998 + sizeof(struct sync_progress_tag_len_struct) }, 1999 + [ P_get_uuids ] = { &drbd_nl_get_uuids, 2000 + sizeof(struct get_uuids_tag_len_struct) }, 2001 + [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, 2002 + sizeof(struct get_timeout_flag_tag_len_struct)}, 2003 + [ P_start_ov ] = { &drbd_nl_start_ov, 0 }, 2004 + [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, 2005 + }; 
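Every reply produced by the handlers in cnd_table is a flat TLV stream: each record is an unsigned short tag, then (except for the terminator) an unsigned short length and that many bytes of payload, exactly as emitted by the tl_add_*() serializers below. A minimal consumer-side sketch of walking one reply, assuming the TT_END terminator and the tag_type() macro used elsewhere in this file; walk_tag_list() and its bounds handling are illustrative only, not part of this commit:

#include <string.h>

/* Walk one reply's tag list; the stream may be unaligned, hence memcpy. */
static void walk_tag_list(const unsigned char *tl, const unsigned char *end)
{
	unsigned short tag, len;

	while (tl + sizeof(tag) <= end) {
		memcpy(&tag, tl, sizeof(tag));
		tl += sizeof(tag);
		if (tag == TT_END)	/* terminator carries no length field */
			break;
		if (tl + sizeof(len) > end)
			break;		/* truncated record */
		memcpy(&len, tl, sizeof(len));
		tl += sizeof(len);
		/* payload is tl[0..len); interpret it according to tag_type(tag) */
		tl += len;
	}
}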
2006 + 2007 + static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp) 2008 + { 2009 + struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; 2010 + struct cn_handler_struct *cm; 2011 + struct cn_msg *cn_reply; 2012 + struct drbd_nl_cfg_reply *reply; 2013 + struct drbd_conf *mdev; 2014 + int retcode, rr; 2015 + int reply_size = sizeof(struct cn_msg) 2016 + + sizeof(struct drbd_nl_cfg_reply) 2017 + + sizeof(short int); 2018 + 2019 + if (!try_module_get(THIS_MODULE)) { 2020 + printk(KERN_ERR "drbd: try_module_get() failed!\n"); 2021 + return; 2022 + } 2023 + 2024 + if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) { 2025 + retcode = ERR_PERM; 2026 + goto fail; 2027 + } 2028 + 2029 + mdev = ensure_mdev(nlp); 2030 + if (!mdev) { 2031 + retcode = ERR_MINOR_INVALID; 2032 + goto fail; 2033 + } 2034 + 2035 + if (nlp->packet_type >= P_nl_after_last_packet) { 2036 + retcode = ERR_PACKET_NR; 2037 + goto fail; 2038 + } 2039 + 2040 + cm = cnd_table + nlp->packet_type; 2041 + 2042 + /* This may happen if packet number is 0: */ 2043 + if (cm->function == NULL) { 2044 + retcode = ERR_PACKET_NR; 2045 + goto fail; 2046 + } 2047 + 2048 + reply_size += cm->reply_body_size; 2049 + 2050 + /* allocation not in the IO path, cqueue thread context */ 2051 + cn_reply = kmalloc(reply_size, GFP_KERNEL); 2052 + if (!cn_reply) { 2053 + retcode = ERR_NOMEM; 2054 + goto fail; 2055 + } 2056 + reply = (struct drbd_nl_cfg_reply *) cn_reply->data; 2057 + 2058 + reply->packet_type = 2059 + cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet; 2060 + reply->minor = nlp->drbd_minor; 2061 + reply->ret_code = NO_ERROR; /* Might be modified by cm->function. */ 2062 + /* reply->tag_list; might be modified by cm->function. */ 2063 + 2064 + rr = cm->function(mdev, nlp, reply); 2065 + 2066 + cn_reply->id = req->id; 2067 + cn_reply->seq = req->seq; 2068 + cn_reply->ack = req->ack + 1; 2069 + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; 2070 + cn_reply->flags = 0; 2071 + 2072 + rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); 2073 + if (rr && rr != -ESRCH) 2074 + printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); 2075 + 2076 + kfree(cn_reply); 2077 + module_put(THIS_MODULE); 2078 + return; 2079 + fail: 2080 + drbd_nl_send_reply(req, retcode); 2081 + module_put(THIS_MODULE); 2082 + } 2083 + 2084 + static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */ 2085 + 2086 + static unsigned short * 2087 + __tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, 2088 + unsigned short len, int nul_terminated) 2089 + { 2090 + unsigned short l = tag_descriptions[tag_number(tag)].max_len; 2091 + len = (len < l) ?
len : l; 2092 + put_unaligned(tag, tl++); 2093 + put_unaligned(len, tl++); 2094 + memcpy(tl, data, len); 2095 + tl = (unsigned short*)((char*)tl + len); 2096 + if (nul_terminated) 2097 + *((char*)tl - 1) = 0; 2098 + return tl; 2099 + } 2100 + 2101 + static unsigned short * 2102 + tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len) 2103 + { 2104 + return __tl_add_blob(tl, tag, data, len, 0); 2105 + } 2106 + 2107 + static unsigned short * 2108 + tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str) 2109 + { 2110 + return __tl_add_blob(tl, tag, str, strlen(str)+1, 0); 2111 + } 2112 + 2113 + static unsigned short * 2114 + tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val) 2115 + { 2116 + put_unaligned(tag, tl++); 2117 + switch(tag_type(tag)) { 2118 + case TT_INTEGER: 2119 + put_unaligned(sizeof(int), tl++); 2120 + put_unaligned(*(int *)val, (int *)tl); 2121 + tl = (unsigned short*)((char*)tl+sizeof(int)); 2122 + break; 2123 + case TT_INT64: 2124 + put_unaligned(sizeof(u64), tl++); 2125 + put_unaligned(*(u64 *)val, (u64 *)tl); 2126 + tl = (unsigned short*)((char*)tl+sizeof(u64)); 2127 + break; 2128 + default: 2129 + /* someone did something stupid. */ 2130 + ; 2131 + } 2132 + return tl; 2133 + } 2134 + 2135 + void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) 2136 + { 2137 + char buffer[sizeof(struct cn_msg)+ 2138 + sizeof(struct drbd_nl_cfg_reply)+ 2139 + sizeof(struct get_state_tag_len_struct)+ 2140 + sizeof(short int)]; 2141 + struct cn_msg *cn_reply = (struct cn_msg *) buffer; 2142 + struct drbd_nl_cfg_reply *reply = 2143 + (struct drbd_nl_cfg_reply *)cn_reply->data; 2144 + unsigned short *tl = reply->tag_list; 2145 + 2146 + /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ 2147 + 2148 + tl = get_state_to_tags(mdev, (struct get_state *)&state, tl); 2149 + 2150 + put_unaligned(TT_END, tl++); /* Close the tag list */ 2151 + 2152 + cn_reply->id.idx = CN_IDX_DRBD; 2153 + cn_reply->id.val = CN_VAL_DRBD; 2154 + 2155 + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); 2156 + cn_reply->ack = 0; /* not used here. */ 2157 + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + 2158 + (int)((char *)tl - (char *)reply->tag_list); 2159 + cn_reply->flags = 0; 2160 + 2161 + reply->packet_type = P_get_state; 2162 + reply->minor = mdev_to_minor(mdev); 2163 + reply->ret_code = NO_ERROR; 2164 + 2165 + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); 2166 + } 2167 + 2168 + void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) 2169 + { 2170 + char buffer[sizeof(struct cn_msg)+ 2171 + sizeof(struct drbd_nl_cfg_reply)+ 2172 + sizeof(struct call_helper_tag_len_struct)+ 2173 + sizeof(short int)]; 2174 + struct cn_msg *cn_reply = (struct cn_msg *) buffer; 2175 + struct drbd_nl_cfg_reply *reply = 2176 + (struct drbd_nl_cfg_reply *)cn_reply->data; 2177 + unsigned short *tl = reply->tag_list; 2178 + 2179 + /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ 2180 + 2181 + tl = tl_add_str(tl, T_helper, helper_name); 2182 + put_unaligned(TT_END, tl++); /* Close the tag list */ 2183 + 2184 + cn_reply->id.idx = CN_IDX_DRBD; 2185 + cn_reply->id.val = CN_VAL_DRBD; 2186 + 2187 + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); 2188 + cn_reply->ack = 0; /* not used here. 
*/ 2189 + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + 2190 + (int)((char *)tl - (char *)reply->tag_list); 2191 + cn_reply->flags = 0; 2192 + 2193 + reply->packet_type = P_call_helper; 2194 + reply->minor = mdev_to_minor(mdev); 2195 + reply->ret_code = NO_ERROR; 2196 + 2197 + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); 2198 + } 2199 + 2200 + void drbd_bcast_ee(struct drbd_conf *mdev, 2201 + const char *reason, const int dgs, 2202 + const char* seen_hash, const char* calc_hash, 2203 + const struct drbd_epoch_entry* e) 2204 + { 2205 + struct cn_msg *cn_reply; 2206 + struct drbd_nl_cfg_reply *reply; 2207 + struct bio_vec *bvec; 2208 + unsigned short *tl; 2209 + int i; 2210 + 2211 + if (!e) 2212 + return; 2213 + if (!reason || !reason[0]) 2214 + return; 2215 + 2216 + /* apparently we have to memcpy twice, first to prepare the data for the 2217 + * struct cn_msg, then within cn_netlink_send from the cn_msg to the 2218 + * netlink skb. */ 2219 + /* receiver thread context, which is not in the writeout path (of this node), 2220 + * but may be in the writeout path of the _other_ node. 2221 + * GFP_NOIO to avoid potential "distributed deadlock". */ 2222 + cn_reply = kmalloc( 2223 + sizeof(struct cn_msg)+ 2224 + sizeof(struct drbd_nl_cfg_reply)+ 2225 + sizeof(struct dump_ee_tag_len_struct)+ 2226 + sizeof(short int), 2227 + GFP_NOIO); 2228 + 2229 + if (!cn_reply) { 2230 + dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n", 2231 + (unsigned long long)e->sector, e->size); 2232 + return; 2233 + } 2234 + 2235 + reply = (struct drbd_nl_cfg_reply*)cn_reply->data; 2236 + tl = reply->tag_list; 2237 + 2238 + tl = tl_add_str(tl, T_dump_ee_reason, reason); 2239 + tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs); 2240 + tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs); 2241 + tl = tl_add_int(tl, T_ee_sector, &e->sector); 2242 + tl = tl_add_int(tl, T_ee_block_id, &e->block_id); 2243 + 2244 + put_unaligned(T_ee_data, tl++); 2245 + put_unaligned(e->size, tl++); 2246 + 2247 + __bio_for_each_segment(bvec, e->private_bio, i, 0) { 2248 + void *d = kmap(bvec->bv_page); 2249 + memcpy(tl, d + bvec->bv_offset, bvec->bv_len); 2250 + kunmap(bvec->bv_page); 2251 + tl=(unsigned short*)((char*)tl + bvec->bv_len); 2252 + } 2253 + put_unaligned(TT_END, tl++); /* Close the tag list */ 2254 + 2255 + cn_reply->id.idx = CN_IDX_DRBD; 2256 + cn_reply->id.val = CN_VAL_DRBD; 2257 + 2258 + cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); 2259 + cn_reply->ack = 0; // not used here. 2260 + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + 2261 + (int)((char*)tl - (char*)reply->tag_list); 2262 + cn_reply->flags = 0; 2263 + 2264 + reply->packet_type = P_dump_ee; 2265 + reply->minor = mdev_to_minor(mdev); 2266 + reply->ret_code = NO_ERROR; 2267 + 2268 + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); 2269 + kfree(cn_reply); 2270 + } 2271 + 2272 + void drbd_bcast_sync_progress(struct drbd_conf *mdev) 2273 + { 2274 + char buffer[sizeof(struct cn_msg)+ 2275 + sizeof(struct drbd_nl_cfg_reply)+ 2276 + sizeof(struct sync_progress_tag_len_struct)+ 2277 + sizeof(short int)]; 2278 + struct cn_msg *cn_reply = (struct cn_msg *) buffer; 2279 + struct drbd_nl_cfg_reply *reply = 2280 + (struct drbd_nl_cfg_reply *)cn_reply->data; 2281 + unsigned short *tl = reply->tag_list; 2282 + unsigned long rs_left; 2283 + unsigned int res; 2284 + 2285 + /* no local ref, no bitmap, no syncer progress, no broadcast. 
*/ 2286 + if (!get_ldev(mdev)) 2287 + return; 2288 + drbd_get_syncer_progress(mdev, &rs_left, &res); 2289 + put_ldev(mdev); 2290 + 2291 + tl = tl_add_int(tl, T_sync_progress, &res); 2292 + put_unaligned(TT_END, tl++); /* Close the tag list */ 2293 + 2294 + cn_reply->id.idx = CN_IDX_DRBD; 2295 + cn_reply->id.val = CN_VAL_DRBD; 2296 + 2297 + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); 2298 + cn_reply->ack = 0; /* not used here. */ 2299 + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + 2300 + (int)((char *)tl - (char *)reply->tag_list); 2301 + cn_reply->flags = 0; 2302 + 2303 + reply->packet_type = P_sync_progress; 2304 + reply->minor = mdev_to_minor(mdev); 2305 + reply->ret_code = NO_ERROR; 2306 + 2307 + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); 2308 + } 2309 + 2310 + int __init drbd_nl_init(void) 2311 + { 2312 + static struct cb_id cn_id_drbd; 2313 + int err, try=10; 2314 + 2315 + cn_id_drbd.val = CN_VAL_DRBD; 2316 + do { 2317 + cn_id_drbd.idx = cn_idx; 2318 + err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback); 2319 + if (!err) 2320 + break; 2321 + cn_idx = (cn_idx + CN_IDX_STEP); 2322 + } while (try--); 2323 + 2324 + if (err) { 2325 + printk(KERN_ERR "drbd: cn_drbd failed to register\n"); 2326 + return err; 2327 + } 2328 + 2329 + return 0; 2330 + } 2331 + 2332 + void drbd_nl_cleanup(void) 2333 + { 2334 + static struct cb_id cn_id_drbd; 2335 + 2336 + cn_id_drbd.idx = cn_idx; 2337 + cn_id_drbd.val = CN_VAL_DRBD; 2338 + 2339 + cn_del_callback(&cn_id_drbd); 2340 + } 2341 + 2342 + void drbd_nl_send_reply(struct cn_msg *req, int ret_code) 2343 + { 2344 + char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; 2345 + struct cn_msg *cn_reply = (struct cn_msg *) buffer; 2346 + struct drbd_nl_cfg_reply *reply = 2347 + (struct drbd_nl_cfg_reply *)cn_reply->data; 2348 + int rr; 2349 + 2350 + cn_reply->id = req->id; 2351 + 2352 + cn_reply->seq = req->seq; 2353 + cn_reply->ack = req->ack + 1; 2354 + cn_reply->len = sizeof(struct drbd_nl_cfg_reply); 2355 + cn_reply->flags = 0; 2356 + 2357 + reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; 2358 + reply->ret_code = ret_code; 2359 + 2360 + rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); 2361 + if (rr && rr != -ESRCH) 2362 + printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); 2363 + } 2364 +
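The four drbd_bcast_*() event helpers above repeat the same connector boilerplate: stamp the cn_msg with CN_IDX_DRBD/CN_VAL_DRBD and the next drbd_nl_seq sequence number, set the length from the tag list actually written, and send with GFP_NOIO, since a broadcast may fire from a writeout path where a GFP_KERNEL allocation could recurse into write-out (see the comment in drbd_bcast_ee). A sketch of that common tail, factored out for illustration only; drbd_bcast_finish() is a hypothetical name, not part of this commit:

/* Illustrative refactoring sketch. The caller has already written the tag
 * list into reply->tag_list and passes the position one past TT_END. */
static void drbd_bcast_finish(struct drbd_conf *mdev, struct cn_msg *cn_reply,
			      int packet_type, unsigned short *tl_end)
{
	struct drbd_nl_cfg_reply *reply =
		(struct drbd_nl_cfg_reply *)cn_reply->data;

	cn_reply->id.idx = CN_IDX_DRBD;
	cn_reply->id.val = CN_VAL_DRBD;
	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
	cn_reply->ack = 0;	/* not used for broadcasts */
	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
		(int)((char *)tl_end - (char *)reply->tag_list);
	cn_reply->flags = 0;

	reply->packet_type = packet_type;
	reply->minor = mdev_to_minor(mdev);
	reply->ret_code = NO_ERROR;

	/* GFP_NOIO: never trigger write-out from a (possible) writeout path */
	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
}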
+265
drivers/block/drbd/drbd_proc.c
··· 1 + /* 2 + drbd_proc.c 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + 24 + */ 25 + 26 + #include <linux/module.h> 27 + 28 + #include <asm/uaccess.h> 29 + #include <linux/fs.h> 30 + #include <linux/file.h> 31 + #include <linux/slab.h> 32 + #include <linux/proc_fs.h> 33 + #include <linux/seq_file.h> 34 + #include <linux/drbd.h> 35 + #include "drbd_int.h" 36 + 37 + static int drbd_proc_open(struct inode *inode, struct file *file); 38 + 39 + 40 + struct proc_dir_entry *drbd_proc; 41 + struct file_operations drbd_proc_fops = { 42 + .owner = THIS_MODULE, 43 + .open = drbd_proc_open, 44 + .read = seq_read, 45 + .llseek = seq_lseek, 46 + .release = single_release, 47 + }; 48 + 49 + 50 + /*lge 51 + * progress bars shamelessly adapted from driver/md/md.c 52 + * output looks like 53 + * [=====>..............] 33.5% (23456/123456) 54 + * finish: 2:20:20 speed: 6,345 (6,456) K/sec 55 + */ 56 + static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) 57 + { 58 + unsigned long db, dt, dbdt, rt, rs_left; 59 + unsigned int res; 60 + int i, x, y; 61 + 62 + drbd_get_syncer_progress(mdev, &rs_left, &res); 63 + 64 + x = res/50; 65 + y = 20-x; 66 + seq_printf(seq, "\t["); 67 + for (i = 1; i < x; i++) 68 + seq_printf(seq, "="); 69 + seq_printf(seq, ">"); 70 + for (i = 0; i < y; i++) 71 + seq_printf(seq, "."); 72 + seq_printf(seq, "] "); 73 + 74 + seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); 75 + /* if more than 1 GB display in MB */ 76 + if (mdev->rs_total > 0x100000L) 77 + seq_printf(seq, "(%lu/%lu)M\n\t", 78 + (unsigned long) Bit2KB(rs_left >> 10), 79 + (unsigned long) Bit2KB(mdev->rs_total >> 10)); 80 + else 81 + seq_printf(seq, "(%lu/%lu)K\n\t", 82 + (unsigned long) Bit2KB(rs_left), 83 + (unsigned long) Bit2KB(mdev->rs_total)); 84 + 85 + /* see drivers/md/md.c 86 + * We do not want to overflow, so the order of operands and 87 + * the * 100 / 100 trick are important. We do a +1 to be 88 + * safe against division by zero. We only estimate anyway. 89 + * 90 + * dt: time from mark until now 91 + * db: blocks written from mark until now 92 + * rt: remaining time 93 + */ 94 + dt = (jiffies - mdev->rs_mark_time) / HZ; 95 + 96 + if (dt > 20) { 97 + /* if we made no update to rs_mark_time for too long, 98 + * we are stalled. show that. 
*/ 99 + seq_printf(seq, "stalled\n"); 100 + return; 101 + } 102 + 103 + if (!dt) 104 + dt++; 105 + db = mdev->rs_mark_left - rs_left; 106 + rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ 107 + 108 + seq_printf(seq, "finish: %lu:%02lu:%02lu", 109 + rt / 3600, (rt % 3600) / 60, rt % 60); 110 + 111 + /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */ 112 + dbdt = Bit2KB(db/dt); 113 + if (dbdt > 1000) 114 + seq_printf(seq, " speed: %ld,%03ld", 115 + dbdt/1000, dbdt % 1000); 116 + else 117 + seq_printf(seq, " speed: %ld", dbdt); 118 + 119 + /* mean speed since syncer started 120 + * we do account for PausedSync periods */ 121 + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; 122 + if (dt <= 0) 123 + dt = 1; 124 + db = mdev->rs_total - rs_left; 125 + dbdt = Bit2KB(db/dt); 126 + if (dbdt > 1000) 127 + seq_printf(seq, " (%ld,%03ld)", 128 + dbdt/1000, dbdt % 1000); 129 + else 130 + seq_printf(seq, " (%ld)", dbdt); 131 + 132 + seq_printf(seq, " K/sec\n"); 133 + } 134 + 135 + static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) 136 + { 137 + struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); 138 + 139 + seq_printf(seq, "%5d %s %s\n", bme->rs_left, 140 + bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------", 141 + bme->flags & BME_LOCKED ? "LOCKED" : "------" 142 + ); 143 + } 144 + 145 + static int drbd_seq_show(struct seq_file *seq, void *v) 146 + { 147 + int i, hole = 0; 148 + const char *sn; 149 + struct drbd_conf *mdev; 150 + 151 + static char write_ordering_chars[] = { 152 + [WO_none] = 'n', 153 + [WO_drain_io] = 'd', 154 + [WO_bdev_flush] = 'f', 155 + [WO_bio_barrier] = 'b', 156 + }; 157 + 158 + seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", 159 + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag()); 160 + 161 + /* 162 + cs .. connection state 163 + ro .. node role (local/remote) 164 + ds .. disk state (local/remote) 165 + protocol 166 + various flags 167 + ns .. network send 168 + nr .. network receive 169 + dw .. disk write 170 + dr .. disk read 171 + al .. activity log write count 172 + bm .. bitmap update write count 173 + pe .. pending (waiting for ack or data reply) 174 + ua .. unack'd (still need to send ack or data reply) 175 + ap .. application requests accepted, but not yet completed 176 + ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending 177 + wo .. write ordering mode currently in use 178 + oos .. known out-of-sync kB 179 + */ 180 + 181 + for (i = 0; i < minor_count; i++) { 182 + mdev = minor_to_mdev(i); 183 + if (!mdev) { 184 + hole = 1; 185 + continue; 186 + } 187 + if (hole) { 188 + hole = 0; 189 + seq_printf(seq, "\n"); 190 + } 191 + 192 + sn = drbd_conn_str(mdev->state.conn); 193 + 194 + if (mdev->state.conn == C_STANDALONE && 195 + mdev->state.disk == D_DISKLESS && 196 + mdev->state.role == R_SECONDARY) { 197 + seq_printf(seq, "%2d: cs:Unconfigured\n", i); 198 + } else { 199 + seq_printf(seq, 200 + "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n" 201 + " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " 202 + "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", 203 + i, sn, 204 + drbd_role_str(mdev->state.role), 205 + drbd_role_str(mdev->state.peer), 206 + drbd_disk_str(mdev->state.disk), 207 + drbd_disk_str(mdev->state.pdsk), 208 + (mdev->net_conf == NULL ? ' ' : 209 + (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), 210 + mdev->state.susp ? 's' : 'r', 211 + mdev->state.aftr_isp ? 'a' : '-', 212 + mdev->state.peer_isp ? 'p' : '-', 213 + mdev->state.user_isp ? 
'u' : '-', 214 + mdev->congestion_reason ?: '-', 215 + mdev->send_cnt/2, 216 + mdev->recv_cnt/2, 217 + mdev->writ_cnt/2, 218 + mdev->read_cnt/2, 219 + mdev->al_writ_cnt, 220 + mdev->bm_writ_cnt, 221 + atomic_read(&mdev->local_cnt), 222 + atomic_read(&mdev->ap_pending_cnt) + 223 + atomic_read(&mdev->rs_pending_cnt), 224 + atomic_read(&mdev->unacked_cnt), 225 + atomic_read(&mdev->ap_bio_cnt), 226 + mdev->epochs, 227 + write_ordering_chars[mdev->write_ordering] 228 + ); 229 + seq_printf(seq, " oos:%lu\n", 230 + Bit2KB(drbd_bm_total_weight(mdev))); 231 + } 232 + if (mdev->state.conn == C_SYNC_SOURCE || 233 + mdev->state.conn == C_SYNC_TARGET) 234 + drbd_syncer_progress(mdev, seq); 235 + 236 + if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) 237 + seq_printf(seq, "\t%3d%% %lu/%lu\n", 238 + (int)((mdev->rs_total-mdev->ov_left) / 239 + (mdev->rs_total/100+1)), 240 + mdev->rs_total - mdev->ov_left, 241 + mdev->rs_total); 242 + 243 + if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) { 244 + lc_seq_printf_stats(seq, mdev->resync); 245 + lc_seq_printf_stats(seq, mdev->act_log); 246 + put_ldev(mdev); 247 + } 248 + 249 + if (proc_details >= 2) { 250 + if (mdev->resync) { 251 + lc_seq_dump_details(seq, mdev->resync, "rs_left", 252 + resync_dump_detail); 253 + } 254 + } 255 + } 256 + 257 + return 0; 258 + } 259 + 260 + static int drbd_proc_open(struct inode *inode, struct file *file) 261 + { 262 + return single_open(file, drbd_seq_show, PDE(inode)->data); 263 + } 264 + 265 + /* PROC FS stuff end */
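The finish-time estimate in drbd_syncer_progress() above deliberately computes rt = (dt * (rs_left / (db/100+1)))/100 rather than the naive dt * rs_left / db: on a 32-bit unsigned long the naive product overflows for realistic resync sizes, while dividing rs_left down first keeps the intermediate small at the cost of roughly one percent of precision, and the +1 guards against db == 0. A standalone illustration with made-up numbers, not part of this commit:

#include <stdio.h>

int main(void)
{
	unsigned long dt = 3600;		/* seconds since the last rate mark */
	unsigned long rs_left = 2000000;	/* bits still out of sync */
	unsigned long db = 50000;		/* bits synced since the mark */
	unsigned long rt;

	/* naive dt * rs_left would be 7.2e9, already past 2^32 on 32-bit */
	rt = (dt * (rs_left / (db / 100 + 1))) / 100;	/* 143712 s vs. exact 144000 s */
	printf("finish: %lu:%02lu:%02lu\n",
	       rt / 3600, (rt % 3600) / 60, rt % 60);
	return 0;
}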
+4426
drivers/block/drbd/drbd_receiver.c
··· 1 + /* 2 + drbd_receiver.c 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + */ 24 + 25 + 26 + #include <linux/module.h> 27 + 28 + #include <asm/uaccess.h> 29 + #include <net/sock.h> 30 + 31 + #include <linux/version.h> 32 + #include <linux/drbd.h> 33 + #include <linux/fs.h> 34 + #include <linux/file.h> 35 + #include <linux/in.h> 36 + #include <linux/mm.h> 37 + #include <linux/memcontrol.h> 38 + #include <linux/mm_inline.h> 39 + #include <linux/slab.h> 40 + #include <linux/smp_lock.h> 41 + #include <linux/pkt_sched.h> 42 + #define __KERNEL_SYSCALLS__ 43 + #include <linux/unistd.h> 44 + #include <linux/vmalloc.h> 45 + #include <linux/random.h> 46 + #include <linux/mm.h> 47 + #include <linux/string.h> 48 + #include <linux/scatterlist.h> 49 + #include "drbd_int.h" 50 + #include "drbd_req.h" 51 + 52 + #include "drbd_vli.h" 53 + 54 + struct flush_work { 55 + struct drbd_work w; 56 + struct drbd_epoch *epoch; 57 + }; 58 + 59 + enum finish_epoch { 60 + FE_STILL_LIVE, 61 + FE_DESTROYED, 62 + FE_RECYCLED, 63 + }; 64 + 65 + static int drbd_do_handshake(struct drbd_conf *mdev); 66 + static int drbd_do_auth(struct drbd_conf *mdev); 67 + 68 + static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); 69 + static int e_end_block(struct drbd_conf *, struct drbd_work *, int); 70 + 71 + static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) 72 + { 73 + struct drbd_epoch *prev; 74 + spin_lock(&mdev->epoch_lock); 75 + prev = list_entry(epoch->list.prev, struct drbd_epoch, list); 76 + if (prev == epoch || prev == mdev->current_epoch) 77 + prev = NULL; 78 + spin_unlock(&mdev->epoch_lock); 79 + return prev; 80 + } 81 + 82 + #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) 83 + 84 + static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) 85 + { 86 + struct page *page = NULL; 87 + 88 + /* Yes, testing drbd_pp_vacant outside the lock is racy. 89 + * So what. It saves a spin_lock. */ 90 + if (drbd_pp_vacant > 0) { 91 + spin_lock(&drbd_pp_lock); 92 + page = drbd_pp_pool; 93 + if (page) { 94 + drbd_pp_pool = (struct page *)page_private(page); 95 + set_page_private(page, 0); /* just to be polite */ 96 + drbd_pp_vacant--; 97 + } 98 + spin_unlock(&drbd_pp_lock); 99 + } 100 + /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD 101 + * "criss-cross" setup, that might cause write-out on some other DRBD, 102 + * which in turn might block on the other node at this very place. 
*/ 103 + if (!page) 104 + page = alloc_page(GFP_TRY); 105 + if (page) 106 + atomic_inc(&mdev->pp_in_use); 107 + return page; 108 + } 109 + 110 + /* kick lower level device, if we have more than (arbitrary number) 111 + * reference counts on it, which typically are locally submitted io 112 + * requests. don't use unacked_cnt, so we speed up proto A and B, too. */ 113 + static void maybe_kick_lo(struct drbd_conf *mdev) 114 + { 115 + if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark) 116 + drbd_kick_lo(mdev); 117 + } 118 + 119 + static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) 120 + { 121 + struct drbd_epoch_entry *e; 122 + struct list_head *le, *tle; 123 + 124 + /* The EEs are always appended to the end of the list. Since 125 + they are sent in order over the wire, they have to finish 126 + in order. As soon as we see the first not finished we can 127 + stop to examine the list... */ 128 + 129 + list_for_each_safe(le, tle, &mdev->net_ee) { 130 + e = list_entry(le, struct drbd_epoch_entry, w.list); 131 + if (drbd_bio_has_active_page(e->private_bio)) 132 + break; 133 + list_move(le, to_be_freed); 134 + } 135 + } 136 + 137 + static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) 138 + { 139 + LIST_HEAD(reclaimed); 140 + struct drbd_epoch_entry *e, *t; 141 + 142 + maybe_kick_lo(mdev); 143 + spin_lock_irq(&mdev->req_lock); 144 + reclaim_net_ee(mdev, &reclaimed); 145 + spin_unlock_irq(&mdev->req_lock); 146 + 147 + list_for_each_entry_safe(e, t, &reclaimed, w.list) 148 + drbd_free_ee(mdev, e); 149 + } 150 + 151 + /** 152 + * drbd_pp_alloc() - Returns a page, fails only if a signal comes in 153 + * @mdev: DRBD device. 154 + * @retry: whether or not to retry allocation forever (or until signalled) 155 + * 156 + * Tries to allocate a page, first from our own page pool, then from the 157 + * kernel, unless this allocation would exceed the max_buffers setting. 158 + * If @retry is non-zero, retry until DRBD frees a page somewhere else. 159 + */ 160 + static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) 161 + { 162 + struct page *page = NULL; 163 + DEFINE_WAIT(wait); 164 + 165 + if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { 166 + page = drbd_pp_first_page_or_try_alloc(mdev); 167 + if (page) 168 + return page; 169 + } 170 + 171 + for (;;) { 172 + prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); 173 + 174 + drbd_kick_lo_and_reclaim_net(mdev); 175 + 176 + if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { 177 + page = drbd_pp_first_page_or_try_alloc(mdev); 178 + if (page) 179 + break; 180 + } 181 + 182 + if (!retry) 183 + break; 184 + 185 + if (signal_pending(current)) { 186 + dev_warn(DEV, "drbd_pp_alloc interrupted!\n"); 187 + break; 188 + } 189 + 190 + schedule(); 191 + } 192 + finish_wait(&drbd_pp_wait, &wait); 193 + 194 + return page; 195 + } 196 + 197 + /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. 
198 + * Is also used from inside another spin_lock_irq(&mdev->req_lock) */ 199 + static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) 200 + { 201 + int free_it; 202 + 203 + spin_lock(&drbd_pp_lock); 204 + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { 205 + free_it = 1; 206 + } else { 207 + set_page_private(page, (unsigned long)drbd_pp_pool); 208 + drbd_pp_pool = page; 209 + drbd_pp_vacant++; 210 + free_it = 0; 211 + } 212 + spin_unlock(&drbd_pp_lock); 213 + 214 + atomic_dec(&mdev->pp_in_use); 215 + 216 + if (free_it) 217 + __free_page(page); 218 + 219 + wake_up(&drbd_pp_wait); 220 + } 221 + 222 + static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio) 223 + { 224 + struct page *p_to_be_freed = NULL; 225 + struct page *page; 226 + struct bio_vec *bvec; 227 + int i; 228 + 229 + spin_lock(&drbd_pp_lock); 230 + __bio_for_each_segment(bvec, bio, i, 0) { 231 + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { 232 + set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); 233 + p_to_be_freed = bvec->bv_page; 234 + } else { 235 + set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); 236 + drbd_pp_pool = bvec->bv_page; 237 + drbd_pp_vacant++; 238 + } 239 + } 240 + spin_unlock(&drbd_pp_lock); 241 + atomic_sub(bio->bi_vcnt, &mdev->pp_in_use); 242 + 243 + while (p_to_be_freed) { 244 + page = p_to_be_freed; 245 + p_to_be_freed = (struct page *)page_private(page); 246 + set_page_private(page, 0); /* just to be polite */ 247 + put_page(page); 248 + } 249 + 250 + wake_up(&drbd_pp_wait); 251 + } 252 + 253 + /* 254 + You need to hold the req_lock: 255 + _drbd_wait_ee_list_empty() 256 + 257 + You must not have the req_lock: 258 + drbd_free_ee() 259 + drbd_alloc_ee() 260 + drbd_init_ee() 261 + drbd_release_ee() 262 + drbd_ee_fix_bhs() 263 + drbd_process_done_ee() 264 + drbd_clear_done_ee() 265 + drbd_wait_ee_list_empty() 266 + */ 267 + 268 + struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, 269 + u64 id, 270 + sector_t sector, 271 + unsigned int data_size, 272 + gfp_t gfp_mask) __must_hold(local) 273 + { 274 + struct request_queue *q; 275 + struct drbd_epoch_entry *e; 276 + struct page *page; 277 + struct bio *bio; 278 + unsigned int ds; 279 + 280 + if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) 281 + return NULL; 282 + 283 + e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); 284 + if (!e) { 285 + if (!(gfp_mask & __GFP_NOWARN)) 286 + dev_err(DEV, "alloc_ee: Allocation of an EE failed\n"); 287 + return NULL; 288 + } 289 + 290 + bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); 291 + if (!bio) { 292 + if (!(gfp_mask & __GFP_NOWARN)) 293 + dev_err(DEV, "alloc_ee: Allocation of a bio failed\n"); 294 + goto fail1; 295 + } 296 + 297 + bio->bi_bdev = mdev->ldev->backing_bdev; 298 + bio->bi_sector = sector; 299 + 300 + ds = data_size; 301 + while (ds) { 302 + page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT)); 303 + if (!page) { 304 + if (!(gfp_mask & __GFP_NOWARN)) 305 + dev_err(DEV, "alloc_ee: Allocation of a page failed\n"); 306 + goto fail2; 307 + } 308 + if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { 309 + drbd_pp_free(mdev, page); 310 + dev_err(DEV, "alloc_ee: bio_add_page(s=%llu," 311 + "data_size=%u,ds=%u) failed\n", 312 + (unsigned long long)sector, data_size, ds); 313 + 314 + q = bdev_get_queue(bio->bi_bdev); 315 + if (q->merge_bvec_fn) { 316 + struct bvec_merge_data bvm = { 317 + .bi_bdev = bio->bi_bdev, 318 + .bi_sector = bio->bi_sector, 319 +
.bi_size = bio->bi_size, 320 + .bi_rw = bio->bi_rw, 321 + }; 322 + int l = q->merge_bvec_fn(q, &bvm, 323 + &bio->bi_io_vec[bio->bi_vcnt]); 324 + dev_err(DEV, "merge_bvec_fn() = %d\n", l); 325 + } 326 + 327 + /* dump more of the bio. */ 328 + dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs); 329 + dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt); 330 + dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size); 331 + dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments); 332 + 333 + goto fail2; 334 + break; 335 + } 336 + ds -= min_t(int, ds, PAGE_SIZE); 337 + } 338 + 339 + D_ASSERT(data_size == bio->bi_size); 340 + 341 + bio->bi_private = e; 342 + e->mdev = mdev; 343 + e->sector = sector; 344 + e->size = bio->bi_size; 345 + 346 + e->private_bio = bio; 347 + e->block_id = id; 348 + INIT_HLIST_NODE(&e->colision); 349 + e->epoch = NULL; 350 + e->flags = 0; 351 + 352 + return e; 353 + 354 + fail2: 355 + drbd_pp_free_bio_pages(mdev, bio); 356 + bio_put(bio); 357 + fail1: 358 + mempool_free(e, drbd_ee_mempool); 359 + 360 + return NULL; 361 + } 362 + 363 + void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) 364 + { 365 + struct bio *bio = e->private_bio; 366 + drbd_pp_free_bio_pages(mdev, bio); 367 + bio_put(bio); 368 + D_ASSERT(hlist_unhashed(&e->colision)); 369 + mempool_free(e, drbd_ee_mempool); 370 + } 371 + 372 + int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) 373 + { 374 + LIST_HEAD(work_list); 375 + struct drbd_epoch_entry *e, *t; 376 + int count = 0; 377 + 378 + spin_lock_irq(&mdev->req_lock); 379 + list_splice_init(list, &work_list); 380 + spin_unlock_irq(&mdev->req_lock); 381 + 382 + list_for_each_entry_safe(e, t, &work_list, w.list) { 383 + drbd_free_ee(mdev, e); 384 + count++; 385 + } 386 + return count; 387 + } 388 + 389 + 390 + /* 391 + * This function is called from _asender only_ 392 + * but see also comments in _req_mod(,barrier_acked) 393 + * and receive_Barrier. 394 + * 395 + * Move entries from net_ee to done_ee, if ready. 396 + * Grab done_ee, call all callbacks, free the entries. 397 + * The callbacks typically send out ACKs. 398 + */ 399 + static int drbd_process_done_ee(struct drbd_conf *mdev) 400 + { 401 + LIST_HEAD(work_list); 402 + LIST_HEAD(reclaimed); 403 + struct drbd_epoch_entry *e, *t; 404 + int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS); 405 + 406 + spin_lock_irq(&mdev->req_lock); 407 + reclaim_net_ee(mdev, &reclaimed); 408 + list_splice_init(&mdev->done_ee, &work_list); 409 + spin_unlock_irq(&mdev->req_lock); 410 + 411 + list_for_each_entry_safe(e, t, &reclaimed, w.list) 412 + drbd_free_ee(mdev, e); 413 + 414 + /* possible callbacks here: 415 + * e_end_block, and e_end_resync_block, e_send_discard_ack. 416 + * all ignore the last argument. 
417 + */ 418 + list_for_each_entry_safe(e, t, &work_list, w.list) { 419 + /* list_del not necessary, next/prev members not touched */ 420 + ok = e->w.cb(mdev, &e->w, !ok) && ok; 421 + drbd_free_ee(mdev, e); 422 + } 423 + wake_up(&mdev->ee_wait); 424 + 425 + return ok; 426 + } 427 + 428 + void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) 429 + { 430 + DEFINE_WAIT(wait); 431 + 432 + /* avoids spin_lock/unlock 433 + * and calling prepare_to_wait in the fast path */ 434 + while (!list_empty(head)) { 435 + prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); 436 + spin_unlock_irq(&mdev->req_lock); 437 + drbd_kick_lo(mdev); 438 + schedule(); 439 + finish_wait(&mdev->ee_wait, &wait); 440 + spin_lock_irq(&mdev->req_lock); 441 + } 442 + } 443 + 444 + void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) 445 + { 446 + spin_lock_irq(&mdev->req_lock); 447 + _drbd_wait_ee_list_empty(mdev, head); 448 + spin_unlock_irq(&mdev->req_lock); 449 + } 450 + 451 + /* see also kernel_accept; which is only present since 2.6.18. 452 + * also we want to log which part of it failed, exactly */ 453 + static int drbd_accept(struct drbd_conf *mdev, const char **what, 454 + struct socket *sock, struct socket **newsock) 455 + { 456 + struct sock *sk = sock->sk; 457 + int err = 0; 458 + 459 + *what = "listen"; 460 + err = sock->ops->listen(sock, 5); 461 + if (err < 0) 462 + goto out; 463 + 464 + *what = "sock_create_lite"; 465 + err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, 466 + newsock); 467 + if (err < 0) 468 + goto out; 469 + 470 + *what = "accept"; 471 + err = sock->ops->accept(sock, *newsock, 0); 472 + if (err < 0) { 473 + sock_release(*newsock); 474 + *newsock = NULL; 475 + goto out; 476 + } 477 + (*newsock)->ops = sock->ops; 478 + 479 + out: 480 + return err; 481 + } 482 + 483 + static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, 484 + void *buf, size_t size, int flags) 485 + { 486 + mm_segment_t oldfs; 487 + struct kvec iov = { 488 + .iov_base = buf, 489 + .iov_len = size, 490 + }; 491 + struct msghdr msg = { 492 + .msg_iovlen = 1, 493 + .msg_iov = (struct iovec *)&iov, 494 + .msg_flags = (flags ? 
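/* when the caller passes no flags, default to blocking until the
 * full 'size' bytes have arrived, and do not raise SIGPIPE */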
flags : MSG_WAITALL | MSG_NOSIGNAL) 495 + }; 496 + int rv; 497 + 498 + oldfs = get_fs(); 499 + set_fs(KERNEL_DS); 500 + rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); 501 + set_fs(oldfs); 502 + 503 + return rv; 504 + } 505 + 506 + static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) 507 + { 508 + mm_segment_t oldfs; 509 + struct kvec iov = { 510 + .iov_base = buf, 511 + .iov_len = size, 512 + }; 513 + struct msghdr msg = { 514 + .msg_iovlen = 1, 515 + .msg_iov = (struct iovec *)&iov, 516 + .msg_flags = MSG_WAITALL | MSG_NOSIGNAL 517 + }; 518 + int rv; 519 + 520 + oldfs = get_fs(); 521 + set_fs(KERNEL_DS); 522 + 523 + for (;;) { 524 + rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags); 525 + if (rv == size) 526 + break; 527 + 528 + /* Note: 529 + * ECONNRESET other side closed the connection 530 + * ERESTARTSYS (on sock) we got a signal 531 + */ 532 + 533 + if (rv < 0) { 534 + if (rv == -ECONNRESET) 535 + dev_info(DEV, "sock was reset by peer\n"); 536 + else if (rv != -ERESTARTSYS) 537 + dev_err(DEV, "sock_recvmsg returned %d\n", rv); 538 + break; 539 + } else if (rv == 0) { 540 + dev_info(DEV, "sock was shut down by peer\n"); 541 + break; 542 + } else { 543 + /* signal came in, or peer/link went down, 544 + * after we read a partial message 545 + */ 546 + /* D_ASSERT(signal_pending(current)); */ 547 + break; 548 + } 549 + }; 550 + 551 + set_fs(oldfs); 552 + 553 + if (rv != size) 554 + drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); 555 + 556 + return rv; 557 + } 558 + 559 + static struct socket *drbd_try_connect(struct drbd_conf *mdev) 560 + { 561 + const char *what; 562 + struct socket *sock; 563 + struct sockaddr_in6 src_in6; 564 + int err; 565 + int disconnect_on_error = 1; 566 + 567 + if (!get_net_conf(mdev)) 568 + return NULL; 569 + 570 + what = "sock_create_kern"; 571 + err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, 572 + SOCK_STREAM, IPPROTO_TCP, &sock); 573 + if (err < 0) { 574 + sock = NULL; 575 + goto out; 576 + } 577 + 578 + sock->sk->sk_rcvtimeo = 579 + sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; 580 + 581 + /* explicitly bind to the configured IP as source IP 582 + * for the outgoing connections. 583 + * This is needed for multihomed hosts and to be 584 + * able to use lo: interfaces for drbd. 585 + * Make sure to use 0 as port number, so linux selects 586 + * a free one dynamically. 587 + */ 588 + memcpy(&src_in6, mdev->net_conf->my_addr, 589 + min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6))); 590 + if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6) 591 + src_in6.sin6_port = 0; 592 + else 593 + ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ 594 + 595 + what = "bind before connect"; 596 + err = sock->ops->bind(sock, 597 + (struct sockaddr *) &src_in6, 598 + mdev->net_conf->my_addr_len); 599 + if (err < 0) 600 + goto out; 601 + 602 + /* connect may fail, peer not yet available. 603 + * stay C_WF_CONNECTION, don't go Disconnecting! 
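* (the error switch below treats ECONNREFUSED, EHOSTUNREACH and the
* like as "peer not there yet" and leaves the state alone, so the
* caller keeps retrying)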
*/ 604 + disconnect_on_error = 0; 605 + what = "connect"; 606 + err = sock->ops->connect(sock, 607 + (struct sockaddr *)mdev->net_conf->peer_addr, 608 + mdev->net_conf->peer_addr_len, 0); 609 + 610 + out: 611 + if (err < 0) { 612 + if (sock) { 613 + sock_release(sock); 614 + sock = NULL; 615 + } 616 + switch (-err) { 617 + /* timeout, busy, signal pending */ 618 + case ETIMEDOUT: case EAGAIN: case EINPROGRESS: 619 + case EINTR: case ERESTARTSYS: 620 + /* peer not (yet) available, network problem */ 621 + case ECONNREFUSED: case ENETUNREACH: 622 + case EHOSTDOWN: case EHOSTUNREACH: 623 + disconnect_on_error = 0; 624 + break; 625 + default: 626 + dev_err(DEV, "%s failed, err = %d\n", what, err); 627 + } 628 + if (disconnect_on_error) 629 + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 630 + } 631 + put_net_conf(mdev); 632 + return sock; 633 + } 634 + 635 + static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) 636 + { 637 + int timeo, err; 638 + struct socket *s_estab = NULL, *s_listen; 639 + const char *what; 640 + 641 + if (!get_net_conf(mdev)) 642 + return NULL; 643 + 644 + what = "sock_create_kern"; 645 + err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, 646 + SOCK_STREAM, IPPROTO_TCP, &s_listen); 647 + if (err) { 648 + s_listen = NULL; 649 + goto out; 650 + } 651 + 652 + timeo = mdev->net_conf->try_connect_int * HZ; 653 + timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ 654 + 655 + s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ 656 + s_listen->sk->sk_rcvtimeo = timeo; 657 + s_listen->sk->sk_sndtimeo = timeo; 658 + 659 + what = "bind before listen"; 660 + err = s_listen->ops->bind(s_listen, 661 + (struct sockaddr *) mdev->net_conf->my_addr, 662 + mdev->net_conf->my_addr_len); 663 + if (err < 0) 664 + goto out; 665 + 666 + err = drbd_accept(mdev, &what, s_listen, &s_estab); 667 + 668 + out: 669 + if (s_listen) 670 + sock_release(s_listen); 671 + if (err < 0) { 672 + if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { 673 + dev_err(DEV, "%s failed, err = %d\n", what, err); 674 + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 675 + } 676 + } 677 + put_net_conf(mdev); 678 + 679 + return s_estab; 680 + } 681 + 682 + static int drbd_send_fp(struct drbd_conf *mdev, 683 + struct socket *sock, enum drbd_packets cmd) 684 + { 685 + struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; 686 + 687 + return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); 688 + } 689 + 690 + static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) 691 + { 692 + struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; 693 + int rr; 694 + 695 + rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); 696 + 697 + if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC) 698 + return be16_to_cpu(h->command); 699 + 700 + return 0xffff; 701 + } 702 + 703 + /** 704 + * drbd_socket_okay() - Free the socket if its connection is not okay 705 + * @mdev: DRBD device. 706 + * @sock: pointer to the pointer to the socket. 
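* Peeks at the socket (MSG_DONTWAIT|MSG_PEEK): pending data or -EAGAIN
* means the connection still looks healthy; EOF or any other error
* releases the socket and clears *sock.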
707 + */ 708 + static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) 709 + { 710 + int rr; 711 + char tb[4]; 712 + 713 + if (!*sock) 714 + return FALSE; 715 + 716 + rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); 717 + 718 + if (rr > 0 || rr == -EAGAIN) { 719 + return TRUE; 720 + } else { 721 + sock_release(*sock); 722 + *sock = NULL; 723 + return FALSE; 724 + } 725 + } 726 + 727 + /* 728 + * return values: 729 + * 1 yes, we have a valid connection 730 + * 0 oops, did not work out, please try again 731 + * -1 peer talks different language, 732 + * no point in trying again, please go standalone. 733 + * -2 We do not have a network config... 734 + */ 735 + static int drbd_connect(struct drbd_conf *mdev) 736 + { 737 + struct socket *s, *sock, *msock; 738 + int try, h, ok; 739 + 740 + D_ASSERT(!mdev->data.socket); 741 + 742 + if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) 743 + dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n"); 744 + 745 + if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) 746 + return -2; 747 + 748 + clear_bit(DISCARD_CONCURRENT, &mdev->flags); 749 + 750 + sock = NULL; 751 + msock = NULL; 752 + 753 + do { 754 + for (try = 0;;) { 755 + /* 3 tries, this should take less than a second! */ 756 + s = drbd_try_connect(mdev); 757 + if (s || ++try >= 3) 758 + break; 759 + /* give the other side time to call bind() & listen() */ 760 + __set_current_state(TASK_INTERRUPTIBLE); 761 + schedule_timeout(HZ / 10); 762 + } 763 + 764 + if (s) { 765 + if (!sock) { 766 + drbd_send_fp(mdev, s, P_HAND_SHAKE_S); 767 + sock = s; 768 + s = NULL; 769 + } else if (!msock) { 770 + drbd_send_fp(mdev, s, P_HAND_SHAKE_M); 771 + msock = s; 772 + s = NULL; 773 + } else { 774 + dev_err(DEV, "Logic error in drbd_connect()\n"); 775 + goto out_release_sockets; 776 + } 777 + } 778 + 779 + if (sock && msock) { 780 + __set_current_state(TASK_INTERRUPTIBLE); 781 + schedule_timeout(HZ / 10); 782 + ok = drbd_socket_okay(mdev, &sock); 783 + ok = drbd_socket_okay(mdev, &msock) && ok; 784 + if (ok) 785 + break; 786 + } 787 + 788 + retry: 789 + s = drbd_wait_for_connect(mdev); 790 + if (s) { 791 + try = drbd_recv_fp(mdev, s); 792 + drbd_socket_okay(mdev, &sock); 793 + drbd_socket_okay(mdev, &msock); 794 + switch (try) { 795 + case P_HAND_SHAKE_S: 796 + if (sock) { 797 + dev_warn(DEV, "initial packet S crossed\n"); 798 + sock_release(sock); 799 + } 800 + sock = s; 801 + break; 802 + case P_HAND_SHAKE_M: 803 + if (msock) { 804 + dev_warn(DEV, "initial packet M crossed\n"); 805 + sock_release(msock); 806 + } 807 + msock = s; 808 + set_bit(DISCARD_CONCURRENT, &mdev->flags); 809 + break; 810 + default: 811 + dev_warn(DEV, "Error receiving initial packet\n"); 812 + sock_release(s); 813 + if (random32() & 1) 814 + goto retry; 815 + } 816 + } 817 + 818 + if (mdev->state.conn <= C_DISCONNECTING) 819 + goto out_release_sockets; 820 + if (signal_pending(current)) { 821 + flush_signals(current); 822 + smp_rmb(); 823 + if (get_t_state(&mdev->receiver) == Exiting) 824 + goto out_release_sockets; 825 + } 826 + 827 + if (sock && msock) { 828 + ok = drbd_socket_okay(mdev, &sock); 829 + ok = drbd_socket_okay(mdev, &msock) && ok; 830 + if (ok) 831 + break; 832 + } 833 + } while (1); 834 + 835 + msock->sk->sk_reuse = 1; /* SO_REUSEADDR */ 836 + sock->sk->sk_reuse = 1; /* SO_REUSEADDR */ 837 + 838 + sock->sk->sk_allocation = GFP_NOIO; 839 + msock->sk->sk_allocation = GFP_NOIO; 840 + 841 + sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; 842 + 
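/* the data socket carries the bulk payload, the meta socket ("msock")
 * carries ACKs and pings; the interactive priority below keeps those
 * small packets from queueing behind large data transfers */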
msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
843 +
844 + if (mdev->net_conf->sndbuf_size) {
845 + sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
846 + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
847 + }
848 +
849 + if (mdev->net_conf->rcvbuf_size) {
850 + sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size;
851 + sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
852 + }
853 +
854 + /* NOT YET ...
855 + * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
856 + * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
857 + * first set it to the P_HAND_SHAKE timeout,
858 + * which we set to 4x the configured ping_timeout. */
859 + sock->sk->sk_sndtimeo =
860 + sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
861 +
862 + msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
863 + msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
864 +
865 + /* we don't want delays.
866 + * we use TCP_CORK where appropriate, though */
867 + drbd_tcp_nodelay(sock);
868 + drbd_tcp_nodelay(msock);
869 +
870 + mdev->data.socket = sock;
871 + mdev->meta.socket = msock;
872 + mdev->last_received = jiffies;
873 +
874 + D_ASSERT(mdev->asender.task == NULL);
875 +
876 + h = drbd_do_handshake(mdev);
877 + if (h <= 0)
878 + return h;
879 +
880 + if (mdev->cram_hmac_tfm) {
881 + /* drbd_request_state(mdev, NS(conn, WFAuth)); */
882 + if (!drbd_do_auth(mdev)) {
883 + dev_err(DEV, "Authentication of peer failed\n");
884 + return -1;
885 + }
886 + }
887 +
888 + if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
889 + return 0;
890 +
891 + sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
892 + sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
893 +
894 + atomic_set(&mdev->packet_seq, 0);
895 + mdev->peer_seq = 0;
896 +
897 + drbd_thread_start(&mdev->asender);
898 +
899 + drbd_send_protocol(mdev);
900 + drbd_send_sync_param(mdev, &mdev->sync_conf);
901 + drbd_send_sizes(mdev, 0);
902 + drbd_send_uuids(mdev);
903 + drbd_send_state(mdev);
904 + clear_bit(USE_DEGR_WFC_T, &mdev->flags);
905 + clear_bit(RESIZE_PENDING, &mdev->flags);
906 +
907 + return 1;
908 +
909 + out_release_sockets:
910 + if (sock)
911 + sock_release(sock);
912 + if (msock)
913 + sock_release(msock);
914 + return -1;
915 + }
916 +
917 + static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
918 + {
919 + int r;
920 +
921 + r = drbd_recv(mdev, h, sizeof(*h));
922 +
923 + if (unlikely(r != sizeof(*h))) {
924 + dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
925 + return FALSE;
926 + }
927 + h->command = be16_to_cpu(h->command);
928 + h->length = be16_to_cpu(h->length);
929 + if (unlikely(h->magic != BE_DRBD_MAGIC)) {
930 + dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
931 + (long)be32_to_cpu(h->magic),
932 + h->command, h->length);
933 + return FALSE;
934 + }
935 + mdev->last_received = jiffies;
936 +
937 + return TRUE;
938 + }
939 +
940 + static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
941 + {
942 + int rv;
943 +
944 + if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
945 + rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
946 + if (rv) {
947 + dev_err(DEV, "local disk flush failed with status %d\n", rv);
948 + /* would rather check on EOPNOTSUPP, but that is not reliable.
949 + * don't try again for ANY return value != 0 950 + * if (rv == -EOPNOTSUPP) */ 951 + drbd_bump_write_ordering(mdev, WO_drain_io); 952 + } 953 + put_ldev(mdev); 954 + } 955 + 956 + return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); 957 + } 958 + 959 + static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 960 + { 961 + struct flush_work *fw = (struct flush_work *)w; 962 + struct drbd_epoch *epoch = fw->epoch; 963 + 964 + kfree(w); 965 + 966 + if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags)) 967 + drbd_flush_after_epoch(mdev, epoch); 968 + 969 + drbd_may_finish_epoch(mdev, epoch, EV_PUT | 970 + (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0)); 971 + 972 + return 1; 973 + } 974 + 975 + /** 976 + * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. 977 + * @mdev: DRBD device. 978 + * @epoch: Epoch object. 979 + * @ev: Epoch event. 980 + */ 981 + static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, 982 + struct drbd_epoch *epoch, 983 + enum epoch_event ev) 984 + { 985 + int finish, epoch_size; 986 + struct drbd_epoch *next_epoch; 987 + int schedule_flush = 0; 988 + enum finish_epoch rv = FE_STILL_LIVE; 989 + 990 + spin_lock(&mdev->epoch_lock); 991 + do { 992 + next_epoch = NULL; 993 + finish = 0; 994 + 995 + epoch_size = atomic_read(&epoch->epoch_size); 996 + 997 + switch (ev & ~EV_CLEANUP) { 998 + case EV_PUT: 999 + atomic_dec(&epoch->active); 1000 + break; 1001 + case EV_GOT_BARRIER_NR: 1002 + set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); 1003 + 1004 + /* Special case: If we just switched from WO_bio_barrier to 1005 + WO_bdev_flush we should not finish the current epoch */ 1006 + if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 && 1007 + mdev->write_ordering != WO_bio_barrier && 1008 + epoch == mdev->current_epoch) 1009 + clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags); 1010 + break; 1011 + case EV_BARRIER_DONE: 1012 + set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags); 1013 + break; 1014 + case EV_BECAME_LAST: 1015 + /* nothing to do*/ 1016 + break; 1017 + } 1018 + 1019 + if (epoch_size != 0 && 1020 + atomic_read(&epoch->active) == 0 && 1021 + test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && 1022 + epoch->list.prev == &mdev->current_epoch->list && 1023 + !test_bit(DE_IS_FINISHING, &epoch->flags)) { 1024 + /* Nearly all conditions are met to finish that epoch... 
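* (it has seen at least one write, none of its requests are still in
* flight, we know its barrier number, it is the oldest epoch in the
* list, and nobody else is finishing it yet)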
*/
1025 + if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1026 + mdev->write_ordering == WO_none ||
1027 + (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1028 + ev & EV_CLEANUP) {
1029 + finish = 1;
1030 + set_bit(DE_IS_FINISHING, &epoch->flags);
1031 + } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1032 + mdev->write_ordering == WO_bio_barrier) {
1033 + atomic_inc(&epoch->active);
1034 + schedule_flush = 1;
1035 + }
1036 + }
1037 + if (finish) {
1038 + if (!(ev & EV_CLEANUP)) {
1039 + spin_unlock(&mdev->epoch_lock);
1040 + drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
1041 + spin_lock(&mdev->epoch_lock);
1042 + }
1043 + dec_unacked(mdev);
1044 +
1045 + if (mdev->current_epoch != epoch) {
1046 + next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1047 + list_del(&epoch->list);
1048 + ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1049 + mdev->epochs--;
1050 + kfree(epoch);
1051 +
1052 + if (rv == FE_STILL_LIVE)
1053 + rv = FE_DESTROYED;
1054 + } else {
1055 + epoch->flags = 0;
1056 + atomic_set(&epoch->epoch_size, 0);
1057 + /* atomic_set(&epoch->active, 0); is already zero */
1058 + if (rv == FE_STILL_LIVE)
1059 + rv = FE_RECYCLED;
1060 + }
1061 + }
1062 +
1063 + if (!next_epoch)
1064 + break;
1065 +
1066 + epoch = next_epoch;
1067 + } while (1);
1068 +
1069 + spin_unlock(&mdev->epoch_lock);
1070 +
1071 + if (schedule_flush) {
1072 + struct flush_work *fw;
1073 + fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1074 + if (fw) {
1075 + fw->w.cb = w_flush;
1076 + fw->epoch = epoch;
1077 + drbd_queue_work(&mdev->data.work, &fw->w);
1078 + } else {
1079 + dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1080 + set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1081 + /* That is not a recursion, only one level */
1082 + drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1083 + drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1084 + }
1085 + }
1086 +
1087 + return rv;
1088 + }
1089 +
1090 + /**
1091 + * drbd_bump_write_ordering() - Fall back to another write ordering method
1092 + * @mdev: DRBD device.
1093 + * @wo: Write ordering method to try.
1094 + */
1095 + void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
1096 + {
1097 + enum write_ordering_e pwo;
1098 + static char *write_ordering_str[] = {
1099 + [WO_none] = "none",
1100 + [WO_drain_io] = "drain",
1101 + [WO_bdev_flush] = "flush",
1102 + [WO_bio_barrier] = "barrier",
1103 + };
1104 +
1105 + pwo = mdev->write_ordering;
1106 + wo = min(pwo, wo);
1107 + if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
1108 + wo = WO_bdev_flush;
1109 + if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1110 + wo = WO_drain_io;
1111 + if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1112 + wo = WO_none;
1113 + mdev->write_ordering = wo;
1114 + if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
1115 + dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1116 + }
1117 +
1118 + /**
1119 + * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
1120 + * @mdev: DRBD device.
1121 + * @w: work object.
1122 + * @cancel: The connection will be closed anyways (unused in this callback) 1123 + */ 1124 + int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) 1125 + { 1126 + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; 1127 + struct bio *bio = e->private_bio; 1128 + 1129 + /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, 1130 + (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) 1131 + so that we can finish that epoch in drbd_may_finish_epoch(). 1132 + That is necessary if we already have a long chain of Epochs, before 1133 + we realize that BIO_RW_BARRIER is actually not supported */ 1134 + 1135 + /* As long as the -ENOTSUPP on the barrier is reported immediately 1136 + that will never trigger. If it is reported late, we will just 1137 + print that warning and continue correctly for all future requests 1138 + with WO_bdev_flush */ 1139 + if (previous_epoch(mdev, e->epoch)) 1140 + dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); 1141 + 1142 + /* prepare bio for re-submit, 1143 + * re-init volatile members */ 1144 + /* we still have a local reference, 1145 + * get_ldev was done in receive_Data. */ 1146 + bio->bi_bdev = mdev->ldev->backing_bdev; 1147 + bio->bi_sector = e->sector; 1148 + bio->bi_size = e->size; 1149 + bio->bi_idx = 0; 1150 + 1151 + bio->bi_flags &= ~(BIO_POOL_MASK - 1); 1152 + bio->bi_flags |= 1 << BIO_UPTODATE; 1153 + 1154 + /* don't know whether this is necessary: */ 1155 + bio->bi_phys_segments = 0; 1156 + bio->bi_next = NULL; 1157 + 1158 + /* these should be unchanged: */ 1159 + /* bio->bi_end_io = drbd_endio_write_sec; */ 1160 + /* bio->bi_vcnt = whatever; */ 1161 + 1162 + e->w.cb = e_end_block; 1163 + 1164 + /* This is no longer a barrier request. */ 1165 + bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); 1166 + 1167 + drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); 1168 + 1169 + return 1; 1170 + } 1171 + 1172 + static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) 1173 + { 1174 + int rv, issue_flush; 1175 + struct p_barrier *p = (struct p_barrier *)h; 1176 + struct drbd_epoch *epoch; 1177 + 1178 + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; 1179 + 1180 + rv = drbd_recv(mdev, h->payload, h->length); 1181 + ERR_IF(rv != h->length) return FALSE; 1182 + 1183 + inc_unacked(mdev); 1184 + 1185 + if (mdev->net_conf->wire_protocol != DRBD_PROT_C) 1186 + drbd_kick_lo(mdev); 1187 + 1188 + mdev->current_epoch->barrier_nr = p->barrier; 1189 + rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); 1190 + 1191 + /* P_BARRIER_ACK may imply that the corresponding extent is dropped from 1192 + * the activity log, which means it would not be resynced in case the 1193 + * R_PRIMARY crashes now. 1194 + * Therefore we must send the barrier_ack after the barrier request was 1195 + * completed. */ 1196 + switch (mdev->write_ordering) { 1197 + case WO_bio_barrier: 1198 + case WO_none: 1199 + if (rv == FE_RECYCLED) 1200 + return TRUE; 1201 + break; 1202 + 1203 + case WO_bdev_flush: 1204 + case WO_drain_io: 1205 + D_ASSERT(rv == FE_STILL_LIVE); 1206 + set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); 1207 + drbd_wait_ee_list_empty(mdev, &mdev->active_ee); 1208 + rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); 1209 + if (rv == FE_RECYCLED) 1210 + return TRUE; 1211 + 1212 + /* The asender will send all the ACKs and barrier ACKs out, since 1213 + all EEs moved from the active_ee to the done_ee. 
We need to
1214 + provide a new epoch object for the EEs that come in soon */
1215 + break;
1216 + }
1217 +
1218 + /* receiver context, in the writeout path of the other node.
1219 + * avoid potential distributed deadlock */
1220 + epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1221 + if (!epoch) {
1222 + dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1223 + issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); /* not &epoch->flags: epoch is NULL here */
1224 + drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1225 + if (issue_flush) {
1226 + rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1227 + if (rv == FE_RECYCLED)
1228 + return TRUE;
1229 + }
1230 +
1231 + drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
1232 +
1233 + return TRUE;
1234 + }
1235 +
1236 + epoch->flags = 0;
1237 + atomic_set(&epoch->epoch_size, 0);
1238 + atomic_set(&epoch->active, 0);
1239 +
1240 + spin_lock(&mdev->epoch_lock);
1241 + if (atomic_read(&mdev->current_epoch->epoch_size)) {
1242 + list_add(&epoch->list, &mdev->current_epoch->list);
1243 + mdev->current_epoch = epoch;
1244 + mdev->epochs++;
1245 + } else {
1246 + /* The current_epoch got recycled while we allocated this one... */
1247 + kfree(epoch);
1248 + }
1249 + spin_unlock(&mdev->epoch_lock);
1250 +
1251 + return TRUE;
1252 + }
1253 +
1254 + /* used from receive_RSDataReply (recv_resync_read)
1255 + * and from receive_Data */
1256 + static struct drbd_epoch_entry *
1257 + read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
1258 + {
1259 + struct drbd_epoch_entry *e;
1260 + struct bio_vec *bvec;
1261 + struct page *page;
1262 + struct bio *bio;
1263 + int dgs, ds, i, rr;
1264 + void *dig_in = mdev->int_dig_in;
1265 + void *dig_vv = mdev->int_dig_vv;
1266 +
1267 + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1268 + crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1269 +
1270 + if (dgs) {
1271 + rr = drbd_recv(mdev, dig_in, dgs);
1272 + if (rr != dgs) {
1273 + dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
1274 + rr, dgs);
1275 + return NULL;
1276 + }
1277 + }
1278 +
1279 + data_size -= dgs;
1280 +
1281 + ERR_IF(data_size & 0x1ff) return NULL;
1282 + ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL;
1283 +
1284 + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1285 + * "criss-cross" setup, that might cause write-out on some other DRBD,
1286 + * which in turn might block on the other node at this very place.
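* (picture two nodes that each run DRBD on top of a device exported by
* the other one; an allocation that is allowed to trigger write-out
* could then recurse into I/O that ends up waiting on this receiver)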
*/ 1287 + e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); 1288 + if (!e) 1289 + return NULL; 1290 + bio = e->private_bio; 1291 + ds = data_size; 1292 + bio_for_each_segment(bvec, bio, i) { 1293 + page = bvec->bv_page; 1294 + rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); 1295 + kunmap(page); 1296 + if (rr != min_t(int, ds, PAGE_SIZE)) { 1297 + drbd_free_ee(mdev, e); 1298 + dev_warn(DEV, "short read receiving data: read %d expected %d\n", 1299 + rr, min_t(int, ds, PAGE_SIZE)); 1300 + return NULL; 1301 + } 1302 + ds -= rr; 1303 + } 1304 + 1305 + if (dgs) { 1306 + drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); 1307 + if (memcmp(dig_in, dig_vv, dgs)) { 1308 + dev_err(DEV, "Digest integrity check FAILED.\n"); 1309 + drbd_bcast_ee(mdev, "digest failed", 1310 + dgs, dig_in, dig_vv, e); 1311 + drbd_free_ee(mdev, e); 1312 + return NULL; 1313 + } 1314 + } 1315 + mdev->recv_cnt += data_size>>9; 1316 + return e; 1317 + } 1318 + 1319 + /* drbd_drain_block() just takes a data block 1320 + * out of the socket input buffer, and discards it. 1321 + */ 1322 + static int drbd_drain_block(struct drbd_conf *mdev, int data_size) 1323 + { 1324 + struct page *page; 1325 + int rr, rv = 1; 1326 + void *data; 1327 + 1328 + page = drbd_pp_alloc(mdev, 1); 1329 + 1330 + data = kmap(page); 1331 + while (data_size) { 1332 + rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); 1333 + if (rr != min_t(int, data_size, PAGE_SIZE)) { 1334 + rv = 0; 1335 + dev_warn(DEV, "short read receiving data: read %d expected %d\n", 1336 + rr, min_t(int, data_size, PAGE_SIZE)); 1337 + break; 1338 + } 1339 + data_size -= rr; 1340 + } 1341 + kunmap(page); 1342 + drbd_pp_free(mdev, page); 1343 + return rv; 1344 + } 1345 + 1346 + static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, 1347 + sector_t sector, int data_size) 1348 + { 1349 + struct bio_vec *bvec; 1350 + struct bio *bio; 1351 + int dgs, rr, i, expect; 1352 + void *dig_in = mdev->int_dig_in; 1353 + void *dig_vv = mdev->int_dig_vv; 1354 + 1355 + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? 1356 + crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; 1357 + 1358 + if (dgs) { 1359 + rr = drbd_recv(mdev, dig_in, dgs); 1360 + if (rr != dgs) { 1361 + dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n", 1362 + rr, dgs); 1363 + return 0; 1364 + } 1365 + } 1366 + 1367 + data_size -= dgs; 1368 + 1369 + /* optimistically update recv_cnt. if receiving fails below, 1370 + * we disconnect anyways, and counters will be reset. */ 1371 + mdev->recv_cnt += data_size>>9; 1372 + 1373 + bio = req->master_bio; 1374 + D_ASSERT(sector == bio->bi_sector); 1375 + 1376 + bio_for_each_segment(bvec, bio, i) { 1377 + expect = min_t(int, data_size, bvec->bv_len); 1378 + rr = drbd_recv(mdev, 1379 + kmap(bvec->bv_page)+bvec->bv_offset, 1380 + expect); 1381 + kunmap(bvec->bv_page); 1382 + if (rr != expect) { 1383 + dev_warn(DEV, "short read receiving data reply: " 1384 + "read %d expected %d\n", 1385 + rr, expect); 1386 + return 0; 1387 + } 1388 + data_size -= rr; 1389 + } 1390 + 1391 + if (dgs) { 1392 + drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); 1393 + if (memcmp(dig_in, dig_vv, dgs)) { 1394 + dev_err(DEV, "Digest integrity check FAILED. 
Broken NICs?\n"); 1395 + return 0; 1396 + } 1397 + } 1398 + 1399 + D_ASSERT(data_size == 0); 1400 + return 1; 1401 + } 1402 + 1403 + /* e_end_resync_block() is called via 1404 + * drbd_process_done_ee() by asender only */ 1405 + static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused) 1406 + { 1407 + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; 1408 + sector_t sector = e->sector; 1409 + int ok; 1410 + 1411 + D_ASSERT(hlist_unhashed(&e->colision)); 1412 + 1413 + if (likely(drbd_bio_uptodate(e->private_bio))) { 1414 + drbd_set_in_sync(mdev, sector, e->size); 1415 + ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); 1416 + } else { 1417 + /* Record failure to sync */ 1418 + drbd_rs_failed_io(mdev, sector, e->size); 1419 + 1420 + ok = drbd_send_ack(mdev, P_NEG_ACK, e); 1421 + } 1422 + dec_unacked(mdev); 1423 + 1424 + return ok; 1425 + } 1426 + 1427 + static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) 1428 + { 1429 + struct drbd_epoch_entry *e; 1430 + 1431 + e = read_in_block(mdev, ID_SYNCER, sector, data_size); 1432 + if (!e) { 1433 + put_ldev(mdev); 1434 + return FALSE; 1435 + } 1436 + 1437 + dec_rs_pending(mdev); 1438 + 1439 + e->private_bio->bi_end_io = drbd_endio_write_sec; 1440 + e->private_bio->bi_rw = WRITE; 1441 + e->w.cb = e_end_resync_block; 1442 + 1443 + inc_unacked(mdev); 1444 + /* corresponding dec_unacked() in e_end_resync_block() 1445 + * respective _drbd_clear_done_ee */ 1446 + 1447 + spin_lock_irq(&mdev->req_lock); 1448 + list_add(&e->w.list, &mdev->sync_ee); 1449 + spin_unlock_irq(&mdev->req_lock); 1450 + 1451 + drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); 1452 + /* accounting done in endio */ 1453 + 1454 + maybe_kick_lo(mdev); 1455 + return TRUE; 1456 + } 1457 + 1458 + static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) 1459 + { 1460 + struct drbd_request *req; 1461 + sector_t sector; 1462 + unsigned int header_size, data_size; 1463 + int ok; 1464 + struct p_data *p = (struct p_data *)h; 1465 + 1466 + header_size = sizeof(*p) - sizeof(*h); 1467 + data_size = h->length - header_size; 1468 + 1469 + ERR_IF(data_size == 0) return FALSE; 1470 + 1471 + if (drbd_recv(mdev, h->payload, header_size) != header_size) 1472 + return FALSE; 1473 + 1474 + sector = be64_to_cpu(p->sector); 1475 + 1476 + spin_lock_irq(&mdev->req_lock); 1477 + req = _ar_id_to_req(mdev, p->block_id, sector); 1478 + spin_unlock_irq(&mdev->req_lock); 1479 + if (unlikely(!req)) { 1480 + dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); 1481 + return FALSE; 1482 + } 1483 + 1484 + /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid 1485 + * special casing it there for the various failure cases. 1486 + * still no race with drbd_fail_pending_reads */ 1487 + ok = recv_dless_read(mdev, req, sector, data_size); 1488 + 1489 + if (ok) 1490 + req_mod(req, data_received); 1491 + /* else: nothing. handled from drbd_disconnect... 
1492 + * I don't think we may complete this just yet 1493 + * in case we are "on-disconnect: freeze" */ 1494 + 1495 + return ok; 1496 + } 1497 + 1498 + static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) 1499 + { 1500 + sector_t sector; 1501 + unsigned int header_size, data_size; 1502 + int ok; 1503 + struct p_data *p = (struct p_data *)h; 1504 + 1505 + header_size = sizeof(*p) - sizeof(*h); 1506 + data_size = h->length - header_size; 1507 + 1508 + ERR_IF(data_size == 0) return FALSE; 1509 + 1510 + if (drbd_recv(mdev, h->payload, header_size) != header_size) 1511 + return FALSE; 1512 + 1513 + sector = be64_to_cpu(p->sector); 1514 + D_ASSERT(p->block_id == ID_SYNCER); 1515 + 1516 + if (get_ldev(mdev)) { 1517 + /* data is submitted to disk within recv_resync_read. 1518 + * corresponding put_ldev done below on error, 1519 + * or in drbd_endio_write_sec. */ 1520 + ok = recv_resync_read(mdev, sector, data_size); 1521 + } else { 1522 + if (__ratelimit(&drbd_ratelimit_state)) 1523 + dev_err(DEV, "Can not write resync data to local disk.\n"); 1524 + 1525 + ok = drbd_drain_block(mdev, data_size); 1526 + 1527 + drbd_send_ack_dp(mdev, P_NEG_ACK, p); 1528 + } 1529 + 1530 + return ok; 1531 + } 1532 + 1533 + /* e_end_block() is called via drbd_process_done_ee(). 1534 + * this means this function only runs in the asender thread 1535 + */ 1536 + static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1537 + { 1538 + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; 1539 + sector_t sector = e->sector; 1540 + struct drbd_epoch *epoch; 1541 + int ok = 1, pcmd; 1542 + 1543 + if (e->flags & EE_IS_BARRIER) { 1544 + epoch = previous_epoch(mdev, e->epoch); 1545 + if (epoch) 1546 + drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0)); 1547 + } 1548 + 1549 + if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { 1550 + if (likely(drbd_bio_uptodate(e->private_bio))) { 1551 + pcmd = (mdev->state.conn >= C_SYNC_SOURCE && 1552 + mdev->state.conn <= C_PAUSED_SYNC_T && 1553 + e->flags & EE_MAY_SET_IN_SYNC) ? 1554 + P_RS_WRITE_ACK : P_WRITE_ACK; 1555 + ok &= drbd_send_ack(mdev, pcmd, e); 1556 + if (pcmd == P_RS_WRITE_ACK) 1557 + drbd_set_in_sync(mdev, sector, e->size); 1558 + } else { 1559 + ok = drbd_send_ack(mdev, P_NEG_ACK, e); 1560 + /* we expect it to be marked out of sync anyways... 1561 + * maybe assert this? */ 1562 + } 1563 + dec_unacked(mdev); 1564 + } 1565 + /* we delete from the conflict detection hash _after_ we sent out the 1566 + * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ 1567 + if (mdev->net_conf->two_primaries) { 1568 + spin_lock_irq(&mdev->req_lock); 1569 + D_ASSERT(!hlist_unhashed(&e->colision)); 1570 + hlist_del_init(&e->colision); 1571 + spin_unlock_irq(&mdev->req_lock); 1572 + } else { 1573 + D_ASSERT(hlist_unhashed(&e->colision)); 1574 + } 1575 + 1576 + drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? 
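/* on cancel the connection is going away; EV_CLEANUP lets the epoch
 * finish without sending a P_BARRIER_ACK */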
EV_CLEANUP : 0)); 1577 + 1578 + return ok; 1579 + } 1580 + 1581 + static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused) 1582 + { 1583 + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; 1584 + int ok = 1; 1585 + 1586 + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); 1587 + ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); 1588 + 1589 + spin_lock_irq(&mdev->req_lock); 1590 + D_ASSERT(!hlist_unhashed(&e->colision)); 1591 + hlist_del_init(&e->colision); 1592 + spin_unlock_irq(&mdev->req_lock); 1593 + 1594 + dec_unacked(mdev); 1595 + 1596 + return ok; 1597 + } 1598 + 1599 + /* Called from receive_Data. 1600 + * Synchronize packets on sock with packets on msock. 1601 + * 1602 + * This is here so even when a P_DATA packet traveling via sock overtook an Ack 1603 + * packet traveling on msock, they are still processed in the order they have 1604 + * been sent. 1605 + * 1606 + * Note: we don't care for Ack packets overtaking P_DATA packets. 1607 + * 1608 + * In case packet_seq is larger than mdev->peer_seq number, there are 1609 + * outstanding packets on the msock. We wait for them to arrive. 1610 + * In case we are the logically next packet, we update mdev->peer_seq 1611 + * ourselves. Correctly handles 32bit wrap around. 1612 + * 1613 + * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, 1614 + * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds 1615 + * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have 1616 + * 1<<9 == 512 seconds aka ages for the 32bit wrap around... 1617 + * 1618 + * returns 0 if we may process the packet, 1619 + * -ERESTARTSYS if we were interrupted (by disconnect signal). */ 1620 + static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) 1621 + { 1622 + DEFINE_WAIT(wait); 1623 + unsigned int p_seq; 1624 + long timeout; 1625 + int ret = 0; 1626 + spin_lock(&mdev->peer_seq_lock); 1627 + for (;;) { 1628 + prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); 1629 + if (seq_le(packet_seq, mdev->peer_seq+1)) 1630 + break; 1631 + if (signal_pending(current)) { 1632 + ret = -ERESTARTSYS; 1633 + break; 1634 + } 1635 + p_seq = mdev->peer_seq; 1636 + spin_unlock(&mdev->peer_seq_lock); 1637 + timeout = schedule_timeout(30*HZ); 1638 + spin_lock(&mdev->peer_seq_lock); 1639 + if (timeout == 0 && p_seq == mdev->peer_seq) { 1640 + ret = -ETIMEDOUT; 1641 + dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n"); 1642 + break; 1643 + } 1644 + } 1645 + finish_wait(&mdev->seq_wait, &wait); 1646 + if (mdev->peer_seq+1 == packet_seq) 1647 + mdev->peer_seq++; 1648 + spin_unlock(&mdev->peer_seq_lock); 1649 + return ret; 1650 + } 1651 + 1652 + /* mirrored write */ 1653 + static int receive_Data(struct drbd_conf *mdev, struct p_header *h) 1654 + { 1655 + sector_t sector; 1656 + struct drbd_epoch_entry *e; 1657 + struct p_data *p = (struct p_data *)h; 1658 + int header_size, data_size; 1659 + int rw = WRITE; 1660 + u32 dp_flags; 1661 + 1662 + header_size = sizeof(*p) - sizeof(*h); 1663 + data_size = h->length - header_size; 1664 + 1665 + ERR_IF(data_size == 0) return FALSE; 1666 + 1667 + if (drbd_recv(mdev, h->payload, header_size) != header_size) 1668 + return FALSE; 1669 + 1670 + if (!get_ldev(mdev)) { 1671 + if (__ratelimit(&drbd_ratelimit_state)) 1672 + dev_err(DEV, "Can not write mirrored data block " 1673 + "to local disk.\n"); 1674 + spin_lock(&mdev->peer_seq_lock); 1675 + if (mdev->peer_seq+1 == 
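/* we will not write this block, but still advance the sequence number
 * if we are the expected next packet, so that later packets do not
 * stall in drbd_wait_peer_seq() waiting for us */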
be32_to_cpu(p->seq_num)) 1676 + mdev->peer_seq++; 1677 + spin_unlock(&mdev->peer_seq_lock); 1678 + 1679 + drbd_send_ack_dp(mdev, P_NEG_ACK, p); 1680 + atomic_inc(&mdev->current_epoch->epoch_size); 1681 + return drbd_drain_block(mdev, data_size); 1682 + } 1683 + 1684 + /* get_ldev(mdev) successful. 1685 + * Corresponding put_ldev done either below (on various errors), 1686 + * or in drbd_endio_write_sec, if we successfully submit the data at 1687 + * the end of this function. */ 1688 + 1689 + sector = be64_to_cpu(p->sector); 1690 + e = read_in_block(mdev, p->block_id, sector, data_size); 1691 + if (!e) { 1692 + put_ldev(mdev); 1693 + return FALSE; 1694 + } 1695 + 1696 + e->private_bio->bi_end_io = drbd_endio_write_sec; 1697 + e->w.cb = e_end_block; 1698 + 1699 + spin_lock(&mdev->epoch_lock); 1700 + e->epoch = mdev->current_epoch; 1701 + atomic_inc(&e->epoch->epoch_size); 1702 + atomic_inc(&e->epoch->active); 1703 + 1704 + if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) { 1705 + struct drbd_epoch *epoch; 1706 + /* Issue a barrier if we start a new epoch, and the previous epoch 1707 + was not a epoch containing a single request which already was 1708 + a Barrier. */ 1709 + epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list); 1710 + if (epoch == e->epoch) { 1711 + set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); 1712 + rw |= (1<<BIO_RW_BARRIER); 1713 + e->flags |= EE_IS_BARRIER; 1714 + } else { 1715 + if (atomic_read(&epoch->epoch_size) > 1 || 1716 + !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) { 1717 + set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); 1718 + set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); 1719 + rw |= (1<<BIO_RW_BARRIER); 1720 + e->flags |= EE_IS_BARRIER; 1721 + } 1722 + } 1723 + } 1724 + spin_unlock(&mdev->epoch_lock); 1725 + 1726 + dp_flags = be32_to_cpu(p->dp_flags); 1727 + if (dp_flags & DP_HARDBARRIER) { 1728 + dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n"); 1729 + /* rw |= (1<<BIO_RW_BARRIER); */ 1730 + } 1731 + if (dp_flags & DP_RW_SYNC) 1732 + rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); 1733 + if (dp_flags & DP_MAY_SET_IN_SYNC) 1734 + e->flags |= EE_MAY_SET_IN_SYNC; 1735 + 1736 + /* I'm the receiver, I do hold a net_cnt reference. */ 1737 + if (!mdev->net_conf->two_primaries) { 1738 + spin_lock_irq(&mdev->req_lock); 1739 + } else { 1740 + /* don't get the req_lock yet, 1741 + * we may sleep in drbd_wait_peer_seq */ 1742 + const int size = e->size; 1743 + const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags); 1744 + DEFINE_WAIT(wait); 1745 + struct drbd_request *i; 1746 + struct hlist_node *n; 1747 + struct hlist_head *slot; 1748 + int first; 1749 + 1750 + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); 1751 + BUG_ON(mdev->ee_hash == NULL); 1752 + BUG_ON(mdev->tl_hash == NULL); 1753 + 1754 + /* conflict detection and handling: 1755 + * 1. wait on the sequence number, 1756 + * in case this data packet overtook ACK packets. 1757 + * 2. check our hash tables for conflicting requests. 1758 + * we only need to walk the tl_hash, since an ee can not 1759 + * have a conflict with an other ee: on the submitting 1760 + * node, the corresponding req had already been conflicting, 1761 + * and a conflicting req is never sent. 1762 + * 1763 + * Note: for two_primaries, we are protocol C, 1764 + * so there cannot be any request that is DONE 1765 + * but still on the transfer log. 1766 + * 1767 + * unconditionally add to the ee_hash. 
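* ("conflicting" means the sector ranges overlap; the OVERLAPS helper
* below is, roughly, an interval intersection test on
* [sector, sector + (size>>9)) against each pending request)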
1768 + * 1769 + * if no conflicting request is found: 1770 + * submit. 1771 + * 1772 + * if any conflicting request is found 1773 + * that has not yet been acked, 1774 + * AND I have the "discard concurrent writes" flag: 1775 + * queue (via done_ee) the P_DISCARD_ACK; OUT. 1776 + * 1777 + * if any conflicting request is found: 1778 + * block the receiver, waiting on misc_wait 1779 + * until no more conflicting requests are there, 1780 + * or we get interrupted (disconnect). 1781 + * 1782 + * we do not just write after local io completion of those 1783 + * requests, but only after req is done completely, i.e. 1784 + * we wait for the P_DISCARD_ACK to arrive! 1785 + * 1786 + * then proceed normally, i.e. submit. 1787 + */ 1788 + if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) 1789 + goto out_interrupted; 1790 + 1791 + spin_lock_irq(&mdev->req_lock); 1792 + 1793 + hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); 1794 + 1795 + #define OVERLAPS overlaps(i->sector, i->size, sector, size) 1796 + slot = tl_hash_slot(mdev, sector); 1797 + first = 1; 1798 + for (;;) { 1799 + int have_unacked = 0; 1800 + int have_conflict = 0; 1801 + prepare_to_wait(&mdev->misc_wait, &wait, 1802 + TASK_INTERRUPTIBLE); 1803 + hlist_for_each_entry(i, n, slot, colision) { 1804 + if (OVERLAPS) { 1805 + /* only ALERT on first iteration, 1806 + * we may be woken up early... */ 1807 + if (first) 1808 + dev_alert(DEV, "%s[%u] Concurrent local write detected!" 1809 + " new: %llus +%u; pending: %llus +%u\n", 1810 + current->comm, current->pid, 1811 + (unsigned long long)sector, size, 1812 + (unsigned long long)i->sector, i->size); 1813 + if (i->rq_state & RQ_NET_PENDING) 1814 + ++have_unacked; 1815 + ++have_conflict; 1816 + } 1817 + } 1818 + #undef OVERLAPS 1819 + if (!have_conflict) 1820 + break; 1821 + 1822 + /* Discard Ack only for the _first_ iteration */ 1823 + if (first && discard && have_unacked) { 1824 + dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n", 1825 + (unsigned long long)sector); 1826 + inc_unacked(mdev); 1827 + e->w.cb = e_send_discard_ack; 1828 + list_add_tail(&e->w.list, &mdev->done_ee); 1829 + 1830 + spin_unlock_irq(&mdev->req_lock); 1831 + 1832 + /* we could probably send that P_DISCARD_ACK ourselves, 1833 + * but I don't like the receiver using the msock */ 1834 + 1835 + put_ldev(mdev); 1836 + wake_asender(mdev); 1837 + finish_wait(&mdev->misc_wait, &wait); 1838 + return TRUE; 1839 + } 1840 + 1841 + if (signal_pending(current)) { 1842 + hlist_del_init(&e->colision); 1843 + 1844 + spin_unlock_irq(&mdev->req_lock); 1845 + 1846 + finish_wait(&mdev->misc_wait, &wait); 1847 + goto out_interrupted; 1848 + } 1849 + 1850 + spin_unlock_irq(&mdev->req_lock); 1851 + if (first) { 1852 + first = 0; 1853 + dev_alert(DEV, "Concurrent write! [W AFTERWARDS] " 1854 + "sec=%llus\n", (unsigned long long)sector); 1855 + } else if (discard) { 1856 + /* we had none on the first iteration. 1857 + * there must be none now. 
*/ 1858 + D_ASSERT(have_unacked == 0); 1859 + } 1860 + schedule(); 1861 + spin_lock_irq(&mdev->req_lock); 1862 + } 1863 + finish_wait(&mdev->misc_wait, &wait); 1864 + } 1865 + 1866 + list_add(&e->w.list, &mdev->active_ee); 1867 + spin_unlock_irq(&mdev->req_lock); 1868 + 1869 + switch (mdev->net_conf->wire_protocol) { 1870 + case DRBD_PROT_C: 1871 + inc_unacked(mdev); 1872 + /* corresponding dec_unacked() in e_end_block() 1873 + * respective _drbd_clear_done_ee */ 1874 + break; 1875 + case DRBD_PROT_B: 1876 + /* I really don't like it that the receiver thread 1877 + * sends on the msock, but anyways */ 1878 + drbd_send_ack(mdev, P_RECV_ACK, e); 1879 + break; 1880 + case DRBD_PROT_A: 1881 + /* nothing to do */ 1882 + break; 1883 + } 1884 + 1885 + if (mdev->state.pdsk == D_DISKLESS) { 1886 + /* In case we have the only disk of the cluster, */ 1887 + drbd_set_out_of_sync(mdev, e->sector, e->size); 1888 + e->flags |= EE_CALL_AL_COMPLETE_IO; 1889 + drbd_al_begin_io(mdev, e->sector); 1890 + } 1891 + 1892 + e->private_bio->bi_rw = rw; 1893 + drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); 1894 + /* accounting done in endio */ 1895 + 1896 + maybe_kick_lo(mdev); 1897 + return TRUE; 1898 + 1899 + out_interrupted: 1900 + /* yes, the epoch_size now is imbalanced. 1901 + * but we drop the connection anyways, so we don't have a chance to 1902 + * receive a barrier... atomic_inc(&mdev->epoch_size); */ 1903 + put_ldev(mdev); 1904 + drbd_free_ee(mdev, e); 1905 + return FALSE; 1906 + } 1907 + 1908 + static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) 1909 + { 1910 + sector_t sector; 1911 + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); 1912 + struct drbd_epoch_entry *e; 1913 + struct digest_info *di = NULL; 1914 + int size, digest_size; 1915 + unsigned int fault_type; 1916 + struct p_block_req *p = 1917 + (struct p_block_req *)h; 1918 + const int brps = sizeof(*p)-sizeof(*h); 1919 + 1920 + if (drbd_recv(mdev, h->payload, brps) != brps) 1921 + return FALSE; 1922 + 1923 + sector = be64_to_cpu(p->sector); 1924 + size = be32_to_cpu(p->blksize); 1925 + 1926 + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 1927 + dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, 1928 + (unsigned long long)sector, size); 1929 + return FALSE; 1930 + } 1931 + if (sector + (size>>9) > capacity) { 1932 + dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, 1933 + (unsigned long long)sector, size); 1934 + return FALSE; 1935 + } 1936 + 1937 + if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { 1938 + if (__ratelimit(&drbd_ratelimit_state)) 1939 + dev_err(DEV, "Can not satisfy peer's read request, " 1940 + "no local data.\n"); 1941 + drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : 1942 + P_NEG_RS_DREPLY , p); 1943 + return TRUE; 1944 + } 1945 + 1946 + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 1947 + * "criss-cross" setup, that might cause write-out on some other DRBD, 1948 + * which in turn might block on the other node at this very place. 
*/
1949 + e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
1950 + if (!e) {
1951 + put_ldev(mdev);
1952 + return FALSE;
1953 + }
1954 +
1955 + e->private_bio->bi_rw = READ;
1956 + e->private_bio->bi_end_io = drbd_endio_read_sec;
1957 +
1958 + switch (h->command) {
1959 + case P_DATA_REQUEST:
1960 + e->w.cb = w_e_end_data_req;
1961 + fault_type = DRBD_FAULT_DT_RD;
1962 + break;
1963 + case P_RS_DATA_REQUEST:
1964 + e->w.cb = w_e_end_rsdata_req;
1965 + fault_type = DRBD_FAULT_RS_RD;
1966 + /* Eventually this should become asynchronous. Currently it
1967 + * blocks the whole receiver just to delay the reading of a
1968 + * resync data block.
1969 + * the drbd_work_queue mechanism is made for this...
1970 + */
1971 + if (!drbd_rs_begin_io(mdev, sector)) {
1972 + /* we have been interrupted,
1973 + * probably connection lost! */
1974 + D_ASSERT(signal_pending(current));
1975 + goto out_free_e;
1976 + }
1977 + break;
1978 +
1979 + case P_OV_REPLY:
1980 + case P_CSUM_RS_REQUEST:
1981 + fault_type = DRBD_FAULT_RS_RD;
1982 + digest_size = h->length - brps;
1983 + di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
1984 + if (!di)
1985 + goto out_free_e;
1986 +
1987 + di->digest_size = digest_size;
1988 + di->digest = (((char *)di)+sizeof(struct digest_info));
1989 +
1990 + if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
1991 + goto out_free_e;
1992 +
1993 + e->block_id = (u64)(unsigned long)di;
1994 + if (h->command == P_CSUM_RS_REQUEST) {
1995 + D_ASSERT(mdev->agreed_pro_version >= 89);
1996 + e->w.cb = w_e_end_csum_rs_req;
1997 + } else if (h->command == P_OV_REPLY) {
1998 + e->w.cb = w_e_end_ov_reply;
1999 + dec_rs_pending(mdev);
2000 + break;
2001 + }
2002 +
2003 + if (!drbd_rs_begin_io(mdev, sector)) {
2004 + /* we have been interrupted, probably connection lost! */
2005 + D_ASSERT(signal_pending(current));
2006 + goto out_free_e;
2007 + }
2008 + break;
2009 +
2010 + case P_OV_REQUEST:
2011 + if (mdev->state.conn >= C_CONNECTED &&
2012 + mdev->state.conn != C_VERIFY_T)
2013 + dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
2014 + drbd_conn_str(mdev->state.conn));
2015 + if (mdev->ov_start_sector == ~(sector_t)0 &&
2016 + mdev->agreed_pro_version >= 90) {
2017 + mdev->ov_start_sector = sector;
2018 + mdev->ov_position = sector;
2019 + mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
2020 + dev_info(DEV, "Online Verify start sector: %llu\n",
2021 + (unsigned long long)sector);
2022 + }
2023 + e->w.cb = w_e_end_ov_req;
2024 + fault_type = DRBD_FAULT_RS_RD;
2025 + /* Eventually this should become asynchronous. Currently it
2026 + * blocks the whole receiver just to delay the reading of a
2027 + * resync data block.
2028 + * the drbd_work_queue mechanism is made for this...
2029 + */
2030 + if (!drbd_rs_begin_io(mdev, sector)) {
2031 + /* we have been interrupted,
2032 + * probably connection lost!
*/ 2033 + D_ASSERT(signal_pending(current)); 2034 + goto out_free_e; 2035 + } 2036 + break; 2037 + 2038 + 2039 + default: 2040 + dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", 2041 + cmdname(h->command)); 2042 + fault_type = DRBD_FAULT_MAX; 2043 + } 2044 + 2045 + spin_lock_irq(&mdev->req_lock); 2046 + list_add(&e->w.list, &mdev->read_ee); 2047 + spin_unlock_irq(&mdev->req_lock); 2048 + 2049 + inc_unacked(mdev); 2050 + 2051 + drbd_generic_make_request(mdev, fault_type, e->private_bio); 2052 + maybe_kick_lo(mdev); 2053 + 2054 + return TRUE; 2055 + 2056 + out_free_e: 2057 + kfree(di); 2058 + put_ldev(mdev); 2059 + drbd_free_ee(mdev, e); 2060 + return FALSE; 2061 + } 2062 + 2063 + static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) 2064 + { 2065 + int self, peer, rv = -100; 2066 + unsigned long ch_self, ch_peer; 2067 + 2068 + self = mdev->ldev->md.uuid[UI_BITMAP] & 1; 2069 + peer = mdev->p_uuid[UI_BITMAP] & 1; 2070 + 2071 + ch_peer = mdev->p_uuid[UI_SIZE]; 2072 + ch_self = mdev->comm_bm_set; 2073 + 2074 + switch (mdev->net_conf->after_sb_0p) { 2075 + case ASB_CONSENSUS: 2076 + case ASB_DISCARD_SECONDARY: 2077 + case ASB_CALL_HELPER: 2078 + dev_err(DEV, "Configuration error.\n"); 2079 + break; 2080 + case ASB_DISCONNECT: 2081 + break; 2082 + case ASB_DISCARD_YOUNGER_PRI: 2083 + if (self == 0 && peer == 1) { 2084 + rv = -1; 2085 + break; 2086 + } 2087 + if (self == 1 && peer == 0) { 2088 + rv = 1; 2089 + break; 2090 + } 2091 + /* Else fall through to one of the other strategies... */ 2092 + case ASB_DISCARD_OLDER_PRI: 2093 + if (self == 0 && peer == 1) { 2094 + rv = 1; 2095 + break; 2096 + } 2097 + if (self == 1 && peer == 0) { 2098 + rv = -1; 2099 + break; 2100 + } 2101 + /* Else fall through to one of the other strategies... */ 2102 + dev_warn(DEV, "Discard younger/older primary did not find a decision\n" 2103 + "Using discard-least-changes instead\n"); 2104 + case ASB_DISCARD_ZERO_CHG: 2105 + if (ch_peer == 0 && ch_self == 0) { 2106 + rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) 2107 + ? -1 : 1; 2108 + break; 2109 + } else { 2110 + if (ch_peer == 0) { rv = 1; break; } 2111 + if (ch_self == 0) { rv = -1; break; } 2112 + } 2113 + if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG) 2114 + break; 2115 + case ASB_DISCARD_LEAST_CHG: 2116 + if (ch_self < ch_peer) 2117 + rv = -1; 2118 + else if (ch_self > ch_peer) 2119 + rv = 1; 2120 + else /* ( ch_self == ch_peer ) */ 2121 + /* Well, then use something else. */ 2122 + rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) 2123 + ? 
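/* equal change counts: fall back to DISCARD_CONCURRENT, which the
 * connect handshake sets on at most one of the two nodes, so both
 * sides arrive at opposite, consistent verdicts */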
-1 : 1; 2124 + break; 2125 + case ASB_DISCARD_LOCAL: 2126 + rv = -1; 2127 + break; 2128 + case ASB_DISCARD_REMOTE: 2129 + rv = 1; 2130 + } 2131 + 2132 + return rv; 2133 + } 2134 + 2135 + static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) 2136 + { 2137 + int self, peer, hg, rv = -100; 2138 + 2139 + self = mdev->ldev->md.uuid[UI_BITMAP] & 1; 2140 + peer = mdev->p_uuid[UI_BITMAP] & 1; 2141 + 2142 + switch (mdev->net_conf->after_sb_1p) { 2143 + case ASB_DISCARD_YOUNGER_PRI: 2144 + case ASB_DISCARD_OLDER_PRI: 2145 + case ASB_DISCARD_LEAST_CHG: 2146 + case ASB_DISCARD_LOCAL: 2147 + case ASB_DISCARD_REMOTE: 2148 + dev_err(DEV, "Configuration error.\n"); 2149 + break; 2150 + case ASB_DISCONNECT: 2151 + break; 2152 + case ASB_CONSENSUS: 2153 + hg = drbd_asb_recover_0p(mdev); 2154 + if (hg == -1 && mdev->state.role == R_SECONDARY) 2155 + rv = hg; 2156 + if (hg == 1 && mdev->state.role == R_PRIMARY) 2157 + rv = hg; 2158 + break; 2159 + case ASB_VIOLENTLY: 2160 + rv = drbd_asb_recover_0p(mdev); 2161 + break; 2162 + case ASB_DISCARD_SECONDARY: 2163 + return mdev->state.role == R_PRIMARY ? 1 : -1; 2164 + case ASB_CALL_HELPER: 2165 + hg = drbd_asb_recover_0p(mdev); 2166 + if (hg == -1 && mdev->state.role == R_PRIMARY) { 2167 + self = drbd_set_role(mdev, R_SECONDARY, 0); 2168 + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, 2169 + * we might be here in C_WF_REPORT_PARAMS which is transient. 2170 + * we do not need to wait for the after state change work either. */ 2171 + self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); 2172 + if (self != SS_SUCCESS) { 2173 + drbd_khelper(mdev, "pri-lost-after-sb"); 2174 + } else { 2175 + dev_warn(DEV, "Successfully gave up primary role.\n"); 2176 + rv = hg; 2177 + } 2178 + } else 2179 + rv = hg; 2180 + } 2181 + 2182 + return rv; 2183 + } 2184 + 2185 + static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) 2186 + { 2187 + int self, peer, hg, rv = -100; 2188 + 2189 + self = mdev->ldev->md.uuid[UI_BITMAP] & 1; 2190 + peer = mdev->p_uuid[UI_BITMAP] & 1; 2191 + 2192 + switch (mdev->net_conf->after_sb_2p) { 2193 + case ASB_DISCARD_YOUNGER_PRI: 2194 + case ASB_DISCARD_OLDER_PRI: 2195 + case ASB_DISCARD_LEAST_CHG: 2196 + case ASB_DISCARD_LOCAL: 2197 + case ASB_DISCARD_REMOTE: 2198 + case ASB_CONSENSUS: 2199 + case ASB_DISCARD_SECONDARY: 2200 + dev_err(DEV, "Configuration error.\n"); 2201 + break; 2202 + case ASB_VIOLENTLY: 2203 + rv = drbd_asb_recover_0p(mdev); 2204 + break; 2205 + case ASB_DISCONNECT: 2206 + break; 2207 + case ASB_CALL_HELPER: 2208 + hg = drbd_asb_recover_0p(mdev); 2209 + if (hg == -1) { 2210 + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, 2211 + * we might be here in C_WF_REPORT_PARAMS which is transient. 2212 + * we do not need to wait for the after state change work either. 
static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
               u64 bits, u64 flags)
{
    if (!uuid) {
        dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
        return;
    }
    dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
         text,
         (unsigned long long)uuid[UI_CURRENT],
         (unsigned long long)uuid[UI_BITMAP],
         (unsigned long long)uuid[UI_HISTORY_START],
         (unsigned long long)uuid[UI_HISTORY_END],
         (unsigned long long)bits,
         (unsigned long long)flags);
}

/*
  100   after split brain try auto recover
    2   C_SYNC_SOURCE set BitMap
    1   C_SYNC_SOURCE use BitMap
    0   no Sync
   -1   C_SYNC_TARGET use BitMap
   -2   C_SYNC_TARGET set BitMap
 -100   after split brain, disconnect
-1000   unrelated data
 */
static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
{
    u64 self, peer;
    int i, j;

    self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
    peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);

    *rule_nr = 10;
    if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
        return 0;

    *rule_nr = 20;
    if ((self == UUID_JUST_CREATED || self == (u64)0) &&
         peer != UUID_JUST_CREATED)
        return -2;

    *rule_nr = 30;
    if (self != UUID_JUST_CREATED &&
        (peer == UUID_JUST_CREATED || peer == (u64)0))
        return 2;

    if (self == peer) {
        int rct, dc; /* roles at crash time */

        if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {

            if (mdev->agreed_pro_version < 91)
                return -1001;

            if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
                (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
                dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
                drbd_uuid_set_bm(mdev, 0UL);

                drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
                           mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
                *rule_nr = 34;
            } else {
                dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
                *rule_nr = 36;
            }

            return 1;
        }

        if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {

            if (mdev->agreed_pro_version < 91)
                return -1001;

            if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
                (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
                dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");

                mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
                mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
                mdev->p_uuid[UI_BITMAP] = 0UL;

                drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
                *rule_nr = 35;
            } else {
                dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
                *rule_nr = 37;
            }

            return -1;
        }

        /* Common power [off|failure] */
        rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
            (mdev->p_uuid[UI_FLAGS] & 2);
        /* lowest bit is set when we were primary,
         * next bit (weight 2) is set when peer was primary */
        *rule_nr = 40;

        switch (rct) {
        case 0: /* !self_pri && !peer_pri */ return 0;
        case 1: /*  self_pri && !peer_pri */ return 1;
        case 2: /* !self_pri &&  peer_pri */ return -1;
        case 3: /*  self_pri &&  peer_pri */
            dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
            return dc ? -1 : 1;
        }
    }

    *rule_nr = 50;
    peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
    if (self == peer)
        return -1;

    *rule_nr = 51;
    peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
    if (self == peer) {
        self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
        peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
        if (self == peer) {
            /* The last P_SYNC_UUID did not get through. Undo the
               peer's UUID modifications from the last start of
               resync as sync source. */

            if (mdev->agreed_pro_version < 91)
                return -1001;

            mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
            mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
            return -1;
        }
    }

    *rule_nr = 60;
    self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
    for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
        peer = mdev->p_uuid[i] & ~((u64)1);
        if (self == peer)
            return -2;
    }

    *rule_nr = 70;
    self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
    peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
    if (self == peer)
        return 1;

    *rule_nr = 71;
    self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
    if (self == peer) {
        self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
        peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
        if (self == peer) {
            /* The last P_SYNC_UUID did not get through. Undo our
               own UUID modifications from the last start of resync
               as sync source. */

            if (mdev->agreed_pro_version < 91)
                return -1001;

            _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
            _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);

            dev_info(DEV, "Undid last start of resync:\n");

            drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
                       mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);

            return 1;
        }
    }

    *rule_nr = 80;
    peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
    for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
        self = mdev->ldev->md.uuid[i] & ~((u64)1);
        if (self == peer)
            return 2;
    }

    *rule_nr = 90;
    self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
    peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
    if (self == peer && self != ((u64)0))
        return 100;

    *rule_nr = 100;
    for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
        self = mdev->ldev->md.uuid[i] & ~((u64)1);
        for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
            peer = mdev->p_uuid[j] & ~((u64)1);
            if (self == peer)
                return -100;
        }
    }

    return -1000;
}
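/*
 * A sketch, not part of the driver, of the comparison used throughout
 * drbd_uuid_compare() above: bit 0 of each UUID is used as a flag
 * (drbd_asb_recover_0p() reads uuid[UI_BITMAP] & 1), so all data
 * generation comparisons mask it out with ~((u64)1).  uuids_match() is
 * a hypothetical helper that makes this explicit.
 */
static inline int uuids_match(u64 a, u64 b)
{
    return (a & ~((u64)1)) == (b & ~((u64)1));
}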
/* drbd_sync_handshake() returns the new conn state on success, or
   CONN_MASK (-1) on failure.
 */
static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
                       enum drbd_disk_state peer_disk) __must_hold(local)
{
    int hg, rule_nr;
    enum drbd_conns rv = C_MASK;
    enum drbd_disk_state mydisk;

    mydisk = mdev->state.disk;
    if (mydisk == D_NEGOTIATING)
        mydisk = mdev->new_state_tmp.disk;

    dev_info(DEV, "drbd_sync_handshake:\n");
    drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
    drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
               mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);

    hg = drbd_uuid_compare(mdev, &rule_nr);

    dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);

    if (hg == -1000) {
        dev_alert(DEV, "Unrelated data, aborting!\n");
        return C_MASK;
    }
    if (hg == -1001) {
        dev_alert(DEV, "To resolve this both sides have to support at least protocol 91\n");
        return C_MASK;
    }

    if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
        (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
        int f = (hg == -100) || abs(hg) == 2;
        hg = mydisk > D_INCONSISTENT ? 1 : -1;
        if (f)
            hg = hg*2;
        dev_info(DEV, "Becoming sync %s due to disk states.\n",
             hg > 0 ? "source" : "target");
    }

    if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
        int pcount = (mdev->state.role == R_PRIMARY)
               + (peer_role == R_PRIMARY);
        int forced = (hg == -100);

        switch (pcount) {
        case 0:
            hg = drbd_asb_recover_0p(mdev);
            break;
        case 1:
            hg = drbd_asb_recover_1p(mdev);
            break;
        case 2:
            hg = drbd_asb_recover_2p(mdev);
            break;
        }
        if (abs(hg) < 100) {
            dev_warn(DEV, "Split-Brain detected, %d primaries, "
                 "automatically solved. Sync from %s node\n",
                 pcount, (hg < 0) ? "peer" : "this");
            if (forced) {
                dev_warn(DEV, "Doing a full sync, since"
                     " UUIDs were ambiguous.\n");
                hg = hg*2;
            }
        }
    }

    if (hg == -100) {
        if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
            hg = -1;
        if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
            hg = 1;

        if (abs(hg) < 100)
            dev_warn(DEV, "Split-Brain detected, manually solved. "
                 "Sync from %s node\n",
                 (hg < 0) ? "peer" : "this");
    }

    if (hg == -100) {
        dev_alert(DEV, "Split-Brain detected, dropping connection!\n");
        drbd_khelper(mdev, "split-brain");
        return C_MASK;
    }

    if (hg > 0 && mydisk <= D_INCONSISTENT) {
        dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
        return C_MASK;
    }

    if (hg < 0 && /* by intention we do not use mydisk here. */
        mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
        switch (mdev->net_conf->rr_conflict) {
        case ASB_CALL_HELPER:
            drbd_khelper(mdev, "pri-lost");
            /* fall through */
        case ASB_DISCONNECT:
            dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
            return C_MASK;
        case ASB_VIOLENTLY:
            dev_warn(DEV, "Becoming SyncTarget, violating the stable-data "
                 "assumption\n");
        }
    }

    if (abs(hg) >= 2) {
        dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
        if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
            return C_MASK;
    }

    if (hg > 0) { /* become sync source. */
        rv = C_WF_BITMAP_S;
    } else if (hg < 0) { /* become sync target */
        rv = C_WF_BITMAP_T;
    } else {
        rv = C_CONNECTED;
        if (drbd_bm_total_weight(mdev)) {
            dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
                 drbd_bm_total_weight(mdev));
        }
    }

    return rv;
}
/* returns 1 if invalid */
static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
{
    /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
    if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
        (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
        return 0;

    /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
    if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
        self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
        return 1;

    /* everything else is valid if they are equal on both sides. */
    if (peer == self)
        return 0;

    /* everything else is invalid. */
    return 1;
}

static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
{
    struct p_protocol *p = (struct p_protocol *)h;
    int header_size, data_size;
    int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
    int p_want_lose, p_two_primaries;
    char p_integrity_alg[SHARED_SECRET_MAX] = "";

    header_size = sizeof(*p) - sizeof(*h);
    data_size = h->length - header_size;

    if (drbd_recv(mdev, h->payload, header_size) != header_size)
        return FALSE;

    p_proto         = be32_to_cpu(p->protocol);
    p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
    p_after_sb_1p   = be32_to_cpu(p->after_sb_1p);
    p_after_sb_2p   = be32_to_cpu(p->after_sb_2p);
    p_want_lose     = be32_to_cpu(p->want_lose);
    p_two_primaries = be32_to_cpu(p->two_primaries);

    if (p_proto != mdev->net_conf->wire_protocol) {
        dev_err(DEV, "incompatible communication protocols\n");
        goto disconnect;
    }

    if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
        dev_err(DEV, "incompatible after-sb-0pri settings\n");
        goto disconnect;
    }

    if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
        dev_err(DEV, "incompatible after-sb-1pri settings\n");
        goto disconnect;
    }

    if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
        dev_err(DEV, "incompatible after-sb-2pri settings\n");
        goto disconnect;
    }

    if (p_want_lose && mdev->net_conf->want_lose) {
        dev_err(DEV, "both sides have the 'want_lose' flag set\n");
        goto disconnect;
    }

    if (p_two_primaries != mdev->net_conf->two_primaries) {
        dev_err(DEV, "incompatible setting of the two-primaries options\n");
        goto disconnect;
    }

    if (mdev->agreed_pro_version >= 87) {
        unsigned char *my_alg = mdev->net_conf->integrity_alg;

        if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
            return FALSE;

        p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
        if (strcmp(p_integrity_alg, my_alg)) {
            dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
            goto disconnect;
        }
        dev_info(DEV, "data-integrity-alg: %s\n",
             my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
    }

    return TRUE;

disconnect:
    drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
    return FALSE;
}
/* helper function
 * input: alg name, feature name
 * return: NULL (alg name was "")
 *         ERR_PTR(error) if something goes wrong
 *         or the crypto hash ptr, if it worked out ok. */
struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
        const char *alg, const char *name)
{
    struct crypto_hash *tfm;

    if (!alg[0])
        return NULL;

    tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
    if (IS_ERR(tfm)) {
        dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
            alg, name, PTR_ERR(tfm));
        return tfm;
    }
    if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
        crypto_free_hash(tfm);
        dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
        return ERR_PTR(-EINVAL);
    }
    return tfm;
}
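/*
 * A sketch, not part of the driver: the helper above has a three-way
 * result -- NULL when the algorithm name is empty (feature off),
 * ERR_PTR() on failure, or a usable tfm.  receive_SyncParam() below
 * consumes it exactly this way; pick_digest() is a hypothetical name
 * used only for illustration.
 */
static struct crypto_hash *pick_digest(struct drbd_conf *mdev, const char *alg)
{
    struct crypto_hash *tfm;

    tfm = drbd_crypto_alloc_digest_safe(mdev, alg, "example-alg");
    if (IS_ERR(tfm))
        return NULL;    /* allocation failed, or alg is not a digest */
    return tfm;         /* may be NULL: empty name means "feature off" */
}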
static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
{
    int ok = TRUE;
    struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
    unsigned int header_size, data_size, exp_max_sz;
    struct crypto_hash *verify_tfm = NULL;
    struct crypto_hash *csums_tfm = NULL;
    const int apv = mdev->agreed_pro_version;

    exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
           : apv == 88 ? sizeof(struct p_rs_param)
                    + SHARED_SECRET_MAX
           : /* 89 */    sizeof(struct p_rs_param_89);

    if (h->length > exp_max_sz) {
        dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
            h->length, exp_max_sz);
        return FALSE;
    }

    if (apv <= 88) {
        header_size = sizeof(struct p_rs_param) - sizeof(*h);
        data_size = h->length - header_size;
    } else /* apv >= 89 */ {
        header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
        data_size = h->length - header_size;
        D_ASSERT(data_size == 0);
    }

    /* initialize verify_alg and csums_alg */
    memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);

    if (drbd_recv(mdev, h->payload, header_size) != header_size)
        return FALSE;

    mdev->sync_conf.rate = be32_to_cpu(p->rate);

    if (apv >= 88) {
        if (apv == 88) {
            if (data_size > SHARED_SECRET_MAX) {
                dev_err(DEV, "verify-alg too long, "
                    "peer wants %u, accepting only %u bytes\n",
                    data_size, SHARED_SECRET_MAX);
                return FALSE;
            }

            if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
                return FALSE;

            /* we expect NUL terminated string */
            /* but just in case someone tries to be evil */
            D_ASSERT(p->verify_alg[data_size-1] == 0);
            p->verify_alg[data_size-1] = 0;

        } else /* apv >= 89 */ {
            /* we still expect NUL terminated strings */
            /* but just in case someone tries to be evil */
            D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
            D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
            p->verify_alg[SHARED_SECRET_MAX-1] = 0;
            p->csums_alg[SHARED_SECRET_MAX-1] = 0;
        }

        if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
            if (mdev->state.conn == C_WF_REPORT_PARAMS) {
                dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
                    mdev->sync_conf.verify_alg, p->verify_alg);
                goto disconnect;
            }
            verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
                    p->verify_alg, "verify-alg");
            if (IS_ERR(verify_tfm)) {
                verify_tfm = NULL;
                goto disconnect;
            }
        }

        if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
            if (mdev->state.conn == C_WF_REPORT_PARAMS) {
                dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
                    mdev->sync_conf.csums_alg, p->csums_alg);
                goto disconnect;
            }
            csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
                    p->csums_alg, "csums-alg");
            if (IS_ERR(csums_tfm)) {
                csums_tfm = NULL;
                goto disconnect;
            }
        }

        spin_lock(&mdev->peer_seq_lock);
        /* lock against drbd_nl_syncer_conf() */
        if (verify_tfm) {
            strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
            mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
            crypto_free_hash(mdev->verify_tfm);
            mdev->verify_tfm = verify_tfm;
            dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
        }
        if (csums_tfm) {
            strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
            mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
            crypto_free_hash(mdev->csums_tfm);
            mdev->csums_tfm = csums_tfm;
            dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
        }
        spin_unlock(&mdev->peer_seq_lock);
    }

    return ok;
disconnect:
    /* just for completeness: actually not needed,
     * as this is not reached if csums_tfm was ok. */
    crypto_free_hash(csums_tfm);
    /* but free the verify_tfm again, if csums_tfm did not work out */
    crypto_free_hash(verify_tfm);
    drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
    return FALSE;
}

static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
{
    /* sorry, we currently have no working implementation
     * of distributed TCQ */
}

/* warn if the arguments differ by more than 12.5% */
static void warn_if_differ_considerably(struct drbd_conf *mdev,
    const char *s, sector_t a, sector_t b)
{
    sector_t d;
    if (a == 0 || b == 0)
        return;
    d = (a > b) ? (a - b) : (b - a);
    if (d > (a>>3) || d > (b>>3))
        dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
             (unsigned long long)a, (unsigned long long)b);
}
static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
{
    struct p_sizes *p = (struct p_sizes *)h;
    enum determine_dev_size dd = unchanged;
    unsigned int max_seg_s;
    sector_t p_size, p_usize, my_usize;
    int ldsc = 0; /* local disk size changed */
    enum drbd_conns nconn;

    ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
    if (drbd_recv(mdev, h->payload, h->length) != h->length)
        return FALSE;

    p_size = be64_to_cpu(p->d_size);
    p_usize = be64_to_cpu(p->u_size);

    if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
        dev_err(DEV, "some backing storage is needed\n");
        drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
        return FALSE;
    }

    /* just store the peer's disk size for now.
     * we still need to figure out whether we accept that. */
    mdev->p_size = p_size;

#define min_not_zero(l, r) ((l) == 0 ? (r) : ((r) == 0 ? (l) : min(l, r)))
    if (get_ldev(mdev)) {
        warn_if_differ_considerably(mdev, "lower level device sizes",
               p_size, drbd_get_max_capacity(mdev->ldev));
        warn_if_differ_considerably(mdev, "user requested size",
                        p_usize, mdev->ldev->dc.disk_size);

        /* if this is the first connect, or an otherwise expected
         * param exchange, choose the minimum */
        if (mdev->state.conn == C_WF_REPORT_PARAMS)
            p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
                         p_usize);

        my_usize = mdev->ldev->dc.disk_size;

        if (mdev->ldev->dc.disk_size != p_usize) {
            mdev->ldev->dc.disk_size = p_usize;
            dev_info(DEV, "Peer sets u_size to %lu sectors\n",
                 (unsigned long)mdev->ldev->dc.disk_size);
        }

        /* Never shrink a device with usable data during connect.
           But allow online shrinking if we are connected. */
        if (drbd_new_dev_size(mdev, mdev->ldev) <
            drbd_get_capacity(mdev->this_bdev) &&
            mdev->state.disk >= D_OUTDATED &&
            mdev->state.conn < C_CONNECTED) {
            dev_err(DEV, "The peer's disk size is too small!\n");
            drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
            mdev->ldev->dc.disk_size = my_usize;
            put_ldev(mdev);
            return FALSE;
        }
        put_ldev(mdev);
    }
#undef min_not_zero

    if (get_ldev(mdev)) {
        dd = drbd_determin_dev_size(mdev);
        put_ldev(mdev);
        if (dd == dev_size_error)
            return FALSE;
        drbd_md_sync(mdev);
    } else {
        /* I am diskless, need to accept the peer's size. */
        drbd_set_my_capacity(mdev, p_size);
    }

    if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
        nconn = drbd_sync_handshake(mdev,
                mdev->state.peer, mdev->state.pdsk);
        put_ldev(mdev);

        if (nconn == C_MASK) {
            drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
            return FALSE;
        }

        if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
            drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
            return FALSE;
        }
    }

    if (get_ldev(mdev)) {
        if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
            mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
            ldsc = 1;
        }

        max_seg_s = be32_to_cpu(p->max_segment_size);
        if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
            drbd_setup_queue_param(mdev, max_seg_s);

        drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type));
        put_ldev(mdev);
    }

    if (mdev->state.conn > C_WF_REPORT_PARAMS) {
        if (be64_to_cpu(p->c_size) !=
            drbd_get_capacity(mdev->this_bdev) || ldsc) {
            /* we have different sizes, probably the peer
             * needs to know my new size... */
            drbd_send_sizes(mdev, 0);
        }
        if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
            (dd == grew && mdev->state.conn == C_CONNECTED)) {
            if (mdev->state.pdsk >= D_INCONSISTENT &&
                mdev->state.disk >= D_INCONSISTENT)
                resync_after_online_grow(mdev);
            else
                set_bit(RESYNC_AFTER_NEG, &mdev->flags);
        }
    }

    return TRUE;
}
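/*
 * A sketch, not part of the driver: the user-size negotiation in
 * receive_sizes() above boils down to "take the smaller of the two
 * configured sizes, treating 0 as unset" -- exactly what the local
 * min_not_zero() macro computes.  negotiated_usize() is a hypothetical
 * function spelling that out.
 */
static sector_t negotiated_usize(sector_t mine, sector_t peers)
{
    if (mine == 0)
        return peers;
    if (peers == 0)
        return mine;
    return min(mine, peers);
}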
static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
{
    struct p_uuids *p = (struct p_uuids *)h;
    u64 *p_uuid;
    int i;

    ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
    if (drbd_recv(mdev, h->payload, h->length) != h->length)
        return FALSE;

    p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);

    for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
        p_uuid[i] = be64_to_cpu(p->uuid[i]);

    kfree(mdev->p_uuid);
    mdev->p_uuid = p_uuid;

    if (mdev->state.conn < C_CONNECTED &&
        mdev->state.disk < D_INCONSISTENT &&
        mdev->state.role == R_PRIMARY &&
        (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
        dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
            (unsigned long long)mdev->ed_uuid);
        drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
        return FALSE;
    }

    if (get_ldev(mdev)) {
        int skip_initial_sync =
            mdev->state.conn == C_CONNECTED &&
            mdev->agreed_pro_version >= 90 &&
            mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
            (p_uuid[UI_FLAGS] & 8);
        if (skip_initial_sync) {
            dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
            drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
                    "clear_n_write from receive_uuids");
            _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
            _drbd_uuid_set(mdev, UI_BITMAP, 0);
            _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
                    CS_VERBOSE, NULL);
            drbd_md_sync(mdev);
        }
        put_ldev(mdev);
    }

    /* Before we test the disk state, we should wait until any ongoing
       cluster-wide state change has finished.  That is important if we
       are primary and are detaching from our disk: we need to see the
       new disk state... */
    wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
    if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
        drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);

    return TRUE;
}
/**
 * convert_state() - Converts the peer's view of the cluster state to our point of view
 * @ps: The state as seen by the peer.
 */
static union drbd_state convert_state(union drbd_state ps)
{
    union drbd_state ms;

    static enum drbd_conns c_tab[] = {
        [C_CONNECTED] = C_CONNECTED,

        [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
        [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
        [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
        [C_VERIFY_S]      = C_VERIFY_T,
        [C_MASK]          = C_MASK,
    };

    ms.i = ps.i;

    ms.conn = c_tab[ps.conn];
    ms.peer = ps.role;
    ms.role = ps.peer;
    ms.pdsk = ps.disk;
    ms.disk = ps.pdsk;
    ms.peer_isp = (ps.aftr_isp | ps.user_isp);

    return ms;
}

static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
{
    struct p_req_state *p = (struct p_req_state *)h;
    union drbd_state mask, val;
    int rv;

    ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
    if (drbd_recv(mdev, h->payload, h->length) != h->length)
        return FALSE;

    mask.i = be32_to_cpu(p->mask);
    val.i = be32_to_cpu(p->val);

    if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
        test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
        drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
        return TRUE;
    }

    mask = convert_state(mask);
    val = convert_state(val);

    rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);

    drbd_send_sr_reply(mdev, rv);
    drbd_md_sync(mdev);

    return TRUE;
}
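/*
 * Worked example for convert_state() above (illustrative numbers only):
 * the mapping mirrors the roles, so what the peer reports as "my role /
 * my disk" becomes our "peer role / peer disk" and vice versa.  If the
 * peer reports { role = R_PRIMARY, peer = R_SECONDARY,
 * conn = C_STARTING_SYNC_S }, we store { role = R_SECONDARY,
 * peer = R_PRIMARY, conn = C_STARTING_SYNC_T }.
 */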
static int receive_state(struct drbd_conf *mdev, struct p_header *h)
{
    struct p_state *p = (struct p_state *)h;
    enum drbd_conns nconn, oconn;
    union drbd_state ns, peer_state;
    enum drbd_disk_state real_peer_disk;
    int rv;

    ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
        return FALSE;

    if (drbd_recv(mdev, h->payload, h->length) != h->length)
        return FALSE;

    peer_state.i = be32_to_cpu(p->state);

    real_peer_disk = peer_state.disk;
    if (peer_state.disk == D_NEGOTIATING) {
        real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
        dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
    }

    spin_lock_irq(&mdev->req_lock);
 retry:
    oconn = nconn = mdev->state.conn;
    spin_unlock_irq(&mdev->req_lock);

    if (nconn == C_WF_REPORT_PARAMS)
        nconn = C_CONNECTED;

    if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
        get_ldev_if_state(mdev, D_NEGOTIATING)) {
        int cr; /* consider resync */

        /* if we established a new connection */
        cr  = (oconn < C_CONNECTED);
        /* if we had an established connection
         * and one of the nodes newly attaches a disk */
        cr |= (oconn == C_CONNECTED &&
               (peer_state.disk == D_NEGOTIATING ||
                mdev->state.disk == D_NEGOTIATING));
        /* if we have both been inconsistent, and the peer has been
         * forced to be UpToDate with --overwrite-data */
        cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
        /* if we had been plain connected, and the admin requested to
         * start a sync by "invalidate" or "invalidate-remote" */
        cr |= (oconn == C_CONNECTED &&
               (peer_state.conn >= C_STARTING_SYNC_S &&
                peer_state.conn <= C_WF_BITMAP_T));

        if (cr)
            nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);

        put_ldev(mdev);
        if (nconn == C_MASK) {
            if (mdev->state.disk == D_NEGOTIATING) {
                drbd_force_state(mdev, NS(disk, D_DISKLESS));
                nconn = C_CONNECTED;
            } else if (peer_state.disk == D_NEGOTIATING) {
                dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
                peer_state.disk = D_DISKLESS;
            } else {
                D_ASSERT(oconn == C_WF_REPORT_PARAMS);
                drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
                return FALSE;
            }
        }
    }

    spin_lock_irq(&mdev->req_lock);
    if (mdev->state.conn != oconn)
        goto retry;
    clear_bit(CONSIDER_RESYNC, &mdev->flags);
    ns.i = mdev->state.i;
    ns.conn = nconn;
    ns.peer = peer_state.role;
    ns.pdsk = real_peer_disk;
    ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
    if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
        ns.disk = mdev->new_state_tmp.disk;

    rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
    ns = mdev->state;
    spin_unlock_irq(&mdev->req_lock);

    if (rv < SS_SUCCESS) {
        drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
        return FALSE;
    }

    if (oconn > C_WF_REPORT_PARAMS) {
        if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
            peer_state.disk != D_NEGOTIATING) {
            /* we want resync, peer has not yet decided to sync... */
            /* Nowadays only used when forcing a node into primary role and
               setting its disk to UpToDate with that */
            drbd_send_uuids(mdev);
            drbd_send_state(mdev);
        }
    }

    mdev->net_conf->want_lose = 0;

    drbd_md_sync(mdev); /* update connected indicator, la_size, ... */

    return TRUE;
}
static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
{
    struct p_rs_uuid *p = (struct p_rs_uuid *)h;

    wait_event(mdev->misc_wait,
           mdev->state.conn == C_WF_SYNC_UUID ||
           mdev->state.conn < C_CONNECTED ||
           mdev->state.disk < D_NEGOTIATING);

    /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */

    ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
    if (drbd_recv(mdev, h->payload, h->length) != h->length)
        return FALSE;

    /* Here the _drbd_uuid_ functions are right, current should
       _not_ be rotated into the history */
    if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
        _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
        _drbd_uuid_set(mdev, UI_BITMAP, 0UL);

        drbd_start_resync(mdev, C_SYNC_TARGET);

        put_ldev(mdev);
    } else
        dev_err(DEV, "Ignoring SyncUUID packet!\n");

    return TRUE;
}

enum receive_bitmap_ret { OK, DONE, FAILED };

static enum receive_bitmap_ret
receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
    unsigned long *buffer, struct bm_xfer_ctx *c)
{
    unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
    unsigned want = num_words * sizeof(long);

    if (want != h->length) {
        dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
        return FAILED;
    }
    if (want == 0)
        return DONE;
    if (drbd_recv(mdev, buffer, want) != want)
        return FAILED;

    drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);

    c->word_offset += num_words;
    c->bit_offset = c->word_offset * BITS_PER_LONG;
    if (c->bit_offset > c->bm_bits)
        c->bit_offset = c->bm_bits;

    return OK;
}

static enum receive_bitmap_ret
recv_bm_rle_bits(struct drbd_conf *mdev,
        struct p_compressed_bm *p,
        struct bm_xfer_ctx *c)
{
    struct bitstream bs;
    u64 look_ahead;
    u64 rl;
    u64 tmp;
    unsigned long s = c->bit_offset;
    unsigned long e;
    int len = p->head.length - (sizeof(*p) - sizeof(p->head));
    int toggle = DCBP_get_start(p);
    int have;
    int bits;

    bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));

    bits = bitstream_get_bits(&bs, &look_ahead, 64);
    if (bits < 0)
        return FAILED;

    for (have = bits; have > 0; s += rl, toggle = !toggle) {
        bits = vli_decode_bits(&rl, look_ahead);
        if (bits <= 0)
            return FAILED;

        if (toggle) {
            e = s + rl - 1;
            if (e >= c->bm_bits) {
                dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
                return FAILED;
            }
            _drbd_bm_set_bits(mdev, s, e);
        }

        if (have < bits) {
            dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
                have, bits, look_ahead,
                (unsigned int)(bs.cur.b - p->code),
                (unsigned int)bs.buf_len);
            return FAILED;
        }
        look_ahead >>= bits;
        have -= bits;

        bits = bitstream_get_bits(&bs, &tmp, 64 - have);
        if (bits < 0)
            return FAILED;
        look_ahead |= tmp << have;
        have += bits;
    }

    c->bit_offset = s;
    bm_xfer_ctx_bit_to_word_offset(c);

    return (s == c->bm_bits) ? DONE : OK;
}
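/*
 * A worked example for recv_bm_rle_bits() above (illustrative): the
 * payload is a sequence of VLI-encoded run lengths, and the runs
 * alternately describe clear and set bits, with the phase of the first
 * run given by DCBP_get_start().  With start = 0 and run lengths
 * 5, 3, 10, 2 the decoded bits are
 *
 *     00000 111 0000000000 11
 *
 * and only the "toggle" runs (the 3 and the 2) trigger calls to
 * _drbd_bm_set_bits().
 */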
static enum receive_bitmap_ret
decode_bitmap_c(struct drbd_conf *mdev,
        struct p_compressed_bm *p,
        struct bm_xfer_ctx *c)
{
    if (DCBP_get_code(p) == RLE_VLI_Bits)
        return recv_bm_rle_bits(mdev, p, c);

    /* other variants had been implemented for evaluation,
     * but have been dropped as this one turned out to be "best"
     * during all our tests. */

    dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
    drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
    return FAILED;
}

void INFO_bm_xfer_stats(struct drbd_conf *mdev,
        const char *direction, struct bm_xfer_ctx *c)
{
    /* what would it take to transfer it "plaintext" */
    unsigned plain = sizeof(struct p_header) *
        ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
        + c->bm_words * sizeof(long);
    unsigned total = c->bytes[0] + c->bytes[1];
    unsigned r;

    /* total can not be zero. but just in case: */
    if (total == 0)
        return;

    /* don't report if not compressed */
    if (total >= plain)
        return;

    /* total < plain. check for overflow, still */
    r = (total > UINT_MAX/1000) ? (total / (plain/1000))
                    : (1000 * total / plain);

    if (r > 1000)
        r = 1000;

    r = 1000 - r;
    dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
         "total %u; compression: %u.%u%%\n",
            direction,
            c->bytes[1], c->packets[1],
            c->bytes[0], c->packets[0],
            total, r/10, r % 10);
}
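/*
 * Example of the statistic computed by INFO_bm_xfer_stats() above
 * (made-up numbers): with total = 500 compressed bytes against
 * plain = 4000 plaintext bytes, r = 1000 - 1000*500/4000 = 875,
 * which is logged as "compression: 87.5%".
 */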
/* Since we are processing the bitfield from lower addresses to higher,
   it does not matter whether we process it in 32 bit or 64 bit chunks,
   as long as it is little endian. (Understand it as a byte stream,
   beginning with the lowest byte...) If we used big endian
   we would need to process it from the highest address to the lowest,
   in order to be agnostic to the 32 vs 64 bits issue.

   returns 0 on failure, 1 if we successfully received it. */
static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
{
    struct bm_xfer_ctx c;
    void *buffer;
    enum receive_bitmap_ret ret;
    int ok = FALSE;

    wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));

    drbd_bm_lock(mdev, "receive bitmap");

    /* maybe we should use some per thread scratch page,
     * and allocate that during initial device creation? */
    buffer = (unsigned long *) __get_free_page(GFP_NOIO);
    if (!buffer) {
        dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
        goto out;
    }

    c = (struct bm_xfer_ctx) {
        .bm_bits = drbd_bm_bits(mdev),
        .bm_words = drbd_bm_words(mdev),
    };

    do {
        if (h->command == P_BITMAP) {
            ret = receive_bitmap_plain(mdev, h, buffer, &c);
        } else if (h->command == P_COMPRESSED_BITMAP) {
            /* MAYBE: sanity check that we speak proto >= 90,
             * and the feature is enabled! */
            struct p_compressed_bm *p;

            if (h->length > BM_PACKET_PAYLOAD_BYTES) {
                dev_err(DEV, "ReportCBitmap packet too large\n");
                goto out;
            }
            /* use the page buffer */
            p = buffer;
            memcpy(p, h, sizeof(*h));
            if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
                goto out;
            if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
                dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
                goto out;
            }
            ret = decode_bitmap_c(mdev, p, &c);
        } else {
            dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
            goto out;
        }

        c.packets[h->command == P_BITMAP]++;
        c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;

        if (ret != OK)
            break;

        if (!drbd_recv_header(mdev, h))
            goto out;
    } while (ret == OK);
    if (ret == FAILED)
        goto out;

    INFO_bm_xfer_stats(mdev, "receive", &c);

    if (mdev->state.conn == C_WF_BITMAP_T) {
        ok = !drbd_send_bitmap(mdev);
        if (!ok)
            goto out;
        /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
        ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
        D_ASSERT(ok == SS_SUCCESS);
    } else if (mdev->state.conn != C_WF_BITMAP_S) {
        /* admin may have requested C_DISCONNECTING,
         * other threads may have noticed network errors */
        dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
            drbd_conn_str(mdev->state.conn));
    }

    ok = TRUE;
 out:
    drbd_bm_unlock(mdev);
    if (ok && mdev->state.conn == C_WF_BITMAP_S)
        drbd_start_resync(mdev, C_SYNC_SOURCE);
    free_page((unsigned long) buffer);
    return ok;
}

static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
{
    /* TODO zero copy sink :) */
    static char sink[128];
    int size, want, r;

    dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
         h->command, h->length);

    size = h->length;
    while (size > 0) {
        want = min_t(int, size, sizeof(sink));
        r = drbd_recv(mdev, sink, want);
        ERR_IF(r <= 0) break;
        size -= r;
    }
    return size == 0;
}

static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
{
    if (mdev->state.disk >= D_INCONSISTENT)
        drbd_kick_lo(mdev);

    /* Make sure we've acked all the TCP data associated
     * with the data requests being unplugged */
    drbd_tcp_quickack(mdev->data.socket);

    return TRUE;
}

typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);

static drbd_cmd_handler_f drbd_default_handler[] = {
    [P_DATA]            = receive_Data,
    [P_DATA_REPLY]      = receive_DataReply,
    [P_RS_DATA_REPLY]   = receive_RSDataReply,
    [P_BARRIER]         = receive_Barrier,
    [P_BITMAP]          = receive_bitmap,
    [P_COMPRESSED_BITMAP] = receive_bitmap,
    [P_UNPLUG_REMOTE]   = receive_UnplugRemote,
    [P_DATA_REQUEST]    = receive_DataRequest,
    [P_RS_DATA_REQUEST] = receive_DataRequest,
    [P_SYNC_PARAM]      = receive_SyncParam,
    [P_SYNC_PARAM89]    = receive_SyncParam,
    [P_PROTOCOL]        = receive_protocol,
    [P_UUIDS]           = receive_uuids,
    [P_SIZES]           = receive_sizes,
    [P_STATE]           = receive_state,
    [P_STATE_CHG_REQ]   = receive_req_state,
    [P_SYNC_UUID]       = receive_sync_uuid,
    [P_OV_REQUEST]      = receive_DataRequest,
    [P_OV_REPLY]        = receive_DataRequest,
    [P_CSUM_RS_REQUEST] = receive_DataRequest,
    /* anything missing from this table is in
     * the asender_tbl, see get_asender_cmd */
    [P_MAX_CMD]         = NULL,
};
static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
static drbd_cmd_handler_f *drbd_opt_cmd_handler;

static void drbdd(struct drbd_conf *mdev)
{
    drbd_cmd_handler_f handler;
    struct p_header *header = &mdev->data.rbuf.header;

    while (get_t_state(&mdev->receiver) == Running) {
        drbd_thread_current_set_cpu(mdev);
        if (!drbd_recv_header(mdev, header)) {
            drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
            break;
        }

        if (header->command < P_MAX_CMD)
            handler = drbd_cmd_handler[header->command];
        else if (P_MAY_IGNORE < header->command
             && header->command < P_MAX_OPT_CMD)
            handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
        else if (header->command > P_MAX_OPT_CMD)
            handler = receive_skip;
        else
            handler = NULL;

        if (unlikely(!handler)) {
            dev_err(DEV, "unknown packet type %d, l: %d!\n",
                header->command, header->length);
            drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
            break;
        }
        if (unlikely(!handler(mdev, header))) {
            dev_err(DEV, "error receiving %s, l: %d!\n",
                cmdname(header->command), header->length);
            drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
            break;
        }
    }
}

static void drbd_fail_pending_reads(struct drbd_conf *mdev)
{
    struct hlist_head *slot;
    struct hlist_node *pos;
    struct hlist_node *tmp;
    struct drbd_request *req;
    int i;

    /*
     * Application READ requests
     */
    spin_lock_irq(&mdev->req_lock);
    for (i = 0; i < APP_R_HSIZE; i++) {
        slot = mdev->app_reads_hash+i;
        hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
            /* it may (but should not any longer!)
             * be on the work queue; if that assert triggers,
             * we need to also grab the
             * spin_lock_irq(&mdev->data.work.q_lock);
             * and list_del_init here. */
            D_ASSERT(list_empty(&req->w.list));
            /* It would be nice to complete outside of spinlock.
             * But this is easier for now. */
            _req_mod(req, connection_lost_while_pending);
        }
    }
    for (i = 0; i < APP_R_HSIZE; i++)
        if (!hlist_empty(mdev->app_reads_hash+i))
            dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
                "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);

    memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
    spin_unlock_irq(&mdev->req_lock);
}
void drbd_flush_workqueue(struct drbd_conf *mdev)
{
    struct drbd_wq_barrier barr;

    barr.w.cb = w_prev_work_done;
    init_completion(&barr.done);
    drbd_queue_work(&mdev->data.work, &barr.w);
    wait_for_completion(&barr.done);
}

static void drbd_disconnect(struct drbd_conf *mdev)
{
    enum drbd_fencing_p fp;
    union drbd_state os, ns;
    int rv = SS_UNKNOWN_ERROR;
    unsigned int i;

    if (mdev->state.conn == C_STANDALONE)
        return;
    if (mdev->state.conn >= C_WF_CONNECTION)
        dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
            drbd_conn_str(mdev->state.conn));

    /* asender does not clean up anything. it must not interfere, either */
    drbd_thread_stop(&mdev->asender);

    mutex_lock(&mdev->data.mutex);
    drbd_free_sock(mdev);
    mutex_unlock(&mdev->data.mutex);

    spin_lock_irq(&mdev->req_lock);
    _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
    _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
    _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
    spin_unlock_irq(&mdev->req_lock);

    /* We do not have data structures that would allow us to
     * get the rs_pending_cnt down to 0 again.
     *  * On C_SYNC_TARGET we do not have any data structures describing
     *    the pending RSDataRequest's we have sent.
     *  * On C_SYNC_SOURCE there is no data structure that tracks
     *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
     *  And no, it is not the sum of the reference counts in the
     *  resync_LRU. The resync_LRU tracks the whole operation including
     *  the disk-IO, while the rs_pending_cnt only tracks the blocks
     *  on the fly. */
    drbd_rs_cancel_all(mdev);
    mdev->rs_total = 0;
    mdev->rs_failed = 0;
    atomic_set(&mdev->rs_pending_cnt, 0);
    wake_up(&mdev->misc_wait);

    /* make sure syncer is stopped and w_resume_next_sg queued */
    del_timer_sync(&mdev->resync_timer);
    set_bit(STOP_SYNC_TIMER, &mdev->flags);
    resync_timer_fn((unsigned long)mdev);

    /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
     * w_make_resync_request etc. which may still be on the worker queue
     * to be "canceled" */
    drbd_flush_workqueue(mdev);

    /* This also does reclaim_net_ee().  If we do this too early, we might
     * miss some resync ee and pages. */
    drbd_process_done_ee(mdev);

    kfree(mdev->p_uuid);
    mdev->p_uuid = NULL;

    if (!mdev->state.susp)
        tl_clear(mdev);

    drbd_fail_pending_reads(mdev);

    dev_info(DEV, "Connection closed\n");

    drbd_md_sync(mdev);

    fp = FP_DONT_CARE;
    if (get_ldev(mdev)) {
        fp = mdev->ldev->dc.fencing;
        put_ldev(mdev);
    }

    if (mdev->state.role == R_PRIMARY) {
        if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
            enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
            drbd_request_state(mdev, NS(pdsk, nps));
        }
    }

    spin_lock_irq(&mdev->req_lock);
    os = mdev->state;
    if (os.conn >= C_UNCONNECTED) {
        /* Do not restart in case we are C_DISCONNECTING */
        ns = os;
        ns.conn = C_UNCONNECTED;
        rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
    }
    spin_unlock_irq(&mdev->req_lock);

    if (os.conn == C_DISCONNECTING) {
        struct hlist_head *h;
        wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);

        /* we must not free the tl_hash
         * while application io is still on the fly */
        wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);

        spin_lock_irq(&mdev->req_lock);
        /* paranoia code */
        for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
            if (h->first)
                dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
                    (int)(h - mdev->ee_hash), h->first);
        kfree(mdev->ee_hash);
        mdev->ee_hash = NULL;
        mdev->ee_hash_s = 0;

        /* paranoia code */
        for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
            if (h->first)
                dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
                    (int)(h - mdev->tl_hash), h->first);
        kfree(mdev->tl_hash);
        mdev->tl_hash = NULL;
        mdev->tl_hash_s = 0;
        spin_unlock_irq(&mdev->req_lock);

        crypto_free_hash(mdev->cram_hmac_tfm);
        mdev->cram_hmac_tfm = NULL;

        kfree(mdev->net_conf);
        mdev->net_conf = NULL;
        drbd_request_state(mdev, NS(conn, C_STANDALONE));
    }

    /* tcp_close and release of sendpage pages can be deferred.  I don't
     * want to use SO_LINGER, because apparently it can be deferred for
     * more than 20 seconds (longest time I checked).
     *
     * Actually we don't care for exactly when the network stack does its
     * put_page(), but release our reference on these pages right here.
     */
    i = drbd_release_ee(mdev, &mdev->net_ee);
    if (i)
        dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
    i = atomic_read(&mdev->pp_in_use);
    if (i)
        dev_info(DEV, "pp_in_use = %u, expected 0\n", i);

    D_ASSERT(list_empty(&mdev->read_ee));
    D_ASSERT(list_empty(&mdev->active_ee));
    D_ASSERT(list_empty(&mdev->sync_ee));
    D_ASSERT(list_empty(&mdev->done_ee));

    /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
    atomic_set(&mdev->current_epoch->epoch_size, 0);
    D_ASSERT(list_empty(&mdev->current_epoch->list));
}
/*
 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
 * we can agree on is stored in agreed_pro_version.
 *
 * feature flags and the reserved array should be enough room for future
 * enhancements of the handshake protocol, and possible plugins...
 *
 * for now, they are expected to be zero, but ignored.
 */
static int drbd_send_handshake(struct drbd_conf *mdev)
{
    /* ASSERT current == mdev->receiver ... */
    struct p_handshake *p = &mdev->data.sbuf.handshake;
    int ok;

    if (mutex_lock_interruptible(&mdev->data.mutex)) {
        dev_err(DEV, "interrupted during initial handshake\n");
        return 0; /* interrupted. not ok. */
    }

    if (mdev->data.socket == NULL) {
        mutex_unlock(&mdev->data.mutex);
        return 0;
    }

    memset(p, 0, sizeof(*p));
    p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
    p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
    ok = _drbd_send_cmd(mdev, mdev->data.socket, P_HAND_SHAKE,
                (struct p_header *)p, sizeof(*p), 0);
    mutex_unlock(&mdev->data.mutex);
    return ok;
}

/*
 * return values:
 *   1 yes, we have a valid connection
 *   0 oops, did not work out, please try again
 *  -1 peer talks different language,
 *     no point in trying again, please go standalone.
 */
static int drbd_do_handshake(struct drbd_conf *mdev)
{
    /* ASSERT current == mdev->receiver ... */
    struct p_handshake *p = &mdev->data.rbuf.handshake;
    const int expect = sizeof(struct p_handshake)
              -sizeof(struct p_header);
    int rv;

    rv = drbd_send_handshake(mdev);
    if (!rv)
        return 0;

    rv = drbd_recv_header(mdev, &p->head);
    if (!rv)
        return 0;

    if (p->head.command != P_HAND_SHAKE) {
        dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
            cmdname(p->head.command), p->head.command);
        return -1;
    }

    if (p->head.length != expect) {
        dev_err(DEV, "expected HandShake length: %u, received: %u\n",
            expect, p->head.length);
        return -1;
    }

    rv = drbd_recv(mdev, &p->head.payload, expect);

    if (rv != expect) {
        dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
        return 0;
    }

    p->protocol_min = be32_to_cpu(p->protocol_min);
    p->protocol_max = be32_to_cpu(p->protocol_max);
    if (p->protocol_max == 0)
        p->protocol_max = p->protocol_min;

    if (PRO_VERSION_MAX < p->protocol_min ||
        PRO_VERSION_MIN > p->protocol_max)
        goto incompat;

    mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);

    dev_info(DEV, "Handshake successful: "
         "Agreed network protocol version %d\n", mdev->agreed_pro_version);

    return 1;

 incompat:
    dev_err(DEV, "incompatible DRBD dialects: "
        "I support %d-%d, peer supports %d-%d\n",
        PRO_VERSION_MIN, PRO_VERSION_MAX,
        p->protocol_min, p->protocol_max);
    return -1;
}
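/*
 * The version negotiation in drbd_do_handshake() above, condensed into
 * a sketch (agree_version() is illustrative only, not driver code):
 * the connection survives iff the advertised ranges overlap, and the
 * agreed version is the highest one both sides support.
 */
static int agree_version(int my_min, int my_max, int peer_min, int peer_max)
{
    if (my_max < peer_min || my_min > peer_max)
        return -1;                  /* incompatible dialects */
    return min(my_max, peer_max);   /* e.g. [86,94] vs [86,91] -> 91 */
}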
#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
static int drbd_do_auth(struct drbd_conf *mdev)
{
    dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
    dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
    return 0;
}
#else
#define CHALLENGE_LEN 64
static int drbd_do_auth(struct drbd_conf *mdev)
{
    char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
    struct scatterlist sg;
    char *response = NULL;
    char *right_response = NULL;
    char *peers_ch = NULL;
    struct p_header p;
    unsigned int key_len = strlen(mdev->net_conf->shared_secret);
    unsigned int resp_size;
    struct hash_desc desc;
    int rv;

    desc.tfm = mdev->cram_hmac_tfm;
    desc.flags = 0;

    rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
                (u8 *)mdev->net_conf->shared_secret, key_len);
    if (rv) {
        dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
        rv = 0;
        goto fail;
    }

    get_random_bytes(my_challenge, CHALLENGE_LEN);

    rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
    if (!rv)
        goto fail;

    rv = drbd_recv_header(mdev, &p);
    if (!rv)
        goto fail;

    if (p.command != P_AUTH_CHALLENGE) {
        dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
            cmdname(p.command), p.command);
        rv = 0;
        goto fail;
    }

    if (p.length > CHALLENGE_LEN*2) {
        dev_err(DEV, "AuthChallenge payload too big.\n");
        rv = 0;
        goto fail;
    }

    peers_ch = kmalloc(p.length, GFP_NOIO);
    if (peers_ch == NULL) {
        dev_err(DEV, "kmalloc of peers_ch failed\n");
        rv = 0;
        goto fail;
    }

    rv = drbd_recv(mdev, peers_ch, p.length);

    if (rv != p.length) {
        dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
        rv = 0;
        goto fail;
    }

    resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
    response = kmalloc(resp_size, GFP_NOIO);
    if (response == NULL) {
        dev_err(DEV, "kmalloc of response failed\n");
        rv = 0;
        goto fail;
    }

    sg_init_table(&sg, 1);
    sg_set_buf(&sg, peers_ch, p.length);

    rv = crypto_hash_digest(&desc, &sg, sg.length, response);
    if (rv) {
        dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
        rv = 0;
        goto fail;
    }

    rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
    if (!rv)
        goto fail;

    rv = drbd_recv_header(mdev, &p);
    if (!rv)
        goto fail;

    if (p.command != P_AUTH_RESPONSE) {
        dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
            cmdname(p.command), p.command);
        rv = 0;
        goto fail;
    }

    if (p.length != resp_size) {
        dev_err(DEV, "AuthResponse payload has wrong size\n");
        rv = 0;
        goto fail;
    }

    rv = drbd_recv(mdev, response, resp_size);

    if (rv != resp_size) {
        dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
        rv = 0;
        goto fail;
    }

    right_response = kmalloc(resp_size, GFP_NOIO);
    if (right_response == NULL) {
        dev_err(DEV, "kmalloc of right_response failed\n");
        rv = 0;
        goto fail;
    }

    sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);

    rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
    if (rv) {
        dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
        rv = 0;
        goto fail;
    }

    rv = !memcmp(response, right_response, resp_size);

    if (rv)
        dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
             resp_size, mdev->net_conf->cram_hmac_alg);

 fail:
    kfree(peers_ch);
    kfree(response);
    kfree(right_response);

    return rv;
}
#endif
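/*
 * The exchange implemented by drbd_do_auth() above, condensed: each
 * side sends a random challenge, computes HMAC(shared_secret, peer's
 * challenge) as its response, and verifies the peer's response against
 * its own HMAC over the challenge it sent.  This authenticates the
 * peer at connect time only; integrity of the subsequent data stream
 * is handled separately (see the data-integrity-alg setting).
 */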
HMAC\n", 3967 + resp_size, mdev->net_conf->cram_hmac_alg); 3968 + 3969 + fail: 3970 + kfree(peers_ch); 3971 + kfree(response); 3972 + kfree(right_response); 3973 + 3974 + return rv; 3975 + } 3976 + #endif 3977 + 3978 + int drbdd_init(struct drbd_thread *thi) 3979 + { 3980 + struct drbd_conf *mdev = thi->mdev; 3981 + unsigned int minor = mdev_to_minor(mdev); 3982 + int h; 3983 + 3984 + sprintf(current->comm, "drbd%d_receiver", minor); 3985 + 3986 + dev_info(DEV, "receiver (re)started\n"); 3987 + 3988 + do { 3989 + h = drbd_connect(mdev); 3990 + if (h == 0) { 3991 + drbd_disconnect(mdev); 3992 + __set_current_state(TASK_INTERRUPTIBLE); 3993 + schedule_timeout(HZ); 3994 + } 3995 + if (h == -1) { 3996 + dev_warn(DEV, "Discarding network configuration.\n"); 3997 + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 3998 + } 3999 + } while (h == 0); 4000 + 4001 + if (h > 0) { 4002 + if (get_net_conf(mdev)) { 4003 + drbdd(mdev); 4004 + put_net_conf(mdev); 4005 + } 4006 + } 4007 + 4008 + drbd_disconnect(mdev); 4009 + 4010 + dev_info(DEV, "receiver terminated\n"); 4011 + return 0; 4012 + } 4013 + 4014 + /* ********* acknowledge sender ******** */ 4015 + 4016 + static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) 4017 + { 4018 + struct p_req_state_reply *p = (struct p_req_state_reply *)h; 4019 + 4020 + int retcode = be32_to_cpu(p->retcode); 4021 + 4022 + if (retcode >= SS_SUCCESS) { 4023 + set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); 4024 + } else { 4025 + set_bit(CL_ST_CHG_FAIL, &mdev->flags); 4026 + dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", 4027 + drbd_set_st_err_str(retcode), retcode); 4028 + } 4029 + wake_up(&mdev->state_wait); 4030 + 4031 + return TRUE; 4032 + } 4033 + 4034 + static int got_Ping(struct drbd_conf *mdev, struct p_header *h) 4035 + { 4036 + return drbd_send_ping_ack(mdev); 4037 + 4038 + } 4039 + 4040 + static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) 4041 + { 4042 + /* restore idle timeout */ 4043 + mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; 4044 + 4045 + return TRUE; 4046 + } 4047 + 4048 + static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) 4049 + { 4050 + struct p_block_ack *p = (struct p_block_ack *)h; 4051 + sector_t sector = be64_to_cpu(p->sector); 4052 + int blksize = be32_to_cpu(p->blksize); 4053 + 4054 + D_ASSERT(mdev->agreed_pro_version >= 89); 4055 + 4056 + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4057 + 4058 + drbd_rs_complete_io(mdev, sector); 4059 + drbd_set_in_sync(mdev, sector, blksize); 4060 + /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ 4061 + mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); 4062 + dec_rs_pending(mdev); 4063 + 4064 + return TRUE; 4065 + } 4066 + 4067 + /* when we receive the ACK for a write request, 4068 + * verify that we actually know about it */ 4069 + static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, 4070 + u64 id, sector_t sector) 4071 + { 4072 + struct hlist_head *slot = tl_hash_slot(mdev, sector); 4073 + struct hlist_node *n; 4074 + struct drbd_request *req; 4075 + 4076 + hlist_for_each_entry(req, n, slot, colision) { 4077 + if ((unsigned long)req == (unsigned long)id) { 4078 + if (req->sector != sector) { 4079 + dev_err(DEV, "_ack_id_to_req: found req %p but it has " 4080 + "wrong sector (%llus versus %llus)\n", req, 4081 + (unsigned long long)req->sector, 4082 + (unsigned long long)sector); 4083 + break; 4084 + } 4085 + return req; 4086 + } 4087 + } 4088 + dev_err(DEV, "_ack_id_to_req: failed to find 
req %p, sector %llus in list\n", 4089 + (void *)(unsigned long)id, (unsigned long long)sector); 4090 + return NULL; 4091 + } 4092 + 4093 + typedef struct drbd_request *(req_validator_fn) 4094 + (struct drbd_conf *mdev, u64 id, sector_t sector); 4095 + 4096 + static int validate_req_change_req_state(struct drbd_conf *mdev, 4097 + u64 id, sector_t sector, req_validator_fn validator, 4098 + const char *func, enum drbd_req_event what) 4099 + { 4100 + struct drbd_request *req; 4101 + struct bio_and_error m; 4102 + 4103 + spin_lock_irq(&mdev->req_lock); 4104 + req = validator(mdev, id, sector); 4105 + if (unlikely(!req)) { 4106 + spin_unlock_irq(&mdev->req_lock); 4107 + dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func); 4108 + return FALSE; 4109 + } 4110 + __req_mod(req, what, &m); 4111 + spin_unlock_irq(&mdev->req_lock); 4112 + 4113 + if (m.bio) 4114 + complete_master_bio(mdev, &m); 4115 + return TRUE; 4116 + } 4117 + 4118 + static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) 4119 + { 4120 + struct p_block_ack *p = (struct p_block_ack *)h; 4121 + sector_t sector = be64_to_cpu(p->sector); 4122 + int blksize = be32_to_cpu(p->blksize); 4123 + enum drbd_req_event what; 4124 + 4125 + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4126 + 4127 + if (is_syncer_block_id(p->block_id)) { 4128 + drbd_set_in_sync(mdev, sector, blksize); 4129 + dec_rs_pending(mdev); 4130 + return TRUE; 4131 + } 4132 + switch (be16_to_cpu(h->command)) { 4133 + case P_RS_WRITE_ACK: 4134 + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); 4135 + what = write_acked_by_peer_and_sis; 4136 + break; 4137 + case P_WRITE_ACK: 4138 + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); 4139 + what = write_acked_by_peer; 4140 + break; 4141 + case P_RECV_ACK: 4142 + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B); 4143 + what = recv_acked_by_peer; 4144 + break; 4145 + case P_DISCARD_ACK: 4146 + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); 4147 + what = conflict_discarded_by_peer; 4148 + break; 4149 + default: 4150 + D_ASSERT(0); 4151 + return FALSE; 4152 + } 4153 + 4154 + return validate_req_change_req_state(mdev, p->block_id, sector, 4155 + _ack_id_to_req, __func__ , what); 4156 + } 4157 + 4158 + static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) 4159 + { 4160 + struct p_block_ack *p = (struct p_block_ack *)h; 4161 + sector_t sector = be64_to_cpu(p->sector); 4162 + 4163 + if (__ratelimit(&drbd_ratelimit_state)) 4164 + dev_warn(DEV, "Got NegAck packet. 
Peer is in trouble?\n"); 4165 + 4166 + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4167 + 4168 + if (is_syncer_block_id(p->block_id)) { 4169 + int size = be32_to_cpu(p->blksize); 4170 + dec_rs_pending(mdev); 4171 + drbd_rs_failed_io(mdev, sector, size); 4172 + return TRUE; 4173 + } 4174 + return validate_req_change_req_state(mdev, p->block_id, sector, 4175 + _ack_id_to_req, __func__ , neg_acked); 4176 + } 4177 + 4178 + static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) 4179 + { 4180 + struct p_block_ack *p = (struct p_block_ack *)h; 4181 + sector_t sector = be64_to_cpu(p->sector); 4182 + 4183 + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4184 + dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n", 4185 + (unsigned long long)sector, be32_to_cpu(p->blksize)); 4186 + 4187 + return validate_req_change_req_state(mdev, p->block_id, sector, 4188 + _ar_id_to_req, __func__ , neg_acked); 4189 + } 4190 + 4191 + static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) 4192 + { 4193 + sector_t sector; 4194 + int size; 4195 + struct p_block_ack *p = (struct p_block_ack *)h; 4196 + 4197 + sector = be64_to_cpu(p->sector); 4198 + size = be32_to_cpu(p->blksize); 4199 + D_ASSERT(p->block_id == ID_SYNCER); 4200 + 4201 + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4202 + 4203 + dec_rs_pending(mdev); 4204 + 4205 + if (get_ldev_if_state(mdev, D_FAILED)) { 4206 + drbd_rs_complete_io(mdev, sector); 4207 + drbd_rs_failed_io(mdev, sector, size); 4208 + put_ldev(mdev); 4209 + } 4210 + 4211 + return TRUE; 4212 + } 4213 + 4214 + static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) 4215 + { 4216 + struct p_barrier_ack *p = (struct p_barrier_ack *)h; 4217 + 4218 + tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); 4219 + 4220 + return TRUE; 4221 + } 4222 + 4223 + static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) 4224 + { 4225 + struct p_block_ack *p = (struct p_block_ack *)h; 4226 + struct drbd_work *w; 4227 + sector_t sector; 4228 + int size; 4229 + 4230 + sector = be64_to_cpu(p->sector); 4231 + size = be32_to_cpu(p->blksize); 4232 + 4233 + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4234 + 4235 + if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) 4236 + drbd_ov_oos_found(mdev, sector, size); 4237 + else 4238 + ov_oos_print(mdev); 4239 + 4240 + drbd_rs_complete_io(mdev, sector); 4241 + dec_rs_pending(mdev); 4242 + 4243 + if (--mdev->ov_left == 0) { 4244 + w = kmalloc(sizeof(*w), GFP_NOIO); 4245 + if (w) { 4246 + w->cb = w_ov_finished; 4247 + drbd_queue_work_front(&mdev->data.work, w); 4248 + } else { 4249 + dev_err(DEV, "kmalloc(w) failed."); 4250 + ov_oos_print(mdev); 4251 + drbd_resync_finished(mdev); 4252 + } 4253 + } 4254 + return TRUE; 4255 + } 4256 + 4257 + struct asender_cmd { 4258 + size_t pkt_size; 4259 + int (*process)(struct drbd_conf *mdev, struct p_header *h); 4260 + }; 4261 + 4262 + static struct asender_cmd *get_asender_cmd(int cmd) 4263 + { 4264 + static struct asender_cmd asender_tbl[] = { 4265 + /* anything missing from this table is in 4266 + * the drbd_cmd_handler (drbd_default_handler) table, 4267 + * see the beginning of drbdd() */ 4268 + [P_PING] = { sizeof(struct p_header), got_Ping }, 4269 + [P_PING_ACK] = { sizeof(struct p_header), got_PingAck }, 4270 + [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 4271 + [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 4272 + [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 4273 + [P_DISCARD_ACK] = { 
sizeof(struct p_block_ack), got_BlockAck }, 4274 + [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, 4275 + [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, 4276 + [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply}, 4277 + [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, 4278 + [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, 4279 + [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, 4280 + [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, 4281 + [P_MAX_CMD] = { 0, NULL }, 4282 + }; 4283 + if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) 4284 + return NULL; 4285 + return &asender_tbl[cmd]; 4286 + } 4287 + 4288 + int drbd_asender(struct drbd_thread *thi) 4289 + { 4290 + struct drbd_conf *mdev = thi->mdev; 4291 + struct p_header *h = &mdev->meta.rbuf.header; 4292 + struct asender_cmd *cmd = NULL; 4293 + 4294 + int rv, len; 4295 + void *buf = h; 4296 + int received = 0; 4297 + int expect = sizeof(struct p_header); 4298 + int empty; 4299 + 4300 + sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); 4301 + 4302 + current->policy = SCHED_RR; /* Make this a realtime task! */ 4303 + current->rt_priority = 2; /* more important than all other tasks */ 4304 + 4305 + while (get_t_state(thi) == Running) { 4306 + drbd_thread_current_set_cpu(mdev); 4307 + if (test_and_clear_bit(SEND_PING, &mdev->flags)) { 4308 + ERR_IF(!drbd_send_ping(mdev)) goto reconnect; 4309 + mdev->meta.socket->sk->sk_rcvtimeo = 4310 + mdev->net_conf->ping_timeo*HZ/10; 4311 + } 4312 + 4313 + /* conditionally cork; 4314 + * it may hurt latency if we cork without much to send */ 4315 + if (!mdev->net_conf->no_cork && 4316 + 3 < atomic_read(&mdev->unacked_cnt)) 4317 + drbd_tcp_cork(mdev->meta.socket); 4318 + while (1) { 4319 + clear_bit(SIGNAL_ASENDER, &mdev->flags); 4320 + flush_signals(current); 4321 + if (!drbd_process_done_ee(mdev)) { 4322 + dev_err(DEV, "process_done_ee() = NOT_OK\n"); 4323 + goto reconnect; 4324 + } 4325 + /* to avoid race with newly queued ACKs */ 4326 + set_bit(SIGNAL_ASENDER, &mdev->flags); 4327 + spin_lock_irq(&mdev->req_lock); 4328 + empty = list_empty(&mdev->done_ee); 4329 + spin_unlock_irq(&mdev->req_lock); 4330 + /* new ack may have been queued right here, 4331 + * but then there is also a signal pending, 4332 + * and we start over... */ 4333 + if (empty) 4334 + break; 4335 + } 4336 + /* but unconditionally uncork unless disabled */ 4337 + if (!mdev->net_conf->no_cork) 4338 + drbd_tcp_uncork(mdev->meta.socket); 4339 + 4340 + /* short circuit, recv_msg would return EINTR anyways. */ 4341 + if (signal_pending(current)) 4342 + continue; 4343 + 4344 + rv = drbd_recv_short(mdev, mdev->meta.socket, 4345 + buf, expect-received, 0); 4346 + clear_bit(SIGNAL_ASENDER, &mdev->flags); 4347 + 4348 + flush_signals(current); 4349 + 4350 + /* Note: 4351 + * -EINTR (on meta) we got a signal 4352 + * -EAGAIN (on meta) rcvtimeo expired 4353 + * -ECONNRESET other side closed the connection 4354 + * -ERESTARTSYS (on data) we got a signal 4355 + * rv < 0 other than above: unexpected error! 
4356 + * rv == expected: full header or command 4357 + * rv < expected: "woken" by signal during receive 4358 + * rv == 0 : "connection shut down by peer" 4359 + */ 4360 + if (likely(rv > 0)) { 4361 + received += rv; 4362 + buf += rv; 4363 + } else if (rv == 0) { 4364 + dev_err(DEV, "meta connection shut down by peer.\n"); 4365 + goto reconnect; 4366 + } else if (rv == -EAGAIN) { 4367 + if (mdev->meta.socket->sk->sk_rcvtimeo == 4368 + mdev->net_conf->ping_timeo*HZ/10) { 4369 + dev_err(DEV, "PingAck did not arrive in time.\n"); 4370 + goto reconnect; 4371 + } 4372 + set_bit(SEND_PING, &mdev->flags); 4373 + continue; 4374 + } else if (rv == -EINTR) { 4375 + continue; 4376 + } else { 4377 + dev_err(DEV, "sock_recvmsg returned %d\n", rv); 4378 + goto reconnect; 4379 + } 4380 + 4381 + if (received == expect && cmd == NULL) { 4382 + if (unlikely(h->magic != BE_DRBD_MAGIC)) { 4383 + dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n", 4384 + (long)be32_to_cpu(h->magic), 4385 + h->command, h->length); 4386 + goto reconnect; 4387 + } 4388 + cmd = get_asender_cmd(be16_to_cpu(h->command)); 4389 + len = be16_to_cpu(h->length); 4390 + if (unlikely(cmd == NULL)) { 4391 + dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n", 4392 + (long)be32_to_cpu(h->magic), 4393 + h->command, h->length); 4394 + goto disconnect; 4395 + } 4396 + expect = cmd->pkt_size; 4397 + ERR_IF(len != expect-sizeof(struct p_header)) 4398 + goto reconnect; 4399 + } 4400 + if (received == expect) { 4401 + D_ASSERT(cmd != NULL); 4402 + if (!cmd->process(mdev, h)) 4403 + goto reconnect; 4404 + 4405 + buf = h; 4406 + received = 0; 4407 + expect = sizeof(struct p_header); 4408 + cmd = NULL; 4409 + } 4410 + } 4411 + 4412 + if (0) { 4413 + reconnect: 4414 + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); 4415 + } 4416 + if (0) { 4417 + disconnect: 4418 + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 4419 + } 4420 + clear_bit(SIGNAL_ASENDER, &mdev->flags); 4421 + 4422 + D_ASSERT(mdev->state.conn < C_CONNECTED); 4423 + dev_info(DEV, "asender terminated\n"); 4424 + 4425 + return 0; 4426 + }
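Note on the dispatch pattern above: drbd_asender() is table-driven. get_asender_cmd() maps each command arriving on the meta socket to a fixed packet size plus a handler, and the receive loop accumulates bytes until received == expect, dispatches, then resets for the next header. A minimal standalone sketch of that pattern, with hypothetical command IDs and handlers standing in for the real DRBD protocol definitions:

#include <stdio.h>
#include <stddef.h>

/* hypothetical stand-ins for DRBD's p_header and command numbers */
struct header { int command; int length; };

enum { CMD_PING, CMD_PING_ACK, CMD_BLOCK_ACK, CMD_MAX };

struct cmd_entry {
	size_t pkt_size;                  /* total bytes to receive before dispatch */
	int (*process)(struct header *h); /* handler; returns nonzero on success */
};

static int got_ping(struct header *h)      { puts("ping");      return 1; }
static int got_ping_ack(struct header *h)  { puts("ping ack");  return 1; }
static int got_block_ack(struct header *h) { puts("block ack"); return 1; }

/* designated initializers keyed by command id, as in asender_tbl[] above;
 * any command without an entry stays zero-filled */
static struct cmd_entry cmd_tbl[CMD_MAX + 1] = {
	[CMD_PING]      = { sizeof(struct header),      got_ping },
	[CMD_PING_ACK]  = { sizeof(struct header),      got_ping_ack },
	[CMD_BLOCK_ACK] = { sizeof(struct header) + 24, got_block_ack },
};

static struct cmd_entry *get_cmd(int cmd)
{
	/* out-of-range or zero-filled slot: unknown command, caller disconnects */
	if (cmd < 0 || cmd > CMD_MAX || cmd_tbl[cmd].process == NULL)
		return NULL;
	return &cmd_tbl[cmd];
}

int main(void)
{
	struct header h = { CMD_BLOCK_ACK, 24 };
	struct cmd_entry *e = get_cmd(h.command);
	return (e && e->process(&h)) ? 0 : 1;
}

The NULL-handler check is why asender_tbl[] can end with an explicit { 0, NULL } sentinel at P_MAX_CMD: a missing process pointer doubles as the unknown-command path, which drbd_asender() turns into a disconnect.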
+1125
drivers/block/drbd/drbd_req.c
··· 1 + /* 2 + drbd_req.c 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + 24 + */ 25 + 26 + #include <linux/module.h> 27 + 28 + #include <linux/slab.h> 29 + #include <linux/drbd.h> 30 + #include "drbd_int.h" 31 + #include "drbd_req.h" 32 + 33 + 34 + /* Update disk stats at start of I/O request */ 35 + static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) 36 + { 37 + const int rw = bio_data_dir(bio); 38 + int cpu; 39 + cpu = part_stat_lock(); 40 + part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); 41 + part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); 42 + part_inc_in_flight(&mdev->vdisk->part0, rw); 43 + part_stat_unlock(); 44 + } 45 + 46 + /* Update disk stats when completing request upwards */ 47 + static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) 48 + { 49 + int rw = bio_data_dir(req->master_bio); 50 + unsigned long duration = jiffies - req->start_time; 51 + int cpu; 52 + cpu = part_stat_lock(); 53 + part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration); 54 + part_round_stats(cpu, &mdev->vdisk->part0); 55 + part_dec_in_flight(&mdev->vdisk->part0, rw); 56 + part_stat_unlock(); 57 + } 58 + 59 + static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) 60 + { 61 + const unsigned long s = req->rq_state; 62 + /* if it was a write, we may have to set the corresponding 63 + * bit(s) out-of-sync first. If it had a local part, we need to 64 + * release the reference to the activity log. */ 65 + if (rw == WRITE) { 66 + /* remove it from the transfer log. 67 + * well, only if it had been there in the first 68 + * place... if it had not (local only or conflicting 69 + * and never sent), it should still be "empty" as 70 + * initialized in drbd_req_new(), so we can list_del() it 71 + * here unconditionally */ 72 + list_del(&req->tl_requests); 73 + /* Set out-of-sync unless both OK flags are set 74 + * (local only or remote failed). 75 + * Other places where we set out-of-sync: 76 + * READ with local io-error */ 77 + if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) 78 + drbd_set_out_of_sync(mdev, req->sector, req->size); 79 + 80 + if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) 81 + drbd_set_in_sync(mdev, req->sector, req->size); 82 + 83 + /* one might be tempted to move the drbd_al_complete_io 84 + * to the local io completion callback drbd_endio_pri. 
85 + * but, if this was a mirror write, we may only 86 + * drbd_al_complete_io after this is RQ_NET_DONE, 87 + * otherwise the extent could be dropped from the al 88 + * before it has actually been written on the peer. 89 + * if we crash before our peer knows about the request, 90 + * but after the extent has been dropped from the al, 91 + * we would forget to resync the corresponding extent. 92 + */ 93 + if (s & RQ_LOCAL_MASK) { 94 + if (get_ldev_if_state(mdev, D_FAILED)) { 95 + drbd_al_complete_io(mdev, req->sector); 96 + put_ldev(mdev); 97 + } else if (__ratelimit(&drbd_ratelimit_state)) { 98 + dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " 99 + "but my Disk seems to have failed :(\n", 100 + (unsigned long long) req->sector); 101 + } 102 + } 103 + } 104 + 105 + /* if it was a local io error, we want to notify our 106 + * peer about that, and see if we need to 107 + * detach the disk and stuff. 108 + * to avoid allocating some special work 109 + * struct, reuse the request. */ 110 + 111 + /* THINK 112 + * why do we do this not when we detect the error, 113 + * but delay it until it is "done", i.e. possibly 114 + * until the next barrier ack? */ 115 + 116 + if (rw == WRITE && 117 + ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) { 118 + if (!(req->w.list.next == LIST_POISON1 || 119 + list_empty(&req->w.list))) { 120 + /* DEBUG ASSERT only; if this triggers, we 121 + * probably corrupt the worker list here */ 122 + dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next); 123 + dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev); 124 + } 125 + req->w.cb = w_io_error; 126 + drbd_queue_work(&mdev->data.work, &req->w); 127 + /* drbd_req_free() is done in w_io_error */ 128 + } else { 129 + drbd_req_free(req); 130 + } 131 + } 132 + 133 + static void queue_barrier(struct drbd_conf *mdev) 134 + { 135 + struct drbd_tl_epoch *b; 136 + 137 + /* We are within the req_lock. Once we queued the barrier for sending, 138 + * we set the CREATE_BARRIER bit. It is cleared as soon as a new 139 + * barrier/epoch object is added. This is the only place this bit is 140 + * set. It indicates that the barrier for this epoch is already queued, 141 + * and no new epoch has been created yet. */ 142 + if (test_bit(CREATE_BARRIER, &mdev->flags)) 143 + return; 144 + 145 + b = mdev->newest_tle; 146 + b->w.cb = w_send_barrier; 147 + /* inc_ap_pending done here, so we won't 148 + * get imbalanced on connection loss. 149 + * dec_ap_pending will be done in got_BarrierAck 150 + * or (on connection loss) in tl_clear. 
*/ 151 + inc_ap_pending(mdev); 152 + drbd_queue_work(&mdev->data.work, &b->w); 153 + set_bit(CREATE_BARRIER, &mdev->flags); 154 + } 155 + 156 + static void _about_to_complete_local_write(struct drbd_conf *mdev, 157 + struct drbd_request *req) 158 + { 159 + const unsigned long s = req->rq_state; 160 + struct drbd_request *i; 161 + struct drbd_epoch_entry *e; 162 + struct hlist_node *n; 163 + struct hlist_head *slot; 164 + 165 + /* before we can signal completion to the upper layers, 166 + * we may need to close the current epoch */ 167 + if (mdev->state.conn >= C_CONNECTED && 168 + req->epoch == mdev->newest_tle->br_number) 169 + queue_barrier(mdev); 170 + 171 + /* we need to do the conflict detection stuff, 172 + * if we have the ee_hash (two_primaries) and 173 + * this has been on the network */ 174 + if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { 175 + const sector_t sector = req->sector; 176 + const int size = req->size; 177 + 178 + /* ASSERT: 179 + * there must be no conflicting requests, since 180 + * they must have been failed on the spot */ 181 + #define OVERLAPS overlaps(sector, size, i->sector, i->size) 182 + slot = tl_hash_slot(mdev, sector); 183 + hlist_for_each_entry(i, n, slot, colision) { 184 + if (OVERLAPS) { 185 + dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " 186 + "other: %p %llus +%u\n", 187 + req, (unsigned long long)sector, size, 188 + i, (unsigned long long)i->sector, i->size); 189 + } 190 + } 191 + 192 + /* maybe "wake" those conflicting epoch entries 193 + * that wait for this request to finish. 194 + * 195 + * currently, there can be only _one_ such ee 196 + * (well, or some more, which would be pending 197 + * P_DISCARD_ACK not yet sent by the asender...), 198 + * since we block the receiver thread upon the 199 + * first conflict detection, which will wait on 200 + * misc_wait. maybe we want to assert that? 201 + * 202 + * anyways, if we found one, 203 + * we just have to do a wake_up. */ 204 + #undef OVERLAPS 205 + #define OVERLAPS overlaps(sector, size, e->sector, e->size) 206 + slot = ee_hash_slot(mdev, req->sector); 207 + hlist_for_each_entry(e, n, slot, colision) { 208 + if (OVERLAPS) { 209 + wake_up(&mdev->misc_wait); 210 + break; 211 + } 212 + } 213 + } 214 + #undef OVERLAPS 215 + } 216 + 217 + void complete_master_bio(struct drbd_conf *mdev, 218 + struct bio_and_error *m) 219 + { 220 + bio_endio(m->bio, m->error); 221 + dec_ap_bio(mdev); 222 + } 223 + 224 + /* Helper for __req_mod(). 225 + * Set m->bio to the master bio, if it is fit to be completed, 226 + * or leave it alone (it is initialized to NULL in __req_mod), 227 + * if it has already been completed, or cannot be completed yet. 228 + * If m->bio is set, the error status to be returned is placed in m->error. 229 + */ 230 + void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) 231 + { 232 + const unsigned long s = req->rq_state; 233 + struct drbd_conf *mdev = req->mdev; 234 + /* only WRITES may end up here without a master bio (on barrier ack) */ 235 + int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; 236 + 237 + /* we must not complete the master bio, while it is 238 + * still being processed by _drbd_send_zc_bio (drbd_send_dblock) 239 + * not yet acknowledged by the peer 240 + * not yet completed by the local io subsystem 241 + * these flags may get cleared in any order by 242 + * the worker, 243 + * the receiver, 244 + * the bio_endio completion callbacks. 
245 + */ 246 + if (s & RQ_NET_QUEUED) 247 + return; 248 + if (s & RQ_NET_PENDING) 249 + return; 250 + if (s & RQ_LOCAL_PENDING) 251 + return; 252 + 253 + if (req->master_bio) { 254 + /* this is data_received (remote read) 255 + * or protocol C P_WRITE_ACK 256 + * or protocol B P_RECV_ACK 257 + * or protocol A "handed_over_to_network" (SendAck) 258 + * or canceled or failed, 259 + * or killed from the transfer log due to connection loss. 260 + */ 261 + 262 + /* 263 + * figure out whether to report success or failure. 264 + * 265 + * report success when at least one of the operations succeeded. 266 + * or, to put the other way, 267 + * only report failure, when both operations failed. 268 + * 269 + * what to do about the failures is handled elsewhere. 270 + * what we need to do here is just: complete the master_bio. 271 + * 272 + * local completion error, if any, has been stored as ERR_PTR 273 + * in private_bio within drbd_endio_pri. 274 + */ 275 + int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); 276 + int error = PTR_ERR(req->private_bio); 277 + 278 + /* remove the request from the conflict detection 279 + * respective block_id verification hash */ 280 + if (!hlist_unhashed(&req->colision)) 281 + hlist_del(&req->colision); 282 + else 283 + D_ASSERT((s & RQ_NET_MASK) == 0); 284 + 285 + /* for writes we need to do some extra housekeeping */ 286 + if (rw == WRITE) 287 + _about_to_complete_local_write(mdev, req); 288 + 289 + /* Update disk stats */ 290 + _drbd_end_io_acct(mdev, req); 291 + 292 + m->error = ok ? 0 : (error ?: -EIO); 293 + m->bio = req->master_bio; 294 + req->master_bio = NULL; 295 + } 296 + 297 + if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { 298 + /* this is a disconnected (local only) operation, 299 + * or protocol C P_WRITE_ACK, 300 + * or protocol A or B P_BARRIER_ACK, 301 + * or killed from the transfer log due to connection loss. */ 302 + _req_is_done(mdev, req, rw); 303 + } 304 + /* else: network part and not DONE yet. that is 305 + * protocol A or B, barrier ack still pending... */ 306 + } 307 + 308 + /* 309 + * checks whether there was an overlapping request 310 + * or ee already registered. 311 + * 312 + * if so, return 1, in which case this request is completed on the spot, 313 + * without ever being submitted or sent. 314 + * 315 + * return 0 if it is ok to submit this request. 316 + * 317 + * NOTE: 318 + * paranoia: assume something above us is broken, and issues different write 319 + * requests for the same block simultaneously... 320 + * 321 + * To ensure these won't be reordered differently on both nodes, resulting in 322 + * diverging data sets, we discard the later one(s). Not that this is supposed 323 + * to happen, but this is the rationale why we also have to check for 324 + * conflicting requests with local origin, and why we have to do so regardless 325 + * of whether we allowed multiple primaries. 326 + * 327 + * BTW, in case we only have one primary, the ee_hash is empty anyways, and the 328 + * second hlist_for_each_entry becomes a noop. This is even simpler than to 329 + * grab a reference on the net_conf, and check for the two_primaries flag... 
330 + */ 331 + static int _req_conflicts(struct drbd_request *req) 332 + { 333 + struct drbd_conf *mdev = req->mdev; 334 + const sector_t sector = req->sector; 335 + const int size = req->size; 336 + struct drbd_request *i; 337 + struct drbd_epoch_entry *e; 338 + struct hlist_node *n; 339 + struct hlist_head *slot; 340 + 341 + D_ASSERT(hlist_unhashed(&req->colision)); 342 + 343 + if (!get_net_conf(mdev)) 344 + return 0; 345 + 346 + /* BUG_ON */ 347 + ERR_IF (mdev->tl_hash_s == 0) 348 + goto out_no_conflict; 349 + BUG_ON(mdev->tl_hash == NULL); 350 + 351 + #define OVERLAPS overlaps(i->sector, i->size, sector, size) 352 + slot = tl_hash_slot(mdev, sector); 353 + hlist_for_each_entry(i, n, slot, colision) { 354 + if (OVERLAPS) { 355 + dev_alert(DEV, "%s[%u] Concurrent local write detected! " 356 + "[DISCARD L] new: %llus +%u; " 357 + "pending: %llus +%u\n", 358 + current->comm, current->pid, 359 + (unsigned long long)sector, size, 360 + (unsigned long long)i->sector, i->size); 361 + goto out_conflict; 362 + } 363 + } 364 + 365 + if (mdev->ee_hash_s) { 366 + /* now, check for overlapping requests with remote origin */ 367 + BUG_ON(mdev->ee_hash == NULL); 368 + #undef OVERLAPS 369 + #define OVERLAPS overlaps(e->sector, e->size, sector, size) 370 + slot = ee_hash_slot(mdev, sector); 371 + hlist_for_each_entry(e, n, slot, colision) { 372 + if (OVERLAPS) { 373 + dev_alert(DEV, "%s[%u] Concurrent remote write detected!" 374 + " [DISCARD L] new: %llus +%u; " 375 + "pending: %llus +%u\n", 376 + current->comm, current->pid, 377 + (unsigned long long)sector, size, 378 + (unsigned long long)e->sector, e->size); 379 + goto out_conflict; 380 + } 381 + } 382 + } 383 + #undef OVERLAPS 384 + 385 + out_no_conflict: 386 + /* this is like it should be, and what we expected. 387 + * our users do behave after all... */ 388 + put_net_conf(mdev); 389 + return 0; 390 + 391 + out_conflict: 392 + put_net_conf(mdev); 393 + return 1; 394 + } 395 + 396 + /* obviously this could be coded as many single functions 397 + * instead of one huge switch, 398 + * or by putting the code directly in the respective locations 399 + * (as it has been before). 400 + * 401 + * but having it this way 402 + * enforces that it is all in this one place, where it is easier to audit, 403 + * it makes it obvious that whatever "event" "happens" to a request should 404 + * happen "atomically" within the req_lock, 405 + * and it enforces that we have to think in a very structured manner 406 + * about the "events" that may happen to a request during its life time ... 407 + */ 408 + void __req_mod(struct drbd_request *req, enum drbd_req_event what, 409 + struct bio_and_error *m) 410 + { 411 + struct drbd_conf *mdev = req->mdev; 412 + m->bio = NULL; 413 + 414 + switch (what) { 415 + default: 416 + dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__); 417 + break; 418 + 419 + /* does not happen... 
420 + * initialization done in drbd_req_new 421 + case created: 422 + break; 423 + */ 424 + 425 + case to_be_send: /* via network */ 426 + /* reached via drbd_make_request_common 427 + * and from w_read_retry_remote */ 428 + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); 429 + req->rq_state |= RQ_NET_PENDING; 430 + inc_ap_pending(mdev); 431 + break; 432 + 433 + case to_be_submitted: /* locally */ 434 + /* reached via drbd_make_request_common */ 435 + D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); 436 + req->rq_state |= RQ_LOCAL_PENDING; 437 + break; 438 + 439 + case completed_ok: 440 + if (bio_data_dir(req->master_bio) == WRITE) 441 + mdev->writ_cnt += req->size>>9; 442 + else 443 + mdev->read_cnt += req->size>>9; 444 + 445 + req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); 446 + req->rq_state &= ~RQ_LOCAL_PENDING; 447 + 448 + _req_may_be_done(req, m); 449 + put_ldev(mdev); 450 + break; 451 + 452 + case write_completed_with_error: 453 + req->rq_state |= RQ_LOCAL_COMPLETED; 454 + req->rq_state &= ~RQ_LOCAL_PENDING; 455 + 456 + dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n", 457 + (unsigned long long)req->sector, req->size); 458 + /* and now: check how to handle local io error. */ 459 + __drbd_chk_io_error(mdev, FALSE); 460 + _req_may_be_done(req, m); 461 + put_ldev(mdev); 462 + break; 463 + 464 + case read_ahead_completed_with_error: 465 + /* it is legal to fail READA */ 466 + req->rq_state |= RQ_LOCAL_COMPLETED; 467 + req->rq_state &= ~RQ_LOCAL_PENDING; 468 + _req_may_be_done(req, m); 469 + put_ldev(mdev); 470 + break; 471 + 472 + case read_completed_with_error: 473 + drbd_set_out_of_sync(mdev, req->sector, req->size); 474 + 475 + req->rq_state |= RQ_LOCAL_COMPLETED; 476 + req->rq_state &= ~RQ_LOCAL_PENDING; 477 + 478 + dev_alert(DEV, "Local READ failed sec=%llus size=%u\n", 479 + (unsigned long long)req->sector, req->size); 480 + /* _req_mod(req,to_be_send); oops, recursion... */ 481 + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); 482 + req->rq_state |= RQ_NET_PENDING; 483 + inc_ap_pending(mdev); 484 + 485 + __drbd_chk_io_error(mdev, FALSE); 486 + put_ldev(mdev); 487 + /* NOTE: if we have no connection, 488 + * or know the peer has no good data either, 489 + * then we don't actually need to "queue_for_net_read", 490 + * but we do so anyways, since the drbd_io_error() 491 + * and the potential state change to "Diskless" 492 + * needs to be done from process context */ 493 + 494 + /* fall through: _req_mod(req,queue_for_net_read); */ 495 + 496 + case queue_for_net_read: 497 + /* READ or READA, and 498 + * no local disk, 499 + * or target area marked as invalid, 500 + * or just got an io-error. */ 501 + /* from drbd_make_request_common 502 + * or from bio_endio during read io-error recovery */ 503 + 504 + /* so we can verify the handle in the answer packet 505 + * corresponding hlist_del is in _req_may_be_done() */ 506 + hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); 507 + 508 + set_bit(UNPLUG_REMOTE, &mdev->flags); 509 + 510 + D_ASSERT(req->rq_state & RQ_NET_PENDING); 511 + req->rq_state |= RQ_NET_QUEUED; 512 + req->w.cb = (req->rq_state & RQ_LOCAL_MASK) 513 + ? w_read_retry_remote 514 + : w_send_read_req; 515 + drbd_queue_work(&mdev->data.work, &req->w); 516 + break; 517 + 518 + case queue_for_net_write: 519 + /* assert something? 
*/ 520 + /* from drbd_make_request_common only */ 521 + 522 + hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); 523 + /* corresponding hlist_del is in _req_may_be_done() */ 524 + 525 + /* NOTE 526 + * In case the req ended up on the transfer log before being 527 + * queued on the worker, it could lead to this request being 528 + * missed during cleanup after connection loss. 529 + * So we have to do both operations here, 530 + * within the same lock that protects the transfer log. 531 + * 532 + * _req_add_to_epoch(req); this has to be after the 533 + * _maybe_start_new_epoch(req); which happened in 534 + * drbd_make_request_common, because we now may set the bit 535 + * again ourselves to close the current epoch. 536 + * 537 + * Add req to the (now) current epoch (barrier). */ 538 + 539 + /* otherwise we may lose an unplug, which may cause some remote 540 + * io-scheduler timeout to expire, increasing maximum latency, 541 + * hurting performance. */ 542 + set_bit(UNPLUG_REMOTE, &mdev->flags); 543 + 544 + /* see drbd_make_request_common, 545 + * just after it grabs the req_lock */ 546 + D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); 547 + 548 + req->epoch = mdev->newest_tle->br_number; 549 + list_add_tail(&req->tl_requests, 550 + &mdev->newest_tle->requests); 551 + 552 + /* increment size of current epoch */ 553 + mdev->newest_tle->n_req++; 554 + 555 + /* queue work item to send data */ 556 + D_ASSERT(req->rq_state & RQ_NET_PENDING); 557 + req->rq_state |= RQ_NET_QUEUED; 558 + req->w.cb = w_send_dblock; 559 + drbd_queue_work(&mdev->data.work, &req->w); 560 + 561 + /* close the epoch, in case it outgrew the limit */ 562 + if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size) 563 + queue_barrier(mdev); 564 + 565 + break; 566 + 567 + case send_canceled: 568 + /* treat it the same */ 569 + case send_failed: 570 + /* real cleanup will be done from tl_clear. just update flags 571 + * so it is no longer marked as on the worker queue */ 572 + req->rq_state &= ~RQ_NET_QUEUED; 573 + /* if we did it right, tl_clear should be scheduled only after 574 + * this, so this should not be necessary! */ 575 + _req_may_be_done(req, m); 576 + break; 577 + 578 + case handed_over_to_network: 579 + /* assert something? */ 580 + if (bio_data_dir(req->master_bio) == WRITE && 581 + mdev->net_conf->wire_protocol == DRBD_PROT_A) { 582 + /* this is what is dangerous about protocol A: 583 + * pretend it was successfully written on the peer. */ 584 + if (req->rq_state & RQ_NET_PENDING) { 585 + dec_ap_pending(mdev); 586 + req->rq_state &= ~RQ_NET_PENDING; 587 + req->rq_state |= RQ_NET_OK; 588 + } /* else: neg-ack was faster... */ 589 + /* it is still not yet RQ_NET_DONE until the 590 + * corresponding epoch barrier got acked as well, 591 + * so we know what to dirty on connection loss */ 592 + } 593 + req->rq_state &= ~RQ_NET_QUEUED; 594 + req->rq_state |= RQ_NET_SENT; 595 + /* because _drbd_send_zc_bio could sleep, and may want to 596 + * dereference the bio even after the "write_acked_by_peer" and 597 + * "completed_ok" events came in, once we return from 598 + * _drbd_send_zc_bio (drbd_send_dblock), we have to check 599 + * whether it is done already, and end it. */ 600 + _req_may_be_done(req, m); 601 + break; 602 + 603 + case connection_lost_while_pending: 604 + /* transfer log cleanup after connection loss */ 605 + /* assert something? 
*/ 606 + if (req->rq_state & RQ_NET_PENDING) 607 + dec_ap_pending(mdev); 608 + req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); 609 + req->rq_state |= RQ_NET_DONE; 610 + /* if it is still queued, we may not complete it here. 611 + * it will be canceled soon. */ 612 + if (!(req->rq_state & RQ_NET_QUEUED)) 613 + _req_may_be_done(req, m); 614 + break; 615 + 616 + case write_acked_by_peer_and_sis: 617 + req->rq_state |= RQ_NET_SIS; 618 + case conflict_discarded_by_peer: 619 + /* for discarded conflicting writes of multiple primaries, 620 + * there is no need to keep anything in the tl, potential 621 + * node crashes are covered by the activity log. */ 622 + if (what == conflict_discarded_by_peer) 623 + dev_alert(DEV, "Got DiscardAck packet %llus +%u!" 624 + " DRBD is not a random data generator!\n", 625 + (unsigned long long)req->sector, req->size); 626 + req->rq_state |= RQ_NET_DONE; 627 + /* fall through */ 628 + case write_acked_by_peer: 629 + /* protocol C; successfully written on peer. 630 + * Nothing to do here. 631 + * We want to keep the tl in place for all protocols, to cater 632 + * for volatile write-back caches on lower level devices. 633 + * 634 + * A barrier request is expected to have forced all prior 635 + * requests onto stable storage, so completion of a barrier 636 + * request could set NET_DONE right here, and not wait for the 637 + * P_BARRIER_ACK, but that is an unnecessary optimization. */ 638 + 639 + /* this makes it effectively the same as for: */ 640 + case recv_acked_by_peer: 641 + /* protocol B; pretends to be successfully written on peer. 642 + * see also notes above in handed_over_to_network about 643 + * protocol != C */ 644 + req->rq_state |= RQ_NET_OK; 645 + D_ASSERT(req->rq_state & RQ_NET_PENDING); 646 + dec_ap_pending(mdev); 647 + req->rq_state &= ~RQ_NET_PENDING; 648 + _req_may_be_done(req, m); 649 + break; 650 + 651 + case neg_acked: 652 + /* assert something? */ 653 + if (req->rq_state & RQ_NET_PENDING) 654 + dec_ap_pending(mdev); 655 + req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); 656 + 657 + req->rq_state |= RQ_NET_DONE; 658 + _req_may_be_done(req, m); 659 + /* else: done by handed_over_to_network */ 660 + break; 661 + 662 + case barrier_acked: 663 + if (req->rq_state & RQ_NET_PENDING) { 664 + /* barrier came in before all requests have been acked. 665 + * this is bad, because if the connection is lost now, 666 + * we won't be able to clean them up... */ 667 + dev_err(DEV, "FIXME (barrier_acked but pending)\n"); 668 + list_move(&req->tl_requests, &mdev->out_of_sequence_requests); 669 + } 670 + D_ASSERT(req->rq_state & RQ_NET_SENT); 671 + req->rq_state |= RQ_NET_DONE; 672 + _req_may_be_done(req, m); 673 + break; 674 + 675 + case data_received: 676 + D_ASSERT(req->rq_state & RQ_NET_PENDING); 677 + dec_ap_pending(mdev); 678 + req->rq_state &= ~RQ_NET_PENDING; 679 + req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); 680 + _req_may_be_done(req, m); 681 + break; 682 + }; 683 + } 684 + 685 + /* we may do a local read if: 686 + * - we are consistent (of course), 687 + * - or we are generally inconsistent, 688 + * BUT we are still/already IN SYNC for this area. 689 + * since size may be bigger than BM_BLOCK_SIZE, 690 + * we may need to check several bits. 
691 + */ 692 + static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) 693 + { 694 + unsigned long sbnr, ebnr; 695 + sector_t esector, nr_sectors; 696 + 697 + if (mdev->state.disk == D_UP_TO_DATE) 698 + return 1; 699 + if (mdev->state.disk >= D_OUTDATED) 700 + return 0; 701 + if (mdev->state.disk < D_INCONSISTENT) 702 + return 0; 703 + /* state.disk == D_INCONSISTENT We will have a look at the BitMap */ 704 + nr_sectors = drbd_get_capacity(mdev->this_bdev); 705 + esector = sector + (size >> 9) - 1; 706 + 707 + D_ASSERT(sector < nr_sectors); 708 + D_ASSERT(esector < nr_sectors); 709 + 710 + sbnr = BM_SECT_TO_BIT(sector); 711 + ebnr = BM_SECT_TO_BIT(esector); 712 + 713 + return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); 714 + } 715 + 716 + static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) 717 + { 718 + const int rw = bio_rw(bio); 719 + const int size = bio->bi_size; 720 + const sector_t sector = bio->bi_sector; 721 + struct drbd_tl_epoch *b = NULL; 722 + struct drbd_request *req; 723 + int local, remote; 724 + int err = -EIO; 725 + 726 + /* allocate outside of all locks; */ 727 + req = drbd_req_new(mdev, bio); 728 + if (!req) { 729 + dec_ap_bio(mdev); 730 + /* only pass the error to the upper layers. 731 + * if user cannot handle io errors, that's not our business. */ 732 + dev_err(DEV, "could not kmalloc() req\n"); 733 + bio_endio(bio, -ENOMEM); 734 + return 0; 735 + } 736 + 737 + local = get_ldev(mdev); 738 + if (!local) { 739 + bio_put(req->private_bio); /* or we get a bio leak */ 740 + req->private_bio = NULL; 741 + } 742 + if (rw == WRITE) { 743 + remote = 1; 744 + } else { 745 + /* READ || READA */ 746 + if (local) { 747 + if (!drbd_may_do_local_read(mdev, sector, size)) { 748 + /* we could kick the syncer to 749 + * sync this extent asap, wait for 750 + * it, then continue locally. 751 + * Or just issue the request remotely. 752 + */ 753 + local = 0; 754 + bio_put(req->private_bio); 755 + req->private_bio = NULL; 756 + put_ldev(mdev); 757 + } 758 + } 759 + remote = !local && mdev->state.pdsk >= D_UP_TO_DATE; 760 + } 761 + 762 + /* If we have a disk, but a READA request is mapped to remote, 763 + * we are R_PRIMARY, D_INCONSISTENT, SyncTarget. 764 + * Just fail that READA request right here. 765 + * 766 + * THINK: maybe fail all READA when not local? 767 + * or make this configurable... 768 + * if network is slow, READA won't do any good. 769 + */ 770 + if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) { 771 + err = -EWOULDBLOCK; 772 + goto fail_and_free_req; 773 + } 774 + 775 + /* For WRITES going to the local disk, grab a reference on the target 776 + * extent. This waits for any resync activity in the corresponding 777 + * resync extent to finish, and, if necessary, pulls in the target 778 + * extent into the activity log, which involves further disk io because 779 + * of transactional on-disk meta data updates. */ 780 + if (rw == WRITE && local) 781 + drbd_al_begin_io(mdev, sector); 782 + 783 + remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || 784 + (mdev->state.pdsk == D_INCONSISTENT && 785 + mdev->state.conn >= C_CONNECTED)); 786 + 787 + if (!(local || remote)) { 788 + dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); 789 + goto fail_free_complete; 790 + } 791 + 792 + /* For WRITE request, we have to make sure that we have an 793 + * unused_spare_tle, in case we need to start a new epoch. 
794 + * I try to be smart and avoid always pre-allocating "just in case", 795 + * but there is a race between testing the bit and pointer outside the 796 + * spinlock, and grabbing the spinlock. 797 + * if we lost that race, we retry. */ 798 + if (rw == WRITE && remote && 799 + mdev->unused_spare_tle == NULL && 800 + test_bit(CREATE_BARRIER, &mdev->flags)) { 801 + allocate_barrier: 802 + b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO); 803 + if (!b) { 804 + dev_err(DEV, "Failed to alloc barrier.\n"); 805 + err = -ENOMEM; 806 + goto fail_free_complete; 807 + } 808 + } 809 + 810 + /* GOOD, everything prepared, grab the spin_lock */ 811 + spin_lock_irq(&mdev->req_lock); 812 + 813 + if (remote) { 814 + remote = (mdev->state.pdsk == D_UP_TO_DATE || 815 + (mdev->state.pdsk == D_INCONSISTENT && 816 + mdev->state.conn >= C_CONNECTED)); 817 + if (!remote) 818 + dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); 819 + if (!(local || remote)) { 820 + dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); 821 + spin_unlock_irq(&mdev->req_lock); 822 + goto fail_free_complete; 823 + } 824 + } 825 + 826 + if (b && mdev->unused_spare_tle == NULL) { 827 + mdev->unused_spare_tle = b; 828 + b = NULL; 829 + } 830 + if (rw == WRITE && remote && 831 + mdev->unused_spare_tle == NULL && 832 + test_bit(CREATE_BARRIER, &mdev->flags)) { 833 + /* someone closed the current epoch 834 + * while we were grabbing the spinlock */ 835 + spin_unlock_irq(&mdev->req_lock); 836 + goto allocate_barrier; 837 + } 838 + 839 + 840 + /* Update disk stats */ 841 + _drbd_start_io_acct(mdev, req, bio); 842 + 843 + /* _maybe_start_new_epoch(mdev); 844 + * If we need to generate a write barrier packet, we have to add the 845 + * new epoch (barrier) object, and queue the barrier packet for sending, 846 + * and queue the req's data after it _within the same lock_, otherwise 847 + * we have race conditions where the reorder domains could be mixed up. 848 + * 849 + * Even read requests may start a new epoch and queue the corresponding 850 + * barrier packet. To get the write ordering right, we only have to 851 + * make sure that, if this is a write request and it triggered a 852 + * barrier packet, this request is queued within the same spinlock. */ 853 + if (remote && mdev->unused_spare_tle && 854 + test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { 855 + _tl_add_barrier(mdev, mdev->unused_spare_tle); 856 + mdev->unused_spare_tle = NULL; 857 + } else { 858 + D_ASSERT(!(remote && rw == WRITE && 859 + test_bit(CREATE_BARRIER, &mdev->flags))); 860 + } 861 + 862 + /* NOTE 863 + * Actually, 'local' may be wrong here already, since we may have failed 864 + * to write to the meta data, and may become wrong anytime because of 865 + * local io-error for some other request, which would lead to us 866 + * "detaching" the local disk. 867 + * 868 + * 'remote' may become wrong any time because the network could fail. 869 + * 870 + * This is a harmless race condition, though, since it is handled 871 + * correctly at the appropriate places; so it just defers the failure 872 + * of the respective operation. 873 + */ 874 + 875 + /* mark them early for readability. 876 + * this just sets some state flags. */ 877 + if (remote) 878 + _req_mod(req, to_be_send); 879 + if (local) 880 + _req_mod(req, to_be_submitted); 881 + 882 + /* check this request on the collision detection hash tables. 883 + * if we have a conflict, just complete it here. 884 + * THINK do we want to check reads, too? (I don't think so...) 
*/ 885 + if (rw == WRITE && _req_conflicts(req)) { 886 + /* this is a conflicting request. 887 + * even though it may have been only _partially_ 888 + * overlapping with one of the currently pending requests, 889 + * without even submitting or sending it, we will 890 + * pretend that it was successfully served right now. 891 + */ 892 + if (local) { 893 + bio_put(req->private_bio); 894 + req->private_bio = NULL; 895 + drbd_al_complete_io(mdev, req->sector); 896 + put_ldev(mdev); 897 + local = 0; 898 + } 899 + if (remote) 900 + dec_ap_pending(mdev); 901 + _drbd_end_io_acct(mdev, req); 902 + /* THINK: do we want to fail it (-EIO), or pretend success? */ 903 + bio_endio(req->master_bio, 0); 904 + req->master_bio = NULL; 905 + dec_ap_bio(mdev); 906 + drbd_req_free(req); 907 + remote = 0; 908 + } 909 + 910 + /* NOTE remote first: to get the concurrent write detection right, 911 + * we must register the request before start of local IO. */ 912 + if (remote) { 913 + /* either WRITE and C_CONNECTED, 914 + * or READ, and no local disk, 915 + * or READ, but not in sync. 916 + */ 917 + _req_mod(req, (rw == WRITE) 918 + ? queue_for_net_write 919 + : queue_for_net_read); 920 + } 921 + spin_unlock_irq(&mdev->req_lock); 922 + kfree(b); /* if someone else has beaten us to it... */ 923 + 924 + if (local) { 925 + req->private_bio->bi_bdev = mdev->ldev->backing_bdev; 926 + 927 + if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR 928 + : rw == READ ? DRBD_FAULT_DT_RD 929 + : DRBD_FAULT_DT_RA)) 930 + bio_endio(req->private_bio, -EIO); 931 + else 932 + generic_make_request(req->private_bio); 933 + } 934 + 935 + /* we need to plug ALWAYS since we possibly need to kick lo_dev. 936 + * we plug after submit, so we won't miss an unplug event */ 937 + drbd_plug_device(mdev); 938 + 939 + return 0; 940 + 941 + fail_free_complete: 942 + if (rw == WRITE && local) 943 + drbd_al_complete_io(mdev, sector); 944 + fail_and_free_req: 945 + if (local) { 946 + bio_put(req->private_bio); 947 + req->private_bio = NULL; 948 + put_ldev(mdev); 949 + } 950 + bio_endio(bio, err); 951 + drbd_req_free(req); 952 + dec_ap_bio(mdev); 953 + kfree(b); 954 + 955 + return 0; 956 + } 957 + 958 + /* helper function for drbd_make_request 959 + * if we can determine just by the mdev (state) that this request will fail, 960 + * return 1 961 + * otherwise return 0 962 + */ 963 + static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) 964 + { 965 + /* Unconfigured */ 966 + if (mdev->state.conn == C_DISCONNECTING && 967 + mdev->state.disk == D_DISKLESS) 968 + return 1; 969 + 970 + if (mdev->state.role != R_PRIMARY && 971 + (!allow_oos || is_write)) { 972 + if (__ratelimit(&drbd_ratelimit_state)) { 973 + dev_err(DEV, "Process %s[%u] tried to %s; " 974 + "since we are not in Primary state, " 975 + "we cannot allow this\n", 976 + current->comm, current->pid, 977 + is_write ? "WRITE" : "READ"); 978 + } 979 + return 1; 980 + } 981 + 982 + /* 983 + * Paranoia: we might have been primary, but sync target, or 984 + * even diskless, then lost the connection. 985 + * This should have been handled (panic? suspend?) somewhere 986 + * else. But maybe it was not, so check again here. 987 + * Caution: as long as we do not have a read/write lock on mdev, 988 + * to serialize state changes, this is racy, since we may lose 989 + * the connection *after* we test for the cstate. 
990 + */ 991 + if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) { 992 + if (__ratelimit(&drbd_ratelimit_state)) 993 + dev_err(DEV, "Sorry, I have no access to good data anymore.\n"); 994 + return 1; 995 + } 996 + 997 + return 0; 998 + } 999 + 1000 + int drbd_make_request_26(struct request_queue *q, struct bio *bio) 1001 + { 1002 + unsigned int s_enr, e_enr; 1003 + struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; 1004 + 1005 + if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { 1006 + bio_endio(bio, -EPERM); 1007 + return 0; 1008 + } 1009 + 1010 + /* Reject barrier requests if we know the underlying device does 1011 + * not support them. 1012 + * XXX: Need to get this info from peer as well somehow so we 1013 + * XXX: reject if EITHER side/data/metadata area does not support them. 1014 + * 1015 + * because of those XXX, this is not yet enabled, 1016 + * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit. 1017 + */ 1018 + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) { 1019 + /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */ 1020 + bio_endio(bio, -EOPNOTSUPP); 1021 + return 0; 1022 + } 1023 + 1024 + /* 1025 + * what we "blindly" assume: 1026 + */ 1027 + D_ASSERT(bio->bi_size > 0); 1028 + D_ASSERT((bio->bi_size & 0x1ff) == 0); 1029 + D_ASSERT(bio->bi_idx == 0); 1030 + 1031 + /* to make some things easier, force alignment of requests within the 1032 + * granularity of our hash tables */ 1033 + s_enr = bio->bi_sector >> HT_SHIFT; 1034 + e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; 1035 + 1036 + if (likely(s_enr == e_enr)) { 1037 + inc_ap_bio(mdev, 1); 1038 + return drbd_make_request_common(mdev, bio); 1039 + } 1040 + 1041 + /* can this bio be split generically? 1042 + * Maybe add our own split-arbitrary-bios function. */ 1043 + if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) { 1044 + /* rather error out here than BUG in bio_split */ 1045 + dev_err(DEV, "bio would need to, but cannot, be split: " 1046 + "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", 1047 + bio->bi_vcnt, bio->bi_idx, bio->bi_size, 1048 + (unsigned long long)bio->bi_sector); 1049 + bio_endio(bio, -EINVAL); 1050 + } else { 1051 + /* This bio crosses some boundary, so we have to split it. */ 1052 + struct bio_pair *bp; 1053 + /* works for the "do not cross hash slot boundaries" case 1054 + * e.g. sector 262269, size 4096 1055 + * s_enr = 262269 >> 6 = 4097 1056 + * e_enr = (262269+8-1) >> 6 = 4098 1057 + * HT_SHIFT = 6 1058 + * sps = 64, mask = 63 1059 + * first_sectors = 64 - (262269 & 63) = 3 1060 + */ 1061 + const sector_t sect = bio->bi_sector; 1062 + const int sps = 1 << HT_SHIFT; /* sectors per slot */ 1063 + const int mask = sps - 1; 1064 + const sector_t first_sectors = sps - (sect & mask); 1065 + bp = bio_split(bio, 1066 + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) 1067 + bio_split_pool, 1068 + #endif 1069 + first_sectors); 1070 + 1071 + /* we need to get a "reference count" (ap_bio_cnt) 1072 + * to avoid races with the disconnect/reconnect/suspend code. 1073 + * In case we need to split the bio here, we need to get two references 1074 + * atomically, otherwise we might deadlock when trying to submit the 1075 + * second one! 
*/ 1076 + inc_ap_bio(mdev, 2); 1077 + 1078 + D_ASSERT(e_enr == s_enr + 1); 1079 + 1080 + drbd_make_request_common(mdev, &bp->bio1); 1081 + drbd_make_request_common(mdev, &bp->bio2); 1082 + bio_pair_release(bp); 1083 + } 1084 + return 0; 1085 + } 1086 + 1087 + /* This is called by bio_add_page(). With this function we reduce 1088 + * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs 1089 + * units (was AL_EXTENTs). 1090 + * 1091 + * we do the calculation within the lower 32bit of the byte offsets, 1092 + * since we don't care for actual offset, but only check whether it 1093 + * would cross "activity log extent" boundaries. 1094 + * 1095 + * As long as the BIO is empty we have to allow at least one bvec, 1096 + * regardless of size and offset. so the resulting bio may still 1097 + * cross extent boundaries. those are dealt with (bio_split) in 1098 + * drbd_make_request_26. 1099 + */ 1100 + int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) 1101 + { 1102 + struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; 1103 + unsigned int bio_offset = 1104 + (unsigned int)bvm->bi_sector << 9; /* 32 bit */ 1105 + unsigned int bio_size = bvm->bi_size; 1106 + int limit, backing_limit; 1107 + 1108 + limit = DRBD_MAX_SEGMENT_SIZE 1109 + - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size); 1110 + if (limit < 0) 1111 + limit = 0; 1112 + if (bio_size == 0) { 1113 + if (limit <= bvec->bv_len) 1114 + limit = bvec->bv_len; 1115 + } else if (limit && get_ldev(mdev)) { 1116 + struct request_queue * const b = 1117 + mdev->ldev->backing_bdev->bd_disk->queue; 1118 + if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { 1119 + backing_limit = b->merge_bvec_fn(b, bvm, bvec); 1120 + limit = min(limit, backing_limit); 1121 + } 1122 + put_ldev(mdev); 1123 + } 1124 + return limit; 1125 + }
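The split arithmetic at the end of drbd_make_request_26() above can be checked in isolation: with HT_SHIFT = 6 there are 64 sectors per hash slot, and a bio that would cross a slot boundary is cut after sps - (sect & mask) sectors so the first fragment ends exactly at the boundary. A small standalone sketch (assuming that same HT_SHIFT value) reproducing the worked example from the code comment, a 4096-byte bio starting at sector 262269:

#include <stdio.h>

#define HT_SHIFT 6 /* value from the worked example in the comment above */

int main(void)
{
	const unsigned long long sect = 262269; /* bio start sector */
	const unsigned int size = 4096;         /* bio size in bytes = 8 sectors */

	const int sps = 1 << HT_SHIFT;          /* sectors per hash slot: 64 */
	const int mask = sps - 1;               /* 63 */

	unsigned long long s_enr = sect >> HT_SHIFT;
	unsigned long long e_enr = (sect + (size >> 9) - 1) >> HT_SHIFT;
	unsigned long long first_sectors = sps - (sect & mask);

	/* prints: s_enr=4097 e_enr=4098 first_sectors=3 */
	printf("s_enr=%llu e_enr=%llu first_sectors=%llu\n",
	       s_enr, e_enr, first_sectors);
	return 0;
}

With first_sectors = 3, bio1 covers sectors 262269..262271 (the tail of slot 4097) and bio2 starts at 262272, the first sector of slot 4098, so each fragment maps to a single tl_hash/ee_hash slot.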
+326
drivers/block/drbd/drbd_req.h
··· 1 + /* 2 + drbd_req.h 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2006-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 8 + Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>. 9 + 10 + DRBD is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + DRBD is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + */ 24 + 25 + #ifndef _DRBD_REQ_H 26 + #define _DRBD_REQ_H 27 + 28 + #include <linux/module.h> 29 + 30 + #include <linux/slab.h> 31 + #include <linux/drbd.h> 32 + #include "drbd_int.h" 33 + #include "drbd_wrappers.h" 34 + 35 + /* The request callbacks will be called in irq context by the IDE drivers, 36 + and in Softirqs/Tasklets/BH context by the SCSI drivers, 37 + and by the receiver and worker in kernel-thread context. 38 + Try to get the locking right :) */ 39 + 40 + /* 41 + * Objects of type struct drbd_request only exist on an R_PRIMARY node, and are 42 + * associated with IO requests originating from the block layer above us. 43 + * 44 + * There are quite a few things that may happen to a drbd request 45 + * during its lifetime. 46 + * 47 + * It will be created. 48 + * It will be marked with the intention to be 49 + * submitted to local disk and/or 50 + * sent via the network. 51 + * 52 + * It has to be placed on the transfer log and other housekeeping lists, 53 + * in case we have a network connection. 54 + * 55 + * It may be identified as a concurrent (write) request 56 + * and be handled accordingly. 57 + * 58 + * It may be handed over to the local disk subsystem. 59 + * It may be completed by the local disk subsystem, 60 + * either successfully or with io-error. 61 + * In case it is a READ request, and it failed locally, 62 + * it may be retried remotely. 63 + * 64 + * It may be queued for sending. 65 + * It may be handed over to the network stack, 66 + * which may fail. 67 + * It may be acknowledged by the "peer" according to the wire_protocol in use. 68 + * this may be a negative ack. 69 + * It may receive a faked ack when the network connection is lost and the 70 + * transfer log is cleaned up. 71 + * Sending may be canceled due to network connection loss. 72 + * When it finally has outlived its time, 73 + * corresponding dirty bits in the resync-bitmap may be cleared or set, 74 + * it will be destroyed, 75 + * and completion will be signalled to the originator, 76 + * with or without "success". 77 + */ 78 + 79 + enum drbd_req_event { 80 + created, 81 + to_be_send, 82 + to_be_submitted, 83 + 84 + /* XXX yes, now I am inconsistent... 85 + * these two are not "events" but "actions" 86 + * oh, well... 
*/ 87 + queue_for_net_write, 88 + queue_for_net_read, 89 + 90 + send_canceled, 91 + send_failed, 92 + handed_over_to_network, 93 + connection_lost_while_pending, 94 + recv_acked_by_peer, 95 + write_acked_by_peer, 96 + write_acked_by_peer_and_sis, /* and set_in_sync */ 97 + conflict_discarded_by_peer, 98 + neg_acked, 99 + barrier_acked, /* in protocol A and B */ 100 + data_received, /* (remote read) */ 101 + 102 + read_completed_with_error, 103 + read_ahead_completed_with_error, 104 + write_completed_with_error, 105 + completed_ok, 106 + nothing, /* for tracing only */ 107 + }; 108 + 109 + /* encoding of request states for now. we don't actually need that many bits. 110 + * we don't need to do atomic bit operations either, since most of the time we 111 + * need to look at the connection state and/or manipulate some lists at the 112 + * same time, so we should hold the request lock anyways. 113 + */ 114 + enum drbd_req_state_bits { 115 + /* 210 116 + * 000: no local possible 117 + * 001: to be submitted 118 + * UNUSED, we could map: 011: submitted, completion still pending 119 + * 110: completed ok 120 + * 010: completed with error 121 + */ 122 + __RQ_LOCAL_PENDING, 123 + __RQ_LOCAL_COMPLETED, 124 + __RQ_LOCAL_OK, 125 + 126 + /* 76543 127 + * 00000: no network possible 128 + * 00001: to be send 129 + * 00011: to be send, on worker queue 130 + * 00101: sent, expecting recv_ack (B) or write_ack (C) 131 + * 11101: sent, 132 + * recv_ack (B) or implicit "ack" (A), 133 + * still waiting for the barrier ack. 134 + * master_bio may already be completed and invalidated. 135 + * 11100: write_acked (C), 136 + * data_received (for remote read, any protocol) 137 + * or finally the barrier ack has arrived (B,A)... 138 + * request can be freed 139 + * 01100: neg-acked (write, protocol C) 140 + * or neg-d-acked (read, any protocol) 141 + * or killed from the transfer log 142 + * during cleanup after connection loss 143 + * request can be freed 144 + * 01000: canceled or send failed... 145 + * request can be freed 146 + */ 147 + 148 + /* if "SENT" is not set yet, this can still fail or be canceled. 149 + * if "SENT" is set already, we still wait for an Ack packet. 150 + * when cleared, the master_bio may be completed. 151 + * in (B,A) the request object may still linger on the transfer log 152 + * until the corresponding barrier ack comes in */ 153 + __RQ_NET_PENDING, 154 + 155 + /* If it is QUEUED, and it is a WRITE, it is also registered in the 156 + * transfer log. Currently we need this flag to avoid conflicts between 157 + * worker canceling the request and tl_clear_barrier killing it from 158 + * transfer log. We should restructure the code so this conflict 159 + * no longer occurs. */ 160 + __RQ_NET_QUEUED, 161 + 162 + /* well, actually only "handed over to the network stack". 163 + * 164 + * TODO can potentially be dropped because of the similar meaning 165 + * of RQ_NET_SENT and ~RQ_NET_QUEUED. 166 + * however it is not exactly the same. before we drop it 167 + * we must ensure that we can tell a request with network part 168 + * from a request without, regardless of what happens to it. */ 169 + __RQ_NET_SENT, 170 + 171 + /* when set, the request may be freed (if RQ_NET_QUEUED is clear). 172 + * basically this means the corresponding P_BARRIER_ACK was received */ 173 + __RQ_NET_DONE, 174 + 175 + /* whether or not we know (C) or pretend (B,A) that the write 176 + * was successfully written on the peer. 
177 + */ 178 + __RQ_NET_OK, 179 + 180 + /* peer called drbd_set_in_sync() for this write */ 181 + __RQ_NET_SIS, 182 + 183 + /* keep this last, it's for the RQ_NET_MASK */ 184 + __RQ_NET_MAX, 185 + }; 186 + 187 + #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) 188 + #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) 189 + #define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) 190 + 191 + #define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ 192 + 193 + #define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) 194 + #define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) 195 + #define RQ_NET_SENT (1UL << __RQ_NET_SENT) 196 + #define RQ_NET_DONE (1UL << __RQ_NET_DONE) 197 + #define RQ_NET_OK (1UL << __RQ_NET_OK) 198 + #define RQ_NET_SIS (1UL << __RQ_NET_SIS) 199 + 200 + /* 0x1f8 */ 201 + #define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) 202 + 203 + /* epoch entries */ 204 + static inline 205 + struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) 206 + { 207 + BUG_ON(mdev->ee_hash_s == 0); 208 + return mdev->ee_hash + 209 + ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); 210 + } 211 + 212 + /* transfer log (drbd_request objects) */ 213 + static inline 214 + struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector) 215 + { 216 + BUG_ON(mdev->tl_hash_s == 0); 217 + return mdev->tl_hash + 218 + ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); 219 + } 220 + 221 + /* application reads (drbd_request objects) */ 222 + static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector) 223 + { 224 + return mdev->app_reads_hash 225 + + ((unsigned int)(sector) % APP_R_HSIZE); 226 + } 227 + 228 + /* when we receive the answer for a read request, 229 + * verify that we actually know about it */ 230 + static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, 231 + u64 id, sector_t sector) 232 + { 233 + struct hlist_head *slot = ar_hash_slot(mdev, sector); 234 + struct hlist_node *n; 235 + struct drbd_request *req; 236 + 237 + hlist_for_each_entry(req, n, slot, colision) { 238 + if ((unsigned long)req == (unsigned long)id) { 239 + D_ASSERT(req->sector == sector); 240 + return req; 241 + } 242 + } 243 + return NULL; 244 + } 245 + 246 + static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, 247 + struct bio *bio_src) 248 + { 249 + struct bio *bio; 250 + struct drbd_request *req = 251 + mempool_alloc(drbd_request_mempool, GFP_NOIO); 252 + if (likely(req)) { 253 + bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ 254 + 255 + req->rq_state = 0; 256 + req->mdev = mdev; 257 + req->master_bio = bio_src; 258 + req->private_bio = bio; 259 + req->epoch = 0; 260 + req->sector = bio->bi_sector; 261 + req->size = bio->bi_size; 262 + req->start_time = jiffies; 263 + INIT_HLIST_NODE(&req->colision); 264 + INIT_LIST_HEAD(&req->tl_requests); 265 + INIT_LIST_HEAD(&req->w.list); 266 + 267 + bio->bi_private = req; 268 + bio->bi_end_io = drbd_endio_pri; 269 + bio->bi_next = NULL; 270 + } 271 + return req; 272 + } 273 + 274 + static inline void drbd_req_free(struct drbd_request *req) 275 + { 276 + mempool_free(req, drbd_request_mempool); 277 + } 278 + 279 + static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) 280 + { 281 + return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); 282 + } 283 + 284 + /* Short-lived temporary struct on the stack. 285 + * We could squirrel the error to be returned into 286 + * bio->bi_size, or similar. But that would be too ugly. 
*/ 287 + struct bio_and_error { 288 + struct bio *bio; 289 + int error; 290 + }; 291 + 292 + extern void _req_may_be_done(struct drbd_request *req, 293 + struct bio_and_error *m); 294 + extern void __req_mod(struct drbd_request *req, enum drbd_req_event what, 295 + struct bio_and_error *m); 296 + extern void complete_master_bio(struct drbd_conf *mdev, 297 + struct bio_and_error *m); 298 + 299 + /* use this if you don't want to deal with calling complete_master_bio() 300 + * outside the spinlock, e.g. when walking some list on cleanup. */ 301 + static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what) 302 + { 303 + struct drbd_conf *mdev = req->mdev; 304 + struct bio_and_error m; 305 + 306 + /* __req_mod possibly frees req, do not touch req after that! */ 307 + __req_mod(req, what, &m); 308 + if (m.bio) 309 + complete_master_bio(mdev, &m); 310 + } 311 + 312 + /* completion of master bio is outside of spinlock. 313 + * If you need it irqsave, do it yourself! */ 314 + static inline void req_mod(struct drbd_request *req, 315 + enum drbd_req_event what) 316 + { 317 + struct drbd_conf *mdev = req->mdev; 318 + struct bio_and_error m; 319 + spin_lock_irq(&mdev->req_lock); 320 + __req_mod(req, what, &m); 321 + spin_unlock_irq(&mdev->req_lock); 322 + 323 + if (m.bio) 324 + complete_master_bio(mdev, &m); 325 + } 326 + #endif
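The mask arithmetic above is worth a numeric sanity check; this small userspace sketch re-derives both values from the bit positions defined in the enum (local bits 0..2, network bits 3..8):

#include <stdio.h>

/* bit positions as in enum drbd_req_state_bits above */
enum { LOCAL_PENDING, LOCAL_COMPLETED, LOCAL_OK,
       NET_PENDING, NET_QUEUED, NET_SENT, NET_DONE, NET_OK, NET_SIS, NET_MAX };

int main(void)
{
	unsigned long local_ok = 1UL << LOCAL_OK;
	unsigned long local_mask = (local_ok << 1) - 1;                /* 0x007 */
	unsigned long net_mask = ((1UL << NET_MAX) - 1) & ~local_mask; /* 0x1f8 */

	/* (rq_state & net_mask) isolates the network half of the state word
	 * without atomic bit ops, which is why the request lock suffices */
	printf("local %#lx net %#lx\n", local_mask, net_mask);
	return 0;
}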
+113
drivers/block/drbd/drbd_strings.c
··· 1 + /* 2 + drbd_strings.c 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + 24 + */ 25 + 26 + #include <linux/drbd.h> 27 + 28 + static const char *drbd_conn_s_names[] = { 29 + [C_STANDALONE] = "StandAlone", 30 + [C_DISCONNECTING] = "Disconnecting", 31 + [C_UNCONNECTED] = "Unconnected", 32 + [C_TIMEOUT] = "Timeout", 33 + [C_BROKEN_PIPE] = "BrokenPipe", 34 + [C_NETWORK_FAILURE] = "NetworkFailure", 35 + [C_PROTOCOL_ERROR] = "ProtocolError", 36 + [C_WF_CONNECTION] = "WFConnection", 37 + [C_WF_REPORT_PARAMS] = "WFReportParams", 38 + [C_TEAR_DOWN] = "TearDown", 39 + [C_CONNECTED] = "Connected", 40 + [C_STARTING_SYNC_S] = "StartingSyncS", 41 + [C_STARTING_SYNC_T] = "StartingSyncT", 42 + [C_WF_BITMAP_S] = "WFBitMapS", 43 + [C_WF_BITMAP_T] = "WFBitMapT", 44 + [C_WF_SYNC_UUID] = "WFSyncUUID", 45 + [C_SYNC_SOURCE] = "SyncSource", 46 + [C_SYNC_TARGET] = "SyncTarget", 47 + [C_PAUSED_SYNC_S] = "PausedSyncS", 48 + [C_PAUSED_SYNC_T] = "PausedSyncT", 49 + [C_VERIFY_S] = "VerifyS", 50 + [C_VERIFY_T] = "VerifyT", 51 + }; 52 + 53 + static const char *drbd_role_s_names[] = { 54 + [R_PRIMARY] = "Primary", 55 + [R_SECONDARY] = "Secondary", 56 + [R_UNKNOWN] = "Unknown" 57 + }; 58 + 59 + static const char *drbd_disk_s_names[] = { 60 + [D_DISKLESS] = "Diskless", 61 + [D_ATTACHING] = "Attaching", 62 + [D_FAILED] = "Failed", 63 + [D_NEGOTIATING] = "Negotiating", 64 + [D_INCONSISTENT] = "Inconsistent", 65 + [D_OUTDATED] = "Outdated", 66 + [D_UNKNOWN] = "DUnknown", 67 + [D_CONSISTENT] = "Consistent", 68 + [D_UP_TO_DATE] = "UpToDate", 69 + }; 70 + 71 + static const char *drbd_state_sw_errors[] = { 72 + [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", 73 + [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", 74 + [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", 75 + [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", 76 + [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", 77 + [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated", 78 + [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active", 79 + [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device", 80 + [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node", 81 + [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk", 82 + [-SS_DEVICE_IN_USE] = "Device is held open by someone", 83 + [-SS_NO_NET_CONFIG] = "Have no net/connection configuration", 84 + [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify", 85 + [-SS_NEED_CONNECTION] = "Need a 
connection to start verify or resync", 86 + [-SS_NOT_SUPPORTED] = "Peer does not support protocol", 87 + [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", 88 + [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", 89 + [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", 90 + }; 91 + 92 + const char *drbd_conn_str(enum drbd_conns s) 93 + { 94 + /* enums are unsigned... */ 95 + return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s]; 96 + } 97 + 98 + const char *drbd_role_str(enum drbd_role s) 99 + { 100 + return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s]; 101 + } 102 + 103 + const char *drbd_disk_str(enum drbd_disk_state s) 104 + { 105 + return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s]; 106 + } 107 + 108 + const char *drbd_set_st_err_str(enum drbd_state_ret_codes err) 109 + { 110 + return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : 111 + err > SS_TWO_PRIMARIES ? "TOO_LARGE" 112 + : drbd_state_sw_errors[-err]; 113 + }
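Note how drbd_set_st_err_str() above indexes drbd_state_sw_errors with -err: the SS_* failure codes are negative enum values, so negating one yields a small positive array index. A standalone sketch of the pattern (the enum values here are illustrative stand-ins; the real codes live in the drbd headers):

#include <stdio.h>

/* stand-in values for illustration only */
enum ret_codes { SUCCESS = 1, ERR_TWO_PRIMARIES = -1, ERR_NO_DISK = -2, AFTER_LAST = -3 };

static const char *err_names[] = {
	[-ERR_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
	[-ERR_NO_DISK]       = "Refusing to be Primary without at least one UpToDate disk",
};

/* same bounds-checked negative-index lookup as drbd_set_st_err_str() */
static const char *err_str(enum ret_codes err)
{
	return err <= AFTER_LAST ? "TOO_SMALL" :
	       err > ERR_TWO_PRIMARIES ? "TOO_LARGE" : err_names[-err];
}

int main(void)
{
	printf("%s\n", err_str(ERR_TWO_PRIMARIES));
	return 0;
}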
+351
drivers/block/drbd/drbd_vli.h
··· 1 + /* 2 + -*- linux-c -*- 3 + drbd_vli.h 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + */ 24 + 25 + #ifndef _DRBD_VLI_H 26 + #define _DRBD_VLI_H 27 + 28 + /* 29 + * At a granularity of 4KiB storage represented per bit, 30 + * and storage sizes of several TiB, 31 + * and possibly small-bandwidth replication, 32 + * the bitmap transfer time can take much too long, 33 + * if transmitted in plain text. 34 + * 35 + * We try to reduce the transferred bitmap information 36 + * by encoding runlengths of bit polarity. 37 + * 38 + * We never actually need to encode a "zero" (runlengths are positive). 39 + * But then we have to store the value of the first bit. 40 + * The first bit of information thus shall encode if the first runlength 41 + * gives the number of set or unset bits. 42 + * 43 + * We assume that large areas are either completely set or unset, 44 + * which gives good compression with any runlength method, 45 + * even when encoding the runlength as fixed size 32bit/64bit integers. 46 + * 47 + * Still, there may be areas where the polarity flips every few bits, 48 + * and encoding the runlength sequence of those areas with fixed size 49 + * integers would be much worse than plaintext. 50 + * 51 + * We want to encode small runlength values with minimum code length, 52 + * while still being able to encode a huge run of all zeros. 53 + * 54 + * Thus we need a Variable Length Integer encoding, VLI. 55 + * 56 + * For some cases, we produce more code bits than plaintext input. 57 + * We need to send incompressible chunks as plaintext, skip over them 58 + * and then see if the next chunk compresses better. 59 + * 60 + * We don't care too much about "excellent" compression ratio for large 61 + * runlengths (all set/all clear): whether we achieve a factor of 100 62 + * or 1000 is not that much of an issue. 63 + * We do not want to waste too much on short runlengths in the "noisy" 64 + * parts of the bitmap, though. 65 + * 66 + * There are endless variants of VLI, we experimented with: 67 + * * simple byte-based 68 + * * various bit based with different code word length. 69 + * 70 + * To avoid yet another configuration parameter (choice of bitmap compression 71 + * algorithm) which was difficult to explain and tune, we just chose the one 72 + * variant that turned out best in all test cases. 
73 + * Based on real world usage patterns, with device sizes ranging from a few GiB 74 + to several TiB, file server/mailserver/webserver/mysql/postgres, 75 + mostly idle to really busy, the all time winner (though sometimes only 76 + marginally better) is: 77 + */ 78 + 79 + /* 80 + * encoding is "visualised" as 81 + * __little endian__ bitstream, least significant bit first (left most) 82 + * 83 + * this particular encoding is chosen so that the prefix code 84 + * starts as unary encoding the level, then modified so that 85 + * 10 levels can be described in 8bit, with minimal overhead 86 + * for the smaller levels. 87 + * 88 + * The number of data bits follows the Fibonacci sequence, with the exception of the 89 + * last level (+1 data bit, so it makes 64bit total). The only worse code when 90 + * encoding bit polarity runlength is 1 plain bit => 2 code bits. 91 + prefix data bits max val Nº data bits 92 + 0 x 0x2 1 93 + 10 x 0x4 1 94 + 110 xx 0x8 2 95 + 1110 xxx 0x10 3 96 + 11110 xxx xx 0x30 5 97 + 111110 xx xxxxxx 0x130 8 98 + 11111100 xxxxxxxx xxxxx 0x2130 13 99 + 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 100 + 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 101 + 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 102 + * maximum encodable value: 0x100000400202130 == 2**56 + some */ 103 + 104 + /* compression "table": 105 + transmitted x 0.29 106 + as plaintext x ........................ 107 + x ........................ 108 + x ........................ 109 + x 0.59 0.21........................ 110 + x ........................................................ 111 + x .. c ................................................... 112 + x 0.44.. o ................................................... 113 + x .......... d ................................................... 114 + x .......... e ................................................... 115 + X............. ................................................... 116 + x.............. b ................................................... 117 + 2.0x............... i ................................................... 118 + #X................ t ................................................... 119 + #................. s ........................... plain bits .......... 120 + -+----------------------------------------------------------------------- 121 + 1 16 32 64 122 + */ 123 + 124 + /* LEVEL: (total bits, prefix bits, prefix value), 125 + * sorted ascending by number of total bits. 126 + * The rest of the code table is calculated at compiletime from this. */ 127 + 128 + /* fibonacci data 1, 1, ... */ 129 + #define VLI_L_1_1() do { \ 130 + LEVEL( 2, 1, 0x00); \ 131 + LEVEL( 3, 2, 0x01); \ 132 + LEVEL( 5, 3, 0x03); \ 133 + LEVEL( 7, 4, 0x07); \ 134 + LEVEL(10, 5, 0x0f); \ 135 + LEVEL(14, 6, 0x1f); \ 136 + LEVEL(21, 8, 0x3f); \ 137 + LEVEL(29, 8, 0x7f); \ 138 + LEVEL(42, 8, 0xbf); \ 139 + LEVEL(64, 8, 0xff); \ 140 + } while (0) 141 + 142 + /* finds a suitable level to decode the least significant part of in. 143 + * returns number of bits consumed. 144 + * 145 + * BUG() for bad input, as that would mean a buggy code table. 
*/ 146 + static inline int vli_decode_bits(u64 *out, const u64 in) 147 + { 148 + u64 adj = 1; 149 + 150 + #define LEVEL(t,b,v) \ 151 + do { \ 152 + if ((in & ((1 << b) -1)) == v) { \ 153 + *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ 154 + return t; \ 155 + } \ 156 + adj += 1ULL << (t - b); \ 157 + } while (0) 158 + 159 + VLI_L_1_1(); 160 + 161 + /* NOT REACHED, if VLI_LEVELS code table is defined properly */ 162 + BUG(); 163 + #undef LEVEL 164 + } 165 + 166 + /* return number of code bits needed, 167 + * or negative error number */ 168 + static inline int __vli_encode_bits(u64 *out, const u64 in) 169 + { 170 + u64 max = 0; 171 + u64 adj = 1; 172 + 173 + if (in == 0) 174 + return -EINVAL; 175 + 176 + #define LEVEL(t,b,v) do { \ 177 + max += 1ULL << (t - b); \ 178 + if (in <= max) { \ 179 + if (out) \ 180 + *out = ((in - adj) << b) | v; \ 181 + return t; \ 182 + } \ 183 + adj = max + 1; \ 184 + } while (0) 185 + 186 + VLI_L_1_1(); 187 + 188 + return -EOVERFLOW; 189 + #undef LEVEL 190 + } 191 + 192 + #undef VLI_L_1_1 193 + 194 + /* code from here down is independent of the actually used bit code */ 195 + 196 + /* 197 + * Code length is determined by some unique (e.g. unary) prefix. 198 + * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, 199 + * not a byte stream. 200 + */ 201 + 202 + /* for the bitstream, we need a cursor */ 203 + struct bitstream_cursor { 204 + /* the current byte */ 205 + u8 *b; 206 + /* the current bit within *b, normalized: 0..7 */ 207 + unsigned int bit; 208 + }; 209 + 210 + /* initialize cursor to point to first bit of stream */ 211 + static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) 212 + { 213 + cur->b = s; 214 + cur->bit = 0; 215 + } 216 + 217 + /* advance cursor by that many bits; maximum expected input value: 64, 218 + * but depending on VLI implementation, it may be more. */ 219 + static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) 220 + { 221 + bits += cur->bit; 222 + cur->b = cur->b + (bits >> 3); 223 + cur->bit = bits & 7; 224 + } 225 + 226 + /* the bitstream itself knows its length */ 227 + struct bitstream { 228 + struct bitstream_cursor cur; 229 + unsigned char *buf; 230 + size_t buf_len; /* in bytes */ 231 + 232 + /* for input stream: 233 + * number of trailing 0 bits for padding 234 + * total number of valid bits in stream: buf_len * 8 - pad_bits */ 235 + unsigned int pad_bits; 236 + }; 237 + 238 + static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) 239 + { 240 + bs->buf = s; 241 + bs->buf_len = len; 242 + bs->pad_bits = pad_bits; 243 + bitstream_cursor_reset(&bs->cur, bs->buf); 244 + } 245 + 246 + static inline void bitstream_rewind(struct bitstream *bs) 247 + { 248 + bitstream_cursor_reset(&bs->cur, bs->buf); 249 + memset(bs->buf, 0, bs->buf_len); 250 + } 251 + 252 + /* Put (at most 64) least significant bits of val into bitstream, and advance cursor. 253 + * Ignores "pad_bits". 254 + * Returns zero if bits == 0 (nothing to do). 255 + * Returns number of bits used if successful. 256 + * 257 + * If there is not enough room left in bitstream, 258 + * leaves bitstream unchanged and returns -ENOBUFS. 
259 + */ 260 + static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) 261 + { 262 + unsigned char *b = bs->cur.b; 263 + unsigned int tmp; 264 + 265 + if (bits == 0) 266 + return 0; 267 + 268 + if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) 269 + return -ENOBUFS; 270 + 271 + /* paranoia: strip off hi bits; they should not be set anyways. */ 272 + if (bits < 64) 273 + val &= ~0ULL >> (64 - bits); 274 + 275 + *b++ |= (val & 0xff) << bs->cur.bit; 276 + 277 + for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) 278 + *b++ |= (val >> tmp) & 0xff; 279 + 280 + bitstream_cursor_advance(&bs->cur, bits); 281 + return bits; 282 + } 283 + 284 + /* Fetch (at most 64) bits from bitstream into *out, and advance cursor. 285 + * 286 + * If more than 64 bits are requested, returns -EINVAL and leaves *out unchanged. 287 + * 288 + * If there are less than the requested number of valid bits left in the 289 + * bitstream, still fetches all available bits. 290 + * 291 + * Returns number of actually fetched bits. 292 + */ 293 + static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) 294 + { 295 + u64 val; 296 + unsigned int n; 297 + 298 + if (bits > 64) 299 + return -EINVAL; 300 + 301 + if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) 302 + bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) 303 + - bs->cur.bit - bs->pad_bits; 304 + 305 + if (bits == 0) { 306 + *out = 0; 307 + return 0; 308 + } 309 + 310 + /* get the high bits */ 311 + val = 0; 312 + n = (bs->cur.bit + bits + 7) >> 3; 313 + /* n may be at most 9, if cur.bit + bits > 64 */ 314 + /* which means this copies at most 8 bytes */ 315 + if (n) { 316 + memcpy(&val, bs->cur.b+1, n - 1); 317 + val = le64_to_cpu(val) << (8 - bs->cur.bit); 318 + } 319 + 320 + /* we still need the low bits */ 321 + val |= bs->cur.b[0] >> bs->cur.bit; 322 + 323 + /* and mask out bits we don't want */ 324 + val &= ~0ULL >> (64 - bits); 325 + 326 + bitstream_cursor_advance(&bs->cur, bits); 327 + *out = val; 328 + 329 + return bits; 330 + } 331 + 332 + /* encodes @in as vli into @bs; 333 + 334 + * return values 335 + * > 0: number of bits successfully stored in bitstream 336 + * -ENOBUFS @bs is full 337 + * -EINVAL input zero (invalid) 338 + * -EOVERFLOW input too large for this vli code (invalid) 339 + */ 340 + static inline int vli_encode_bits(struct bitstream *bs, u64 in) 341 + { 342 + u64 code = code; 343 + int bits = __vli_encode_bits(&code, in); 344 + 345 + if (bits <= 0) 346 + return bits; 347 + 348 + return bitstream_put_bits(bs, code, bits); 349 + } 350 + 351 + #endif
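A worked example of the code table may help: run length 8 lands in the third level, LEVEL(5, 3, 0x03), which covers the values 5..8. The userspace sketch below mirrors one round trip through __vli_encode_bits() and vli_decode_bits():

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* level (t=5, b=3, v=0x03): adj == 5, so it encodes values 5..8 */
	uint64_t in = 8, adj = 5;
	uint64_t code = ((in - adj) << 3) | 0x03;  /* == 0x1b, binary 11011 */
	uint64_t out = ((code & 0x1f) >> 3) + adj; /* decodes back to 8 */

	/* the bitstream is emitted least significant bit first, so the
	 * prefix bits "110" go on the wire before the data bits "11" */
	printf("code %#llx decodes to %llu\n",
	       (unsigned long long)code, (unsigned long long)out);
	return 0;
}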
+1512
drivers/block/drbd/drbd_worker.c
··· 1 + /* 2 + drbd_worker.c 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + 24 + */ 25 + 26 + #include <linux/module.h> 27 + #include <linux/version.h> 28 + #include <linux/drbd.h> 29 + #include <linux/sched.h> 30 + #include <linux/smp_lock.h> 31 + #include <linux/wait.h> 32 + #include <linux/mm.h> 33 + #include <linux/memcontrol.h> 34 + #include <linux/mm_inline.h> 35 + #include <linux/slab.h> 36 + #include <linux/random.h> 37 + #include <linux/mm.h> 38 + #include <linux/string.h> 39 + #include <linux/scatterlist.h> 40 + 41 + #include "drbd_int.h" 42 + #include "drbd_req.h" 43 + 44 + #define SLEEP_TIME (HZ/10) 45 + 46 + static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); 47 + 48 + 49 + 50 + /* defined here: 51 + drbd_md_io_complete 52 + drbd_endio_write_sec 53 + drbd_endio_read_sec 54 + drbd_endio_pri 55 + 56 + * more endio handlers: 57 + atodb_endio in drbd_actlog.c 58 + drbd_bm_async_io_complete in drbd_bitmap.c 59 + 60 + * For all these callbacks, note the following: 61 + * The callbacks will be called in irq context by the IDE drivers, 62 + * and in Softirqs/Tasklets/BH context by the SCSI drivers. 63 + * Try to get the locking right :) 64 + * 65 + */ 66 + 67 + 68 + /* About the global_state_lock 69 + Each state transition on a device holds a read lock. In case we have 70 + to evaluate the sync after dependencies, we grab a write lock, because 71 + we need stable states on all devices for that. */ 72 + rwlock_t global_state_lock; 73 + 74 + /* used for synchronous meta data and bitmap IO 75 + * submitted by drbd_md_sync_page_io() 76 + */ 77 + void drbd_md_io_complete(struct bio *bio, int error) 78 + { 79 + struct drbd_md_io *md_io; 80 + 81 + md_io = (struct drbd_md_io *)bio->bi_private; 82 + md_io->error = error; 83 + 84 + complete(&md_io->event); 85 + } 86 + 87 + /* reads on behalf of the partner, 88 + * "submitted" by the receiver 89 + */ 90 + void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) 91 + { 92 + unsigned long flags = 0; 93 + struct drbd_epoch_entry *e = NULL; 94 + struct drbd_conf *mdev; 95 + int uptodate = bio_flagged(bio, BIO_UPTODATE); 96 + 97 + e = bio->bi_private; 98 + mdev = e->mdev; 99 + 100 + if (error) 101 + dev_warn(DEV, "read: error=%d s=%llus\n", error, 102 + (unsigned long long)e->sector); 103 + if (!error && !uptodate) { 104 + dev_warn(DEV, "read: setting error to -EIO s=%llus\n", 105 + (unsigned long long)e->sector); 106 + /* strange behavior of some lower level drivers... 107 + * fail the request by clearing the uptodate flag, 108 + * but do not return any error?! 
*/ 109 + error = -EIO; 110 + } 111 + 112 + D_ASSERT(e->block_id != ID_VACANT); 113 + 114 + spin_lock_irqsave(&mdev->req_lock, flags); 115 + mdev->read_cnt += e->size >> 9; 116 + list_del(&e->w.list); 117 + if (list_empty(&mdev->read_ee)) 118 + wake_up(&mdev->ee_wait); 119 + spin_unlock_irqrestore(&mdev->req_lock, flags); 120 + 121 + drbd_chk_io_error(mdev, error, FALSE); 122 + drbd_queue_work(&mdev->data.work, &e->w); 123 + put_ldev(mdev); 124 + } 125 + 126 + /* writes on behalf of the partner, or resync writes, 127 + * "submitted" by the receiver. 128 + */ 129 + void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) 130 + { 131 + unsigned long flags = 0; 132 + struct drbd_epoch_entry *e = NULL; 133 + struct drbd_conf *mdev; 134 + sector_t e_sector; 135 + int do_wake; 136 + int is_syncer_req; 137 + int do_al_complete_io; 138 + int uptodate = bio_flagged(bio, BIO_UPTODATE); 139 + int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); 140 + 141 + e = bio->bi_private; 142 + mdev = e->mdev; 143 + 144 + if (error) 145 + dev_warn(DEV, "write: error=%d s=%llus\n", error, 146 + (unsigned long long)e->sector); 147 + if (!error && !uptodate) { 148 + dev_warn(DEV, "write: setting error to -EIO s=%llus\n", 149 + (unsigned long long)e->sector); 150 + /* strange behavior of some lower level drivers... 151 + * fail the request by clearing the uptodate flag, 152 + * but do not return any error?! */ 153 + error = -EIO; 154 + } 155 + 156 + /* error == -ENOTSUPP would be a better test, 157 + * alas it is not reliable */ 158 + if (error && is_barrier && e->flags & EE_IS_BARRIER) { 159 + drbd_bump_write_ordering(mdev, WO_bdev_flush); 160 + spin_lock_irqsave(&mdev->req_lock, flags); 161 + list_del(&e->w.list); 162 + e->w.cb = w_e_reissue; 163 + /* put_ldev actually happens below, once we come here again. */ 164 + __release(local); 165 + spin_unlock_irqrestore(&mdev->req_lock, flags); 166 + drbd_queue_work(&mdev->data.work, &e->w); 167 + return; 168 + } 169 + 170 + D_ASSERT(e->block_id != ID_VACANT); 171 + 172 + spin_lock_irqsave(&mdev->req_lock, flags); 173 + mdev->writ_cnt += e->size >> 9; 174 + is_syncer_req = is_syncer_block_id(e->block_id); 175 + 176 + /* after we moved e to done_ee, 177 + * we may no longer access it, 178 + * it may be freed/reused already! 179 + * (as soon as we release the req_lock) */ 180 + e_sector = e->sector; 181 + do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; 182 + 183 + list_del(&e->w.list); /* has been on active_ee or sync_ee */ 184 + list_add_tail(&e->w.list, &mdev->done_ee); 185 + 186 + /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, 187 + * neither did we wake possibly waiting conflicting requests. 188 + * done from "drbd_process_done_ee" within the appropriate w.cb 189 + * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ 190 + 191 + do_wake = is_syncer_req 192 + ? 
list_empty(&mdev->sync_ee) 193 + : list_empty(&mdev->active_ee); 194 + 195 + if (error) 196 + __drbd_chk_io_error(mdev, FALSE); 197 + spin_unlock_irqrestore(&mdev->req_lock, flags); 198 + 199 + if (is_syncer_req) 200 + drbd_rs_complete_io(mdev, e_sector); 201 + 202 + if (do_wake) 203 + wake_up(&mdev->ee_wait); 204 + 205 + if (do_al_complete_io) 206 + drbd_al_complete_io(mdev, e_sector); 207 + 208 + wake_asender(mdev); 209 + put_ldev(mdev); 210 + 211 + } 212 + 213 + /* read, readA or write requests on R_PRIMARY coming from drbd_make_request 214 + */ 215 + void drbd_endio_pri(struct bio *bio, int error) 216 + { 217 + unsigned long flags; 218 + struct drbd_request *req = bio->bi_private; 219 + struct drbd_conf *mdev = req->mdev; 220 + struct bio_and_error m; 221 + enum drbd_req_event what; 222 + int uptodate = bio_flagged(bio, BIO_UPTODATE); 223 + 224 + if (error) 225 + dev_warn(DEV, "p %s: error=%d\n", 226 + bio_data_dir(bio) == WRITE ? "write" : "read", error); 227 + if (!error && !uptodate) { 228 + dev_warn(DEV, "p %s: setting error to -EIO\n", 229 + bio_data_dir(bio) == WRITE ? "write" : "read"); 230 + /* strange behavior of some lower level drivers... 231 + * fail the request by clearing the uptodate flag, 232 + * but do not return any error?! */ 233 + error = -EIO; 234 + } 235 + 236 + /* to avoid recursion in __req_mod */ 237 + if (unlikely(error)) { 238 + what = (bio_data_dir(bio) == WRITE) 239 + ? write_completed_with_error 240 + : (bio_rw(bio) == READA) 241 + ? read_ahead_completed_with_error 242 + : read_completed_with_error; 243 + } else 244 + what = completed_ok; 245 + 246 + bio_put(req->private_bio); 247 + req->private_bio = ERR_PTR(error); 248 + 249 + spin_lock_irqsave(&mdev->req_lock, flags); 250 + __req_mod(req, what, &m); 251 + spin_unlock_irqrestore(&mdev->req_lock, flags); 252 + 253 + if (m.bio) 254 + complete_master_bio(mdev, &m); 255 + } 256 + 257 + int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 258 + { 259 + struct drbd_request *req = container_of(w, struct drbd_request, w); 260 + 261 + /* NOTE: mdev->ldev can be NULL by the time we get here! */ 262 + /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */ 263 + 264 + /* the only way this callback is scheduled is from _req_may_be_done, 265 + * when it is done and had a local write error, see comments there */ 266 + drbd_req_free(req); 267 + 268 + return TRUE; 269 + } 270 + 271 + int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 272 + { 273 + struct drbd_request *req = container_of(w, struct drbd_request, w); 274 + 275 + /* We should not detach for read io-error, 276 + * but try to WRITE the P_DATA_REPLY to the failed location, 277 + * to give the disk the chance to relocate that block */ 278 + 279 + spin_lock_irq(&mdev->req_lock); 280 + if (cancel || 281 + mdev->state.conn < C_CONNECTED || 282 + mdev->state.pdsk <= D_INCONSISTENT) { 283 + _req_mod(req, send_canceled); 284 + spin_unlock_irq(&mdev->req_lock); 285 + dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n"); 286 + return 1; 287 + } 288 + spin_unlock_irq(&mdev->req_lock); 289 + 290 + return w_send_read_req(mdev, w, 0); 291 + } 292 + 293 + int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 294 + { 295 + ERR_IF(cancel) return 1; 296 + dev_err(DEV, "resync inactive, but callback triggered??\n"); 297 + return 1; /* Simply ignore this! 
*/ 298 + } 299 + 300 + void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) 301 + { 302 + struct hash_desc desc; 303 + struct scatterlist sg; 304 + struct bio_vec *bvec; 305 + int i; 306 + 307 + desc.tfm = tfm; 308 + desc.flags = 0; 309 + 310 + sg_init_table(&sg, 1); 311 + crypto_hash_init(&desc); 312 + 313 + __bio_for_each_segment(bvec, bio, i, 0) { 314 + sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); 315 + crypto_hash_update(&desc, &sg, sg.length); 316 + } 317 + crypto_hash_final(&desc, digest); 318 + } 319 + 320 + static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 321 + { 322 + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 323 + int digest_size; 324 + void *digest; 325 + int ok; 326 + 327 + D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); 328 + 329 + if (unlikely(cancel)) { 330 + drbd_free_ee(mdev, e); 331 + return 1; 332 + } 333 + 334 + if (likely(drbd_bio_uptodate(e->private_bio))) { 335 + digest_size = crypto_hash_digestsize(mdev->csums_tfm); 336 + digest = kmalloc(digest_size, GFP_NOIO); 337 + if (digest) { 338 + drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); 339 + 340 + inc_rs_pending(mdev); 341 + ok = drbd_send_drequest_csum(mdev, 342 + e->sector, 343 + e->size, 344 + digest, 345 + digest_size, 346 + P_CSUM_RS_REQUEST); 347 + kfree(digest); 348 + } else { 349 + dev_err(DEV, "kmalloc() of digest failed.\n"); 350 + ok = 0; 351 + } 352 + } else 353 + ok = 1; 354 + 355 + drbd_free_ee(mdev, e); 356 + 357 + if (unlikely(!ok)) 358 + dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); 359 + return ok; 360 + } 361 + 362 + #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) 363 + 364 + static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) 365 + { 366 + struct drbd_epoch_entry *e; 367 + 368 + if (!get_ldev(mdev)) 369 + return 0; 370 + 371 + /* GFP_TRY, because if there is no memory available right now, this may 372 + * be rescheduled for later. It is "only" background resync, after all. 
*/ 373 + e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); 374 + if (!e) { 375 + put_ldev(mdev); 376 + return 2; 377 + } 378 + 379 + spin_lock_irq(&mdev->req_lock); 380 + list_add(&e->w.list, &mdev->read_ee); 381 + spin_unlock_irq(&mdev->req_lock); 382 + 383 + e->private_bio->bi_end_io = drbd_endio_read_sec; 384 + e->private_bio->bi_rw = READ; 385 + e->w.cb = w_e_send_csum; 386 + 387 + mdev->read_cnt += size >> 9; 388 + drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); 389 + 390 + return 1; 391 + } 392 + 393 + void resync_timer_fn(unsigned long data) 394 + { 395 + unsigned long flags; 396 + struct drbd_conf *mdev = (struct drbd_conf *) data; 397 + int queue; 398 + 399 + spin_lock_irqsave(&mdev->req_lock, flags); 400 + 401 + if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) { 402 + queue = 1; 403 + if (mdev->state.conn == C_VERIFY_S) 404 + mdev->resync_work.cb = w_make_ov_request; 405 + else 406 + mdev->resync_work.cb = w_make_resync_request; 407 + } else { 408 + queue = 0; 409 + mdev->resync_work.cb = w_resync_inactive; 410 + } 411 + 412 + spin_unlock_irqrestore(&mdev->req_lock, flags); 413 + 414 + /* harmless race: list_empty outside data.work.q_lock */ 415 + if (list_empty(&mdev->resync_work.list) && queue) 416 + drbd_queue_work(&mdev->data.work, &mdev->resync_work); 417 + } 418 + 419 + int w_make_resync_request(struct drbd_conf *mdev, 420 + struct drbd_work *w, int cancel) 421 + { 422 + unsigned long bit; 423 + sector_t sector; 424 + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); 425 + int max_segment_size = queue_max_segment_size(mdev->rq_queue); 426 + int number, i, size, pe, mx; 427 + int align, queued, sndbuf; 428 + 429 + if (unlikely(cancel)) 430 + return 1; 431 + 432 + if (unlikely(mdev->state.conn < C_CONNECTED)) { 433 + dev_err(DEV, "Confused in w_make_resync_request()! 
cstate < Connected"); 434 + return 0; 435 + } 436 + 437 + if (mdev->state.conn != C_SYNC_TARGET) 438 + dev_err(DEV, "%s in w_make_resync_request\n", 439 + drbd_conn_str(mdev->state.conn)); 440 + 441 + if (!get_ldev(mdev)) { 442 + /* Since we only need to access mdev->resync a 443 + get_ldev_if_state(mdev,D_FAILED) would be sufficient, but 444 + to continue resync with a broken disk makes no sense at 445 + all */ 446 + dev_err(DEV, "Disk broke down during resync!\n"); 447 + mdev->resync_work.cb = w_resync_inactive; 448 + return 1; 449 + } 450 + 451 + number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); 452 + pe = atomic_read(&mdev->rs_pending_cnt); 453 + 454 + mutex_lock(&mdev->data.mutex); 455 + if (mdev->data.socket) 456 + mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req); 457 + else 458 + mx = 1; 459 + mutex_unlock(&mdev->data.mutex); 460 + 461 + /* For resync rates >160MB/sec, allow more pending RS requests */ 462 + if (number > mx) 463 + mx = number; 464 + 465 + /* Limit the number of pending RS requests to no more than the peer's receive buffer */ 466 + if ((pe + number) > mx) { 467 + number = mx - pe; 468 + } 469 + 470 + for (i = 0; i < number; i++) { 471 + /* Stop generating RS requests when half of the send buffer is filled */ 472 + mutex_lock(&mdev->data.mutex); 473 + if (mdev->data.socket) { 474 + queued = mdev->data.socket->sk->sk_wmem_queued; 475 + sndbuf = mdev->data.socket->sk->sk_sndbuf; 476 + } else { 477 + queued = 1; 478 + sndbuf = 0; 479 + } 480 + mutex_unlock(&mdev->data.mutex); 481 + if (queued > sndbuf / 2) 482 + goto requeue; 483 + 484 + next_sector: 485 + size = BM_BLOCK_SIZE; 486 + bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo); 487 + 488 + if (bit == -1UL) { 489 + mdev->bm_resync_fo = drbd_bm_bits(mdev); 490 + mdev->resync_work.cb = w_resync_inactive; 491 + put_ldev(mdev); 492 + return 1; 493 + } 494 + 495 + sector = BM_BIT_TO_SECT(bit); 496 + 497 + if (drbd_try_rs_begin_io(mdev, sector)) { 498 + mdev->bm_resync_fo = bit; 499 + goto requeue; 500 + } 501 + mdev->bm_resync_fo = bit + 1; 502 + 503 + if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) { 504 + drbd_rs_complete_io(mdev, sector); 505 + goto next_sector; 506 + } 507 + 508 + #if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE 509 + /* try to find some adjacent bits. 510 + * we stop if we already have the maximum req size. 511 + * 512 + * Additionally always align bigger requests, in order to 513 + * be prepared for all stripe sizes of software RAIDs. 514 + * 515 + * we _do_ care about the agreed-upon q->max_segment_size 516 + * here, as splitting up the requests on the other side is more 517 + * difficult. the consequence is that on lvm and md and other 518 + * "indirect" devices, this is dead code, since 519 + * q->max_segment_size will be PAGE_SIZE. 520 + */ 521 + align = 1; 522 + for (;;) { 523 + if (size + BM_BLOCK_SIZE > max_segment_size) 524 + break; 525 + 526 + /* Always be aligned */ 527 + if (sector & ((1<<(align+3))-1)) 528 + break; 529 + 530 + /* do not cross extent boundaries */ 531 + if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) 532 + break; 533 + /* now, is it actually dirty, after all? 
534 + * caution, drbd_bm_test_bit is tri-state for some 535 + * obscure reason; ( b == 0 ) would get the out-of-band 536 + * only accidentally right because of the "oddly sized" 537 + * adjustment below */ 538 + if (drbd_bm_test_bit(mdev, bit+1) != 1) 539 + break; 540 + bit++; 541 + size += BM_BLOCK_SIZE; 542 + if ((BM_BLOCK_SIZE << align) <= size) 543 + align++; 544 + i++; 545 + } 546 + /* if we merged some, 547 + * reset the offset to start the next drbd_bm_find_next from */ 548 + if (size > BM_BLOCK_SIZE) 549 + mdev->bm_resync_fo = bit + 1; 550 + #endif 551 + 552 + /* adjust very last sectors, in case we are oddly sized */ 553 + if (sector + (size>>9) > capacity) 554 + size = (capacity-sector)<<9; 555 + if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { 556 + switch (read_for_csum(mdev, sector, size)) { 557 + case 0: /* Disk failure*/ 558 + put_ldev(mdev); 559 + return 0; 560 + case 2: /* Allocation failed */ 561 + drbd_rs_complete_io(mdev, sector); 562 + mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); 563 + goto requeue; 564 + /* case 1: everything ok */ 565 + } 566 + } else { 567 + inc_rs_pending(mdev); 568 + if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST, 569 + sector, size, ID_SYNCER)) { 570 + dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); 571 + dec_rs_pending(mdev); 572 + put_ldev(mdev); 573 + return 0; 574 + } 575 + } 576 + } 577 + 578 + if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) { 579 + /* last syncer _request_ was sent, 580 + * but the P_RS_DATA_REPLY not yet received. sync will end (and 581 + * next sync group will resume), as soon as we receive the last 582 + * resync data block, and the last bit is cleared. 583 + * until then resync "work" is "inactive" ... 584 + */ 585 + mdev->resync_work.cb = w_resync_inactive; 586 + put_ldev(mdev); 587 + return 1; 588 + } 589 + 590 + requeue: 591 + mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); 592 + put_ldev(mdev); 593 + return 1; 594 + } 595 + 596 + static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 597 + { 598 + int number, i, size; 599 + sector_t sector; 600 + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); 601 + 602 + if (unlikely(cancel)) 603 + return 1; 604 + 605 + if (unlikely(mdev->state.conn < C_CONNECTED)) { 606 + dev_err(DEV, "Confused in w_make_ov_request()! 
cstate < Connected"); 607 + return 0; 608 + } 609 + 610 + number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); 611 + if (atomic_read(&mdev->rs_pending_cnt) > number) 612 + goto requeue; 613 + 614 + number -= atomic_read(&mdev->rs_pending_cnt); 615 + 616 + sector = mdev->ov_position; 617 + for (i = 0; i < number; i++) { 618 + if (sector >= capacity) { 619 + mdev->resync_work.cb = w_resync_inactive; 620 + return 1; 621 + } 622 + 623 + size = BM_BLOCK_SIZE; 624 + 625 + if (drbd_try_rs_begin_io(mdev, sector)) { 626 + mdev->ov_position = sector; 627 + goto requeue; 628 + } 629 + 630 + if (sector + (size>>9) > capacity) 631 + size = (capacity-sector)<<9; 632 + 633 + inc_rs_pending(mdev); 634 + if (!drbd_send_ov_request(mdev, sector, size)) { 635 + dec_rs_pending(mdev); 636 + return 0; 637 + } 638 + sector += BM_SECT_PER_BIT; 639 + } 640 + mdev->ov_position = sector; 641 + 642 + requeue: 643 + mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); 644 + return 1; 645 + } 646 + 647 + 648 + int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 649 + { 650 + kfree(w); 651 + ov_oos_print(mdev); 652 + drbd_resync_finished(mdev); 653 + 654 + return 1; 655 + } 656 + 657 + static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 658 + { 659 + kfree(w); 660 + 661 + drbd_resync_finished(mdev); 662 + 663 + return 1; 664 + } 665 + 666 + int drbd_resync_finished(struct drbd_conf *mdev) 667 + { 668 + unsigned long db, dt, dbdt; 669 + unsigned long n_oos; 670 + union drbd_state os, ns; 671 + struct drbd_work *w; 672 + char *khelper_cmd = NULL; 673 + 674 + /* Remove all elements from the resync LRU. Since future actions 675 + * might set bits in the (main) bitmap, then the entries in the 676 + * resync LRU would be wrong. */ 677 + if (drbd_rs_del_all(mdev)) { 678 + /* In case this is not possible now, most probably because 679 + * there are P_RS_DATA_REPLY Packets lingering on the worker's 680 + * queue (or even the read operations for those packets 681 + * is not finished by now). Retry in 100ms. */ 682 + 683 + drbd_kick_lo(mdev); 684 + __set_current_state(TASK_INTERRUPTIBLE); 685 + schedule_timeout(HZ / 10); 686 + w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); 687 + if (w) { 688 + w->cb = w_resync_finished; 689 + drbd_queue_work(&mdev->data.work, w); 690 + return 1; 691 + } 692 + dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); 693 + } 694 + 695 + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; 696 + if (dt <= 0) 697 + dt = 1; 698 + db = mdev->rs_total; 699 + dbdt = Bit2KB(db/dt); 700 + mdev->rs_paused /= HZ; 701 + 702 + if (!get_ldev(mdev)) 703 + goto out; 704 + 705 + spin_lock_irq(&mdev->req_lock); 706 + os = mdev->state; 707 + 708 + /* This protects us against multiple calls (that can happen in the presence 709 + of application IO), and against connectivity loss just before we arrive here. */ 710 + if (os.conn <= C_CONNECTED) 711 + goto out_unlock; 712 + 713 + ns = os; 714 + ns.conn = C_CONNECTED; 715 + 716 + dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", 717 + (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ? 
718 + "Online verify " : "Resync", 719 + dt + mdev->rs_paused, mdev->rs_paused, dbdt); 720 + 721 + n_oos = drbd_bm_total_weight(mdev); 722 + 723 + if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) { 724 + if (n_oos) { 725 + dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n", 726 + n_oos, Bit2KB(1)); 727 + khelper_cmd = "out-of-sync"; 728 + } 729 + } else { 730 + D_ASSERT((n_oos - mdev->rs_failed) == 0); 731 + 732 + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) 733 + khelper_cmd = "after-resync-target"; 734 + 735 + if (mdev->csums_tfm && mdev->rs_total) { 736 + const unsigned long s = mdev->rs_same_csum; 737 + const unsigned long t = mdev->rs_total; 738 + const int ratio = 739 + (t == 0) ? 0 : 740 + (t < 100000) ? ((s*100)/t) : (s/(t/100)); 741 + dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " 742 + "transferred %luK total %luK\n", 743 + ratio, 744 + Bit2KB(mdev->rs_same_csum), 745 + Bit2KB(mdev->rs_total - mdev->rs_same_csum), 746 + Bit2KB(mdev->rs_total)); 747 + } 748 + } 749 + 750 + if (mdev->rs_failed) { 751 + dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed); 752 + 753 + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { 754 + ns.disk = D_INCONSISTENT; 755 + ns.pdsk = D_UP_TO_DATE; 756 + } else { 757 + ns.disk = D_UP_TO_DATE; 758 + ns.pdsk = D_INCONSISTENT; 759 + } 760 + } else { 761 + ns.disk = D_UP_TO_DATE; 762 + ns.pdsk = D_UP_TO_DATE; 763 + 764 + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { 765 + if (mdev->p_uuid) { 766 + int i; 767 + for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++) 768 + _drbd_uuid_set(mdev, i, mdev->p_uuid[i]); 769 + drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]); 770 + _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]); 771 + } else { 772 + dev_err(DEV, "mdev->p_uuid is NULL! BUG\n"); 773 + } 774 + } 775 + 776 + drbd_uuid_set_bm(mdev, 0UL); 777 + 778 + if (mdev->p_uuid) { 779 + /* Now the two UUID sets are equal, update what we 780 + * know of the peer. */ 781 + int i; 782 + for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) 783 + mdev->p_uuid[i] = mdev->ldev->md.uuid[i]; 784 + } 785 + } 786 + 787 + _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); 788 + out_unlock: 789 + spin_unlock_irq(&mdev->req_lock); 790 + put_ldev(mdev); 791 + out: 792 + mdev->rs_total = 0; 793 + mdev->rs_failed = 0; 794 + mdev->rs_paused = 0; 795 + mdev->ov_start_sector = 0; 796 + 797 + if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { 798 + dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); 799 + drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); 800 + } 801 + 802 + if (khelper_cmd) 803 + drbd_khelper(mdev, khelper_cmd); 804 + 805 + return 1; 806 + } 807 + 808 + /* helper */ 809 + static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) 810 + { 811 + if (drbd_bio_has_active_page(e->private_bio)) { 812 + /* This might happen if sendpage() has not finished */ 813 + spin_lock_irq(&mdev->req_lock); 814 + list_add_tail(&e->w.list, &mdev->net_ee); 815 + spin_unlock_irq(&mdev->req_lock); 816 + } else 817 + drbd_free_ee(mdev, e); 818 + } 819 + 820 + /** 821 + * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST 822 + * @mdev: DRBD device. 823 + * @w: work object. 
824 + * @cancel: The connection will be closed anyways 825 + */ 826 + int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 827 + { 828 + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 829 + int ok; 830 + 831 + if (unlikely(cancel)) { 832 + drbd_free_ee(mdev, e); 833 + dec_unacked(mdev); 834 + return 1; 835 + } 836 + 837 + if (likely(drbd_bio_uptodate(e->private_bio))) { 838 + ok = drbd_send_block(mdev, P_DATA_REPLY, e); 839 + } else { 840 + if (__ratelimit(&drbd_ratelimit_state)) 841 + dev_err(DEV, "Sending NegDReply. sector=%llus.\n", 842 + (unsigned long long)e->sector); 843 + 844 + ok = drbd_send_ack(mdev, P_NEG_DREPLY, e); 845 + } 846 + 847 + dec_unacked(mdev); 848 + 849 + move_to_net_ee_or_free(mdev, e); 850 + 851 + if (unlikely(!ok)) 852 + dev_err(DEV, "drbd_send_block() failed\n"); 853 + return ok; 854 + } 855 + 856 + /** 857 + * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST 858 + * @mdev: DRBD device. 859 + * @w: work object. 860 + * @cancel: The connection will be closed anyways 861 + */ 862 + int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 863 + { 864 + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 865 + int ok; 866 + 867 + if (unlikely(cancel)) { 868 + drbd_free_ee(mdev, e); 869 + dec_unacked(mdev); 870 + return 1; 871 + } 872 + 873 + if (get_ldev_if_state(mdev, D_FAILED)) { 874 + drbd_rs_complete_io(mdev, e->sector); 875 + put_ldev(mdev); 876 + } 877 + 878 + if (likely(drbd_bio_uptodate(e->private_bio))) { 879 + if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { 880 + inc_rs_pending(mdev); 881 + ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); 882 + } else { 883 + if (__ratelimit(&drbd_ratelimit_state)) 884 + dev_err(DEV, "Not sending RSDataReply, " 885 + "partner DISKLESS!\n"); 886 + ok = 1; 887 + } 888 + } else { 889 + if (__ratelimit(&drbd_ratelimit_state)) 890 + dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", 891 + (unsigned long long)e->sector); 892 + 893 + ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); 894 + 895 + /* update resync data with failure */ 896 + drbd_rs_failed_io(mdev, e->sector, e->size); 897 + } 898 + 899 + dec_unacked(mdev); 900 + 901 + move_to_net_ee_or_free(mdev, e); 902 + 903 + if (unlikely(!ok)) 904 + dev_err(DEV, "drbd_send_block() failed\n"); 905 + return ok; 906 + } 907 + 908 + int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 909 + { 910 + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 911 + struct digest_info *di; 912 + int digest_size; 913 + void *digest = NULL; 914 + int ok, eq = 0; 915 + 916 + if (unlikely(cancel)) { 917 + drbd_free_ee(mdev, e); 918 + dec_unacked(mdev); 919 + return 1; 920 + } 921 + 922 + drbd_rs_complete_io(mdev, e->sector); 923 + 924 + di = (struct digest_info *)(unsigned long)e->block_id; 925 + 926 + if (likely(drbd_bio_uptodate(e->private_bio))) { 927 + /* quick hack to try to avoid a race against reconfiguration. 
928 + * a real fix would be much more involved, 929 + * introducing more locking mechanisms */ 930 + if (mdev->csums_tfm) { 931 + digest_size = crypto_hash_digestsize(mdev->csums_tfm); 932 + D_ASSERT(digest_size == di->digest_size); 933 + digest = kmalloc(digest_size, GFP_NOIO); 934 + } 935 + if (digest) { 936 + drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); 937 + eq = !memcmp(digest, di->digest, digest_size); 938 + kfree(digest); 939 + } 940 + 941 + if (eq) { 942 + drbd_set_in_sync(mdev, e->sector, e->size); 943 + mdev->rs_same_csum++; 944 + ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); 945 + } else { 946 + inc_rs_pending(mdev); 947 + e->block_id = ID_SYNCER; 948 + ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); 949 + } 950 + } else { 951 + ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); 952 + if (__ratelimit(&drbd_ratelimit_state)) 953 + dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); 954 + } 955 + 956 + dec_unacked(mdev); 957 + 958 + kfree(di); 959 + 960 + move_to_net_ee_or_free(mdev, e); 961 + 962 + if (unlikely(!ok)) 963 + dev_err(DEV, "drbd_send_block/ack() failed\n"); 964 + return ok; 965 + } 966 + 967 + int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 968 + { 969 + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 970 + int digest_size; 971 + void *digest; 972 + int ok = 1; 973 + 974 + if (unlikely(cancel)) 975 + goto out; 976 + 977 + if (unlikely(!drbd_bio_uptodate(e->private_bio))) 978 + goto out; 979 + 980 + digest_size = crypto_hash_digestsize(mdev->verify_tfm); 981 + /* FIXME if this allocation fails, online verify will not terminate! */ 982 + digest = kmalloc(digest_size, GFP_NOIO); 983 + if (digest) { 984 + drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); 985 + inc_rs_pending(mdev); 986 + ok = drbd_send_drequest_csum(mdev, e->sector, e->size, 987 + digest, digest_size, P_OV_REPLY); 988 + if (!ok) 989 + dec_rs_pending(mdev); 990 + kfree(digest); 991 + } 992 + 993 + out: 994 + drbd_free_ee(mdev, e); 995 + 996 + dec_unacked(mdev); 997 + 998 + return ok; 999 + } 1000 + 1001 + void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) 1002 + { 1003 + if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { 1004 + mdev->ov_last_oos_size += size>>9; 1005 + } else { 1006 + mdev->ov_last_oos_start = sector; 1007 + mdev->ov_last_oos_size = size>>9; 1008 + } 1009 + drbd_set_out_of_sync(mdev, sector, size); 1010 + set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); 1011 + } 1012 + 1013 + int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1014 + { 1015 + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 1016 + struct digest_info *di; 1017 + int digest_size; 1018 + void *digest; 1019 + int ok, eq = 0; 1020 + 1021 + if (unlikely(cancel)) { 1022 + drbd_free_ee(mdev, e); 1023 + dec_unacked(mdev); 1024 + return 1; 1025 + } 1026 + 1027 + /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all 1028 + * the resync lru has been cleaned up already */ 1029 + drbd_rs_complete_io(mdev, e->sector); 1030 + 1031 + di = (struct digest_info *)(unsigned long)e->block_id; 1032 + 1033 + if (likely(drbd_bio_uptodate(e->private_bio))) { 1034 + digest_size = crypto_hash_digestsize(mdev->verify_tfm); 1035 + digest = kmalloc(digest_size, GFP_NOIO); 1036 + if (digest) { 1037 + drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); 1038 + 1039 + D_ASSERT(digest_size == di->digest_size); 1040 + eq = !memcmp(digest, di->digest, digest_size); 1041 + 
kfree(digest); 1042 + } 1043 + } else { 1044 + ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); 1045 + if (__ratelimit(&drbd_ratelimit_state)) 1046 + dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); 1047 + } 1048 + 1049 + dec_unacked(mdev); 1050 + 1051 + kfree(di); 1052 + 1053 + if (!eq) 1054 + drbd_ov_oos_found(mdev, e->sector, e->size); 1055 + else 1056 + ov_oos_print(mdev); 1057 + 1058 + ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, 1059 + eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); 1060 + 1061 + drbd_free_ee(mdev, e); 1062 + 1063 + if (--mdev->ov_left == 0) { 1064 + ov_oos_print(mdev); 1065 + drbd_resync_finished(mdev); 1066 + } 1067 + 1068 + return ok; 1069 + } 1070 + 1071 + int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1072 + { 1073 + struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); 1074 + complete(&b->done); 1075 + return 1; 1076 + } 1077 + 1078 + int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1079 + { 1080 + struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w); 1081 + struct p_barrier *p = &mdev->data.sbuf.barrier; 1082 + int ok = 1; 1083 + 1084 + /* really avoid racing with tl_clear. w.cb may have been referenced 1085 + * just before it was reassigned and re-queued, so double check that. 1086 + * actually, this race was harmless, since we only try to send the 1087 + * barrier packet here, and otherwise do nothing with the object. 1088 + * but compare with the head of w_clear_epoch */ 1089 + spin_lock_irq(&mdev->req_lock); 1090 + if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED) 1091 + cancel = 1; 1092 + spin_unlock_irq(&mdev->req_lock); 1093 + if (cancel) 1094 + return 1; 1095 + 1096 + if (!drbd_get_data_sock(mdev)) 1097 + return 0; 1098 + p->barrier = b->br_number; 1099 + /* inc_ap_pending was done where this was queued. 1100 + * dec_ap_pending will be done in got_BarrierAck 1101 + * or (on connection loss) in w_clear_epoch. */ 1102 + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, 1103 + (struct p_header *)p, sizeof(*p), 0); 1104 + drbd_put_data_sock(mdev); 1105 + 1106 + return ok; 1107 + } 1108 + 1109 + int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1110 + { 1111 + if (cancel) 1112 + return 1; 1113 + return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); 1114 + } 1115 + 1116 + /** 1117 + * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request 1118 + * @mdev: DRBD device. 1119 + * @w: work object. 1120 + * @cancel: The connection will be closed anyways 1121 + */ 1122 + int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1123 + { 1124 + struct drbd_request *req = container_of(w, struct drbd_request, w); 1125 + int ok; 1126 + 1127 + if (unlikely(cancel)) { 1128 + req_mod(req, send_canceled); 1129 + return 1; 1130 + } 1131 + 1132 + ok = drbd_send_dblock(mdev, req); 1133 + req_mod(req, ok ? handed_over_to_network : send_failed); 1134 + 1135 + return ok; 1136 + } 1137 + 1138 + /** 1139 + * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet 1140 + * @mdev: DRBD device. 1141 + * @w: work object. 
1142 + * @cancel: The connection will be closed anyways 1143 + */ 1144 + int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1145 + { 1146 + struct drbd_request *req = container_of(w, struct drbd_request, w); 1147 + int ok; 1148 + 1149 + if (unlikely(cancel)) { 1150 + req_mod(req, send_canceled); 1151 + return 1; 1152 + } 1153 + 1154 + ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size, 1155 + (unsigned long)req); 1156 + 1157 + if (!ok) { 1158 + /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send(); 1159 + * so this is probably redundant */ 1160 + if (mdev->state.conn >= C_CONNECTED) 1161 + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); 1162 + } 1163 + req_mod(req, ok ? handed_over_to_network : send_failed); 1164 + 1165 + return ok; 1166 + } 1167 + 1168 + static int _drbd_may_sync_now(struct drbd_conf *mdev) 1169 + { 1170 + struct drbd_conf *odev = mdev; 1171 + 1172 + while (1) { 1173 + if (odev->sync_conf.after == -1) 1174 + return 1; 1175 + odev = minor_to_mdev(odev->sync_conf.after); 1176 + ERR_IF(!odev) return 1; 1177 + if ((odev->state.conn >= C_SYNC_SOURCE && 1178 + odev->state.conn <= C_PAUSED_SYNC_T) || 1179 + odev->state.aftr_isp || odev->state.peer_isp || 1180 + odev->state.user_isp) 1181 + return 0; 1182 + } 1183 + } 1184 + 1185 + /** 1186 + * _drbd_pause_after() - Pause resync on all devices that may not resync now 1187 + * @mdev: DRBD device. 1188 + * 1189 + * Called from process context only (admin command and after_state_ch). 1190 + */ 1191 + static int _drbd_pause_after(struct drbd_conf *mdev) 1192 + { 1193 + struct drbd_conf *odev; 1194 + int i, rv = 0; 1195 + 1196 + for (i = 0; i < minor_count; i++) { 1197 + odev = minor_to_mdev(i); 1198 + if (!odev) 1199 + continue; 1200 + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) 1201 + continue; 1202 + if (!_drbd_may_sync_now(odev)) 1203 + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) 1204 + != SS_NOTHING_TO_DO); 1205 + } 1206 + 1207 + return rv; 1208 + } 1209 + 1210 + /** 1211 + * _drbd_resume_next() - Resume resync on all devices that may resync now 1212 + * @mdev: DRBD device. 1213 + * 1214 + * Called from process context only (admin command and worker). 
1215 + */ 1216 + static int _drbd_resume_next(struct drbd_conf *mdev) 1217 + { 1218 + struct drbd_conf *odev; 1219 + int i, rv = 0; 1220 + 1221 + for (i = 0; i < minor_count; i++) { 1222 + odev = minor_to_mdev(i); 1223 + if (!odev) 1224 + continue; 1225 + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) 1226 + continue; 1227 + if (odev->state.aftr_isp) { 1228 + if (_drbd_may_sync_now(odev)) 1229 + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), 1230 + CS_HARD, NULL) 1231 + != SS_NOTHING_TO_DO) ; 1232 + } 1233 + } 1234 + return rv; 1235 + } 1236 + 1237 + void resume_next_sg(struct drbd_conf *mdev) 1238 + { 1239 + write_lock_irq(&global_state_lock); 1240 + _drbd_resume_next(mdev); 1241 + write_unlock_irq(&global_state_lock); 1242 + } 1243 + 1244 + void suspend_other_sg(struct drbd_conf *mdev) 1245 + { 1246 + write_lock_irq(&global_state_lock); 1247 + _drbd_pause_after(mdev); 1248 + write_unlock_irq(&global_state_lock); 1249 + } 1250 + 1251 + static int sync_after_error(struct drbd_conf *mdev, int o_minor) 1252 + { 1253 + struct drbd_conf *odev; 1254 + 1255 + if (o_minor == -1) 1256 + return NO_ERROR; 1257 + if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) 1258 + return ERR_SYNC_AFTER; 1259 + 1260 + /* check for loops */ 1261 + odev = minor_to_mdev(o_minor); 1262 + while (1) { 1263 + if (odev == mdev) 1264 + return ERR_SYNC_AFTER_CYCLE; 1265 + 1266 + /* dependency chain ends here, no cycles. */ 1267 + if (odev->sync_conf.after == -1) 1268 + return NO_ERROR; 1269 + 1270 + /* follow the dependency chain */ 1271 + odev = minor_to_mdev(odev->sync_conf.after); 1272 + } 1273 + } 1274 + 1275 + int drbd_alter_sa(struct drbd_conf *mdev, int na) 1276 + { 1277 + int changes; 1278 + int retcode; 1279 + 1280 + write_lock_irq(&global_state_lock); 1281 + retcode = sync_after_error(mdev, na); 1282 + if (retcode == NO_ERROR) { 1283 + mdev->sync_conf.after = na; 1284 + do { 1285 + changes = _drbd_pause_after(mdev); 1286 + changes |= _drbd_resume_next(mdev); 1287 + } while (changes); 1288 + } 1289 + write_unlock_irq(&global_state_lock); 1290 + return retcode; 1291 + } 1292 + 1293 + /** 1294 + * drbd_start_resync() - Start the resync process 1295 + * @mdev: DRBD device. 1296 + * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET 1297 + * 1298 + * This function might bring you directly into one of the 1299 + * C_PAUSED_SYNC_* states. 1300 + */ 1301 + void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) 1302 + { 1303 + union drbd_state ns; 1304 + int r; 1305 + 1306 + if (mdev->state.conn >= C_SYNC_SOURCE) { 1307 + dev_err(DEV, "Resync already running!\n"); 1308 + return; 1309 + } 1310 + 1311 + /* In case a previous resync run was aborted by an IO error/detach on the peer. */ 1312 + drbd_rs_cancel_all(mdev); 1313 + 1314 + if (side == C_SYNC_TARGET) { 1315 + /* Since application IO was locked out during C_WF_BITMAP_T and 1316 + C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET 1317 + we check that we might make the data inconsistent. 
*/ 1318 + r = drbd_khelper(mdev, "before-resync-target"); 1319 + r = (r >> 8) & 0xff; 1320 + if (r > 0) { 1321 + dev_info(DEV, "before-resync-target handler returned %d, " 1322 + "dropping connection.\n", r); 1323 + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 1324 + return; 1325 + } 1326 + } 1327 + 1328 + drbd_state_lock(mdev); 1329 + 1330 + if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { 1331 + drbd_state_unlock(mdev); 1332 + return; 1333 + } 1334 + 1335 + if (side == C_SYNC_TARGET) { 1336 + mdev->bm_resync_fo = 0; 1337 + } else /* side == C_SYNC_SOURCE */ { 1338 + u64 uuid; 1339 + 1340 + get_random_bytes(&uuid, sizeof(u64)); 1341 + drbd_uuid_set(mdev, UI_BITMAP, uuid); 1342 + drbd_send_sync_uuid(mdev, uuid); 1343 + 1344 + D_ASSERT(mdev->state.disk == D_UP_TO_DATE); 1345 + } 1346 + 1347 + write_lock_irq(&global_state_lock); 1348 + ns = mdev->state; 1349 + 1350 + ns.aftr_isp = !_drbd_may_sync_now(mdev); 1351 + 1352 + ns.conn = side; 1353 + 1354 + if (side == C_SYNC_TARGET) 1355 + ns.disk = D_INCONSISTENT; 1356 + else /* side == C_SYNC_SOURCE */ 1357 + ns.pdsk = D_INCONSISTENT; 1358 + 1359 + r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); 1360 + ns = mdev->state; 1361 + 1362 + if (ns.conn < C_CONNECTED) 1363 + r = SS_UNKNOWN_ERROR; 1364 + 1365 + if (r == SS_SUCCESS) { 1366 + mdev->rs_total = 1367 + mdev->rs_mark_left = drbd_bm_total_weight(mdev); 1368 + mdev->rs_failed = 0; 1369 + mdev->rs_paused = 0; 1370 + mdev->rs_start = 1371 + mdev->rs_mark_time = jiffies; 1372 + mdev->rs_same_csum = 0; 1373 + _drbd_pause_after(mdev); 1374 + } 1375 + write_unlock_irq(&global_state_lock); 1376 + drbd_state_unlock(mdev); 1377 + put_ldev(mdev); 1378 + 1379 + if (r == SS_SUCCESS) { 1380 + dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", 1381 + drbd_conn_str(ns.conn), 1382 + (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), 1383 + (unsigned long) mdev->rs_total); 1384 + 1385 + if (mdev->rs_total == 0) { 1386 + /* Peer still reachable? Beware of failing before-resync-target handlers! */ 1387 + request_ping(mdev); 1388 + __set_current_state(TASK_INTERRUPTIBLE); 1389 + schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */ 1390 + drbd_resync_finished(mdev); 1391 + return; 1392 + } 1393 + 1394 + /* ns.conn may already be != mdev->state.conn, 1395 + * we may have been paused in between, or become paused until 1396 + * the timer triggers. 
1397 + * No matter, that is handled in resync_timer_fn() */ 1398 + if (ns.conn == C_SYNC_TARGET) 1399 + mod_timer(&mdev->resync_timer, jiffies); 1400 + 1401 + drbd_md_sync(mdev); 1402 + } 1403 + } 1404 + 1405 + int drbd_worker(struct drbd_thread *thi) 1406 + { 1407 + struct drbd_conf *mdev = thi->mdev; 1408 + struct drbd_work *w = NULL; 1409 + LIST_HEAD(work_list); 1410 + int intr = 0, i; 1411 + 1412 + sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); 1413 + 1414 + while (get_t_state(thi) == Running) { 1415 + drbd_thread_current_set_cpu(mdev); 1416 + 1417 + if (down_trylock(&mdev->data.work.s)) { 1418 + mutex_lock(&mdev->data.mutex); 1419 + if (mdev->data.socket && !mdev->net_conf->no_cork) 1420 + drbd_tcp_uncork(mdev->data.socket); 1421 + mutex_unlock(&mdev->data.mutex); 1422 + 1423 + intr = down_interruptible(&mdev->data.work.s); 1424 + 1425 + mutex_lock(&mdev->data.mutex); 1426 + if (mdev->data.socket && !mdev->net_conf->no_cork) 1427 + drbd_tcp_cork(mdev->data.socket); 1428 + mutex_unlock(&mdev->data.mutex); 1429 + } 1430 + 1431 + if (intr) { 1432 + D_ASSERT(intr == -EINTR); 1433 + flush_signals(current); 1434 + ERR_IF (get_t_state(thi) == Running) 1435 + continue; 1436 + break; 1437 + } 1438 + 1439 + if (get_t_state(thi) != Running) 1440 + break; 1441 + /* With this break, we have done a down() but not consumed 1442 + the entry from the list. The cleanup code takes care of 1443 + this... */ 1444 + 1445 + w = NULL; 1446 + spin_lock_irq(&mdev->data.work.q_lock); 1447 + ERR_IF(list_empty(&mdev->data.work.q)) { 1448 + /* something terribly wrong in our logic. 1449 + * we were able to down() the semaphore, 1450 + * but the list is empty... doh. 1451 + * 1452 + * what is the best thing to do now? 1453 + * try again from scratch, restarting the receiver, 1454 + * asender, whatnot? could break even more ugly, 1455 + * e.g. when we are primary, but no good local data. 1456 + * 1457 + * I'll try to get away just starting over this loop. 1458 + */ 1459 + spin_unlock_irq(&mdev->data.work.q_lock); 1460 + continue; 1461 + } 1462 + w = list_entry(mdev->data.work.q.next, struct drbd_work, list); 1463 + list_del_init(&w->list); 1464 + spin_unlock_irq(&mdev->data.work.q_lock); 1465 + 1466 + if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) { 1467 + /* dev_warn(DEV, "worker: a callback failed! \n"); */ 1468 + if (mdev->state.conn >= C_CONNECTED) 1469 + drbd_force_state(mdev, 1470 + NS(conn, C_NETWORK_FAILURE)); 1471 + } 1472 + } 1473 + D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags)); 1474 + D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags)); 1475 + 1476 + spin_lock_irq(&mdev->data.work.q_lock); 1477 + i = 0; 1478 + while (!list_empty(&mdev->data.work.q)) { 1479 + list_splice_init(&mdev->data.work.q, &work_list); 1480 + spin_unlock_irq(&mdev->data.work.q_lock); 1481 + 1482 + while (!list_empty(&work_list)) { 1483 + w = list_entry(work_list.next, struct drbd_work, list); 1484 + list_del_init(&w->list); 1485 + w->cb(mdev, w, 1); 1486 + i++; /* dead debugging code */ 1487 + } 1488 + 1489 + spin_lock_irq(&mdev->data.work.q_lock); 1490 + } 1491 + sema_init(&mdev->data.work.s, 0); 1492 + /* DANGEROUS race: if someone did queue his work within the spinlock, 1493 + * but up() ed outside the spinlock, we could get an up() on the 1494 + * semaphore without corresponding list entry. 1495 + * So don't do that. 
1496 + */ 1497 + spin_unlock_irq(&mdev->data.work.q_lock); 1498 + 1499 + D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); 1500 + /* _drbd_set_state only uses stop_nowait. 1501 + * wait here for the Exiting receiver. */ 1502 + drbd_thread_stop(&mdev->receiver); 1503 + drbd_mdev_cleanup(mdev); 1504 + 1505 + dev_info(DEV, "worker terminated\n"); 1506 + 1507 + clear_bit(DEVICE_DYING, &mdev->flags); 1508 + clear_bit(CONFIG_PENDING, &mdev->flags); 1509 + wake_up(&mdev->state_wait); 1510 + 1511 + return 0; 1512 + }
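The sync-after machinery above (_drbd_may_sync_now(), sync_after_error(), drbd_alter_sa()) is a plain dependency-chain walk: a device may resync only once the minor named in its sync_conf.after field is allowed to, and a new dependency is rejected if following the chain leads back to the starting device. A standalone sketch of the same cycle check, using a flat array in place of minor_to_mdev() (the array and helper names are invented for illustration, not DRBD code):

#include <stdio.h>

#define MINOR_COUNT 4

/* each device either has no dependency (-1) or names the minor it syncs after */
static int sync_after[MINOR_COUNT] = { -1, 0, 1, -1 };

/* returns 0 on success, -1 if setting dev's after = target would create a cycle */
static int check_sync_after(int dev, int target)
{
    int odev = target;

    if (target == -1)
        return 0;                    /* removing the dependency is always fine */
    if (target < -1 || target >= MINOR_COUNT)
        return -1;                   /* no such device */

    while (1) {
        if (odev == dev)
            return -1;               /* walking the chain led back to us: cycle */
        if (sync_after[odev] == -1)
            return 0;                /* dependency chain ends here, no cycle */
        odev = sync_after[odev];     /* follow the dependency chain */
    }
}

int main(void)
{
    printf("3 after 2: %s\n", check_sync_after(3, 2) ? "cycle" : "ok");
    printf("0 after 2: %s\n", check_sync_after(0, 2) ? "cycle" : "ok"); /* 2->1->0 loops */
    return 0;
}

As in sync_after_error(), termination relies on already-accepted configurations being acyclic, so the walk always reaches a device whose after field is -1.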
+91
drivers/block/drbd/drbd_wrappers.h
··· 1 + #ifndef _DRBD_WRAPPERS_H 2 + #define _DRBD_WRAPPERS_H 3 + 4 + #include <linux/ctype.h> 5 + #include <linux/mm.h> 6 + 7 + /* see get_sb_bdev and bd_claim */ 8 + extern char *drbd_sec_holder; 9 + 10 + /* sets the number of 512 byte sectors of our virtual device */ 11 + static inline void drbd_set_my_capacity(struct drbd_conf *mdev, 12 + sector_t size) 13 + { 14 + /* set_capacity(mdev->this_bdev->bd_disk, size); */ 15 + set_capacity(mdev->vdisk, size); 16 + mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9; 17 + } 18 + 19 + #define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) 20 + 21 + static inline int drbd_bio_has_active_page(struct bio *bio) 22 + { 23 + struct bio_vec *bvec; 24 + int i; 25 + 26 + __bio_for_each_segment(bvec, bio, i, 0) { 27 + if (page_count(bvec->bv_page) > 1) 28 + return 1; 29 + } 30 + 31 + return 0; 32 + } 33 + 34 + /* bi_end_io handlers */ 35 + extern void drbd_md_io_complete(struct bio *bio, int error); 36 + extern void drbd_endio_read_sec(struct bio *bio, int error); 37 + extern void drbd_endio_write_sec(struct bio *bio, int error); 38 + extern void drbd_endio_pri(struct bio *bio, int error); 39 + 40 + /* 41 + * used to submit our private bio 42 + */ 43 + static inline void drbd_generic_make_request(struct drbd_conf *mdev, 44 + int fault_type, struct bio *bio) 45 + { 46 + __release(local); 47 + if (!bio->bi_bdev) { 48 + printk(KERN_ERR "drbd%d: drbd_generic_make_request: " 49 + "bio->bi_bdev == NULL\n", 50 + mdev_to_minor(mdev)); 51 + dump_stack(); 52 + bio_endio(bio, -ENODEV); 53 + return; 54 + } 55 + 56 + if (FAULT_ACTIVE(mdev, fault_type)) 57 + bio_endio(bio, -EIO); 58 + else 59 + generic_make_request(bio); 60 + } 61 + 62 + static inline void drbd_plug_device(struct drbd_conf *mdev) 63 + { 64 + struct request_queue *q; 65 + q = bdev_get_queue(mdev->this_bdev); 66 + 67 + spin_lock_irq(q->queue_lock); 68 + 69 + /* XXX the check on !blk_queue_plugged is redundant, 70 + * implicitly checked in blk_plug_device */ 71 + 72 + if (!blk_queue_plugged(q)) { 73 + blk_plug_device(q); 74 + del_timer(&q->unplug_timer); 75 + /* unplugging should not happen automatically... */ 76 + } 77 + spin_unlock_irq(q->queue_lock); 78 + } 79 + 80 + static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) 81 + { 82 + return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK) 83 + == CRYPTO_ALG_TYPE_HASH; 84 + } 85 + 86 + #ifndef __CHECKER__ 87 + # undef __cond_lock 88 + # define __cond_lock(x,c) (c) 89 + #endif 90 + 91 + #endif
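drbd_generic_make_request() above shows a common fault-injection shape: when the matching fault type is armed, complete the request immediately with -EIO instead of submitting it. A minimal userspace analogue of the FAULT_ACTIVE() check (the fault mask, rate, and submit stand-in here are made up for the sketch):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

enum fault_type { FAULT_MD_WR = 1 << 0, FAULT_DT_RD = 1 << 1 };

static unsigned fault_mask = FAULT_DT_RD;  /* which fault types are armed */
static int fault_rate = 25;                /* fail ~25% of matching requests */

static int fault_active(enum fault_type type)
{
    return (fault_mask & type) && (rand() % 100) < fault_rate;
}

/* stand-in for generic_make_request(): "submit" a request, return 0 or -EIO */
static int submit(enum fault_type type, const char *what)
{
    if (fault_active(type)) {
        fprintf(stderr, "injected -EIO for %s\n", what);
        return -EIO;
    }
    printf("submitted %s\n", what);
    return 0;
}

int main(void)
{
    for (int i = 0; i < 8; i++)
        submit(FAULT_DT_RD, "data read");
    return 0;
}

Keeping the injection at the single submission choke point, as the wrapper does, means every I/O path is covered without touching the callers.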
+9 -1
drivers/block/ps3vram.c
··· 123 123 { 124 124 struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); 125 125 u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); 126 - unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms); 126 + unsigned long timeout; 127 + 128 + for (timeout = 20; timeout; timeout--) { 129 + if (!notify[3]) 130 + return 0; 131 + udelay(10); 132 + } 133 + 134 + timeout = jiffies + msecs_to_jiffies(timeout_ms); 127 135 128 136 do { 129 137 if (!notify[3])
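The ps3vram change is a classic two-phase wait: spin briefly (20 polls, 10 µs apart) for the common fast-completion case, then fall back to the coarser jiffies-based timeout loop so a slow device does not burn CPU. A userspace sketch of the same pattern, with a hypothetical ready() predicate standing in for the notifier word (usleep() only approximates udelay() here):

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile int done;          /* stands in for notify[3] */

static int ready(void) { return done; }

static int wait_two_phase(int spin_tries, long timeout_ms)
{
    /* phase 1: short busy-poll for the common fast case */
    for (int i = 0; i < spin_tries; i++) {
        if (ready())
            return 0;
        usleep(10);                /* ~udelay(10) */
    }
    /* phase 2: coarse sleeping wait with a deadline */
    struct timespec t0, t;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (;;) {
        if (ready())
            return 0;
        clock_gettime(CLOCK_MONOTONIC, &t);
        long ms = (t.tv_sec - t0.tv_sec) * 1000 + (t.tv_nsec - t0.tv_nsec) / 1000000;
        if (ms >= timeout_ms)
            return -1;             /* timed out */
        usleep(1000);              /* sleep, don't spin */
    }
}

int main(void)
{
    done = 1;                      /* fast path: first poll succeeds */
    printf("%d\n", wait_two_phase(20, 200));
    return 0;
}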
+2
drivers/mtd/mtd_blkdevs.c
··· 59 59 for (; nsect > 0; nsect--, block++, buf += tr->blksize) 60 60 if (tr->readsect(dev, block, buf)) 61 61 return -EIO; 62 + rq_flush_dcache_pages(req); 62 63 return 0; 63 64 64 65 case WRITE: 65 66 if (!tr->writesect) 66 67 return -EIO; 67 68 69 + rq_flush_dcache_pages(req); 68 70 for (; nsect > 0; nsect--, block++, buf += tr->blksize) 69 71 if (tr->writesect(dev, block, buf)) 70 72 return -EIO;
-10
drivers/staging/pohmelfs/inode.c
··· 143 143 struct inode *inode = mapping->host; 144 144 struct pohmelfs_inode *pi = POHMELFS_I(inode); 145 145 struct pohmelfs_sb *psb = POHMELFS_SB(inode->i_sb); 146 - struct backing_dev_info *bdi = mapping->backing_dev_info; 147 146 int err = 0; 148 147 int done = 0; 149 148 int nr_pages; ··· 150 151 pgoff_t end; /* Inclusive */ 151 152 int scanned = 0; 152 153 int range_whole = 0; 153 - 154 - if (wbc->nonblocking && bdi_write_congested(bdi)) { 155 - wbc->encountered_congestion = 1; 156 - return 0; 157 - } 158 154 159 155 if (wbc->range_cyclic) { 160 156 index = mapping->writeback_index; /* Start from prev offset */ ··· 242 248 243 249 if (wbc->nr_to_write <= 0) 244 250 done = 1; 245 - if (wbc->nonblocking && bdi_write_congested(bdi)) { 246 - wbc->encountered_congestion = 1; 247 - done = 1; 248 - } 249 251 250 252 continue; 251 253 out_continue:
+60 -2
fs/aio.c
··· 15 15 #include <linux/aio_abi.h> 16 16 #include <linux/module.h> 17 17 #include <linux/syscalls.h> 18 + #include <linux/backing-dev.h> 18 19 #include <linux/uio.h> 19 20 20 21 #define DEBUG 0 ··· 33 32 #include <linux/workqueue.h> 34 33 #include <linux/security.h> 35 34 #include <linux/eventfd.h> 35 + #include <linux/blkdev.h> 36 + #include <linux/mempool.h> 37 + #include <linux/hash.h> 36 38 37 39 #include <asm/kmap_types.h> 38 40 #include <asm/uaccess.h> ··· 64 60 static DEFINE_SPINLOCK(fput_lock); 65 61 static LIST_HEAD(fput_head); 66 62 63 + #define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */ 64 + #define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS) 65 + struct aio_batch_entry { 66 + struct hlist_node list; 67 + struct address_space *mapping; 68 + }; 69 + mempool_t *abe_pool; 70 + 67 71 static void aio_kick_handler(struct work_struct *); 68 72 static void aio_queue_work(struct kioctx *); 69 73 ··· 85 73 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 86 74 87 75 aio_wq = create_workqueue("aio"); 76 + abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); 77 + BUG_ON(!abe_pool); 88 78 89 79 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); 90 80 ··· 1545 1531 return 1; 1546 1532 } 1547 1533 1534 + static void aio_batch_add(struct address_space *mapping, 1535 + struct hlist_head *batch_hash) 1536 + { 1537 + struct aio_batch_entry *abe; 1538 + struct hlist_node *pos; 1539 + unsigned bucket; 1540 + 1541 + bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS); 1542 + hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) { 1543 + if (abe->mapping == mapping) 1544 + return; 1545 + } 1546 + 1547 + abe = mempool_alloc(abe_pool, GFP_KERNEL); 1548 + BUG_ON(!igrab(mapping->host)); 1549 + abe->mapping = mapping; 1550 + hlist_add_head(&abe->list, &batch_hash[bucket]); 1551 + return; 1552 + } 1553 + 1554 + static void aio_batch_free(struct hlist_head *batch_hash) 1555 + { 1556 + struct aio_batch_entry *abe; 1557 + struct hlist_node *pos, *n; 1558 + int i; 1559 + 1560 + for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) { 1561 + hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) { 1562 + blk_run_address_space(abe->mapping); 1563 + iput(abe->mapping->host); 1564 + hlist_del(&abe->list); 1565 + mempool_free(abe, abe_pool); 1566 + } 1567 + } 1568 + } 1569 + 1548 1570 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1549 - struct iocb *iocb) 1571 + struct iocb *iocb, struct hlist_head *batch_hash) 1550 1572 { 1551 1573 struct kiocb *req; 1552 1574 struct file *file; ··· 1658 1608 ; 1659 1609 } 1660 1610 spin_unlock_irq(&ctx->ctx_lock); 1611 + if (req->ki_opcode == IOCB_CMD_PREAD || 1612 + req->ki_opcode == IOCB_CMD_PREADV || 1613 + req->ki_opcode == IOCB_CMD_PWRITE || 1614 + req->ki_opcode == IOCB_CMD_PWRITEV) 1615 + aio_batch_add(file->f_mapping, batch_hash); 1616 + 1661 1617 aio_put_req(req); /* drop extra ref to req */ 1662 1618 return 0; 1663 1619 ··· 1691 1635 struct kioctx *ctx; 1692 1636 long ret = 0; 1693 1637 int i; 1638 + struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, }; 1694 1639 1695 1640 if (unlikely(nr < 0)) 1696 1641 return -EINVAL; ··· 1723 1666 break; 1724 1667 } 1725 1668 1726 - ret = io_submit_one(ctx, user_iocb, &tmp); 1669 + ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); 1727 1670 if (ret) 1728 1671 break; 1729 1672 } 1673 + aio_batch_free(batch_hash); 1730 1674 1731 1675 put_ioctx(ctx); 1732 1676 return i ? i : ret;
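The io_submit() batching above boils down to: collect the distinct address_space pointers touched by one batch into a tiny on-stack hash, then kick each underlying queue exactly once in aio_batch_free(). A self-contained sketch of the dedup-by-pointer-hash part (the hash function and list handling are simplified stand-ins for the kernel's hash_ptr() and hlist helpers):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define BATCH_HASH_BITS 3
#define BATCH_HASH_SIZE (1 << BATCH_HASH_BITS)

struct entry {
    const void *key;
    struct entry *next;
};

/* multiplicative pointer hash folded down to BATCH_HASH_BITS bits */
static unsigned hash_ptr(const void *p)
{
    return (unsigned)(((uintptr_t)p * 0x9E3779B97F4A7C15ull) >> (64 - BATCH_HASH_BITS));
}

/* remember key once; duplicates within the same batch are ignored */
static void batch_add(struct entry **hash, const void *key)
{
    unsigned b = hash_ptr(key);
    for (struct entry *e = hash[b]; e; e = e->next)
        if (e->key == key)
            return;
    struct entry *e = malloc(sizeof(*e));
    e->key = key;
    e->next = hash[b];
    hash[b] = e;
}

/* visit every distinct key exactly once, then free the table */
static void batch_free(struct entry **hash)
{
    for (int i = 0; i < BATCH_HASH_SIZE; i++)
        while (hash[i]) {
            struct entry *e = hash[i];
            hash[i] = e->next;
            printf("run queue for %p\n", e->key);  /* ~blk_run_address_space() */
            free(e);
        }
}

int main(void)
{
    struct entry *hash[BATCH_HASH_SIZE] = { 0 };
    int a, b;
    batch_add(hash, &a);
    batch_add(hash, &b);
    batch_add(hash, &a);   /* duplicate: no second entry */
    batch_free(hash);      /* prints two lines */
    return 0;
}

With 3 hash bits the table is only eight pointers, which is why the kernel can afford to keep it on the io_submit() stack.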
+12
fs/bio.c
··· 1393 1393 } 1394 1394 } 1395 1395 1396 + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1397 + void bio_flush_dcache_pages(struct bio *bi) 1398 + { 1399 + int i; 1400 + struct bio_vec *bvec; 1401 + 1402 + bio_for_each_segment(bvec, bi, i) 1403 + flush_dcache_page(bvec->bv_page); 1404 + } 1405 + EXPORT_SYMBOL(bio_flush_dcache_pages); 1406 + #endif 1407 + 1396 1408 /** 1397 1409 * bio_endio - end I/O on a bio 1398 1410 * @bio: bio
+11 -1
fs/block_dev.c
··· 405 405 406 406 static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) 407 407 { 408 - return sync_blockdev(I_BDEV(filp->f_mapping->host)); 408 + struct block_device *bdev = I_BDEV(filp->f_mapping->host); 409 + int error; 410 + 411 + error = sync_blockdev(bdev); 412 + if (error) 413 + return error; 414 + 415 + error = blkdev_issue_flush(bdev, NULL); 416 + if (error == -EOPNOTSUPP) 417 + error = 0; 418 + return error; 409 419 } 410 420 411 421 /*
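block_fsync() now pushes dirty data out and then asks the device to flush its volatile write cache; devices without a cache-flush command return -EOPNOTSUPP, which is deliberately folded into success since there is no cache to flush. The error-folding pattern, sketched in userspace with stand-in functions:

#include <errno.h>
#include <stdio.h>

static int sync_data(void)   { return 0; }           /* ~sync_blockdev() */
static int flush_cache(void) { return -EOPNOTSUPP; } /* device has no flush cmd */

static int my_fsync(void)
{
    int error = sync_data();
    if (error)
        return error;                 /* data didn't make it out: hard failure */

    error = flush_cache();
    if (error == -EOPNOTSUPP)
        error = 0;  /* optional capability: lacking it is not a failure */
    return error;
}

int main(void)
{
    printf("fsync -> %d\n", my_fsync());  /* prints 0 */
    return 0;
}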
+5 -5
fs/direct-io.c
··· 1028 1028 if (dio->bio) 1029 1029 dio_bio_submit(dio); 1030 1030 1031 - /* All IO is now issued, send it on its way */ 1032 - blk_run_address_space(inode->i_mapping); 1033 - 1034 1031 /* 1035 1032 * It is possible that, we return short IO due to end of file. 1036 1033 * In that case, we need to release all the pages we got hold on. ··· 1054 1057 ((rw & READ) || (dio->result == dio->size))) 1055 1058 ret = -EIOCBQUEUED; 1056 1059 1057 - if (ret != -EIOCBQUEUED) 1060 + if (ret != -EIOCBQUEUED) { 1061 + /* All IO is now issued, send it on its way */ 1062 + blk_run_address_space(inode->i_mapping); 1058 1063 dio_await_completion(dio); 1064 + } 1059 1065 1060 1066 /* 1061 1067 * Sync will always be dropping the final ref and completing the ··· 1124 1124 int acquire_i_mutex = 0; 1125 1125 1126 1126 if (rw & WRITE) 1127 - rw = WRITE_ODIRECT; 1127 + rw = WRITE_ODIRECT_PLUG; 1128 1128 1129 1129 if (bdev) 1130 1130 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+1 -27
fs/fs-writeback.c
··· 614 614 struct writeback_control *wbc) 615 615 { 616 616 struct super_block *sb = wbc->sb, *pin_sb = NULL; 617 - const int is_blkdev_sb = sb_is_blkdev_sb(sb); 618 617 const unsigned long start = jiffies; /* livelock avoidance */ 619 618 620 619 spin_lock(&inode_lock); ··· 634 635 continue; 635 636 } 636 637 637 - if (!bdi_cap_writeback_dirty(wb->bdi)) { 638 - redirty_tail(inode); 639 - if (is_blkdev_sb) { 640 - /* 641 - * Dirty memory-backed blockdev: the ramdisk 642 - * driver does this. Skip just this inode 643 - */ 644 - continue; 645 - } 646 - /* 647 - * Dirty memory-backed inode against a filesystem other 648 - * than the kernel-internal bdev filesystem. Skip the 649 - * entire superblock. 650 - */ 651 - break; 652 - } 653 - 654 638 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 655 639 requeue_io(inode); 656 640 continue; 657 - } 658 - 659 - if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { 660 - wbc->encountered_congestion = 1; 661 - if (!is_blkdev_sb) 662 - break; /* Skip a congested fs */ 663 - requeue_io(inode); 664 - continue; /* Skip a congested blockdev */ 665 641 } 666 642 667 643 /* ··· 730 756 .sync_mode = args->sync_mode, 731 757 .older_than_this = NULL, 732 758 .for_kupdate = args->for_kupdate, 759 + .for_background = args->for_background, 733 760 .range_cyclic = args->range_cyclic, 734 761 }; 735 762 unsigned long oldest_jif; ··· 762 787 break; 763 788 764 789 wbc.more_io = 0; 765 - wbc.encountered_congestion = 0; 766 790 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 767 791 wbc.pages_skipped = 0; 768 792 writeback_inodes_wb(wb, &wbc);
+1 -1
fs/nfs/write.c
··· 178 178 { 179 179 if (wbc->for_reclaim) 180 180 return FLUSH_HIGHPRI | FLUSH_STABLE; 181 - if (wbc->for_kupdate) 181 + if (wbc->for_kupdate || wbc->for_background) 182 182 return FLUSH_LOWPRI; 183 183 return 0; 184 184 }
+12
fs/partitions/check.c
··· 226 226 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); 227 227 } 228 228 229 + ssize_t part_discard_alignment_show(struct device *dev, 230 + struct device_attribute *attr, char *buf) 231 + { 232 + struct hd_struct *p = dev_to_part(dev); 233 + return sprintf(buf, "%u\n", p->discard_alignment); 234 + } 235 + 229 236 ssize_t part_stat_show(struct device *dev, 230 237 struct device_attribute *attr, char *buf) 231 238 { ··· 295 288 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 296 289 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 297 290 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 291 + static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, 292 + NULL); 298 293 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 299 294 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); 300 295 #ifdef CONFIG_FAIL_MAKE_REQUEST ··· 309 300 &dev_attr_start.attr, 310 301 &dev_attr_size.attr, 311 302 &dev_attr_alignment_offset.attr, 303 + &dev_attr_discard_alignment.attr, 312 304 &dev_attr_stat.attr, 313 305 &dev_attr_inflight.attr, 314 306 #ifdef CONFIG_FAIL_MAKE_REQUEST ··· 413 403 414 404 p->start_sect = start; 415 405 p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); 406 + p->discard_alignment = queue_sector_discard_alignment(disk->queue, 407 + start); 416 408 p->nr_sects = len; 417 409 p->partno = partno; 418 410 p->policy = get_disk_ro(disk);
+19 -11
fs/partitions/efi.c
··· 1 1 /************************************************************ 2 2 * EFI GUID Partition Table handling 3 - * Per Intel EFI Specification v1.02 4 - * http://developer.intel.com/technology/efi/efi.htm 3 + * 4 + * http://www.uefi.org/specs/ 5 + * http://www.intel.com/technology/efi/ 6 + * 5 7 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> 6 8 * Copyright 2000,2001,2002,2004 Dell Inc. 7 9 * ··· 94 92 * 95 93 ************************************************************/ 96 94 #include <linux/crc32.h> 95 + #include <linux/math64.h> 97 96 #include "check.h" 98 97 #include "efi.h" 99 98 ··· 144 141 { 145 142 if (!bdev || !bdev->bd_inode) 146 143 return 0; 147 - return (bdev->bd_inode->i_size >> 9) - 1ULL; 144 + return div_u64(bdev->bd_inode->i_size, 145 + bdev_logical_block_size(bdev)) - 1ULL; 148 146 } 149 147 150 148 static inline int ··· 192 188 read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) 193 189 { 194 190 size_t totalreadcount = 0; 191 + sector_t n = lba * (bdev_logical_block_size(bdev) / 512); 195 192 196 193 if (!bdev || !buffer || lba > last_lba(bdev)) 197 194 return 0; ··· 200 195 while (count) { 201 196 int copied = 512; 202 197 Sector sect; 203 - unsigned char *data = read_dev_sector(bdev, lba++, &sect); 198 + unsigned char *data = read_dev_sector(bdev, n++, &sect); 204 199 if (!data) 205 200 break; 206 201 if (copied > count) ··· 262 257 alloc_read_gpt_header(struct block_device *bdev, u64 lba) 263 258 { 264 259 gpt_header *gpt; 260 + unsigned ssz = bdev_logical_block_size(bdev); 261 + 265 262 if (!bdev) 266 263 return NULL; 267 264 268 - gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL); 265 + gpt = kzalloc(ssz, GFP_KERNEL); 269 266 if (!gpt) 270 267 return NULL; 271 268 272 - if (read_lba(bdev, lba, (u8 *) gpt, 273 - sizeof (gpt_header)) < sizeof (gpt_header)) { 269 + if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { 274 270 kfree(gpt); 275 271 gpt=NULL; 276 272 return NULL; ··· 607 601 gpt_header *gpt = NULL; 608 602 gpt_entry *ptes = NULL; 609 603 u32 i; 604 + unsigned ssz = bdev_logical_block_size(bdev) / 512; 610 605 611 606 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { 612 607 kfree(gpt); ··· 618 611 pr_debug("GUID Partition Table is valid! Yea!\n"); 619 612 620 613 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { 614 + u64 start = le64_to_cpu(ptes[i].starting_lba); 615 + u64 size = le64_to_cpu(ptes[i].ending_lba) - 616 + le64_to_cpu(ptes[i].starting_lba) + 1ULL; 617 + 621 618 if (!is_pte_valid(&ptes[i], last_lba(bdev))) 622 619 continue; 623 620 624 - put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba), 625 - (le64_to_cpu(ptes[i].ending_lba) - 626 - le64_to_cpu(ptes[i].starting_lba) + 627 - 1ULL)); 621 + put_partition(state, i+1, start * ssz, size * ssz); 628 622 629 623 /* If this is a RAID volume, tell md */ 630 624 if (!efi_guidcmp(ptes[i].partition_type_guid,
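The thread running through the efi.c hunks is unit conversion: GPT stores LBAs in logical blocks, while the partition core counts 512-byte sectors, so every LBA is scaled by bdev_logical_block_size(bdev)/512 (1 on 512-byte drives, 8 on 4 KiB drives). A worked example of the arithmetic, assuming a hypothetical 8 GiB device with 4096-byte logical blocks:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t disk_bytes = 8ull << 30;        /* 8 GiB device */
    unsigned bsz = 4096;                     /* logical block size */
    unsigned ssz = bsz / 512;                /* 512-byte sectors per block: 8 */

    /* last addressable GPT LBA, as in last_lba() */
    uint64_t last = disk_bytes / bsz - 1;    /* 2097151 */

    /* a GPT entry: starting_lba/ending_lba are in logical blocks */
    uint64_t start_lba = 256, end_lba = 262399;
    uint64_t size_lba  = end_lba - start_lba + 1;

    /* what put_partition() receives: 512-byte sector units */
    printf("last_lba=%llu start_sect=%llu nr_sects=%llu\n",
           (unsigned long long)last,
           (unsigned long long)(start_lba * ssz),   /* 2048 */
           (unsigned long long)(size_lba * ssz));   /* 2097152 sectors = 1 GiB */
    return 0;
}

The same scaling explains the read_lba() change: GPT LBA n lives at 512-byte sector n * ssz, and a whole logical block (ssz bytes times 512) must be allocated for the header, since the reserved tail now extends to the end of the block.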
+6 -2
fs/partitions/efi.h
··· 37 37 #define EFI_PMBR_OSTYPE_EFI 0xEF 38 38 #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE 39 39 40 - #define GPT_BLOCK_SIZE 512 41 40 #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL 42 41 #define GPT_HEADER_REVISION_V1 0x00010000 43 42 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1 ··· 78 79 __le32 num_partition_entries; 79 80 __le32 sizeof_partition_entry; 80 81 __le32 partition_entry_array_crc32; 81 - u8 reserved2[GPT_BLOCK_SIZE - 92]; 82 + 83 + /* The rest of the logical block is reserved by UEFI and must be zero. 84 + * EFI standard handles this by: 85 + * 86 + * uint8_t reserved2[ BlockSize - 92 ]; 87 + */ 82 88 } __attribute__ ((packed)) gpt_header; 83 89 84 90 typedef struct _gpt_entry_attributes {
-2
fs/read_write.c
··· 826 826 if (!(out_file->f_mode & FMODE_WRITE)) 827 827 goto fput_out; 828 828 retval = -EINVAL; 829 - if (!out_file->f_op || !out_file->f_op->sendpage) 830 - goto fput_out; 831 829 in_inode = in_file->f_path.dentry->d_inode; 832 830 out_inode = out_file->f_path.dentry->d_inode; 833 831 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
+15 -9
fs/splice.c
··· 648 648 ret = buf->ops->confirm(pipe, buf); 649 649 if (!ret) { 650 650 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; 651 - 652 - ret = file->f_op->sendpage(file, buf->page, buf->offset, 653 - sd->len, &pos, more); 651 + if (file->f_op && file->f_op->sendpage) 652 + ret = file->f_op->sendpage(file, buf->page, buf->offset, 653 + sd->len, &pos, more); 654 + else 655 + ret = -EINVAL; 654 656 } 655 657 656 658 return ret; ··· 1070 1068 if (unlikely(ret < 0)) 1071 1069 return ret; 1072 1070 1073 - splice_write = out->f_op->splice_write; 1074 - if (!splice_write) 1071 + if (out->f_op && out->f_op->splice_write) 1072 + splice_write = out->f_op->splice_write; 1073 + else 1075 1074 splice_write = default_file_splice_write; 1076 1075 1077 1076 return splice_write(pipe, out, ppos, len, flags); ··· 1096 1093 if (unlikely(ret < 0)) 1097 1094 return ret; 1098 1095 1099 - splice_read = in->f_op->splice_read; 1100 - if (!splice_read) 1096 + if (in->f_op && in->f_op->splice_read) 1097 + splice_read = in->f_op->splice_read; 1098 + else 1101 1099 splice_read = default_file_splice_read; 1102 1100 1103 1101 return splice_read(in, ppos, pipe, len, flags); ··· 1320 1316 if (off_in) 1321 1317 return -ESPIPE; 1322 1318 if (off_out) { 1323 - if (out->f_op->llseek == no_llseek) 1319 + if (!out->f_op || !out->f_op->llseek || 1320 + out->f_op->llseek == no_llseek) 1324 1321 return -EINVAL; 1325 1322 if (copy_from_user(&offset, off_out, sizeof(loff_t))) 1326 1323 return -EFAULT; ··· 1341 1336 if (off_out) 1342 1337 return -ESPIPE; 1343 1338 if (off_in) { 1344 - if (in->f_op->llseek == no_llseek) 1339 + if (!in->f_op || !in->f_op->llseek || 1340 + in->f_op->llseek == no_llseek) 1345 1341 return -EINVAL; 1346 1342 if (copy_from_user(&offset, off_in, sizeof(loff_t))) 1347 1343 return -EFAULT;
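The splice.c changes all follow one defensive-dispatch pattern: never call through a file_operations pointer without checking both the ops table and the method, and fall back to a generic implementation when the method is absent. In miniature (the ops struct and default function here are invented for the sketch):

#include <stdio.h>

struct ops {
    int (*splice_write)(const char *buf);   /* may be NULL */
};

static int default_splice_write(const char *buf)
{
    printf("default path: %s\n", buf);
    return 0;
}

static int fancy_splice_write(const char *buf)
{
    printf("fast path: %s\n", buf);
    return 0;
}

static int do_splice(const struct ops *f_op, const char *buf)
{
    int (*fn)(const char *);

    /* guard both the table and the method before calling through */
    if (f_op && f_op->splice_write)
        fn = f_op->splice_write;
    else
        fn = default_splice_write;
    return fn(buf);
}

int main(void)
{
    struct ops with = { fancy_splice_write }, without = { 0 };
    do_splice(&with, "hello");
    do_splice(&without, "hello");
    do_splice(NULL, "hello");
    return 0;
}

Centralizing the fallback in the dispatcher is what lets fs/read_write.c drop its own sendpage check: the splice path now degrades gracefully instead of requiring every caller to pre-validate.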
+1 -8
fs/xfs/linux-2.6/xfs_aops.c
··· 904 904 905 905 if (startio) { 906 906 if (count) { 907 - struct backing_dev_info *bdi; 908 - 909 - bdi = inode->i_mapping->backing_dev_info; 910 907 wbc->nr_to_write--; 911 - if (bdi_write_congested(bdi)) { 912 - wbc->encountered_congestion = 1; 908 + if (wbc->nr_to_write <= 0) 913 909 done = 1; 914 - } else if (wbc->nr_to_write <= 0) { 915 - done = 1; 916 - } 917 910 } 918 911 xfs_start_page_writeback(page, !page_dirty, count); 919 912 }
+1
include/asm-generic/cacheflush.h
··· 13 13 #define flush_cache_dup_mm(mm) do { } while (0) 14 14 #define flush_cache_range(vma, start, end) do { } while (0) 15 15 #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 16 + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 16 17 #define flush_dcache_page(page) do { } while (0) 17 18 #define flush_dcache_mmap_lock(mapping) do { } while (0) 18 19 #define flush_dcache_mmap_unlock(mapping) do { } while (0)
+13
include/linux/backing-dev.h
··· 331 331 return 0; 332 332 } 333 333 334 + static inline void blk_run_backing_dev(struct backing_dev_info *bdi, 335 + struct page *page) 336 + { 337 + if (bdi && bdi->unplug_io_fn) 338 + bdi->unplug_io_fn(bdi, page); 339 + } 340 + 341 + static inline void blk_run_address_space(struct address_space *mapping) 342 + { 343 + if (mapping) 344 + blk_run_backing_dev(mapping->backing_dev_info, NULL); 345 + } 346 + 334 347 #endif /* _LINUX_BACKING_DEV_H */
+14 -6
include/linux/bio.h
··· 391 391 gfp_t, int); 392 392 extern void bio_set_pages_dirty(struct bio *bio); 393 393 extern void bio_check_pages_dirty(struct bio *bio); 394 + 395 + #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 396 + # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" 397 + #endif 398 + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 399 + extern void bio_flush_dcache_pages(struct bio *bi); 400 + #else 401 + static inline void bio_flush_dcache_pages(struct bio *bi) 402 + { 403 + } 404 + #endif 405 + 394 406 extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, 395 407 unsigned long, unsigned int, int, gfp_t); 396 408 extern struct bio *bio_copy_user_iov(struct request_queue *, ··· 462 450 /* 463 451 * remember never ever reenable interrupts between a bvec_kmap_irq and 464 452 * bvec_kunmap_irq! 465 - * 466 - * This function MUST be inlined - it plays with the CPU interrupt flags. 467 453 */ 468 - static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec, 469 - unsigned long *flags) 454 + static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) 470 455 { 471 456 unsigned long addr; 472 457 ··· 479 470 return (char *) addr + bvec->bv_offset; 480 471 } 481 472 482 - static __always_inline void bvec_kunmap_irq(char *buffer, 483 - unsigned long *flags) 473 + static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) 484 474 { 485 475 unsigned long ptr = (unsigned long) buffer & PAGE_MASK; 486 476
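The new ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE plumbing is a standard compile-time capability switch: every architecture must take a position (the #error enforces it), and consumers either get a real out-of-line function or an empty inline the compiler deletes. The skeleton of the pattern, stripped of kernel types (the flush_pages name is invented):

#include <stdio.h>

/* each "arch" header must take a position; here we pretend it said 0 */
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0

#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
#endif

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
void flush_pages(void *p);               /* real implementation elsewhere */
#else
static inline void flush_pages(void *p)  /* no-op: costs nothing at runtime */
{
    (void)p;
}
#endif

int main(void)
{
    int page;
    flush_pages(&page);  /* compiles either way; vanishes on this "arch" */
    puts("ok");
    return 0;
}

Requiring an explicit 0 or 1 (rather than testing #ifdef) is the point: a platform that forgets to define the macro fails to build instead of silently skipping cache maintenance.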
+43 -13
include/linux/blkdev.h
··· 312 312 unsigned int io_min; 313 313 unsigned int io_opt; 314 314 unsigned int max_discard_sectors; 315 + unsigned int discard_granularity; 316 + unsigned int discard_alignment; 315 317 316 318 unsigned short logical_block_size; 317 319 unsigned short max_hw_segments; 318 320 unsigned short max_phys_segments; 319 321 320 322 unsigned char misaligned; 323 + unsigned char discard_misaligned; 321 324 unsigned char no_cluster; 325 + signed char discard_zeroes_data; 322 326 }; 323 327 324 328 struct request_queue ··· 753 749 #define rq_iter_last(rq, _iter) \ 754 750 (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1) 755 751 752 + #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 753 + # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" 754 + #endif 755 + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 756 + extern void rq_flush_dcache_pages(struct request *rq); 757 + #else 758 + static inline void rq_flush_dcache_pages(struct request *rq) 759 + { 760 + } 761 + #endif 762 + 756 763 extern int blk_register_queue(struct gendisk *disk); 757 764 extern void blk_unregister_queue(struct gendisk *disk); 758 765 extern void register_disk(struct gendisk *dev); ··· 836 821 static inline struct request_queue *bdev_get_queue(struct block_device *bdev) 837 822 { 838 823 return bdev->bd_disk->queue; 839 - } 840 - 841 - static inline void blk_run_backing_dev(struct backing_dev_info *bdi, 842 - struct page *page) 843 - { 844 - if (bdi && bdi->unplug_io_fn) 845 - bdi->unplug_io_fn(bdi, page); 846 - } 847 - 848 - static inline void blk_run_address_space(struct address_space *mapping) 849 - { 850 - if (mapping) 851 - blk_run_backing_dev(mapping->backing_dev_info, NULL); 852 824 } 853 825 854 826 /* ··· 1134 1132 return bdev->bd_part->alignment_offset; 1135 1133 1136 1134 return q->limits.alignment_offset; 1135 + } 1136 + 1137 + static inline int queue_discard_alignment(struct request_queue *q) 1138 + { 1139 + if (q->limits.discard_misaligned) 1140 + return -1; 1141 + 1142 + return q->limits.discard_alignment; 1143 + } 1144 + 1145 + static inline int queue_sector_discard_alignment(struct request_queue *q, 1146 + sector_t sector) 1147 + { 1148 + return ((sector << 9) - q->limits.discard_alignment) 1149 + & (q->limits.discard_granularity - 1); 1150 + } 1151 + 1152 + static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) 1153 + { 1154 + if (q->limits.discard_zeroes_data == 1) 1155 + return 1; 1156 + 1157 + return 0; 1158 + } 1159 + 1160 + static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) 1161 + { 1162 + return queue_discard_zeroes_data(bdev_get_queue(bdev)); 1137 1163 } 1138 1164 1139 1165 static inline int queue_dma_alignment(struct request_queue *q)
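queue_sector_discard_alignment() computes how far a given sector sits past the last discard-granularity boundary: take sector << 9 to get bytes, subtract the alignment offset, and mask with (granularity - 1), which assumes discard_granularity is a power of two. A worked example with assumed values (1 MiB granularity, 512 KiB alignment offset):

#include <stdio.h>

int main(void)
{
    unsigned long long sector = 3072;          /* 512-byte sector number */
    unsigned granularity = 1 << 20;            /* 1 MiB discard granularity */
    unsigned alignment = 512 << 10;            /* first boundary at 512 KiB */

    /* ((sector << 9) - alignment) & (granularity - 1) */
    unsigned long long bytes = sector << 9;    /* 1572864 = 1.5 MiB */
    unsigned long long off = (bytes - alignment) & (granularity - 1);

    printf("offset into discard granule: %llu bytes\n", off); /* 0: aligned */

    sector = 3073;                             /* one sector later */
    off = ((sector << 9) - alignment) & (granularity - 1);
    printf("offset into discard granule: %llu bytes\n", off); /* 512 */
    return 0;
}

A result of zero means a discard starting at that sector lines up with an erase-block boundary; fs/partitions/check.c stores exactly this value per partition as discard_alignment.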
+6
include/linux/cgroup_subsys.h
··· 60 60 #endif 61 61 62 62 /* */ 63 + 64 + #ifdef CONFIG_BLK_CGROUP 65 + SUBSYS(blkio) 66 + #endif 67 + 68 + /* */
+2
include/linux/connector.h
··· 43 43 #define CN_DST_VAL 0x1 44 44 #define CN_IDX_DM 0x7 /* Device Mapper */ 45 45 #define CN_VAL_DM_USERSPACE_LOG 0x1 46 + #define CN_IDX_DRBD 0x8 47 + #define CN_VAL_DRBD 0x1 46 48 47 49 #define CN_NETLINK_USERS 8 48 50
+343
include/linux/drbd.h
··· 1 + /* 2 + drbd.h 3 + Kernel module for 2.6.x Kernels 4 + 5 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 6 + 7 + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 8 + Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>. 9 + Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 10 + 11 + drbd is free software; you can redistribute it and/or modify 12 + it under the terms of the GNU General Public License as published by 13 + the Free Software Foundation; either version 2, or (at your option) 14 + any later version. 15 + 16 + drbd is distributed in the hope that it will be useful, 17 + but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + GNU General Public License for more details. 20 + 21 + You should have received a copy of the GNU General Public License 22 + along with drbd; see the file COPYING. If not, write to 23 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 24 + 25 + */ 26 + #ifndef DRBD_H 27 + #define DRBD_H 28 + #include <linux/connector.h> 29 + #include <asm/types.h> 30 + 31 + #ifdef __KERNEL__ 32 + #include <linux/types.h> 33 + #include <asm/byteorder.h> 34 + #else 35 + #include <sys/types.h> 36 + #include <sys/wait.h> 37 + #include <limits.h> 38 + 39 + /* Although the Linux source code distinguishes between 40 + generic endianness and the bitfields' endianness, there is no 41 + architecture as of Linux-2.6.24-rc4 where the bitfields' endianness 42 + does not match the generic endianness. */ 43 + 44 + #if __BYTE_ORDER == __LITTLE_ENDIAN 45 + #define __LITTLE_ENDIAN_BITFIELD 46 + #elif __BYTE_ORDER == __BIG_ENDIAN 47 + #define __BIG_ENDIAN_BITFIELD 48 + #else 49 + # error "sorry, weird endianness on this box" 50 + #endif 51 + 52 + #endif 53 + 54 + 55 + extern const char *drbd_buildtag(void); 56 + #define REL_VERSION "8.3.6" 57 + #define API_VERSION 88 58 + #define PRO_VERSION_MIN 86 59 + #define PRO_VERSION_MAX 91 60 + 61 + 62 + enum drbd_io_error_p { 63 + EP_PASS_ON, /* FIXME should this better be named "Ignore"? */ 64 + EP_CALL_HELPER, 65 + EP_DETACH 66 + }; 67 + 68 + enum drbd_fencing_p { 69 + FP_DONT_CARE, 70 + FP_RESOURCE, 71 + FP_STONITH 72 + }; 73 + 74 + enum drbd_disconnect_p { 75 + DP_RECONNECT, 76 + DP_DROP_NET_CONF, 77 + DP_FREEZE_IO 78 + }; 79 + 80 + enum drbd_after_sb_p { 81 + ASB_DISCONNECT, 82 + ASB_DISCARD_YOUNGER_PRI, 83 + ASB_DISCARD_OLDER_PRI, 84 + ASB_DISCARD_ZERO_CHG, 85 + ASB_DISCARD_LEAST_CHG, 86 + ASB_DISCARD_LOCAL, 87 + ASB_DISCARD_REMOTE, 88 + ASB_CONSENSUS, 89 + ASB_DISCARD_SECONDARY, 90 + ASB_CALL_HELPER, 91 + ASB_VIOLENTLY 92 + }; 93 + 94 + /* KEEP the order, do not delete or insert. Only append. 
*/ 95 + enum drbd_ret_codes { 96 + ERR_CODE_BASE = 100, 97 + NO_ERROR = 101, 98 + ERR_LOCAL_ADDR = 102, 99 + ERR_PEER_ADDR = 103, 100 + ERR_OPEN_DISK = 104, 101 + ERR_OPEN_MD_DISK = 105, 102 + ERR_DISK_NOT_BDEV = 107, 103 + ERR_MD_NOT_BDEV = 108, 104 + ERR_DISK_TO_SMALL = 111, 105 + ERR_MD_DISK_TO_SMALL = 112, 106 + ERR_BDCLAIM_DISK = 114, 107 + ERR_BDCLAIM_MD_DISK = 115, 108 + ERR_MD_IDX_INVALID = 116, 109 + ERR_IO_MD_DISK = 118, 110 + ERR_MD_INVALID = 119, 111 + ERR_AUTH_ALG = 120, 112 + ERR_AUTH_ALG_ND = 121, 113 + ERR_NOMEM = 122, 114 + ERR_DISCARD = 123, 115 + ERR_DISK_CONFIGURED = 124, 116 + ERR_NET_CONFIGURED = 125, 117 + ERR_MANDATORY_TAG = 126, 118 + ERR_MINOR_INVALID = 127, 119 + ERR_INTR = 129, /* EINTR */ 120 + ERR_RESIZE_RESYNC = 130, 121 + ERR_NO_PRIMARY = 131, 122 + ERR_SYNC_AFTER = 132, 123 + ERR_SYNC_AFTER_CYCLE = 133, 124 + ERR_PAUSE_IS_SET = 134, 125 + ERR_PAUSE_IS_CLEAR = 135, 126 + ERR_PACKET_NR = 137, 127 + ERR_NO_DISK = 138, 128 + ERR_NOT_PROTO_C = 139, 129 + ERR_NOMEM_BITMAP = 140, 130 + ERR_INTEGRITY_ALG = 141, /* DRBD 8.2 only */ 131 + ERR_INTEGRITY_ALG_ND = 142, /* DRBD 8.2 only */ 132 + ERR_CPU_MASK_PARSE = 143, /* DRBD 8.2 only */ 133 + ERR_CSUMS_ALG = 144, /* DRBD 8.2 only */ 134 + ERR_CSUMS_ALG_ND = 145, /* DRBD 8.2 only */ 135 + ERR_VERIFY_ALG = 146, /* DRBD 8.2 only */ 136 + ERR_VERIFY_ALG_ND = 147, /* DRBD 8.2 only */ 137 + ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */ 138 + ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */ 139 + ERR_DATA_NOT_CURRENT = 150, 140 + ERR_CONNECTED = 151, /* DRBD 8.3 only */ 141 + ERR_PERM = 152, 142 + 143 + /* insert new ones above this line */ 144 + AFTER_LAST_ERR_CODE 145 + }; 146 + 147 + #define DRBD_PROT_A 1 148 + #define DRBD_PROT_B 2 149 + #define DRBD_PROT_C 3 150 + 151 + enum drbd_role { 152 + R_UNKNOWN = 0, 153 + R_PRIMARY = 1, /* role */ 154 + R_SECONDARY = 2, /* role */ 155 + R_MASK = 3, 156 + }; 157 + 158 + /* The order of these constants is important. 159 + * The lower ones (<C_WF_REPORT_PARAMS) indicate 160 + * that there is no socket! 161 + * >=C_WF_REPORT_PARAMS ==> There is a socket 162 + */ 163 + enum drbd_conns { 164 + C_STANDALONE, 165 + C_DISCONNECTING, /* Transient state on the way to StandAlone. */ 166 + C_UNCONNECTED, /* >= C_UNCONNECTED -> inc_net() succeeds */ 167 + 168 + /* These transient states are all used on the way 169 + * from >= C_CONNECTED to Unconnected. 170 + * The 'disconnect reason' states; 171 + * I do not allow changing between them. */ 172 + C_TIMEOUT, 173 + C_BROKEN_PIPE, 174 + C_NETWORK_FAILURE, 175 + C_PROTOCOL_ERROR, 176 + C_TEAR_DOWN, 177 + 178 + C_WF_CONNECTION, 179 + C_WF_REPORT_PARAMS, /* we have a socket */ 180 + C_CONNECTED, /* we have introduced each other */ 181 + C_STARTING_SYNC_S, /* starting full sync by admin request. */ 182 + C_STARTING_SYNC_T, /* starting full sync by admin request. 
*/ 183 + C_WF_BITMAP_S, 184 + C_WF_BITMAP_T, 185 + C_WF_SYNC_UUID, 186 + 187 + /* All SyncStates are tested with this comparison 188 + * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */ 189 + C_SYNC_SOURCE, 190 + C_SYNC_TARGET, 191 + C_VERIFY_S, 192 + C_VERIFY_T, 193 + C_PAUSED_SYNC_S, 194 + C_PAUSED_SYNC_T, 195 + C_MASK = 31 196 + }; 197 + 198 + enum drbd_disk_state { 199 + D_DISKLESS, 200 + D_ATTACHING, /* In the process of reading the meta-data */ 201 + D_FAILED, /* Becomes D_DISKLESS as soon as we have told the peer */ 202 + /* when >= D_FAILED it is legal to access mdev->bc */ 203 + D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */ 204 + D_INCONSISTENT, 205 + D_OUTDATED, 206 + D_UNKNOWN, /* Only used for the peer, never for myself */ 207 + D_CONSISTENT, /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */ 208 + D_UP_TO_DATE, /* Only this disk state allows applications' IO ! */ 209 + D_MASK = 15 210 + }; 211 + 212 + union drbd_state { 213 + /* According to gcc's docs: 214 + * the order of allocation of bit-fields within a unit (C90 6.5.2.1, C99 6.7.2.1) 215 + * is determined by ABI. 216 + * pointed out by Maxim Uvarov <muvarov@ru.mvista.com> 217 + * even though we transmit as "cpu_to_be32(state)", 218 + * the offsets of the bitfields still need to be swapped 219 + * on different endianness. 220 + */ 221 + struct { 222 + #if defined(__LITTLE_ENDIAN_BITFIELD) 223 + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ 224 + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ 225 + unsigned conn:5 ; /* 17/32 cstates */ 226 + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ 227 + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ 228 + unsigned susp:1 ; /* 2/2 IO suspended no/yes */ 229 + unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ 230 + unsigned peer_isp:1 ; 231 + unsigned user_isp:1 ; 232 + unsigned _pad:11; /* 0 unused */ 233 + #elif defined(__BIG_ENDIAN_BITFIELD) 234 + unsigned _pad:11; /* 0 unused */ 235 + unsigned user_isp:1 ; 236 + unsigned peer_isp:1 ; 237 + unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ 238 + unsigned susp:1 ; /* 2/2 IO suspended no/yes */ 239 + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ 240 + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ 241 + unsigned conn:5 ; /* 17/32 cstates */ 242 + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ 243 + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ 244 + #else 245 + # error "this endianness is not supported" 246 + #endif 247 + }; 248 + unsigned int i; 249 + }; 250 + 251 + enum drbd_state_ret_codes { 252 + SS_CW_NO_NEED = 4, 253 + SS_CW_SUCCESS = 3, 254 + SS_NOTHING_TO_DO = 2, 255 + SS_SUCCESS = 1, 256 + SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */ 257 + SS_TWO_PRIMARIES = -1, 258 + SS_NO_UP_TO_DATE_DISK = -2, 259 + SS_NO_LOCAL_DISK = -4, 260 + SS_NO_REMOTE_DISK = -5, 261 + SS_CONNECTED_OUTDATES = -6, 262 + SS_PRIMARY_NOP = -7, 263 + SS_RESYNC_RUNNING = -8, 264 + SS_ALREADY_STANDALONE = -9, 265 + SS_CW_FAILED_BY_PEER = -10, 266 + SS_IS_DISKLESS = -11, 267 + SS_DEVICE_IN_USE = -12, 268 + SS_NO_NET_CONFIG = -13, 269 + SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */ 270 + SS_NEED_CONNECTION = -15, /* drbd-8.2 only */ 271 + SS_LOWER_THAN_OUTDATED = -16, 272 + SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ 273 + SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ 274 + SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! 
*/ 275 + SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */ 276 + }; 277 + 278 + /* from drbd_strings.c */ 279 + extern const char *drbd_conn_str(enum drbd_conns); 280 + extern const char *drbd_role_str(enum drbd_role); 281 + extern const char *drbd_disk_str(enum drbd_disk_state); 282 + extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes); 283 + 284 + #define SHARED_SECRET_MAX 64 285 + 286 + #define MDF_CONSISTENT (1 << 0) 287 + #define MDF_PRIMARY_IND (1 << 1) 288 + #define MDF_CONNECTED_IND (1 << 2) 289 + #define MDF_FULL_SYNC (1 << 3) 290 + #define MDF_WAS_UP_TO_DATE (1 << 4) 291 + #define MDF_PEER_OUT_DATED (1 << 5) 292 + #define MDF_CRASHED_PRIMARY (1 << 6) 293 + 294 + enum drbd_uuid_index { 295 + UI_CURRENT, 296 + UI_BITMAP, 297 + UI_HISTORY_START, 298 + UI_HISTORY_END, 299 + UI_SIZE, /* nl-packet: number of dirty bits */ 300 + UI_FLAGS, /* nl-packet: flags */ 301 + UI_EXTENDED_SIZE /* Everything. */ 302 + }; 303 + 304 + enum drbd_timeout_flag { 305 + UT_DEFAULT = 0, 306 + UT_DEGRADED = 1, 307 + UT_PEER_OUTDATED = 2, 308 + }; 309 + 310 + #define UUID_JUST_CREATED ((__u64)4) 311 + 312 + #define DRBD_MAGIC 0x83740267 313 + #define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) 314 + 315 + /* these are of type "int" */ 316 + #define DRBD_MD_INDEX_INTERNAL -1 317 + #define DRBD_MD_INDEX_FLEX_EXT -2 318 + #define DRBD_MD_INDEX_FLEX_INT -3 319 + 320 + /* Start of the new netlink/connector stuff */ 321 + 322 + #define DRBD_NL_CREATE_DEVICE 0x01 323 + #define DRBD_NL_SET_DEFAULTS 0x02 324 + 325 + 326 + /* For searching a vacant cn_idx value */ 327 + #define CN_IDX_STEP 6977 328 + 329 + struct drbd_nl_cfg_req { 330 + int packet_type; 331 + unsigned int drbd_minor; 332 + int flags; 333 + unsigned short tag_list[]; 334 + }; 335 + 336 + struct drbd_nl_cfg_reply { 337 + int packet_type; 338 + unsigned int minor; 339 + int ret_code; /* enum ret_code or set_st_err_t */ 340 + unsigned short tag_list[]; /* only used with get_* calls */ 341 + }; 342 + 343 + #endif
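union drbd_state packs the whole cluster state into one 32-bit word: code reads and writes named bitfields, while the wire path sends the overlaid integer i as cpu_to_be32(state), which is why the field order is spelled out twice, once per bitfield endianness. A little-endian-only userspace illustration (bitfield layout is compiler/ABI-dependent, which is exactly the caveat the header's comment makes; the enum values used match the header above, C_CONNECTED = 10 and D_UP_TO_DATE = 8):

#include <stdio.h>

union state {
    struct {                    /* __LITTLE_ENDIAN_BITFIELD order */
        unsigned role:2;
        unsigned peer:2;
        unsigned conn:5;
        unsigned disk:4;
        unsigned pdsk:4;
        unsigned susp:1;
        unsigned aftr_isp:1;
        unsigned peer_isp:1;
        unsigned user_isp:1;
        unsigned _pad:11;
    };
    unsigned int i;             /* the same 32 bits, as one word */
};

int main(void)
{
    union state s = { .i = 0 };

    s.role = 1;                 /* R_PRIMARY */
    s.conn = 10;                /* C_CONNECTED */
    s.disk = 8;                 /* D_UP_TO_DATE */

    /* one compare/assign handles every field at once */
    printf("state word = 0x%08x\n", s.i);

    union state t = { .i = s.i };   /* round-trips through the integer */
    printf("role=%u conn=%u disk=%u\n", t.role, t.conn, t.disk);
    return 0;
}

The single-word overlay is what makes idioms like the kernel's _NS(odev, aftr_isp, 1) cheap: a whole candidate state can be built, compared, and stored atomically as one unsigned int.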
+137
include/linux/drbd_limits.h
··· 1 + /* 2 + drbd_limits.h 3 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 4 + */ 5 + 6 + /* 7 + * Our current limitations. 8 + * Some of them are hard limits, 9 + * some of them are arbitrary range limits, that make it easier to provide 10 + * feedback about nonsense settings for certain configurable values. 11 + */ 12 + 13 + #ifndef DRBD_LIMITS_H 14 + #define DRBD_LIMITS_H 1 15 + 16 + #define DEBUG_RANGE_CHECK 0 17 + 18 + #define DRBD_MINOR_COUNT_MIN 1 19 + #define DRBD_MINOR_COUNT_MAX 255 20 + 21 + #define DRBD_DIALOG_REFRESH_MIN 0 22 + #define DRBD_DIALOG_REFRESH_MAX 600 23 + 24 + /* valid port number */ 25 + #define DRBD_PORT_MIN 1 26 + #define DRBD_PORT_MAX 0xffff 27 + 28 + /* startup { */ 29 + /* if you want more than 3.4 days, disable */ 30 + #define DRBD_WFC_TIMEOUT_MIN 0 31 + #define DRBD_WFC_TIMEOUT_MAX 300000 32 + #define DRBD_WFC_TIMEOUT_DEF 0 33 + 34 + #define DRBD_DEGR_WFC_TIMEOUT_MIN 0 35 + #define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 36 + #define DRBD_DEGR_WFC_TIMEOUT_DEF 0 37 + 38 + #define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 39 + #define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 40 + #define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 41 + /* }*/ 42 + 43 + /* net { */ 44 + /* timeout, unit tenths of a second 45 + * a timeout of more than one minute is not useful */ 46 + #define DRBD_TIMEOUT_MIN 1 47 + #define DRBD_TIMEOUT_MAX 600 48 + #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ 49 + 50 + /* active connection retries when C_WF_CONNECTION */ 51 + #define DRBD_CONNECT_INT_MIN 1 52 + #define DRBD_CONNECT_INT_MAX 120 53 + #define DRBD_CONNECT_INT_DEF 10 /* seconds */ 54 + 55 + /* keep-alive probes when idle */ 56 + #define DRBD_PING_INT_MIN 1 57 + #define DRBD_PING_INT_MAX 120 58 + #define DRBD_PING_INT_DEF 10 59 + 60 + /* timeout for the ping packets.*/ 61 + #define DRBD_PING_TIMEO_MIN 1 62 + #define DRBD_PING_TIMEO_MAX 100 63 + #define DRBD_PING_TIMEO_DEF 5 64 + 65 + /* max number of write requests between write barriers */ 66 + #define DRBD_MAX_EPOCH_SIZE_MIN 1 67 + #define DRBD_MAX_EPOCH_SIZE_MAX 20000 68 + #define DRBD_MAX_EPOCH_SIZE_DEF 2048 69 + 70 + /* I don't think that a tcp send buffer of more than 10M is useful */ 71 + #define DRBD_SNDBUF_SIZE_MIN 0 72 + #define DRBD_SNDBUF_SIZE_MAX (10<<20) 73 + #define DRBD_SNDBUF_SIZE_DEF 0 74 + 75 + #define DRBD_RCVBUF_SIZE_MIN 0 76 + #define DRBD_RCVBUF_SIZE_MAX (10<<20) 77 + #define DRBD_RCVBUF_SIZE_DEF 0 78 + 79 + /* @4k PageSize -> 128kB - 512MB */ 80 + #define DRBD_MAX_BUFFERS_MIN 32 81 + #define DRBD_MAX_BUFFERS_MAX 131072 82 + #define DRBD_MAX_BUFFERS_DEF 2048 83 + 84 + /* @4k PageSize -> 4kB - 512MB */ 85 + #define DRBD_UNPLUG_WATERMARK_MIN 1 86 + #define DRBD_UNPLUG_WATERMARK_MAX 131072 87 + #define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) 88 + 89 + /* 0 is disabled. 90 + * 200 should be more than enough even for very short timeouts */ 91 + #define DRBD_KO_COUNT_MIN 0 92 + #define DRBD_KO_COUNT_MAX 200 93 + #define DRBD_KO_COUNT_DEF 0 94 + /* } */ 95 + 96 + /* syncer { */ 97 + /* FIXME allow rate to be zero? */ 98 + #define DRBD_RATE_MIN 1 99 + /* channel bonding 10 GbE, or other hardware */ 100 + #define DRBD_RATE_MAX (4 << 20) 101 + #define DRBD_RATE_DEF 250 /* kb/second */ 102 + 103 + /* less than 7 would hit performance unnecessarily. 
104 + * 3833 is the largest prime that still does fit 105 + * into 64 sectors of activity log */ 106 + #define DRBD_AL_EXTENTS_MIN 7 107 + #define DRBD_AL_EXTENTS_MAX 3833 108 + #define DRBD_AL_EXTENTS_DEF 127 109 + 110 + #define DRBD_AFTER_MIN -1 111 + #define DRBD_AFTER_MAX 255 112 + #define DRBD_AFTER_DEF -1 113 + 114 + /* } */ 115 + 116 + /* drbdsetup XY resize -d Z 117 + * you are free to reduce the device size to nothing, if you want to. 118 + * the upper limit with 64bit kernel, enough ram and flexible meta data 119 + * is 16 TB, currently. */ 120 + /* DRBD_MAX_SECTORS */ 121 + #define DRBD_DISK_SIZE_SECT_MIN 0 122 + #define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30)) 123 + #define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */ 124 + 125 + #define DRBD_ON_IO_ERROR_DEF EP_PASS_ON 126 + #define DRBD_FENCING_DEF FP_DONT_CARE 127 + #define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT 128 + #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT 129 + #define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT 130 + #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT 131 + 132 + #define DRBD_MAX_BIO_BVECS_MIN 0 133 + #define DRBD_MAX_BIO_BVECS_MAX 128 134 + #define DRBD_MAX_BIO_BVECS_DEF 0 135 + 136 + #undef RANGE 137 + #endif
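drbd_limits.h is consumed by the configuration code to range-check every tunable; the MIN/MAX/DEF triple per setting suggests a single accept-or-fall-back helper. A sketch of such a check against the DRBD_TIMEOUT_* values above (the helper itself is invented for illustration, not part of DRBD):

#include <stdio.h>

#define DRBD_TIMEOUT_MIN 1
#define DRBD_TIMEOUT_MAX 600
#define DRBD_TIMEOUT_DEF 60

/* accept a user-supplied value only inside [min, max]; returns 0 if ok */
static int range_check(const char *name, int val, int min, int max)
{
    if (val < min || val > max) {
        fprintf(stderr, "%s=%d out of range [%d, %d]\n",
                name, val, min, max);
        return -1;
    }
    return 0;
}

int main(void)
{
    int timeout = 1200;  /* two minutes, in tenths of a second: too large */

    if (range_check("timeout", timeout, DRBD_TIMEOUT_MIN, DRBD_TIMEOUT_MAX))
        timeout = DRBD_TIMEOUT_DEF;   /* fall back to the default */
    printf("using timeout=%d\n", timeout);
    return 0;
}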
+137
include/linux/drbd_nl.h
··· 1 + /* 2 + PACKET( name, 3 + TYPE ( pn, pr, member ) 4 + ... 5 + ) 6 + 7 + You may never reissue one of the pn arguments 8 + */ 9 + 10 + #if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64) 11 + #error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT need to be defined" 12 + #endif 13 + 14 + NL_PACKET(primary, 1, 15 + NL_BIT( 1, T_MAY_IGNORE, overwrite_peer) 16 + ) 17 + 18 + NL_PACKET(secondary, 2, ) 19 + 20 + NL_PACKET(disk_conf, 3, 21 + NL_INT64( 2, T_MAY_IGNORE, disk_size) 22 + NL_STRING( 3, T_MANDATORY, backing_dev, 128) 23 + NL_STRING( 4, T_MANDATORY, meta_dev, 128) 24 + NL_INTEGER( 5, T_MANDATORY, meta_dev_idx) 25 + NL_INTEGER( 6, T_MAY_IGNORE, on_io_error) 26 + NL_INTEGER( 7, T_MAY_IGNORE, fencing) 27 + NL_BIT( 37, T_MAY_IGNORE, use_bmbv) 28 + NL_BIT( 53, T_MAY_IGNORE, no_disk_flush) 29 + NL_BIT( 54, T_MAY_IGNORE, no_md_flush) 30 + /* 55 max_bio_size was available in 8.2.6rc2 */ 31 + NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) 32 + NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) 33 + NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) 34 + ) 35 + 36 + NL_PACKET(detach, 4, ) 37 + 38 + NL_PACKET(net_conf, 5, 39 + NL_STRING( 8, T_MANDATORY, my_addr, 128) 40 + NL_STRING( 9, T_MANDATORY, peer_addr, 128) 41 + NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) 42 + NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) 43 + NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX) 44 + NL_INTEGER( 14, T_MAY_IGNORE, timeout) 45 + NL_INTEGER( 15, T_MANDATORY, wire_protocol) 46 + NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int) 47 + NL_INTEGER( 17, T_MAY_IGNORE, ping_int) 48 + NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size) 49 + NL_INTEGER( 19, T_MAY_IGNORE, max_buffers) 50 + NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark) 51 + NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size) 52 + NL_INTEGER( 22, T_MAY_IGNORE, ko_count) 53 + NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p) 54 + NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p) 55 + NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p) 56 + NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) 57 + NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) 58 + NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size) 59 + /* 59 addr_family was available in GIT, never released */ 60 + NL_BIT( 60, T_MANDATORY, mind_af) 61 + NL_BIT( 27, T_MAY_IGNORE, want_lose) 62 + NL_BIT( 28, T_MAY_IGNORE, two_primaries) 63 + NL_BIT( 41, T_MAY_IGNORE, always_asbp) 64 + NL_BIT( 61, T_MAY_IGNORE, no_cork) 65 + NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) 66 + ) 67 + 68 + NL_PACKET(disconnect, 6, ) 69 + 70 + NL_PACKET(resize, 7, 71 + NL_INT64( 29, T_MAY_IGNORE, resize_size) 72 + ) 73 + 74 + NL_PACKET(syncer_conf, 8, 75 + NL_INTEGER( 30, T_MAY_IGNORE, rate) 76 + NL_INTEGER( 31, T_MAY_IGNORE, after) 77 + NL_INTEGER( 32, T_MAY_IGNORE, al_extents) 78 + NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) 79 + NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) 80 + NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) 81 + NL_BIT( 65, T_MAY_IGNORE, use_rle) 82 + ) 83 + 84 + NL_PACKET(invalidate, 9, ) 85 + NL_PACKET(invalidate_peer, 10, ) 86 + NL_PACKET(pause_sync, 11, ) 87 + NL_PACKET(resume_sync, 12, ) 88 + NL_PACKET(suspend_io, 13, ) 89 + NL_PACKET(resume_io, 14, ) 90 + NL_PACKET(outdate, 15, ) 91 + NL_PACKET(get_config, 16, ) 92 + NL_PACKET(get_state, 17, 93 + NL_INTEGER( 33, T_MAY_IGNORE, state_i) 94 + ) 95 + 96 + NL_PACKET(get_uuids, 18, 97 + NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64))) 98 + NL_INTEGER( 35, T_MAY_IGNORE, 
uuids_flags) 99 + ) 100 + 101 + NL_PACKET(get_timeout_flag, 19, 102 + NL_BIT( 36, T_MAY_IGNORE, use_degraded) 103 + ) 104 + 105 + NL_PACKET(call_helper, 20, 106 + NL_STRING( 38, T_MAY_IGNORE, helper, 32) 107 + ) 108 + 109 + /* Tag nr 42 already allocated in drbd-8.1 development. */ 110 + 111 + NL_PACKET(sync_progress, 23, 112 + NL_INTEGER( 43, T_MAY_IGNORE, sync_progress) 113 + ) 114 + 115 + NL_PACKET(dump_ee, 24, 116 + NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32) 117 + NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX) 118 + NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX) 119 + NL_INT64( 48, T_MAY_IGNORE, ee_sector) 120 + NL_INT64( 49, T_MAY_IGNORE, ee_block_id) 121 + NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10) 122 + ) 123 + 124 + NL_PACKET(start_ov, 25, 125 + NL_INT64( 66, T_MAY_IGNORE, start_sector) 126 + ) 127 + 128 + NL_PACKET(new_c_uuid, 26, 129 + NL_BIT( 63, T_MANDATORY, clear_bm) 130 + ) 131 + 132 + #undef NL_PACKET 133 + #undef NL_INTEGER 134 + #undef NL_INT64 135 + #undef NL_BIT 136 + #undef NL_STRING 137 +
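drbd_nl.h carries no definitions of its own: it is an X-macro table, and every consumer defines NL_PACKET/NL_INTEGER/NL_INT64/NL_BIT/NL_STRING before including it to generate enums, structs or size tables (drbd_tag_magic.h below expands it several times this way). A minimal stand-alone sketch of one such expansion pass, assuming a userspace copy of the header and stub values for the constants it references (everything here is illustrative, not DRBD code):

#include <stdio.h>

/* stubs for identifiers the table references; values are placeholders */
#define T_MAY_IGNORE 0
#define T_MANDATORY  1
#define SHARED_SECRET_MAX 64
#define UI_SIZE 4
typedef unsigned long long __u64;

/* one expansion pass: print every packet and its fields */
#define NL_PACKET(name, number, fields) \
	printf("packet %-16s id %2d\n", #name, (int)(number)); fields
#define NL_INTEGER(pn, pr, member) \
	printf("  integer tag %2d  %s\n", (int)(pn), #member);
#define NL_INT64(pn, pr, member) \
	printf("  int64   tag %2d  %s\n", (int)(pn), #member);
#define NL_BIT(pn, pr, member) \
	printf("  bit     tag %2d  %s\n", (int)(pn), #member);
#define NL_STRING(pn, pr, member, len) \
	printf("  string  tag %2d  %s[%d]\n", (int)(pn), #member, (int)(len));

int main(void)
{
#include "drbd_nl.h"	/* the header #undefs the five macros at its end */
	return 0;
}

Because the table #undefs the macros after use, a consumer can re-define them and include it again for the next expansion pass, which is exactly what the following file does.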
+83
include/linux/drbd_tag_magic.h
··· 1 + #ifndef DRBD_TAG_MAGIC_H 2 + #define DRBD_TAG_MAGIC_H 3 + 4 + #define TT_END 0 5 + #define TT_REMOVED 0xE000 6 + 7 + /* declare packet_type enums */ 8 + enum packet_types { 9 + #define NL_PACKET(name, number, fields) P_ ## name = number, 10 + #define NL_INTEGER(pn, pr, member) 11 + #define NL_INT64(pn, pr, member) 12 + #define NL_BIT(pn, pr, member) 13 + #define NL_STRING(pn, pr, member, len) 14 + #include "drbd_nl.h" 15 + P_nl_after_last_packet, 16 + }; 17 + 18 + /* These structs are used to deduce the size of the tag lists: */ 19 + #define NL_PACKET(name, number, fields) \ 20 + struct name ## _tag_len_struct { fields }; 21 + #define NL_INTEGER(pn, pr, member) \ 22 + int member; int tag_and_len ## member; 23 + #define NL_INT64(pn, pr, member) \ 24 + __u64 member; int tag_and_len ## member; 25 + #define NL_BIT(pn, pr, member) \ 26 + unsigned char member:1; int tag_and_len ## member; 27 + #define NL_STRING(pn, pr, member, len) \ 28 + unsigned char member[len]; int member ## _len; \ 29 + int tag_and_len ## member; 30 + #include "linux/drbd_nl.h" 31 + 32 + /* declare tag-list-sizes */ 33 + static const int tag_list_sizes[] = { 34 + #define NL_PACKET(name, number, fields) 2 fields , 35 + #define NL_INTEGER(pn, pr, member) + 4 + 4 36 + #define NL_INT64(pn, pr, member) + 4 + 8 37 + #define NL_BIT(pn, pr, member) + 4 + 1 38 + #define NL_STRING(pn, pr, member, len) + 4 + (len) 39 + #include "drbd_nl.h" 40 + }; 41 + 42 + /* The two highest bits are used for the tag type */ 43 + #define TT_MASK 0xC000 44 + #define TT_INTEGER 0x0000 45 + #define TT_INT64 0x4000 46 + #define TT_BIT 0x8000 47 + #define TT_STRING 0xC000 48 + /* The next bit indicates if processing of the tag is mandatory */ 49 + #define T_MANDATORY 0x2000 50 + #define T_MAY_IGNORE 0x0000 51 + #define TN_MASK 0x1fff 52 + /* The remaining 13 bits are used to enumerate the tags */ 53 + 54 + #define tag_type(T) ((T) & TT_MASK) 55 + #define tag_number(T) ((T) & TN_MASK) 56 + 57 + /* declare tag enums */ 58 + #define NL_PACKET(name, number, fields) fields 59 + enum drbd_tags { 60 + #define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr , 61 + #define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr , 62 + #define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr , 63 + #define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr , 64 + #include "drbd_nl.h" 65 + }; 66 + 67 + struct tag { 68 + const char *name; 69 + int type_n_flags; 70 + int max_len; 71 + }; 72 + 73 + /* declare tag names */ 74 + #define NL_PACKET(name, number, fields) fields 75 + static const struct tag tag_descriptions[] = { 76 + #define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) }, 77 + #define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) }, 78 + #define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) }, 79 + #define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) }, 80 + #include "drbd_nl.h" 81 + }; 82 + 83 + #endif
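The tag wire format defined here packs three fields into 16 bits: the two high bits select the value type (TT_*), one bit marks the tag as mandatory, and the low 13 bits (TN_MASK) carry the tag number from drbd_nl.h. A small worked example, restated as a stand-alone hypothetical program:

#include <stdio.h>

#define TT_MASK      0xC000	/* two highest bits: tag type */
#define TT_INT64     0x4000
#define T_MAY_IGNORE 0x0000	/* 0x2000 would mark it mandatory */
#define TN_MASK      0x1fff	/* low 13 bits: tag number */

#define tag_type(T)   ((T) & TT_MASK)
#define tag_number(T) ((T) & TN_MASK)

int main(void)
{
	/* e.g. disk_size from disk_conf: pn 2, 64-bit, may be ignored */
	int t = 2 | TT_INT64 | T_MAY_IGNORE;	/* == 0x4002 */

	printf("tag 0x%04x: type 0x%04x, number %d\n",
	       t, tag_type(t), tag_number(t));	/* type 0x4000, number 2 */
	return 0;
}

This is why the comment in drbd_nl.h insists that pn values are never reissued: the 13-bit tag number is the on-the-wire identity of a field.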
+3 -2
include/linux/fs.h
··· 129 129 * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device 130 130 * immediately after submission. The write equivalent 131 131 * of READ_SYNC. 132 - * WRITE_ODIRECT Special case write for O_DIRECT only. 132 + * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only. 133 133 * SWRITE_SYNC 134 134 * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. 135 135 * See SWRITE. ··· 151 151 #define READ_META (READ | (1 << BIO_RW_META)) 152 152 #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 153 153 #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) 154 - #define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) 154 + #define WRITE_ODIRECT_PLUG (WRITE | (1 << BIO_RW_SYNCIO)) 155 155 #define SWRITE_SYNC_PLUG \ 156 156 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 157 157 #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) ··· 304 304 #define BLKIOOPT _IO(0x12,121) 305 305 #define BLKALIGNOFF _IO(0x12,122) 306 306 #define BLKPBSZGET _IO(0x12,123) 307 + #define BLKDISCARDZEROES _IO(0x12,124) 307 308 308 309 #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ 309 310 #define FIBMAP _IO(0x00,1) /* bmap access */
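The new BLKDISCARDZEROES ioctl exposes whether discarded blocks on a device are guaranteed to read back as zeroes. A sketch of probing it from userspace, assuming the ioctl stores an unsigned int through its argument pointer like the neighbouring BLK* query ioctls:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	unsigned int zeroes = 0;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, BLKDISCARDZEROES, &zeroes) < 0) {
		perror("BLKDISCARDZEROES");
		return 1;
	}
	printf("%s: discarded blocks read as zeroes: %s\n",
	       argv[1], zeroes ? "yes" : "no");
	close(fd);
	return 0;
}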
+1
include/linux/genhd.h
··· 91 91 sector_t start_sect; 92 92 sector_t nr_sects; 93 93 sector_t alignment_offset; 94 + unsigned int discard_alignment; 94 95 struct device __dev; 95 96 struct kobject *holder_dir; 96 97 int policy, partno;
+7 -7
include/linux/iocontext.h
··· 40 40 struct io_context *ioc; 41 41 42 42 unsigned long last_end_request; 43 - sector_t last_request_pos; 44 43 45 44 unsigned long ttime_total; 46 45 unsigned long ttime_samples; 47 46 unsigned long ttime_mean; 48 - 49 - unsigned int seek_samples; 50 - u64 seek_total; 51 - sector_t seek_mean; 52 47 53 48 struct list_head queue_list; 54 49 struct hlist_node cic_list; ··· 67 72 68 73 unsigned short ioprio; 69 74 unsigned short ioprio_changed; 75 + 76 + #ifdef CONFIG_BLK_CGROUP 77 + unsigned short cgroup_changed; 78 + #endif 70 79 71 80 /* 72 81 * For request batching ··· 98 99 return NULL; 99 100 } 100 101 102 + struct task_struct; 101 103 #ifdef CONFIG_BLOCK 102 104 int put_io_context(struct io_context *ioc); 103 - void exit_io_context(void); 105 + void exit_io_context(struct task_struct *task); 104 106 struct io_context *get_io_context(gfp_t gfp_flags, int node); 105 107 struct io_context *alloc_io_context(gfp_t gfp_flags, int node); 106 108 void copy_io_context(struct io_context **pdst, struct io_context **psrc); 107 109 #else 108 - static inline void exit_io_context(void) 110 + static inline void exit_io_context(struct task_struct *task) 109 111 { 110 112 } 111 113
+294
include/linux/lru_cache.h
··· 1 + /* 2 + lru_cache.c 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + 24 + */ 25 + 26 + #ifndef LRU_CACHE_H 27 + #define LRU_CACHE_H 28 + 29 + #include <linux/list.h> 30 + #include <linux/slab.h> 31 + #include <linux/bitops.h> 32 + #include <linux/string.h> /* for memset */ 33 + #include <linux/seq_file.h> 34 + 35 + /* 36 + This header file (and its .c file; kernel-doc of functions see there) 37 + defines a helper framework to easily keep track of index:label associations, 38 + and changes to an "active set" of objects, as well as pending transactions, 39 + to persistently record those changes. 40 + 41 + We use an LRU policy if it is necessary to "cool down" a region currently in 42 + the active set before we can "heat" a previously unused region. 43 + 44 + Because of this latter property, it is called "lru_cache". 45 + As it actually Tracks Objects in an Active SeT, we could also call it 46 + toast (incidentally that is what may happen to the data on the 47 + backend storage upon next resync, if we don't get it right). 48 + 49 + What for? 50 + 51 + We replicate IO (more or less synchronously) to local and remote disk. 52 + 53 + For crash recovery after replication node failure, 54 + we need to resync all regions that have been target of in-flight WRITE IO 55 + (in use, or "hot", regions), as we don't know whether or not those WRITEs have 56 + made it to stable storage. 57 + 58 + To avoid a "full resync", we need to persistently track these regions. 59 + 60 + This is known as "write intent log", and can be implemented as on-disk 61 + (coarse or fine grained) bitmap, or other meta data. 62 + 63 + To avoid the overhead of frequent extra writes to this meta data area, 64 + usually the condition is softened to regions that _may_ have been target of 65 + in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent 66 + bitmap, trading frequency of meta data transactions against amount of 67 + (possibly unnecessary) resync traffic. 68 + 69 + If we set a hard limit on the area that may be "hot" at any given time, we 70 + limit the amount of resync traffic needed for crash recovery. 71 + 72 + For recovery after replication link failure, 73 + we need to resync all blocks that have been changed on the other replica 74 + in the mean time, or, if both replicas have been changed independently [*], 75 + all blocks that have been changed on either replica in the mean time. 76 + [*] usually as a result of a cluster split-brain and insufficient protection. 77 + But there are valid use cases to do this on purpose.
78 + 79 + Tracking those blocks can be implemented as "dirty bitmap". 80 + Having it fine-grained reduces the amount of resync traffic. 81 + It should also be persistent, to allow for reboots (or crashes) 82 + while the replication link is down. 83 + 84 + There are various possible implementations for persistently storing 85 + write intent log information, three of which are mentioned here. 86 + 87 + "Chunk dirtying" 88 + The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well. 89 + To reduce the frequency of bitmap updates for write-intent log purposes, 90 + one could dirty "chunks" (of some size) at a time of the (fine grained) 91 + on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as 92 + possible, flushing it to disk again when a previously "hot" (and on-disk 93 + dirtied as full chunk) area "cools down" again (no IO in flight anymore, 94 + and none expected in the near future either). 95 + 96 + "Explicit (coarse) write intent bitmap" 97 + Another implementation could choose a (probably coarse) explicit bitmap, 98 + for write-intent log purposes, in addition to the fine grained dirty bitmap. 99 + 100 + "Activity log" 101 + Yet another implementation may keep track of the hot regions, by starting 102 + with an empty set, and writing down a journal of region numbers that have 103 + become "hot", or have "cooled down" again. 104 + 105 + To be able to use a ring buffer for this journal of changes to the active 106 + set, we not only record the actual changes to that set, but also record the 107 + not changing members of the set in a round robin fashion. To do so, we use a 108 + fixed (but configurable) number of slots which we can identify by index, and 109 + associate region numbers (labels) with these indices. 110 + For each transaction recording a change to the active set, we record the 111 + change itself (index: -old_label, +new_label), and which index is associated 112 + with which label (index: current_label) within a certain sliding window that 113 + is moved further over the available indices with each such transaction. 114 + 115 + Thus, for crash recovery, if the ringbuffer is sufficiently large, we can 116 + accurately reconstruct the active set. 117 + 118 + Sufficiently large depends only on maximum number of active objects, and the 119 + size of the sliding window recording "index: current_label" associations within 120 + each transaction. 121 + 122 + This is what we call the "activity log". 123 + 124 + Currently we need one activity log transaction per single label change, which 125 + does not give much benefit over the "dirty chunks of bitmap" approach, other 126 + than potentially fewer seeks. 127 + 128 + We plan to change the transaction format to support multiple changes per 129 + transaction, which then would reduce several (disjoint, "random") updates to 130 + the bitmap into one transaction to the activity log ring buffer. 131 + */ 132 + 133 + /* this defines an element in a tracked set 134 + * .colision is for hash table lookup. 135 + * When we process a new IO request, we know its sector, thus can deduce the 136 + * region number (label) easily. To do the label -> object lookup without a 137 + * full list walk, we use a simple hash table.
138 + * 139 + * .list is on one of three lists: 140 + * in_use: currently in use (refcnt > 0, lc_number != LC_FREE) 141 + * lru: unused but ready to be reused or recycled 142 + * (refcnt == 0, lc_number != LC_FREE), 143 + * free: unused but ready to be recycled 144 + * (refcnt == 0, lc_number == LC_FREE), 145 + * 146 + * an element is said to be "in the active set", 147 + * if either on "in_use" or "lru", i.e. lc_number != LC_FREE. 148 + * 149 + * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache 150 + * (total memory usage 2 pages), and up to 3833 elements on the act_log 151 + * lru_cache, totalling ~215 kB for 64bit architecture, ~53 pages. 152 + * 153 + * We usually do not actually free these objects again, but only "recycle" 154 + * them, as the change "index: -old_label, +LC_FREE" would need a transaction 155 + * as well. Which also means that using a kmem_cache to allocate the objects 156 + * from wastes some resources. 157 + * But it avoids high order page allocations in kmalloc. 158 + */ 159 + struct lc_element { 160 + struct hlist_node colision; 161 + struct list_head list; /* LRU list or free list */ 162 + unsigned refcnt; 163 + /* back "pointer" into lru_cache.lc_element[index], 164 + * for paranoia, and for lc_index_of() */ 165 + unsigned lc_index; 166 + /* if we want to track a larger set of objects, 167 + * it needs to become arch independent u64 */ 168 + unsigned lc_number; 169 + 170 + /* special label when on free list */ 171 + #define LC_FREE (~0U) 172 + }; 173 + 174 + struct lru_cache { 175 + /* the least recently used item is kept at lru->prev */ 176 + struct list_head lru; 177 + struct list_head free; 178 + struct list_head in_use; 179 + 180 + /* the pre-created kmem cache to allocate the objects from */ 181 + struct kmem_cache *lc_cache; 182 + 183 + /* size of tracked objects, used to memset(,0,) them in lc_reset */ 184 + size_t element_size; 185 + /* offset of struct lc_element member in the tracked object */ 186 + size_t element_off; 187 + 188 + /* number of elements (indices) */ 189 + unsigned int nr_elements; 190 + /* Arbitrary limit on maximum tracked objects. Practical limit is much 191 + * lower due to allocation failures, probably. For typical use cases, 192 + * nr_elements should be a few thousand at most. 193 + * This also limits the maximum value of lc_element.lc_index, allowing the 194 + * 8 high bits of .lc_index to be overloaded with flags in the future. */ 195 + #define LC_MAX_ACTIVE (1<<24) 196 + 197 + /* statistics */ 198 + unsigned used; /* number of elements currently on the in_use list */ 199 + unsigned long hits, misses, starving, dirty, changed; 200 + 201 + /* see below: flag-bits for lru_cache */ 202 + unsigned long flags; 203 + 204 + /* when changing the label of an index element */ 205 + unsigned int new_number; 206 + 207 + /* for paranoia when changing the label of an index element */ 208 + struct lc_element *changing_element; 209 + 210 + void *lc_private; 211 + const char *name; 212 + 213 + /* nr_elements there */ 214 + struct hlist_head *lc_slot; 215 + struct lc_element **lc_element; 216 + }; 217 + 218 + 219 + /* flag-bits for lru_cache */ 220 + enum { 221 + /* debugging aid, to catch concurrent access early. 222 + * user needs to guarantee exclusive access by proper locking!
*/ 223 + __LC_PARANOIA, 224 + /* if we need to change the set, but currently there is a changing 225 + * transaction pending, we are "dirty", and must defer further 226 + * changing requests */ 227 + __LC_DIRTY, 228 + /* if we need to change the set, but currently there is no free nor 229 + * unused element available, we are "starving", and must not give out 230 + * further references, to guarantee that eventually some refcnt will 231 + * drop to zero and we will be able to make progress again, changing 232 + * the set, writing the transaction. 233 + * if the statistics say we are frequently starving, 234 + * nr_elements is too small. */ 235 + __LC_STARVING, 236 + }; 237 + #define LC_PARANOIA (1<<__LC_PARANOIA) 238 + #define LC_DIRTY (1<<__LC_DIRTY) 239 + #define LC_STARVING (1<<__LC_STARVING) 240 + 241 + extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, 242 + unsigned e_count, size_t e_size, size_t e_off); 243 + extern void lc_reset(struct lru_cache *lc); 244 + extern void lc_destroy(struct lru_cache *lc); 245 + extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); 246 + extern void lc_del(struct lru_cache *lc, struct lc_element *element); 247 + 248 + extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); 249 + extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); 250 + extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); 251 + extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); 252 + extern void lc_changed(struct lru_cache *lc, struct lc_element *e); 253 + 254 + struct seq_file; 255 + extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); 256 + 257 + extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, 258 + void (*detail) (struct seq_file *, struct lc_element *)); 259 + 260 + /** 261 + * lc_try_lock - can be used to stop lc_get() from changing the tracked set 262 + * @lc: the lru cache to operate on 263 + * 264 + * Note that the reference counts and order on the active and lru lists may 265 + * still change. Returns true if we acquired the lock. 266 + */ 267 + static inline int lc_try_lock(struct lru_cache *lc) 268 + { 269 + return !test_and_set_bit(__LC_DIRTY, &lc->flags); 270 + } 271 + 272 + /** 273 + * lc_unlock - unlock @lc, allow lc_get() to change the set again 274 + * @lc: the lru cache to operate on 275 + */ 276 + static inline void lc_unlock(struct lru_cache *lc) 277 + { 278 + clear_bit(__LC_DIRTY, &lc->flags); 279 + smp_mb__after_clear_bit(); 280 + } 281 + 282 + static inline int lc_is_used(struct lru_cache *lc, unsigned int enr) 283 + { 284 + struct lc_element *e = lc_find(lc, enr); 285 + return e && e->refcnt; 286 + } 287 + 288 + #define lc_entry(ptr, type, member) \ 289 + container_of(ptr, type, member) 290 + 291 + extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i); 292 + extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e); 293 + 294 + #endif
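The lc_get()/lc_changed()/lc_put() contract sketched in this header is easiest to read in caller form. A minimal, hypothetical kernel-side sketch follows; the embedding struct, the names, and the "persist the change" step are illustrative, not DRBD's actual code:

#include <linux/errno.h>
#include <linux/lru_cache.h>

/* hypothetical object embedding the tracking element */
struct my_extent {
	struct lc_element lce;
	/* ... per-region payload ... */
};

static struct lru_cache *regions;	/* created with lc_create(), see lib/lru_cache.c */

static int touch_region(unsigned int enr)
{
	struct lc_element *e = lc_get(regions, enr);
	struct my_extent *ext;

	if (!e)
		return -EBUSY;	/* starving, or another label change pending */

	if (e->lc_number != enr) {
		/* an unused element was recycled for us: persist the change
		 * (e.g. write an activity log transaction), then commit: */
		lc_changed(regions, e);
	}

	ext = lc_entry(e, struct my_extent, lce);
	/* ... enr is now pinned in the active set; use ext ... */

	lc_put(regions, e);	/* back onto the lru list once refcnt hits 0 */
	return 0;
}

Note the check of e->lc_number on every use, exactly as the lc_get() documentation demands: a successful lookup and a recycled slot return the same type, and only the label tells them apart.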
+1
include/linux/writeback.h
··· 49 49 unsigned nonblocking:1; /* Don't get stuck on request queues */ 50 50 unsigned encountered_congestion:1; /* An output: a queue is full */ 51 51 unsigned for_kupdate:1; /* A kupdate writeback */ 52 + unsigned for_background:1; /* A background writeback */ 52 53 unsigned for_reclaim:1; /* Invoked from the page allocator */ 53 54 unsigned range_cyclic:1; /* range_start is cyclic */ 54 55 unsigned more_io:1; /* more io to be dispatched */
+1 -1
kernel/exit.c
··· 1009 1009 tsk->flags |= PF_EXITPIDONE; 1010 1010 1011 1011 if (tsk->io_context) 1012 - exit_io_context(); 1012 + exit_io_context(tsk); 1013 1013 1014 1014 if (tsk->splice_pipe) 1015 1015 __free_pipe_info(tsk->splice_pipe);
+2 -1
kernel/fork.c
··· 1317 1317 if (pid != &init_struct_pid) 1318 1318 free_pid(pid); 1319 1319 bad_fork_cleanup_io: 1320 - put_io_context(p->io_context); 1320 + if (p->io_context) 1321 + exit_io_context(p); 1321 1322 bad_fork_cleanup_namespaces: 1322 1323 exit_task_namespaces(p); 1323 1324 bad_fork_cleanup_mm:
+3
lib/Kconfig
··· 200 200 config GENERIC_ATOMIC64 201 201 bool 202 202 203 + config LRU_CACHE 204 + tristate 205 + 203 206 endmenu
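LRU_CACHE is a promptless (hidden) tristate, so it never appears in menuconfig; a user enables it transitively via select. Roughly how DRBD's own Kconfig entry would pull it in (abbreviated sketch, not the verbatim entry):

config BLK_DEV_DRBD
	tristate "DRBD Distributed Replicated Block Device support"
	select LRU_CACHE
	help
	  Shared-nothing, synchronously replicated block device.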
+2
lib/Makefile
··· 91 91 92 92 obj-$(CONFIG_NLATTR) += nlattr.o 93 93 94 + obj-$(CONFIG_LRU_CACHE) += lru_cache.o 95 + 94 96 obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o 95 97 96 98 obj-$(CONFIG_GENERIC_CSUM) += checksum.o
+560
lib/lru_cache.c
··· 1 + /* 2 + lru_cache.c 3 + 4 + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 + 6 + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. 7 + Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 + Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 + 10 + drbd is free software; you can redistribute it and/or modify 11 + it under the terms of the GNU General Public License as published by 12 + the Free Software Foundation; either version 2, or (at your option) 13 + any later version. 14 + 15 + drbd is distributed in the hope that it will be useful, 16 + but WITHOUT ANY WARRANTY; without even the implied warranty of 17 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 + GNU General Public License for more details. 19 + 20 + You should have received a copy of the GNU General Public License 21 + along with drbd; see the file COPYING. If not, write to 22 + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 + 24 + */ 25 + 26 + #include <linux/module.h> 27 + #include <linux/bitops.h> 28 + #include <linux/slab.h> 29 + #include <linux/string.h> /* for memset */ 30 + #include <linux/seq_file.h> /* for seq_printf */ 31 + #include <linux/lru_cache.h> 32 + 33 + MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " 34 + "Lars Ellenberg <lars@linbit.com>"); 35 + MODULE_DESCRIPTION("lru_cache - Track sets of hot objects"); 36 + MODULE_LICENSE("GPL"); 37 + 38 + /* this is a developer's aid only. 39 + * it catches concurrent access (lack of locking on the user's part) */ 40 + #define PARANOIA_ENTRY() do { \ 41 + BUG_ON(!lc); \ 42 + BUG_ON(!lc->nr_elements); \ 43 + BUG_ON(test_and_set_bit(__LC_PARANOIA, &lc->flags)); \ 44 + } while (0) 45 + 46 + #define RETURN(x...) do { \ 47 + clear_bit(__LC_PARANOIA, &lc->flags); \ 48 + smp_mb__after_clear_bit(); return x ; } while (0) 49 + 50 + /* BUG() if e is not one of the elements tracked by lc */ 51 + #define PARANOIA_LC_ELEMENT(lc, e) do { \ 52 + struct lru_cache *lc_ = (lc); \ 53 + struct lc_element *e_ = (e); \ 54 + unsigned i = e_->lc_index; \ 55 + BUG_ON(i >= lc_->nr_elements); \ 56 + BUG_ON(lc_->lc_element[i] != e_); } while (0) 57 + 58 + /** 59 + * lc_create - prepares to track objects in an active set 60 + * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details 61 + * @e_count: number of elements allowed to be active simultaneously 62 + * @e_size: size of the tracked objects 63 + * @e_off: offset to the &struct lc_element member in a tracked object 64 + * 65 + * Returns a pointer to a newly initialized struct lru_cache on success, 66 + * or NULL on (allocation) failure. 67 + */ 68 + struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, 69 + unsigned e_count, size_t e_size, size_t e_off) 70 + { 71 + struct hlist_head *slot = NULL; 72 + struct lc_element **element = NULL; 73 + struct lru_cache *lc; 74 + struct lc_element *e; 75 + unsigned cache_obj_size = kmem_cache_size(cache); 76 + unsigned i; 77 + 78 + WARN_ON(cache_obj_size < e_size); 79 + if (cache_obj_size < e_size) 80 + return NULL; 81 + 82 + /* e_count too big; would probably fail the allocation below anyway. 83 + * for typical use cases, e_count should be a few thousand at most.
*/ 84 + if (e_count > LC_MAX_ACTIVE) 85 + return NULL; 86 + 87 + slot = kzalloc(e_count * sizeof(struct hlist_head*), GFP_KERNEL); 88 + if (!slot) 89 + goto out_fail; 90 + element = kzalloc(e_count * sizeof(struct lc_element *), GFP_KERNEL); 91 + if (!element) 92 + goto out_fail; 93 + 94 + lc = kzalloc(sizeof(*lc), GFP_KERNEL); 95 + if (!lc) 96 + goto out_fail; 97 + 98 + INIT_LIST_HEAD(&lc->in_use); 99 + INIT_LIST_HEAD(&lc->lru); 100 + INIT_LIST_HEAD(&lc->free); 101 + 102 + lc->name = name; 103 + lc->element_size = e_size; 104 + lc->element_off = e_off; 105 + lc->nr_elements = e_count; 106 + lc->new_number = LC_FREE; 107 + lc->lc_cache = cache; 108 + lc->lc_element = element; 109 + lc->lc_slot = slot; 110 + 111 + /* preallocate all objects */ 112 + for (i = 0; i < e_count; i++) { 113 + void *p = kmem_cache_alloc(cache, GFP_KERNEL); 114 + if (!p) 115 + break; 116 + memset(p, 0, lc->element_size); 117 + e = p + e_off; 118 + e->lc_index = i; 119 + e->lc_number = LC_FREE; 120 + list_add(&e->list, &lc->free); 121 + element[i] = e; 122 + } 123 + if (i == e_count) 124 + return lc; 125 + 126 + /* else: could not allocate all elements, give up; free what we got (including element[0]) */ 127 + while (i--) { 128 + void *p = element[i]; 129 + kmem_cache_free(cache, p - e_off); 130 + } 131 + kfree(lc); 132 + out_fail: 133 + kfree(element); 134 + kfree(slot); 135 + return NULL; 136 + } 137 + 138 + void lc_free_by_index(struct lru_cache *lc, unsigned i) 139 + { 140 + void *p = lc->lc_element[i]; 141 + WARN_ON(!p); 142 + if (p) { 143 + p -= lc->element_off; 144 + kmem_cache_free(lc->lc_cache, p); 145 + } 146 + } 147 + 148 + /** 149 + * lc_destroy - frees memory allocated by lc_create() 150 + * @lc: the lru cache to destroy 151 + */ 152 + void lc_destroy(struct lru_cache *lc) 153 + { 154 + unsigned i; 155 + if (!lc) 156 + return; 157 + for (i = 0; i < lc->nr_elements; i++) 158 + lc_free_by_index(lc, i); 159 + kfree(lc->lc_element); 160 + kfree(lc->lc_slot); 161 + kfree(lc); 162 + } 163 + 164 + /** 165 + * lc_reset - does a full reset for @lc and the hash table slots.
166 + * @lc: the lru cache to operate on 167 + * 168 + * It is roughly the equivalent of re-allocating a fresh lru_cache object, 169 + * basically a shortcut to lc_destroy(lc); lc = lc_create(...); 170 + */ 171 + void lc_reset(struct lru_cache *lc) 172 + { 173 + unsigned i; 174 + 175 + INIT_LIST_HEAD(&lc->in_use); 176 + INIT_LIST_HEAD(&lc->lru); 177 + INIT_LIST_HEAD(&lc->free); 178 + lc->used = 0; 179 + lc->hits = 0; 180 + lc->misses = 0; 181 + lc->starving = 0; 182 + lc->dirty = 0; 183 + lc->changed = 0; 184 + lc->flags = 0; 185 + lc->changing_element = NULL; 186 + lc->new_number = LC_FREE; 187 + memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements); 188 + 189 + for (i = 0; i < lc->nr_elements; i++) { 190 + struct lc_element *e = lc->lc_element[i]; 191 + void *p = e; 192 + p -= lc->element_off; 193 + memset(p, 0, lc->element_size); 194 + /* re-init it */ 195 + e->lc_index = i; 196 + e->lc_number = LC_FREE; 197 + list_add(&e->list, &lc->free); 198 + } 199 + } 200 + 201 + /** 202 + * lc_seq_printf_stats - print stats about @lc into @seq 203 + * @seq: the seq_file to print into 204 + * @lc: the lru cache to print statistics of 205 + */ 206 + size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) 207 + { 208 + /* NOTE: 209 + * total calls to lc_get are 210 + * (starving + hits + misses) 211 + * misses include "dirty" count (update from another thread in 212 + * progress) and "changed", when this in fact led to a successful 213 + * update of the cache. 214 + */ 215 + return seq_printf(seq, "\t%s: used:%u/%u " 216 + "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n", 217 + lc->name, lc->used, lc->nr_elements, 218 + lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed); 219 + } 220 + 221 + static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) 222 + { 223 + return lc->lc_slot + (enr % lc->nr_elements); 224 + } 225 + 226 + 227 + /** 228 + * lc_find - find element by label, if present in the hash table 229 + * @lc: The lru_cache object 230 + * @enr: element number 231 + * 232 + * Returns the pointer to an element, if the element with the requested 233 + * "label" or element number is present in the hash table, 234 + * or NULL if not found. Does not change the refcnt. 235 + */ 236 + struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) 237 + { 238 + struct hlist_node *n; 239 + struct lc_element *e; 240 + 241 + BUG_ON(!lc); 242 + BUG_ON(!lc->nr_elements); 243 + hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) { 244 + if (e->lc_number == enr) 245 + return e; 246 + } 247 + return NULL; 248 + } 249 + 250 + /* returned element will be "recycled" immediately */ 251 + static struct lc_element *lc_evict(struct lru_cache *lc) 252 + { 253 + struct list_head *n; 254 + struct lc_element *e; 255 + 256 + if (list_empty(&lc->lru)) 257 + return NULL; 258 + 259 + n = lc->lru.prev; 260 + e = list_entry(n, struct lc_element, list); 261 + 262 + PARANOIA_LC_ELEMENT(lc, e); 263 + 264 + list_del(&e->list); 265 + hlist_del(&e->colision); 266 + return e; 267 + } 268 + 269 + /** 270 + * lc_del - removes an element from the cache 271 + * @lc: The lru_cache object 272 + * @e: The element to remove 273 + * 274 + * @e must be unused (refcnt == 0). Moves @e from "lru" to "free" list, 275 + * sets @e->lc_number to %LC_FREE.
276 + */ 277 + void lc_del(struct lru_cache *lc, struct lc_element *e) 278 + { 279 + PARANOIA_ENTRY(); 280 + PARANOIA_LC_ELEMENT(lc, e); 281 + BUG_ON(e->refcnt); 282 + 283 + e->lc_number = LC_FREE; 284 + hlist_del_init(&e->colision); 285 + list_move(&e->list, &lc->free); 286 + RETURN(); 287 + } 288 + 289 + static struct lc_element *lc_get_unused_element(struct lru_cache *lc) 290 + { 291 + struct list_head *n; 292 + 293 + if (list_empty(&lc->free)) 294 + return lc_evict(lc); 295 + 296 + n = lc->free.next; 297 + list_del(n); 298 + return list_entry(n, struct lc_element, list); 299 + } 300 + 301 + static int lc_unused_element_available(struct lru_cache *lc) 302 + { 303 + if (!list_empty(&lc->free)) 304 + return 1; /* something on the free list */ 305 + if (!list_empty(&lc->lru)) 306 + return 1; /* something to evict */ 307 + 308 + return 0; 309 + } 310 + 311 + 312 + /** 313 + * lc_get - get element by label, maybe change the active set 314 + * @lc: the lru cache to operate on 315 + * @enr: the label to look up 316 + * 317 + * Finds an element in the cache, increases its usage count, 318 + * "touches" and returns it. 319 + * 320 + * In case the requested number is not present, it needs to be added to the 321 + * cache. Therefore it is possible that another element is evicted from 322 + * the cache. In either case, the user is notified so he is able to e.g. keep 323 + * a persistent log of the cache changes, and therefore the objects in use. 324 + * 325 + * Return values: 326 + * NULL 327 + * The cache was marked %LC_STARVING, 328 + * or the requested label was not in the active set 329 + * and a changing transaction is still pending (@lc was marked %LC_DIRTY). 330 + * Or no unused or free element could be recycled (@lc will be marked as 331 + * %LC_STARVING, blocking further lc_get() operations). 332 + * 333 + * pointer to the element with the REQUESTED element number. 334 + * In this case, it can be used right away 335 + * 336 + * pointer to an UNUSED element with some different element number, 337 + * where that different number may also be %LC_FREE. 338 + * 339 + * In this case, the cache is marked %LC_DIRTY (blocking further changes), 340 + * and the returned element pointer is removed from the lru list and 341 + * hash collision chains. The user now should do whatever housekeeping 342 + * is necessary. 343 + * Then he must call lc_changed(lc, element_pointer), to finish 344 + * the change. 345 + * 346 + * NOTE: The user needs to check the lc_number on EACH use, so he recognizes 347 + * any cache set change. 348 + */ 349 + struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) 350 + { 351 + struct lc_element *e; 352 + 353 + PARANOIA_ENTRY(); 354 + if (lc->flags & LC_STARVING) { 355 + ++lc->starving; 356 + RETURN(NULL); 357 + } 358 + 359 + e = lc_find(lc, enr); 360 + if (e) { 361 + ++lc->hits; 362 + if (e->refcnt++ == 0) 363 + lc->used++; 364 + list_move(&e->list, &lc->in_use); /* Not evictable... */ 365 + RETURN(e); 366 + } 367 + 368 + ++lc->misses; 369 + 370 + /* In case there is nothing available and we cannot kick out 371 + * the LRU element, we have to wait ... 372 + */ 373 + if (!lc_unused_element_available(lc)) { 374 + __set_bit(__LC_STARVING, &lc->flags); 375 + RETURN(NULL); 376 + } 377 + 378 + /* it was not present in the active set. 379 + * we are going to recycle an unused (or even "free") element. 380 + * user may need to commit a transaction to record that change.
381 + * we serialize on flags & LC_DIRTY */ 382 + if (test_and_set_bit(__LC_DIRTY, &lc->flags)) { 383 + ++lc->dirty; 384 + RETURN(NULL); 385 + } 386 + 387 + e = lc_get_unused_element(lc); 388 + BUG_ON(!e); 389 + 390 + clear_bit(__LC_STARVING, &lc->flags); 391 + BUG_ON(++e->refcnt != 1); 392 + lc->used++; 393 + 394 + lc->changing_element = e; 395 + lc->new_number = enr; 396 + 397 + RETURN(e); 398 + } 399 + 400 + /* similar to lc_get, 401 + * but only gets a new reference on an existing element. 402 + * you either get the requested element, or NULL. 403 + * will be consolidated into one function. 404 + */ 405 + struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) 406 + { 407 + struct lc_element *e; 408 + 409 + PARANOIA_ENTRY(); 410 + if (lc->flags & LC_STARVING) { 411 + ++lc->starving; 412 + RETURN(NULL); 413 + } 414 + 415 + e = lc_find(lc, enr); 416 + if (e) { 417 + ++lc->hits; 418 + if (e->refcnt++ == 0) 419 + lc->used++; 420 + list_move(&e->list, &lc->in_use); /* Not evictable... */ 421 + } 422 + RETURN(e); 423 + } 424 + 425 + /** 426 + * lc_changed - tell @lc that the change has been recorded 427 + * @lc: the lru cache to operate on 428 + * @e: the element pending label change 429 + */ 430 + void lc_changed(struct lru_cache *lc, struct lc_element *e) 431 + { 432 + PARANOIA_ENTRY(); 433 + BUG_ON(e != lc->changing_element); 434 + PARANOIA_LC_ELEMENT(lc, e); 435 + ++lc->changed; 436 + e->lc_number = lc->new_number; 437 + list_add(&e->list, &lc->in_use); 438 + hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number)); 439 + lc->changing_element = NULL; 440 + lc->new_number = LC_FREE; 441 + clear_bit(__LC_DIRTY, &lc->flags); 442 + smp_mb__after_clear_bit(); 443 + RETURN(); 444 + } 445 + 446 + 447 + /** 448 + * lc_put - give up refcnt of @e 449 + * @lc: the lru cache to operate on 450 + * @e: the element to put 451 + * 452 + * If refcnt reaches zero, the element is moved to the lru list, 453 + * and a %LC_STARVING (if set) is cleared. 454 + * Returns the new (post-decrement) refcnt. 455 + */ 456 + unsigned int lc_put(struct lru_cache *lc, struct lc_element *e) 457 + { 458 + PARANOIA_ENTRY(); 459 + PARANOIA_LC_ELEMENT(lc, e); 460 + BUG_ON(e->refcnt == 0); 461 + BUG_ON(e == lc->changing_element); 462 + if (--e->refcnt == 0) { 463 + /* move it to the front of LRU. */ 464 + list_move(&e->list, &lc->lru); 465 + lc->used--; 466 + clear_bit(__LC_STARVING, &lc->flags); 467 + smp_mb__after_clear_bit(); 468 + } 469 + RETURN(e->refcnt); 470 + } 471 + 472 + /** 473 + * lc_element_by_index 474 + * @lc: the lru cache to operate on 475 + * @i: the index of the element to return 476 + */ 477 + struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i) 478 + { 479 + BUG_ON(i >= lc->nr_elements); 480 + BUG_ON(lc->lc_element[i] == NULL); 481 + BUG_ON(lc->lc_element[i]->lc_index != i); 482 + return lc->lc_element[i]; 483 + } 484 + 485 + /** 486 + * lc_index_of 487 + * @lc: the lru cache to operate on 488 + * @e: the element to query for its index position in lc->element 489 + */ 490 + unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e) 491 + { 492 + PARANOIA_LC_ELEMENT(lc, e); 493 + return e->lc_index; 494 + } 495 + 496 + /** 497 + * lc_set - associate index with label 498 + * @lc: the lru cache to operate on 499 + * @enr: the label to set 500 + * @index: the element index to associate label with. 501 + * 502 + * Used to initialize the active set to some previously recorded state.
503 + */ 504 + void lc_set(struct lru_cache *lc, unsigned int enr, int index) 505 + { 506 + struct lc_element *e; 507 + 508 + if (index < 0 || index >= lc->nr_elements) 509 + return; 510 + 511 + e = lc_element_by_index(lc, index); 512 + e->lc_number = enr; 513 + 514 + hlist_del_init(&e->colision); 515 + hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); 516 + list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru); 517 + } 518 + 519 + /** 520 + * lc_seq_dump_details - dump a complete LRU cache to seq in textual form. 521 + * @seq: the &struct seq_file pointer to seq_printf into 522 + * @lc: the lru cache to operate on 523 + * @utext: user supplied "heading" or other info 524 + * @detail: function pointer the user may provide to dump further details 525 + * of the object the lc_element is embedded in. 526 + */ 527 + void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, 528 + void (*detail) (struct seq_file *, struct lc_element *)) 529 + { 530 + unsigned int nr_elements = lc->nr_elements; 531 + struct lc_element *e; 532 + int i; 533 + 534 + seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext); 535 + for (i = 0; i < nr_elements; i++) { 536 + e = lc_element_by_index(lc, i); 537 + if (e->lc_number == LC_FREE) { 538 + seq_printf(seq, "\t%2d: FREE\n", i); 539 + } else { 540 + seq_printf(seq, "\t%2d: %4u %4u ", i, 541 + e->lc_number, e->refcnt); 542 + detail(seq, e); 543 + } 544 + } 545 + } 546 + 547 + EXPORT_SYMBOL(lc_create); 548 + EXPORT_SYMBOL(lc_reset); 549 + EXPORT_SYMBOL(lc_destroy); 550 + EXPORT_SYMBOL(lc_set); 551 + EXPORT_SYMBOL(lc_del); 552 + EXPORT_SYMBOL(lc_try_get); 553 + EXPORT_SYMBOL(lc_find); 554 + EXPORT_SYMBOL(lc_get); 555 + EXPORT_SYMBOL(lc_put); 556 + EXPORT_SYMBOL(lc_changed); 557 + EXPORT_SYMBOL(lc_element_by_index); 558 + EXPORT_SYMBOL(lc_index_of); 559 + EXPORT_SYMBOL(lc_seq_printf_stats); 560 + EXPORT_SYMBOL(lc_seq_dump_details);
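Setup and teardown complete the picture: lc_create() wants the backing kmem cache, the element count, the tracked object's size, and the offset of the embedded lc_element. A hypothetical module-style sketch, reusing the illustrative my_extent from the header example above:

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/lru_cache.h>

struct my_extent {			/* illustrative, as in the header sketch */
	struct lc_element lce;
};

static struct kmem_cache *my_extent_cache;
static struct lru_cache *regions;

static int __init my_init(void)
{
	my_extent_cache = KMEM_CACHE(my_extent, 0);
	if (!my_extent_cache)
		return -ENOMEM;

	/* 61 simultaneously active labels, the size DRBD's resync lru uses */
	regions = lc_create("regions", my_extent_cache, 61,
			    sizeof(struct my_extent),
			    offsetof(struct my_extent, lce));
	if (!regions) {
		kmem_cache_destroy(my_extent_cache);
		return -ENOMEM;
	}
	return 0;
}

static void __exit my_exit(void)
{
	lc_destroy(regions);		/* also frees the preallocated objects */
	kmem_cache_destroy(my_extent_cache);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

lc_create() preallocates all e_count objects up front, so lc_get() never allocates in the IO path; that is the reason for passing a kmem cache in rather than kmallocing per element.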
+1 -1
mm/backing-dev.c
··· 609 609 * it would never exit if it is currently stuck in the refrigerator. 610 610 */ 611 611 list_for_each_entry(wb, &bdi->wb_list, list) { 612 - wb->task->flags &= ~PF_FROZEN; 612 + thaw_process(wb->task); 613 613 kthread_stop(wb->task); 614 614 } 615 615 }
-12
mm/page-writeback.c
··· 821 821 struct writeback_control *wbc, writepage_t writepage, 822 822 void *data) 823 823 { 824 - struct backing_dev_info *bdi = mapping->backing_dev_info; 825 824 int ret = 0; 826 825 int done = 0; 827 826 struct pagevec pvec; ··· 832 833 int cycled; 833 834 int range_whole = 0; 834 835 long nr_to_write = wbc->nr_to_write; 835 - 836 - if (wbc->nonblocking && bdi_write_congested(bdi)) { 837 - wbc->encountered_congestion = 1; 838 - return 0; 839 - } 840 836 841 837 pagevec_init(&pvec, 0); 842 838 if (wbc->range_cyclic) { ··· 950 956 done = 1; 951 957 break; 952 958 } 953 - } 954 - 955 - if (wbc->nonblocking && bdi_write_congested(bdi)) { 956 - wbc->encountered_congestion = 1; 957 - done = 1; 958 - break; 959 959 } 960 960 } 961 961 pagevec_release(&pvec);