Fast and robust atproto CAR file processing in rust

Compare changes

Choose any two refs to compare.

Changed files
+2677 -59
examples
disk-read-file
src
+1188 -26
Cargo.lock
··· 3 3 version = 4 4 4 5 5 [[package]] 6 + name = "addr2line" 7 + version = "0.25.1" 8 + source = "registry+https://github.com/rust-lang/crates.io-index" 9 + checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" 10 + dependencies = [ 11 + "gimli", 12 + ] 13 + 14 + [[package]] 15 + name = "adler2" 16 + version = "2.0.1" 17 + source = "registry+https://github.com/rust-lang/crates.io-index" 18 + checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" 19 + 20 + [[package]] 21 + name = "aho-corasick" 22 + version = "1.1.3" 23 + source = "registry+https://github.com/rust-lang/crates.io-index" 24 + checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 25 + dependencies = [ 26 + "memchr", 27 + ] 28 + 29 + [[package]] 30 + name = "anes" 31 + version = "0.1.6" 32 + source = "registry+https://github.com/rust-lang/crates.io-index" 33 + checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" 34 + 35 + [[package]] 6 36 name = "anstream" 7 37 version = "0.6.21" 8 38 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 38 68 source = "registry+https://github.com/rust-lang/crates.io-index" 39 69 checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" 40 70 dependencies = [ 41 - "windows-sys", 71 + "windows-sys 0.60.2", 42 72 ] 43 73 44 74 [[package]] ··· 49 79 dependencies = [ 50 80 "anstyle", 51 81 "once_cell_polyfill", 52 - "windows-sys", 82 + "windows-sys 0.60.2", 83 + ] 84 + 85 + [[package]] 86 + name = "anyhow" 87 + version = "1.0.100" 88 + source = "registry+https://github.com/rust-lang/crates.io-index" 89 + checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" 90 + 91 + [[package]] 92 + name = "autocfg" 93 + version = "1.5.0" 94 + source = "registry+https://github.com/rust-lang/crates.io-index" 95 + checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" 96 + 97 + [[package]] 98 + name = "backtrace" 99 + version = "0.3.76" 100 + source = "registry+https://github.com/rust-lang/crates.io-index" 101 + checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" 102 + dependencies = [ 103 + "addr2line", 104 + "cfg-if", 105 + "libc", 106 + "miniz_oxide", 107 + "object", 108 + "rustc-demangle", 109 + "windows-link", 110 + ] 111 + 112 + [[package]] 113 + name = "base-x" 114 + version = "0.2.11" 115 + source = "registry+https://github.com/rust-lang/crates.io-index" 116 + checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" 117 + 118 + [[package]] 119 + name = "base256emoji" 120 + version = "1.0.2" 121 + source = "registry+https://github.com/rust-lang/crates.io-index" 122 + checksum = "b5e9430d9a245a77c92176e649af6e275f20839a48389859d1661e9a128d077c" 123 + dependencies = [ 124 + "const-str", 125 + "match-lookup", 126 + ] 127 + 128 + [[package]] 129 + name = "bincode" 130 + version = "2.0.1" 131 + source = "registry+https://github.com/rust-lang/crates.io-index" 132 + checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" 133 + dependencies = [ 134 + "bincode_derive", 135 + "serde", 136 + "unty", 137 + ] 138 + 139 + [[package]] 140 + name = "bincode_derive" 141 + version = "2.0.1" 142 + source = "registry+https://github.com/rust-lang/crates.io-index" 143 + checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" 144 + dependencies = [ 145 + "virtue", 53 146 ] 54 147 55 148 [[package]] ··· 59 152 checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" 60 153 61 154 [[package]] 155 + name = "block-buffer" 156 + version = "0.10.4" 157 + source = "registry+https://github.com/rust-lang/crates.io-index" 158 + checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" 159 + dependencies = [ 160 + "generic-array", 161 + ] 162 + 163 + [[package]] 164 + name = "bumpalo" 165 + version = "3.19.0" 166 + source = "registry+https://github.com/rust-lang/crates.io-index" 167 + checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" 168 + 169 + [[package]] 62 170 name = "byteorder-lite" 63 171 version = "0.1.0" 64 172 source = "registry+https://github.com/rust-lang/crates.io-index" 65 173 checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" 66 174 67 175 [[package]] 176 + name = "bytes" 177 + version = "1.10.1" 178 + source = "registry+https://github.com/rust-lang/crates.io-index" 179 + checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" 180 + 181 + [[package]] 68 182 name = "byteview" 69 183 version = "0.10.0" 70 184 source = "registry+https://github.com/rust-lang/crates.io-index" 71 185 checksum = "dda4398f387cc6395a3e93b3867cd9abda914c97a0b344d1eefb2e5c51785fca" 186 + 187 + [[package]] 188 + name = "cast" 189 + version = "0.3.0" 190 + source = "registry+https://github.com/rust-lang/crates.io-index" 191 + checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" 192 + 193 + [[package]] 194 + name = "cbor4ii" 195 + version = "0.2.14" 196 + source = "registry+https://github.com/rust-lang/crates.io-index" 197 + checksum = "b544cf8c89359205f4f990d0e6f3828db42df85b5dac95d09157a250eb0749c4" 198 + dependencies = [ 199 + "serde", 200 + ] 72 201 73 202 [[package]] 74 203 name = "cfg-if" ··· 77 206 checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" 78 207 79 208 [[package]] 209 + name = "ciborium" 210 + version = "0.2.2" 211 + source = "registry+https://github.com/rust-lang/crates.io-index" 212 + checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" 213 + dependencies = [ 214 + "ciborium-io", 215 + "ciborium-ll", 216 + "serde", 217 + ] 218 + 219 + [[package]] 220 + name = "ciborium-io" 221 + version = "0.2.2" 222 + source = "registry+https://github.com/rust-lang/crates.io-index" 223 + checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" 224 + 225 + [[package]] 226 + name = "ciborium-ll" 227 + version = "0.2.2" 228 + source = "registry+https://github.com/rust-lang/crates.io-index" 229 + checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" 230 + dependencies = [ 231 + "ciborium-io", 232 + "half", 233 + ] 234 + 235 + [[package]] 236 + name = "cid" 237 + version = "0.11.1" 238 + source = "registry+https://github.com/rust-lang/crates.io-index" 239 + checksum = "3147d8272e8fa0ccd29ce51194dd98f79ddfb8191ba9e3409884e751798acf3a" 240 + dependencies = [ 241 + "core2", 242 + "multibase", 243 + "multihash", 244 + "serde", 245 + "serde_bytes", 246 + "unsigned-varint 0.8.0", 247 + ] 248 + 249 + [[package]] 80 250 name = "clap" 81 251 version = "4.5.48" 82 252 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 107 277 "heck", 108 278 "proc-macro2", 109 279 "quote", 110 - "syn", 280 + "syn 2.0.106", 111 281 ] 112 282 113 283 [[package]] ··· 129 299 checksum = "ea0095f6103c2a8b44acd6fd15960c801dafebf02e21940360833e0673f48ba7" 130 300 131 301 [[package]] 302 + name = "const-str" 303 + version = "0.4.3" 304 + source = "registry+https://github.com/rust-lang/crates.io-index" 305 + checksum = "2f421161cb492475f1661ddc9815a745a1c894592070661180fdec3d4872e9c3" 306 + 307 + [[package]] 308 + name = "core2" 309 + version = "0.4.0" 310 + source = "registry+https://github.com/rust-lang/crates.io-index" 311 + checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" 312 + dependencies = [ 313 + "memchr", 314 + ] 315 + 316 + [[package]] 317 + name = "cpufeatures" 318 + version = "0.2.17" 319 + source = "registry+https://github.com/rust-lang/crates.io-index" 320 + checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" 321 + dependencies = [ 322 + "libc", 323 + ] 324 + 325 + [[package]] 326 + name = "criterion" 327 + version = "0.7.0" 328 + source = "registry+https://github.com/rust-lang/crates.io-index" 329 + checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928" 330 + dependencies = [ 331 + "anes", 332 + "cast", 333 + "ciborium", 334 + "clap", 335 + "criterion-plot", 336 + "itertools", 337 + "num-traits", 338 + "oorandom", 339 + "plotters", 340 + "rayon", 341 + "regex", 342 + "serde", 343 + "serde_json", 344 + "tinytemplate", 345 + "tokio", 346 + "walkdir", 347 + ] 348 + 349 + [[package]] 350 + name = "criterion-plot" 351 + version = "0.6.0" 352 + source = "registry+https://github.com/rust-lang/crates.io-index" 353 + checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338" 354 + dependencies = [ 355 + "cast", 356 + "itertools", 357 + ] 358 + 359 + [[package]] 360 + name = "crossbeam-deque" 361 + version = "0.8.6" 362 + source = "registry+https://github.com/rust-lang/crates.io-index" 363 + checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" 364 + dependencies = [ 365 + "crossbeam-epoch", 366 + "crossbeam-utils", 367 + ] 368 + 369 + [[package]] 132 370 name = "crossbeam-epoch" 133 371 version = "0.9.18" 134 372 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 154 392 checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" 155 393 156 394 [[package]] 395 + name = "crunchy" 396 + version = "0.2.4" 397 + source = "registry+https://github.com/rust-lang/crates.io-index" 398 + checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" 399 + 400 + [[package]] 401 + name = "crypto-common" 402 + version = "0.1.6" 403 + source = "registry+https://github.com/rust-lang/crates.io-index" 404 + checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" 405 + dependencies = [ 406 + "generic-array", 407 + "typenum", 408 + ] 409 + 410 + [[package]] 157 411 name = "dashmap" 158 412 version = "6.1.0" 159 413 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 168 422 ] 169 423 170 424 [[package]] 425 + name = "data-encoding" 426 + version = "2.9.0" 427 + source = "registry+https://github.com/rust-lang/crates.io-index" 428 + checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" 429 + 430 + [[package]] 431 + name = "data-encoding-macro" 432 + version = "0.1.18" 433 + source = "registry+https://github.com/rust-lang/crates.io-index" 434 + checksum = "47ce6c96ea0102f01122a185683611bd5ac8d99e62bc59dd12e6bda344ee673d" 435 + dependencies = [ 436 + "data-encoding", 437 + "data-encoding-macro-internal", 438 + ] 439 + 440 + [[package]] 441 + name = "data-encoding-macro-internal" 442 + version = "0.1.16" 443 + source = "registry+https://github.com/rust-lang/crates.io-index" 444 + checksum = "8d162beedaa69905488a8da94f5ac3edb4dd4788b732fadb7bd120b2625c1976" 445 + dependencies = [ 446 + "data-encoding", 447 + "syn 2.0.106", 448 + ] 449 + 450 + [[package]] 451 + name = "digest" 452 + version = "0.10.7" 453 + source = "registry+https://github.com/rust-lang/crates.io-index" 454 + checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" 455 + dependencies = [ 456 + "block-buffer", 457 + "crypto-common", 458 + ] 459 + 460 + [[package]] 461 + name = "either" 462 + version = "1.15.0" 463 + source = "registry+https://github.com/rust-lang/crates.io-index" 464 + checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" 465 + 466 + [[package]] 171 467 name = "enum_dispatch" 172 468 version = "0.3.13" 173 469 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 176 472 "once_cell", 177 473 "proc-macro2", 178 474 "quote", 179 - "syn", 475 + "syn 2.0.106", 476 + ] 477 + 478 + [[package]] 479 + name = "env_filter" 480 + version = "0.1.3" 481 + source = "registry+https://github.com/rust-lang/crates.io-index" 482 + checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" 483 + dependencies = [ 484 + "log", 485 + "regex", 486 + ] 487 + 488 + [[package]] 489 + name = "env_logger" 490 + version = "0.11.8" 491 + source = "registry+https://github.com/rust-lang/crates.io-index" 492 + checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" 493 + dependencies = [ 494 + "anstream", 495 + "anstyle", 496 + "env_filter", 497 + "jiff", 498 + "log", 180 499 ] 181 500 182 501 [[package]] ··· 192 511 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" 193 512 dependencies = [ 194 513 "libc", 195 - "windows-sys", 514 + "windows-sys 0.60.2", 196 515 ] 197 516 198 517 [[package]] ··· 213 532 "flume", 214 533 "log", 215 534 "lsm-tree", 216 - "lz4_flex", 217 535 "tempfile", 218 536 "xxhash-rust", 219 537 ] ··· 228 546 ] 229 547 230 548 [[package]] 549 + name = "futures" 550 + version = "0.3.31" 551 + source = "registry+https://github.com/rust-lang/crates.io-index" 552 + checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" 553 + dependencies = [ 554 + "futures-channel", 555 + "futures-core", 556 + "futures-executor", 557 + "futures-io", 558 + "futures-sink", 559 + "futures-task", 560 + "futures-util", 561 + ] 562 + 563 + [[package]] 564 + name = "futures-channel" 565 + version = "0.3.31" 566 + source = "registry+https://github.com/rust-lang/crates.io-index" 567 + checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" 568 + dependencies = [ 569 + "futures-core", 570 + "futures-sink", 571 + ] 572 + 573 + [[package]] 574 + name = "futures-core" 575 + version = "0.3.31" 576 + source = "registry+https://github.com/rust-lang/crates.io-index" 577 + checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" 578 + 579 + [[package]] 580 + name = "futures-executor" 581 + version = "0.3.31" 582 + source = "registry+https://github.com/rust-lang/crates.io-index" 583 + checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" 584 + dependencies = [ 585 + "futures-core", 586 + "futures-task", 587 + "futures-util", 588 + ] 589 + 590 + [[package]] 591 + name = "futures-io" 592 + version = "0.3.31" 593 + source = "registry+https://github.com/rust-lang/crates.io-index" 594 + checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" 595 + 596 + [[package]] 597 + name = "futures-macro" 598 + version = "0.3.31" 599 + source = "registry+https://github.com/rust-lang/crates.io-index" 600 + checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" 601 + dependencies = [ 602 + "proc-macro2", 603 + "quote", 604 + "syn 2.0.106", 605 + ] 606 + 607 + [[package]] 608 + name = "futures-sink" 609 + version = "0.3.31" 610 + source = "registry+https://github.com/rust-lang/crates.io-index" 611 + checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" 612 + 613 + [[package]] 614 + name = "futures-task" 615 + version = "0.3.31" 616 + source = "registry+https://github.com/rust-lang/crates.io-index" 617 + checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" 618 + 619 + [[package]] 620 + name = "futures-util" 621 + version = "0.3.31" 622 + source = "registry+https://github.com/rust-lang/crates.io-index" 623 + checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" 624 + dependencies = [ 625 + "futures-channel", 626 + "futures-core", 627 + "futures-io", 628 + "futures-macro", 629 + "futures-sink", 630 + "futures-task", 631 + "memchr", 632 + "pin-project-lite", 633 + "pin-utils", 634 + "slab", 635 + ] 636 + 637 + [[package]] 638 + name = "generic-array" 639 + version = "0.14.9" 640 + source = "registry+https://github.com/rust-lang/crates.io-index" 641 + checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" 642 + dependencies = [ 643 + "typenum", 644 + "version_check", 645 + ] 646 + 647 + [[package]] 231 648 name = "getrandom" 232 649 version = "0.3.3" 233 650 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 236 653 "cfg-if", 237 654 "libc", 238 655 "r-efi", 239 - "wasi", 656 + "wasi 0.14.7+wasi-0.2.4", 657 + ] 658 + 659 + [[package]] 660 + name = "gimli" 661 + version = "0.32.3" 662 + source = "registry+https://github.com/rust-lang/crates.io-index" 663 + checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" 664 + 665 + [[package]] 666 + name = "half" 667 + version = "2.7.0" 668 + source = "registry+https://github.com/rust-lang/crates.io-index" 669 + checksum = "e54c115d4f30f52c67202f079c5f9d8b49db4691f460fdb0b4c2e838261b2ba5" 670 + dependencies = [ 671 + "cfg-if", 672 + "crunchy", 673 + "zerocopy", 240 674 ] 241 675 242 676 [[package]] ··· 267 701 ] 268 702 269 703 [[package]] 704 + name = "io-uring" 705 + version = "0.7.10" 706 + source = "registry+https://github.com/rust-lang/crates.io-index" 707 + checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" 708 + dependencies = [ 709 + "bitflags", 710 + "cfg-if", 711 + "libc", 712 + ] 713 + 714 + [[package]] 715 + name = "ipld-core" 716 + version = "0.4.2" 717 + source = "registry+https://github.com/rust-lang/crates.io-index" 718 + checksum = "104718b1cc124d92a6d01ca9c9258a7df311405debb3408c445a36452f9bf8db" 719 + dependencies = [ 720 + "cid", 721 + "serde", 722 + "serde_bytes", 723 + ] 724 + 725 + [[package]] 726 + name = "iroh-car" 727 + version = "0.5.1" 728 + source = "registry+https://github.com/rust-lang/crates.io-index" 729 + checksum = "cb7f8cd4cb9aa083fba8b52e921764252d0b4dcb1cd6d120b809dbfe1106e81a" 730 + dependencies = [ 731 + "anyhow", 732 + "cid", 733 + "futures", 734 + "serde", 735 + "serde_ipld_dagcbor", 736 + "thiserror 1.0.69", 737 + "tokio", 738 + "unsigned-varint 0.7.2", 739 + ] 740 + 741 + [[package]] 270 742 name = "is_terminal_polyfill" 271 743 version = "1.70.1" 272 744 source = "registry+https://github.com/rust-lang/crates.io-index" 273 745 checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" 274 746 275 747 [[package]] 748 + name = "itertools" 749 + version = "0.13.0" 750 + source = "registry+https://github.com/rust-lang/crates.io-index" 751 + checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" 752 + dependencies = [ 753 + "either", 754 + ] 755 + 756 + [[package]] 757 + name = "itoa" 758 + version = "1.0.15" 759 + source = "registry+https://github.com/rust-lang/crates.io-index" 760 + checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 761 + 762 + [[package]] 763 + name = "jiff" 764 + version = "0.2.15" 765 + source = "registry+https://github.com/rust-lang/crates.io-index" 766 + checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" 767 + dependencies = [ 768 + "jiff-static", 769 + "log", 770 + "portable-atomic", 771 + "portable-atomic-util", 772 + "serde", 773 + ] 774 + 775 + [[package]] 776 + name = "jiff-static" 777 + version = "0.2.15" 778 + source = "registry+https://github.com/rust-lang/crates.io-index" 779 + checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" 780 + dependencies = [ 781 + "proc-macro2", 782 + "quote", 783 + "syn 2.0.106", 784 + ] 785 + 786 + [[package]] 787 + name = "js-sys" 788 + version = "0.3.81" 789 + source = "registry+https://github.com/rust-lang/crates.io-index" 790 + checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" 791 + dependencies = [ 792 + "once_cell", 793 + "wasm-bindgen", 794 + ] 795 + 796 + [[package]] 276 797 name = "libc" 277 798 version = "0.2.176" 278 799 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 311 832 "enum_dispatch", 312 833 "interval-heap", 313 834 "log", 314 - "lz4_flex", 315 835 "quick_cache", 316 836 "rustc-hash", 317 837 "self_cell", ··· 322 842 ] 323 843 324 844 [[package]] 325 - name = "lz4_flex" 326 - version = "0.11.5" 845 + name = "match-lookup" 846 + version = "0.1.1" 847 + source = "registry+https://github.com/rust-lang/crates.io-index" 848 + checksum = "1265724d8cb29dbbc2b0f06fffb8bf1a8c0cf73a78eede9ba73a4a66c52a981e" 849 + dependencies = [ 850 + "proc-macro2", 851 + "quote", 852 + "syn 1.0.109", 853 + ] 854 + 855 + [[package]] 856 + name = "memchr" 857 + version = "2.7.6" 858 + source = "registry+https://github.com/rust-lang/crates.io-index" 859 + checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" 860 + 861 + [[package]] 862 + name = "miniz_oxide" 863 + version = "0.8.9" 864 + source = "registry+https://github.com/rust-lang/crates.io-index" 865 + checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" 866 + dependencies = [ 867 + "adler2", 868 + ] 869 + 870 + [[package]] 871 + name = "mio" 872 + version = "1.0.4" 873 + source = "registry+https://github.com/rust-lang/crates.io-index" 874 + checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" 875 + dependencies = [ 876 + "libc", 877 + "wasi 0.11.1+wasi-snapshot-preview1", 878 + "windows-sys 0.59.0", 879 + ] 880 + 881 + [[package]] 882 + name = "multibase" 883 + version = "0.9.2" 884 + source = "registry+https://github.com/rust-lang/crates.io-index" 885 + checksum = "8694bb4835f452b0e3bb06dbebb1d6fc5385b6ca1caf2e55fd165c042390ec77" 886 + dependencies = [ 887 + "base-x", 888 + "base256emoji", 889 + "data-encoding", 890 + "data-encoding-macro", 891 + ] 892 + 893 + [[package]] 894 + name = "multihash" 895 + version = "0.19.3" 896 + source = "registry+https://github.com/rust-lang/crates.io-index" 897 + checksum = "6b430e7953c29dd6a09afc29ff0bb69c6e306329ee6794700aee27b76a1aea8d" 898 + dependencies = [ 899 + "core2", 900 + "serde", 901 + "unsigned-varint 0.8.0", 902 + ] 903 + 904 + [[package]] 905 + name = "num-traits" 906 + version = "0.2.19" 327 907 source = "registry+https://github.com/rust-lang/crates.io-index" 328 - checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" 908 + checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 329 909 dependencies = [ 330 - "twox-hash", 910 + "autocfg", 911 + ] 912 + 913 + [[package]] 914 + name = "object" 915 + version = "0.37.3" 916 + source = "registry+https://github.com/rust-lang/crates.io-index" 917 + checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" 918 + dependencies = [ 919 + "memchr", 331 920 ] 332 921 333 922 [[package]] ··· 343 932 checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" 344 933 345 934 [[package]] 935 + name = "oorandom" 936 + version = "11.1.5" 937 + source = "registry+https://github.com/rust-lang/crates.io-index" 938 + checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" 939 + 940 + [[package]] 941 + name = "parking_lot" 942 + version = "0.12.5" 943 + source = "registry+https://github.com/rust-lang/crates.io-index" 944 + checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" 945 + dependencies = [ 946 + "lock_api", 947 + "parking_lot_core", 948 + ] 949 + 950 + [[package]] 346 951 name = "parking_lot_core" 347 952 version = "0.9.12" 348 953 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 356 961 ] 357 962 358 963 [[package]] 964 + name = "pin-project-lite" 965 + version = "0.2.16" 966 + source = "registry+https://github.com/rust-lang/crates.io-index" 967 + checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" 968 + 969 + [[package]] 970 + name = "pin-utils" 971 + version = "0.1.0" 972 + source = "registry+https://github.com/rust-lang/crates.io-index" 973 + checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" 974 + 975 + [[package]] 976 + name = "plotters" 977 + version = "0.3.7" 978 + source = "registry+https://github.com/rust-lang/crates.io-index" 979 + checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" 980 + dependencies = [ 981 + "num-traits", 982 + "plotters-backend", 983 + "plotters-svg", 984 + "wasm-bindgen", 985 + "web-sys", 986 + ] 987 + 988 + [[package]] 989 + name = "plotters-backend" 990 + version = "0.3.7" 991 + source = "registry+https://github.com/rust-lang/crates.io-index" 992 + checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" 993 + 994 + [[package]] 995 + name = "plotters-svg" 996 + version = "0.3.7" 997 + source = "registry+https://github.com/rust-lang/crates.io-index" 998 + checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" 999 + dependencies = [ 1000 + "plotters-backend", 1001 + ] 1002 + 1003 + [[package]] 1004 + name = "portable-atomic" 1005 + version = "1.11.1" 1006 + source = "registry+https://github.com/rust-lang/crates.io-index" 1007 + checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" 1008 + 1009 + [[package]] 1010 + name = "portable-atomic-util" 1011 + version = "0.2.4" 1012 + source = "registry+https://github.com/rust-lang/crates.io-index" 1013 + checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" 1014 + dependencies = [ 1015 + "portable-atomic", 1016 + ] 1017 + 1018 + [[package]] 359 1019 name = "proc-macro2" 360 1020 version = "1.0.101" 361 1021 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 390 1050 checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" 391 1051 392 1052 [[package]] 1053 + name = "rayon" 1054 + version = "1.11.0" 1055 + source = "registry+https://github.com/rust-lang/crates.io-index" 1056 + checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" 1057 + dependencies = [ 1058 + "either", 1059 + "rayon-core", 1060 + ] 1061 + 1062 + [[package]] 1063 + name = "rayon-core" 1064 + version = "1.13.0" 1065 + source = "registry+https://github.com/rust-lang/crates.io-index" 1066 + checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" 1067 + dependencies = [ 1068 + "crossbeam-deque", 1069 + "crossbeam-utils", 1070 + ] 1071 + 1072 + [[package]] 393 1073 name = "redox_syscall" 394 1074 version = "0.5.18" 395 1075 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 399 1079 ] 400 1080 401 1081 [[package]] 1082 + name = "regex" 1083 + version = "1.11.3" 1084 + source = "registry+https://github.com/rust-lang/crates.io-index" 1085 + checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c" 1086 + dependencies = [ 1087 + "aho-corasick", 1088 + "memchr", 1089 + "regex-automata", 1090 + "regex-syntax", 1091 + ] 1092 + 1093 + [[package]] 1094 + name = "regex-automata" 1095 + version = "0.4.11" 1096 + source = "registry+https://github.com/rust-lang/crates.io-index" 1097 + checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad" 1098 + dependencies = [ 1099 + "aho-corasick", 1100 + "memchr", 1101 + "regex-syntax", 1102 + ] 1103 + 1104 + [[package]] 1105 + name = "regex-syntax" 1106 + version = "0.8.6" 1107 + source = "registry+https://github.com/rust-lang/crates.io-index" 1108 + checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" 1109 + 1110 + [[package]] 402 1111 name = "repo-stream" 403 1112 version = "0.2.2" 404 1113 dependencies = [ 1114 + "bincode", 405 1115 "clap", 1116 + "criterion", 1117 + "env_logger", 406 1118 "fjall", 1119 + "futures", 1120 + "futures-core", 1121 + "ipld-core", 1122 + "iroh-car", 1123 + "log", 1124 + "multibase", 1125 + "serde", 1126 + "serde_bytes", 1127 + "serde_ipld_dagcbor", 1128 + "sha2", 1129 + "tempfile", 1130 + "thiserror 2.0.17", 1131 + "tokio", 407 1132 ] 408 1133 409 1134 [[package]] 1135 + name = "rustc-demangle" 1136 + version = "0.1.26" 1137 + source = "registry+https://github.com/rust-lang/crates.io-index" 1138 + checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" 1139 + 1140 + [[package]] 410 1141 name = "rustc-hash" 411 1142 version = "2.1.1" 412 1143 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 422 1153 "errno", 423 1154 "libc", 424 1155 "linux-raw-sys", 425 - "windows-sys", 1156 + "windows-sys 0.60.2", 1157 + ] 1158 + 1159 + [[package]] 1160 + name = "rustversion" 1161 + version = "1.0.22" 1162 + source = "registry+https://github.com/rust-lang/crates.io-index" 1163 + checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" 1164 + 1165 + [[package]] 1166 + name = "ryu" 1167 + version = "1.0.20" 1168 + source = "registry+https://github.com/rust-lang/crates.io-index" 1169 + checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 1170 + 1171 + [[package]] 1172 + name = "same-file" 1173 + version = "1.0.6" 1174 + source = "registry+https://github.com/rust-lang/crates.io-index" 1175 + checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" 1176 + dependencies = [ 1177 + "winapi-util", 426 1178 ] 427 1179 428 1180 [[package]] ··· 438 1190 checksum = "b12e76d157a900eb52e81bc6e9f3069344290341720e9178cde2407113ac8d89" 439 1191 440 1192 [[package]] 1193 + name = "serde" 1194 + version = "1.0.228" 1195 + source = "registry+https://github.com/rust-lang/crates.io-index" 1196 + checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" 1197 + dependencies = [ 1198 + "serde_core", 1199 + "serde_derive", 1200 + ] 1201 + 1202 + [[package]] 1203 + name = "serde_bytes" 1204 + version = "0.11.19" 1205 + source = "registry+https://github.com/rust-lang/crates.io-index" 1206 + checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" 1207 + dependencies = [ 1208 + "serde", 1209 + "serde_core", 1210 + ] 1211 + 1212 + [[package]] 1213 + name = "serde_core" 1214 + version = "1.0.228" 1215 + source = "registry+https://github.com/rust-lang/crates.io-index" 1216 + checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" 1217 + dependencies = [ 1218 + "serde_derive", 1219 + ] 1220 + 1221 + [[package]] 1222 + name = "serde_derive" 1223 + version = "1.0.228" 1224 + source = "registry+https://github.com/rust-lang/crates.io-index" 1225 + checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" 1226 + dependencies = [ 1227 + "proc-macro2", 1228 + "quote", 1229 + "syn 2.0.106", 1230 + ] 1231 + 1232 + [[package]] 1233 + name = "serde_ipld_dagcbor" 1234 + version = "0.6.4" 1235 + source = "registry+https://github.com/rust-lang/crates.io-index" 1236 + checksum = "46182f4f08349a02b45c998ba3215d3f9de826246ba02bb9dddfe9a2a2100778" 1237 + dependencies = [ 1238 + "cbor4ii", 1239 + "ipld-core", 1240 + "scopeguard", 1241 + "serde", 1242 + ] 1243 + 1244 + [[package]] 1245 + name = "serde_json" 1246 + version = "1.0.145" 1247 + source = "registry+https://github.com/rust-lang/crates.io-index" 1248 + checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" 1249 + dependencies = [ 1250 + "itoa", 1251 + "memchr", 1252 + "ryu", 1253 + "serde", 1254 + "serde_core", 1255 + ] 1256 + 1257 + [[package]] 441 1258 name = "sfa" 442 1259 version = "1.0.0" 443 1260 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 449 1266 ] 450 1267 451 1268 [[package]] 1269 + name = "sha2" 1270 + version = "0.10.9" 1271 + source = "registry+https://github.com/rust-lang/crates.io-index" 1272 + checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" 1273 + dependencies = [ 1274 + "cfg-if", 1275 + "cpufeatures", 1276 + "digest", 1277 + ] 1278 + 1279 + [[package]] 1280 + name = "signal-hook-registry" 1281 + version = "1.4.6" 1282 + source = "registry+https://github.com/rust-lang/crates.io-index" 1283 + checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" 1284 + dependencies = [ 1285 + "libc", 1286 + ] 1287 + 1288 + [[package]] 1289 + name = "slab" 1290 + version = "0.4.11" 1291 + source = "registry+https://github.com/rust-lang/crates.io-index" 1292 + checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" 1293 + 1294 + [[package]] 452 1295 name = "smallvec" 453 1296 version = "1.15.1" 454 1297 source = "registry+https://github.com/rust-lang/crates.io-index" 455 1298 checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" 456 1299 457 1300 [[package]] 1301 + name = "socket2" 1302 + version = "0.6.0" 1303 + source = "registry+https://github.com/rust-lang/crates.io-index" 1304 + checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" 1305 + dependencies = [ 1306 + "libc", 1307 + "windows-sys 0.59.0", 1308 + ] 1309 + 1310 + [[package]] 458 1311 name = "spin" 459 1312 version = "0.9.8" 460 1313 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 471 1324 472 1325 [[package]] 473 1326 name = "syn" 1327 + version = "1.0.109" 1328 + source = "registry+https://github.com/rust-lang/crates.io-index" 1329 + checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" 1330 + dependencies = [ 1331 + "proc-macro2", 1332 + "quote", 1333 + "unicode-ident", 1334 + ] 1335 + 1336 + [[package]] 1337 + name = "syn" 474 1338 version = "2.0.106" 475 1339 source = "registry+https://github.com/rust-lang/crates.io-index" 476 1340 checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" ··· 490 1354 "getrandom", 491 1355 "once_cell", 492 1356 "rustix", 493 - "windows-sys", 1357 + "windows-sys 0.60.2", 1358 + ] 1359 + 1360 + [[package]] 1361 + name = "thiserror" 1362 + version = "1.0.69" 1363 + source = "registry+https://github.com/rust-lang/crates.io-index" 1364 + checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" 1365 + dependencies = [ 1366 + "thiserror-impl 1.0.69", 1367 + ] 1368 + 1369 + [[package]] 1370 + name = "thiserror" 1371 + version = "2.0.17" 1372 + source = "registry+https://github.com/rust-lang/crates.io-index" 1373 + checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" 1374 + dependencies = [ 1375 + "thiserror-impl 2.0.17", 1376 + ] 1377 + 1378 + [[package]] 1379 + name = "thiserror-impl" 1380 + version = "1.0.69" 1381 + source = "registry+https://github.com/rust-lang/crates.io-index" 1382 + checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" 1383 + dependencies = [ 1384 + "proc-macro2", 1385 + "quote", 1386 + "syn 2.0.106", 494 1387 ] 495 1388 496 1389 [[package]] 497 - name = "twox-hash" 498 - version = "2.1.2" 1390 + name = "thiserror-impl" 1391 + version = "2.0.17" 499 1392 source = "registry+https://github.com/rust-lang/crates.io-index" 500 - checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" 1393 + checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" 1394 + dependencies = [ 1395 + "proc-macro2", 1396 + "quote", 1397 + "syn 2.0.106", 1398 + ] 1399 + 1400 + [[package]] 1401 + name = "tinytemplate" 1402 + version = "1.2.1" 1403 + source = "registry+https://github.com/rust-lang/crates.io-index" 1404 + checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" 1405 + dependencies = [ 1406 + "serde", 1407 + "serde_json", 1408 + ] 1409 + 1410 + [[package]] 1411 + name = "tokio" 1412 + version = "1.47.1" 1413 + source = "registry+https://github.com/rust-lang/crates.io-index" 1414 + checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" 1415 + dependencies = [ 1416 + "backtrace", 1417 + "bytes", 1418 + "io-uring", 1419 + "libc", 1420 + "mio", 1421 + "parking_lot", 1422 + "pin-project-lite", 1423 + "signal-hook-registry", 1424 + "slab", 1425 + "socket2", 1426 + "tokio-macros", 1427 + "windows-sys 0.59.0", 1428 + ] 1429 + 1430 + [[package]] 1431 + name = "tokio-macros" 1432 + version = "2.5.0" 1433 + source = "registry+https://github.com/rust-lang/crates.io-index" 1434 + checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" 1435 + dependencies = [ 1436 + "proc-macro2", 1437 + "quote", 1438 + "syn 2.0.106", 1439 + ] 1440 + 1441 + [[package]] 1442 + name = "typenum" 1443 + version = "1.19.0" 1444 + source = "registry+https://github.com/rust-lang/crates.io-index" 1445 + checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" 501 1446 502 1447 [[package]] 503 1448 name = "unicode-ident" ··· 506 1451 checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" 507 1452 508 1453 [[package]] 1454 + name = "unsigned-varint" 1455 + version = "0.7.2" 1456 + source = "registry+https://github.com/rust-lang/crates.io-index" 1457 + checksum = "6889a77d49f1f013504cec6bf97a2c730394adedaeb1deb5ea08949a50541105" 1458 + 1459 + [[package]] 1460 + name = "unsigned-varint" 1461 + version = "0.8.0" 1462 + source = "registry+https://github.com/rust-lang/crates.io-index" 1463 + checksum = "eb066959b24b5196ae73cb057f45598450d2c5f71460e98c49b738086eff9c06" 1464 + 1465 + [[package]] 1466 + name = "unty" 1467 + version = "0.0.4" 1468 + source = "registry+https://github.com/rust-lang/crates.io-index" 1469 + checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" 1470 + 1471 + [[package]] 509 1472 name = "utf8parse" 510 1473 version = "0.2.2" 511 1474 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 516 1479 version = "2.2.0" 517 1480 source = "registry+https://github.com/rust-lang/crates.io-index" 518 1481 checksum = "8f54a172d0620933a27a4360d3db3e2ae0dd6cceae9730751a036bbf182c4b23" 1482 + 1483 + [[package]] 1484 + name = "version_check" 1485 + version = "0.9.5" 1486 + source = "registry+https://github.com/rust-lang/crates.io-index" 1487 + checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" 1488 + 1489 + [[package]] 1490 + name = "virtue" 1491 + version = "0.0.18" 1492 + source = "registry+https://github.com/rust-lang/crates.io-index" 1493 + checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" 1494 + 1495 + [[package]] 1496 + name = "walkdir" 1497 + version = "2.5.0" 1498 + source = "registry+https://github.com/rust-lang/crates.io-index" 1499 + checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" 1500 + dependencies = [ 1501 + "same-file", 1502 + "winapi-util", 1503 + ] 1504 + 1505 + [[package]] 1506 + name = "wasi" 1507 + version = "0.11.1+wasi-snapshot-preview1" 1508 + source = "registry+https://github.com/rust-lang/crates.io-index" 1509 + checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" 519 1510 520 1511 [[package]] 521 1512 name = "wasi" ··· 536 1527 ] 537 1528 538 1529 [[package]] 1530 + name = "wasm-bindgen" 1531 + version = "0.2.104" 1532 + source = "registry+https://github.com/rust-lang/crates.io-index" 1533 + checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" 1534 + dependencies = [ 1535 + "cfg-if", 1536 + "once_cell", 1537 + "rustversion", 1538 + "wasm-bindgen-macro", 1539 + "wasm-bindgen-shared", 1540 + ] 1541 + 1542 + [[package]] 1543 + name = "wasm-bindgen-backend" 1544 + version = "0.2.104" 1545 + source = "registry+https://github.com/rust-lang/crates.io-index" 1546 + checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" 1547 + dependencies = [ 1548 + "bumpalo", 1549 + "log", 1550 + "proc-macro2", 1551 + "quote", 1552 + "syn 2.0.106", 1553 + "wasm-bindgen-shared", 1554 + ] 1555 + 1556 + [[package]] 1557 + name = "wasm-bindgen-macro" 1558 + version = "0.2.104" 1559 + source = "registry+https://github.com/rust-lang/crates.io-index" 1560 + checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" 1561 + dependencies = [ 1562 + "quote", 1563 + "wasm-bindgen-macro-support", 1564 + ] 1565 + 1566 + [[package]] 1567 + name = "wasm-bindgen-macro-support" 1568 + version = "0.2.104" 1569 + source = "registry+https://github.com/rust-lang/crates.io-index" 1570 + checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" 1571 + dependencies = [ 1572 + "proc-macro2", 1573 + "quote", 1574 + "syn 2.0.106", 1575 + "wasm-bindgen-backend", 1576 + "wasm-bindgen-shared", 1577 + ] 1578 + 1579 + [[package]] 1580 + name = "wasm-bindgen-shared" 1581 + version = "0.2.104" 1582 + source = "registry+https://github.com/rust-lang/crates.io-index" 1583 + checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" 1584 + dependencies = [ 1585 + "unicode-ident", 1586 + ] 1587 + 1588 + [[package]] 1589 + name = "web-sys" 1590 + version = "0.3.81" 1591 + source = "registry+https://github.com/rust-lang/crates.io-index" 1592 + checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" 1593 + dependencies = [ 1594 + "js-sys", 1595 + "wasm-bindgen", 1596 + ] 1597 + 1598 + [[package]] 1599 + name = "winapi-util" 1600 + version = "0.1.11" 1601 + source = "registry+https://github.com/rust-lang/crates.io-index" 1602 + checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" 1603 + dependencies = [ 1604 + "windows-sys 0.60.2", 1605 + ] 1606 + 1607 + [[package]] 539 1608 name = "windows-link" 540 1609 version = "0.2.1" 541 1610 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 543 1612 544 1613 [[package]] 545 1614 name = "windows-sys" 1615 + version = "0.59.0" 1616 + source = "registry+https://github.com/rust-lang/crates.io-index" 1617 + checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 1618 + dependencies = [ 1619 + "windows-targets 0.52.6", 1620 + ] 1621 + 1622 + [[package]] 1623 + name = "windows-sys" 546 1624 version = "0.60.2" 547 1625 source = "registry+https://github.com/rust-lang/crates.io-index" 548 1626 checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" 549 1627 dependencies = [ 550 - "windows-targets", 1628 + "windows-targets 0.53.5", 1629 + ] 1630 + 1631 + [[package]] 1632 + name = "windows-targets" 1633 + version = "0.52.6" 1634 + source = "registry+https://github.com/rust-lang/crates.io-index" 1635 + checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 1636 + dependencies = [ 1637 + "windows_aarch64_gnullvm 0.52.6", 1638 + "windows_aarch64_msvc 0.52.6", 1639 + "windows_i686_gnu 0.52.6", 1640 + "windows_i686_gnullvm 0.52.6", 1641 + "windows_i686_msvc 0.52.6", 1642 + "windows_x86_64_gnu 0.52.6", 1643 + "windows_x86_64_gnullvm 0.52.6", 1644 + "windows_x86_64_msvc 0.52.6", 551 1645 ] 552 1646 553 1647 [[package]] ··· 557 1651 checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" 558 1652 dependencies = [ 559 1653 "windows-link", 560 - "windows_aarch64_gnullvm", 561 - "windows_aarch64_msvc", 562 - "windows_i686_gnu", 563 - "windows_i686_gnullvm", 564 - "windows_i686_msvc", 565 - "windows_x86_64_gnu", 566 - "windows_x86_64_gnullvm", 567 - "windows_x86_64_msvc", 1654 + "windows_aarch64_gnullvm 0.53.1", 1655 + "windows_aarch64_msvc 0.53.1", 1656 + "windows_i686_gnu 0.53.1", 1657 + "windows_i686_gnullvm 0.53.1", 1658 + "windows_i686_msvc 0.53.1", 1659 + "windows_x86_64_gnu 0.53.1", 1660 + "windows_x86_64_gnullvm 0.53.1", 1661 + "windows_x86_64_msvc 0.53.1", 568 1662 ] 569 1663 570 1664 [[package]] 571 1665 name = "windows_aarch64_gnullvm" 1666 + version = "0.52.6" 1667 + source = "registry+https://github.com/rust-lang/crates.io-index" 1668 + checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 1669 + 1670 + [[package]] 1671 + name = "windows_aarch64_gnullvm" 572 1672 version = "0.53.1" 573 1673 source = "registry+https://github.com/rust-lang/crates.io-index" 574 1674 checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" 1675 + 1676 + [[package]] 1677 + name = "windows_aarch64_msvc" 1678 + version = "0.52.6" 1679 + source = "registry+https://github.com/rust-lang/crates.io-index" 1680 + checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 575 1681 576 1682 [[package]] 577 1683 name = "windows_aarch64_msvc" ··· 581 1687 582 1688 [[package]] 583 1689 name = "windows_i686_gnu" 1690 + version = "0.52.6" 1691 + source = "registry+https://github.com/rust-lang/crates.io-index" 1692 + checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 1693 + 1694 + [[package]] 1695 + name = "windows_i686_gnu" 584 1696 version = "0.53.1" 585 1697 source = "registry+https://github.com/rust-lang/crates.io-index" 586 1698 checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" 587 1699 588 1700 [[package]] 589 1701 name = "windows_i686_gnullvm" 1702 + version = "0.52.6" 1703 + source = "registry+https://github.com/rust-lang/crates.io-index" 1704 + checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 1705 + 1706 + [[package]] 1707 + name = "windows_i686_gnullvm" 590 1708 version = "0.53.1" 591 1709 source = "registry+https://github.com/rust-lang/crates.io-index" 592 1710 checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" 593 1711 594 1712 [[package]] 595 1713 name = "windows_i686_msvc" 1714 + version = "0.52.6" 1715 + source = "registry+https://github.com/rust-lang/crates.io-index" 1716 + checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 1717 + 1718 + [[package]] 1719 + name = "windows_i686_msvc" 596 1720 version = "0.53.1" 597 1721 source = "registry+https://github.com/rust-lang/crates.io-index" 598 1722 checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" 599 1723 600 1724 [[package]] 601 1725 name = "windows_x86_64_gnu" 1726 + version = "0.52.6" 1727 + source = "registry+https://github.com/rust-lang/crates.io-index" 1728 + checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 1729 + 1730 + [[package]] 1731 + name = "windows_x86_64_gnu" 602 1732 version = "0.53.1" 603 1733 source = "registry+https://github.com/rust-lang/crates.io-index" 604 1734 checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" 605 1735 606 1736 [[package]] 607 1737 name = "windows_x86_64_gnullvm" 1738 + version = "0.52.6" 1739 + source = "registry+https://github.com/rust-lang/crates.io-index" 1740 + checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 1741 + 1742 + [[package]] 1743 + name = "windows_x86_64_gnullvm" 608 1744 version = "0.53.1" 609 1745 source = "registry+https://github.com/rust-lang/crates.io-index" 610 1746 checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" 1747 + 1748 + [[package]] 1749 + name = "windows_x86_64_msvc" 1750 + version = "0.52.6" 1751 + source = "registry+https://github.com/rust-lang/crates.io-index" 1752 + checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 611 1753 612 1754 [[package]] 613 1755 name = "windows_x86_64_msvc" ··· 626 1768 version = "0.8.15" 627 1769 source = "registry+https://github.com/rust-lang/crates.io-index" 628 1770 checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" 1771 + 1772 + [[package]] 1773 + name = "zerocopy" 1774 + version = "0.8.27" 1775 + source = "registry+https://github.com/rust-lang/crates.io-index" 1776 + checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" 1777 + dependencies = [ 1778 + "zerocopy-derive", 1779 + ] 1780 + 1781 + [[package]] 1782 + name = "zerocopy-derive" 1783 + version = "0.8.27" 1784 + source = "registry+https://github.com/rust-lang/crates.io-index" 1785 + checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" 1786 + dependencies = [ 1787 + "proc-macro2", 1788 + "quote", 1789 + "syn 2.0.106", 1790 + ]
+35 -1
Cargo.toml
··· 7 7 repository = "https://tangled.org/@microcosm.blue/repo-stream" 8 8 9 9 [dependencies] 10 - fjall = "3.0.1" 10 + bincode = { version = "2.0.1", features = ["serde"] } 11 + fjall = { version = "3.0.1", default-features = false } 12 + futures = "0.3.31" 13 + futures-core = "0.3.31" 14 + ipld-core = { version = "0.4.2", features = ["serde"] } 15 + iroh-car = "0.5.1" 16 + log = "0.4.28" 17 + multibase = "0.9.2" 18 + serde = { version = "1.0.228", features = ["derive"] } 19 + serde_bytes = "0.11.19" 20 + serde_ipld_dagcbor = "0.6.4" 21 + sha2 = "0.10.9" 22 + thiserror = "2.0.17" 23 + tokio = { version = "1.47.1", features = ["rt", "sync"] } 24 + 25 + [dev-dependencies] 11 26 clap = { version = "4.5.48", features = ["derive"] } 27 + criterion = { version = "0.7.0", features = ["async_tokio"] } 28 + env_logger = "0.11.8" 29 + multibase = "0.9.2" 30 + tempfile = "3.23.0" 31 + tokio = { version = "1.47.1", features = ["full"] } 12 32 33 + [profile.profiling] 34 + inherits = "release" 35 + debug = true 36 + 37 + # [profile.release] 38 + # debug = true 39 + 40 + [[bench]] 41 + name = "non-huge-cars" 42 + harness = false 43 + 44 + [[bench]] 45 + name = "huge-car" 46 + harness = false
+73 -26
examples/disk-read-file/main.rs
··· 1 + /*! 2 + Read a CAR file by spilling to disk 3 + */ 4 + 5 + extern crate repo_stream; 1 6 use clap::Parser; 2 - use fjall::{Database, KeyspaceCreateOptions}; 3 - use std::{path::PathBuf, collections::BTreeMap}; 7 + use repo_stream::{DiskBuilder, Driver, DriverBuilder}; 8 + use std::path::PathBuf; 9 + use std::time::Instant; 4 10 5 11 #[derive(Debug, Parser)] 6 12 struct Args { 7 13 #[arg()] 8 - db_path: PathBuf, 14 + car: PathBuf, 15 + #[arg()] 16 + tmpfile: PathBuf, 9 17 } 10 18 11 - fn main() -> Result<(), Box<dyn std::error::Error>> { 12 - let Args { db_path } = Args::parse(); 19 + #[tokio::main] 20 + async fn main() -> Result<(), Box<dyn std::error::Error>> { 21 + env_logger::init(); 22 + 23 + let Args { car, tmpfile } = Args::parse(); 24 + 25 + // repo-stream takes an AsyncRead as input. wrapping a filesystem read in 26 + // BufReader can provide a really significant performance win. 27 + let reader = tokio::fs::File::open(car).await?; 28 + let reader = tokio::io::BufReader::new(reader); 29 + 30 + log::info!("hello! reading the car..."); 31 + let t0 = Instant::now(); 32 + 33 + // in this example we only bother handling CARs that are too big for memory 34 + // `noop` helper means: do no block processing, store the raw blocks 35 + let driver = match DriverBuilder::new() 36 + .with_mem_limit_mb(32) // how much memory can be used before disk spill 37 + .load_car(reader) 38 + .await? 39 + { 40 + Driver::Memory(_, _) => panic!("try this on a bigger car"), 41 + Driver::Disk(big_stuff) => { 42 + // we reach here if the repo was too big and needs to be spilled to 43 + // disk to continue 44 + 45 + // set up a disk store we can spill to 46 + let disk_store = DiskBuilder::new().open(tmpfile).await?; 47 + 48 + // do the spilling, get back a (similar) driver 49 + let (commit, driver) = big_stuff.finish_loading(disk_store).await?; 50 + 51 + // at this point you might want to fetch the account's signing key 52 + // via the DID from the commit, and then verify the signature. 53 + log::warn!("big's comit ({:?}): {:?}", t0.elapsed(), commit); 54 + 55 + // pop the driver back out to get some code indentation relief 56 + driver 57 + } 58 + }; 13 59 14 - let db = Database::builder(db_path).open()?; 15 - let ks = db.keyspace("z", KeyspaceCreateOptions::default)?; 16 - let mut seen_keys: BTreeMap<Vec<u8>, usize> = BTreeMap::default(); 60 + // collect some random stats about the blocks 61 + let mut n = 0; 62 + let mut zeros = 0; 17 63 18 - print!("writing..."); 19 - for i in 0..250_000_usize { 20 - let k = i.to_be_bytes().to_vec(); 21 - ks.insert(k.clone(), vec![0xAA; 256])?; 22 - seen_keys.insert(k, i); 23 - } 64 + log::info!("walking..."); 24 65 25 - println!(" done. checking keys..."); 66 + // this example uses the disk driver's channel mode: the tree walking is 67 + // spawned onto a blocking thread, and we get chunks of rkey+blocks back 68 + let (mut rx, join) = driver.to_channel(512); 69 + while let Some(r) = rx.recv().await { 70 + let pairs = r?; 26 71 27 - // remove every seen key that fjall actually has, to see what's left 28 - for guard in ks.iter() { 29 - seen_keys.remove(guard.key()?.as_ref()); 30 - } 72 + // keep a count of the total number of blocks seen 73 + n += pairs.len(); 31 74 32 - // report the result 33 - if seen_keys.len() == 0 { 34 - println!("[ OK ] all keys found"); 35 - } else { 36 - println!("[FAIL] fjall did not have all seen_keys:"); 37 - for (k, i) in seen_keys { 38 - println!(" insert #{i} missing, key bytes: {k:?}"); 75 + for (_, block) in pairs { 76 + // for each block, count how many bytes are equal to '0' 77 + // (this is just an example, you probably want to do something more 78 + // interesting) 79 + zeros += block.into_iter().filter(|&b| b == b'0').count() 39 80 } 40 81 } 82 + 83 + log::info!("arrived! ({:?}) joining rx...", t0.elapsed()); 84 + 85 + join.await?; 86 + 87 + log::info!("done. n={n} zeros={zeros}"); 41 88 42 89 Ok(()) 43 90 }
-3
readme.md
··· 50 50 total_size += size; 51 51 } 52 52 } 53 - 54 - // clean up the disk store (drop tables etc) 55 - driver.reset_store().await?; 56 53 } 57 54 }; 58 55 println!("sum of size of all records: {total_size}");
+162
src/disk.rs
··· 1 + /*! 2 + Disk storage for blocks on disk 3 + 4 + Currently this uses sqlite. In testing sqlite wasn't the fastest, but it seemed 5 + to be the best behaved in terms of both on-disk space usage and memory usage. 6 + 7 + ```no_run 8 + # use repo_stream::{DiskBuilder, DiskError}; 9 + # #[tokio::main] 10 + # async fn main() -> Result<(), DiskError> { 11 + let store = DiskBuilder::new() 12 + .with_cache_size_mb(32) 13 + .with_max_stored_mb(1024) // errors when >1GiB of processed blocks are inserted 14 + .open("/some/path.db".into()).await?; 15 + # Ok(()) 16 + # } 17 + ``` 18 + */ 19 + 20 + use crate::drive::DriveError; 21 + use fjall::config::{CompressionPolicy, PinningPolicy, RestartIntervalPolicy}; 22 + use fjall::{CompressionType, Database, Error as FjallError, Keyspace, KeyspaceCreateOptions}; 23 + use std::path::PathBuf; 24 + 25 + #[derive(Debug, thiserror::Error)] 26 + pub enum DiskError { 27 + /// A wrapped database error 28 + /// 29 + /// (The wrapped err should probably be obscured to remove public-facing 30 + /// sqlite bits) 31 + #[error(transparent)] 32 + DbError(#[from] FjallError), 33 + /// A tokio blocking task failed to join 34 + #[error("Failed to join a tokio blocking task: {0}")] 35 + JoinError(#[from] tokio::task::JoinError), 36 + /// The total size of stored blocks exceeded the allowed size 37 + /// 38 + /// If you need to process *really* big CARs, you can configure a higher 39 + /// limit. 40 + #[error("Maximum disk size reached")] 41 + MaxSizeExceeded, 42 + } 43 + 44 + /// Builder-style disk store setup 45 + #[derive(Debug, Clone)] 46 + pub struct DiskBuilder { 47 + /// Database in-memory cache allowance 48 + /// 49 + /// Default: 32 MiB 50 + pub cache_size_mb: usize, 51 + /// Database stored block size limit 52 + /// 53 + /// Default: 10 GiB 54 + /// 55 + /// Note: actual size on disk may be more, but should approximately scale 56 + /// with this limit 57 + pub max_stored_mb: usize, 58 + } 59 + 60 + impl Default for DiskBuilder { 61 + fn default() -> Self { 62 + Self { 63 + cache_size_mb: 64, 64 + max_stored_mb: 10 * 1024, // 10 GiB 65 + } 66 + } 67 + } 68 + 69 + impl DiskBuilder { 70 + /// Begin configuring the storage with defaults 71 + pub fn new() -> Self { 72 + Default::default() 73 + } 74 + /// Set the in-memory cache allowance for the database 75 + /// 76 + /// Default: 64 MiB 77 + pub fn with_cache_size_mb(mut self, size: usize) -> Self { 78 + self.cache_size_mb = size; 79 + self 80 + } 81 + /// Set the approximate stored block size limit 82 + /// 83 + /// Default: 10 GiB 84 + pub fn with_max_stored_mb(mut self, max: usize) -> Self { 85 + self.max_stored_mb = max; 86 + self 87 + } 88 + /// Open and initialize the actual disk storage 89 + pub async fn open(&self, path: PathBuf) -> Result<DiskStore, DiskError> { 90 + DiskStore::new(path, self.cache_size_mb, self.max_stored_mb).await 91 + } 92 + } 93 + 94 + /// On-disk block storage 95 + pub struct DiskStore { 96 + #[allow(unused)] 97 + db: Database, 98 + partition: Keyspace, 99 + max_stored: usize, 100 + stored: usize, 101 + } 102 + 103 + impl DiskStore { 104 + /// Initialize a new disk store 105 + pub async fn new( 106 + path: PathBuf, 107 + cache_mb: usize, 108 + max_stored_mb: usize, 109 + ) -> Result<Self, DiskError> { 110 + let max_stored = max_stored_mb * 2_usize.pow(20); 111 + let (db, partition) = tokio::task::spawn_blocking(move || { 112 + let db = Database::builder(path) 113 + // .manual_journal_persist(true) 114 + // .flush_workers(1) 115 + // .compaction_workers(1) 116 + .journal_compression(CompressionType::None) 117 + .cache_size(cache_mb as u64 * 2_u64.pow(20)) 118 + .temporary(true) 119 + .open()?; 120 + let opts = KeyspaceCreateOptions::default() 121 + .data_block_restart_interval_policy(RestartIntervalPolicy::all(8)) 122 + .filter_block_pinning_policy(PinningPolicy::disabled()) 123 + .expect_point_read_hits(true) 124 + .data_block_compression_policy(CompressionPolicy::disabled()) 125 + .manual_journal_persist(true) 126 + .max_memtable_size(32 * 2_u64.pow(20)); 127 + let partition = db.keyspace("z", || opts)?; 128 + 129 + Ok::<_, DiskError>((db, partition)) 130 + }) 131 + .await??; 132 + 133 + Ok(Self { 134 + db, 135 + partition, 136 + max_stored, 137 + stored: 0, 138 + }) 139 + } 140 + 141 + pub(crate) fn put_many( 142 + &mut self, 143 + kv: impl Iterator<Item = Result<(Vec<u8>, Vec<u8>), DriveError>>, 144 + ) -> Result<(), DriveError> { 145 + let mut batch = self.db.batch(); 146 + for pair in kv { 147 + let (k, v) = pair?; 148 + self.stored += v.len(); 149 + if self.stored > self.max_stored { 150 + return Err(DiskError::MaxSizeExceeded.into()); 151 + } 152 + batch.insert(&self.partition, k, v); 153 + } 154 + batch.commit().map_err(DiskError::DbError)?; 155 + Ok(()) 156 + } 157 + 158 + #[inline] 159 + pub(crate) fn get(&mut self, key: &[u8]) -> Result<Option<fjall::Slice>, FjallError> { 160 + self.partition.get(key) 161 + } 162 + }
+583
src/drive.rs
··· 1 + //! Consume a CAR from an AsyncRead, producing an ordered stream of records 2 + 3 + use crate::disk::{DiskError, DiskStore}; 4 + use crate::process::Processable; 5 + use ipld_core::cid::Cid; 6 + use iroh_car::CarReader; 7 + use serde::{Deserialize, Serialize}; 8 + use std::collections::HashMap; 9 + use std::convert::Infallible; 10 + use tokio::{io::AsyncRead, sync::mpsc}; 11 + 12 + use crate::mst::{Commit, Node}; 13 + use crate::walk::{Step, WalkError, Walker}; 14 + 15 + /// Errors that can happen while consuming and emitting blocks and records 16 + #[derive(Debug, thiserror::Error)] 17 + pub enum DriveError { 18 + #[error("Error from iroh_car: {0}")] 19 + CarReader(#[from] iroh_car::Error), 20 + #[error("Failed to decode commit block: {0}")] 21 + BadBlock(#[from] serde_ipld_dagcbor::DecodeError<Infallible>), 22 + #[error("The Commit block reference by the root was not found")] 23 + MissingCommit, 24 + #[error("The MST block {0} could not be found")] 25 + MissingBlock(Cid), 26 + #[error("Failed to walk the mst tree: {0}")] 27 + WalkError(#[from] WalkError), 28 + #[error("CAR file had no roots")] 29 + MissingRoot, 30 + #[error("Storage error")] 31 + StorageError(#[from] DiskError), 32 + #[error("Encode error: {0}")] 33 + BincodeEncodeError(#[from] bincode::error::EncodeError), 34 + #[error("Tried to send on a closed channel")] 35 + ChannelSendError, // SendError takes <T> which we don't need 36 + #[error("Failed to join a task: {0}")] 37 + JoinError(#[from] tokio::task::JoinError), 38 + } 39 + 40 + #[derive(Debug, thiserror::Error)] 41 + pub enum DecodeError { 42 + #[error(transparent)] 43 + BincodeDecodeError(#[from] bincode::error::DecodeError), 44 + #[error("extra bytes remained after decoding")] 45 + ExtraGarbage, 46 + } 47 + 48 + /// An in-order chunk of Rkey + (processed) Block pairs 49 + pub type BlockChunk<T> = Vec<(String, T)>; 50 + 51 + #[derive(Debug, Clone, Serialize, Deserialize)] 52 + pub(crate) enum MaybeProcessedBlock<T> { 53 + /// A block that's *probably* a Node (but we can't know yet) 54 + /// 55 + /// It *can be* a record that suspiciously looks a lot like a node, so we 56 + /// cannot eagerly turn it into a Node. We only know for sure what it is 57 + /// when we actually walk down the MST 58 + Raw(Vec<u8>), 59 + /// A processed record from a block that was definitely not a Node 60 + /// 61 + /// Processing has to be fallible because the CAR can have totally-unused 62 + /// blocks, which can just be garbage. since we're eagerly trying to process 63 + /// record blocks without knowing for sure that they *are* records, we 64 + /// discard any definitely-not-nodes that fail processing and keep their 65 + /// error in the buffer for them. if we later try to retreive them as a 66 + /// record, then we can surface the error. 67 + /// 68 + /// If we _never_ needed this block, then we may have wasted a bit of effort 69 + /// trying to process it. Oh well. 70 + /// 71 + /// There's an alternative here, which would be to kick unprocessable blocks 72 + /// back to Raw, or maybe even a new RawUnprocessable variant. Then we could 73 + /// surface the typed error later if needed by trying to reprocess. 74 + Processed(T), 75 + } 76 + 77 + impl<T: Processable> Processable for MaybeProcessedBlock<T> { 78 + /// TODO this is probably a little broken 79 + fn get_size(&self) -> usize { 80 + use std::{cmp::max, mem::size_of}; 81 + 82 + // enum is always as big as its biggest member? 83 + let base_size = max(size_of::<Vec<u8>>(), size_of::<T>()); 84 + 85 + let extra = match self { 86 + Self::Raw(bytes) => bytes.len(), 87 + Self::Processed(t) => t.get_size(), 88 + }; 89 + 90 + base_size + extra 91 + } 92 + } 93 + 94 + impl<T> MaybeProcessedBlock<T> { 95 + fn maybe(process: fn(Vec<u8>) -> T, data: Vec<u8>) -> Self { 96 + if Node::could_be(&data) { 97 + MaybeProcessedBlock::Raw(data) 98 + } else { 99 + MaybeProcessedBlock::Processed(process(data)) 100 + } 101 + } 102 + } 103 + 104 + /// Read a CAR file, buffering blocks in memory or to disk 105 + pub enum Driver<R: AsyncRead + Unpin, T: Processable> { 106 + /// All blocks fit within the memory limit 107 + /// 108 + /// You probably want to check the commit's signature. You can go ahead and 109 + /// walk the MST right away. 110 + Memory(Commit, MemDriver<T>), 111 + /// Blocks exceed the memory limit 112 + /// 113 + /// You'll need to provide a disk storage to continue. The commit will be 114 + /// returned and can be validated only once all blocks are loaded. 115 + Disk(NeedDisk<R, T>), 116 + } 117 + 118 + /// Builder-style driver setup 119 + #[derive(Debug, Clone)] 120 + pub struct DriverBuilder { 121 + pub mem_limit_mb: usize, 122 + } 123 + 124 + impl Default for DriverBuilder { 125 + fn default() -> Self { 126 + Self { mem_limit_mb: 16 } 127 + } 128 + } 129 + 130 + impl DriverBuilder { 131 + /// Begin configuring the driver with defaults 132 + pub fn new() -> Self { 133 + Default::default() 134 + } 135 + /// Set the in-memory size limit, in MiB 136 + /// 137 + /// Default: 16 MiB 138 + pub fn with_mem_limit_mb(self, new_limit: usize) -> Self { 139 + Self { 140 + mem_limit_mb: new_limit, 141 + } 142 + } 143 + /// Set the block processor 144 + /// 145 + /// Default: noop, raw blocks will be emitted 146 + pub fn with_block_processor<T: Processable>( 147 + self, 148 + p: fn(Vec<u8>) -> T, 149 + ) -> DriverBuilderWithProcessor<T> { 150 + DriverBuilderWithProcessor { 151 + mem_limit_mb: self.mem_limit_mb, 152 + block_processor: p, 153 + } 154 + } 155 + /// Begin processing an atproto MST from a CAR file 156 + pub async fn load_car<R: AsyncRead + Unpin>( 157 + &self, 158 + reader: R, 159 + ) -> Result<Driver<R, Vec<u8>>, DriveError> { 160 + Driver::load_car(reader, crate::process::noop, self.mem_limit_mb).await 161 + } 162 + } 163 + 164 + /// Builder-style driver intermediate step 165 + /// 166 + /// start from `DriverBuilder` 167 + #[derive(Debug, Clone)] 168 + pub struct DriverBuilderWithProcessor<T: Processable> { 169 + pub mem_limit_mb: usize, 170 + pub block_processor: fn(Vec<u8>) -> T, 171 + } 172 + 173 + impl<T: Processable> DriverBuilderWithProcessor<T> { 174 + /// Set the in-memory size limit, in MiB 175 + /// 176 + /// Default: 16 MiB 177 + pub fn with_mem_limit_mb(mut self, new_limit: usize) -> Self { 178 + self.mem_limit_mb = new_limit; 179 + self 180 + } 181 + /// Begin processing an atproto MST from a CAR file 182 + pub async fn load_car<R: AsyncRead + Unpin>( 183 + &self, 184 + reader: R, 185 + ) -> Result<Driver<R, T>, DriveError> { 186 + Driver::load_car(reader, self.block_processor, self.mem_limit_mb).await 187 + } 188 + } 189 + 190 + impl<R: AsyncRead + Unpin, T: Processable> Driver<R, T> { 191 + /// Begin processing an atproto MST from a CAR file 192 + /// 193 + /// Blocks will be loaded, processed, and buffered in memory. If the entire 194 + /// processed size is under the `mem_limit_mb` limit, a `Driver::Memory` 195 + /// will be returned along with a `Commit` ready for validation. 196 + /// 197 + /// If the `mem_limit_mb` limit is reached before loading all blocks, the 198 + /// partial state will be returned as `Driver::Disk(needed)`, which can be 199 + /// resumed by providing a `SqliteStorage` for on-disk block storage. 200 + pub async fn load_car( 201 + reader: R, 202 + process: fn(Vec<u8>) -> T, 203 + mem_limit_mb: usize, 204 + ) -> Result<Driver<R, T>, DriveError> { 205 + let max_size = mem_limit_mb * 2_usize.pow(20); 206 + let mut mem_blocks = HashMap::new(); 207 + 208 + let mut car = CarReader::new(reader).await?; 209 + 210 + let root = *car 211 + .header() 212 + .roots() 213 + .first() 214 + .ok_or(DriveError::MissingRoot)?; 215 + log::debug!("root: {root:?}"); 216 + 217 + let mut commit = None; 218 + 219 + // try to load all the blocks into memory 220 + let mut mem_size = 0; 221 + while let Some((cid, data)) = car.next_block().await? { 222 + // the root commit is a Special Third Kind of block that we need to make 223 + // sure not to optimistically send to the processing function 224 + if cid == root { 225 + let c: Commit = serde_ipld_dagcbor::from_slice(&data)?; 226 + commit = Some(c); 227 + continue; 228 + } 229 + 230 + // remaining possible types: node, record, other. optimistically process 231 + let maybe_processed = MaybeProcessedBlock::maybe(process, data); 232 + 233 + // stash (maybe processed) blocks in memory as long as we have room 234 + mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size(); 235 + mem_blocks.insert(cid, maybe_processed); 236 + if mem_size >= max_size { 237 + return Ok(Driver::Disk(NeedDisk { 238 + car, 239 + root, 240 + process, 241 + max_size, 242 + mem_blocks, 243 + commit, 244 + })); 245 + } 246 + } 247 + 248 + // all blocks loaded and we fit in memory! hopefully we found the commit... 249 + let commit = commit.ok_or(DriveError::MissingCommit)?; 250 + 251 + let walker = Walker::new(commit.data); 252 + 253 + Ok(Driver::Memory( 254 + commit, 255 + MemDriver { 256 + blocks: mem_blocks, 257 + walker, 258 + process, 259 + }, 260 + )) 261 + } 262 + } 263 + 264 + /// The core driver between the block stream and MST walker 265 + /// 266 + /// In the future, PDSs will export CARs in a stream-friendly order that will 267 + /// enable processing them with tiny memory overhead. But that future is not 268 + /// here yet. 269 + /// 270 + /// CARs are almost always in a stream-unfriendly order, so I'm reverting the 271 + /// optimistic stream features: we load all block first, then walk the MST. 272 + /// 273 + /// This makes things much simpler: we only need to worry about spilling to disk 274 + /// in one place, and we always have a reasonable expecatation about how much 275 + /// work the init function will do. We can drop the CAR reader before walking, 276 + /// so the sync/async boundaries become a little easier to work around. 277 + #[derive(Debug)] 278 + pub struct MemDriver<T: Processable> { 279 + blocks: HashMap<Cid, MaybeProcessedBlock<T>>, 280 + walker: Walker, 281 + process: fn(Vec<u8>) -> T, 282 + } 283 + 284 + impl<T: Processable> MemDriver<T> { 285 + /// Step through the record outputs, in rkey order 286 + pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> { 287 + let mut out = Vec::with_capacity(n); 288 + for _ in 0..n { 289 + // walk as far as we can until we run out of blocks or find a record 290 + match self.walker.step(&mut self.blocks, self.process)? { 291 + Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)), 292 + Step::Finish => break, 293 + Step::Found { rkey, data } => { 294 + out.push((rkey, data)); 295 + continue; 296 + } 297 + }; 298 + } 299 + 300 + if out.is_empty() { 301 + Ok(None) 302 + } else { 303 + Ok(Some(out)) 304 + } 305 + } 306 + } 307 + 308 + /// A partially memory-loaded car file that needs disk spillover to continue 309 + pub struct NeedDisk<R: AsyncRead + Unpin, T: Processable> { 310 + car: CarReader<R>, 311 + root: Cid, 312 + process: fn(Vec<u8>) -> T, 313 + max_size: usize, 314 + mem_blocks: HashMap<Cid, MaybeProcessedBlock<T>>, 315 + pub commit: Option<Commit>, 316 + } 317 + 318 + fn encode(v: impl Serialize) -> Result<Vec<u8>, bincode::error::EncodeError> { 319 + bincode::serde::encode_to_vec(v, bincode::config::standard()) 320 + } 321 + 322 + pub(crate) fn decode<T: Processable>(bytes: &[u8]) -> Result<T, DecodeError> { 323 + let (t, n) = bincode::serde::decode_from_slice(bytes, bincode::config::standard())?; 324 + if n != bytes.len() { 325 + return Err(DecodeError::ExtraGarbage); 326 + } 327 + Ok(t) 328 + } 329 + 330 + impl<R: AsyncRead + Unpin, T: Processable + Send + 'static> NeedDisk<R, T> { 331 + pub async fn finish_loading( 332 + mut self, 333 + mut store: DiskStore, 334 + ) -> Result<(Commit, DiskDriver<T>), DriveError> { 335 + // move store in and back out so we can manage lifetimes 336 + // dump mem blocks into the store 337 + store = tokio::task::spawn(async move { 338 + let kvs = self 339 + .mem_blocks 340 + .into_iter() 341 + .map(|(k, v)| Ok(encode(v).map(|v| (k.to_bytes(), v))?)); 342 + 343 + store.put_many(kvs)?; 344 + Ok::<_, DriveError>(store) 345 + }) 346 + .await??; 347 + 348 + let (tx, mut rx) = mpsc::channel::<Vec<(Cid, MaybeProcessedBlock<T>)>>(1); 349 + 350 + let store_worker = tokio::task::spawn_blocking(move || { 351 + while let Some(chunk) = rx.blocking_recv() { 352 + let kvs = chunk 353 + .into_iter() 354 + .map(|(k, v)| Ok(encode(v).map(|v| (k.to_bytes(), v))?)); 355 + store.put_many(kvs)?; 356 + } 357 + Ok::<_, DriveError>(store) 358 + }); // await later 359 + 360 + // dump the rest to disk (in chunks) 361 + log::debug!("dumping the rest of the stream..."); 362 + loop { 363 + let mut mem_size = 0; 364 + let mut chunk = vec![]; 365 + loop { 366 + let Some((cid, data)) = self.car.next_block().await? else { 367 + break; 368 + }; 369 + // we still gotta keep checking for the root since we might not have it 370 + if cid == self.root { 371 + let c: Commit = serde_ipld_dagcbor::from_slice(&data)?; 372 + self.commit = Some(c); 373 + continue; 374 + } 375 + // remaining possible types: node, record, other. optimistically process 376 + // TODO: get the actual in-memory size to compute disk spill 377 + let maybe_processed = MaybeProcessedBlock::maybe(self.process, data); 378 + mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size(); 379 + chunk.push((cid, maybe_processed)); 380 + if mem_size >= self.max_size { 381 + // soooooo if we're setting the db cache to max_size and then letting 382 + // multiple chunks in the queue that are >= max_size, then at any time 383 + // we might be using some multiple of max_size? 384 + break; 385 + } 386 + } 387 + if chunk.is_empty() { 388 + break; 389 + } 390 + tx.send(chunk) 391 + .await 392 + .map_err(|_| DriveError::ChannelSendError)?; 393 + } 394 + drop(tx); 395 + log::debug!("done. waiting for worker to finish..."); 396 + 397 + store = store_worker.await??; 398 + 399 + log::debug!("worker finished."); 400 + 401 + let commit = self.commit.ok_or(DriveError::MissingCommit)?; 402 + 403 + let walker = Walker::new(commit.data); 404 + 405 + Ok(( 406 + commit, 407 + DiskDriver { 408 + process: self.process, 409 + state: Some(BigState { store, walker }), 410 + }, 411 + )) 412 + } 413 + } 414 + 415 + struct BigState { 416 + store: DiskStore, 417 + walker: Walker, 418 + } 419 + 420 + /// MST walker that reads from disk instead of an in-memory hashmap 421 + pub struct DiskDriver<T: Clone> { 422 + process: fn(Vec<u8>) -> T, 423 + state: Option<BigState>, 424 + } 425 + 426 + // for doctests only 427 + #[doc(hidden)] 428 + pub fn _get_fake_disk_driver() -> DiskDriver<Vec<u8>> { 429 + use crate::process::noop; 430 + DiskDriver { 431 + process: noop, 432 + state: None, 433 + } 434 + } 435 + 436 + impl<T: Processable + Send + 'static> DiskDriver<T> { 437 + /// Walk the MST returning up to `n` rkey + record pairs 438 + /// 439 + /// ```no_run 440 + /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop}; 441 + /// # #[tokio::main] 442 + /// # async fn main() -> Result<(), DriveError> { 443 + /// # let mut disk_driver = _get_fake_disk_driver(); 444 + /// while let Some(pairs) = disk_driver.next_chunk(256).await? { 445 + /// for (rkey, record) in pairs { 446 + /// println!("{rkey}: size={}", record.len()); 447 + /// } 448 + /// } 449 + /// # Ok(()) 450 + /// # } 451 + /// ``` 452 + pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> { 453 + let process = self.process; 454 + 455 + // state should only *ever* be None transiently while inside here 456 + let mut state = self.state.take().expect("DiskDriver must have Some(state)"); 457 + 458 + // the big pain here is that we don't want to leave self.state in an 459 + // invalid state (None), so all the error paths have to make sure it 460 + // comes out again. 461 + let (state, res) = tokio::task::spawn_blocking( 462 + move || -> (BigState, Result<BlockChunk<T>, DriveError>) { 463 + let mut out = Vec::with_capacity(n); 464 + 465 + for _ in 0..n { 466 + // walk as far as we can until we run out of blocks or find a record 467 + let step = match state.walker.disk_step(&mut state.store, process) { 468 + Ok(s) => s, 469 + Err(e) => { 470 + return (state, Err(e.into())); 471 + } 472 + }; 473 + match step { 474 + Step::Missing(cid) => { 475 + return (state, Err(DriveError::MissingBlock(cid))); 476 + } 477 + Step::Finish => break, 478 + Step::Found { rkey, data } => out.push((rkey, data)), 479 + }; 480 + } 481 + 482 + (state, Ok::<_, DriveError>(out)) 483 + }, 484 + ) 485 + .await?; // on tokio JoinError, we'll be left with invalid state :( 486 + 487 + // *must* restore state before dealing with the actual result 488 + self.state = Some(state); 489 + 490 + let out = res?; 491 + 492 + if out.is_empty() { 493 + Ok(None) 494 + } else { 495 + Ok(Some(out)) 496 + } 497 + } 498 + 499 + fn read_tx_blocking( 500 + &mut self, 501 + n: usize, 502 + tx: mpsc::Sender<Result<BlockChunk<T>, DriveError>>, 503 + ) -> Result<(), mpsc::error::SendError<Result<BlockChunk<T>, DriveError>>> { 504 + let BigState { store, walker } = self.state.as_mut().expect("valid state"); 505 + 506 + loop { 507 + let mut out: BlockChunk<T> = Vec::with_capacity(n); 508 + 509 + for _ in 0..n { 510 + // walk as far as we can until we run out of blocks or find a record 511 + 512 + let step = match walker.disk_step(store, self.process) { 513 + Ok(s) => s, 514 + Err(e) => return tx.blocking_send(Err(e.into())), 515 + }; 516 + 517 + match step { 518 + Step::Missing(cid) => { 519 + return tx.blocking_send(Err(DriveError::MissingBlock(cid))); 520 + } 521 + Step::Finish => return Ok(()), 522 + Step::Found { rkey, data } => { 523 + out.push((rkey, data)); 524 + continue; 525 + } 526 + }; 527 + } 528 + 529 + if out.is_empty() { 530 + break; 531 + } 532 + tx.blocking_send(Ok(out))?; 533 + } 534 + 535 + Ok(()) 536 + } 537 + 538 + /// Spawn the disk reading task into a tokio blocking thread 539 + /// 540 + /// The idea is to avoid so much sending back and forth to the blocking 541 + /// thread, letting a blocking task do all the disk reading work and sending 542 + /// records and rkeys back through an `mpsc` channel instead. 543 + /// 544 + /// This might also allow the disk work to continue while processing the 545 + /// records. It's still not yet clear if this method actually has much 546 + /// benefit over just using `.next_chunk(n)`. 547 + /// 548 + /// ```no_run 549 + /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop}; 550 + /// # #[tokio::main] 551 + /// # async fn main() -> Result<(), DriveError> { 552 + /// # let mut disk_driver = _get_fake_disk_driver(); 553 + /// let (mut rx, join) = disk_driver.to_channel(512); 554 + /// while let Some(recvd) = rx.recv().await { 555 + /// let pairs = recvd?; 556 + /// for (rkey, record) in pairs { 557 + /// println!("{rkey}: size={}", record.len()); 558 + /// } 559 + /// 560 + /// } 561 + /// # Ok(()) 562 + /// # } 563 + /// ``` 564 + pub fn to_channel( 565 + mut self, 566 + n: usize, 567 + ) -> ( 568 + mpsc::Receiver<Result<BlockChunk<T>, DriveError>>, 569 + tokio::task::JoinHandle<Self>, 570 + ) { 571 + let (tx, rx) = mpsc::channel::<Result<BlockChunk<T>, DriveError>>(1); 572 + 573 + // sketch: this worker is going to be allowed to execute without a join handle 574 + let chan_task = tokio::task::spawn_blocking(move || { 575 + if let Err(mpsc::error::SendError(_)) = self.read_tx_blocking(n, tx) { 576 + log::debug!("big car reader exited early due to dropped receiver channel"); 577 + } 578 + self 579 + }); 580 + 581 + (rx, chan_task) 582 + } 583 + }
+12 -3
src/lib.rs
··· 53 53 total_size += size; 54 54 } 55 55 } 56 - 57 - // clean up the disk store (drop tables etc) 58 - driver.reset_store().await?; 59 56 } 60 57 }; 61 58 println!("sum of size of all records: {total_size}"); ··· 73 70 Find more [examples in the repo](https://tangled.org/@microcosm.blue/repo-stream/tree/main/examples). 74 71 75 72 */ 73 + 74 + pub mod mst; 75 + mod walk; 76 + 77 + pub mod disk; 78 + pub mod drive; 79 + pub mod process; 80 + 81 + pub use disk::{DiskBuilder, DiskError, DiskStore}; 82 + pub use drive::{DriveError, Driver, DriverBuilder, NeedDisk}; 83 + pub use mst::Commit; 84 + pub use process::Processable;
+110
src/mst.rs
··· 1 + //! Low-level types for parsing raw atproto MST CARs 2 + //! 3 + //! The primary aim is to work through the **tree** structure. Non-node blocks 4 + //! are left as raw bytes, for upper levels to parse into DAG-CBOR or whatever. 5 + 6 + use ipld_core::cid::Cid; 7 + use serde::Deserialize; 8 + 9 + /// The top-level data object in a repository's tree is a signed commit. 10 + #[derive(Debug, Deserialize)] 11 + // #[serde(deny_unknown_fields)] 12 + pub struct Commit { 13 + /// the account DID associated with the repo, in strictly normalized form 14 + /// (eg, lowercase as appropriate) 15 + pub did: String, 16 + /// fixed value of 3 for this repo format version 17 + pub version: u64, 18 + /// pointer to the top of the repo contents tree structure (MST) 19 + pub data: Cid, 20 + /// revision of the repo, used as a logical clock. 21 + /// 22 + /// TID format. Must increase monotonically. Recommend using current 23 + /// timestamp as TID; rev values in the "future" (beyond a fudge factor) 24 + /// should be ignored and not processed 25 + pub rev: String, 26 + /// pointer (by hash) to a previous commit object for this repository. 27 + /// 28 + /// Could be used to create a chain of history, but largely unused (included 29 + /// for v2 backwards compatibility). In version 3 repos, this field must 30 + /// exist in the CBOR object, but is virtually always null. NOTE: previously 31 + /// specified as nullable and optional, but this caused interoperability 32 + /// issues. 33 + pub prev: Option<Cid>, 34 + /// cryptographic signature of this commit, as raw bytes 35 + #[serde(with = "serde_bytes")] 36 + pub sig: Vec<u8>, 37 + } 38 + 39 + /// MST node data schema 40 + #[derive(Debug, Deserialize, PartialEq)] 41 + #[serde(deny_unknown_fields)] 42 + pub(crate) struct Node { 43 + /// link to sub-tree Node on a lower level and with all keys sorting before 44 + /// keys at this node 45 + #[serde(rename = "l")] 46 + pub left: Option<Cid>, 47 + /// ordered list of TreeEntry objects 48 + /// 49 + /// atproto MSTs have a fanout of 4, so there can be max 4 entries. 50 + #[serde(rename = "e")] 51 + pub entries: Vec<Entry>, // maybe we can do [Option<Entry>; 4]? 52 + } 53 + 54 + impl Node { 55 + /// test if a block could possibly be a node 56 + /// 57 + /// we can't eagerly decode records except where we're *sure* they cannot be 58 + /// an mst node (and even then we can only attempt) because you can't know 59 + /// with certainty what a block is supposed to be without actually walking 60 + /// the tree. 61 + /// 62 + /// so if a block *could be* a node, any record converter must postpone 63 + /// processing. if it turns out it happens to be a very node-looking record, 64 + /// well, sorry, it just has to only be processed later when that's known. 65 + pub(crate) fn could_be(bytes: impl AsRef<[u8]>) -> bool { 66 + const NODE_FINGERPRINT: [u8; 3] = [ 67 + 0xA2, // map length 2 (for "l" and "e" keys) 68 + 0x61, // text length 1 69 + b'e', // "e" before "l" because map keys have to be lex-sorted 70 + // 0x8?: "e" has array (0x100 upper 3 bits) of some length 71 + ]; 72 + let bytes = bytes.as_ref(); 73 + bytes.starts_with(&NODE_FINGERPRINT) 74 + && bytes 75 + .get(3) 76 + .map(|b| b & 0b1110_0000 == 0x80) 77 + .unwrap_or(false) 78 + } 79 + 80 + /// Check if a node has any entries 81 + /// 82 + /// An empty repository with no records is represented as a single MST node 83 + /// with an empty array of entries. This is the only situation in which a 84 + /// tree may contain an empty leaf node which does not either contain keys 85 + /// ("entries") or point to a sub-tree containing entries. 86 + pub(crate) fn is_empty(&self) -> bool { 87 + self.left.is_none() && self.entries.is_empty() 88 + } 89 + } 90 + 91 + /// TreeEntry object 92 + #[derive(Debug, Deserialize, PartialEq)] 93 + #[serde(deny_unknown_fields)] 94 + pub(crate) struct Entry { 95 + /// count of bytes shared with previous TreeEntry in this Node (if any) 96 + #[serde(rename = "p")] 97 + pub prefix_len: usize, 98 + /// remainder of key for this TreeEntry, after "prefixlen" have been removed 99 + #[serde(rename = "k", with = "serde_bytes")] 100 + pub keysuffix: Vec<u8>, // can we String this here? 101 + /// link to the record data (CBOR) for this entry 102 + #[serde(rename = "v")] 103 + pub value: Cid, 104 + /// link to a sub-tree Node at a lower level 105 + /// 106 + /// the lower level must have keys sorting after this TreeEntry's key (to 107 + /// the "right"), but before the next TreeEntry's key in this Node (if any) 108 + #[serde(rename = "t")] 109 + pub tree: Option<Cid>, 110 + }
+108
src/process.rs
··· 1 + /*! 2 + Record processor function output trait 3 + 4 + The return type must satisfy the `Processable` trait, which requires: 5 + 6 + - `Clone` because two rkeys can refer to the same record by CID, which may 7 + only appear once in the CAR file. 8 + - `Serialize + DeserializeOwned` so it can be spilled to disk. 9 + 10 + One required function must be implemented, `get_size()`: this should return the 11 + approximate total off-stack size of the type. (the on-stack size will be added 12 + automatically via `std::mem::get_size`). 13 + 14 + Note that it is **not guaranteed** that the `process` function will run on a 15 + block before storing it in memory or on disk: it's not possible to know if a 16 + block is a record without actually walking the MST, so the best we can do is 17 + apply `process` to any block that we know *cannot* be an MST node, and otherwise 18 + store the raw block bytes. 19 + 20 + Here's a silly processing function that just collects 'eyy's found in the raw 21 + record bytes 22 + 23 + ``` 24 + # use repo_stream::Processable; 25 + # use serde::{Serialize, Deserialize}; 26 + #[derive(Debug, Clone, Serialize, Deserialize)] 27 + struct Eyy(usize, String); 28 + 29 + impl Processable for Eyy { 30 + fn get_size(&self) -> usize { 31 + // don't need to compute the usize, it's on the stack 32 + self.1.capacity() // in-mem size from the string's capacity, in bytes 33 + } 34 + } 35 + 36 + fn process(raw: Vec<u8>) -> Vec<Eyy> { 37 + let mut out = Vec::new(); 38 + let to_find = "eyy".as_bytes(); 39 + for i in 0..(raw.len() - 3) { 40 + if &raw[i..(i+3)] == to_find { 41 + out.push(Eyy(i, "eyy".to_string())); 42 + } 43 + } 44 + out 45 + } 46 + ``` 47 + 48 + The memory sizing stuff is a little sketch but probably at least approximately 49 + works. 50 + */ 51 + 52 + use serde::{Serialize, de::DeserializeOwned}; 53 + 54 + /// Output trait for record processing 55 + pub trait Processable: Clone + Serialize + DeserializeOwned { 56 + /// Any additional in-memory size taken by the processed type 57 + /// 58 + /// Do not include stack size (`std::mem::size_of`) 59 + fn get_size(&self) -> usize; 60 + } 61 + 62 + /// Processor that just returns the raw blocks 63 + #[inline] 64 + pub fn noop(block: Vec<u8>) -> Vec<u8> { 65 + block 66 + } 67 + 68 + impl Processable for u8 { 69 + fn get_size(&self) -> usize { 70 + 0 71 + } 72 + } 73 + 74 + impl Processable for usize { 75 + fn get_size(&self) -> usize { 76 + 0 // no additional space taken, just its stack size (newtype is free) 77 + } 78 + } 79 + 80 + impl Processable for String { 81 + fn get_size(&self) -> usize { 82 + self.capacity() 83 + } 84 + } 85 + 86 + impl<Item: Sized + Processable> Processable for Vec<Item> { 87 + fn get_size(&self) -> usize { 88 + let slot_size = std::mem::size_of::<Item>(); 89 + let direct_size = slot_size * self.capacity(); 90 + let items_referenced_size: usize = self.iter().map(|item| item.get_size()).sum(); 91 + direct_size + items_referenced_size 92 + } 93 + } 94 + 95 + impl<Item: Processable> Processable for Option<Item> { 96 + fn get_size(&self) -> usize { 97 + self.as_ref().map(|item| item.get_size()).unwrap_or(0) 98 + } 99 + } 100 + 101 + impl<Item: Processable, Error: Processable> Processable for Result<Item, Error> { 102 + fn get_size(&self) -> usize { 103 + match self { 104 + Ok(item) => item.get_size(), 105 + Err(err) => err.get_size(), 106 + } 107 + } 108 + }
+406
src/walk.rs
··· 1 + //! Depth-first MST traversal 2 + 3 + use crate::disk::DiskStore; 4 + use crate::drive::{DecodeError, MaybeProcessedBlock}; 5 + use crate::mst::Node; 6 + use crate::process::Processable; 7 + use ipld_core::cid::Cid; 8 + use sha2::{Digest, Sha256}; 9 + use std::collections::HashMap; 10 + use std::convert::Infallible; 11 + 12 + /// Errors that can happen while walking 13 + #[derive(Debug, thiserror::Error)] 14 + pub enum WalkError { 15 + #[error("Failed to fingerprint commit block")] 16 + BadCommitFingerprint, 17 + #[error("Failed to decode commit block: {0}")] 18 + BadCommit(#[from] serde_ipld_dagcbor::DecodeError<Infallible>), 19 + #[error("Action node error: {0}")] 20 + MstError(#[from] MstError), 21 + #[error("storage error: {0}")] 22 + StorageError(#[from] fjall::Error), 23 + #[error("Decode error: {0}")] 24 + DecodeError(#[from] DecodeError), 25 + } 26 + 27 + /// Errors from invalid Rkeys 28 + #[derive(Debug, PartialEq, thiserror::Error)] 29 + pub enum MstError { 30 + #[error("Failed to compute an rkey due to invalid prefix_len")] 31 + EntryPrefixOutOfbounds, 32 + #[error("RKey was not utf-8")] 33 + EntryRkeyNotUtf8(#[from] std::string::FromUtf8Error), 34 + #[error("Nodes cannot be empty (except for an entirely empty MST)")] 35 + EmptyNode, 36 + #[error("Found an entry with rkey at the wrong depth")] 37 + WrongDepth, 38 + #[error("Lost track of our depth (possible bug?)")] 39 + LostDepth, 40 + #[error("MST depth underflow: depth-0 node with child trees")] 41 + DepthUnderflow, 42 + #[error("Encountered an rkey out of order while walking the MST")] 43 + RkeyOutOfOrder, 44 + } 45 + 46 + /// Walker outputs 47 + #[derive(Debug)] 48 + pub enum Step<T> { 49 + /// We needed this CID but it's not in the block store 50 + Missing(Cid), 51 + /// Reached the end of the MST! yay! 52 + Finish, 53 + /// A record was found! 54 + Found { rkey: String, data: T }, 55 + } 56 + 57 + #[derive(Debug, Clone, PartialEq)] 58 + enum Need { 59 + Node { depth: Depth, cid: Cid }, 60 + Record { rkey: String, cid: Cid }, 61 + } 62 + 63 + #[derive(Debug, Clone, Copy, PartialEq)] 64 + enum Depth { 65 + Root, 66 + Depth(u32), 67 + } 68 + 69 + impl Depth { 70 + fn from_key(key: &[u8]) -> Self { 71 + let mut zeros = 0; 72 + for byte in Sha256::digest(key) { 73 + let leading = byte.leading_zeros(); 74 + zeros += leading; 75 + if leading < 8 { 76 + break; 77 + } 78 + } 79 + Self::Depth(zeros / 2) // truncating divide (rounds down) 80 + } 81 + fn next_expected(&self) -> Result<Option<u32>, MstError> { 82 + match self { 83 + Self::Root => Ok(None), 84 + Self::Depth(d) => d.checked_sub(1).ok_or(MstError::DepthUnderflow).map(Some), 85 + } 86 + } 87 + } 88 + 89 + fn push_from_node(stack: &mut Vec<Need>, node: &Node, parent_depth: Depth) -> Result<(), MstError> { 90 + // empty nodes are not allowed in the MST except in an empty MST 91 + if node.is_empty() { 92 + if parent_depth == Depth::Root { 93 + return Ok(()); // empty mst, nothing to push 94 + } else { 95 + return Err(MstError::EmptyNode); 96 + } 97 + } 98 + 99 + let mut entries = Vec::with_capacity(node.entries.len()); 100 + let mut prefix = vec![]; 101 + let mut this_depth = parent_depth.next_expected()?; 102 + 103 + for entry in &node.entries { 104 + let mut rkey = vec![]; 105 + let pre_checked = prefix 106 + .get(..entry.prefix_len) 107 + .ok_or(MstError::EntryPrefixOutOfbounds)?; 108 + rkey.extend_from_slice(pre_checked); 109 + rkey.extend_from_slice(&entry.keysuffix); 110 + 111 + let Depth::Depth(key_depth) = Depth::from_key(&rkey) else { 112 + return Err(MstError::WrongDepth); 113 + }; 114 + 115 + // this_depth is `none` if we are the deepest child (directly below root) 116 + // in that case we accept whatever highest depth is claimed 117 + let expected_depth = match this_depth { 118 + Some(d) => d, 119 + None => { 120 + this_depth = Some(key_depth); 121 + key_depth 122 + } 123 + }; 124 + 125 + // all keys we find should be this depth 126 + if key_depth != expected_depth { 127 + return Err(MstError::DepthUnderflow); 128 + } 129 + 130 + prefix = rkey.clone(); 131 + 132 + entries.push(Need::Record { 133 + rkey: String::from_utf8(rkey)?, 134 + cid: entry.value, 135 + }); 136 + if let Some(ref tree) = entry.tree { 137 + entries.push(Need::Node { 138 + depth: Depth::Depth(key_depth), 139 + cid: *tree, 140 + }); 141 + } 142 + } 143 + 144 + entries.reverse(); 145 + stack.append(&mut entries); 146 + 147 + let d = this_depth.ok_or(MstError::LostDepth)?; 148 + 149 + if let Some(tree) = node.left { 150 + stack.push(Need::Node { 151 + depth: Depth::Depth(d), 152 + cid: tree, 153 + }); 154 + } 155 + Ok(()) 156 + } 157 + 158 + /// Traverser of an atproto MST 159 + /// 160 + /// Walks the tree from left-to-right in depth-first order 161 + #[derive(Debug)] 162 + pub struct Walker { 163 + stack: Vec<Need>, 164 + prev: String, 165 + } 166 + 167 + impl Walker { 168 + pub fn new(tree_root_cid: Cid) -> Self { 169 + Self { 170 + stack: vec![Need::Node { 171 + depth: Depth::Root, 172 + cid: tree_root_cid, 173 + }], 174 + prev: "".to_string(), 175 + } 176 + } 177 + 178 + /// Advance through nodes until we find a record or can't go further 179 + pub fn step<T: Processable>( 180 + &mut self, 181 + blocks: &mut HashMap<Cid, MaybeProcessedBlock<T>>, 182 + process: impl Fn(Vec<u8>) -> T, 183 + ) -> Result<Step<T>, WalkError> { 184 + loop { 185 + let Some(need) = self.stack.last_mut() else { 186 + log::trace!("tried to walk but we're actually done."); 187 + return Ok(Step::Finish); 188 + }; 189 + 190 + match need { 191 + &mut Need::Node { depth, cid } => { 192 + log::trace!("need node {cid:?}"); 193 + let Some(block) = blocks.remove(&cid) else { 194 + log::trace!("node not found, resting"); 195 + return Ok(Step::Missing(cid)); 196 + }; 197 + 198 + let MaybeProcessedBlock::Raw(data) = block else { 199 + return Err(WalkError::BadCommitFingerprint); 200 + }; 201 + let node = serde_ipld_dagcbor::from_slice::<Node>(&data) 202 + .map_err(WalkError::BadCommit)?; 203 + 204 + // found node, make sure we remember 205 + self.stack.pop(); 206 + 207 + // queue up work on the found node next 208 + push_from_node(&mut self.stack, &node, depth)?; 209 + } 210 + Need::Record { rkey, cid } => { 211 + log::trace!("need record {cid:?}"); 212 + // note that we cannot *remove* a record block, sadly, since 213 + // there can be multiple rkeys pointing to the same cid. 214 + let Some(data) = blocks.get_mut(cid) else { 215 + return Ok(Step::Missing(*cid)); 216 + }; 217 + let rkey = rkey.clone(); 218 + let data = match data { 219 + MaybeProcessedBlock::Raw(data) => process(data.to_vec()), 220 + MaybeProcessedBlock::Processed(t) => t.clone(), 221 + }; 222 + 223 + // found node, make sure we remember 224 + self.stack.pop(); 225 + 226 + // rkeys *must* be in order or else the tree is invalid (or 227 + // we have a bug) 228 + if rkey <= self.prev { 229 + return Err(MstError::RkeyOutOfOrder)?; 230 + } 231 + self.prev = rkey.clone(); 232 + 233 + return Ok(Step::Found { rkey, data }); 234 + } 235 + } 236 + } 237 + } 238 + 239 + /// blocking!!!!!! 240 + pub fn disk_step<T: Processable>( 241 + &mut self, 242 + reader: &mut DiskStore, 243 + process: impl Fn(Vec<u8>) -> T, 244 + ) -> Result<Step<T>, WalkError> { 245 + loop { 246 + let Some(need) = self.stack.last_mut() else { 247 + log::trace!("tried to walk but we're actually done."); 248 + return Ok(Step::Finish); 249 + }; 250 + 251 + match need { 252 + &mut Need::Node { depth, cid } => { 253 + let cid_bytes = cid.to_bytes(); 254 + log::trace!("need node {cid:?}"); 255 + let Some(block_bytes) = reader.get(&cid_bytes)? else { 256 + log::trace!("node not found, resting"); 257 + return Ok(Step::Missing(cid)); 258 + }; 259 + 260 + let block: MaybeProcessedBlock<T> = crate::drive::decode(&block_bytes)?; 261 + 262 + let MaybeProcessedBlock::Raw(data) = block else { 263 + return Err(WalkError::BadCommitFingerprint); 264 + }; 265 + let node = serde_ipld_dagcbor::from_slice::<Node>(&data) 266 + .map_err(WalkError::BadCommit)?; 267 + 268 + // found node, make sure we remember 269 + self.stack.pop(); 270 + 271 + // queue up work on the found node next 272 + push_from_node(&mut self.stack, &node, depth).map_err(WalkError::MstError)?; 273 + } 274 + Need::Record { rkey, cid } => { 275 + log::trace!("need record {cid:?}"); 276 + let cid_bytes = cid.to_bytes(); 277 + let Some(data_bytes) = reader.get(&cid_bytes)? else { 278 + log::trace!("record block not found, resting"); 279 + return Ok(Step::Missing(*cid)); 280 + }; 281 + let data: MaybeProcessedBlock<T> = crate::drive::decode(&data_bytes)?; 282 + let rkey = rkey.clone(); 283 + let data = match data { 284 + MaybeProcessedBlock::Raw(data) => process(data), 285 + MaybeProcessedBlock::Processed(t) => t.clone(), 286 + }; 287 + 288 + // found node, make sure we remember 289 + self.stack.pop(); 290 + 291 + log::trace!("emitting a block as a step. depth={}", self.stack.len()); 292 + 293 + // rkeys *must* be in order or else the tree is invalid (or 294 + // we have a bug) 295 + if rkey <= self.prev { 296 + return Err(MstError::RkeyOutOfOrder)?; 297 + } 298 + self.prev = rkey.clone(); 299 + 300 + return Ok(Step::Found { rkey, data }); 301 + } 302 + } 303 + } 304 + } 305 + } 306 + 307 + #[cfg(test)] 308 + mod test { 309 + use super::*; 310 + 311 + fn cid1() -> Cid { 312 + "bafyreihixenvk3ahqbytas4hk4a26w43bh6eo3w6usjqtxkpzsvi655a3m" 313 + .parse() 314 + .unwrap() 315 + } 316 + 317 + #[test] 318 + fn test_depth_spec_0() { 319 + let d = Depth::from_key(b"2653ae71"); 320 + assert_eq!(d, Depth::Depth(0)) 321 + } 322 + 323 + #[test] 324 + fn test_depth_spec_1() { 325 + let d = Depth::from_key(b"blue"); 326 + assert_eq!(d, Depth::Depth(1)) 327 + } 328 + 329 + #[test] 330 + fn test_depth_spec_4() { 331 + let d = Depth::from_key(b"app.bsky.feed.post/454397e440ec"); 332 + assert_eq!(d, Depth::Depth(4)) 333 + } 334 + 335 + #[test] 336 + fn test_depth_spec_8() { 337 + let d = Depth::from_key(b"app.bsky.feed.post/9adeb165882c"); 338 + assert_eq!(d, Depth::Depth(8)) 339 + } 340 + 341 + #[test] 342 + fn test_depth_ietf_draft_0() { 343 + let d = Depth::from_key(b"key1"); 344 + assert_eq!(d, Depth::Depth(0)) 345 + } 346 + 347 + #[test] 348 + fn test_depth_ietf_draft_1() { 349 + let d = Depth::from_key(b"key7"); 350 + assert_eq!(d, Depth::Depth(1)) 351 + } 352 + 353 + #[test] 354 + fn test_depth_ietf_draft_4() { 355 + let d = Depth::from_key(b"key515"); 356 + assert_eq!(d, Depth::Depth(4)) 357 + } 358 + 359 + #[test] 360 + fn test_depth_interop() { 361 + // examples from https://github.com/bluesky-social/atproto-interop-tests/blob/main/mst/key_heights.json 362 + for (k, expected) in [ 363 + ("", 0), 364 + ("asdf", 0), 365 + ("blue", 1), 366 + ("2653ae71", 0), 367 + ("88bfafc7", 2), 368 + ("2a92d355", 4), 369 + ("884976f5", 6), 370 + ("app.bsky.feed.post/454397e440ec", 4), 371 + ("app.bsky.feed.post/9adeb165882c", 8), 372 + ] { 373 + let d = Depth::from_key(k.as_bytes()); 374 + assert_eq!(d, Depth::Depth(expected), "key: {}", k); 375 + } 376 + } 377 + 378 + #[test] 379 + fn test_push_empty_fails() { 380 + let empty_node = Node { 381 + left: None, 382 + entries: vec![], 383 + }; 384 + let mut stack = vec![]; 385 + let err = push_from_node(&mut stack, &empty_node, Depth::Depth(4)); 386 + assert_eq!(err, Err(MstError::EmptyNode)); 387 + } 388 + 389 + #[test] 390 + fn test_push_one_node() { 391 + let node = Node { 392 + left: Some(cid1()), 393 + entries: vec![], 394 + }; 395 + let mut stack = vec![]; 396 + push_from_node(&mut stack, &node, Depth::Depth(4)).unwrap(); 397 + assert_eq!( 398 + stack.last(), 399 + Some(Need::Node { 400 + depth: Depth::Depth(3), 401 + cid: cid1() 402 + }) 403 + .as_ref() 404 + ); 405 + } 406 + }