Fast and robust atproto CAR file processing in rust

Compare changes

Choose any two refs to compare.

Changed files
+59 -2677
examples
disk-read-file
src
+26 -1188
Cargo.lock
··· 3 3 version = 4 4 4 5 5 [[package]] 6 - name = "addr2line" 7 - version = "0.25.1" 8 - source = "registry+https://github.com/rust-lang/crates.io-index" 9 - checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" 10 - dependencies = [ 11 - "gimli", 12 - ] 13 - 14 - [[package]] 15 - name = "adler2" 16 - version = "2.0.1" 17 - source = "registry+https://github.com/rust-lang/crates.io-index" 18 - checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" 19 - 20 - [[package]] 21 - name = "aho-corasick" 22 - version = "1.1.3" 23 - source = "registry+https://github.com/rust-lang/crates.io-index" 24 - checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 25 - dependencies = [ 26 - "memchr", 27 - ] 28 - 29 - [[package]] 30 - name = "anes" 31 - version = "0.1.6" 32 - source = "registry+https://github.com/rust-lang/crates.io-index" 33 - checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" 34 - 35 - [[package]] 36 6 name = "anstream" 37 7 version = "0.6.21" 38 8 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 68 38 source = "registry+https://github.com/rust-lang/crates.io-index" 69 39 checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" 70 40 dependencies = [ 71 - "windows-sys 0.60.2", 41 + "windows-sys", 72 42 ] 73 43 74 44 [[package]] ··· 79 49 dependencies = [ 80 50 "anstyle", 81 51 "once_cell_polyfill", 82 - "windows-sys 0.60.2", 83 - ] 84 - 85 - [[package]] 86 - name = "anyhow" 87 - version = "1.0.100" 88 - source = "registry+https://github.com/rust-lang/crates.io-index" 89 - checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" 90 - 91 - [[package]] 92 - name = "autocfg" 93 - version = "1.5.0" 94 - source = "registry+https://github.com/rust-lang/crates.io-index" 95 - checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" 96 - 97 - [[package]] 98 - name = "backtrace" 99 - version = "0.3.76" 100 - source = "registry+https://github.com/rust-lang/crates.io-index" 101 - checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" 102 - dependencies = [ 103 - "addr2line", 104 - "cfg-if", 105 - "libc", 106 - "miniz_oxide", 107 - "object", 108 - "rustc-demangle", 109 - "windows-link", 110 - ] 111 - 112 - [[package]] 113 - name = "base-x" 114 - version = "0.2.11" 115 - source = "registry+https://github.com/rust-lang/crates.io-index" 116 - checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" 117 - 118 - [[package]] 119 - name = "base256emoji" 120 - version = "1.0.2" 121 - source = "registry+https://github.com/rust-lang/crates.io-index" 122 - checksum = "b5e9430d9a245a77c92176e649af6e275f20839a48389859d1661e9a128d077c" 123 - dependencies = [ 124 - "const-str", 125 - "match-lookup", 126 - ] 127 - 128 - [[package]] 129 - name = "bincode" 130 - version = "2.0.1" 131 - source = "registry+https://github.com/rust-lang/crates.io-index" 132 - checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" 133 - dependencies = [ 134 - "bincode_derive", 135 - "serde", 136 - "unty", 137 - ] 138 - 139 - [[package]] 140 - name = "bincode_derive" 141 - version = "2.0.1" 142 - source = "registry+https://github.com/rust-lang/crates.io-index" 143 - checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" 144 - dependencies = [ 145 - "virtue", 52 + "windows-sys", 146 53 ] 147 54 148 55 [[package]] ··· 152 59 checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" 153 60 154 61 [[package]] 155 - name = "block-buffer" 156 - version = "0.10.4" 157 - source = "registry+https://github.com/rust-lang/crates.io-index" 158 - checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" 159 - dependencies = [ 160 - "generic-array", 161 - ] 162 - 163 - [[package]] 164 - name = "bumpalo" 165 - version = "3.19.0" 166 - source = "registry+https://github.com/rust-lang/crates.io-index" 167 - checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" 168 - 169 - [[package]] 170 62 name = "byteorder-lite" 171 63 version = "0.1.0" 172 64 source = "registry+https://github.com/rust-lang/crates.io-index" 173 65 checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" 174 66 175 67 [[package]] 176 - name = "bytes" 177 - version = "1.10.1" 178 - source = "registry+https://github.com/rust-lang/crates.io-index" 179 - checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" 180 - 181 - [[package]] 182 68 name = "byteview" 183 69 version = "0.10.0" 184 70 source = "registry+https://github.com/rust-lang/crates.io-index" 185 71 checksum = "dda4398f387cc6395a3e93b3867cd9abda914c97a0b344d1eefb2e5c51785fca" 186 - 187 - [[package]] 188 - name = "cast" 189 - version = "0.3.0" 190 - source = "registry+https://github.com/rust-lang/crates.io-index" 191 - checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" 192 - 193 - [[package]] 194 - name = "cbor4ii" 195 - version = "0.2.14" 196 - source = "registry+https://github.com/rust-lang/crates.io-index" 197 - checksum = "b544cf8c89359205f4f990d0e6f3828db42df85b5dac95d09157a250eb0749c4" 198 - dependencies = [ 199 - "serde", 200 - ] 201 72 202 73 [[package]] 203 74 name = "cfg-if" ··· 206 77 checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" 207 78 208 79 [[package]] 209 - name = "ciborium" 210 - version = "0.2.2" 211 - source = "registry+https://github.com/rust-lang/crates.io-index" 212 - checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" 213 - dependencies = [ 214 - "ciborium-io", 215 - "ciborium-ll", 216 - "serde", 217 - ] 218 - 219 - [[package]] 220 - name = "ciborium-io" 221 - version = "0.2.2" 222 - source = "registry+https://github.com/rust-lang/crates.io-index" 223 - checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" 224 - 225 - [[package]] 226 - name = "ciborium-ll" 227 - version = "0.2.2" 228 - source = "registry+https://github.com/rust-lang/crates.io-index" 229 - checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" 230 - dependencies = [ 231 - "ciborium-io", 232 - "half", 233 - ] 234 - 235 - [[package]] 236 - name = "cid" 237 - version = "0.11.1" 238 - source = "registry+https://github.com/rust-lang/crates.io-index" 239 - checksum = "3147d8272e8fa0ccd29ce51194dd98f79ddfb8191ba9e3409884e751798acf3a" 240 - dependencies = [ 241 - "core2", 242 - "multibase", 243 - "multihash", 244 - "serde", 245 - "serde_bytes", 246 - "unsigned-varint 0.8.0", 247 - ] 248 - 249 - [[package]] 250 80 name = "clap" 251 81 version = "4.5.48" 252 82 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 277 107 "heck", 278 108 "proc-macro2", 279 109 "quote", 280 - "syn 2.0.106", 110 + "syn", 281 111 ] 282 112 283 113 [[package]] ··· 299 129 checksum = "ea0095f6103c2a8b44acd6fd15960c801dafebf02e21940360833e0673f48ba7" 300 130 301 131 [[package]] 302 - name = "const-str" 303 - version = "0.4.3" 304 - source = "registry+https://github.com/rust-lang/crates.io-index" 305 - checksum = "2f421161cb492475f1661ddc9815a745a1c894592070661180fdec3d4872e9c3" 306 - 307 - [[package]] 308 - name = "core2" 309 - version = "0.4.0" 310 - source = "registry+https://github.com/rust-lang/crates.io-index" 311 - checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" 312 - dependencies = [ 313 - "memchr", 314 - ] 315 - 316 - [[package]] 317 - name = "cpufeatures" 318 - version = "0.2.17" 319 - source = "registry+https://github.com/rust-lang/crates.io-index" 320 - checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" 321 - dependencies = [ 322 - "libc", 323 - ] 324 - 325 - [[package]] 326 - name = "criterion" 327 - version = "0.7.0" 328 - source = "registry+https://github.com/rust-lang/crates.io-index" 329 - checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928" 330 - dependencies = [ 331 - "anes", 332 - "cast", 333 - "ciborium", 334 - "clap", 335 - "criterion-plot", 336 - "itertools", 337 - "num-traits", 338 - "oorandom", 339 - "plotters", 340 - "rayon", 341 - "regex", 342 - "serde", 343 - "serde_json", 344 - "tinytemplate", 345 - "tokio", 346 - "walkdir", 347 - ] 348 - 349 - [[package]] 350 - name = "criterion-plot" 351 - version = "0.6.0" 352 - source = "registry+https://github.com/rust-lang/crates.io-index" 353 - checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338" 354 - dependencies = [ 355 - "cast", 356 - "itertools", 357 - ] 358 - 359 - [[package]] 360 - name = "crossbeam-deque" 361 - version = "0.8.6" 362 - source = "registry+https://github.com/rust-lang/crates.io-index" 363 - checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" 364 - dependencies = [ 365 - "crossbeam-epoch", 366 - "crossbeam-utils", 367 - ] 368 - 369 - [[package]] 370 132 name = "crossbeam-epoch" 371 133 version = "0.9.18" 372 134 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 392 154 checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" 393 155 394 156 [[package]] 395 - name = "crunchy" 396 - version = "0.2.4" 397 - source = "registry+https://github.com/rust-lang/crates.io-index" 398 - checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" 399 - 400 - [[package]] 401 - name = "crypto-common" 402 - version = "0.1.6" 403 - source = "registry+https://github.com/rust-lang/crates.io-index" 404 - checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" 405 - dependencies = [ 406 - "generic-array", 407 - "typenum", 408 - ] 409 - 410 - [[package]] 411 157 name = "dashmap" 412 158 version = "6.1.0" 413 159 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 422 168 ] 423 169 424 170 [[package]] 425 - name = "data-encoding" 426 - version = "2.9.0" 427 - source = "registry+https://github.com/rust-lang/crates.io-index" 428 - checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" 429 - 430 - [[package]] 431 - name = "data-encoding-macro" 432 - version = "0.1.18" 433 - source = "registry+https://github.com/rust-lang/crates.io-index" 434 - checksum = "47ce6c96ea0102f01122a185683611bd5ac8d99e62bc59dd12e6bda344ee673d" 435 - dependencies = [ 436 - "data-encoding", 437 - "data-encoding-macro-internal", 438 - ] 439 - 440 - [[package]] 441 - name = "data-encoding-macro-internal" 442 - version = "0.1.16" 443 - source = "registry+https://github.com/rust-lang/crates.io-index" 444 - checksum = "8d162beedaa69905488a8da94f5ac3edb4dd4788b732fadb7bd120b2625c1976" 445 - dependencies = [ 446 - "data-encoding", 447 - "syn 2.0.106", 448 - ] 449 - 450 - [[package]] 451 - name = "digest" 452 - version = "0.10.7" 453 - source = "registry+https://github.com/rust-lang/crates.io-index" 454 - checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" 455 - dependencies = [ 456 - "block-buffer", 457 - "crypto-common", 458 - ] 459 - 460 - [[package]] 461 - name = "either" 462 - version = "1.15.0" 463 - source = "registry+https://github.com/rust-lang/crates.io-index" 464 - checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" 465 - 466 - [[package]] 467 171 name = "enum_dispatch" 468 172 version = "0.3.13" 469 173 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 472 176 "once_cell", 473 177 "proc-macro2", 474 178 "quote", 475 - "syn 2.0.106", 476 - ] 477 - 478 - [[package]] 479 - name = "env_filter" 480 - version = "0.1.3" 481 - source = "registry+https://github.com/rust-lang/crates.io-index" 482 - checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" 483 - dependencies = [ 484 - "log", 485 - "regex", 486 - ] 487 - 488 - [[package]] 489 - name = "env_logger" 490 - version = "0.11.8" 491 - source = "registry+https://github.com/rust-lang/crates.io-index" 492 - checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" 493 - dependencies = [ 494 - "anstream", 495 - "anstyle", 496 - "env_filter", 497 - "jiff", 498 - "log", 179 + "syn", 499 180 ] 500 181 501 182 [[package]] ··· 511 192 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" 512 193 dependencies = [ 513 194 "libc", 514 - "windows-sys 0.60.2", 195 + "windows-sys", 515 196 ] 516 197 517 198 [[package]] ··· 532 213 "flume", 533 214 "log", 534 215 "lsm-tree", 216 + "lz4_flex", 535 217 "tempfile", 536 218 "xxhash-rust", 537 219 ] ··· 546 228 ] 547 229 548 230 [[package]] 549 - name = "futures" 550 - version = "0.3.31" 551 - source = "registry+https://github.com/rust-lang/crates.io-index" 552 - checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" 553 - dependencies = [ 554 - "futures-channel", 555 - "futures-core", 556 - "futures-executor", 557 - "futures-io", 558 - "futures-sink", 559 - "futures-task", 560 - "futures-util", 561 - ] 562 - 563 - [[package]] 564 - name = "futures-channel" 565 - version = "0.3.31" 566 - source = "registry+https://github.com/rust-lang/crates.io-index" 567 - checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" 568 - dependencies = [ 569 - "futures-core", 570 - "futures-sink", 571 - ] 572 - 573 - [[package]] 574 - name = "futures-core" 575 - version = "0.3.31" 576 - source = "registry+https://github.com/rust-lang/crates.io-index" 577 - checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" 578 - 579 - [[package]] 580 - name = "futures-executor" 581 - version = "0.3.31" 582 - source = "registry+https://github.com/rust-lang/crates.io-index" 583 - checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" 584 - dependencies = [ 585 - "futures-core", 586 - "futures-task", 587 - "futures-util", 588 - ] 589 - 590 - [[package]] 591 - name = "futures-io" 592 - version = "0.3.31" 593 - source = "registry+https://github.com/rust-lang/crates.io-index" 594 - checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" 595 - 596 - [[package]] 597 - name = "futures-macro" 598 - version = "0.3.31" 599 - source = "registry+https://github.com/rust-lang/crates.io-index" 600 - checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" 601 - dependencies = [ 602 - "proc-macro2", 603 - "quote", 604 - "syn 2.0.106", 605 - ] 606 - 607 - [[package]] 608 - name = "futures-sink" 609 - version = "0.3.31" 610 - source = "registry+https://github.com/rust-lang/crates.io-index" 611 - checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" 612 - 613 - [[package]] 614 - name = "futures-task" 615 - version = "0.3.31" 616 - source = "registry+https://github.com/rust-lang/crates.io-index" 617 - checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" 618 - 619 - [[package]] 620 - name = "futures-util" 621 - version = "0.3.31" 622 - source = "registry+https://github.com/rust-lang/crates.io-index" 623 - checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" 624 - dependencies = [ 625 - "futures-channel", 626 - "futures-core", 627 - "futures-io", 628 - "futures-macro", 629 - "futures-sink", 630 - "futures-task", 631 - "memchr", 632 - "pin-project-lite", 633 - "pin-utils", 634 - "slab", 635 - ] 636 - 637 - [[package]] 638 - name = "generic-array" 639 - version = "0.14.9" 640 - source = "registry+https://github.com/rust-lang/crates.io-index" 641 - checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" 642 - dependencies = [ 643 - "typenum", 644 - "version_check", 645 - ] 646 - 647 - [[package]] 648 231 name = "getrandom" 649 232 version = "0.3.3" 650 233 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 653 236 "cfg-if", 654 237 "libc", 655 238 "r-efi", 656 - "wasi 0.14.7+wasi-0.2.4", 657 - ] 658 - 659 - [[package]] 660 - name = "gimli" 661 - version = "0.32.3" 662 - source = "registry+https://github.com/rust-lang/crates.io-index" 663 - checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" 664 - 665 - [[package]] 666 - name = "half" 667 - version = "2.7.0" 668 - source = "registry+https://github.com/rust-lang/crates.io-index" 669 - checksum = "e54c115d4f30f52c67202f079c5f9d8b49db4691f460fdb0b4c2e838261b2ba5" 670 - dependencies = [ 671 - "cfg-if", 672 - "crunchy", 673 - "zerocopy", 239 + "wasi", 674 240 ] 675 241 676 242 [[package]] ··· 701 267 ] 702 268 703 269 [[package]] 704 - name = "io-uring" 705 - version = "0.7.10" 706 - source = "registry+https://github.com/rust-lang/crates.io-index" 707 - checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" 708 - dependencies = [ 709 - "bitflags", 710 - "cfg-if", 711 - "libc", 712 - ] 713 - 714 - [[package]] 715 - name = "ipld-core" 716 - version = "0.4.2" 717 - source = "registry+https://github.com/rust-lang/crates.io-index" 718 - checksum = "104718b1cc124d92a6d01ca9c9258a7df311405debb3408c445a36452f9bf8db" 719 - dependencies = [ 720 - "cid", 721 - "serde", 722 - "serde_bytes", 723 - ] 724 - 725 - [[package]] 726 - name = "iroh-car" 727 - version = "0.5.1" 728 - source = "registry+https://github.com/rust-lang/crates.io-index" 729 - checksum = "cb7f8cd4cb9aa083fba8b52e921764252d0b4dcb1cd6d120b809dbfe1106e81a" 730 - dependencies = [ 731 - "anyhow", 732 - "cid", 733 - "futures", 734 - "serde", 735 - "serde_ipld_dagcbor", 736 - "thiserror 1.0.69", 737 - "tokio", 738 - "unsigned-varint 0.7.2", 739 - ] 740 - 741 - [[package]] 742 270 name = "is_terminal_polyfill" 743 271 version = "1.70.1" 744 272 source = "registry+https://github.com/rust-lang/crates.io-index" 745 273 checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" 746 274 747 275 [[package]] 748 - name = "itertools" 749 - version = "0.13.0" 750 - source = "registry+https://github.com/rust-lang/crates.io-index" 751 - checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" 752 - dependencies = [ 753 - "either", 754 - ] 755 - 756 - [[package]] 757 - name = "itoa" 758 - version = "1.0.15" 759 - source = "registry+https://github.com/rust-lang/crates.io-index" 760 - checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 761 - 762 - [[package]] 763 - name = "jiff" 764 - version = "0.2.15" 765 - source = "registry+https://github.com/rust-lang/crates.io-index" 766 - checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" 767 - dependencies = [ 768 - "jiff-static", 769 - "log", 770 - "portable-atomic", 771 - "portable-atomic-util", 772 - "serde", 773 - ] 774 - 775 - [[package]] 776 - name = "jiff-static" 777 - version = "0.2.15" 778 - source = "registry+https://github.com/rust-lang/crates.io-index" 779 - checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" 780 - dependencies = [ 781 - "proc-macro2", 782 - "quote", 783 - "syn 2.0.106", 784 - ] 785 - 786 - [[package]] 787 - name = "js-sys" 788 - version = "0.3.81" 789 - source = "registry+https://github.com/rust-lang/crates.io-index" 790 - checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" 791 - dependencies = [ 792 - "once_cell", 793 - "wasm-bindgen", 794 - ] 795 - 796 - [[package]] 797 276 name = "libc" 798 277 version = "0.2.176" 799 278 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 832 311 "enum_dispatch", 833 312 "interval-heap", 834 313 "log", 314 + "lz4_flex", 835 315 "quick_cache", 836 316 "rustc-hash", 837 317 "self_cell", ··· 842 322 ] 843 323 844 324 [[package]] 845 - name = "match-lookup" 846 - version = "0.1.1" 847 - source = "registry+https://github.com/rust-lang/crates.io-index" 848 - checksum = "1265724d8cb29dbbc2b0f06fffb8bf1a8c0cf73a78eede9ba73a4a66c52a981e" 849 - dependencies = [ 850 - "proc-macro2", 851 - "quote", 852 - "syn 1.0.109", 853 - ] 854 - 855 - [[package]] 856 - name = "memchr" 857 - version = "2.7.6" 858 - source = "registry+https://github.com/rust-lang/crates.io-index" 859 - checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" 860 - 861 - [[package]] 862 - name = "miniz_oxide" 863 - version = "0.8.9" 864 - source = "registry+https://github.com/rust-lang/crates.io-index" 865 - checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" 866 - dependencies = [ 867 - "adler2", 868 - ] 869 - 870 - [[package]] 871 - name = "mio" 872 - version = "1.0.4" 873 - source = "registry+https://github.com/rust-lang/crates.io-index" 874 - checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" 875 - dependencies = [ 876 - "libc", 877 - "wasi 0.11.1+wasi-snapshot-preview1", 878 - "windows-sys 0.59.0", 879 - ] 880 - 881 - [[package]] 882 - name = "multibase" 883 - version = "0.9.2" 884 - source = "registry+https://github.com/rust-lang/crates.io-index" 885 - checksum = "8694bb4835f452b0e3bb06dbebb1d6fc5385b6ca1caf2e55fd165c042390ec77" 886 - dependencies = [ 887 - "base-x", 888 - "base256emoji", 889 - "data-encoding", 890 - "data-encoding-macro", 891 - ] 892 - 893 - [[package]] 894 - name = "multihash" 895 - version = "0.19.3" 896 - source = "registry+https://github.com/rust-lang/crates.io-index" 897 - checksum = "6b430e7953c29dd6a09afc29ff0bb69c6e306329ee6794700aee27b76a1aea8d" 898 - dependencies = [ 899 - "core2", 900 - "serde", 901 - "unsigned-varint 0.8.0", 902 - ] 903 - 904 - [[package]] 905 - name = "num-traits" 906 - version = "0.2.19" 907 - source = "registry+https://github.com/rust-lang/crates.io-index" 908 - checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 909 - dependencies = [ 910 - "autocfg", 911 - ] 912 - 913 - [[package]] 914 - name = "object" 915 - version = "0.37.3" 325 + name = "lz4_flex" 326 + version = "0.11.5" 916 327 source = "registry+https://github.com/rust-lang/crates.io-index" 917 - checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" 328 + checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" 918 329 dependencies = [ 919 - "memchr", 330 + "twox-hash", 920 331 ] 921 332 922 333 [[package]] ··· 932 343 checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" 933 344 934 345 [[package]] 935 - name = "oorandom" 936 - version = "11.1.5" 937 - source = "registry+https://github.com/rust-lang/crates.io-index" 938 - checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" 939 - 940 - [[package]] 941 - name = "parking_lot" 942 - version = "0.12.5" 943 - source = "registry+https://github.com/rust-lang/crates.io-index" 944 - checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" 945 - dependencies = [ 946 - "lock_api", 947 - "parking_lot_core", 948 - ] 949 - 950 - [[package]] 951 346 name = "parking_lot_core" 952 347 version = "0.9.12" 953 348 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 961 356 ] 962 357 963 358 [[package]] 964 - name = "pin-project-lite" 965 - version = "0.2.16" 966 - source = "registry+https://github.com/rust-lang/crates.io-index" 967 - checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" 968 - 969 - [[package]] 970 - name = "pin-utils" 971 - version = "0.1.0" 972 - source = "registry+https://github.com/rust-lang/crates.io-index" 973 - checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" 974 - 975 - [[package]] 976 - name = "plotters" 977 - version = "0.3.7" 978 - source = "registry+https://github.com/rust-lang/crates.io-index" 979 - checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" 980 - dependencies = [ 981 - "num-traits", 982 - "plotters-backend", 983 - "plotters-svg", 984 - "wasm-bindgen", 985 - "web-sys", 986 - ] 987 - 988 - [[package]] 989 - name = "plotters-backend" 990 - version = "0.3.7" 991 - source = "registry+https://github.com/rust-lang/crates.io-index" 992 - checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" 993 - 994 - [[package]] 995 - name = "plotters-svg" 996 - version = "0.3.7" 997 - source = "registry+https://github.com/rust-lang/crates.io-index" 998 - checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" 999 - dependencies = [ 1000 - "plotters-backend", 1001 - ] 1002 - 1003 - [[package]] 1004 - name = "portable-atomic" 1005 - version = "1.11.1" 1006 - source = "registry+https://github.com/rust-lang/crates.io-index" 1007 - checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" 1008 - 1009 - [[package]] 1010 - name = "portable-atomic-util" 1011 - version = "0.2.4" 1012 - source = "registry+https://github.com/rust-lang/crates.io-index" 1013 - checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" 1014 - dependencies = [ 1015 - "portable-atomic", 1016 - ] 1017 - 1018 - [[package]] 1019 359 name = "proc-macro2" 1020 360 version = "1.0.101" 1021 361 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1050 390 checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" 1051 391 1052 392 [[package]] 1053 - name = "rayon" 1054 - version = "1.11.0" 1055 - source = "registry+https://github.com/rust-lang/crates.io-index" 1056 - checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" 1057 - dependencies = [ 1058 - "either", 1059 - "rayon-core", 1060 - ] 1061 - 1062 - [[package]] 1063 - name = "rayon-core" 1064 - version = "1.13.0" 1065 - source = "registry+https://github.com/rust-lang/crates.io-index" 1066 - checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" 1067 - dependencies = [ 1068 - "crossbeam-deque", 1069 - "crossbeam-utils", 1070 - ] 1071 - 1072 - [[package]] 1073 393 name = "redox_syscall" 1074 394 version = "0.5.18" 1075 395 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1079 399 ] 1080 400 1081 401 [[package]] 1082 - name = "regex" 1083 - version = "1.11.3" 1084 - source = "registry+https://github.com/rust-lang/crates.io-index" 1085 - checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c" 1086 - dependencies = [ 1087 - "aho-corasick", 1088 - "memchr", 1089 - "regex-automata", 1090 - "regex-syntax", 1091 - ] 1092 - 1093 - [[package]] 1094 - name = "regex-automata" 1095 - version = "0.4.11" 1096 - source = "registry+https://github.com/rust-lang/crates.io-index" 1097 - checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad" 1098 - dependencies = [ 1099 - "aho-corasick", 1100 - "memchr", 1101 - "regex-syntax", 1102 - ] 1103 - 1104 - [[package]] 1105 - name = "regex-syntax" 1106 - version = "0.8.6" 1107 - source = "registry+https://github.com/rust-lang/crates.io-index" 1108 - checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" 1109 - 1110 - [[package]] 1111 402 name = "repo-stream" 1112 403 version = "0.2.2" 1113 404 dependencies = [ 1114 - "bincode", 1115 405 "clap", 1116 - "criterion", 1117 - "env_logger", 1118 406 "fjall", 1119 - "futures", 1120 - "futures-core", 1121 - "ipld-core", 1122 - "iroh-car", 1123 - "log", 1124 - "multibase", 1125 - "serde", 1126 - "serde_bytes", 1127 - "serde_ipld_dagcbor", 1128 - "sha2", 1129 - "tempfile", 1130 - "thiserror 2.0.17", 1131 - "tokio", 1132 407 ] 1133 408 1134 409 [[package]] 1135 - name = "rustc-demangle" 1136 - version = "0.1.26" 1137 - source = "registry+https://github.com/rust-lang/crates.io-index" 1138 - checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" 1139 - 1140 - [[package]] 1141 410 name = "rustc-hash" 1142 411 version = "2.1.1" 1143 412 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1153 422 "errno", 1154 423 "libc", 1155 424 "linux-raw-sys", 1156 - "windows-sys 0.60.2", 1157 - ] 1158 - 1159 - [[package]] 1160 - name = "rustversion" 1161 - version = "1.0.22" 1162 - source = "registry+https://github.com/rust-lang/crates.io-index" 1163 - checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" 1164 - 1165 - [[package]] 1166 - name = "ryu" 1167 - version = "1.0.20" 1168 - source = "registry+https://github.com/rust-lang/crates.io-index" 1169 - checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 1170 - 1171 - [[package]] 1172 - name = "same-file" 1173 - version = "1.0.6" 1174 - source = "registry+https://github.com/rust-lang/crates.io-index" 1175 - checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" 1176 - dependencies = [ 1177 - "winapi-util", 425 + "windows-sys", 1178 426 ] 1179 427 1180 428 [[package]] ··· 1190 438 checksum = "b12e76d157a900eb52e81bc6e9f3069344290341720e9178cde2407113ac8d89" 1191 439 1192 440 [[package]] 1193 - name = "serde" 1194 - version = "1.0.228" 1195 - source = "registry+https://github.com/rust-lang/crates.io-index" 1196 - checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" 1197 - dependencies = [ 1198 - "serde_core", 1199 - "serde_derive", 1200 - ] 1201 - 1202 - [[package]] 1203 - name = "serde_bytes" 1204 - version = "0.11.19" 1205 - source = "registry+https://github.com/rust-lang/crates.io-index" 1206 - checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" 1207 - dependencies = [ 1208 - "serde", 1209 - "serde_core", 1210 - ] 1211 - 1212 - [[package]] 1213 - name = "serde_core" 1214 - version = "1.0.228" 1215 - source = "registry+https://github.com/rust-lang/crates.io-index" 1216 - checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" 1217 - dependencies = [ 1218 - "serde_derive", 1219 - ] 1220 - 1221 - [[package]] 1222 - name = "serde_derive" 1223 - version = "1.0.228" 1224 - source = "registry+https://github.com/rust-lang/crates.io-index" 1225 - checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" 1226 - dependencies = [ 1227 - "proc-macro2", 1228 - "quote", 1229 - "syn 2.0.106", 1230 - ] 1231 - 1232 - [[package]] 1233 - name = "serde_ipld_dagcbor" 1234 - version = "0.6.4" 1235 - source = "registry+https://github.com/rust-lang/crates.io-index" 1236 - checksum = "46182f4f08349a02b45c998ba3215d3f9de826246ba02bb9dddfe9a2a2100778" 1237 - dependencies = [ 1238 - "cbor4ii", 1239 - "ipld-core", 1240 - "scopeguard", 1241 - "serde", 1242 - ] 1243 - 1244 - [[package]] 1245 - name = "serde_json" 1246 - version = "1.0.145" 1247 - source = "registry+https://github.com/rust-lang/crates.io-index" 1248 - checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" 1249 - dependencies = [ 1250 - "itoa", 1251 - "memchr", 1252 - "ryu", 1253 - "serde", 1254 - "serde_core", 1255 - ] 1256 - 1257 - [[package]] 1258 441 name = "sfa" 1259 442 version = "1.0.0" 1260 443 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1266 449 ] 1267 450 1268 451 [[package]] 1269 - name = "sha2" 1270 - version = "0.10.9" 1271 - source = "registry+https://github.com/rust-lang/crates.io-index" 1272 - checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" 1273 - dependencies = [ 1274 - "cfg-if", 1275 - "cpufeatures", 1276 - "digest", 1277 - ] 1278 - 1279 - [[package]] 1280 - name = "signal-hook-registry" 1281 - version = "1.4.6" 1282 - source = "registry+https://github.com/rust-lang/crates.io-index" 1283 - checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" 1284 - dependencies = [ 1285 - "libc", 1286 - ] 1287 - 1288 - [[package]] 1289 - name = "slab" 1290 - version = "0.4.11" 1291 - source = "registry+https://github.com/rust-lang/crates.io-index" 1292 - checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" 1293 - 1294 - [[package]] 1295 452 name = "smallvec" 1296 453 version = "1.15.1" 1297 454 source = "registry+https://github.com/rust-lang/crates.io-index" 1298 455 checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" 1299 456 1300 457 [[package]] 1301 - name = "socket2" 1302 - version = "0.6.0" 1303 - source = "registry+https://github.com/rust-lang/crates.io-index" 1304 - checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" 1305 - dependencies = [ 1306 - "libc", 1307 - "windows-sys 0.59.0", 1308 - ] 1309 - 1310 - [[package]] 1311 458 name = "spin" 1312 459 version = "0.9.8" 1313 460 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1324 471 1325 472 [[package]] 1326 473 name = "syn" 1327 - version = "1.0.109" 1328 - source = "registry+https://github.com/rust-lang/crates.io-index" 1329 - checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" 1330 - dependencies = [ 1331 - "proc-macro2", 1332 - "quote", 1333 - "unicode-ident", 1334 - ] 1335 - 1336 - [[package]] 1337 - name = "syn" 1338 474 version = "2.0.106" 1339 475 source = "registry+https://github.com/rust-lang/crates.io-index" 1340 476 checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" ··· 1354 490 "getrandom", 1355 491 "once_cell", 1356 492 "rustix", 1357 - "windows-sys 0.60.2", 1358 - ] 1359 - 1360 - [[package]] 1361 - name = "thiserror" 1362 - version = "1.0.69" 1363 - source = "registry+https://github.com/rust-lang/crates.io-index" 1364 - checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" 1365 - dependencies = [ 1366 - "thiserror-impl 1.0.69", 493 + "windows-sys", 1367 494 ] 1368 495 1369 496 [[package]] 1370 - name = "thiserror" 1371 - version = "2.0.17" 497 + name = "twox-hash" 498 + version = "2.1.2" 1372 499 source = "registry+https://github.com/rust-lang/crates.io-index" 1373 - checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" 1374 - dependencies = [ 1375 - "thiserror-impl 2.0.17", 1376 - ] 1377 - 1378 - [[package]] 1379 - name = "thiserror-impl" 1380 - version = "1.0.69" 1381 - source = "registry+https://github.com/rust-lang/crates.io-index" 1382 - checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" 1383 - dependencies = [ 1384 - "proc-macro2", 1385 - "quote", 1386 - "syn 2.0.106", 1387 - ] 1388 - 1389 - [[package]] 1390 - name = "thiserror-impl" 1391 - version = "2.0.17" 1392 - source = "registry+https://github.com/rust-lang/crates.io-index" 1393 - checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" 1394 - dependencies = [ 1395 - "proc-macro2", 1396 - "quote", 1397 - "syn 2.0.106", 1398 - ] 1399 - 1400 - [[package]] 1401 - name = "tinytemplate" 1402 - version = "1.2.1" 1403 - source = "registry+https://github.com/rust-lang/crates.io-index" 1404 - checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" 1405 - dependencies = [ 1406 - "serde", 1407 - "serde_json", 1408 - ] 1409 - 1410 - [[package]] 1411 - name = "tokio" 1412 - version = "1.47.1" 1413 - source = "registry+https://github.com/rust-lang/crates.io-index" 1414 - checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" 1415 - dependencies = [ 1416 - "backtrace", 1417 - "bytes", 1418 - "io-uring", 1419 - "libc", 1420 - "mio", 1421 - "parking_lot", 1422 - "pin-project-lite", 1423 - "signal-hook-registry", 1424 - "slab", 1425 - "socket2", 1426 - "tokio-macros", 1427 - "windows-sys 0.59.0", 1428 - ] 1429 - 1430 - [[package]] 1431 - name = "tokio-macros" 1432 - version = "2.5.0" 1433 - source = "registry+https://github.com/rust-lang/crates.io-index" 1434 - checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" 1435 - dependencies = [ 1436 - "proc-macro2", 1437 - "quote", 1438 - "syn 2.0.106", 1439 - ] 1440 - 1441 - [[package]] 1442 - name = "typenum" 1443 - version = "1.19.0" 1444 - source = "registry+https://github.com/rust-lang/crates.io-index" 1445 - checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" 500 + checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" 1446 501 1447 502 [[package]] 1448 503 name = "unicode-ident" ··· 1451 506 checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" 1452 507 1453 508 [[package]] 1454 - name = "unsigned-varint" 1455 - version = "0.7.2" 1456 - source = "registry+https://github.com/rust-lang/crates.io-index" 1457 - checksum = "6889a77d49f1f013504cec6bf97a2c730394adedaeb1deb5ea08949a50541105" 1458 - 1459 - [[package]] 1460 - name = "unsigned-varint" 1461 - version = "0.8.0" 1462 - source = "registry+https://github.com/rust-lang/crates.io-index" 1463 - checksum = "eb066959b24b5196ae73cb057f45598450d2c5f71460e98c49b738086eff9c06" 1464 - 1465 - [[package]] 1466 - name = "unty" 1467 - version = "0.0.4" 1468 - source = "registry+https://github.com/rust-lang/crates.io-index" 1469 - checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" 1470 - 1471 - [[package]] 1472 509 name = "utf8parse" 1473 510 version = "0.2.2" 1474 511 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1479 516 version = "2.2.0" 1480 517 source = "registry+https://github.com/rust-lang/crates.io-index" 1481 518 checksum = "8f54a172d0620933a27a4360d3db3e2ae0dd6cceae9730751a036bbf182c4b23" 1482 - 1483 - [[package]] 1484 - name = "version_check" 1485 - version = "0.9.5" 1486 - source = "registry+https://github.com/rust-lang/crates.io-index" 1487 - checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" 1488 - 1489 - [[package]] 1490 - name = "virtue" 1491 - version = "0.0.18" 1492 - source = "registry+https://github.com/rust-lang/crates.io-index" 1493 - checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" 1494 - 1495 - [[package]] 1496 - name = "walkdir" 1497 - version = "2.5.0" 1498 - source = "registry+https://github.com/rust-lang/crates.io-index" 1499 - checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" 1500 - dependencies = [ 1501 - "same-file", 1502 - "winapi-util", 1503 - ] 1504 - 1505 - [[package]] 1506 - name = "wasi" 1507 - version = "0.11.1+wasi-snapshot-preview1" 1508 - source = "registry+https://github.com/rust-lang/crates.io-index" 1509 - checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" 1510 519 1511 520 [[package]] 1512 521 name = "wasi" ··· 1527 536 ] 1528 537 1529 538 [[package]] 1530 - name = "wasm-bindgen" 1531 - version = "0.2.104" 1532 - source = "registry+https://github.com/rust-lang/crates.io-index" 1533 - checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" 1534 - dependencies = [ 1535 - "cfg-if", 1536 - "once_cell", 1537 - "rustversion", 1538 - "wasm-bindgen-macro", 1539 - "wasm-bindgen-shared", 1540 - ] 1541 - 1542 - [[package]] 1543 - name = "wasm-bindgen-backend" 1544 - version = "0.2.104" 1545 - source = "registry+https://github.com/rust-lang/crates.io-index" 1546 - checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" 1547 - dependencies = [ 1548 - "bumpalo", 1549 - "log", 1550 - "proc-macro2", 1551 - "quote", 1552 - "syn 2.0.106", 1553 - "wasm-bindgen-shared", 1554 - ] 1555 - 1556 - [[package]] 1557 - name = "wasm-bindgen-macro" 1558 - version = "0.2.104" 1559 - source = "registry+https://github.com/rust-lang/crates.io-index" 1560 - checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" 1561 - dependencies = [ 1562 - "quote", 1563 - "wasm-bindgen-macro-support", 1564 - ] 1565 - 1566 - [[package]] 1567 - name = "wasm-bindgen-macro-support" 1568 - version = "0.2.104" 1569 - source = "registry+https://github.com/rust-lang/crates.io-index" 1570 - checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" 1571 - dependencies = [ 1572 - "proc-macro2", 1573 - "quote", 1574 - "syn 2.0.106", 1575 - "wasm-bindgen-backend", 1576 - "wasm-bindgen-shared", 1577 - ] 1578 - 1579 - [[package]] 1580 - name = "wasm-bindgen-shared" 1581 - version = "0.2.104" 1582 - source = "registry+https://github.com/rust-lang/crates.io-index" 1583 - checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" 1584 - dependencies = [ 1585 - "unicode-ident", 1586 - ] 1587 - 1588 - [[package]] 1589 - name = "web-sys" 1590 - version = "0.3.81" 1591 - source = "registry+https://github.com/rust-lang/crates.io-index" 1592 - checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" 1593 - dependencies = [ 1594 - "js-sys", 1595 - "wasm-bindgen", 1596 - ] 1597 - 1598 - [[package]] 1599 - name = "winapi-util" 1600 - version = "0.1.11" 1601 - source = "registry+https://github.com/rust-lang/crates.io-index" 1602 - checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" 1603 - dependencies = [ 1604 - "windows-sys 0.60.2", 1605 - ] 1606 - 1607 - [[package]] 1608 539 name = "windows-link" 1609 540 version = "0.2.1" 1610 541 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1612 543 1613 544 [[package]] 1614 545 name = "windows-sys" 1615 - version = "0.59.0" 1616 - source = "registry+https://github.com/rust-lang/crates.io-index" 1617 - checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 1618 - dependencies = [ 1619 - "windows-targets 0.52.6", 1620 - ] 1621 - 1622 - [[package]] 1623 - name = "windows-sys" 1624 546 version = "0.60.2" 1625 547 source = "registry+https://github.com/rust-lang/crates.io-index" 1626 548 checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" 1627 549 dependencies = [ 1628 - "windows-targets 0.53.5", 1629 - ] 1630 - 1631 - [[package]] 1632 - name = "windows-targets" 1633 - version = "0.52.6" 1634 - source = "registry+https://github.com/rust-lang/crates.io-index" 1635 - checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 1636 - dependencies = [ 1637 - "windows_aarch64_gnullvm 0.52.6", 1638 - "windows_aarch64_msvc 0.52.6", 1639 - "windows_i686_gnu 0.52.6", 1640 - "windows_i686_gnullvm 0.52.6", 1641 - "windows_i686_msvc 0.52.6", 1642 - "windows_x86_64_gnu 0.52.6", 1643 - "windows_x86_64_gnullvm 0.52.6", 1644 - "windows_x86_64_msvc 0.52.6", 550 + "windows-targets", 1645 551 ] 1646 552 1647 553 [[package]] ··· 1651 557 checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" 1652 558 dependencies = [ 1653 559 "windows-link", 1654 - "windows_aarch64_gnullvm 0.53.1", 1655 - "windows_aarch64_msvc 0.53.1", 1656 - "windows_i686_gnu 0.53.1", 1657 - "windows_i686_gnullvm 0.53.1", 1658 - "windows_i686_msvc 0.53.1", 1659 - "windows_x86_64_gnu 0.53.1", 1660 - "windows_x86_64_gnullvm 0.53.1", 1661 - "windows_x86_64_msvc 0.53.1", 560 + "windows_aarch64_gnullvm", 561 + "windows_aarch64_msvc", 562 + "windows_i686_gnu", 563 + "windows_i686_gnullvm", 564 + "windows_i686_msvc", 565 + "windows_x86_64_gnu", 566 + "windows_x86_64_gnullvm", 567 + "windows_x86_64_msvc", 1662 568 ] 1663 569 1664 570 [[package]] 1665 571 name = "windows_aarch64_gnullvm" 1666 - version = "0.52.6" 1667 - source = "registry+https://github.com/rust-lang/crates.io-index" 1668 - checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 1669 - 1670 - [[package]] 1671 - name = "windows_aarch64_gnullvm" 1672 572 version = "0.53.1" 1673 573 source = "registry+https://github.com/rust-lang/crates.io-index" 1674 574 checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" 1675 - 1676 - [[package]] 1677 - name = "windows_aarch64_msvc" 1678 - version = "0.52.6" 1679 - source = "registry+https://github.com/rust-lang/crates.io-index" 1680 - checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 1681 575 1682 576 [[package]] 1683 577 name = "windows_aarch64_msvc" ··· 1687 581 1688 582 [[package]] 1689 583 name = "windows_i686_gnu" 1690 - version = "0.52.6" 1691 - source = "registry+https://github.com/rust-lang/crates.io-index" 1692 - checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 1693 - 1694 - [[package]] 1695 - name = "windows_i686_gnu" 1696 584 version = "0.53.1" 1697 585 source = "registry+https://github.com/rust-lang/crates.io-index" 1698 586 checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" 1699 587 1700 588 [[package]] 1701 589 name = "windows_i686_gnullvm" 1702 - version = "0.52.6" 1703 - source = "registry+https://github.com/rust-lang/crates.io-index" 1704 - checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 1705 - 1706 - [[package]] 1707 - name = "windows_i686_gnullvm" 1708 590 version = "0.53.1" 1709 591 source = "registry+https://github.com/rust-lang/crates.io-index" 1710 592 checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" 1711 593 1712 594 [[package]] 1713 595 name = "windows_i686_msvc" 1714 - version = "0.52.6" 1715 - source = "registry+https://github.com/rust-lang/crates.io-index" 1716 - checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 1717 - 1718 - [[package]] 1719 - name = "windows_i686_msvc" 1720 596 version = "0.53.1" 1721 597 source = "registry+https://github.com/rust-lang/crates.io-index" 1722 598 checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" 1723 599 1724 600 [[package]] 1725 601 name = "windows_x86_64_gnu" 1726 - version = "0.52.6" 1727 - source = "registry+https://github.com/rust-lang/crates.io-index" 1728 - checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 1729 - 1730 - [[package]] 1731 - name = "windows_x86_64_gnu" 1732 602 version = "0.53.1" 1733 603 source = "registry+https://github.com/rust-lang/crates.io-index" 1734 604 checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" 1735 605 1736 606 [[package]] 1737 607 name = "windows_x86_64_gnullvm" 1738 - version = "0.52.6" 1739 - source = "registry+https://github.com/rust-lang/crates.io-index" 1740 - checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 1741 - 1742 - [[package]] 1743 - name = "windows_x86_64_gnullvm" 1744 608 version = "0.53.1" 1745 609 source = "registry+https://github.com/rust-lang/crates.io-index" 1746 610 checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" 1747 - 1748 - [[package]] 1749 - name = "windows_x86_64_msvc" 1750 - version = "0.52.6" 1751 - source = "registry+https://github.com/rust-lang/crates.io-index" 1752 - checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 1753 611 1754 612 [[package]] 1755 613 name = "windows_x86_64_msvc" ··· 1768 626 version = "0.8.15" 1769 627 source = "registry+https://github.com/rust-lang/crates.io-index" 1770 628 checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" 1771 - 1772 - [[package]] 1773 - name = "zerocopy" 1774 - version = "0.8.27" 1775 - source = "registry+https://github.com/rust-lang/crates.io-index" 1776 - checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" 1777 - dependencies = [ 1778 - "zerocopy-derive", 1779 - ] 1780 - 1781 - [[package]] 1782 - name = "zerocopy-derive" 1783 - version = "0.8.27" 1784 - source = "registry+https://github.com/rust-lang/crates.io-index" 1785 - checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" 1786 - dependencies = [ 1787 - "proc-macro2", 1788 - "quote", 1789 - "syn 2.0.106", 1790 - ]
+1 -35
Cargo.toml
··· 7 7 repository = "https://tangled.org/@microcosm.blue/repo-stream" 8 8 9 9 [dependencies] 10 - bincode = { version = "2.0.1", features = ["serde"] } 11 - fjall = { version = "3.0.1", default-features = false } 12 - futures = "0.3.31" 13 - futures-core = "0.3.31" 14 - ipld-core = { version = "0.4.2", features = ["serde"] } 15 - iroh-car = "0.5.1" 16 - log = "0.4.28" 17 - multibase = "0.9.2" 18 - serde = { version = "1.0.228", features = ["derive"] } 19 - serde_bytes = "0.11.19" 20 - serde_ipld_dagcbor = "0.6.4" 21 - sha2 = "0.10.9" 22 - thiserror = "2.0.17" 23 - tokio = { version = "1.47.1", features = ["rt", "sync"] } 24 - 25 - [dev-dependencies] 10 + fjall = "3.0.1" 26 11 clap = { version = "4.5.48", features = ["derive"] } 27 - criterion = { version = "0.7.0", features = ["async_tokio"] } 28 - env_logger = "0.11.8" 29 - multibase = "0.9.2" 30 - tempfile = "3.23.0" 31 - tokio = { version = "1.47.1", features = ["full"] } 32 12 33 - [profile.profiling] 34 - inherits = "release" 35 - debug = true 36 - 37 - # [profile.release] 38 - # debug = true 39 - 40 - [[bench]] 41 - name = "non-huge-cars" 42 - harness = false 43 - 44 - [[bench]] 45 - name = "huge-car" 46 - harness = false
+26 -73
examples/disk-read-file/main.rs
··· 1 - /*! 2 - Read a CAR file by spilling to disk 3 - */ 4 - 5 - extern crate repo_stream; 6 1 use clap::Parser; 7 - use repo_stream::{DiskBuilder, Driver, DriverBuilder}; 8 - use std::path::PathBuf; 9 - use std::time::Instant; 2 + use fjall::{Database, KeyspaceCreateOptions}; 3 + use std::{path::PathBuf, collections::BTreeMap}; 10 4 11 5 #[derive(Debug, Parser)] 12 6 struct Args { 13 7 #[arg()] 14 - car: PathBuf, 15 - #[arg()] 16 - tmpfile: PathBuf, 8 + db_path: PathBuf, 17 9 } 18 10 19 - #[tokio::main] 20 - async fn main() -> Result<(), Box<dyn std::error::Error>> { 21 - env_logger::init(); 11 + fn main() -> Result<(), Box<dyn std::error::Error>> { 12 + let Args { db_path } = Args::parse(); 22 13 23 - let Args { car, tmpfile } = Args::parse(); 14 + let db = Database::builder(db_path).open()?; 15 + let ks = db.keyspace("z", KeyspaceCreateOptions::default)?; 16 + let mut seen_keys: BTreeMap<Vec<u8>, usize> = BTreeMap::default(); 24 17 25 - // repo-stream takes an AsyncRead as input. wrapping a filesystem read in 26 - // BufReader can provide a really significant performance win. 27 - let reader = tokio::fs::File::open(car).await?; 28 - let reader = tokio::io::BufReader::new(reader); 18 + print!("writing..."); 19 + for i in 0..250_000_usize { 20 + let k = i.to_be_bytes().to_vec(); 21 + ks.insert(k.clone(), vec![0xAA; 256])?; 22 + seen_keys.insert(k, i); 23 + } 29 24 30 - log::info!("hello! reading the car..."); 31 - let t0 = Instant::now(); 25 + println!(" done. checking keys..."); 32 26 33 - // in this example we only bother handling CARs that are too big for memory 34 - // `noop` helper means: do no block processing, store the raw blocks 35 - let driver = match DriverBuilder::new() 36 - .with_mem_limit_mb(32) // how much memory can be used before disk spill 37 - .load_car(reader) 38 - .await? 39 - { 40 - Driver::Memory(_, _) => panic!("try this on a bigger car"), 41 - Driver::Disk(big_stuff) => { 42 - // we reach here if the repo was too big and needs to be spilled to 43 - // disk to continue 27 + // remove every seen key that fjall actually has, to see what's left 28 + for guard in ks.iter() { 29 + seen_keys.remove(guard.key()?.as_ref()); 30 + } 44 31 45 - // set up a disk store we can spill to 46 - let disk_store = DiskBuilder::new().open(tmpfile).await?; 47 - 48 - // do the spilling, get back a (similar) driver 49 - let (commit, driver) = big_stuff.finish_loading(disk_store).await?; 50 - 51 - // at this point you might want to fetch the account's signing key 52 - // via the DID from the commit, and then verify the signature. 53 - log::warn!("big's comit ({:?}): {:?}", t0.elapsed(), commit); 54 - 55 - // pop the driver back out to get some code indentation relief 56 - driver 57 - } 58 - }; 59 - 60 - // collect some random stats about the blocks 61 - let mut n = 0; 62 - let mut zeros = 0; 63 - 64 - log::info!("walking..."); 65 - 66 - // this example uses the disk driver's channel mode: the tree walking is 67 - // spawned onto a blocking thread, and we get chunks of rkey+blocks back 68 - let (mut rx, join) = driver.to_channel(512); 69 - while let Some(r) = rx.recv().await { 70 - let pairs = r?; 71 - 72 - // keep a count of the total number of blocks seen 73 - n += pairs.len(); 74 - 75 - for (_, block) in pairs { 76 - // for each block, count how many bytes are equal to '0' 77 - // (this is just an example, you probably want to do something more 78 - // interesting) 79 - zeros += block.into_iter().filter(|&b| b == b'0').count() 32 + // report the result 33 + if seen_keys.len() == 0 { 34 + println!("[ OK ] all keys found"); 35 + } else { 36 + println!("[FAIL] fjall did not have all seen_keys:"); 37 + for (k, i) in seen_keys { 38 + println!(" insert #{i} missing, key bytes: {k:?}"); 80 39 } 81 40 } 82 - 83 - log::info!("arrived! ({:?}) joining rx...", t0.elapsed()); 84 - 85 - join.await?; 86 - 87 - log::info!("done. n={n} zeros={zeros}"); 88 41 89 42 Ok(()) 90 43 }
+3
readme.md
··· 50 50 total_size += size; 51 51 } 52 52 } 53 + 54 + // clean up the disk store (drop tables etc) 55 + driver.reset_store().await?; 53 56 } 54 57 }; 55 58 println!("sum of size of all records: {total_size}");
-162
src/disk.rs
··· 1 - /*! 2 - Disk storage for blocks on disk 3 - 4 - Currently this uses sqlite. In testing sqlite wasn't the fastest, but it seemed 5 - to be the best behaved in terms of both on-disk space usage and memory usage. 6 - 7 - ```no_run 8 - # use repo_stream::{DiskBuilder, DiskError}; 9 - # #[tokio::main] 10 - # async fn main() -> Result<(), DiskError> { 11 - let store = DiskBuilder::new() 12 - .with_cache_size_mb(32) 13 - .with_max_stored_mb(1024) // errors when >1GiB of processed blocks are inserted 14 - .open("/some/path.db".into()).await?; 15 - # Ok(()) 16 - # } 17 - ``` 18 - */ 19 - 20 - use crate::drive::DriveError; 21 - use fjall::config::{CompressionPolicy, PinningPolicy, RestartIntervalPolicy}; 22 - use fjall::{CompressionType, Database, Error as FjallError, Keyspace, KeyspaceCreateOptions}; 23 - use std::path::PathBuf; 24 - 25 - #[derive(Debug, thiserror::Error)] 26 - pub enum DiskError { 27 - /// A wrapped database error 28 - /// 29 - /// (The wrapped err should probably be obscured to remove public-facing 30 - /// sqlite bits) 31 - #[error(transparent)] 32 - DbError(#[from] FjallError), 33 - /// A tokio blocking task failed to join 34 - #[error("Failed to join a tokio blocking task: {0}")] 35 - JoinError(#[from] tokio::task::JoinError), 36 - /// The total size of stored blocks exceeded the allowed size 37 - /// 38 - /// If you need to process *really* big CARs, you can configure a higher 39 - /// limit. 40 - #[error("Maximum disk size reached")] 41 - MaxSizeExceeded, 42 - } 43 - 44 - /// Builder-style disk store setup 45 - #[derive(Debug, Clone)] 46 - pub struct DiskBuilder { 47 - /// Database in-memory cache allowance 48 - /// 49 - /// Default: 32 MiB 50 - pub cache_size_mb: usize, 51 - /// Database stored block size limit 52 - /// 53 - /// Default: 10 GiB 54 - /// 55 - /// Note: actual size on disk may be more, but should approximately scale 56 - /// with this limit 57 - pub max_stored_mb: usize, 58 - } 59 - 60 - impl Default for DiskBuilder { 61 - fn default() -> Self { 62 - Self { 63 - cache_size_mb: 64, 64 - max_stored_mb: 10 * 1024, // 10 GiB 65 - } 66 - } 67 - } 68 - 69 - impl DiskBuilder { 70 - /// Begin configuring the storage with defaults 71 - pub fn new() -> Self { 72 - Default::default() 73 - } 74 - /// Set the in-memory cache allowance for the database 75 - /// 76 - /// Default: 64 MiB 77 - pub fn with_cache_size_mb(mut self, size: usize) -> Self { 78 - self.cache_size_mb = size; 79 - self 80 - } 81 - /// Set the approximate stored block size limit 82 - /// 83 - /// Default: 10 GiB 84 - pub fn with_max_stored_mb(mut self, max: usize) -> Self { 85 - self.max_stored_mb = max; 86 - self 87 - } 88 - /// Open and initialize the actual disk storage 89 - pub async fn open(&self, path: PathBuf) -> Result<DiskStore, DiskError> { 90 - DiskStore::new(path, self.cache_size_mb, self.max_stored_mb).await 91 - } 92 - } 93 - 94 - /// On-disk block storage 95 - pub struct DiskStore { 96 - #[allow(unused)] 97 - db: Database, 98 - partition: Keyspace, 99 - max_stored: usize, 100 - stored: usize, 101 - } 102 - 103 - impl DiskStore { 104 - /// Initialize a new disk store 105 - pub async fn new( 106 - path: PathBuf, 107 - cache_mb: usize, 108 - max_stored_mb: usize, 109 - ) -> Result<Self, DiskError> { 110 - let max_stored = max_stored_mb * 2_usize.pow(20); 111 - let (db, partition) = tokio::task::spawn_blocking(move || { 112 - let db = Database::builder(path) 113 - // .manual_journal_persist(true) 114 - // .flush_workers(1) 115 - // .compaction_workers(1) 116 - .journal_compression(CompressionType::None) 117 - .cache_size(cache_mb as u64 * 2_u64.pow(20)) 118 - .temporary(true) 119 - .open()?; 120 - let opts = KeyspaceCreateOptions::default() 121 - .data_block_restart_interval_policy(RestartIntervalPolicy::all(8)) 122 - .filter_block_pinning_policy(PinningPolicy::disabled()) 123 - .expect_point_read_hits(true) 124 - .data_block_compression_policy(CompressionPolicy::disabled()) 125 - .manual_journal_persist(true) 126 - .max_memtable_size(32 * 2_u64.pow(20)); 127 - let partition = db.keyspace("z", || opts)?; 128 - 129 - Ok::<_, DiskError>((db, partition)) 130 - }) 131 - .await??; 132 - 133 - Ok(Self { 134 - db, 135 - partition, 136 - max_stored, 137 - stored: 0, 138 - }) 139 - } 140 - 141 - pub(crate) fn put_many( 142 - &mut self, 143 - kv: impl Iterator<Item = Result<(Vec<u8>, Vec<u8>), DriveError>>, 144 - ) -> Result<(), DriveError> { 145 - let mut batch = self.db.batch(); 146 - for pair in kv { 147 - let (k, v) = pair?; 148 - self.stored += v.len(); 149 - if self.stored > self.max_stored { 150 - return Err(DiskError::MaxSizeExceeded.into()); 151 - } 152 - batch.insert(&self.partition, k, v); 153 - } 154 - batch.commit().map_err(DiskError::DbError)?; 155 - Ok(()) 156 - } 157 - 158 - #[inline] 159 - pub(crate) fn get(&mut self, key: &[u8]) -> Result<Option<fjall::Slice>, FjallError> { 160 - self.partition.get(key) 161 - } 162 - }
-583
src/drive.rs
··· 1 - //! Consume a CAR from an AsyncRead, producing an ordered stream of records 2 - 3 - use crate::disk::{DiskError, DiskStore}; 4 - use crate::process::Processable; 5 - use ipld_core::cid::Cid; 6 - use iroh_car::CarReader; 7 - use serde::{Deserialize, Serialize}; 8 - use std::collections::HashMap; 9 - use std::convert::Infallible; 10 - use tokio::{io::AsyncRead, sync::mpsc}; 11 - 12 - use crate::mst::{Commit, Node}; 13 - use crate::walk::{Step, WalkError, Walker}; 14 - 15 - /// Errors that can happen while consuming and emitting blocks and records 16 - #[derive(Debug, thiserror::Error)] 17 - pub enum DriveError { 18 - #[error("Error from iroh_car: {0}")] 19 - CarReader(#[from] iroh_car::Error), 20 - #[error("Failed to decode commit block: {0}")] 21 - BadBlock(#[from] serde_ipld_dagcbor::DecodeError<Infallible>), 22 - #[error("The Commit block reference by the root was not found")] 23 - MissingCommit, 24 - #[error("The MST block {0} could not be found")] 25 - MissingBlock(Cid), 26 - #[error("Failed to walk the mst tree: {0}")] 27 - WalkError(#[from] WalkError), 28 - #[error("CAR file had no roots")] 29 - MissingRoot, 30 - #[error("Storage error")] 31 - StorageError(#[from] DiskError), 32 - #[error("Encode error: {0}")] 33 - BincodeEncodeError(#[from] bincode::error::EncodeError), 34 - #[error("Tried to send on a closed channel")] 35 - ChannelSendError, // SendError takes <T> which we don't need 36 - #[error("Failed to join a task: {0}")] 37 - JoinError(#[from] tokio::task::JoinError), 38 - } 39 - 40 - #[derive(Debug, thiserror::Error)] 41 - pub enum DecodeError { 42 - #[error(transparent)] 43 - BincodeDecodeError(#[from] bincode::error::DecodeError), 44 - #[error("extra bytes remained after decoding")] 45 - ExtraGarbage, 46 - } 47 - 48 - /// An in-order chunk of Rkey + (processed) Block pairs 49 - pub type BlockChunk<T> = Vec<(String, T)>; 50 - 51 - #[derive(Debug, Clone, Serialize, Deserialize)] 52 - pub(crate) enum MaybeProcessedBlock<T> { 53 - /// A block that's *probably* a Node (but we can't know yet) 54 - /// 55 - /// It *can be* a record that suspiciously looks a lot like a node, so we 56 - /// cannot eagerly turn it into a Node. We only know for sure what it is 57 - /// when we actually walk down the MST 58 - Raw(Vec<u8>), 59 - /// A processed record from a block that was definitely not a Node 60 - /// 61 - /// Processing has to be fallible because the CAR can have totally-unused 62 - /// blocks, which can just be garbage. since we're eagerly trying to process 63 - /// record blocks without knowing for sure that they *are* records, we 64 - /// discard any definitely-not-nodes that fail processing and keep their 65 - /// error in the buffer for them. if we later try to retreive them as a 66 - /// record, then we can surface the error. 67 - /// 68 - /// If we _never_ needed this block, then we may have wasted a bit of effort 69 - /// trying to process it. Oh well. 70 - /// 71 - /// There's an alternative here, which would be to kick unprocessable blocks 72 - /// back to Raw, or maybe even a new RawUnprocessable variant. Then we could 73 - /// surface the typed error later if needed by trying to reprocess. 74 - Processed(T), 75 - } 76 - 77 - impl<T: Processable> Processable for MaybeProcessedBlock<T> { 78 - /// TODO this is probably a little broken 79 - fn get_size(&self) -> usize { 80 - use std::{cmp::max, mem::size_of}; 81 - 82 - // enum is always as big as its biggest member? 83 - let base_size = max(size_of::<Vec<u8>>(), size_of::<T>()); 84 - 85 - let extra = match self { 86 - Self::Raw(bytes) => bytes.len(), 87 - Self::Processed(t) => t.get_size(), 88 - }; 89 - 90 - base_size + extra 91 - } 92 - } 93 - 94 - impl<T> MaybeProcessedBlock<T> { 95 - fn maybe(process: fn(Vec<u8>) -> T, data: Vec<u8>) -> Self { 96 - if Node::could_be(&data) { 97 - MaybeProcessedBlock::Raw(data) 98 - } else { 99 - MaybeProcessedBlock::Processed(process(data)) 100 - } 101 - } 102 - } 103 - 104 - /// Read a CAR file, buffering blocks in memory or to disk 105 - pub enum Driver<R: AsyncRead + Unpin, T: Processable> { 106 - /// All blocks fit within the memory limit 107 - /// 108 - /// You probably want to check the commit's signature. You can go ahead and 109 - /// walk the MST right away. 110 - Memory(Commit, MemDriver<T>), 111 - /// Blocks exceed the memory limit 112 - /// 113 - /// You'll need to provide a disk storage to continue. The commit will be 114 - /// returned and can be validated only once all blocks are loaded. 115 - Disk(NeedDisk<R, T>), 116 - } 117 - 118 - /// Builder-style driver setup 119 - #[derive(Debug, Clone)] 120 - pub struct DriverBuilder { 121 - pub mem_limit_mb: usize, 122 - } 123 - 124 - impl Default for DriverBuilder { 125 - fn default() -> Self { 126 - Self { mem_limit_mb: 16 } 127 - } 128 - } 129 - 130 - impl DriverBuilder { 131 - /// Begin configuring the driver with defaults 132 - pub fn new() -> Self { 133 - Default::default() 134 - } 135 - /// Set the in-memory size limit, in MiB 136 - /// 137 - /// Default: 16 MiB 138 - pub fn with_mem_limit_mb(self, new_limit: usize) -> Self { 139 - Self { 140 - mem_limit_mb: new_limit, 141 - } 142 - } 143 - /// Set the block processor 144 - /// 145 - /// Default: noop, raw blocks will be emitted 146 - pub fn with_block_processor<T: Processable>( 147 - self, 148 - p: fn(Vec<u8>) -> T, 149 - ) -> DriverBuilderWithProcessor<T> { 150 - DriverBuilderWithProcessor { 151 - mem_limit_mb: self.mem_limit_mb, 152 - block_processor: p, 153 - } 154 - } 155 - /// Begin processing an atproto MST from a CAR file 156 - pub async fn load_car<R: AsyncRead + Unpin>( 157 - &self, 158 - reader: R, 159 - ) -> Result<Driver<R, Vec<u8>>, DriveError> { 160 - Driver::load_car(reader, crate::process::noop, self.mem_limit_mb).await 161 - } 162 - } 163 - 164 - /// Builder-style driver intermediate step 165 - /// 166 - /// start from `DriverBuilder` 167 - #[derive(Debug, Clone)] 168 - pub struct DriverBuilderWithProcessor<T: Processable> { 169 - pub mem_limit_mb: usize, 170 - pub block_processor: fn(Vec<u8>) -> T, 171 - } 172 - 173 - impl<T: Processable> DriverBuilderWithProcessor<T> { 174 - /// Set the in-memory size limit, in MiB 175 - /// 176 - /// Default: 16 MiB 177 - pub fn with_mem_limit_mb(mut self, new_limit: usize) -> Self { 178 - self.mem_limit_mb = new_limit; 179 - self 180 - } 181 - /// Begin processing an atproto MST from a CAR file 182 - pub async fn load_car<R: AsyncRead + Unpin>( 183 - &self, 184 - reader: R, 185 - ) -> Result<Driver<R, T>, DriveError> { 186 - Driver::load_car(reader, self.block_processor, self.mem_limit_mb).await 187 - } 188 - } 189 - 190 - impl<R: AsyncRead + Unpin, T: Processable> Driver<R, T> { 191 - /// Begin processing an atproto MST from a CAR file 192 - /// 193 - /// Blocks will be loaded, processed, and buffered in memory. If the entire 194 - /// processed size is under the `mem_limit_mb` limit, a `Driver::Memory` 195 - /// will be returned along with a `Commit` ready for validation. 196 - /// 197 - /// If the `mem_limit_mb` limit is reached before loading all blocks, the 198 - /// partial state will be returned as `Driver::Disk(needed)`, which can be 199 - /// resumed by providing a `SqliteStorage` for on-disk block storage. 200 - pub async fn load_car( 201 - reader: R, 202 - process: fn(Vec<u8>) -> T, 203 - mem_limit_mb: usize, 204 - ) -> Result<Driver<R, T>, DriveError> { 205 - let max_size = mem_limit_mb * 2_usize.pow(20); 206 - let mut mem_blocks = HashMap::new(); 207 - 208 - let mut car = CarReader::new(reader).await?; 209 - 210 - let root = *car 211 - .header() 212 - .roots() 213 - .first() 214 - .ok_or(DriveError::MissingRoot)?; 215 - log::debug!("root: {root:?}"); 216 - 217 - let mut commit = None; 218 - 219 - // try to load all the blocks into memory 220 - let mut mem_size = 0; 221 - while let Some((cid, data)) = car.next_block().await? { 222 - // the root commit is a Special Third Kind of block that we need to make 223 - // sure not to optimistically send to the processing function 224 - if cid == root { 225 - let c: Commit = serde_ipld_dagcbor::from_slice(&data)?; 226 - commit = Some(c); 227 - continue; 228 - } 229 - 230 - // remaining possible types: node, record, other. optimistically process 231 - let maybe_processed = MaybeProcessedBlock::maybe(process, data); 232 - 233 - // stash (maybe processed) blocks in memory as long as we have room 234 - mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size(); 235 - mem_blocks.insert(cid, maybe_processed); 236 - if mem_size >= max_size { 237 - return Ok(Driver::Disk(NeedDisk { 238 - car, 239 - root, 240 - process, 241 - max_size, 242 - mem_blocks, 243 - commit, 244 - })); 245 - } 246 - } 247 - 248 - // all blocks loaded and we fit in memory! hopefully we found the commit... 249 - let commit = commit.ok_or(DriveError::MissingCommit)?; 250 - 251 - let walker = Walker::new(commit.data); 252 - 253 - Ok(Driver::Memory( 254 - commit, 255 - MemDriver { 256 - blocks: mem_blocks, 257 - walker, 258 - process, 259 - }, 260 - )) 261 - } 262 - } 263 - 264 - /// The core driver between the block stream and MST walker 265 - /// 266 - /// In the future, PDSs will export CARs in a stream-friendly order that will 267 - /// enable processing them with tiny memory overhead. But that future is not 268 - /// here yet. 269 - /// 270 - /// CARs are almost always in a stream-unfriendly order, so I'm reverting the 271 - /// optimistic stream features: we load all block first, then walk the MST. 272 - /// 273 - /// This makes things much simpler: we only need to worry about spilling to disk 274 - /// in one place, and we always have a reasonable expecatation about how much 275 - /// work the init function will do. We can drop the CAR reader before walking, 276 - /// so the sync/async boundaries become a little easier to work around. 277 - #[derive(Debug)] 278 - pub struct MemDriver<T: Processable> { 279 - blocks: HashMap<Cid, MaybeProcessedBlock<T>>, 280 - walker: Walker, 281 - process: fn(Vec<u8>) -> T, 282 - } 283 - 284 - impl<T: Processable> MemDriver<T> { 285 - /// Step through the record outputs, in rkey order 286 - pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> { 287 - let mut out = Vec::with_capacity(n); 288 - for _ in 0..n { 289 - // walk as far as we can until we run out of blocks or find a record 290 - match self.walker.step(&mut self.blocks, self.process)? { 291 - Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)), 292 - Step::Finish => break, 293 - Step::Found { rkey, data } => { 294 - out.push((rkey, data)); 295 - continue; 296 - } 297 - }; 298 - } 299 - 300 - if out.is_empty() { 301 - Ok(None) 302 - } else { 303 - Ok(Some(out)) 304 - } 305 - } 306 - } 307 - 308 - /// A partially memory-loaded car file that needs disk spillover to continue 309 - pub struct NeedDisk<R: AsyncRead + Unpin, T: Processable> { 310 - car: CarReader<R>, 311 - root: Cid, 312 - process: fn(Vec<u8>) -> T, 313 - max_size: usize, 314 - mem_blocks: HashMap<Cid, MaybeProcessedBlock<T>>, 315 - pub commit: Option<Commit>, 316 - } 317 - 318 - fn encode(v: impl Serialize) -> Result<Vec<u8>, bincode::error::EncodeError> { 319 - bincode::serde::encode_to_vec(v, bincode::config::standard()) 320 - } 321 - 322 - pub(crate) fn decode<T: Processable>(bytes: &[u8]) -> Result<T, DecodeError> { 323 - let (t, n) = bincode::serde::decode_from_slice(bytes, bincode::config::standard())?; 324 - if n != bytes.len() { 325 - return Err(DecodeError::ExtraGarbage); 326 - } 327 - Ok(t) 328 - } 329 - 330 - impl<R: AsyncRead + Unpin, T: Processable + Send + 'static> NeedDisk<R, T> { 331 - pub async fn finish_loading( 332 - mut self, 333 - mut store: DiskStore, 334 - ) -> Result<(Commit, DiskDriver<T>), DriveError> { 335 - // move store in and back out so we can manage lifetimes 336 - // dump mem blocks into the store 337 - store = tokio::task::spawn(async move { 338 - let kvs = self 339 - .mem_blocks 340 - .into_iter() 341 - .map(|(k, v)| Ok(encode(v).map(|v| (k.to_bytes(), v))?)); 342 - 343 - store.put_many(kvs)?; 344 - Ok::<_, DriveError>(store) 345 - }) 346 - .await??; 347 - 348 - let (tx, mut rx) = mpsc::channel::<Vec<(Cid, MaybeProcessedBlock<T>)>>(1); 349 - 350 - let store_worker = tokio::task::spawn_blocking(move || { 351 - while let Some(chunk) = rx.blocking_recv() { 352 - let kvs = chunk 353 - .into_iter() 354 - .map(|(k, v)| Ok(encode(v).map(|v| (k.to_bytes(), v))?)); 355 - store.put_many(kvs)?; 356 - } 357 - Ok::<_, DriveError>(store) 358 - }); // await later 359 - 360 - // dump the rest to disk (in chunks) 361 - log::debug!("dumping the rest of the stream..."); 362 - loop { 363 - let mut mem_size = 0; 364 - let mut chunk = vec![]; 365 - loop { 366 - let Some((cid, data)) = self.car.next_block().await? else { 367 - break; 368 - }; 369 - // we still gotta keep checking for the root since we might not have it 370 - if cid == self.root { 371 - let c: Commit = serde_ipld_dagcbor::from_slice(&data)?; 372 - self.commit = Some(c); 373 - continue; 374 - } 375 - // remaining possible types: node, record, other. optimistically process 376 - // TODO: get the actual in-memory size to compute disk spill 377 - let maybe_processed = MaybeProcessedBlock::maybe(self.process, data); 378 - mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size(); 379 - chunk.push((cid, maybe_processed)); 380 - if mem_size >= self.max_size { 381 - // soooooo if we're setting the db cache to max_size and then letting 382 - // multiple chunks in the queue that are >= max_size, then at any time 383 - // we might be using some multiple of max_size? 384 - break; 385 - } 386 - } 387 - if chunk.is_empty() { 388 - break; 389 - } 390 - tx.send(chunk) 391 - .await 392 - .map_err(|_| DriveError::ChannelSendError)?; 393 - } 394 - drop(tx); 395 - log::debug!("done. waiting for worker to finish..."); 396 - 397 - store = store_worker.await??; 398 - 399 - log::debug!("worker finished."); 400 - 401 - let commit = self.commit.ok_or(DriveError::MissingCommit)?; 402 - 403 - let walker = Walker::new(commit.data); 404 - 405 - Ok(( 406 - commit, 407 - DiskDriver { 408 - process: self.process, 409 - state: Some(BigState { store, walker }), 410 - }, 411 - )) 412 - } 413 - } 414 - 415 - struct BigState { 416 - store: DiskStore, 417 - walker: Walker, 418 - } 419 - 420 - /// MST walker that reads from disk instead of an in-memory hashmap 421 - pub struct DiskDriver<T: Clone> { 422 - process: fn(Vec<u8>) -> T, 423 - state: Option<BigState>, 424 - } 425 - 426 - // for doctests only 427 - #[doc(hidden)] 428 - pub fn _get_fake_disk_driver() -> DiskDriver<Vec<u8>> { 429 - use crate::process::noop; 430 - DiskDriver { 431 - process: noop, 432 - state: None, 433 - } 434 - } 435 - 436 - impl<T: Processable + Send + 'static> DiskDriver<T> { 437 - /// Walk the MST returning up to `n` rkey + record pairs 438 - /// 439 - /// ```no_run 440 - /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop}; 441 - /// # #[tokio::main] 442 - /// # async fn main() -> Result<(), DriveError> { 443 - /// # let mut disk_driver = _get_fake_disk_driver(); 444 - /// while let Some(pairs) = disk_driver.next_chunk(256).await? { 445 - /// for (rkey, record) in pairs { 446 - /// println!("{rkey}: size={}", record.len()); 447 - /// } 448 - /// } 449 - /// # Ok(()) 450 - /// # } 451 - /// ``` 452 - pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> { 453 - let process = self.process; 454 - 455 - // state should only *ever* be None transiently while inside here 456 - let mut state = self.state.take().expect("DiskDriver must have Some(state)"); 457 - 458 - // the big pain here is that we don't want to leave self.state in an 459 - // invalid state (None), so all the error paths have to make sure it 460 - // comes out again. 461 - let (state, res) = tokio::task::spawn_blocking( 462 - move || -> (BigState, Result<BlockChunk<T>, DriveError>) { 463 - let mut out = Vec::with_capacity(n); 464 - 465 - for _ in 0..n { 466 - // walk as far as we can until we run out of blocks or find a record 467 - let step = match state.walker.disk_step(&mut state.store, process) { 468 - Ok(s) => s, 469 - Err(e) => { 470 - return (state, Err(e.into())); 471 - } 472 - }; 473 - match step { 474 - Step::Missing(cid) => { 475 - return (state, Err(DriveError::MissingBlock(cid))); 476 - } 477 - Step::Finish => break, 478 - Step::Found { rkey, data } => out.push((rkey, data)), 479 - }; 480 - } 481 - 482 - (state, Ok::<_, DriveError>(out)) 483 - }, 484 - ) 485 - .await?; // on tokio JoinError, we'll be left with invalid state :( 486 - 487 - // *must* restore state before dealing with the actual result 488 - self.state = Some(state); 489 - 490 - let out = res?; 491 - 492 - if out.is_empty() { 493 - Ok(None) 494 - } else { 495 - Ok(Some(out)) 496 - } 497 - } 498 - 499 - fn read_tx_blocking( 500 - &mut self, 501 - n: usize, 502 - tx: mpsc::Sender<Result<BlockChunk<T>, DriveError>>, 503 - ) -> Result<(), mpsc::error::SendError<Result<BlockChunk<T>, DriveError>>> { 504 - let BigState { store, walker } = self.state.as_mut().expect("valid state"); 505 - 506 - loop { 507 - let mut out: BlockChunk<T> = Vec::with_capacity(n); 508 - 509 - for _ in 0..n { 510 - // walk as far as we can until we run out of blocks or find a record 511 - 512 - let step = match walker.disk_step(store, self.process) { 513 - Ok(s) => s, 514 - Err(e) => return tx.blocking_send(Err(e.into())), 515 - }; 516 - 517 - match step { 518 - Step::Missing(cid) => { 519 - return tx.blocking_send(Err(DriveError::MissingBlock(cid))); 520 - } 521 - Step::Finish => return Ok(()), 522 - Step::Found { rkey, data } => { 523 - out.push((rkey, data)); 524 - continue; 525 - } 526 - }; 527 - } 528 - 529 - if out.is_empty() { 530 - break; 531 - } 532 - tx.blocking_send(Ok(out))?; 533 - } 534 - 535 - Ok(()) 536 - } 537 - 538 - /// Spawn the disk reading task into a tokio blocking thread 539 - /// 540 - /// The idea is to avoid so much sending back and forth to the blocking 541 - /// thread, letting a blocking task do all the disk reading work and sending 542 - /// records and rkeys back through an `mpsc` channel instead. 543 - /// 544 - /// This might also allow the disk work to continue while processing the 545 - /// records. It's still not yet clear if this method actually has much 546 - /// benefit over just using `.next_chunk(n)`. 547 - /// 548 - /// ```no_run 549 - /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop}; 550 - /// # #[tokio::main] 551 - /// # async fn main() -> Result<(), DriveError> { 552 - /// # let mut disk_driver = _get_fake_disk_driver(); 553 - /// let (mut rx, join) = disk_driver.to_channel(512); 554 - /// while let Some(recvd) = rx.recv().await { 555 - /// let pairs = recvd?; 556 - /// for (rkey, record) in pairs { 557 - /// println!("{rkey}: size={}", record.len()); 558 - /// } 559 - /// 560 - /// } 561 - /// # Ok(()) 562 - /// # } 563 - /// ``` 564 - pub fn to_channel( 565 - mut self, 566 - n: usize, 567 - ) -> ( 568 - mpsc::Receiver<Result<BlockChunk<T>, DriveError>>, 569 - tokio::task::JoinHandle<Self>, 570 - ) { 571 - let (tx, rx) = mpsc::channel::<Result<BlockChunk<T>, DriveError>>(1); 572 - 573 - // sketch: this worker is going to be allowed to execute without a join handle 574 - let chan_task = tokio::task::spawn_blocking(move || { 575 - if let Err(mpsc::error::SendError(_)) = self.read_tx_blocking(n, tx) { 576 - log::debug!("big car reader exited early due to dropped receiver channel"); 577 - } 578 - self 579 - }); 580 - 581 - (rx, chan_task) 582 - } 583 - }
+3 -12
src/lib.rs
··· 53 53 total_size += size; 54 54 } 55 55 } 56 + 57 + // clean up the disk store (drop tables etc) 58 + driver.reset_store().await?; 56 59 } 57 60 }; 58 61 println!("sum of size of all records: {total_size}"); ··· 70 73 Find more [examples in the repo](https://tangled.org/@microcosm.blue/repo-stream/tree/main/examples). 71 74 72 75 */ 73 - 74 - pub mod mst; 75 - mod walk; 76 - 77 - pub mod disk; 78 - pub mod drive; 79 - pub mod process; 80 - 81 - pub use disk::{DiskBuilder, DiskError, DiskStore}; 82 - pub use drive::{DriveError, Driver, DriverBuilder, NeedDisk}; 83 - pub use mst::Commit; 84 - pub use process::Processable;
-110
src/mst.rs
··· 1 - //! Low-level types for parsing raw atproto MST CARs 2 - //! 3 - //! The primary aim is to work through the **tree** structure. Non-node blocks 4 - //! are left as raw bytes, for upper levels to parse into DAG-CBOR or whatever. 5 - 6 - use ipld_core::cid::Cid; 7 - use serde::Deserialize; 8 - 9 - /// The top-level data object in a repository's tree is a signed commit. 10 - #[derive(Debug, Deserialize)] 11 - // #[serde(deny_unknown_fields)] 12 - pub struct Commit { 13 - /// the account DID associated with the repo, in strictly normalized form 14 - /// (eg, lowercase as appropriate) 15 - pub did: String, 16 - /// fixed value of 3 for this repo format version 17 - pub version: u64, 18 - /// pointer to the top of the repo contents tree structure (MST) 19 - pub data: Cid, 20 - /// revision of the repo, used as a logical clock. 21 - /// 22 - /// TID format. Must increase monotonically. Recommend using current 23 - /// timestamp as TID; rev values in the "future" (beyond a fudge factor) 24 - /// should be ignored and not processed 25 - pub rev: String, 26 - /// pointer (by hash) to a previous commit object for this repository. 27 - /// 28 - /// Could be used to create a chain of history, but largely unused (included 29 - /// for v2 backwards compatibility). In version 3 repos, this field must 30 - /// exist in the CBOR object, but is virtually always null. NOTE: previously 31 - /// specified as nullable and optional, but this caused interoperability 32 - /// issues. 33 - pub prev: Option<Cid>, 34 - /// cryptographic signature of this commit, as raw bytes 35 - #[serde(with = "serde_bytes")] 36 - pub sig: Vec<u8>, 37 - } 38 - 39 - /// MST node data schema 40 - #[derive(Debug, Deserialize, PartialEq)] 41 - #[serde(deny_unknown_fields)] 42 - pub(crate) struct Node { 43 - /// link to sub-tree Node on a lower level and with all keys sorting before 44 - /// keys at this node 45 - #[serde(rename = "l")] 46 - pub left: Option<Cid>, 47 - /// ordered list of TreeEntry objects 48 - /// 49 - /// atproto MSTs have a fanout of 4, so there can be max 4 entries. 50 - #[serde(rename = "e")] 51 - pub entries: Vec<Entry>, // maybe we can do [Option<Entry>; 4]? 52 - } 53 - 54 - impl Node { 55 - /// test if a block could possibly be a node 56 - /// 57 - /// we can't eagerly decode records except where we're *sure* they cannot be 58 - /// an mst node (and even then we can only attempt) because you can't know 59 - /// with certainty what a block is supposed to be without actually walking 60 - /// the tree. 61 - /// 62 - /// so if a block *could be* a node, any record converter must postpone 63 - /// processing. if it turns out it happens to be a very node-looking record, 64 - /// well, sorry, it just has to only be processed later when that's known. 65 - pub(crate) fn could_be(bytes: impl AsRef<[u8]>) -> bool { 66 - const NODE_FINGERPRINT: [u8; 3] = [ 67 - 0xA2, // map length 2 (for "l" and "e" keys) 68 - 0x61, // text length 1 69 - b'e', // "e" before "l" because map keys have to be lex-sorted 70 - // 0x8?: "e" has array (0x100 upper 3 bits) of some length 71 - ]; 72 - let bytes = bytes.as_ref(); 73 - bytes.starts_with(&NODE_FINGERPRINT) 74 - && bytes 75 - .get(3) 76 - .map(|b| b & 0b1110_0000 == 0x80) 77 - .unwrap_or(false) 78 - } 79 - 80 - /// Check if a node has any entries 81 - /// 82 - /// An empty repository with no records is represented as a single MST node 83 - /// with an empty array of entries. This is the only situation in which a 84 - /// tree may contain an empty leaf node which does not either contain keys 85 - /// ("entries") or point to a sub-tree containing entries. 86 - pub(crate) fn is_empty(&self) -> bool { 87 - self.left.is_none() && self.entries.is_empty() 88 - } 89 - } 90 - 91 - /// TreeEntry object 92 - #[derive(Debug, Deserialize, PartialEq)] 93 - #[serde(deny_unknown_fields)] 94 - pub(crate) struct Entry { 95 - /// count of bytes shared with previous TreeEntry in this Node (if any) 96 - #[serde(rename = "p")] 97 - pub prefix_len: usize, 98 - /// remainder of key for this TreeEntry, after "prefixlen" have been removed 99 - #[serde(rename = "k", with = "serde_bytes")] 100 - pub keysuffix: Vec<u8>, // can we String this here? 101 - /// link to the record data (CBOR) for this entry 102 - #[serde(rename = "v")] 103 - pub value: Cid, 104 - /// link to a sub-tree Node at a lower level 105 - /// 106 - /// the lower level must have keys sorting after this TreeEntry's key (to 107 - /// the "right"), but before the next TreeEntry's key in this Node (if any) 108 - #[serde(rename = "t")] 109 - pub tree: Option<Cid>, 110 - }
-108
src/process.rs
··· 1 - /*! 2 - Record processor function output trait 3 - 4 - The return type must satisfy the `Processable` trait, which requires: 5 - 6 - - `Clone` because two rkeys can refer to the same record by CID, which may 7 - only appear once in the CAR file. 8 - - `Serialize + DeserializeOwned` so it can be spilled to disk. 9 - 10 - One required function must be implemented, `get_size()`: this should return the 11 - approximate total off-stack size of the type. (the on-stack size will be added 12 - automatically via `std::mem::get_size`). 13 - 14 - Note that it is **not guaranteed** that the `process` function will run on a 15 - block before storing it in memory or on disk: it's not possible to know if a 16 - block is a record without actually walking the MST, so the best we can do is 17 - apply `process` to any block that we know *cannot* be an MST node, and otherwise 18 - store the raw block bytes. 19 - 20 - Here's a silly processing function that just collects 'eyy's found in the raw 21 - record bytes 22 - 23 - ``` 24 - # use repo_stream::Processable; 25 - # use serde::{Serialize, Deserialize}; 26 - #[derive(Debug, Clone, Serialize, Deserialize)] 27 - struct Eyy(usize, String); 28 - 29 - impl Processable for Eyy { 30 - fn get_size(&self) -> usize { 31 - // don't need to compute the usize, it's on the stack 32 - self.1.capacity() // in-mem size from the string's capacity, in bytes 33 - } 34 - } 35 - 36 - fn process(raw: Vec<u8>) -> Vec<Eyy> { 37 - let mut out = Vec::new(); 38 - let to_find = "eyy".as_bytes(); 39 - for i in 0..(raw.len() - 3) { 40 - if &raw[i..(i+3)] == to_find { 41 - out.push(Eyy(i, "eyy".to_string())); 42 - } 43 - } 44 - out 45 - } 46 - ``` 47 - 48 - The memory sizing stuff is a little sketch but probably at least approximately 49 - works. 50 - */ 51 - 52 - use serde::{Serialize, de::DeserializeOwned}; 53 - 54 - /// Output trait for record processing 55 - pub trait Processable: Clone + Serialize + DeserializeOwned { 56 - /// Any additional in-memory size taken by the processed type 57 - /// 58 - /// Do not include stack size (`std::mem::size_of`) 59 - fn get_size(&self) -> usize; 60 - } 61 - 62 - /// Processor that just returns the raw blocks 63 - #[inline] 64 - pub fn noop(block: Vec<u8>) -> Vec<u8> { 65 - block 66 - } 67 - 68 - impl Processable for u8 { 69 - fn get_size(&self) -> usize { 70 - 0 71 - } 72 - } 73 - 74 - impl Processable for usize { 75 - fn get_size(&self) -> usize { 76 - 0 // no additional space taken, just its stack size (newtype is free) 77 - } 78 - } 79 - 80 - impl Processable for String { 81 - fn get_size(&self) -> usize { 82 - self.capacity() 83 - } 84 - } 85 - 86 - impl<Item: Sized + Processable> Processable for Vec<Item> { 87 - fn get_size(&self) -> usize { 88 - let slot_size = std::mem::size_of::<Item>(); 89 - let direct_size = slot_size * self.capacity(); 90 - let items_referenced_size: usize = self.iter().map(|item| item.get_size()).sum(); 91 - direct_size + items_referenced_size 92 - } 93 - } 94 - 95 - impl<Item: Processable> Processable for Option<Item> { 96 - fn get_size(&self) -> usize { 97 - self.as_ref().map(|item| item.get_size()).unwrap_or(0) 98 - } 99 - } 100 - 101 - impl<Item: Processable, Error: Processable> Processable for Result<Item, Error> { 102 - fn get_size(&self) -> usize { 103 - match self { 104 - Ok(item) => item.get_size(), 105 - Err(err) => err.get_size(), 106 - } 107 - } 108 - }
-406
src/walk.rs
··· 1 - //! Depth-first MST traversal 2 - 3 - use crate::disk::DiskStore; 4 - use crate::drive::{DecodeError, MaybeProcessedBlock}; 5 - use crate::mst::Node; 6 - use crate::process::Processable; 7 - use ipld_core::cid::Cid; 8 - use sha2::{Digest, Sha256}; 9 - use std::collections::HashMap; 10 - use std::convert::Infallible; 11 - 12 - /// Errors that can happen while walking 13 - #[derive(Debug, thiserror::Error)] 14 - pub enum WalkError { 15 - #[error("Failed to fingerprint commit block")] 16 - BadCommitFingerprint, 17 - #[error("Failed to decode commit block: {0}")] 18 - BadCommit(#[from] serde_ipld_dagcbor::DecodeError<Infallible>), 19 - #[error("Action node error: {0}")] 20 - MstError(#[from] MstError), 21 - #[error("storage error: {0}")] 22 - StorageError(#[from] fjall::Error), 23 - #[error("Decode error: {0}")] 24 - DecodeError(#[from] DecodeError), 25 - } 26 - 27 - /// Errors from invalid Rkeys 28 - #[derive(Debug, PartialEq, thiserror::Error)] 29 - pub enum MstError { 30 - #[error("Failed to compute an rkey due to invalid prefix_len")] 31 - EntryPrefixOutOfbounds, 32 - #[error("RKey was not utf-8")] 33 - EntryRkeyNotUtf8(#[from] std::string::FromUtf8Error), 34 - #[error("Nodes cannot be empty (except for an entirely empty MST)")] 35 - EmptyNode, 36 - #[error("Found an entry with rkey at the wrong depth")] 37 - WrongDepth, 38 - #[error("Lost track of our depth (possible bug?)")] 39 - LostDepth, 40 - #[error("MST depth underflow: depth-0 node with child trees")] 41 - DepthUnderflow, 42 - #[error("Encountered an rkey out of order while walking the MST")] 43 - RkeyOutOfOrder, 44 - } 45 - 46 - /// Walker outputs 47 - #[derive(Debug)] 48 - pub enum Step<T> { 49 - /// We needed this CID but it's not in the block store 50 - Missing(Cid), 51 - /// Reached the end of the MST! yay! 52 - Finish, 53 - /// A record was found! 54 - Found { rkey: String, data: T }, 55 - } 56 - 57 - #[derive(Debug, Clone, PartialEq)] 58 - enum Need { 59 - Node { depth: Depth, cid: Cid }, 60 - Record { rkey: String, cid: Cid }, 61 - } 62 - 63 - #[derive(Debug, Clone, Copy, PartialEq)] 64 - enum Depth { 65 - Root, 66 - Depth(u32), 67 - } 68 - 69 - impl Depth { 70 - fn from_key(key: &[u8]) -> Self { 71 - let mut zeros = 0; 72 - for byte in Sha256::digest(key) { 73 - let leading = byte.leading_zeros(); 74 - zeros += leading; 75 - if leading < 8 { 76 - break; 77 - } 78 - } 79 - Self::Depth(zeros / 2) // truncating divide (rounds down) 80 - } 81 - fn next_expected(&self) -> Result<Option<u32>, MstError> { 82 - match self { 83 - Self::Root => Ok(None), 84 - Self::Depth(d) => d.checked_sub(1).ok_or(MstError::DepthUnderflow).map(Some), 85 - } 86 - } 87 - } 88 - 89 - fn push_from_node(stack: &mut Vec<Need>, node: &Node, parent_depth: Depth) -> Result<(), MstError> { 90 - // empty nodes are not allowed in the MST except in an empty MST 91 - if node.is_empty() { 92 - if parent_depth == Depth::Root { 93 - return Ok(()); // empty mst, nothing to push 94 - } else { 95 - return Err(MstError::EmptyNode); 96 - } 97 - } 98 - 99 - let mut entries = Vec::with_capacity(node.entries.len()); 100 - let mut prefix = vec![]; 101 - let mut this_depth = parent_depth.next_expected()?; 102 - 103 - for entry in &node.entries { 104 - let mut rkey = vec![]; 105 - let pre_checked = prefix 106 - .get(..entry.prefix_len) 107 - .ok_or(MstError::EntryPrefixOutOfbounds)?; 108 - rkey.extend_from_slice(pre_checked); 109 - rkey.extend_from_slice(&entry.keysuffix); 110 - 111 - let Depth::Depth(key_depth) = Depth::from_key(&rkey) else { 112 - return Err(MstError::WrongDepth); 113 - }; 114 - 115 - // this_depth is `none` if we are the deepest child (directly below root) 116 - // in that case we accept whatever highest depth is claimed 117 - let expected_depth = match this_depth { 118 - Some(d) => d, 119 - None => { 120 - this_depth = Some(key_depth); 121 - key_depth 122 - } 123 - }; 124 - 125 - // all keys we find should be this depth 126 - if key_depth != expected_depth { 127 - return Err(MstError::DepthUnderflow); 128 - } 129 - 130 - prefix = rkey.clone(); 131 - 132 - entries.push(Need::Record { 133 - rkey: String::from_utf8(rkey)?, 134 - cid: entry.value, 135 - }); 136 - if let Some(ref tree) = entry.tree { 137 - entries.push(Need::Node { 138 - depth: Depth::Depth(key_depth), 139 - cid: *tree, 140 - }); 141 - } 142 - } 143 - 144 - entries.reverse(); 145 - stack.append(&mut entries); 146 - 147 - let d = this_depth.ok_or(MstError::LostDepth)?; 148 - 149 - if let Some(tree) = node.left { 150 - stack.push(Need::Node { 151 - depth: Depth::Depth(d), 152 - cid: tree, 153 - }); 154 - } 155 - Ok(()) 156 - } 157 - 158 - /// Traverser of an atproto MST 159 - /// 160 - /// Walks the tree from left-to-right in depth-first order 161 - #[derive(Debug)] 162 - pub struct Walker { 163 - stack: Vec<Need>, 164 - prev: String, 165 - } 166 - 167 - impl Walker { 168 - pub fn new(tree_root_cid: Cid) -> Self { 169 - Self { 170 - stack: vec![Need::Node { 171 - depth: Depth::Root, 172 - cid: tree_root_cid, 173 - }], 174 - prev: "".to_string(), 175 - } 176 - } 177 - 178 - /// Advance through nodes until we find a record or can't go further 179 - pub fn step<T: Processable>( 180 - &mut self, 181 - blocks: &mut HashMap<Cid, MaybeProcessedBlock<T>>, 182 - process: impl Fn(Vec<u8>) -> T, 183 - ) -> Result<Step<T>, WalkError> { 184 - loop { 185 - let Some(need) = self.stack.last_mut() else { 186 - log::trace!("tried to walk but we're actually done."); 187 - return Ok(Step::Finish); 188 - }; 189 - 190 - match need { 191 - &mut Need::Node { depth, cid } => { 192 - log::trace!("need node {cid:?}"); 193 - let Some(block) = blocks.remove(&cid) else { 194 - log::trace!("node not found, resting"); 195 - return Ok(Step::Missing(cid)); 196 - }; 197 - 198 - let MaybeProcessedBlock::Raw(data) = block else { 199 - return Err(WalkError::BadCommitFingerprint); 200 - }; 201 - let node = serde_ipld_dagcbor::from_slice::<Node>(&data) 202 - .map_err(WalkError::BadCommit)?; 203 - 204 - // found node, make sure we remember 205 - self.stack.pop(); 206 - 207 - // queue up work on the found node next 208 - push_from_node(&mut self.stack, &node, depth)?; 209 - } 210 - Need::Record { rkey, cid } => { 211 - log::trace!("need record {cid:?}"); 212 - // note that we cannot *remove* a record block, sadly, since 213 - // there can be multiple rkeys pointing to the same cid. 214 - let Some(data) = blocks.get_mut(cid) else { 215 - return Ok(Step::Missing(*cid)); 216 - }; 217 - let rkey = rkey.clone(); 218 - let data = match data { 219 - MaybeProcessedBlock::Raw(data) => process(data.to_vec()), 220 - MaybeProcessedBlock::Processed(t) => t.clone(), 221 - }; 222 - 223 - // found node, make sure we remember 224 - self.stack.pop(); 225 - 226 - // rkeys *must* be in order or else the tree is invalid (or 227 - // we have a bug) 228 - if rkey <= self.prev { 229 - return Err(MstError::RkeyOutOfOrder)?; 230 - } 231 - self.prev = rkey.clone(); 232 - 233 - return Ok(Step::Found { rkey, data }); 234 - } 235 - } 236 - } 237 - } 238 - 239 - /// blocking!!!!!! 240 - pub fn disk_step<T: Processable>( 241 - &mut self, 242 - reader: &mut DiskStore, 243 - process: impl Fn(Vec<u8>) -> T, 244 - ) -> Result<Step<T>, WalkError> { 245 - loop { 246 - let Some(need) = self.stack.last_mut() else { 247 - log::trace!("tried to walk but we're actually done."); 248 - return Ok(Step::Finish); 249 - }; 250 - 251 - match need { 252 - &mut Need::Node { depth, cid } => { 253 - let cid_bytes = cid.to_bytes(); 254 - log::trace!("need node {cid:?}"); 255 - let Some(block_bytes) = reader.get(&cid_bytes)? else { 256 - log::trace!("node not found, resting"); 257 - return Ok(Step::Missing(cid)); 258 - }; 259 - 260 - let block: MaybeProcessedBlock<T> = crate::drive::decode(&block_bytes)?; 261 - 262 - let MaybeProcessedBlock::Raw(data) = block else { 263 - return Err(WalkError::BadCommitFingerprint); 264 - }; 265 - let node = serde_ipld_dagcbor::from_slice::<Node>(&data) 266 - .map_err(WalkError::BadCommit)?; 267 - 268 - // found node, make sure we remember 269 - self.stack.pop(); 270 - 271 - // queue up work on the found node next 272 - push_from_node(&mut self.stack, &node, depth).map_err(WalkError::MstError)?; 273 - } 274 - Need::Record { rkey, cid } => { 275 - log::trace!("need record {cid:?}"); 276 - let cid_bytes = cid.to_bytes(); 277 - let Some(data_bytes) = reader.get(&cid_bytes)? else { 278 - log::trace!("record block not found, resting"); 279 - return Ok(Step::Missing(*cid)); 280 - }; 281 - let data: MaybeProcessedBlock<T> = crate::drive::decode(&data_bytes)?; 282 - let rkey = rkey.clone(); 283 - let data = match data { 284 - MaybeProcessedBlock::Raw(data) => process(data), 285 - MaybeProcessedBlock::Processed(t) => t.clone(), 286 - }; 287 - 288 - // found node, make sure we remember 289 - self.stack.pop(); 290 - 291 - log::trace!("emitting a block as a step. depth={}", self.stack.len()); 292 - 293 - // rkeys *must* be in order or else the tree is invalid (or 294 - // we have a bug) 295 - if rkey <= self.prev { 296 - return Err(MstError::RkeyOutOfOrder)?; 297 - } 298 - self.prev = rkey.clone(); 299 - 300 - return Ok(Step::Found { rkey, data }); 301 - } 302 - } 303 - } 304 - } 305 - } 306 - 307 - #[cfg(test)] 308 - mod test { 309 - use super::*; 310 - 311 - fn cid1() -> Cid { 312 - "bafyreihixenvk3ahqbytas4hk4a26w43bh6eo3w6usjqtxkpzsvi655a3m" 313 - .parse() 314 - .unwrap() 315 - } 316 - 317 - #[test] 318 - fn test_depth_spec_0() { 319 - let d = Depth::from_key(b"2653ae71"); 320 - assert_eq!(d, Depth::Depth(0)) 321 - } 322 - 323 - #[test] 324 - fn test_depth_spec_1() { 325 - let d = Depth::from_key(b"blue"); 326 - assert_eq!(d, Depth::Depth(1)) 327 - } 328 - 329 - #[test] 330 - fn test_depth_spec_4() { 331 - let d = Depth::from_key(b"app.bsky.feed.post/454397e440ec"); 332 - assert_eq!(d, Depth::Depth(4)) 333 - } 334 - 335 - #[test] 336 - fn test_depth_spec_8() { 337 - let d = Depth::from_key(b"app.bsky.feed.post/9adeb165882c"); 338 - assert_eq!(d, Depth::Depth(8)) 339 - } 340 - 341 - #[test] 342 - fn test_depth_ietf_draft_0() { 343 - let d = Depth::from_key(b"key1"); 344 - assert_eq!(d, Depth::Depth(0)) 345 - } 346 - 347 - #[test] 348 - fn test_depth_ietf_draft_1() { 349 - let d = Depth::from_key(b"key7"); 350 - assert_eq!(d, Depth::Depth(1)) 351 - } 352 - 353 - #[test] 354 - fn test_depth_ietf_draft_4() { 355 - let d = Depth::from_key(b"key515"); 356 - assert_eq!(d, Depth::Depth(4)) 357 - } 358 - 359 - #[test] 360 - fn test_depth_interop() { 361 - // examples from https://github.com/bluesky-social/atproto-interop-tests/blob/main/mst/key_heights.json 362 - for (k, expected) in [ 363 - ("", 0), 364 - ("asdf", 0), 365 - ("blue", 1), 366 - ("2653ae71", 0), 367 - ("88bfafc7", 2), 368 - ("2a92d355", 4), 369 - ("884976f5", 6), 370 - ("app.bsky.feed.post/454397e440ec", 4), 371 - ("app.bsky.feed.post/9adeb165882c", 8), 372 - ] { 373 - let d = Depth::from_key(k.as_bytes()); 374 - assert_eq!(d, Depth::Depth(expected), "key: {}", k); 375 - } 376 - } 377 - 378 - #[test] 379 - fn test_push_empty_fails() { 380 - let empty_node = Node { 381 - left: None, 382 - entries: vec![], 383 - }; 384 - let mut stack = vec![]; 385 - let err = push_from_node(&mut stack, &empty_node, Depth::Depth(4)); 386 - assert_eq!(err, Err(MstError::EmptyNode)); 387 - } 388 - 389 - #[test] 390 - fn test_push_one_node() { 391 - let node = Node { 392 - left: Some(cid1()), 393 - entries: vec![], 394 - }; 395 - let mut stack = vec![]; 396 - push_from_node(&mut stack, &node, Depth::Depth(4)).unwrap(); 397 - assert_eq!( 398 - stack.last(), 399 - Some(Need::Node { 400 - depth: Depth::Depth(3), 401 - cid: cid1() 402 - }) 403 - .as_ref() 404 - ); 405 - } 406 - }