comparing 4e8617e1c52b3c5fcfd1f9359cb63ae559e67f3c and fjall3 on microcosm.blue/repo-stream

+242 -71

Cargo.lock

··· 152 152 checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" 153 153 154 154 [[package]] 155 + name = "block-buffer" 156 + version = "0.10.4" 157 + source = "registry+https://github.com/rust-lang/crates.io-index" 158 + checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" 159 + dependencies = [ 160 + "generic-array", 161 + ] 162 + 163 + [[package]] 155 164 name = "bumpalo" 156 165 version = "3.19.0" 157 166 source = "registry+https://github.com/rust-lang/crates.io-index" 158 167 checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" 159 168 160 169 [[package]] 170 + name = "byteorder-lite" 171 + version = "0.1.0" 172 + source = "registry+https://github.com/rust-lang/crates.io-index" 173 + checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" 174 + 175 + [[package]] 161 176 name = "bytes" 162 177 version = "1.10.1" 163 178 source = "registry+https://github.com/rust-lang/crates.io-index" 164 179 checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" 165 180 166 181 [[package]] 182 + name = "byteview" 183 + version = "0.10.0" 184 + source = "registry+https://github.com/rust-lang/crates.io-index" 185 + checksum = "dda4398f387cc6395a3e93b3867cd9abda914c97a0b344d1eefb2e5c51785fca" 186 + 187 + [[package]] 167 188 name = "cast" 168 189 version = "0.3.0" 169 190 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 272 293 checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" 273 294 274 295 [[package]] 296 + name = "compare" 297 + version = "0.0.6" 298 + source = "registry+https://github.com/rust-lang/crates.io-index" 299 + checksum = "ea0095f6103c2a8b44acd6fd15960c801dafebf02e21940360833e0673f48ba7" 300 + 301 + [[package]] 275 302 name = "const-str" 276 303 version = "0.4.3" 277 304 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 284 311 checksum = 
"b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" 285 312 dependencies = [ 286 313 "memchr", 314 + ] 315 + 316 + [[package]] 317 + name = "cpufeatures" 318 + version = "0.2.17" 319 + source = "registry+https://github.com/rust-lang/crates.io-index" 320 + checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" 321 + dependencies = [ 322 + "libc", 287 323 ] 288 324 289 325 [[package]] ··· 340 376 ] 341 377 342 378 [[package]] 379 + name = "crossbeam-skiplist" 380 + version = "0.1.3" 381 + source = "registry+https://github.com/rust-lang/crates.io-index" 382 + checksum = "df29de440c58ca2cc6e587ec3d22347551a32435fbde9d2bff64e78a9ffa151b" 383 + dependencies = [ 384 + "crossbeam-epoch", 385 + "crossbeam-utils", 386 + ] 387 + 388 + [[package]] 343 389 name = "crossbeam-utils" 344 390 version = "0.8.21" 345 391 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 352 398 checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" 353 399 354 400 [[package]] 401 + name = "crypto-common" 402 + version = "0.1.6" 403 + source = "registry+https://github.com/rust-lang/crates.io-index" 404 + checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" 405 + dependencies = [ 406 + "generic-array", 407 + "typenum", 408 + ] 409 + 410 + [[package]] 411 + name = "dashmap" 412 + version = "6.1.0" 413 + source = "registry+https://github.com/rust-lang/crates.io-index" 414 + checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" 415 + dependencies = [ 416 + "cfg-if", 417 + "crossbeam-utils", 418 + "hashbrown 0.14.5", 419 + "lock_api", 420 + "once_cell", 421 + "parking_lot_core", 422 + ] 423 + 424 + [[package]] 355 425 name = "data-encoding" 356 426 version = "2.9.0" 357 427 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 378 448 ] 379 449 380 450 [[package]] 451 + name = "digest" 452 + version = "0.10.7" 453 + source = 
"registry+https://github.com/rust-lang/crates.io-index" 454 + checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" 455 + dependencies = [ 456 + "block-buffer", 457 + "crypto-common", 458 + ] 459 + 460 + [[package]] 381 461 name = "either" 382 462 version = "1.15.0" 383 463 source = "registry+https://github.com/rust-lang/crates.io-index" 384 464 checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" 385 465 386 466 [[package]] 467 + name = "enum_dispatch" 468 + version = "0.3.13" 469 + source = "registry+https://github.com/rust-lang/crates.io-index" 470 + checksum = "aa18ce2bc66555b3218614519ac839ddb759a7d6720732f979ef8d13be147ecd" 471 + dependencies = [ 472 + "once_cell", 473 + "proc-macro2", 474 + "quote", 475 + "syn 2.0.106", 476 + ] 477 + 478 + [[package]] 387 479 name = "env_filter" 388 480 version = "0.1.3" 389 481 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 407 499 ] 408 500 409 501 [[package]] 502 + name = "equivalent" 503 + version = "1.0.2" 504 + source = "registry+https://github.com/rust-lang/crates.io-index" 505 + checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" 506 + 507 + [[package]] 410 508 name = "errno" 411 509 version = "0.3.14" 412 510 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 417 515 ] 418 516 419 517 [[package]] 420 - name = "fallible-iterator" 421 - version = "0.3.0" 422 - source = "registry+https://github.com/rust-lang/crates.io-index" 423 - checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" 424 - 425 - [[package]] 426 - name = "fallible-streaming-iterator" 427 - version = "0.1.9" 428 - source = "registry+https://github.com/rust-lang/crates.io-index" 429 - checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" 430 - 431 - [[package]] 432 518 name = "fastrand" 433 519 version = "2.3.0" 434 520 source = "registry+https://github.com/rust-lang/crates.io-index" 435 521 
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" 436 522 437 523 [[package]] 438 - name = "foldhash" 439 - version = "0.1.5" 524 + name = "fjall" 525 + version = "3.0.1" 440 526 source = "registry+https://github.com/rust-lang/crates.io-index" 441 - checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" 527 + checksum = "4f69637c02d38ad1b0f003101d0195a60368130aa17d9ef78b1557d265a22093" 528 + dependencies = [ 529 + "byteorder-lite", 530 + "byteview", 531 + "dashmap", 532 + "flume", 533 + "log", 534 + "lsm-tree", 535 + "tempfile", 536 + "xxhash-rust", 537 + ] 538 + 539 + [[package]] 540 + name = "flume" 541 + version = "0.12.0" 542 + source = "registry+https://github.com/rust-lang/crates.io-index" 543 + checksum = "5e139bc46ca777eb5efaf62df0ab8cc5fd400866427e56c68b22e414e53bd3be" 544 + dependencies = [ 545 + "spin", 546 + ] 442 547 443 548 [[package]] 444 549 name = "futures" ··· 530 635 ] 531 636 532 637 [[package]] 638 + name = "generic-array" 639 + version = "0.14.9" 640 + source = "registry+https://github.com/rust-lang/crates.io-index" 641 + checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" 642 + dependencies = [ 643 + "typenum", 644 + "version_check", 645 + ] 646 + 647 + [[package]] 533 648 name = "getrandom" 534 649 version = "0.3.3" 535 650 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 560 675 561 676 [[package]] 562 677 name = "hashbrown" 563 - version = "0.15.5" 678 + version = "0.14.5" 564 679 source = "registry+https://github.com/rust-lang/crates.io-index" 565 - checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" 566 - dependencies = [ 567 - "foldhash", 568 - ] 680 + checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" 569 681 570 682 [[package]] 571 - name = "hashlink" 572 - version = "0.10.0" 683 + name = "hashbrown" 684 + version = "0.16.1" 573 685 source = 
"registry+https://github.com/rust-lang/crates.io-index" 574 - checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" 575 - dependencies = [ 576 - "hashbrown", 577 - ] 686 + checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" 578 687 579 688 [[package]] 580 689 name = "heck" 581 690 version = "0.5.0" 582 691 source = "registry+https://github.com/rust-lang/crates.io-index" 583 692 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 693 + 694 + [[package]] 695 + name = "interval-heap" 696 + version = "0.0.5" 697 + source = "registry+https://github.com/rust-lang/crates.io-index" 698 + checksum = "11274e5e8e89b8607cfedc2910b6626e998779b48a019151c7604d0adcb86ac6" 699 + dependencies = [ 700 + "compare", 701 + ] 584 702 585 703 [[package]] 586 704 name = "io-uring" ··· 682 800 checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" 683 801 684 802 [[package]] 685 - name = "libsqlite3-sys" 686 - version = "0.35.0" 687 - source = "registry+https://github.com/rust-lang/crates.io-index" 688 - checksum = "133c182a6a2c87864fe97778797e46c7e999672690dc9fa3ee8e241aa4a9c13f" 689 - dependencies = [ 690 - "pkg-config", 691 - "vcpkg", 692 - ] 693 - 694 - [[package]] 695 803 name = "linux-raw-sys" 696 804 version = "0.11.0" 697 805 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 713 821 checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" 714 822 715 823 [[package]] 824 + name = "lsm-tree" 825 + version = "3.0.1" 826 + source = "registry+https://github.com/rust-lang/crates.io-index" 827 + checksum = "b875f1dfe14f557f805b167fb9b0fc54c5560c7a4bd6ae02535b2846f276a8cb" 828 + dependencies = [ 829 + "byteorder-lite", 830 + "byteview", 831 + "crossbeam-skiplist", 832 + "enum_dispatch", 833 + "interval-heap", 834 + "log", 835 + "quick_cache", 836 + "rustc-hash", 837 + "self_cell", 838 + "sfa", 839 + "tempfile", 840 + "varint-rs", 841 + "xxhash-rust", 842 
+ ] 843 + 844 + [[package]] 716 845 name = "match-lookup" 717 846 version = "0.1.1" 718 847 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 844 973 checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" 845 974 846 975 [[package]] 847 - name = "pkg-config" 848 - version = "0.3.32" 849 - source = "registry+https://github.com/rust-lang/crates.io-index" 850 - checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" 851 - 852 - [[package]] 853 976 name = "plotters" 854 977 version = "0.3.7" 855 978 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 902 1025 ] 903 1026 904 1027 [[package]] 1028 + name = "quick_cache" 1029 + version = "0.6.18" 1030 + source = "registry+https://github.com/rust-lang/crates.io-index" 1031 + checksum = "7ada44a88ef953a3294f6eb55d2007ba44646015e18613d2f213016379203ef3" 1032 + dependencies = [ 1033 + "equivalent", 1034 + "hashbrown 0.16.1", 1035 + ] 1036 + 1037 + [[package]] 905 1038 name = "quote" 906 1039 version = "1.0.41" 907 1040 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 934 1067 dependencies = [ 935 1068 "crossbeam-deque", 936 1069 "crossbeam-utils", 937 - ] 938 - 939 - [[package]] 940 - name = "redb" 941 - version = "3.1.0" 942 - source = "registry+https://github.com/rust-lang/crates.io-index" 943 - checksum = "ae323eb086579a3769daa2c753bb96deb95993c534711e0dbe881b5192906a06" 944 - dependencies = [ 945 - "libc", 946 1070 ] 947 1071 948 1072 [[package]] ··· 985 1109 986 1110 [[package]] 987 1111 name = "repo-stream" 988 - version = "0.1.1" 1112 + version = "0.2.2" 989 1113 dependencies = [ 990 1114 "bincode", 991 1115 "clap", 992 1116 "criterion", 993 1117 "env_logger", 1118 + "fjall", 994 1119 "futures", 995 1120 "futures-core", 996 1121 "ipld-core", 997 1122 "iroh-car", 998 1123 "log", 999 1124 "multibase", 1000 - "redb", 1001 - "rusqlite", 1002 1125 "serde", 1003 1126 "serde_bytes", 1004 1127 "serde_ipld_dagcbor", 1128 + 
"sha2", 1005 1129 "tempfile", 1006 1130 "thiserror 2.0.17", 1007 1131 "tokio", 1008 1132 ] 1009 1133 1010 1134 [[package]] 1011 - name = "rusqlite" 1012 - version = "0.37.0" 1013 - source = "registry+https://github.com/rust-lang/crates.io-index" 1014 - checksum = "165ca6e57b20e1351573e3729b958bc62f0e48025386970b6e4d29e7a7e71f3f" 1015 - dependencies = [ 1016 - "bitflags", 1017 - "fallible-iterator", 1018 - "fallible-streaming-iterator", 1019 - "hashlink", 1020 - "libsqlite3-sys", 1021 - "smallvec", 1022 - ] 1023 - 1024 - [[package]] 1025 1135 name = "rustc-demangle" 1026 1136 version = "0.1.26" 1027 1137 source = "registry+https://github.com/rust-lang/crates.io-index" 1028 1138 checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" 1139 + 1140 + [[package]] 1141 + name = "rustc-hash" 1142 + version = "2.1.1" 1143 + source = "registry+https://github.com/rust-lang/crates.io-index" 1144 + checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" 1029 1145 1030 1146 [[package]] 1031 1147 name = "rustix" ··· 1068 1184 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" 1069 1185 1070 1186 [[package]] 1187 + name = "self_cell" 1188 + version = "1.2.2" 1189 + source = "registry+https://github.com/rust-lang/crates.io-index" 1190 + checksum = "b12e76d157a900eb52e81bc6e9f3069344290341720e9178cde2407113ac8d89" 1191 + 1192 + [[package]] 1071 1193 name = "serde" 1072 1194 version = "1.0.228" 1073 1195 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1133 1255 ] 1134 1256 1135 1257 [[package]] 1258 + name = "sfa" 1259 + version = "1.0.0" 1260 + source = "registry+https://github.com/rust-lang/crates.io-index" 1261 + checksum = "a1296838937cab56cd6c4eeeb8718ec777383700c33f060e2869867bd01d1175" 1262 + dependencies = [ 1263 + "byteorder-lite", 1264 + "log", 1265 + "xxhash-rust", 1266 + ] 1267 + 1268 + [[package]] 1269 + name = "sha2" 1270 + version = "0.10.9" 1271 + source = 
"registry+https://github.com/rust-lang/crates.io-index" 1272 + checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" 1273 + dependencies = [ 1274 + "cfg-if", 1275 + "cpufeatures", 1276 + "digest", 1277 + ] 1278 + 1279 + [[package]] 1136 1280 name = "signal-hook-registry" 1137 1281 version = "1.4.6" 1138 1282 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1161 1305 dependencies = [ 1162 1306 "libc", 1163 1307 "windows-sys 0.59.0", 1308 + ] 1309 + 1310 + [[package]] 1311 + name = "spin" 1312 + version = "0.9.8" 1313 + source = "registry+https://github.com/rust-lang/crates.io-index" 1314 + checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" 1315 + dependencies = [ 1316 + "lock_api", 1164 1317 ] 1165 1318 1166 1319 [[package]] ··· 1286 1439 ] 1287 1440 1288 1441 [[package]] 1442 + name = "typenum" 1443 + version = "1.19.0" 1444 + source = "registry+https://github.com/rust-lang/crates.io-index" 1445 + checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" 1446 + 1447 + [[package]] 1289 1448 name = "unicode-ident" 1290 1449 version = "1.0.19" 1291 1450 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1316 1475 checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" 1317 1476 1318 1477 [[package]] 1319 - name = "vcpkg" 1320 - version = "0.2.15" 1478 + name = "varint-rs" 1479 + version = "2.2.0" 1321 1480 source = "registry+https://github.com/rust-lang/crates.io-index" 1322 - checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" 1481 + checksum = "8f54a172d0620933a27a4360d3db3e2ae0dd6cceae9730751a036bbf182c4b23" 1482 + 1483 + [[package]] 1484 + name = "version_check" 1485 + version = "0.9.5" 1486 + source = "registry+https://github.com/rust-lang/crates.io-index" 1487 + checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" 1323 1488 1324 1489 [[package]] 1325 1490 name = "virtue" ··· 1597 
1762 version = "0.46.0" 1598 1763 source = "registry+https://github.com/rust-lang/crates.io-index" 1599 1764 checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" 1765 + 1766 + [[package]] 1767 + name = "xxhash-rust" 1768 + version = "0.8.15" 1769 + source = "registry+https://github.com/rust-lang/crates.io-index" 1770 + checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" 1600 1771 1601 1772 [[package]] 1602 1773 name = "zerocopy"

+8 -5

Cargo.toml

··· 1 1 [package] 2 2 name = "repo-stream" 3 - version = "0.1.1" 3 + version = "0.2.2" 4 4 edition = "2024" 5 5 license = "MIT OR Apache-2.0" 6 - description = "Fast and robust atproto CAR file processing in rust" 6 + description = "A robust CAR file -> MST walker for atproto" 7 7 repository = "https://tangled.org/@microcosm.blue/repo-stream" 8 8 9 9 [dependencies] 10 10 bincode = { version = "2.0.1", features = ["serde"] } 11 + fjall = { version = "3.0.1", default-features = false } 11 12 futures = "0.3.31" 12 13 futures-core = "0.3.31" 13 14 ipld-core = { version = "0.4.2", features = ["serde"] } 14 15 iroh-car = "0.5.1" 15 16 log = "0.4.28" 16 17 multibase = "0.9.2" 17 - redb = "3.1.0" 18 - rusqlite = "0.37.0" 19 18 serde = { version = "1.0.228", features = ["derive"] } 20 19 serde_bytes = "0.11.19" 21 20 serde_ipld_dagcbor = "0.6.4" 21 + sha2 = "0.10.9" 22 22 thiserror = "2.0.17" 23 - tokio = "1.47.1" 23 + tokio = { version = "1.47.1", features = ["rt", "sync"] } 24 24 25 25 [dev-dependencies] 26 26 clap = { version = "4.5.48", features = ["derive"] } ··· 33 33 [profile.profiling] 34 34 inherits = "release" 35 35 debug = true 36 + 37 + # [profile.release] 38 + # debug = true 36 39 37 40 [[bench]] 38 41 name = "non-huge-cars"

+12 -21

benches/huge-car.rs

··· 1 1 extern crate repo_stream; 2 - use futures::TryStreamExt; 3 - use iroh_car::CarReader; 4 - use std::convert::Infallible; 2 + use repo_stream::Driver; 5 3 use std::path::{Path, PathBuf}; 6 4 7 5 use criterion::{Criterion, criterion_group, criterion_main}; ··· 20 18 }); 21 19 } 22 20 23 - async fn drive_car(filename: impl AsRef<Path>) { 21 + async fn drive_car(filename: impl AsRef<Path>) -> usize { 24 22 let reader = tokio::fs::File::open(filename).await.unwrap(); 25 23 let reader = tokio::io::BufReader::new(reader); 26 - let reader = CarReader::new(reader).await.unwrap(); 27 24 28 - let root = reader 29 - .header() 30 - .roots() 31 - .first() 32 - .ok_or("missing root") 25 + let mut driver = match Driver::load_car(reader, |block| block.len(), 1024) 26 + .await 33 27 .unwrap() 34 - .clone(); 35 - 36 - let stream = std::pin::pin!(reader.stream()); 37 - 38 - let (_commit, v) = 39 - repo_stream::drive::Vehicle::init(root, stream, |block| Ok::<_, Infallible>(block.len())) 40 - .await 41 - .unwrap(); 42 - let mut record_stream = std::pin::pin!(v.stream()); 28 + { 29 + Driver::Memory(_, mem_driver) => mem_driver, 30 + Driver::Disk(_) => panic!("not doing disk for benchmark"), 31 + }; 43 32 44 - while let Some(_) = record_stream.try_next().await.unwrap() { 45 - // just here for the drive 33 + let mut n = 0; 34 + while let Some(pairs) = driver.next_chunk(256).await.unwrap() { 35 + n += pairs.len(); 46 36 } 37 + n 47 38 } 48 39 49 40 criterion_group!(benches, criterion_benchmark);

+16 -22

benches/non-huge-cars.rs

··· 1 1 extern crate repo_stream; 2 - use futures::TryStreamExt; 3 - use iroh_car::CarReader; 4 - use std::convert::Infallible; 2 + use repo_stream::Driver; 5 3 6 4 use criterion::{Criterion, criterion_group, criterion_main}; 7 5 6 + const EMPTY_CAR: &'static [u8] = include_bytes!("../car-samples/empty.car"); 8 7 const TINY_CAR: &'static [u8] = include_bytes!("../car-samples/tiny.car"); 9 8 const LITTLE_CAR: &'static [u8] = include_bytes!("../car-samples/little.car"); 10 9 const MIDSIZE_CAR: &'static [u8] = include_bytes!("../car-samples/midsize.car"); ··· 15 14 .build() 16 15 .expect("Creating runtime failed"); 17 16 17 + c.bench_function("empty-car", |b| { 18 + b.to_async(&rt).iter(async || drive_car(EMPTY_CAR).await) 19 + }); 18 20 c.bench_function("tiny-car", |b| { 19 21 b.to_async(&rt).iter(async || drive_car(TINY_CAR).await) 20 22 }); ··· 26 28 }); 27 29 } 28 30 29 - async fn drive_car(bytes: &[u8]) { 30 - let reader = CarReader::new(bytes).await.unwrap(); 31 - 32 - let root = reader 33 - .header() 34 - .roots() 35 - .first() 36 - .ok_or("missing root") 31 + async fn drive_car(bytes: &[u8]) -> usize { 32 + let mut driver = match Driver::load_car(bytes, |block| block.len(), 32) 33 + .await 37 34 .unwrap() 38 - .clone(); 39 - 40 - let stream = std::pin::pin!(reader.stream()); 35 + { 36 + Driver::Memory(_, mem_driver) => mem_driver, 37 + Driver::Disk(_) => panic!("not benching big cars here"), 38 + }; 41 39 42 - let (_commit, v) = 43 - repo_stream::drive::Vehicle::init(root, stream, |block| Ok::<_, Infallible>(block.len())) 44 - .await 45 - .unwrap(); 46 - let mut record_stream = std::pin::pin!(v.stream()); 47 - 48 - while let Some(_) = record_stream.try_next().await.unwrap() { 49 - // just here for the drive 40 + let mut n = 0; 41 + while let Some(pairs) = driver.next_chunk(256).await.unwrap() { 42 + n += pairs.len(); 50 43 } 44 + n 51 45 } 52 46 53 47 criterion_group!(benches, criterion_benchmark);

car-samples/empty.car

This is a binary file and will not be displayed.

+60 -27

examples/disk-read-file/main.rs

··· 1 + /*! 2 + Read a CAR file by spilling to disk 3 + */ 4 + 1 5 extern crate repo_stream; 2 6 use clap::Parser; 3 - use futures::TryStreamExt; 4 - use iroh_car::CarReader; 5 - use std::convert::Infallible; 7 + use repo_stream::{DiskBuilder, Driver, DriverBuilder}; 6 8 use std::path::PathBuf; 7 - 8 - type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>; 9 + use std::time::Instant; 9 10 10 11 #[derive(Debug, Parser)] 11 12 struct Args { ··· 16 17 } 17 18 18 19 #[tokio::main] 19 - async fn main() -> Result<()> { 20 + async fn main() -> Result<(), Box<dyn std::error::Error>> { 20 21 env_logger::init(); 21 22 22 23 let Args { car, tmpfile } = Args::parse(); 24 + 25 + // repo-stream takes an AsyncRead as input. wrapping a filesystem read in 26 + // BufReader can provide a really significant performance win. 23 27 let reader = tokio::fs::File::open(car).await?; 24 28 let reader = tokio::io::BufReader::new(reader); 25 29 26 - println!("hello!"); 30 + log::info!("hello! reading the car..."); 31 + let t0 = Instant::now(); 27 32 28 - let reader = CarReader::new(reader).await?; 33 + // in this example we only bother handling CARs that are too big for memory 34 + // `noop` helper means: do no block processing, store the raw blocks 35 + let driver = match DriverBuilder::new() 36 + .with_mem_limit_mb(32) // how much memory can be used before disk spill 37 + .load_car(reader) 38 + .await? 
39 + { 40 + Driver::Memory(_, _) => panic!("try this on a bigger car"), 41 + Driver::Disk(big_stuff) => { 42 + // we reach here if the repo was too big and needs to be spilled to 43 + // disk to continue 29 44 30 - let redb_store = repo_stream::disk_redb::RedbStore::new(tmpfile)?; 45 + // set up a disk store we can spill to 46 + let disk_store = DiskBuilder::new().open(tmpfile).await?; 47 + 48 + // do the spilling, get back a (similar) driver 49 + let (commit, driver) = big_stuff.finish_loading(disk_store).await?; 50 + 51 + // at this point you might want to fetch the account's signing key 52 + // via the DID from the commit, and then verify the signature. 53 + log::warn!("big's comit ({:?}): {:?}", t0.elapsed(), commit); 54 + 55 + // pop the driver back out to get some code indentation relief 56 + driver 57 + } 58 + }; 31 59 32 - let root = reader 33 - .header() 34 - .roots() 35 - .first() 36 - .ok_or("missing root")? 37 - .clone(); 38 - log::debug!("root: {root:?}"); 60 + // collect some random stats about the blocks 61 + let mut n = 0; 62 + let mut zeros = 0; 39 63 40 - // let stream = Box::pin(reader.stream()); 41 - let stream = std::pin::pin!(reader.stream()); 64 + log::info!("walking..."); 42 65 43 - let (commit, v) = repo_stream::disk_drive::Vehicle::init(root, stream, redb_store, |block| { 44 - Ok::<_, Infallible>(block.len()) 45 - }) 46 - .await?; 47 - let mut record_stream = std::pin::pin!(v.stream()); 66 + // this example uses the disk driver's channel mode: the tree walking is 67 + // spawned onto a blocking thread, and we get chunks of rkey+blocks back 68 + let (mut rx, join) = driver.to_channel(512); 69 + while let Some(r) = rx.recv().await { 70 + let pairs = r?; 48 71 49 - log::info!("got commit: {commit:?}"); 72 + // keep a count of the total number of blocks seen 73 + n += pairs.len(); 50 74 51 - while let Some((rkey, _rec)) = record_stream.try_next().await? 
{ 52 - log::info!("got {rkey:?}"); 75 + for (_, block) in pairs { 76 + // for each block, count how many bytes are equal to '0' 77 + // (this is just an example, you probably want to do something more 78 + // interesting) 79 + zeros += block.into_iter().filter(|&b| b == b'0').count() 80 + } 53 81 } 54 - log::info!("bye!"); 82 + 83 + log::info!("arrived! ({:?}) joining rx...", t0.elapsed()); 84 + 85 + join.await?; 86 + 87 + log::info!("done. n={n} zeros={zeros}"); 55 88 56 89 Ok(()) 57 90 }

+18 -25

examples/read-file/main.rs

··· 1 + /*! 2 + Read a CAR file with in-memory processing 3 + */ 4 + 1 5 extern crate repo_stream; 2 6 use clap::Parser; 3 - use futures::TryStreamExt; 4 - use iroh_car::CarReader; 5 - use std::convert::Infallible; 7 + use repo_stream::{Driver, DriverBuilder}; 6 8 use std::path::PathBuf; 7 9 8 10 type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>; ··· 21 23 let reader = tokio::fs::File::open(file).await?; 22 24 let reader = tokio::io::BufReader::new(reader); 23 25 24 - println!("hello!"); 25 - 26 - let reader = CarReader::new(reader).await?; 27 - 28 - let root = reader 29 - .header() 30 - .roots() 31 - .first() 32 - .ok_or("missing root")? 33 - .clone(); 34 - log::debug!("root: {root:?}"); 35 - 36 - // let stream = Box::pin(reader.stream()); 37 - let stream = std::pin::pin!(reader.stream()); 38 - 39 - let (commit, v) = 40 - repo_stream::drive::Vehicle::init(root, stream, |block| Ok::<_, Infallible>(block.len())) 41 - .await?; 42 - let mut record_stream = std::pin::pin!(v.stream()); 26 + let (commit, mut driver) = match DriverBuilder::new() 27 + .with_block_processor(|block| block.len()) 28 + .load_car(reader) 29 + .await? 30 + { 31 + Driver::Memory(commit, mem_driver) => (commit, mem_driver), 32 + Driver::Disk(_) => panic!("this example doesn't handle big CARs"), 33 + }; 43 34 44 35 log::info!("got commit: {commit:?}"); 45 36 46 - while let Some((rkey, _rec)) = record_stream.try_next().await? { 47 - log::info!("got {rkey:?}"); 37 + let mut n = 0; 38 + while let Some(pairs) = driver.next_chunk(256).await? { 39 + n += pairs.len(); 40 + // log::info!("got {rkey:?}"); 48 41 } 49 - log::info!("bye!"); 42 + log::info!("bye! total records={n}"); 50 43 51 44 Ok(()) 52 45 }

+67 -2

readme.md

··· 1 1 # repo-stream 2 2 3 - Fast and (aspirationally) robust atproto CAR file processing in rust 3 + A robust CAR file -> MST walker for atproto 4 + 5 + [![Crates.io][crates-badge]](https://crates.io/crates/repo-stream) 6 + [![Documentation][docs-badge]](https://docs.rs/repo-stream) 7 + [![Sponsor][sponsor-badge]](https://github.com/sponsors/uniphil) 8 + 9 + [crates-badge]: https://img.shields.io/crates/v/repo-stream.svg 10 + [docs-badge]: https://docs.rs/repo-stream/badge.svg 11 + [sponsor-badge]: https://img.shields.io/badge/at-microcosm-b820f9?labelColor=b820f9&logo=githubsponsors&logoColor=fff 12 + 13 + ```rust 14 + use repo_stream::{Driver, DriverBuilder, DriveError, DiskBuilder}; 15 + 16 + #[tokio::main] 17 + async fn main() -> Result<(), DriveError> { 18 + // repo-stream takes any AsyncRead as input, like a tokio::fs::File 19 + let reader = tokio::fs::File::open("repo.car".into()).await?; 20 + let reader = tokio::io::BufReader::new(reader); 21 + 22 + // example repo workload is simply counting the total record bytes 23 + let mut total_size = 0; 24 + 25 + match DriverBuilder::new() 26 + .with_mem_limit_mb(10) 27 + .with_block_processor(|rec| rec.len()) // block processing: just extract the raw record size 28 + .load_car(reader) 29 + .await? 30 + { 31 + 32 + // if all blocks fit within memory 33 + Driver::Memory(_commit, mut driver) => { 34 + while let Some(chunk) = driver.next_chunk(256).await? { 35 + for (_rkey, size) in chunk { 36 + total_size += size; 37 + } 38 + } 39 + }, 40 + 41 + // if the CAR was too big for in-memory processing 42 + Driver::Disk(paused) => { 43 + // set up a disk store we can spill to 44 + let store = DiskBuilder::new().open("some/path.db".into()).await?; 45 + // do the spilling, get back a (similar) driver 46 + let (_commit, mut driver) = paused.finish_loading(store).await?; 47 + 48 + while let Some(chunk) = driver.next_chunk(256).await? 
{ 49 + for (_rkey, size) in chunk { 50 + total_size += size; 51 + } 52 + } 53 + } 54 + }; 55 + println!("sum of size of all records: {total_size}"); 56 + Ok(()) 57 + } 58 + ``` 59 + 60 + more recent todo 61 + 62 + - [ ] get an *empty* car for the test suite 63 + - [x] implement a max size on disk limit 64 + 65 + 66 + ----- 67 + 68 + older stuff (to clean up): 4 69 5 70 6 71 current car processing times (records processed into their length usize, phil's dev machine): ··· 27 92 -> yeah the commit is returned from init 28 93 - [ ] spec compliance todos 29 94 - [x] assert that keys are ordered and fail if not 30 - - [ ] verify node mst depth from key (possibly pending [interop test fixes](https://github.com/bluesky-social/atproto-interop-tests/issues/5)) 95 + - [x] verify node mst depth from key (possibly pending [interop test fixes](https://github.com/bluesky-social/atproto-interop-tests/issues/5)) 31 96 - [ ] performance todos 32 97 - [x] consume the serialized nodes into a mutable efficient format 33 98 - [ ] maybe customize the deserialize impl to do that directly?

+162

src/disk.rs

··· 1 + /*! 2 + Disk storage for blocks 3 + 4 + Currently this uses fjall. (An earlier version used sqlite: in testing sqlite wasn't the fastest, but it seemed 5 + to be the best behaved in terms of both on-disk space usage and memory usage.) 6 + 7 + ```no_run 8 + # use repo_stream::{DiskBuilder, DiskError}; 9 + # #[tokio::main] 10 + # async fn main() -> Result<(), DiskError> { 11 + let store = DiskBuilder::new() 12 + .with_cache_size_mb(32) 13 + .with_max_stored_mb(1024) // errors when >1GiB of processed blocks are inserted 14 + .open("/some/path.db".into()).await?; 15 + # Ok(()) 16 + # } 17 + ``` 18 + */ 19 + 20 + use crate::drive::DriveError; 21 + use fjall::config::{CompressionPolicy, PinningPolicy, RestartIntervalPolicy}; 22 + use fjall::{CompressionType, Database, Error as FjallError, Keyspace, KeyspaceCreateOptions}; 23 + use std::path::PathBuf; 24 + 25 + #[derive(Debug, thiserror::Error)] 26 + pub enum DiskError { 27 + /// A wrapped database error 28 + /// 29 + /// (The wrapped err should probably be obscured to remove public-facing 30 + /// fjall bits) 31 + #[error(transparent)] 32 + DbError(#[from] FjallError), 33 + /// A tokio blocking task failed to join 34 + #[error("Failed to join a tokio blocking task: {0}")] 35 + JoinError(#[from] tokio::task::JoinError), 36 + /// The total size of stored blocks exceeded the allowed size 37 + /// 38 + /// If you need to process *really* big CARs, you can configure a higher 39 + /// limit. 
40 + #[error("Maximum disk size reached")] 41 + MaxSizeExceeded, 42 + } 43 + 44 + /// Builder-style disk store setup 45 + #[derive(Debug, Clone)] 46 + pub struct DiskBuilder { 47 + /// Database in-memory cache allowance 48 + /// 49 + /// Default: 64 MiB 50 + pub cache_size_mb: usize, 51 + /// Database stored block size limit 52 + /// 53 + /// Default: 10 GiB 54 + /// 55 + /// Note: actual size on disk may be more, but should approximately scale 56 + /// with this limit 57 + pub max_stored_mb: usize, 58 + } 59 + 60 + impl Default for DiskBuilder { 61 + fn default() -> Self { 62 + Self { 63 + cache_size_mb: 64, 64 + max_stored_mb: 10 * 1024, // 10 GiB 65 + } 66 + } 67 + } 68 + 69 + impl DiskBuilder { 70 + /// Begin configuring the storage with defaults 71 + pub fn new() -> Self { 72 + Default::default() 73 + } 74 + /// Set the in-memory cache allowance for the database 75 + /// 76 + /// Default: 64 MiB 77 + pub fn with_cache_size_mb(mut self, size: usize) -> Self { 78 + self.cache_size_mb = size; 79 + self 80 + } 81 + /// Set the approximate stored block size limit 82 + /// 83 + /// Default: 10 GiB 84 + pub fn with_max_stored_mb(mut self, max: usize) -> Self { 85 + self.max_stored_mb = max; 86 + self 87 + } 88 + /// Open and initialize the actual disk storage 89 + pub async fn open(&self, path: PathBuf) -> Result<DiskStore, DiskError> { 90 + DiskStore::new(path, self.cache_size_mb, self.max_stored_mb).await 91 + } 92 + } 93 + 94 + /// On-disk block storage 95 + pub struct DiskStore { 96 + #[allow(unused)] 97 + db: Database, 98 + partition: Keyspace, 99 + max_stored: usize, 100 + stored: usize, 101 + } 102 + 103 + impl DiskStore { 104 + /// Initialize a new disk store 105 + pub async fn new( 106 + path: PathBuf, 107 + cache_mb: usize, 108 + max_stored_mb: usize, 109 + ) -> Result<Self, DiskError> { 110 + let max_stored = max_stored_mb * 2_usize.pow(20); 111 + let (db, partition) = tokio::task::spawn_blocking(move || { 112 + let db = Database::builder(path) 113 + // 
.manual_journal_persist(true) 114 + // .flush_workers(1) 115 + // .compaction_workers(1) 116 + .journal_compression(CompressionType::None) 117 + .cache_size(cache_mb as u64 * 2_u64.pow(20)) 118 + .temporary(true) 119 + .open()?; 120 + let opts = KeyspaceCreateOptions::default() 121 + .data_block_restart_interval_policy(RestartIntervalPolicy::all(8)) 122 + .filter_block_pinning_policy(PinningPolicy::disabled()) 123 + .expect_point_read_hits(true) 124 + .data_block_compression_policy(CompressionPolicy::disabled()) 125 + .manual_journal_persist(true) 126 + .max_memtable_size(32 * 2_u64.pow(20)); 127 + let partition = db.keyspace("z", || opts)?; 128 + 129 + Ok::<_, DiskError>((db, partition)) 130 + }) 131 + .await??; 132 + 133 + Ok(Self { 134 + db, 135 + partition, 136 + max_stored, 137 + stored: 0, 138 + }) 139 + } 140 + 141 + pub(crate) fn put_many( 142 + &mut self, 143 + kv: impl Iterator<Item = Result<(Vec<u8>, Vec<u8>), DriveError>>, 144 + ) -> Result<(), DriveError> { 145 + let mut batch = self.db.batch(); 146 + for pair in kv { 147 + let (k, v) = pair?; 148 + self.stored += v.len(); 149 + if self.stored > self.max_stored { 150 + return Err(DiskError::MaxSizeExceeded.into()); 151 + } 152 + batch.insert(&self.partition, k, v); 153 + } 154 + batch.commit().map_err(DiskError::DbError)?; 155 + Ok(()) 156 + } 157 + 158 + #[inline] 159 + pub(crate) fn get(&mut self, key: &[u8]) -> Result<Option<fjall::Slice>, FjallError> { 160 + self.partition.get(key) 161 + } 162 + }

-175

src/disk_drive.rs

··· 1 - use futures::Stream; 2 - use futures::TryStreamExt; 3 - use std::error::Error; 4 - 5 - use crate::disk_walk::{Step, Trip, Walker}; 6 - use crate::mst::Commit; 7 - use crate::mst::Node; 8 - 9 - use ipld_core::cid::Cid; 10 - use serde::{Deserialize, Serialize, de::DeserializeOwned}; 11 - 12 - /// Errors that can happen while consuming and emitting blocks and records 13 - #[derive(Debug, thiserror::Error)] 14 - pub enum DriveError { 15 - #[error("Failed to initialize CarReader: {0}")] 16 - CarReader(#[from] iroh_car::Error), 17 - #[error("Car block stream error: {0}")] 18 - CarBlockError(Box<dyn Error>), 19 - #[error("Failed to decode commit block: {0}")] 20 - BadCommit(Box<dyn Error>), 21 - #[error("The Commit block reference by the root was not found")] 22 - MissingCommit, 23 - #[error("The MST block {0} could not be found")] 24 - MissingBlock(Cid), 25 - #[error("Failed to walk the mst tree: {0}")] 26 - Tripped(#[from] Trip), 27 - } 28 - 29 - #[derive(Debug, Clone, Serialize, Deserialize)] 30 - pub enum MaybeProcessedBlock<T: Clone + Serialize> { 31 - /// A block that's *probably* a Node (but we can't know yet) 32 - /// 33 - /// It *can be* a record that suspiciously looks a lot like a node, so we 34 - /// cannot eagerly turn it into a Node. We only know for sure what it is 35 - /// when we actually walk down the MST 36 - Raw(Vec<u8>), 37 - /// A processed record from a block that was definitely not a Node 38 - /// 39 - /// If we _never_ needed this block, then we may have wasted a bit of effort 40 - /// trying to process it. Oh well. 41 - /// 42 - /// Processing has to be fallible because the CAR can have totally-unused 43 - /// blocks, which can just be garbage. since we're eagerly trying to process 44 - /// record blocks without knowing for sure that they *are* records, we 45 - /// discard any definitely-not-nodes that fail processing and keep their 46 - /// error in the buffer for them. 
if we later try to retreive them as a 47 - /// record, then we can surface the error. 48 - /// 49 - /// The error type is `String` because we don't really want to put 50 - /// any constraints like `Serialize` on the error type, and `Error` 51 - /// at least requires `Display`. It's a compromise. 52 - ProcessedOk(T), 53 - Unprocessable(String), 54 - } 55 - 56 - pub trait BlockStore<MPB: Serialize + DeserializeOwned> { 57 - fn put(&self, key: Cid, value: MPB); // unwraps for now 58 - fn get(&self, key: Cid) -> Option<MPB>; 59 - } 60 - 61 - type CarBlock<E> = Result<(Cid, Vec<u8>), E>; 62 - 63 - /// The core driver between the block stream and MST walker 64 - pub struct Vehicle<SE, S, T, BS, P, PE> 65 - where 66 - SE: Error + 'static, 67 - S: Stream<Item = CarBlock<SE>>, 68 - T: Clone + Serialize + DeserializeOwned, 69 - BS: BlockStore<MaybeProcessedBlock<T>>, 70 - P: Fn(&[u8]) -> Result<T, PE>, 71 - PE: Error, 72 - { 73 - #[allow(dead_code)] 74 - block_stream: S, 75 - block_store: BS, 76 - walker: Walker, 77 - process: P, 78 - } 79 - 80 - impl<SE, S, T, BS, P, PE> Vehicle<SE, S, T, BS, P, PE> 81 - where 82 - SE: Error + 'static, 83 - S: Stream<Item = CarBlock<SE>> + Unpin, 84 - T: Clone + Serialize + DeserializeOwned, 85 - BS: BlockStore<MaybeProcessedBlock<T>>, 86 - P: Fn(&[u8]) -> Result<T, PE>, 87 - PE: Error, 88 - { 89 - /// Set up the stream 90 - /// 91 - /// This will eagerly consume blocks until the `Commit` object is found. 92 - /// *Usually* the it's the first block, but there is no guarantee. 93 - /// 94 - /// ### Parameters 95 - /// 96 - /// `root`: CID of the commit object that is the root of the MST 97 - /// 98 - /// `block_stream`: Input stream of raw CAR blocks 99 - /// 100 - /// `process`: record-transforming callback: 101 - /// 102 - /// For tasks where records can be quickly processed into a *smaller* 103 - /// useful representation, you can do that eagerly as blocks come in by 104 - /// passing the processor as a callback here. 
This can reduce overall 105 - /// memory usage. 106 - pub async fn init( 107 - root: Cid, 108 - mut block_stream: S, 109 - block_store: BS, 110 - process: P, 111 - ) -> Result<(Commit, Self), DriveError> { 112 - let mut commit = None; 113 - 114 - log::warn!("init: load blocks"); 115 - 116 - // go ahead and put all blocks in the block store 117 - while let Some((cid, data)) = block_stream 118 - .try_next() 119 - .await 120 - .map_err(|e| DriveError::CarBlockError(e.into()))? 121 - { 122 - if cid == root { 123 - let c: Commit = serde_ipld_dagcbor::from_slice(&data) 124 - .map_err(|e| DriveError::BadCommit(e.into()))?; 125 - commit = Some(c); 126 - } else { 127 - block_store.put( 128 - cid, 129 - if Node::could_be(&data) { 130 - MaybeProcessedBlock::Raw(data) 131 - } else { 132 - match process(&data) { 133 - Ok(t) => MaybeProcessedBlock::ProcessedOk(t), 134 - Err(e) => MaybeProcessedBlock::Unprocessable(e.to_string()), 135 - } 136 - }, 137 - ); 138 - } 139 - } 140 - 141 - log::warn!("init: got commit?"); 142 - 143 - // we either broke out or read all the blocks without finding the commit... 144 - let commit = commit.ok_or(DriveError::MissingCommit)?; 145 - 146 - let walker = Walker::new(commit.data); 147 - 148 - log::warn!("init: wrapping up"); 149 - 150 - let me = Self { 151 - block_stream, 152 - block_store, 153 - walker, 154 - process, 155 - }; 156 - Ok((commit, me)) 157 - } 158 - 159 - /// Manually step through the record outputs 160 - pub async fn next_record(&mut self) -> Result<Option<(String, T)>, DriveError> { 161 - match self.walker.step(&mut self.block_store, &self.process)? 
{ 162 - Step::Rest(cid) => Err(DriveError::MissingBlock(cid)), 163 - Step::Finish => Ok(None), 164 - Step::Step { rkey, data } => Ok(Some((rkey, data))), 165 - } 166 - } 167 - 168 - /// Convert to a futures::stream of record outputs 169 - pub fn stream(self) -> impl Stream<Item = Result<(String, T), DriveError>> { 170 - futures::stream::try_unfold(self, |mut this| async move { 171 - let maybe_record = this.next_record().await?; 172 - Ok(maybe_record.map(|b| (b, this))) 173 - }) 174 - } 175 - }

-53

src/disk_redb.rs

··· 1 - use crate::disk_drive::BlockStore; 2 - use ipld_core::cid::Cid; 3 - use redb::{Database, Error, ReadableTable, TableDefinition, WriteTransaction}; 4 - use serde::{Serialize, de::DeserializeOwned}; 5 - use std::path::Path; 6 - 7 - const TABLE: TableDefinition<&[u8], &[u8]> = TableDefinition::new("blocks"); 8 - 9 - pub struct RedbStore { 10 - #[allow(dead_code)] 11 - db: Database, 12 - tx: Option<WriteTransaction>, 13 - } 14 - 15 - impl RedbStore { 16 - pub fn new(path: impl AsRef<Path>) -> Result<Self, Error> { 17 - log::warn!("redb new"); 18 - let db = Database::create(path)?; 19 - log::warn!("db created"); 20 - let mut tx = db.begin_write()?; 21 - tx.set_durability(redb::Durability::None).unwrap(); 22 - log::warn!("transaction begun"); 23 - Ok(Self { db, tx: Some(tx) }) 24 - } 25 - } 26 - 27 - impl Drop for RedbStore { 28 - fn drop(&mut self) { 29 - let tx = self.tx.take(); 30 - tx.unwrap().abort().unwrap(); 31 - } 32 - } 33 - 34 - impl<MPB: Serialize + DeserializeOwned> BlockStore<MPB> for RedbStore { 35 - fn put(&self, c: Cid, t: MPB) { 36 - let key_bytes = c.to_bytes(); 37 - let val_bytes = bincode::serde::encode_to_vec(t, bincode::config::standard()).unwrap(); 38 - { 39 - let mut table = self.tx.as_ref().unwrap().open_table(TABLE).unwrap(); 40 - table.insert(&*key_bytes, &*val_bytes).unwrap(); 41 - } 42 - } 43 - fn get(&self, c: Cid) -> Option<MPB> { 44 - let key_bytes = c.to_bytes(); 45 - let table = self.tx.as_ref().unwrap().open_table(TABLE).unwrap(); 46 - let maybe_val_bytes = table.get(&*key_bytes).unwrap()?; 47 - let (t, n): (MPB, usize) = 48 - bincode::serde::decode_from_slice(maybe_val_bytes.value(), bincode::config::standard()) 49 - .unwrap(); 50 - assert_eq!(maybe_val_bytes.value().len(), n); 51 - Some(t) 52 - } 53 - }

-65

src/disk_sqlite.rs

··· 1 - use crate::disk_drive::BlockStore; 2 - use ipld_core::cid::Cid; 3 - use rusqlite::{Connection, OptionalExtension, Result}; 4 - use serde::{Serialize, de::DeserializeOwned}; 5 - use std::path::Path; 6 - 7 - pub struct SqliteStore { 8 - conn: Connection, 9 - } 10 - 11 - impl SqliteStore { 12 - pub fn new(path: impl AsRef<Path>) -> Result<Self> { 13 - let conn = Connection::open(path)?; 14 - conn.pragma_update(None, "journal_mode", "WAL")?; 15 - conn.pragma_update(None, "synchronous", "OFF")?; 16 - conn.pragma_update(None, "cache_size", (-32 * 2_i64.pow(10)).to_string())?; 17 - conn.execute( 18 - "CREATE TABLE blocks ( 19 - key BLOB PRIMARY KEY NOT NULL, 20 - val BLOB NOT NULL 21 - ) WITHOUT ROWID", 22 - (), 23 - )?; 24 - 25 - Ok(Self { conn }) 26 - } 27 - } 28 - 29 - impl Drop for SqliteStore { 30 - fn drop(&mut self) { 31 - self.conn.execute("DROP TABLE blocks", ()).unwrap(); 32 - } 33 - } 34 - 35 - impl<MPB: Serialize + DeserializeOwned> BlockStore<MPB> for SqliteStore { 36 - fn put(&self, c: Cid, t: MPB) { 37 - let key_bytes = c.to_bytes(); 38 - let val_bytes = bincode::serde::encode_to_vec(t, bincode::config::standard()).unwrap(); 39 - 40 - self.conn 41 - .execute( 42 - "INSERT INTO blocks (key, val) VALUES (?1, ?2)", 43 - (&key_bytes, &val_bytes), 44 - ) 45 - .unwrap(); 46 - } 47 - fn get(&self, c: Cid) -> Option<MPB> { 48 - let key_bytes = c.to_bytes(); 49 - 50 - let val_bytes: Vec<u8> = self 51 - .conn 52 - .query_one( 53 - "SELECT val FROM blocks WHERE key = ?1", 54 - (&key_bytes,), 55 - |row| row.get(0), 56 - ) 57 - .optional() 58 - .unwrap()?; 59 - 60 - let (t, n): (MPB, usize) = 61 - bincode::serde::decode_from_slice(&val_bytes, bincode::config::standard()).unwrap(); 62 - assert_eq!(val_bytes.len(), n); 63 - Some(t) 64 - } 65 - }

-403

src/disk_walk.rs

··· 1 - //! Depth-first MST traversal 2 - 3 - use crate::disk_drive::{BlockStore, MaybeProcessedBlock}; 4 - use crate::mst::Node; 5 - 6 - use ipld_core::cid::Cid; 7 - use serde::{Serialize, de::DeserializeOwned}; 8 - use std::error::Error; 9 - 10 - /// Errors that can happen while walking 11 - #[derive(Debug, thiserror::Error)] 12 - pub enum Trip { 13 - #[error("empty mst nodes are not allowed")] 14 - NodeEmpty, 15 - #[error("Failed to decode commit block: {0}")] 16 - BadCommit(Box<dyn std::error::Error>), 17 - #[error("Action node error: {0}")] 18 - RkeyError(#[from] RkeyError), 19 - #[error("Process failed: {0}")] 20 - ProcessFailed(String), 21 - #[error("Encountered an rkey out of order while walking the MST")] 22 - RkeyOutOfOrder, 23 - } 24 - 25 - /// Errors from invalid Rkeys 26 - #[derive(Debug, thiserror::Error)] 27 - pub enum RkeyError { 28 - #[error("Failed to compute an rkey due to invalid prefix_len")] 29 - EntryPrefixOutOfbounds, 30 - #[error("RKey was not utf-8")] 31 - EntryRkeyNotUtf8(#[from] std::string::FromUtf8Error), 32 - } 33 - 34 - /// Walker outputs 35 - #[derive(Debug)] 36 - pub enum Step<T: Serialize + DeserializeOwned> { 37 - /// We need a CID but it's not in the block store 38 - /// 39 - /// Give the needed CID to the driver so it can load blocks until it's found 40 - Rest(Cid), 41 - /// Reached the end of the MST! yay! 42 - Finish, 43 - /// A record was found! 
44 - Step { rkey: String, data: T }, 45 - } 46 - 47 - #[derive(Debug, Clone, PartialEq)] 48 - enum Need { 49 - Node(Cid), 50 - Record { rkey: String, cid: Cid }, 51 - } 52 - 53 - fn push_from_node(stack: &mut Vec<Need>, node: &Node) -> Result<(), RkeyError> { 54 - let mut entries = Vec::with_capacity(node.entries.len()); 55 - 56 - let mut prefix = vec![]; 57 - for entry in &node.entries { 58 - let mut rkey = vec![]; 59 - let pre_checked = prefix 60 - .get(..entry.prefix_len) 61 - .ok_or(RkeyError::EntryPrefixOutOfbounds)?; 62 - rkey.extend_from_slice(pre_checked); 63 - rkey.extend_from_slice(&entry.keysuffix); 64 - prefix = rkey.clone(); 65 - 66 - entries.push(Need::Record { 67 - rkey: String::from_utf8(rkey)?, 68 - cid: entry.value, 69 - }); 70 - if let Some(ref tree) = entry.tree { 71 - entries.push(Need::Node(*tree)); 72 - } 73 - } 74 - 75 - entries.reverse(); 76 - stack.append(&mut entries); 77 - 78 - if let Some(tree) = node.left { 79 - stack.push(Need::Node(tree)); 80 - } 81 - Ok(()) 82 - } 83 - 84 - /// Traverser of an atproto MST 85 - /// 86 - /// Walks the tree from left-to-right in depth-first order 87 - #[derive(Debug)] 88 - pub struct Walker { 89 - stack: Vec<Need>, 90 - prev: String, 91 - } 92 - 93 - impl Walker { 94 - pub fn new(tree_root_cid: Cid) -> Self { 95 - Self { 96 - stack: vec![Need::Node(tree_root_cid)], 97 - prev: "".to_string(), 98 - } 99 - } 100 - 101 - /// Advance through nodes until we find a record or can't go further 102 - pub fn step<T: Clone + Serialize + DeserializeOwned, E: Error>( 103 - &mut self, 104 - block_store: &mut impl BlockStore<MaybeProcessedBlock<T>>, 105 - process: impl Fn(&[u8]) -> Result<T, E>, 106 - ) -> Result<Step<T>, Trip> { 107 - loop { 108 - let Some(mut need) = self.stack.last() else { 109 - log::trace!("tried to walk but we're actually done."); 110 - return Ok(Step::Finish); 111 - }; 112 - 113 - match &mut need { 114 - Need::Node(cid) => { 115 - log::trace!("need node {cid:?}"); 116 - let Some(mpb) = 
block_store.get(*cid) else { 117 - log::trace!("node not found, resting"); 118 - return Ok(Step::Rest(*cid)); 119 - }; 120 - 121 - let MaybeProcessedBlock::<T>::Raw(block) = mpb else { 122 - return Err(Trip::BadCommit("failed commit fingerprint".into())); 123 - }; 124 - let node = serde_ipld_dagcbor::from_slice::<Node>(&block) 125 - .map_err(|e| Trip::BadCommit(e.into()))?; 126 - 127 - // found node, make sure we remember 128 - self.stack.pop(); 129 - 130 - // queue up work on the found node next 131 - push_from_node(&mut self.stack, &node)?; 132 - } 133 - Need::Record { rkey, cid } => { 134 - log::trace!("need record {cid:?}"); 135 - let Some(mpb) = block_store.get(*cid) else { 136 - log::trace!("record block not found, resting"); 137 - return Ok(Step::Rest(*cid)); 138 - }; 139 - let rkey = rkey.clone(); 140 - let data = match mpb { 141 - MaybeProcessedBlock::Raw(data) => match process(&data) { 142 - Ok(t) => Ok(t), 143 - Err(e) => Err(Trip::ProcessFailed(e.to_string())), 144 - }, 145 - MaybeProcessedBlock::ProcessedOk(t) => Ok(t.clone()), 146 - MaybeProcessedBlock::Unprocessable(s) => { 147 - return Err(Trip::ProcessFailed(s.clone())); 148 - } 149 - }; 150 - 151 - // found node, make sure we remember 152 - self.stack.pop(); 153 - 154 - log::trace!("emitting a block as a step. 
depth={}", self.stack.len()); 155 - 156 - let data = data.map_err(|e| Trip::ProcessFailed(e.to_string()))?; 157 - 158 - // rkeys *must* be in order or else the tree is invalid (or 159 - // we have a bug) 160 - if rkey <= self.prev { 161 - return Err(Trip::RkeyOutOfOrder); 162 - } 163 - self.prev = rkey.clone(); 164 - 165 - return Ok(Step::Step { rkey, data }); 166 - } 167 - } 168 - } 169 - } 170 - } 171 - 172 - #[cfg(test)] 173 - mod test { 174 - use super::*; 175 - // use crate::mst::Entry; 176 - 177 - fn cid1() -> Cid { 178 - "bafyreihixenvk3ahqbytas4hk4a26w43bh6eo3w6usjqtxkpzsvi655a3m" 179 - .parse() 180 - .unwrap() 181 - } 182 - // fn cid2() -> Cid { 183 - // "QmY7Yh4UquoXHLPFo2XbhXkhBvFoPwmQUSa92pxnxjQuPU" 184 - // .parse() 185 - // .unwrap() 186 - // } 187 - // fn cid3() -> Cid { 188 - // "bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi" 189 - // .parse() 190 - // .unwrap() 191 - // } 192 - // fn cid4() -> Cid { 193 - // "QmbWqxBEKC3P8tqsKc98xmWNzrzDtRLMiMPL8wBuTGsMnR" 194 - // .parse() 195 - // .unwrap() 196 - // } 197 - // fn cid5() -> Cid { 198 - // "QmSnuWmxptJZdLJpKRarxBMS2Ju2oANVrgbr2xWbie9b2D" 199 - // .parse() 200 - // .unwrap() 201 - // } 202 - // fn cid6() -> Cid { 203 - // "QmdmQXB2mzChmMeKY47C43LxUdg1NDJ5MWcKMKxDu7RgQm" 204 - // .parse() 205 - // .unwrap() 206 - // } 207 - // fn cid7() -> Cid { 208 - // "bafybeiaysi4s6lnjev27ln5icwm6tueaw2vdykrtjkwiphwekaywqhcjze" 209 - // .parse() 210 - // .unwrap() 211 - // } 212 - // fn cid8() -> Cid { 213 - // "bafyreif3tfdpr5n4jdrbielmcapwvbpcthepfkwq2vwonmlhirbjmotedi" 214 - // .parse() 215 - // .unwrap() 216 - // } 217 - // fn cid9() -> Cid { 218 - // "bafyreicnokmhmrnlp2wjhyk2haep4tqxiptwfrp2rrs7rzq7uk766chqvq" 219 - // .parse() 220 - // .unwrap() 221 - // } 222 - 223 - #[test] 224 - fn test_next_from_node_empty() { 225 - let node = Node { 226 - left: None, 227 - entries: vec![], 228 - }; 229 - let mut stack = vec![]; 230 - push_from_node(&mut stack, &node).unwrap(); 231 - 
assert_eq!(stack.last(), None); 232 - } 233 - 234 - #[test] 235 - fn test_needs_from_node_just_left() { 236 - let node = Node { 237 - left: Some(cid1()), 238 - entries: vec![], 239 - }; 240 - let mut stack = vec![]; 241 - push_from_node(&mut stack, &node).unwrap(); 242 - assert_eq!(stack.last(), Some(Need::Node(cid1())).as_ref()); 243 - } 244 - 245 - // #[test] 246 - // fn test_needs_from_node_just_one_record() { 247 - // let node = Node { 248 - // left: None, 249 - // entries: vec![Entry { 250 - // keysuffix: "asdf".into(), 251 - // prefix_len: 0, 252 - // value: cid1(), 253 - // tree: None, 254 - // }], 255 - // }; 256 - // assert_eq!( 257 - // needs_from_node(node).unwrap(), 258 - // vec![Need::Record { 259 - // rkey: "asdf".into(), 260 - // cid: cid1(), 261 - // },] 262 - // ); 263 - // } 264 - 265 - // #[test] 266 - // fn test_needs_from_node_two_records() { 267 - // let node = Node { 268 - // left: None, 269 - // entries: vec![ 270 - // Entry { 271 - // keysuffix: "asdf".into(), 272 - // prefix_len: 0, 273 - // value: cid1(), 274 - // tree: None, 275 - // }, 276 - // Entry { 277 - // keysuffix: "gh".into(), 278 - // prefix_len: 2, 279 - // value: cid2(), 280 - // tree: None, 281 - // }, 282 - // ], 283 - // }; 284 - // assert_eq!( 285 - // needs_from_node(node).unwrap(), 286 - // vec![ 287 - // Need::Record { 288 - // rkey: "asdf".into(), 289 - // cid: cid1(), 290 - // }, 291 - // Need::Record { 292 - // rkey: "asgh".into(), 293 - // cid: cid2(), 294 - // }, 295 - // ] 296 - // ); 297 - // } 298 - 299 - // #[test] 300 - // fn test_needs_from_node_with_both() { 301 - // let node = Node { 302 - // left: None, 303 - // entries: vec![Entry { 304 - // keysuffix: "asdf".into(), 305 - // prefix_len: 0, 306 - // value: cid1(), 307 - // tree: Some(cid2()), 308 - // }], 309 - // }; 310 - // assert_eq!( 311 - // needs_from_node(node).unwrap(), 312 - // vec![ 313 - // Need::Record { 314 - // rkey: "asdf".into(), 315 - // cid: cid1(), 316 - // }, 317 - // 
Need::Node(cid2()), 318 - // ] 319 - // ); 320 - // } 321 - 322 - // #[test] 323 - // fn test_needs_from_node_left_and_record() { 324 - // let node = Node { 325 - // left: Some(cid1()), 326 - // entries: vec![Entry { 327 - // keysuffix: "asdf".into(), 328 - // prefix_len: 0, 329 - // value: cid2(), 330 - // tree: None, 331 - // }], 332 - // }; 333 - // assert_eq!( 334 - // needs_from_node(node).unwrap(), 335 - // vec![ 336 - // Need::Node(cid1()), 337 - // Need::Record { 338 - // rkey: "asdf".into(), 339 - // cid: cid2(), 340 - // }, 341 - // ] 342 - // ); 343 - // } 344 - 345 - // #[test] 346 - // fn test_needs_from_full_node() { 347 - // let node = Node { 348 - // left: Some(cid1()), 349 - // entries: vec![ 350 - // Entry { 351 - // keysuffix: "asdf".into(), 352 - // prefix_len: 0, 353 - // value: cid2(), 354 - // tree: Some(cid3()), 355 - // }, 356 - // Entry { 357 - // keysuffix: "ghi".into(), 358 - // prefix_len: 1, 359 - // value: cid4(), 360 - // tree: Some(cid5()), 361 - // }, 362 - // Entry { 363 - // keysuffix: "jkl".into(), 364 - // prefix_len: 2, 365 - // value: cid6(), 366 - // tree: Some(cid7()), 367 - // }, 368 - // Entry { 369 - // keysuffix: "mno".into(), 370 - // prefix_len: 4, 371 - // value: cid8(), 372 - // tree: Some(cid9()), 373 - // }, 374 - // ], 375 - // }; 376 - // assert_eq!( 377 - // needs_from_node(node).unwrap(), 378 - // vec![ 379 - // Need::Node(cid1()), 380 - // Need::Record { 381 - // rkey: "asdf".into(), 382 - // cid: cid2(), 383 - // }, 384 - // Need::Node(cid3()), 385 - // Need::Record { 386 - // rkey: "aghi".into(), 387 - // cid: cid4(), 388 - // }, 389 - // Need::Node(cid5()), 390 - // Need::Record { 391 - // rkey: "agjkl".into(), 392 - // cid: cid6(), 393 - // }, 394 - // Need::Node(cid7()), 395 - // Need::Record { 396 - // rkey: "agjkmno".into(), 397 - // cid: cid8(), 398 - // }, 399 - // Need::Node(cid9()), 400 - // ] 401 - // ); 402 - // } 403 - }

+508 -109

src/drive.rs

··· 1 - //! Consume an MST block stream, producing an ordered stream of records 1 + //! Consume a CAR from an AsyncRead, producing an ordered stream of records 2 2 3 - use futures::{Stream, TryStreamExt}; 3 + use crate::disk::{DiskError, DiskStore}; 4 + use crate::process::Processable; 4 5 use ipld_core::cid::Cid; 6 + use iroh_car::CarReader; 7 + use serde::{Deserialize, Serialize}; 5 8 use std::collections::HashMap; 6 - use std::error::Error; 9 + use std::convert::Infallible; 10 + use tokio::{io::AsyncRead, sync::mpsc}; 7 11 8 12 use crate::mst::{Commit, Node}; 9 - use crate::walk::{Step, Trip, Walker}; 13 + use crate::walk::{Step, WalkError, Walker}; 10 14 11 15 /// Errors that can happen while consuming and emitting blocks and records 12 16 #[derive(Debug, thiserror::Error)] 13 - pub enum DriveError<E: Error> { 14 - #[error("Failed to initialize CarReader: {0}")] 17 + pub enum DriveError { 18 + #[error("Error from iroh_car: {0}")] 15 19 CarReader(#[from] iroh_car::Error), 16 - #[error("Car block stream error: {0}")] 17 - CarBlockError(Box<dyn Error>), 18 20 #[error("Failed to decode commit block: {0}")] 19 - BadCommit(Box<dyn Error>), 21 + BadBlock(#[from] serde_ipld_dagcbor::DecodeError<Infallible>), 20 22 #[error("The Commit block reference by the root was not found")] 21 23 MissingCommit, 22 24 #[error("The MST block {0} could not be found")] 23 25 MissingBlock(Cid), 24 26 #[error("Failed to walk the mst tree: {0}")] 25 - Tripped(#[from] Trip<E>), 27 + WalkError(#[from] WalkError), 28 + #[error("CAR file had no roots")] 29 + MissingRoot, 30 + #[error("Storage error")] 31 + StorageError(#[from] DiskError), 32 + #[error("Encode error: {0}")] 33 + BincodeEncodeError(#[from] bincode::error::EncodeError), 34 + #[error("Tried to send on a closed channel")] 35 + ChannelSendError, // SendError takes <T> which we don't need 36 + #[error("Failed to join a task: {0}")] 37 + JoinError(#[from] tokio::task::JoinError), 26 38 } 27 39 28 - type CarBlock<E> = Result<(Cid, 
Vec<u8>), E>; 40 + #[derive(Debug, thiserror::Error)] 41 + pub enum DecodeError { 42 + #[error(transparent)] 43 + BincodeDecodeError(#[from] bincode::error::DecodeError), 44 + #[error("extra bytes remained after decoding")] 45 + ExtraGarbage, 46 + } 29 47 30 - #[derive(Debug)] 31 - pub enum MaybeProcessedBlock<T, E> { 48 + /// An in-order chunk of Rkey + (processed) Block pairs 49 + pub type BlockChunk<T> = Vec<(String, T)>; 50 + 51 + #[derive(Debug, Clone, Serialize, Deserialize)] 52 + pub(crate) enum MaybeProcessedBlock<T> { 32 53 /// A block that's *probably* a Node (but we can't know yet) 33 54 /// 34 55 /// It *can be* a record that suspiciously looks a lot like a node, so we ··· 50 71 /// There's an alternative here, which would be to kick unprocessable blocks 51 72 /// back to Raw, or maybe even a new RawUnprocessable variant. Then we could 52 73 /// surface the typed error later if needed by trying to reprocess. 53 - Processed(Result<T, E>), 74 + Processed(T), 75 + } 76 + 77 + impl<T: Processable> Processable for MaybeProcessedBlock<T> { 78 + /// TODO this is probably a little broken 79 + fn get_size(&self) -> usize { 80 + use std::{cmp::max, mem::size_of}; 81 + 82 + // enum is always as big as its biggest member? 
83 + let base_size = max(size_of::<Vec<u8>>(), size_of::<T>()); 84 + 85 + let extra = match self { 86 + Self::Raw(bytes) => bytes.len(), 87 + Self::Processed(t) => t.get_size(), 88 + }; 89 + 90 + base_size + extra 91 + } 54 92 } 55 93 56 - /// The core driver between the block stream and MST walker 57 - pub struct Vehicle<SE, S, T, P, PE> 58 - where 59 - S: Stream<Item = CarBlock<SE>>, 60 - P: Fn(&[u8]) -> Result<T, PE>, 61 - PE: Error, 62 - { 63 - block_stream: S, 64 - blocks: HashMap<Cid, MaybeProcessedBlock<T, PE>>, 65 - walker: Walker, 66 - process: P, 94 + impl<T> MaybeProcessedBlock<T> { 95 + fn maybe(process: fn(Vec<u8>) -> T, data: Vec<u8>) -> Self { 96 + if Node::could_be(&data) { 97 + MaybeProcessedBlock::Raw(data) 98 + } else { 99 + MaybeProcessedBlock::Processed(process(data)) 100 + } 101 + } 67 102 } 68 103 69 - impl<SE, S, T: Clone, P, PE> Vehicle<SE, S, T, P, PE> 70 - where 71 - SE: Error + 'static, 72 - S: Stream<Item = CarBlock<SE>> + Unpin, 73 - P: Fn(&[u8]) -> Result<T, PE>, 74 - PE: Error, 75 - { 76 - /// Set up the stream 104 + /// Read a CAR file, buffering blocks in memory or to disk 105 + pub enum Driver<R: AsyncRead + Unpin, T: Processable> { 106 + /// All blocks fit within the memory limit 107 + /// 108 + /// You probably want to check the commit's signature. You can go ahead and 109 + /// walk the MST right away. 110 + Memory(Commit, MemDriver<T>), 111 + /// Blocks exceed the memory limit 77 112 /// 78 - /// This will eagerly consume blocks until the `Commit` object is found. 79 - /// *Usually* the it's the first block, but there is no guarantee. 113 + /// You'll need to provide a disk storage to continue. The commit will be 114 + /// returned and can be validated only once all blocks are loaded. 
115 + Disk(NeedDisk<R, T>), 116 + } 117 + 118 + /// Builder-style driver setup 119 + #[derive(Debug, Clone)] 120 + pub struct DriverBuilder { 121 + pub mem_limit_mb: usize, 122 + } 123 + 124 + impl Default for DriverBuilder { 125 + fn default() -> Self { 126 + Self { mem_limit_mb: 16 } 127 + } 128 + } 129 + 130 + impl DriverBuilder { 131 + /// Begin configuring the driver with defaults 132 + pub fn new() -> Self { 133 + Default::default() 134 + } 135 + /// Set the in-memory size limit, in MiB 80 136 /// 81 - /// ### Parameters 137 + /// Default: 16 MiB 138 + pub fn with_mem_limit_mb(self, new_limit: usize) -> Self { 139 + Self { 140 + mem_limit_mb: new_limit, 141 + } 142 + } 143 + /// Set the block processor 82 144 /// 83 - /// `root`: CID of the commit object that is the root of the MST 145 + /// Default: noop, raw blocks will be emitted 146 + pub fn with_block_processor<T: Processable>( 147 + self, 148 + p: fn(Vec<u8>) -> T, 149 + ) -> DriverBuilderWithProcessor<T> { 150 + DriverBuilderWithProcessor { 151 + mem_limit_mb: self.mem_limit_mb, 152 + block_processor: p, 153 + } 154 + } 155 + /// Begin processing an atproto MST from a CAR file 156 + pub async fn load_car<R: AsyncRead + Unpin>( 157 + &self, 158 + reader: R, 159 + ) -> Result<Driver<R, Vec<u8>>, DriveError> { 160 + Driver::load_car(reader, crate::process::noop, self.mem_limit_mb).await 161 + } 162 + } 163 + 164 + /// Builder-style driver intermediate step 165 + /// 166 + /// start from `DriverBuilder` 167 + #[derive(Debug, Clone)] 168 + pub struct DriverBuilderWithProcessor<T: Processable> { 169 + pub mem_limit_mb: usize, 170 + pub block_processor: fn(Vec<u8>) -> T, 171 + } 172 + 173 + impl<T: Processable> DriverBuilderWithProcessor<T> { 174 + /// Set the in-memory size limit, in MiB 84 175 /// 85 - /// `block_stream`: Input stream of raw CAR blocks 176 + /// Default: 16 MiB 177 + pub fn with_mem_limit_mb(mut self, new_limit: usize) -> Self { 178 + self.mem_limit_mb = new_limit; 179 + self 180 + } 181 + 
/// Begin processing an atproto MST from a CAR file 182 + pub async fn load_car<R: AsyncRead + Unpin>( 183 + &self, 184 + reader: R, 185 + ) -> Result<Driver<R, T>, DriveError> { 186 + Driver::load_car(reader, self.block_processor, self.mem_limit_mb).await 187 + } 188 + } 189 + 190 + impl<R: AsyncRead + Unpin, T: Processable> Driver<R, T> { 191 + /// Begin processing an atproto MST from a CAR file 86 192 /// 87 - /// `process`: record-transforming callback: 193 + /// Blocks will be loaded, processed, and buffered in memory. If the entire 194 + /// processed size is under the `mem_limit_mb` limit, a `Driver::Memory` 195 + /// will be returned along with a `Commit` ready for validation. 88 196 /// 89 - /// For tasks where records can be quickly processed into a *smaller* 90 - /// useful representation, you can do that eagerly as blocks come in by 91 - /// passing the processor as a callback here. This can reduce overall 92 - /// memory usage. 93 - pub async fn init( 94 - root: Cid, 95 - mut block_stream: S, 96 - process: P, 97 - ) -> Result<(Commit, Self), DriveError<PE>> { 98 - let mut blocks = HashMap::new(); 197 + /// If the `mem_limit_mb` limit is reached before loading all blocks, the 198 + /// partial state will be returned as `Driver::Disk(needed)`, which can be 199 + /// resumed by providing a `DiskStore` for on-disk block storage. 
200 + pub async fn load_car( 201 + reader: R, 202 + process: fn(Vec<u8>) -> T, 203 + mem_limit_mb: usize, 204 + ) -> Result<Driver<R, T>, DriveError> { 205 + let max_size = mem_limit_mb * 2_usize.pow(20); 206 + let mut mem_blocks = HashMap::new(); 207 + 208 + let mut car = CarReader::new(reader).await?; 209 + 210 + let root = *car 211 + .header() 212 + .roots() 213 + .first() 214 + .ok_or(DriveError::MissingRoot)?; 215 + log::debug!("root: {root:?}"); 99 216 100 217 let mut commit = None; 101 218 102 - while let Some((cid, data)) = block_stream 103 - .try_next() 104 - .await 105 - .map_err(|e| DriveError::CarBlockError(e.into()))? 106 - { 219 + // try to load all the blocks into memory 220 + let mut mem_size = 0; 221 + while let Some((cid, data)) = car.next_block().await? { 222 + // the root commit is a Special Third Kind of block that we need to make 223 + // sure not to optimistically send to the processing function 107 224 if cid == root { 108 - let c: Commit = serde_ipld_dagcbor::from_slice(&data) 109 - .map_err(|e| DriveError::BadCommit(e.into()))?; 225 + let c: Commit = serde_ipld_dagcbor::from_slice(&data)?; 110 226 commit = Some(c); 111 - break; 112 - } else { 113 - blocks.insert( 114 - cid, 115 - if Node::could_be(&data) { 116 - MaybeProcessedBlock::Raw(data) 117 - } else { 118 - MaybeProcessedBlock::Processed(process(&data)) 119 - }, 120 - ); 227 + continue; 228 + } 229 + 230 + // remaining possible types: node, record, other. 
optimistically process 231 + let maybe_processed = MaybeProcessedBlock::maybe(process, data); 232 + 233 + // stash (maybe processed) blocks in memory as long as we have room 234 + mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size(); 235 + mem_blocks.insert(cid, maybe_processed); 236 + if mem_size >= max_size { 237 + return Ok(Driver::Disk(NeedDisk { 238 + car, 239 + root, 240 + process, 241 + max_size, 242 + mem_blocks, 243 + commit, 244 + })); 121 245 } 122 246 } 123 247 124 - // we either broke out or read all the blocks without finding the commit... 248 + // all blocks loaded and we fit in memory! hopefully we found the commit... 125 249 let commit = commit.ok_or(DriveError::MissingCommit)?; 126 250 127 251 let walker = Walker::new(commit.data); 128 252 129 - let me = Self { 130 - block_stream, 131 - blocks, 132 - walker, 133 - process, 134 - }; 135 - Ok((commit, me)) 253 + Ok(Driver::Memory( 254 + commit, 255 + MemDriver { 256 + blocks: mem_blocks, 257 + walker, 258 + process, 259 + }, 260 + )) 261 + } 262 + } 263 + 264 + /// The core driver between the block stream and MST walker 265 + /// 266 + /// In the future, PDSs will export CARs in a stream-friendly order that will 267 + /// enable processing them with tiny memory overhead. But that future is not 268 + /// here yet. 269 + /// 270 + /// CARs are almost always in a stream-unfriendly order, so I'm reverting the 271 + /// optimistic stream features: we load all blocks first, then walk the MST. 272 + /// 273 + /// This makes things much simpler: we only need to worry about spilling to disk 274 + /// in one place, and we always have a reasonable expectation about how much 275 + /// work the init function will do. We can drop the CAR reader before walking, 276 + /// so the sync/async boundaries become a little easier to work around. 
277 + #[derive(Debug)] 278 + pub struct MemDriver<T: Processable> { 279 + blocks: HashMap<Cid, MaybeProcessedBlock<T>>, 280 + walker: Walker, 281 + process: fn(Vec<u8>) -> T, 282 + } 283 + 284 + impl<T: Processable> MemDriver<T> { 285 + /// Step through the record outputs, in rkey order 286 + pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> { 287 + let mut out = Vec::with_capacity(n); 288 + for _ in 0..n { 289 + // walk as far as we can until we run out of blocks or find a record 290 + match self.walker.step(&mut self.blocks, self.process)? { 291 + Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)), 292 + Step::Finish => break, 293 + Step::Found { rkey, data } => { 294 + out.push((rkey, data)); 295 + continue; 296 + } 297 + }; 298 + } 299 + 300 + if out.is_empty() { 301 + Ok(None) 302 + } else { 303 + Ok(Some(out)) 304 + } 305 + } 306 + } 307 + 308 + /// A partially memory-loaded car file that needs disk spillover to continue 309 + pub struct NeedDisk<R: AsyncRead + Unpin, T: Processable> { 310 + car: CarReader<R>, 311 + root: Cid, 312 + process: fn(Vec<u8>) -> T, 313 + max_size: usize, 314 + mem_blocks: HashMap<Cid, MaybeProcessedBlock<T>>, 315 + pub commit: Option<Commit>, 316 + } 317 + 318 + fn encode(v: impl Serialize) -> Result<Vec<u8>, bincode::error::EncodeError> { 319 + bincode::serde::encode_to_vec(v, bincode::config::standard()) 320 + } 321 + 322 + pub(crate) fn decode<T: Processable>(bytes: &[u8]) -> Result<T, DecodeError> { 323 + let (t, n) = bincode::serde::decode_from_slice(bytes, bincode::config::standard())?; 324 + if n != bytes.len() { 325 + return Err(DecodeError::ExtraGarbage); 136 326 } 327 + Ok(t) 328 + } 137 329 138 - async fn drive_until(&mut self, cid_needed: Cid) -> Result<(), DriveError<PE>> { 139 - while let Some((cid, data)) = self 140 - .block_stream 141 - .try_next() 142 - .await 143 - .map_err(|e| DriveError::CarBlockError(e.into()))? 
144 - { 145 - self.blocks.insert( 146 - cid, 147 - if Node::could_be(&data) { 148 - MaybeProcessedBlock::Raw(data) 149 - } else { 150 - MaybeProcessedBlock::Processed((self.process)(&data)) 151 - }, 152 - ); 153 - if cid == cid_needed { 154 - return Ok(()); 330 + impl<R: AsyncRead + Unpin, T: Processable + Send + 'static> NeedDisk<R, T> { 331 + pub async fn finish_loading( 332 + mut self, 333 + mut store: DiskStore, 334 + ) -> Result<(Commit, DiskDriver<T>), DriveError> { 335 + // move store in and back out so we can manage lifetimes 336 + // dump mem blocks into the store 337 + store = tokio::task::spawn(async move { 338 + let kvs = self 339 + .mem_blocks 340 + .into_iter() 341 + .map(|(k, v)| Ok(encode(v).map(|v| (k.to_bytes(), v))?)); 342 + 343 + store.put_many(kvs)?; 344 + Ok::<_, DriveError>(store) 345 + }) 346 + .await??; 347 + 348 + let (tx, mut rx) = mpsc::channel::<Vec<(Cid, MaybeProcessedBlock<T>)>>(1); 349 + 350 + let store_worker = tokio::task::spawn_blocking(move || { 351 + while let Some(chunk) = rx.blocking_recv() { 352 + let kvs = chunk 353 + .into_iter() 354 + .map(|(k, v)| Ok(encode(v).map(|v| (k.to_bytes(), v))?)); 355 + store.put_many(kvs)?; 155 356 } 357 + Ok::<_, DriveError>(store) 358 + }); // await later 359 + 360 + // dump the rest to disk (in chunks) 361 + log::debug!("dumping the rest of the stream..."); 362 + loop { 363 + let mut mem_size = 0; 364 + let mut chunk = vec![]; 365 + loop { 366 + let Some((cid, data)) = self.car.next_block().await? else { 367 + break; 368 + }; 369 + // we still gotta keep checking for the root since we might not have it 370 + if cid == self.root { 371 + let c: Commit = serde_ipld_dagcbor::from_slice(&data)?; 372 + self.commit = Some(c); 373 + continue; 374 + } 375 + // remaining possible types: node, record, other. 
optimistically process 376 + // TODO: get the actual in-memory size to compute disk spill 377 + let maybe_processed = MaybeProcessedBlock::maybe(self.process, data); 378 + mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size(); 379 + chunk.push((cid, maybe_processed)); 380 + if mem_size >= self.max_size { 381 + // soooooo if we're setting the db cache to max_size and then letting 382 + // multiple chunks in the queue that are >= max_size, then at any time 383 + // we might be using some multiple of max_size? 384 + break; 385 + } 386 + } 387 + if chunk.is_empty() { 388 + break; 389 + } 390 + tx.send(chunk) 391 + .await 392 + .map_err(|_| DriveError::ChannelSendError)?; 156 393 } 394 + drop(tx); 395 + log::debug!("done. waiting for worker to finish..."); 396 + 397 + store = store_worker.await??; 398 + 399 + log::debug!("worker finished."); 157 400 158 - // if we never found the block 159 - Err(DriveError::MissingBlock(cid_needed)) 401 + let commit = self.commit.ok_or(DriveError::MissingCommit)?; 402 + 403 + let walker = Walker::new(commit.data); 404 + 405 + Ok(( 406 + commit, 407 + DiskDriver { 408 + process: self.process, 409 + state: Some(BigState { store, walker }), 410 + }, 411 + )) 160 412 } 413 + } 161 414 162 - /// Manually step through the record outputs 163 - pub async fn next_record(&mut self) -> Result<Option<(String, T)>, DriveError<PE>> { 415 + struct BigState { 416 + store: DiskStore, 417 + walker: Walker, 418 + } 419 + 420 + /// MST walker that reads from disk instead of an in-memory hashmap 421 + pub struct DiskDriver<T: Clone> { 422 + process: fn(Vec<u8>) -> T, 423 + state: Option<BigState>, 424 + } 425 + 426 + // for doctests only 427 + #[doc(hidden)] 428 + pub fn _get_fake_disk_driver() -> DiskDriver<Vec<u8>> { 429 + use crate::process::noop; 430 + DiskDriver { 431 + process: noop, 432 + state: None, 433 + } 434 + } 435 + 436 + impl<T: Processable + Send + 'static> DiskDriver<T> { 437 + /// Walk the MST returning up to `n` rkey + 
record pairs 438 + /// 439 + /// ```no_run 440 + /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop}; 441 + /// # #[tokio::main] 442 + /// # async fn main() -> Result<(), DriveError> { 443 + /// # let mut disk_driver = _get_fake_disk_driver(); 444 + /// while let Some(pairs) = disk_driver.next_chunk(256).await? { 445 + /// for (rkey, record) in pairs { 446 + /// println!("{rkey}: size={}", record.len()); 447 + /// } 448 + /// } 449 + /// # Ok(()) 450 + /// # } 451 + /// ``` 452 + pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> { 453 + let process = self.process; 454 + 455 + // state should only *ever* be None transiently while inside here 456 + let mut state = self.state.take().expect("DiskDriver must have Some(state)"); 457 + 458 + // the big pain here is that we don't want to leave self.state in an 459 + // invalid state (None), so all the error paths have to make sure it 460 + // comes out again. 461 + let (state, res) = tokio::task::spawn_blocking( 462 + move || -> (BigState, Result<BlockChunk<T>, DriveError>) { 463 + let mut out = Vec::with_capacity(n); 464 + 465 + for _ in 0..n { 466 + // walk as far as we can until we run out of blocks or find a record 467 + let step = match state.walker.disk_step(&mut state.store, process) { 468 + Ok(s) => s, 469 + Err(e) => { 470 + return (state, Err(e.into())); 471 + } 472 + }; 473 + match step { 474 + Step::Missing(cid) => { 475 + return (state, Err(DriveError::MissingBlock(cid))); 476 + } 477 + Step::Finish => break, 478 + Step::Found { rkey, data } => out.push((rkey, data)), 479 + }; 480 + } 481 + 482 + (state, Ok::<_, DriveError>(out)) 483 + }, 484 + ) 485 + .await?; // on tokio JoinError, we'll be left with invalid state :( 486 + 487 + // *must* restore state before dealing with the actual result 488 + self.state = Some(state); 489 + 490 + let out = res?; 491 + 492 + if out.is_empty() { 493 + Ok(None) 494 + } else { 495 + 
Ok(Some(out)) 496 + } 497 + } 498 + 499 + fn read_tx_blocking( 500 + &mut self, 501 + n: usize, 502 + tx: mpsc::Sender<Result<BlockChunk<T>, DriveError>>, 503 + ) -> Result<(), mpsc::error::SendError<Result<BlockChunk<T>, DriveError>>> { 504 + let BigState { store, walker } = self.state.as_mut().expect("valid state"); 505 + 164 506 loop { 165 - // walk as far as we can until we run out of blocks or find a record 166 - let cid_needed = match self.walker.step(&mut self.blocks, &self.process)? { 167 - Step::Rest(cid) => cid, 168 - Step::Finish => return Ok(None), 169 - Step::Step { rkey, data } => return Ok(Some((rkey, data))), 170 - }; 507 + let mut out: BlockChunk<T> = Vec::with_capacity(n); 171 508 172 - // load blocks until we reach that cid 173 - self.drive_until(cid_needed).await?; 509 + for _ in 0..n { 510 + // walk as far as we can until we run out of blocks or find a record 511 + 512 + let step = match walker.disk_step(store, self.process) { 513 + Ok(s) => s, 514 + Err(e) => return tx.blocking_send(Err(e.into())), 515 + }; 516 + 517 + match step { 518 + Step::Missing(cid) => { 519 + return tx.blocking_send(Err(DriveError::MissingBlock(cid))); 520 + } 521 + Step::Finish => return Ok(()), 522 + Step::Found { rkey, data } => { 523 + out.push((rkey, data)); 524 + continue; 525 + } 526 + }; 527 + } 528 + 529 + if out.is_empty() { 530 + break; 531 + } 532 + tx.blocking_send(Ok(out))?; 174 533 } 534 + 535 + Ok(()) 175 536 } 176 537 177 - /// Convert to a futures::stream of record outputs 178 - pub fn stream(self) -> impl Stream<Item = Result<(String, T), DriveError<PE>>> { 179 - futures::stream::try_unfold(self, |mut this| async move { 180 - let maybe_record = this.next_record().await?; 181 - Ok(maybe_record.map(|b| (b, this))) 182 - }) 538 + /// Spawn the disk reading task into a tokio blocking thread 539 + /// 540 + /// The idea is to avoid so much sending back and forth to the blocking 541 + /// thread, letting a blocking task do all the disk reading work and 
sending 542 + /// records and rkeys back through an `mpsc` channel instead. 543 + /// 544 + /// This might also allow the disk work to continue while processing the 545 + /// records. It's still not yet clear if this method actually has much 546 + /// benefit over just using `.next_chunk(n)`. 547 + /// 548 + /// ```no_run 549 + /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop}; 550 + /// # #[tokio::main] 551 + /// # async fn main() -> Result<(), DriveError> { 552 + /// # let mut disk_driver = _get_fake_disk_driver(); 553 + /// let (mut rx, join) = disk_driver.to_channel(512); 554 + /// while let Some(recvd) = rx.recv().await { 555 + /// let pairs = recvd?; 556 + /// for (rkey, record) in pairs { 557 + /// println!("{rkey}: size={}", record.len()); 558 + /// } 559 + /// 560 + /// } 561 + /// # Ok(()) 562 + /// # } 563 + /// ``` 564 + pub fn to_channel( 565 + mut self, 566 + n: usize, 567 + ) -> ( 568 + mpsc::Receiver<Result<BlockChunk<T>, DriveError>>, 569 + tokio::task::JoinHandle<Self>, 570 + ) { 571 + let (tx, rx) = mpsc::channel::<Result<BlockChunk<T>, DriveError>>(1); 572 + 573 + // sketch: this worker is going to be allowed to execute without a join handle 574 + let chan_task = tokio::task::spawn_blocking(move || { 575 + if let Err(mpsc::error::SendError(_)) = self.read_tx_blocking(n, tx) { 576 + log::debug!("big car reader exited early due to dropped receiver channel"); 577 + } 578 + self 579 + }); 580 + 581 + (rx, chan_task) 183 582 } 184 583 }

+82 -9

src/lib.rs

··· 1 - //! Fast and robust atproto CAR file processing in rust 2 - //! 3 - //! For now see the [examples](https://tangled.org/@microcosm.blue/repo-stream/tree/main/examples) 1 + /*! 2 + A robust CAR file -> MST walker for atproto 3 + 4 + Small CARs have their blocks buffered in memory. If a configurable memory limit 5 + is reached while reading blocks, CAR reading is suspended, and can be continued 6 + by providing disk storage to buffer the CAR blocks instead. 7 + 8 + A `process` function can be provided for tasks where records are transformed 9 + into a smaller representation, to save memory (and disk) during block reading. 10 + 11 + Once blocks are loaded, the MST is walked and emitted as chunks of 12 + `(rkey, processed_block)` pairs, in order (depth first, left-to-right). 13 + 14 + Some MST validations are applied: 15 + - Keys must appear in order 16 + - Keys must be at the correct MST tree depth 17 + 18 + `iroh_car` additionally applies a block size limit of `2MiB`. 19 + 20 + ``` 21 + use repo_stream::{Driver, DriverBuilder, DiskBuilder}; 22 + 23 + # #[tokio::main] 24 + # async fn main() -> Result<(), Box<dyn std::error::Error>> { 25 + # let reader = include_bytes!("../car-samples/tiny.car").as_slice(); 26 + let mut total_size = 0; 27 + 28 + match DriverBuilder::new() 29 + .with_mem_limit_mb(10) 30 + .with_block_processor(|rec| rec.len()) // block processing: just extract the raw record size 31 + .load_car(reader) 32 + .await? 33 + { 34 + 35 + // if all blocks fit within memory 36 + Driver::Memory(_commit, mut driver) => { 37 + while let Some(chunk) = driver.next_chunk(256).await? 
{ 38 + for (_rkey, size) in chunk { 39 + total_size += size; 40 + } 41 + } 42 + }, 43 + 44 + // if the CAR was too big for in-memory processing 45 + Driver::Disk(paused) => { 46 + // set up a disk store we can spill to 47 + let store = DiskBuilder::new().open("some/path.db".into()).await?; 48 + // do the spilling, get back a (similar) driver 49 + let (_commit, mut driver) = paused.finish_loading(store).await?; 50 + 51 + while let Some(chunk) = driver.next_chunk(256).await? { 52 + for (_rkey, size) in chunk { 53 + total_size += size; 54 + } 55 + } 56 + } 57 + }; 58 + println!("sum of size of all records: {total_size}"); 59 + # Ok(()) 60 + # } 61 + ``` 62 + 63 + Disk spilling suspends and returns a `Driver::Disk(paused)` instead of going 64 + ahead and eagerly using disk I/O. This means you have to write a bit more code 65 + to handle both cases, but it allows you to have finer control over resource 66 + usage. For example, you can drive a number of parallel memory CAR workers, and 67 + separately have a different number of disk workers picking up suspended disk 68 + tasks from a queue. 69 + 70 + Find more [examples in the repo](https://tangled.org/@microcosm.blue/repo-stream/tree/main/examples). 71 + 72 + */ 4 73 5 - pub mod disk_drive; 6 - pub mod disk_redb; 7 - pub mod disk_sqlite; 8 - pub mod disk_walk; 9 - pub mod drive; 10 74 pub mod mst; 11 - pub mod walk; 75 + mod walk; 76 + 77 + pub mod disk; 78 + pub mod drive; 79 + pub mod process; 80 + 81 + pub use disk::{DiskBuilder, DiskError, DiskStore}; 82 + pub use drive::{DriveError, Driver, DriverBuilder, NeedDisk}; 83 + pub use mst::Commit; 84 + pub use process::Processable;

+4 -8

src/mst.rs

··· 39 39 /// MST node data schema 40 40 #[derive(Debug, Deserialize, PartialEq)] 41 41 #[serde(deny_unknown_fields)] 42 - pub struct Node { 42 + pub(crate) struct Node { 43 43 /// link to sub-tree Node on a lower level and with all keys sorting before 44 44 /// keys at this node 45 45 #[serde(rename = "l")] ··· 62 62 /// so if a block *could be* a node, any record converter must postpone 63 63 /// processing. if it turns out it happens to be a very node-looking record, 64 64 /// well, sorry, it just has to only be processed later when that's known. 65 - pub fn could_be(bytes: impl AsRef<[u8]>) -> bool { 65 + pub(crate) fn could_be(bytes: impl AsRef<[u8]>) -> bool { 66 66 const NODE_FINGERPRINT: [u8; 3] = [ 67 67 0xA2, // map length 2 (for "l" and "e" keys) 68 68 0x61, // text length 1 ··· 83 83 /// with an empty array of entries. This is the only situation in which a 84 84 /// tree may contain an empty leaf node which does not either contain keys 85 85 /// ("entries") or point to a sub-tree containing entries. 86 - /// 87 - /// TODO: to me this is slightly unclear with respect to `l` (ask someone). 88 - /// ...is that what "The top of the tree must not be a an empty node which 89 - /// only points to a sub-tree." is referring to? 90 - pub fn is_empty(&self) -> bool { 86 + pub(crate) fn is_empty(&self) -> bool { 91 87 self.left.is_none() && self.entries.is_empty() 92 88 } 93 89 } ··· 95 91 /// TreeEntry object 96 92 #[derive(Debug, Deserialize, PartialEq)] 97 93 #[serde(deny_unknown_fields)] 98 - pub struct Entry { 94 + pub(crate) struct Entry { 99 95 /// count of bytes shared with previous TreeEntry in this Node (if any) 100 96 #[serde(rename = "p")] 101 97 pub prefix_len: usize,

+108

src/process.rs

··· 1 + /*! 2 + Record processor function output trait 3 + 4 + The return type must satisfy the `Processable` trait, which requires: 5 + 6 + - `Clone` because two rkeys can refer to the same record by CID, which may 7 + only appear once in the CAR file. 8 + - `Serialize + DeserializeOwned` so it can be spilled to disk. 9 + 10 + One required function must be implemented, `get_size()`: this should return the 11 + approximate total off-stack size of the type. (the on-stack size will be added 12 + automatically via `std::mem::size_of`). 13 + 14 + Note that it is **not guaranteed** that the `process` function will run on a 15 + block before storing it in memory or on disk: it's not possible to know if a 16 + block is a record without actually walking the MST, so the best we can do is 17 + apply `process` to any block that we know *cannot* be an MST node, and otherwise 18 + store the raw block bytes. 19 + 20 + Here's a silly processing function that just collects 'eyy's found in the raw 21 + record bytes: 22 + 23 + ``` 24 + # use repo_stream::Processable; 25 + # use serde::{Serialize, Deserialize}; 26 + #[derive(Debug, Clone, Serialize, Deserialize)] 27 + struct Eyy(usize, String); 28 + 29 + impl Processable for Eyy { 30 + fn get_size(&self) -> usize { 31 + // don't need to compute the usize, it's on the stack 32 + self.1.capacity() // in-mem size from the string's capacity, in bytes 33 + } 34 + } 35 + 36 + fn process(raw: Vec<u8>) -> Vec<Eyy> { 37 + let mut out = Vec::new(); 38 + let to_find = "eyy".as_bytes(); 39 + for i in 0..(raw.len() - 3) { 40 + if &raw[i..(i+3)] == to_find { 41 + out.push(Eyy(i, "eyy".to_string())); 42 + } 43 + } 44 + out 45 + } 46 + ``` 47 + 48 + The memory sizing stuff is a little sketch but probably at least approximately 49 + works. 
50 + */ 51 + 52 + use serde::{Serialize, de::DeserializeOwned}; 53 + 54 + /// Output trait for record processing 55 + pub trait Processable: Clone + Serialize + DeserializeOwned { 56 + /// Any additional in-memory size taken by the processed type 57 + /// 58 + /// Do not include stack size (`std::mem::size_of`) 59 + fn get_size(&self) -> usize; 60 + } 61 + 62 + /// Processor that just returns the raw blocks 63 + #[inline] 64 + pub fn noop(block: Vec<u8>) -> Vec<u8> { 65 + block 66 + } 67 + 68 + impl Processable for u8 { 69 + fn get_size(&self) -> usize { 70 + 0 71 + } 72 + } 73 + 74 + impl Processable for usize { 75 + fn get_size(&self) -> usize { 76 + 0 // no additional space taken, just its stack size (newtype is free) 77 + } 78 + } 79 + 80 + impl Processable for String { 81 + fn get_size(&self) -> usize { 82 + self.capacity() 83 + } 84 + } 85 + 86 + impl<Item: Sized + Processable> Processable for Vec<Item> { 87 + fn get_size(&self) -> usize { 88 + let slot_size = std::mem::size_of::<Item>(); 89 + let direct_size = slot_size * self.capacity(); 90 + let items_referenced_size: usize = self.iter().map(|item| item.get_size()).sum(); 91 + direct_size + items_referenced_size 92 + } 93 + } 94 + 95 + impl<Item: Processable> Processable for Option<Item> { 96 + fn get_size(&self) -> usize { 97 + self.as_ref().map(|item| item.get_size()).unwrap_or(0) 98 + } 99 + } 100 + 101 + impl<Item: Processable, Error: Processable> Processable for Result<Item, Error> { 102 + fn get_size(&self) -> usize { 103 + match self { 104 + Ok(item) => item.get_size(), 105 + Err(err) => err.get_size(), 106 + } 107 + } 108 + }

+260 -259

src/walk.rs

··· 1 1 //! Depth-first MST traversal 2 2 3 - use crate::drive::MaybeProcessedBlock; 3 + use crate::disk::DiskStore; 4 + use crate::drive::{DecodeError, MaybeProcessedBlock}; 4 5 use crate::mst::Node; 6 + use crate::process::Processable; 5 7 use ipld_core::cid::Cid; 8 + use sha2::{Digest, Sha256}; 6 9 use std::collections::HashMap; 7 - use std::error::Error; 10 + use std::convert::Infallible; 8 11 9 12 /// Errors that can happen while walking 10 13 #[derive(Debug, thiserror::Error)] 11 - pub enum Trip<E: Error> { 12 - #[error("empty mst nodes are not allowed")] 13 - NodeEmpty, 14 + pub enum WalkError { 15 + #[error("Failed to fingerprint commit block")] 16 + BadCommitFingerprint, 14 17 #[error("Failed to decode commit block: {0}")] 15 - BadCommit(Box<dyn std::error::Error>), 18 + BadCommit(#[from] serde_ipld_dagcbor::DecodeError<Infallible>), 16 19 #[error("Action node error: {0}")] 17 - RkeyError(#[from] RkeyError), 18 - #[error("Process failed: {0}")] 19 - ProcessFailed(E), 20 - #[error("Encountered an rkey out of order while walking the MST")] 21 - RkeyOutOfOrder, 20 + MstError(#[from] MstError), 21 + #[error("storage error: {0}")] 22 + StorageError(#[from] fjall::Error), 23 + #[error("Decode error: {0}")] 24 + DecodeError(#[from] DecodeError), 22 25 } 23 26 24 27 /// Errors from invalid Rkeys 25 - #[derive(Debug, thiserror::Error)] 26 - pub enum RkeyError { 28 + #[derive(Debug, PartialEq, thiserror::Error)] 29 + pub enum MstError { 27 30 #[error("Failed to compute an rkey due to invalid prefix_len")] 28 31 EntryPrefixOutOfbounds, 29 32 #[error("RKey was not utf-8")] 30 33 EntryRkeyNotUtf8(#[from] std::string::FromUtf8Error), 34 + #[error("Nodes cannot be empty (except for an entirely empty MST)")] 35 + EmptyNode, 36 + #[error("Found an entry with rkey at the wrong depth")] 37 + WrongDepth, 38 + #[error("Lost track of our depth (possible bug?)")] 39 + LostDepth, 40 + #[error("MST depth underflow: depth-0 node with child trees")] 41 + DepthUnderflow, 42 + 
#[error("Encountered an rkey out of order while walking the MST")] 43 + RkeyOutOfOrder, 31 44 } 32 45 33 46 /// Walker outputs 34 47 #[derive(Debug)] 35 48 pub enum Step<T> { 36 - /// We need a CID but it's not in the block store 37 - /// 38 - /// Give the needed CID to the driver so it can load blocks until it's found 39 - Rest(Cid), 49 + /// We needed this CID but it's not in the block store 50 + Missing(Cid), 40 51 /// Reached the end of the MST! yay! 41 52 Finish, 42 53 /// A record was found! 43 - Step { rkey: String, data: T }, 54 + Found { rkey: String, data: T }, 44 55 } 45 56 46 57 #[derive(Debug, Clone, PartialEq)] 47 58 enum Need { 48 - Node(Cid), 59 + Node { depth: Depth, cid: Cid }, 49 60 Record { rkey: String, cid: Cid }, 50 61 } 51 62 52 - fn push_from_node(stack: &mut Vec<Need>, node: &Node) -> Result<(), RkeyError> { 53 - let mut entries = Vec::with_capacity(node.entries.len()); 63 + #[derive(Debug, Clone, Copy, PartialEq)] 64 + enum Depth { 65 + Root, 66 + Depth(u32), 67 + } 54 68 69 + impl Depth { 70 + fn from_key(key: &[u8]) -> Self { 71 + let mut zeros = 0; 72 + for byte in Sha256::digest(key) { 73 + let leading = byte.leading_zeros(); 74 + zeros += leading; 75 + if leading < 8 { 76 + break; 77 + } 78 + } 79 + Self::Depth(zeros / 2) // truncating divide (rounds down) 80 + } 81 + fn next_expected(&self) -> Result<Option<u32>, MstError> { 82 + match self { 83 + Self::Root => Ok(None), 84 + Self::Depth(d) => d.checked_sub(1).ok_or(MstError::DepthUnderflow).map(Some), 85 + } 86 + } 87 + } 88 + 89 + fn push_from_node(stack: &mut Vec<Need>, node: &Node, parent_depth: Depth) -> Result<(), MstError> { 90 + // empty nodes are not allowed in the MST except in an empty MST 91 + if node.is_empty() { 92 + if parent_depth == Depth::Root { 93 + return Ok(()); // empty mst, nothing to push 94 + } else { 95 + return Err(MstError::EmptyNode); 96 + } 97 + } 98 + 99 + let mut entries = Vec::with_capacity(node.entries.len()); 55 100 let mut prefix = vec![]; 101 + 
let mut this_depth = parent_depth.next_expected()?; 102 + 56 103 for entry in &node.entries { 57 104 let mut rkey = vec![]; 58 105 let pre_checked = prefix 59 106 .get(..entry.prefix_len) 60 - .ok_or(RkeyError::EntryPrefixOutOfbounds)?; 107 + .ok_or(MstError::EntryPrefixOutOfbounds)?; 61 108 rkey.extend_from_slice(pre_checked); 62 109 rkey.extend_from_slice(&entry.keysuffix); 110 + 111 + let Depth::Depth(key_depth) = Depth::from_key(&rkey) else { 112 + return Err(MstError::WrongDepth); 113 + }; 114 + 115 + // this_depth is `none` if we are the deepest child (directly below root) 116 + // in that case we accept whatever highest depth is claimed 117 + let expected_depth = match this_depth { 118 + Some(d) => d, 119 + None => { 120 + this_depth = Some(key_depth); 121 + key_depth 122 + } 123 + }; 124 + 125 + // all keys we find should be this depth 126 + if key_depth != expected_depth { 127 + return Err(MstError::DepthUnderflow); 128 + } 129 + 63 130 prefix = rkey.clone(); 64 131 65 132 entries.push(Need::Record { ··· 67 134 cid: entry.value, 68 135 }); 69 136 if let Some(ref tree) = entry.tree { 70 - entries.push(Need::Node(*tree)); 137 + entries.push(Need::Node { 138 + depth: Depth::Depth(key_depth), 139 + cid: *tree, 140 + }); 71 141 } 72 142 } 73 143 74 144 entries.reverse(); 75 145 stack.append(&mut entries); 146 + 147 + let d = this_depth.ok_or(MstError::LostDepth)?; 76 148 77 149 if let Some(tree) = node.left { 78 - stack.push(Need::Node(tree)); 150 + stack.push(Need::Node { 151 + depth: Depth::Depth(d), 152 + cid: tree, 153 + }); 79 154 } 80 155 Ok(()) 81 156 } ··· 92 167 impl Walker { 93 168 pub fn new(tree_root_cid: Cid) -> Self { 94 169 Self { 95 - stack: vec![Need::Node(tree_root_cid)], 170 + stack: vec![Need::Node { 171 + depth: Depth::Root, 172 + cid: tree_root_cid, 173 + }], 96 174 prev: "".to_string(), 97 175 } 98 176 } 99 177 100 178 /// Advance through nodes until we find a record or can't go further 101 - pub fn step<T: Clone, E: Error>( 179 + pub fn 
step<T: Processable>( 102 180 &mut self, 103 - blocks: &mut HashMap<Cid, MaybeProcessedBlock<T, E>>, 104 - process: impl Fn(&[u8]) -> Result<T, E>, 105 - ) -> Result<Step<T>, Trip<E>> { 181 + blocks: &mut HashMap<Cid, MaybeProcessedBlock<T>>, 182 + process: impl Fn(Vec<u8>) -> T, 183 + ) -> Result<Step<T>, WalkError> { 106 184 loop { 107 - let Some(mut need) = self.stack.last() else { 185 + let Some(need) = self.stack.last_mut() else { 108 186 log::trace!("tried to walk but we're actually done."); 109 187 return Ok(Step::Finish); 110 188 }; 111 189 112 - match &mut need { 113 - Need::Node(cid) => { 190 + match need { 191 + &mut Need::Node { depth, cid } => { 114 192 log::trace!("need node {cid:?}"); 115 - let Some(block) = blocks.remove(cid) else { 193 + let Some(block) = blocks.remove(&cid) else { 116 194 log::trace!("node not found, resting"); 117 - return Ok(Step::Rest(*cid)); 195 + return Ok(Step::Missing(cid)); 118 196 }; 119 197 120 198 let MaybeProcessedBlock::Raw(data) = block else { 121 - return Err(Trip::BadCommit("failed commit fingerprint".into())); 199 + return Err(WalkError::BadCommitFingerprint); 122 200 }; 123 201 let node = serde_ipld_dagcbor::from_slice::<Node>(&data) 124 - .map_err(|e| Trip::BadCommit(e.into()))?; 202 + .map_err(WalkError::BadCommit)?; 125 203 126 204 // found node, make sure we remember 127 205 self.stack.pop(); 128 206 129 207 // queue up work on the found node next 130 - push_from_node(&mut self.stack, &node)?; 208 + push_from_node(&mut self.stack, &node, depth)?; 131 209 } 132 210 Need::Record { rkey, cid } => { 133 211 log::trace!("need record {cid:?}"); 212 + // note that we cannot *remove* a record block, sadly, since 213 + // there can be multiple rkeys pointing to the same cid. 
134 214 let Some(data) = blocks.get_mut(cid) else { 215 + return Ok(Step::Missing(*cid)); 216 + }; 217 + let rkey = rkey.clone(); 218 + let data = match data { 219 + MaybeProcessedBlock::Raw(data) => process(data.to_vec()), 220 + MaybeProcessedBlock::Processed(t) => t.clone(), 221 + }; 222 + 223 + // found node, make sure we remember 224 + self.stack.pop(); 225 + 226 + // rkeys *must* be in order or else the tree is invalid (or 227 + // we have a bug) 228 + if rkey <= self.prev { 229 + return Err(MstError::RkeyOutOfOrder)?; 230 + } 231 + self.prev = rkey.clone(); 232 + 233 + return Ok(Step::Found { rkey, data }); 234 + } 235 + } 236 + } 237 + } 238 + 239 + /// blocking!!!!!! 240 + pub fn disk_step<T: Processable>( 241 + &mut self, 242 + reader: &mut DiskStore, 243 + process: impl Fn(Vec<u8>) -> T, 244 + ) -> Result<Step<T>, WalkError> { 245 + loop { 246 + let Some(need) = self.stack.last_mut() else { 247 + log::trace!("tried to walk but we're actually done."); 248 + return Ok(Step::Finish); 249 + }; 250 + 251 + match need { 252 + &mut Need::Node { depth, cid } => { 253 + let cid_bytes = cid.to_bytes(); 254 + log::trace!("need node {cid:?}"); 255 + let Some(block_bytes) = reader.get(&cid_bytes)? 
else { 256 + log::trace!("node not found, resting"); 257 + return Ok(Step::Missing(cid)); 258 + }; 259 + 260 + let block: MaybeProcessedBlock<T> = crate::drive::decode(&block_bytes)?; 261 + 262 + let MaybeProcessedBlock::Raw(data) = block else { 263 + return Err(WalkError::BadCommitFingerprint); 264 + }; 265 + let node = serde_ipld_dagcbor::from_slice::<Node>(&data) 266 + .map_err(WalkError::BadCommit)?; 267 + 268 + // found node, make sure we remember 269 + self.stack.pop(); 270 + 271 + // queue up work on the found node next 272 + push_from_node(&mut self.stack, &node, depth).map_err(WalkError::MstError)?; 273 + } 274 + Need::Record { rkey, cid } => { 275 + log::trace!("need record {cid:?}"); 276 + let cid_bytes = cid.to_bytes(); 277 + let Some(data_bytes) = reader.get(&cid_bytes)? else { 135 278 log::trace!("record block not found, resting"); 136 - return Ok(Step::Rest(*cid)); 279 + return Ok(Step::Missing(*cid)); 137 280 }; 281 + let data: MaybeProcessedBlock<T> = crate::drive::decode(&data_bytes)?; 138 282 let rkey = rkey.clone(); 139 283 let data = match data { 140 284 MaybeProcessedBlock::Raw(data) => process(data), 141 - MaybeProcessedBlock::Processed(Ok(t)) => Ok(t.clone()), 142 - bad => { 143 - // big hack to pull the error out -- this corrupts 144 - // a block, so we should not continue trying to work 145 - let mut steal = MaybeProcessedBlock::Raw(vec![]); 146 - std::mem::swap(&mut steal, bad); 147 - let MaybeProcessedBlock::Processed(Err(e)) = steal else { 148 - unreachable!(); 149 - }; 150 - return Err(Trip::ProcessFailed(e)); 151 - } 285 + MaybeProcessedBlock::Processed(t) => t.clone(), 152 286 }; 153 287 154 288 // found node, make sure we remember 155 289 self.stack.pop(); 156 290 157 291 log::trace!("emitting a block as a step. 
depth={}", self.stack.len()); 158 - let data = data.map_err(Trip::ProcessFailed)?; 159 292 160 293 // rkeys *must* be in order or else the tree is invalid (or 161 294 // we have a bug) 162 295 if rkey <= self.prev { 163 - return Err(Trip::RkeyOutOfOrder); 296 + return Err(MstError::RkeyOutOfOrder)?; 164 297 } 165 298 self.prev = rkey.clone(); 166 299 167 - return Ok(Step::Step { rkey, data }); 300 + return Ok(Step::Found { rkey, data }); 168 301 } 169 302 } 170 303 } ··· 174 307 #[cfg(test)] 175 308 mod test { 176 309 use super::*; 177 - // use crate::mst::Entry; 178 310 179 311 fn cid1() -> Cid { 180 312 "bafyreihixenvk3ahqbytas4hk4a26w43bh6eo3w6usjqtxkpzsvi655a3m" 181 313 .parse() 182 314 .unwrap() 183 315 } 184 - // fn cid2() -> Cid { 185 - // "QmY7Yh4UquoXHLPFo2XbhXkhBvFoPwmQUSa92pxnxjQuPU" 186 - // .parse() 187 - // .unwrap() 188 - // } 189 - // fn cid3() -> Cid { 190 - // "bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi" 191 - // .parse() 192 - // .unwrap() 193 - // } 194 - // fn cid4() -> Cid { 195 - // "QmbWqxBEKC3P8tqsKc98xmWNzrzDtRLMiMPL8wBuTGsMnR" 196 - // .parse() 197 - // .unwrap() 198 - // } 199 - // fn cid5() -> Cid { 200 - // "QmSnuWmxptJZdLJpKRarxBMS2Ju2oANVrgbr2xWbie9b2D" 201 - // .parse() 202 - // .unwrap() 203 - // } 204 - // fn cid6() -> Cid { 205 - // "QmdmQXB2mzChmMeKY47C43LxUdg1NDJ5MWcKMKxDu7RgQm" 206 - // .parse() 207 - // .unwrap() 208 - // } 209 - // fn cid7() -> Cid { 210 - // "bafybeiaysi4s6lnjev27ln5icwm6tueaw2vdykrtjkwiphwekaywqhcjze" 211 - // .parse() 212 - // .unwrap() 213 - // } 214 - // fn cid8() -> Cid { 215 - // "bafyreif3tfdpr5n4jdrbielmcapwvbpcthepfkwq2vwonmlhirbjmotedi" 216 - // .parse() 217 - // .unwrap() 218 - // } 219 - // fn cid9() -> Cid { 220 - // "bafyreicnokmhmrnlp2wjhyk2haep4tqxiptwfrp2rrs7rzq7uk766chqvq" 221 - // .parse() 222 - // .unwrap() 223 - // } 316 + 317 + #[test] 318 + fn test_depth_spec_0() { 319 + let d = Depth::from_key(b"2653ae71"); 320 + assert_eq!(d, Depth::Depth(0)) 321 + } 322 + 323 + 
#[test] 324 + fn test_depth_spec_1() { 325 + let d = Depth::from_key(b"blue"); 326 + assert_eq!(d, Depth::Depth(1)) 327 + } 328 + 329 + #[test] 330 + fn test_depth_spec_4() { 331 + let d = Depth::from_key(b"app.bsky.feed.post/454397e440ec"); 332 + assert_eq!(d, Depth::Depth(4)) 333 + } 334 + 335 + #[test] 336 + fn test_depth_spec_8() { 337 + let d = Depth::from_key(b"app.bsky.feed.post/9adeb165882c"); 338 + assert_eq!(d, Depth::Depth(8)) 339 + } 340 + 341 + #[test] 342 + fn test_depth_ietf_draft_0() { 343 + let d = Depth::from_key(b"key1"); 344 + assert_eq!(d, Depth::Depth(0)) 345 + } 346 + 347 + #[test] 348 + fn test_depth_ietf_draft_1() { 349 + let d = Depth::from_key(b"key7"); 350 + assert_eq!(d, Depth::Depth(1)) 351 + } 352 + 353 + #[test] 354 + fn test_depth_ietf_draft_4() { 355 + let d = Depth::from_key(b"key515"); 356 + assert_eq!(d, Depth::Depth(4)) 357 + } 224 358 225 359 #[test] 226 - fn test_next_from_node_empty() { 227 - let node = Node { 360 + fn test_depth_interop() { 361 + // examples from https://github.com/bluesky-social/atproto-interop-tests/blob/main/mst/key_heights.json 362 + for (k, expected) in [ 363 + ("", 0), 364 + ("asdf", 0), 365 + ("blue", 1), 366 + ("2653ae71", 0), 367 + ("88bfafc7", 2), 368 + ("2a92d355", 4), 369 + ("884976f5", 6), 370 + ("app.bsky.feed.post/454397e440ec", 4), 371 + ("app.bsky.feed.post/9adeb165882c", 8), 372 + ] { 373 + let d = Depth::from_key(k.as_bytes()); 374 + assert_eq!(d, Depth::Depth(expected), "key: {}", k); 375 + } 376 + } 377 + 378 + #[test] 379 + fn test_push_empty_fails() { 380 + let empty_node = Node { 228 381 left: None, 229 382 entries: vec![], 230 383 }; 231 384 let mut stack = vec![]; 232 - push_from_node(&mut stack, &node).unwrap(); 233 - assert_eq!(stack.last(), None); 385 + let err = push_from_node(&mut stack, &empty_node, Depth::Depth(4)); 386 + assert_eq!(err, Err(MstError::EmptyNode)); 234 387 } 235 388 236 389 #[test] 237 - fn test_needs_from_node_just_left() { 390 + fn test_push_one_node() { 
238 391 let node = Node { 239 392 left: Some(cid1()), 240 393 entries: vec![], 241 394 }; 242 395 let mut stack = vec![]; 243 - push_from_node(&mut stack, &node).unwrap(); 244 - assert_eq!(stack.last(), Some(Need::Node(cid1())).as_ref()); 396 + push_from_node(&mut stack, &node, Depth::Depth(4)).unwrap(); 397 + assert_eq!( 398 + stack.last(), 399 + Some(Need::Node { 400 + depth: Depth::Depth(3), 401 + cid: cid1() 402 + }) 403 + .as_ref() 404 + ); 245 405 } 246 - 247 - // #[test] 248 - // fn test_needs_from_node_just_one_record() { 249 - // let node = Node { 250 - // left: None, 251 - // entries: vec![Entry { 252 - // keysuffix: "asdf".into(), 253 - // prefix_len: 0, 254 - // value: cid1(), 255 - // tree: None, 256 - // }], 257 - // }; 258 - // assert_eq!( 259 - // needs_from_node(node).unwrap(), 260 - // vec![Need::Record { 261 - // rkey: "asdf".into(), 262 - // cid: cid1(), 263 - // },] 264 - // ); 265 - // } 266 - 267 - // #[test] 268 - // fn test_needs_from_node_two_records() { 269 - // let node = Node { 270 - // left: None, 271 - // entries: vec![ 272 - // Entry { 273 - // keysuffix: "asdf".into(), 274 - // prefix_len: 0, 275 - // value: cid1(), 276 - // tree: None, 277 - // }, 278 - // Entry { 279 - // keysuffix: "gh".into(), 280 - // prefix_len: 2, 281 - // value: cid2(), 282 - // tree: None, 283 - // }, 284 - // ], 285 - // }; 286 - // assert_eq!( 287 - // needs_from_node(node).unwrap(), 288 - // vec![ 289 - // Need::Record { 290 - // rkey: "asdf".into(), 291 - // cid: cid1(), 292 - // }, 293 - // Need::Record { 294 - // rkey: "asgh".into(), 295 - // cid: cid2(), 296 - // }, 297 - // ] 298 - // ); 299 - // } 300 - 301 - // #[test] 302 - // fn test_needs_from_node_with_both() { 303 - // let node = Node { 304 - // left: None, 305 - // entries: vec![Entry { 306 - // keysuffix: "asdf".into(), 307 - // prefix_len: 0, 308 - // value: cid1(), 309 - // tree: Some(cid2()), 310 - // }], 311 - // }; 312 - // assert_eq!( 313 - // needs_from_node(node).unwrap(), 314 - // 
vec![ 315 - // Need::Record { 316 - // rkey: "asdf".into(), 317 - // cid: cid1(), 318 - // }, 319 - // Need::Node(cid2()), 320 - // ] 321 - // ); 322 - // } 323 - 324 - // #[test] 325 - // fn test_needs_from_node_left_and_record() { 326 - // let node = Node { 327 - // left: Some(cid1()), 328 - // entries: vec![Entry { 329 - // keysuffix: "asdf".into(), 330 - // prefix_len: 0, 331 - // value: cid2(), 332 - // tree: None, 333 - // }], 334 - // }; 335 - // assert_eq!( 336 - // needs_from_node(node).unwrap(), 337 - // vec![ 338 - // Need::Node(cid1()), 339 - // Need::Record { 340 - // rkey: "asdf".into(), 341 - // cid: cid2(), 342 - // }, 343 - // ] 344 - // ); 345 - // } 346 - 347 - // #[test] 348 - // fn test_needs_from_full_node() { 349 - // let node = Node { 350 - // left: Some(cid1()), 351 - // entries: vec![ 352 - // Entry { 353 - // keysuffix: "asdf".into(), 354 - // prefix_len: 0, 355 - // value: cid2(), 356 - // tree: Some(cid3()), 357 - // }, 358 - // Entry { 359 - // keysuffix: "ghi".into(), 360 - // prefix_len: 1, 361 - // value: cid4(), 362 - // tree: Some(cid5()), 363 - // }, 364 - // Entry { 365 - // keysuffix: "jkl".into(), 366 - // prefix_len: 2, 367 - // value: cid6(), 368 - // tree: Some(cid7()), 369 - // }, 370 - // Entry { 371 - // keysuffix: "mno".into(), 372 - // prefix_len: 4, 373 - // value: cid8(), 374 - // tree: Some(cid9()), 375 - // }, 376 - // ], 377 - // }; 378 - // assert_eq!( 379 - // needs_from_node(node).unwrap(), 380 - // vec![ 381 - // Need::Node(cid1()), 382 - // Need::Record { 383 - // rkey: "asdf".into(), 384 - // cid: cid2(), 385 - // }, 386 - // Need::Node(cid3()), 387 - // Need::Record { 388 - // rkey: "aghi".into(), 389 - // cid: cid4(), 390 - // }, 391 - // Need::Node(cid5()), 392 - // Need::Record { 393 - // rkey: "agjkl".into(), 394 - // cid: cid6(), 395 - // }, 396 - // Need::Node(cid7()), 397 - // Need::Record { 398 - // rkey: "agjkmno".into(), 399 - // cid: cid8(), 400 - // }, 401 - // Need::Node(cid9()), 402 - // ] 403 
- // ); 404 - // } 405 406 }

+34 -31

tests/non-huge-cars.rs

··· 1 1 extern crate repo_stream; 2 - use futures::TryStreamExt; 3 - use iroh_car::CarReader; 4 - use std::convert::Infallible; 2 + use repo_stream::Driver; 5 3 4 + const EMPTY_CAR: &'static [u8] = include_bytes!("../car-samples/empty.car"); 6 5 const TINY_CAR: &'static [u8] = include_bytes!("../car-samples/tiny.car"); 7 6 const LITTLE_CAR: &'static [u8] = include_bytes!("../car-samples/little.car"); 8 7 const MIDSIZE_CAR: &'static [u8] = include_bytes!("../car-samples/midsize.car"); 9 8 10 - async fn test_car(bytes: &[u8], expected_records: usize, expected_sum: usize) { 11 - let reader = CarReader::new(bytes).await.unwrap(); 12 - 13 - let root = reader 14 - .header() 15 - .roots() 16 - .first() 17 - .ok_or("missing root") 9 + async fn test_car( 10 + bytes: &[u8], 11 + expected_records: usize, 12 + expected_sum: usize, 13 + expect_profile: bool, 14 + ) { 15 + let mut driver = match Driver::load_car(bytes, |block| block.len(), 10 /* MiB */) 16 + .await 18 17 .unwrap() 19 - .clone(); 20 - 21 - let stream = std::pin::pin!(reader.stream()); 22 - 23 - let (_commit, v) = 24 - repo_stream::drive::Vehicle::init(root, stream, |block| Ok::<_, Infallible>(block.len())) 25 - .await 26 - .unwrap(); 27 - let mut record_stream = std::pin::pin!(v.stream()); 18 + { 19 + Driver::Memory(_commit, mem_driver) => mem_driver, 20 + Driver::Disk(_) => panic!("too big"), 21 + }; 28 22 29 23 let mut records = 0; 30 24 let mut sum = 0; 31 25 let mut found_bsky_profile = false; 32 26 let mut prev_rkey = "".to_string(); 33 - while let Some((rkey, size)) = record_stream.try_next().await.unwrap() { 34 - records += 1; 35 - sum += size; 36 - if rkey == "app.bsky.actor.profile/self" { 37 - found_bsky_profile = true; 27 + 28 + while let Some(pairs) = driver.next_chunk(256).await.unwrap() { 29 + for (rkey, size) in pairs { 30 + records += 1; 31 + sum += size; 32 + if rkey == "app.bsky.actor.profile/self" { 33 + found_bsky_profile = true; 34 + } 35 + assert!(rkey > prev_rkey, "rkeys are streamed in 
order"); 36 + prev_rkey = rkey; 38 37 } 39 - assert!(rkey > prev_rkey, "rkeys are streamed in order"); 40 - prev_rkey = rkey; 41 38 } 39 + 42 40 assert_eq!(records, expected_records); 43 41 assert_eq!(sum, expected_sum); 44 - assert!(found_bsky_profile); 42 + assert_eq!(found_bsky_profile, expect_profile); 43 + } 44 + 45 + #[tokio::test] 46 + async fn test_empty_car() { 47 + test_car(EMPTY_CAR, 0, 0, false).await 45 48 } 46 49 47 50 #[tokio::test] 48 51 async fn test_tiny_car() { 49 - test_car(TINY_CAR, 8, 2071).await 52 + test_car(TINY_CAR, 8, 2071, true).await 50 53 } 51 54 52 55 #[tokio::test] 53 56 async fn test_little_car() { 54 - test_car(LITTLE_CAR, 278, 246960).await 57 + test_car(LITTLE_CAR, 278, 246960, true).await 55 58 } 56 59 57 60 #[tokio::test] 58 61 async fn test_midsize_car() { 59 - test_car(MIDSIZE_CAR, 11585, 3741393).await 62 + test_car(MIDSIZE_CAR, 11585, 3741393, true).await 60 63 }

Compare changes