Fast and robust atproto CAR file processing in rust

redb disk version

Changed files
+833
examples
disk-read-file
src
+116
Cargo.lock
··· 126 126 ] 127 127 128 128 [[package]] 129 + name = "bincode" 130 + version = "2.0.1" 131 + source = "registry+https://github.com/rust-lang/crates.io-index" 132 + checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" 133 + dependencies = [ 134 + "bincode_derive", 135 + "serde", 136 + "unty", 137 + ] 138 + 139 + [[package]] 140 + name = "bincode_derive" 141 + version = "2.0.1" 142 + source = "registry+https://github.com/rust-lang/crates.io-index" 143 + checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" 144 + dependencies = [ 145 + "virtue", 146 + ] 147 + 148 + [[package]] 129 149 name = "bitflags" 130 150 version = "2.9.4" 131 151 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 387 407 ] 388 408 389 409 [[package]] 410 + name = "fallible-iterator" 411 + version = "0.3.0" 412 + source = "registry+https://github.com/rust-lang/crates.io-index" 413 + checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" 414 + 415 + [[package]] 416 + name = "fallible-streaming-iterator" 417 + version = "0.1.9" 418 + source = "registry+https://github.com/rust-lang/crates.io-index" 419 + checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" 420 + 421 + [[package]] 422 + name = "foldhash" 423 + version = "0.1.5" 424 + source = "registry+https://github.com/rust-lang/crates.io-index" 425 + checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" 426 + 427 + [[package]] 390 428 name = "futures" 391 429 version = "0.3.31" 392 430 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 493 531 ] 494 532 495 533 [[package]] 534 + name = "hashbrown" 535 + version = "0.15.5" 536 + source = "registry+https://github.com/rust-lang/crates.io-index" 537 + checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" 538 + dependencies = [ 539 + "foldhash", 540 + ] 541 + 542 + [[package]] 543 + name = "hashlink" 544 + version = "0.10.0" 545 + source = "registry+https://github.com/rust-lang/crates.io-index" 546 + checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" 547 + dependencies = [ 548 + "hashbrown", 549 + ] 550 + 551 + [[package]] 496 552 name = "heck" 497 553 version = "0.5.0" 498 554 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 598 654 checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" 599 655 600 656 [[package]] 657 + name = "libsqlite3-sys" 658 + version = "0.35.0" 659 + source = "registry+https://github.com/rust-lang/crates.io-index" 660 + checksum = "133c182a6a2c87864fe97778797e46c7e999672690dc9fa3ee8e241aa4a9c13f" 661 + dependencies = [ 662 + "pkg-config", 663 + "vcpkg", 664 + ] 665 + 666 + [[package]] 601 667 name = "lock_api" 602 668 version = "0.4.14" 603 669 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 744 810 checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" 745 811 746 812 [[package]] 813 + name = "pkg-config" 814 + version = "0.3.32" 815 + source = "registry+https://github.com/rust-lang/crates.io-index" 816 + checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" 817 + 818 + [[package]] 747 819 name = "plotters" 748 820 version = "0.3.7" 749 821 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 825 897 ] 826 898 827 899 [[package]] 900 + name = "redb" 901 + version = "3.1.0" 902 + source = "registry+https://github.com/rust-lang/crates.io-index" 903 + checksum = "ae323eb086579a3769daa2c753bb96deb95993c534711e0dbe881b5192906a06" 904 + dependencies = [ 905 + "libc", 906 + ] 907 + 908 + [[package]] 828 909 name = "redox_syscall" 829 910 version = "0.5.18" 830 911 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 866 947 name = "repo-stream" 867 948 version = "0.1.1" 868 949 dependencies = [ 950 + "bincode", 869 951 "clap", 870 952 "criterion", 871 953 "env_logger", ··· 875 957 "iroh-car", 876 958 "log", 877 959 "multibase", 960 + "redb", 961 + "rusqlite", 878 962 "serde", 879 963 "serde_bytes", 880 964 "serde_ipld_dagcbor", ··· 883 967 ] 884 968 885 969 [[package]] 970 + name = "rusqlite" 971 + version = "0.37.0" 972 + source = "registry+https://github.com/rust-lang/crates.io-index" 973 + checksum = "165ca6e57b20e1351573e3729b958bc62f0e48025386970b6e4d29e7a7e71f3f" 974 + dependencies = [ 975 + "bitflags", 976 + "fallible-iterator", 977 + "fallible-streaming-iterator", 978 + "hashlink", 979 + "libsqlite3-sys", 980 + "smallvec", 981 + ] 982 + 983 + [[package]] 886 984 name = "rustc-demangle" 887 985 version = "0.1.26" 888 986 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1139 1237 checksum = "eb066959b24b5196ae73cb057f45598450d2c5f71460e98c49b738086eff9c06" 1140 1238 1141 1239 [[package]] 1240 + name = "unty" 1241 + version = "0.0.4" 1242 + source = "registry+https://github.com/rust-lang/crates.io-index" 1243 + checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" 1244 + 1245 + [[package]] 1142 1246 name = "utf8parse" 1143 1247 version = "0.2.2" 1144 1248 source = "registry+https://github.com/rust-lang/crates.io-index" 1145 1249 checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" 1250 + 1251 + [[package]] 1252 + name = "vcpkg" 1253 + version = "0.2.15" 1254 + source = "registry+https://github.com/rust-lang/crates.io-index" 1255 + checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" 1256 + 1257 + [[package]] 1258 + name = "virtue" 1259 + version = "0.0.18" 1260 + source = "registry+https://github.com/rust-lang/crates.io-index" 1261 + checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" 1146 1262 1147 1263 [[package]] 1148 1264 name = "walkdir"
+3
Cargo.toml
··· 7 7 repository = "https://tangled.org/@microcosm.blue/repo-stream" 8 8 9 9 [dependencies] 10 + bincode = { version = "2.0.1", features = ["serde"] } 10 11 futures = "0.3.31" 11 12 futures-core = "0.3.31" 12 13 ipld-core = { version = "0.4.2", features = ["serde"] } 13 14 iroh-car = "0.5.1" 14 15 log = "0.4.28" 15 16 multibase = "0.9.2" 17 + redb = "3.1.0" 18 + rusqlite = "0.37.0" 16 19 serde = { version = "1.0.228", features = ["derive"] } 17 20 serde_bytes = "0.11.19" 18 21 serde_ipld_dagcbor = "0.6.4"
+57
examples/disk-read-file/main.rs
··· 1 + extern crate repo_stream; 2 + use clap::Parser; 3 + use futures::TryStreamExt; 4 + use iroh_car::CarReader; 5 + use std::convert::Infallible; 6 + use std::path::PathBuf; 7 + 8 + type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>; 9 + 10 + #[derive(Debug, Parser)] 11 + struct Args { 12 + #[arg()] 13 + car: PathBuf, 14 + #[arg()] 15 + tmpfile: PathBuf, 16 + } 17 + 18 + #[tokio::main] 19 + async fn main() -> Result<()> { 20 + env_logger::init(); 21 + 22 + let Args { car, tmpfile } = Args::parse(); 23 + let reader = tokio::fs::File::open(car).await?; 24 + let reader = tokio::io::BufReader::new(reader); 25 + 26 + println!("hello!"); 27 + 28 + let reader = CarReader::new(reader).await?; 29 + 30 + let redb_store = repo_stream::disk_redb::RedbStore::new(tmpfile)?; 31 + 32 + let root = reader 33 + .header() 34 + .roots() 35 + .first() 36 + .ok_or("missing root")? 37 + .clone(); 38 + log::debug!("root: {root:?}"); 39 + 40 + // let stream = Box::pin(reader.stream()); 41 + let stream = std::pin::pin!(reader.stream()); 42 + 43 + let (commit, v) = repo_stream::disk_drive::Vehicle::init(root, stream, redb_store, |block| { 44 + Ok::<_, Infallible>(block.len()) 45 + }) 46 + .await?; 47 + let mut record_stream = std::pin::pin!(v.stream()); 48 + 49 + log::info!("got commit: {commit:?}"); 50 + 51 + while let Some((rkey, _rec)) = record_stream.try_next().await? { 52 + log::info!("got {rkey:?}"); 53 + } 54 + log::info!("bye!"); 55 + 56 + Ok(()) 57 + }
+201
src/disk_drive.rs
··· 1 + use futures::Stream; 2 + use futures::TryStreamExt; 3 + use std::error::Error; 4 + 5 + use crate::disk_walk::{Step, Trip, Walker}; 6 + use crate::mst::Commit; 7 + use crate::mst::Node; 8 + 9 + use ipld_core::cid::Cid; 10 + use serde::{Deserialize, Serialize, de::DeserializeOwned}; 11 + 12 + /// Errors that can happen while consuming and emitting blocks and records 13 + #[derive(Debug, thiserror::Error)] 14 + pub enum DriveError { 15 + #[error("Failed to initialize CarReader: {0}")] 16 + CarReader(#[from] iroh_car::Error), 17 + #[error("Car block stream error: {0}")] 18 + CarBlockError(Box<dyn Error>), 19 + #[error("Failed to decode commit block: {0}")] 20 + BadCommit(Box<dyn Error>), 21 + #[error("The Commit block reference by the root was not found")] 22 + MissingCommit, 23 + #[error("The MST block {0} could not be found")] 24 + MissingBlock(Cid), 25 + #[error("Failed to walk the mst tree: {0}")] 26 + Tripped(#[from] Trip), 27 + } 28 + 29 + #[derive(Debug, Clone, Serialize, Deserialize)] 30 + pub enum MaybeProcessedBlock<T: Clone + Serialize> { 31 + /// A block that's *probably* a Node (but we can't know yet) 32 + /// 33 + /// It *can be* a record that suspiciously looks a lot like a node, so we 34 + /// cannot eagerly turn it into a Node. We only know for sure what it is 35 + /// when we actually walk down the MST 36 + Raw(Vec<u8>), 37 + /// A processed record from a block that was definitely not a Node 38 + /// 39 + /// If we _never_ needed this block, then we may have wasted a bit of effort 40 + /// trying to process it. Oh well. 41 + /// 42 + /// Processing has to be fallible because the CAR can have totally-unused 43 + /// blocks, which can just be garbage. since we're eagerly trying to process 44 + /// record blocks without knowing for sure that they *are* records, we 45 + /// discard any definitely-not-nodes that fail processing and keep their 46 + /// error in the buffer for them. if we later try to retreive them as a 47 + /// record, then we can surface the error. 48 + /// 49 + /// The error type is `String` because we don't really want to put 50 + /// any constraints like `Serialize` on the error type, and `Error` 51 + /// at least requires `Display`. It's a compromise. 52 + ProcessedOk(T), 53 + Unprocessable(String), 54 + } 55 + 56 + pub trait BlockStore<MPB: Serialize + DeserializeOwned> { 57 + fn put(&self, key: Cid, value: MPB); // unwraps for now 58 + fn get(&self, key: Cid) -> Option<MPB>; 59 + } 60 + 61 + type CarBlock<E> = Result<(Cid, Vec<u8>), E>; 62 + 63 + /// The core driver between the block stream and MST walker 64 + pub struct Vehicle<SE, S, T, BS, P, PE> 65 + where 66 + SE: Error + 'static, 67 + S: Stream<Item = CarBlock<SE>>, 68 + T: Clone + Serialize + DeserializeOwned, 69 + BS: BlockStore<MaybeProcessedBlock<T>>, 70 + P: Fn(&[u8]) -> Result<T, PE>, 71 + PE: Error, 72 + { 73 + block_stream: S, 74 + block_store: BS, 75 + walker: Walker, 76 + process: P, 77 + } 78 + 79 + impl<SE, S, T, BS, P, PE> Vehicle<SE, S, T, BS, P, PE> 80 + where 81 + SE: Error + 'static, 82 + S: Stream<Item = CarBlock<SE>> + Unpin, 83 + T: Clone + Serialize + DeserializeOwned, 84 + BS: BlockStore<MaybeProcessedBlock<T>>, 85 + P: Fn(&[u8]) -> Result<T, PE>, 86 + PE: Error, 87 + { 88 + /// Set up the stream 89 + /// 90 + /// This will eagerly consume blocks until the `Commit` object is found. 91 + /// *Usually* the it's the first block, but there is no guarantee. 92 + /// 93 + /// ### Parameters 94 + /// 95 + /// `root`: CID of the commit object that is the root of the MST 96 + /// 97 + /// `block_stream`: Input stream of raw CAR blocks 98 + /// 99 + /// `process`: record-transforming callback: 100 + /// 101 + /// For tasks where records can be quickly processed into a *smaller* 102 + /// useful representation, you can do that eagerly as blocks come in by 103 + /// passing the processor as a callback here. This can reduce overall 104 + /// memory usage. 105 + pub async fn init( 106 + root: Cid, 107 + mut block_stream: S, 108 + block_store: BS, 109 + process: P, 110 + ) -> Result<(Commit, Self), DriveError> { 111 + let mut commit = None; 112 + 113 + while let Some((cid, data)) = block_stream 114 + .try_next() 115 + .await 116 + .map_err(|e| DriveError::CarBlockError(e.into()))? 117 + { 118 + if cid == root { 119 + let c: Commit = serde_ipld_dagcbor::from_slice(&data) 120 + .map_err(|e| DriveError::BadCommit(e.into()))?; 121 + commit = Some(c); 122 + break; 123 + } else { 124 + block_store.put( 125 + cid, 126 + if Node::could_be(&data) { 127 + MaybeProcessedBlock::Raw(data) 128 + } else { 129 + match process(&data) { 130 + Ok(t) => MaybeProcessedBlock::ProcessedOk(t), 131 + Err(e) => MaybeProcessedBlock::Unprocessable(e.to_string()), 132 + } 133 + }, 134 + ); 135 + } 136 + } 137 + 138 + // we either broke out or read all the blocks without finding the commit... 139 + let commit = commit.ok_or(DriveError::MissingCommit)?; 140 + 141 + let walker = Walker::new(commit.data); 142 + 143 + let me = Self { 144 + block_stream, 145 + block_store, 146 + walker, 147 + process, 148 + }; 149 + Ok((commit, me)) 150 + } 151 + 152 + async fn drive_until(&mut self, cid_needed: Cid) -> Result<(), DriveError> { 153 + while let Some((cid, data)) = self 154 + .block_stream 155 + .try_next() 156 + .await 157 + .map_err(|e| DriveError::CarBlockError(e.into()))? 158 + { 159 + self.block_store.put( 160 + cid, 161 + if Node::could_be(&data) { 162 + MaybeProcessedBlock::Raw(data) 163 + } else { 164 + match (self.process)(&data) { 165 + Ok(t) => MaybeProcessedBlock::ProcessedOk(t), 166 + Err(e) => MaybeProcessedBlock::Unprocessable(e.to_string()), 167 + } 168 + }, 169 + ); 170 + if cid == cid_needed { 171 + return Ok(()); 172 + } 173 + } 174 + 175 + // if we never found the block 176 + Err(DriveError::MissingBlock(cid_needed)) 177 + } 178 + 179 + /// Manually step through the record outputs 180 + pub async fn next_record(&mut self) -> Result<Option<(String, T)>, DriveError> { 181 + loop { 182 + // walk as far as we can until we run out of blocks or find a record 183 + let cid_needed = match self.walker.step(&mut self.block_store, &self.process)? { 184 + Step::Rest(cid) => cid, 185 + Step::Finish => return Ok(None), 186 + Step::Step { rkey, data } => return Ok(Some((rkey, data))), 187 + }; 188 + 189 + // load blocks until we reach that cid 190 + self.drive_until(cid_needed).await?; 191 + } 192 + } 193 + 194 + /// Convert to a futures::stream of record outputs 195 + pub fn stream(self) -> impl Stream<Item = Result<(String, T), DriveError>> { 196 + futures::stream::try_unfold(self, |mut this| async move { 197 + let maybe_record = this.next_record().await?; 198 + Ok(maybe_record.map(|b| (b, this))) 199 + }) 200 + } 201 + }
+50
src/disk_redb.rs
··· 1 + use crate::disk_drive::BlockStore; 2 + use ipld_core::cid::Cid; 3 + use redb::{Database, Error, ReadableDatabase, TableDefinition}; 4 + use serde::{Serialize, de::DeserializeOwned}; 5 + use std::path::Path; 6 + 7 + const TABLE: TableDefinition<&[u8], &[u8]> = TableDefinition::new("blocks"); 8 + 9 + pub struct RedbStore { 10 + db: Database, 11 + } 12 + 13 + impl RedbStore { 14 + pub fn new(path: impl AsRef<Path>) -> Result<Self, Error> { 15 + let db = Database::create(path)?; 16 + Ok(Self { db }) 17 + } 18 + } 19 + 20 + // TODO: clean up on drop 21 + 22 + impl<MPB: Serialize + DeserializeOwned> BlockStore<MPB> for RedbStore { 23 + fn put(&self, c: Cid, t: MPB) { 24 + let key_bytes = c.to_bytes(); 25 + let val_bytes = bincode::serde::encode_to_vec(t, bincode::config::standard()).unwrap(); 26 + 27 + let mut tx = self.db.begin_write().unwrap(); 28 + tx.set_durability(redb::Durability::None).unwrap(); 29 + { 30 + let mut table = tx.open_table(TABLE).unwrap(); 31 + table.insert(&*key_bytes, &*val_bytes).unwrap(); 32 + } 33 + tx.commit().unwrap(); 34 + } 35 + fn get(&self, c: Cid) -> Option<MPB> { 36 + let key_bytes = c.to_bytes(); 37 + let tx = self.db.begin_read().unwrap(); 38 + let table = match tx.open_table(TABLE) { 39 + Ok(t) => t, 40 + Err(redb::TableError::TableDoesNotExist(_)) => return None, 41 + e => e.unwrap(), 42 + }; 43 + let maybe_val_bytes = table.get(&*key_bytes).unwrap()?; 44 + let (t, n): (MPB, usize) = 45 + bincode::serde::decode_from_slice(maybe_val_bytes.value(), bincode::config::standard()) 46 + .unwrap(); 47 + assert_eq!(maybe_val_bytes.value().len(), n); 48 + Some(t) 49 + } 50 + }
+403
src/disk_walk.rs
··· 1 + //! Depth-first MST traversal 2 + 3 + use crate::disk_drive::{BlockStore, MaybeProcessedBlock}; 4 + use crate::mst::Node; 5 + 6 + use ipld_core::cid::Cid; 7 + use serde::{Serialize, de::DeserializeOwned}; 8 + use std::error::Error; 9 + 10 + /// Errors that can happen while walking 11 + #[derive(Debug, thiserror::Error)] 12 + pub enum Trip { 13 + #[error("empty mst nodes are not allowed")] 14 + NodeEmpty, 15 + #[error("Failed to decode commit block: {0}")] 16 + BadCommit(Box<dyn std::error::Error>), 17 + #[error("Action node error: {0}")] 18 + RkeyError(#[from] RkeyError), 19 + #[error("Process failed: {0}")] 20 + ProcessFailed(String), 21 + #[error("Encountered an rkey out of order while walking the MST")] 22 + RkeyOutOfOrder, 23 + } 24 + 25 + /// Errors from invalid Rkeys 26 + #[derive(Debug, thiserror::Error)] 27 + pub enum RkeyError { 28 + #[error("Failed to compute an rkey due to invalid prefix_len")] 29 + EntryPrefixOutOfbounds, 30 + #[error("RKey was not utf-8")] 31 + EntryRkeyNotUtf8(#[from] std::string::FromUtf8Error), 32 + } 33 + 34 + /// Walker outputs 35 + #[derive(Debug)] 36 + pub enum Step<T: Serialize + DeserializeOwned> { 37 + /// We need a CID but it's not in the block store 38 + /// 39 + /// Give the needed CID to the driver so it can load blocks until it's found 40 + Rest(Cid), 41 + /// Reached the end of the MST! yay! 42 + Finish, 43 + /// A record was found! 44 + Step { rkey: String, data: T }, 45 + } 46 + 47 + #[derive(Debug, Clone, PartialEq)] 48 + enum Need { 49 + Node(Cid), 50 + Record { rkey: String, cid: Cid }, 51 + } 52 + 53 + fn push_from_node(stack: &mut Vec<Need>, node: &Node) -> Result<(), RkeyError> { 54 + let mut entries = Vec::with_capacity(node.entries.len()); 55 + 56 + let mut prefix = vec![]; 57 + for entry in &node.entries { 58 + let mut rkey = vec![]; 59 + let pre_checked = prefix 60 + .get(..entry.prefix_len) 61 + .ok_or(RkeyError::EntryPrefixOutOfbounds)?; 62 + rkey.extend_from_slice(pre_checked); 63 + rkey.extend_from_slice(&entry.keysuffix); 64 + prefix = rkey.clone(); 65 + 66 + entries.push(Need::Record { 67 + rkey: String::from_utf8(rkey)?, 68 + cid: entry.value, 69 + }); 70 + if let Some(ref tree) = entry.tree { 71 + entries.push(Need::Node(*tree)); 72 + } 73 + } 74 + 75 + entries.reverse(); 76 + stack.append(&mut entries); 77 + 78 + if let Some(tree) = node.left { 79 + stack.push(Need::Node(tree)); 80 + } 81 + Ok(()) 82 + } 83 + 84 + /// Traverser of an atproto MST 85 + /// 86 + /// Walks the tree from left-to-right in depth-first order 87 + #[derive(Debug)] 88 + pub struct Walker { 89 + stack: Vec<Need>, 90 + prev: String, 91 + } 92 + 93 + impl Walker { 94 + pub fn new(tree_root_cid: Cid) -> Self { 95 + Self { 96 + stack: vec![Need::Node(tree_root_cid)], 97 + prev: "".to_string(), 98 + } 99 + } 100 + 101 + /// Advance through nodes until we find a record or can't go further 102 + pub fn step<T: Clone + Serialize + DeserializeOwned, E: Error>( 103 + &mut self, 104 + block_store: &mut impl BlockStore<MaybeProcessedBlock<T>>, 105 + process: impl Fn(&[u8]) -> Result<T, E>, 106 + ) -> Result<Step<T>, Trip> { 107 + loop { 108 + let Some(mut need) = self.stack.last() else { 109 + log::trace!("tried to walk but we're actually done."); 110 + return Ok(Step::Finish); 111 + }; 112 + 113 + match &mut need { 114 + Need::Node(cid) => { 115 + log::trace!("need node {cid:?}"); 116 + let Some(mpb) = block_store.get(*cid) else { 117 + log::trace!("node not found, resting"); 118 + return Ok(Step::Rest(*cid)); 119 + }; 120 + 121 + let MaybeProcessedBlock::<T>::Raw(block) = mpb else { 122 + return Err(Trip::BadCommit("failed commit fingerprint".into())); 123 + }; 124 + let node = serde_ipld_dagcbor::from_slice::<Node>(&block) 125 + .map_err(|e| Trip::BadCommit(e.into()))?; 126 + 127 + // found node, make sure we remember 128 + self.stack.pop(); 129 + 130 + // queue up work on the found node next 131 + push_from_node(&mut self.stack, &node)?; 132 + } 133 + Need::Record { rkey, cid } => { 134 + log::trace!("need record {cid:?}"); 135 + let Some(mpb) = block_store.get(*cid) else { 136 + log::trace!("record block not found, resting"); 137 + return Ok(Step::Rest(*cid)); 138 + }; 139 + let rkey = rkey.clone(); 140 + let data = match mpb { 141 + MaybeProcessedBlock::Raw(data) => match process(&data) { 142 + Ok(t) => Ok(t), 143 + Err(e) => Err(Trip::ProcessFailed(e.to_string())), 144 + }, 145 + MaybeProcessedBlock::ProcessedOk(t) => Ok(t.clone()), 146 + MaybeProcessedBlock::Unprocessable(s) => { 147 + return Err(Trip::ProcessFailed(s.clone())); 148 + } 149 + }; 150 + 151 + // found node, make sure we remember 152 + self.stack.pop(); 153 + 154 + log::trace!("emitting a block as a step. depth={}", self.stack.len()); 155 + 156 + let data = data.map_err(|e| Trip::ProcessFailed(e.to_string()))?; 157 + 158 + // rkeys *must* be in order or else the tree is invalid (or 159 + // we have a bug) 160 + if rkey <= self.prev { 161 + return Err(Trip::RkeyOutOfOrder); 162 + } 163 + self.prev = rkey.clone(); 164 + 165 + return Ok(Step::Step { rkey, data }); 166 + } 167 + } 168 + } 169 + } 170 + } 171 + 172 + #[cfg(test)] 173 + mod test { 174 + use super::*; 175 + // use crate::mst::Entry; 176 + 177 + fn cid1() -> Cid { 178 + "bafyreihixenvk3ahqbytas4hk4a26w43bh6eo3w6usjqtxkpzsvi655a3m" 179 + .parse() 180 + .unwrap() 181 + } 182 + // fn cid2() -> Cid { 183 + // "QmY7Yh4UquoXHLPFo2XbhXkhBvFoPwmQUSa92pxnxjQuPU" 184 + // .parse() 185 + // .unwrap() 186 + // } 187 + // fn cid3() -> Cid { 188 + // "bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi" 189 + // .parse() 190 + // .unwrap() 191 + // } 192 + // fn cid4() -> Cid { 193 + // "QmbWqxBEKC3P8tqsKc98xmWNzrzDtRLMiMPL8wBuTGsMnR" 194 + // .parse() 195 + // .unwrap() 196 + // } 197 + // fn cid5() -> Cid { 198 + // "QmSnuWmxptJZdLJpKRarxBMS2Ju2oANVrgbr2xWbie9b2D" 199 + // .parse() 200 + // .unwrap() 201 + // } 202 + // fn cid6() -> Cid { 203 + // "QmdmQXB2mzChmMeKY47C43LxUdg1NDJ5MWcKMKxDu7RgQm" 204 + // .parse() 205 + // .unwrap() 206 + // } 207 + // fn cid7() -> Cid { 208 + // "bafybeiaysi4s6lnjev27ln5icwm6tueaw2vdykrtjkwiphwekaywqhcjze" 209 + // .parse() 210 + // .unwrap() 211 + // } 212 + // fn cid8() -> Cid { 213 + // "bafyreif3tfdpr5n4jdrbielmcapwvbpcthepfkwq2vwonmlhirbjmotedi" 214 + // .parse() 215 + // .unwrap() 216 + // } 217 + // fn cid9() -> Cid { 218 + // "bafyreicnokmhmrnlp2wjhyk2haep4tqxiptwfrp2rrs7rzq7uk766chqvq" 219 + // .parse() 220 + // .unwrap() 221 + // } 222 + 223 + #[test] 224 + fn test_next_from_node_empty() { 225 + let node = Node { 226 + left: None, 227 + entries: vec![], 228 + }; 229 + let mut stack = vec![]; 230 + push_from_node(&mut stack, &node).unwrap(); 231 + assert_eq!(stack.last(), None); 232 + } 233 + 234 + #[test] 235 + fn test_needs_from_node_just_left() { 236 + let node = Node { 237 + left: Some(cid1()), 238 + entries: vec![], 239 + }; 240 + let mut stack = vec![]; 241 + push_from_node(&mut stack, &node).unwrap(); 242 + assert_eq!(stack.last(), Some(Need::Node(cid1())).as_ref()); 243 + } 244 + 245 + // #[test] 246 + // fn test_needs_from_node_just_one_record() { 247 + // let node = Node { 248 + // left: None, 249 + // entries: vec![Entry { 250 + // keysuffix: "asdf".into(), 251 + // prefix_len: 0, 252 + // value: cid1(), 253 + // tree: None, 254 + // }], 255 + // }; 256 + // assert_eq!( 257 + // needs_from_node(node).unwrap(), 258 + // vec![Need::Record { 259 + // rkey: "asdf".into(), 260 + // cid: cid1(), 261 + // },] 262 + // ); 263 + // } 264 + 265 + // #[test] 266 + // fn test_needs_from_node_two_records() { 267 + // let node = Node { 268 + // left: None, 269 + // entries: vec![ 270 + // Entry { 271 + // keysuffix: "asdf".into(), 272 + // prefix_len: 0, 273 + // value: cid1(), 274 + // tree: None, 275 + // }, 276 + // Entry { 277 + // keysuffix: "gh".into(), 278 + // prefix_len: 2, 279 + // value: cid2(), 280 + // tree: None, 281 + // }, 282 + // ], 283 + // }; 284 + // assert_eq!( 285 + // needs_from_node(node).unwrap(), 286 + // vec![ 287 + // Need::Record { 288 + // rkey: "asdf".into(), 289 + // cid: cid1(), 290 + // }, 291 + // Need::Record { 292 + // rkey: "asgh".into(), 293 + // cid: cid2(), 294 + // }, 295 + // ] 296 + // ); 297 + // } 298 + 299 + // #[test] 300 + // fn test_needs_from_node_with_both() { 301 + // let node = Node { 302 + // left: None, 303 + // entries: vec![Entry { 304 + // keysuffix: "asdf".into(), 305 + // prefix_len: 0, 306 + // value: cid1(), 307 + // tree: Some(cid2()), 308 + // }], 309 + // }; 310 + // assert_eq!( 311 + // needs_from_node(node).unwrap(), 312 + // vec![ 313 + // Need::Record { 314 + // rkey: "asdf".into(), 315 + // cid: cid1(), 316 + // }, 317 + // Need::Node(cid2()), 318 + // ] 319 + // ); 320 + // } 321 + 322 + // #[test] 323 + // fn test_needs_from_node_left_and_record() { 324 + // let node = Node { 325 + // left: Some(cid1()), 326 + // entries: vec![Entry { 327 + // keysuffix: "asdf".into(), 328 + // prefix_len: 0, 329 + // value: cid2(), 330 + // tree: None, 331 + // }], 332 + // }; 333 + // assert_eq!( 334 + // needs_from_node(node).unwrap(), 335 + // vec![ 336 + // Need::Node(cid1()), 337 + // Need::Record { 338 + // rkey: "asdf".into(), 339 + // cid: cid2(), 340 + // }, 341 + // ] 342 + // ); 343 + // } 344 + 345 + // #[test] 346 + // fn test_needs_from_full_node() { 347 + // let node = Node { 348 + // left: Some(cid1()), 349 + // entries: vec![ 350 + // Entry { 351 + // keysuffix: "asdf".into(), 352 + // prefix_len: 0, 353 + // value: cid2(), 354 + // tree: Some(cid3()), 355 + // }, 356 + // Entry { 357 + // keysuffix: "ghi".into(), 358 + // prefix_len: 1, 359 + // value: cid4(), 360 + // tree: Some(cid5()), 361 + // }, 362 + // Entry { 363 + // keysuffix: "jkl".into(), 364 + // prefix_len: 2, 365 + // value: cid6(), 366 + // tree: Some(cid7()), 367 + // }, 368 + // Entry { 369 + // keysuffix: "mno".into(), 370 + // prefix_len: 4, 371 + // value: cid8(), 372 + // tree: Some(cid9()), 373 + // }, 374 + // ], 375 + // }; 376 + // assert_eq!( 377 + // needs_from_node(node).unwrap(), 378 + // vec![ 379 + // Need::Node(cid1()), 380 + // Need::Record { 381 + // rkey: "asdf".into(), 382 + // cid: cid2(), 383 + // }, 384 + // Need::Node(cid3()), 385 + // Need::Record { 386 + // rkey: "aghi".into(), 387 + // cid: cid4(), 388 + // }, 389 + // Need::Node(cid5()), 390 + // Need::Record { 391 + // rkey: "agjkl".into(), 392 + // cid: cid6(), 393 + // }, 394 + // Need::Node(cid7()), 395 + // Need::Record { 396 + // rkey: "agjkmno".into(), 397 + // cid: cid8(), 398 + // }, 399 + // Need::Node(cid9()), 400 + // ] 401 + // ); 402 + // } 403 + }
+3
src/lib.rs
··· 2 2 //! 3 3 //! For now see the [examples](https://tangled.org/@microcosm.blue/repo-stream/tree/main/examples) 4 4 5 + pub mod disk_drive; 6 + pub mod disk_redb; 7 + pub mod disk_walk; 5 8 pub mod drive; 6 9 pub mod mst; 7 10 pub mod walk;