Fast and robust atproto CAR file processing in rust

chunked walking

Changed files
+36 -6
src
+1 -1
Cargo.toml
··· 20 serde_bytes = "0.11.19" 21 serde_ipld_dagcbor = "0.6.4" 22 thiserror = "2.0.17" 23 - tokio = "1.47.1" 24 25 [dev-dependencies] 26 clap = { version = "4.5.48", features = ["derive"] }
··· 20 serde_bytes = "0.11.19" 21 serde_ipld_dagcbor = "0.6.4" 22 thiserror = "2.0.17" 23 + tokio = { version = "1.47.1", features = ["rt"] } 24 25 [dev-dependencies] 26 clap = { version = "4.5.48", features = ["derive"] }
+32 -4
src/disk_drive.rs
··· 1 use futures::Stream; 2 use futures::TryStreamExt; 3 use std::error::Error; 4 5 use crate::disk_walk::{Step, Trip, Walker}; ··· 47 block_store: BS, 48 walker: Walker, 49 process: P, 50 } 51 52 impl<SE, S, T, BS, P, PE> Vehicle<SE, S, T, BS, P, PE> ··· 120 block_store, 121 walker, 122 process, 123 }; 124 Ok((commit, me)) 125 } 126 127 /// Manually step through the record outputs 128 pub async fn next_record(&mut self) -> Result<Option<(String, T)>, DriveError> { 129 - match self.walker.step(&mut self.block_store, &self.process)? { 130 - Step::Rest(cid) => Err(DriveError::MissingBlock(cid)), 131 - Step::Finish => Ok(None), 132 - Step::Step { rkey, data } => Ok(Some((rkey, data))), 133 } 134 } 135 136 /// Convert to a futures::stream of record outputs
··· 1 use futures::Stream; 2 use futures::TryStreamExt; 3 + use std::collections::VecDeque; 4 use std::error::Error; 5 6 use crate::disk_walk::{Step, Trip, Walker}; ··· 48 block_store: BS, 49 walker: Walker, 50 process: P, 51 + out_cache: VecDeque<(String, T)>, 52 } 53 54 impl<SE, S, T, BS, P, PE> Vehicle<SE, S, T, BS, P, PE> ··· 122 block_store, 123 walker, 124 process, 125 + out_cache: VecDeque::new(), 126 }; 127 Ok((commit, me)) 128 } 129 130 + async fn load_chunk(&mut self, n: usize) -> Result<(), DriveError> { 131 + self.out_cache.reserve(n); 132 + for _ in 0..n { 133 + let item = match self.walker.step(&mut self.block_store, &self.process)? { 134 + Step::Step { rkey, data } => (rkey, data), 135 + Step::Finish => break, 136 + Step::Rest(cid) => return Err(DriveError::MissingBlock(cid)), 137 + }; 138 + self.out_cache.push_back(item); 139 + } 140 + Ok(()) 141 + } 142 + 143 + /// Get a chunk of records at a time 144 + /// 145 + /// the number of returned records may be smaller or larger than requested 146 + /// (but non-zero), even if it's not the last chunk. 147 + /// 148 + /// an empty vec will be returned to signal the end. 149 + pub async fn next_chunk(&mut self, n: usize) -> Result<Vec<(String, T)>, DriveError> { 150 + if self.out_cache.is_empty() { 151 + self.load_chunk(n).await?; 152 + } 153 + Ok(std::mem::take(&mut self.out_cache).into()) 154 + } 155 + 156 /// Manually step through the record outputs 157 pub async fn next_record(&mut self) -> Result<Option<(String, T)>, DriveError> { 158 + if self.out_cache.is_empty() { 159 + self.load_chunk(64).await?; // TODO 160 } 161 + Ok(self.out_cache.pop_front()) 162 } 163 164 /// Convert to a futures::stream of record outputs
+3 -1
src/disk_redb.rs
··· 45 } 46 47 tx.commit().unwrap(); 48 - }).await.unwrap(); 49 } 50 51 fn get(&self, c: Cid) -> Option<Vec<u8>> {
··· 45 } 46 47 tx.commit().unwrap(); 48 + }) 49 + .await 50 + .unwrap(); 51 } 52 53 fn get(&self, c: Cid) -> Option<Vec<u8>> {