Fast and robust atproto CAR file processing in Rust

add builder-style constructors for driver + store
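At a glance, the old positional constructors versus the builder equivalents (a rough sketch based on the updated examples below; `reader`, `path`, and `process` are stand-ins, not names from this diff):

    // before: positional arguments
    let driver = Driver::load_car(reader, process, 16 /* MiB */).await?;
    let store = DiskStore::new(path, 32 /* MiB cache */).await?;

    // after: builder-style, defaults made explicit
    let driver = DriverBuilder::new()
        .with_mem_limit_mb(16)
        .with_block_processor(process)
        .load_car(reader)
        .await?;
    let store = DiskBuilder::new()
        .with_cache_size_mb(32)
        .open(path)
        .await?;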

Changed files: +185 -35
examples/disk-read-file/main.rs (+7 -10)
@@ -4,7 +4,7 @@
 
 extern crate repo_stream;
 use clap::Parser;
-use repo_stream::{DiskStore, Driver, process::noop};
+use repo_stream::{DiskBuilder, Driver, DriverBuilder};
 use std::path::PathBuf;
 
 #[derive(Debug, Parser)]
@@ -26,25 +26,22 @@
     let reader = tokio::fs::File::open(car).await?;
     let reader = tokio::io::BufReader::new(reader);
 
-    // configure how much memory can be used before spilling to disk.
-    // real memory usage may differ somewhat.
-    let in_mem_limit = 10; // MiB
-
-    // configure how much memory sqlite is allowed to use when dumping to disk
-    let db_cache_mb = 32; // MiB
-
     log::info!("hello! reading the car...");
 
     // in this example we only bother handling CARs that are too big for memory
    // `noop` helper means: do no block processing, store the raw blocks
-    let driver = match Driver::load_car(reader, noop, in_mem_limit).await? {
+    let driver = match DriverBuilder::new()
+        .with_mem_limit_mb(10) // how much memory can be used before disk spill
+        .load_car(reader)
+        .await?
+    {
         Driver::Memory(_, _) => panic!("try this on a bigger car"),
         Driver::Disk(big_stuff) => {
             // we reach here if the repo was too big and needs to be spilled to
             // disk to continue
 
             // set up a disk store we can spill to
-            let disk_store = DiskStore::new(tmpfile.clone(), db_cache_mb).await?;
+            let disk_store = DiskBuilder::new().open(tmpfile).await?;
 
             // do the spilling, get back a (similar) driver
             let (commit, driver) = big_stuff.finish_loading(disk_store).await?;
examples/read-file/main.rs (+9 -6)
@@ -4,7 +4,7 @@
 
 extern crate repo_stream;
 use clap::Parser;
-use repo_stream::Driver;
+use repo_stream::{Driver, DriverBuilder};
 use std::path::PathBuf;
 
 type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
@@ -23,11 +23,14 @@
     let reader = tokio::fs::File::open(file).await?;
     let reader = tokio::io::BufReader::new(reader);
 
-    let (commit, mut driver) =
-        match Driver::load_car(reader, |block| block.len(), 16 /* MiB */).await? {
-            Driver::Memory(commit, mem_driver) => (commit, mem_driver),
-            Driver::Disk(_) => panic!("this example doesn't handle big CARs"),
-        };
+    let (commit, mut driver) = match DriverBuilder::new()
+        .with_block_processor(|block| block.len())
+        .load_car(reader)
+        .await?
+    {
+        Driver::Memory(commit, mem_driver) => (commit, mem_driver),
+        Driver::Disk(_) => panic!("this example doesn't handle big CARs"),
+    };
 
     log::info!("got commit: {commit:?}");
 
src/disk.rs (+84 -6)
@@ -5,11 +5,13 @@
 to be the best behaved in terms of both on-disk space usage and memory usage.
 
 ```no_run
-# use repo_stream::{DiskStore, DiskError};
+# use repo_stream::{DiskBuilder, DiskError};
 # #[tokio::main]
 # async fn main() -> Result<(), DiskError> {
-let db_cache_size = 32; // MiB
-let store = DiskStore::new("/some/path.db".into(), db_cache_size).await?;
+let store = DiskBuilder::new()
+    .with_cache_size_mb(32)
+    .with_max_stored_mb(1024) // errors when >1GiB of processed blocks are inserted
+    .open("/some/path.db".into()).await?;
 # Ok(())
 # }
 ```
@@ -30,6 +32,12 @@
     /// A tokio blocking task failed to join
     #[error("Failed to join a tokio blocking task: {0}")]
     JoinError(#[from] tokio::task::JoinError),
+    /// The total size of stored blocks exceeded the allowed size
+    ///
+    /// If you need to process *really* big CARs, you can configure a higher
+    /// limit.
+    #[error("Maximum disk size reached")]
+    MaxSizeExceeded,
     #[error("this error was replaced, seeing this is a bug.")]
     #[doc(hidden)]
     Stolen,
@@ -44,14 +52,70 @@
     }
 }
 
+/// Builder-style disk store setup
+pub struct DiskBuilder {
+    /// Database in-memory cache allowance
+    ///
+    /// Default: 32 MiB
+    pub cache_size_mb: usize,
+    /// Database stored block size limit
+    ///
+    /// Default: 10 GiB
+    ///
+    /// Note: actual size on disk may be more, but should approximately scale
+    /// with this limit
+    pub max_stored_mb: usize,
+}
+
+impl Default for DiskBuilder {
+    fn default() -> Self {
+        Self {
+            cache_size_mb: 32,
+            max_stored_mb: 10 * 1024, // 10 GiB
+        }
+    }
+}
+
+impl DiskBuilder {
+    /// Begin configuring the storage with defaults
+    pub fn new() -> Self {
+        Default::default()
+    }
+    /// Set the in-memory cache allowance for the database
+    ///
+    /// Default: 32 MiB
+    pub fn with_cache_size_mb(mut self, size: usize) -> Self {
+        self.cache_size_mb = size;
+        self
+    }
+    /// Set the approximate stored block size limit
+    ///
+    /// Default: 10 GiB
+    pub fn with_max_stored_mb(mut self, max: usize) -> Self {
+        self.max_stored_mb = max;
+        self
+    }
+    /// Open and initialize the actual disk storage
+    pub async fn open(self, path: PathBuf) -> Result<DiskStore, DiskError> {
+        DiskStore::new(path, self.cache_size_mb, self.max_stored_mb).await
+    }
+}
+
 /// On-disk block storage
 pub struct DiskStore {
     conn: rusqlite::Connection,
+    max_stored: usize,
+    stored: usize,
 }
 
 impl DiskStore {
     /// Initialize a new disk store
-    pub async fn new(path: PathBuf, cache_mb: usize) -> Result<Self, DiskError> {
+    pub async fn new(
+        path: PathBuf,
+        cache_mb: usize,
+        max_stored_mb: usize,
+    ) -> Result<Self, DiskError> {
+        let max_stored = max_stored_mb * 2_usize.pow(20);
         let conn = tokio::task::spawn_blocking(move || {
             let conn = rusqlite::Connection::open(path)?;
 
@@ -73,11 +137,19 @@
         })
         .await??;
 
-        Ok(Self { conn })
+        Ok(Self {
+            conn,
+            max_stored,
+            stored: 0,
+        })
     }
     pub(crate) fn get_writer(&'_ mut self) -> Result<SqliteWriter<'_>, DiskError> {
         let tx = self.conn.transaction()?;
-        Ok(SqliteWriter { tx })
+        Ok(SqliteWriter {
+            tx,
+            stored: &mut self.stored,
+            max: self.max_stored,
+        })
     }
     pub(crate) fn get_reader<'conn>(&'conn self) -> Result<SqliteReader<'conn>, DiskError> {
         let select_stmt = self.conn.prepare("SELECT val FROM blocks WHERE key = ?1")?;
@@ -106,6 +178,8 @@
 
 pub(crate) struct SqliteWriter<'conn> {
     tx: rusqlite::Transaction<'conn>,
+    stored: &'conn mut usize,
+    max: usize,
 }
 
 impl SqliteWriter<'_> {
@@ -119,6 +193,10 @@
             .map_err(DiskError::DbError)?;
         for pair in kv {
             let (k, v) = pair?;
+            *self.stored += v.len();
+            if *self.stored > self.max {
+                return Err(DiskError::MaxSizeExceeded.into());
+            }
             insert_stmt.execute((k, v)).map_err(DiskError::DbError)?;
         }
         Ok(())
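Worth noting: the new `max_stored_mb` cap is accounted against the cumulative byte length of inserted block values (`*self.stored += v.len()`), so actual on-disk size may differ, as the field docs say. A minimal sketch of configuring the builder; the paths and the 64 MiB figure here are illustrative, not from this diff:

    use repo_stream::{DiskBuilder, DiskError};

    #[tokio::main]
    async fn main() -> Result<(), DiskError> {
        // defaults per the builder: 32 MiB sqlite cache, 10 GiB stored-block cap
        let _defaults = DiskBuilder::new().open("/tmp/blocks.db".into()).await?;

        // a deliberately tight cap: spilling more than ~64 MiB of processed
        // blocks into this store should hit DiskError::MaxSizeExceeded
        let _tight = DiskBuilder::new()
            .with_cache_size_mb(8)
            .with_max_stored_mb(64)
            .open("/tmp/small-blocks.db".into())
            .await?;
        Ok(())
    }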
src/drive.rs (+75 -5)
@@ -115,22 +115,92 @@
     Disk(NeedDisk<R, T>),
 }
 
+/// Builder-style driver setup
+pub struct DriverBuilder {
+    pub mem_limit_mb: usize,
+}
+
+impl Default for DriverBuilder {
+    fn default() -> Self {
+        Self { mem_limit_mb: 16 }
+    }
+}
+
+impl DriverBuilder {
+    /// Begin configuring the driver with defaults
+    pub fn new() -> Self {
+        Default::default()
+    }
+    /// Set the in-memory size limit, in MiB
+    ///
+    /// Default: 16 MiB
+    pub fn with_mem_limit_mb(self, new_limit: usize) -> Self {
+        Self {
+            mem_limit_mb: new_limit,
+        }
+    }
+    /// Set the block processor
+    ///
+    /// Default: noop, raw blocks will be emitted
+    pub fn with_block_processor<T: Processable>(
+        self,
+        p: fn(Vec<u8>) -> T,
+    ) -> DriverBuilderWithProcessor<T> {
+        DriverBuilderWithProcessor {
+            mem_limit_mb: self.mem_limit_mb,
+            block_processor: p,
+        }
+    }
+    /// Begin processing an atproto MST from a CAR file
+    pub async fn load_car<R: AsyncRead + Unpin>(
+        self,
+        reader: R,
+    ) -> Result<Driver<R, Vec<u8>>, DriveError> {
+        Driver::load_car(reader, crate::process::noop, self.mem_limit_mb).await
+    }
+}
+
+/// Builder-style driver intermediate step
+///
+/// start from `DriverBuilder`
+pub struct DriverBuilderWithProcessor<T: Processable> {
+    pub mem_limit_mb: usize,
+    pub block_processor: fn(Vec<u8>) -> T,
+}
+
+impl<T: Processable> DriverBuilderWithProcessor<T> {
+    /// Set the in-memory size limit, in MiB
+    ///
+    /// Default: 16 MiB
+    pub fn with_mem_limit_mb(mut self, new_limit: usize) -> Self {
+        self.mem_limit_mb = new_limit;
+        self
+    }
+    /// Begin processing an atproto MST from a CAR file
+    pub async fn load_car<R: AsyncRead + Unpin>(
+        self,
+        reader: R,
+    ) -> Result<Driver<R, T>, DriveError> {
+        Driver::load_car(reader, self.block_processor, self.mem_limit_mb).await
+    }
+}
+
 impl<R: AsyncRead + Unpin, T: Processable> Driver<R, T> {
     /// Begin processing an atproto MST from a CAR file
     ///
     /// Blocks will be loaded, processed, and buffered in memory. If the entire
-    /// processed size is under the `max_size_mb` limit, a `Driver::Memory` will
-    /// be returned along with a `Commit` ready for validation.
+    /// processed size is under the `mem_limit_mb` limit, a `Driver::Memory`
+    /// will be returned along with a `Commit` ready for validation.
     ///
-    /// If the `max_size_mb` limit is reached before loading all blocks, the
+    /// If the `mem_limit_mb` limit is reached before loading all blocks, the
     /// partial state will be returned as `Driver::Disk(needed)`, which can be
     /// resumed by providing a `SqliteStorage` for on-disk block storage.
     pub async fn load_car(
         reader: R,
         process: fn(Vec<u8>) -> T,
-        max_size_mb: usize,
+        mem_limit_mb: usize,
     ) -> Result<Driver<R, T>, DriveError> {
-        let max_size = max_size_mb * 2_usize.pow(20);
+        let max_size = mem_limit_mb * 2_usize.pow(20);
         let mut mem_blocks = HashMap::new();
 
         let mut car = CarReader::new(reader).await?;
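The two-struct split acts as a small typestate: `DriverBuilder::load_car` defaults to `noop` and yields `Driver<R, Vec<u8>>`, while `with_block_processor` hands off to `DriverBuilderWithProcessor<T>` so the driver's item type follows the processor's return type. A sketch, reusing the in-memory byte-slice reader from the lib.rs doc test:

    use repo_stream::{DriveError, Driver, DriverBuilder};

    #[tokio::main]
    async fn main() -> Result<(), DriveError> {
        // no processor configured: blocks come back as raw Vec<u8>
        let reader = include_bytes!("../car-samples/tiny.car").as_slice();
        let _raw: Driver<_, Vec<u8>> = DriverBuilder::new().load_car(reader).await?;

        // a (non-capturing) processor fixes the item type to its return type
        let reader = include_bytes!("../car-samples/tiny.car").as_slice();
        let _sized: Driver<_, usize> = DriverBuilder::new()
            .with_block_processor(|block| block.len())
            .load_car(reader)
            .await?;
        Ok(())
    }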
src/lib.rs (+10 -8)
@@ -18,17 +18,19 @@
 `iroh_car` additionally applies a block size limit of `2MiB`.
 
 ```
-use repo_stream::{Driver, DiskStore};
+use repo_stream::{Driver, DriverBuilder, DiskBuilder};
 
 # #[tokio::main]
 # async fn main() -> Result<(), Box<dyn std::error::Error>> {
 # let reader = include_bytes!("../car-samples/tiny.car").as_slice();
 let mut total_size = 0;
-let process = |rec: Vec<u8>| rec.len(); // block processing: just extract the size
-let in_mem_limit = 10; /* MiB */
-let db_cache_size = 32; /* MiB */
 
-match Driver::load_car(reader, process, in_mem_limit).await? {
+match DriverBuilder::new()
+    .with_mem_limit_mb(10)
+    .with_block_processor(|rec| rec.len()) // block processing: just extract the raw record size
+    .load_car(reader)
+    .await?
+{
 
     // if all blocks fit within memory
     Driver::Memory(_commit, mut driver) => {
@@ -42,6 +44,6 @@
     // if the CAR was too big for in-memory processing
     Driver::Disk(paused) => {
         // set up a disk store we can spill to
-        let store = DiskStore::new("some/path.db".into(), db_cache_size).await?;
+        let store = DiskBuilder::new().open("some/path.db".into()).await?;
         // do the spilling, get back a (similar) driver
         let (_commit, mut driver) = paused.finish_loading(store).await?;
@@ -79,7 +81,7 @@
 pub mod drive;
 pub mod process;
 
-pub use disk::{DiskError, DiskStore};
-pub use drive::{DriveError, Driver};
+pub use disk::{DiskBuilder, DiskError, DiskStore};
+pub use drive::{DriveError, Driver, DriverBuilder};
 pub use mst::Commit;
 pub use process::Processable;