An efficient binary archive format.
1use crc32fast::Hasher; 2use memmap2::Mmap; 3use std::borrow::Cow; 4use std::collections::BTreeMap; 5use std::fs::{File, OpenOptions}; 6use std::io::{self, BufWriter, Read, Seek, SeekFrom, Write}; 7use std::path::{Path, PathBuf}; 8use zerocopy::{FromBytes, IntoBytes}; 9 10use crate::compress::Compress; 11use crate::entry::{Entry, Footer}; 12use crate::reader::{Either, Reader}; 13use crate::writer::Writer; 14use crate::{ 15 AUTO_COMPRESS_THRESHOLD, BNDL_ALIGN, BNDL_MAGIC, ENTRY_SIZE, FOOTER_MAGIC, FOOTER_SIZE, 16 HEADER_SIZE, pad, write_padding, 17}; 18 19/// A binary archive for collecting files. 20/// 21/// Uses memory-mapped I/O for fast reads, supports optional zstd compression, and handles updates via shadowing. 22/// Files can be added incrementally without rewriting the entire archive. 23/// 24/// # Example 25/// 26/// ```no_run 27/// use bindle_file::{Bindle, Compress}; 28/// 29/// let mut archive = Bindle::open("data.bndl")?; 30/// archive.add("file.txt", b"data", Compress::None)?; 31/// archive.save()?; 32/// # Ok::<(), std::io::Error>(()) 33/// ``` 34pub struct Bindle { 35 pub(crate) path: PathBuf, 36 pub(crate) file: File, 37 pub(crate) mmap: Option<Mmap>, 38 pub(crate) index: BTreeMap<String, Entry>, 39 pub(crate) data_end: u64, 40} 41 42impl Bindle { 43 /// Creates a new archive, overwriting any existing file at the path. 44 pub fn create<P: AsRef<Path>>(path: P) -> io::Result<Self> { 45 let path_buf = path.as_ref().to_path_buf(); 46 let opts = OpenOptions::new() 47 .truncate(true) 48 .read(true) 49 .write(true) 50 .create(true) 51 .to_owned(); 52 Self::new(path_buf, opts) 53 } 54 55 /// Opens an existing archive or creates a new one if it doesn't exist. 56 pub fn open<P: AsRef<Path>>(path: P) -> io::Result<Self> { 57 let path_buf = path.as_ref().to_path_buf(); 58 let opts = OpenOptions::new() 59 .read(true) 60 .write(true) 61 .create(true) 62 .to_owned(); 63 Self::new(path_buf, opts) 64 } 65 66 /// Opens an existing archive. 
Returns an error if the file doesn't exist. 67 pub fn load<P: AsRef<Path>>(path: P) -> io::Result<Self> { 68 let path_buf = path.as_ref().to_path_buf(); 69 let opts = OpenOptions::new().read(true).write(true).to_owned(); 70 Self::new(path_buf, opts) 71 } 72 73 /// Create a new `Bindle` from a path and file, the path must match the file 74 pub fn new(path: PathBuf, opts: OpenOptions) -> io::Result<Self> { 75 let mut file = opts.open(&path)?; 76 file.lock_shared()?; 77 let len = file.metadata()?.len(); 78 79 // Handle completely new/empty files 80 if len == 0 { 81 file.write_all(BNDL_MAGIC)?; 82 return Ok(Self { 83 path, 84 file, 85 mmap: None, 86 index: BTreeMap::new(), 87 data_end: HEADER_SIZE as u64, 88 }); 89 } 90 91 // Safety check: File must be at least HEADER + FOOTER size (24 bytes) 92 // This prevents "attempt to subtract with overflow" when calculating footer_pos 93 if len < (HEADER_SIZE + FOOTER_SIZE) as u64 { 94 return Err(io::Error::new( 95 io::ErrorKind::InvalidData, 96 "File too small to be a valid bindle", 97 )); 98 } 99 100 let mut header = [0u8; 8]; 101 file.read_exact(&mut header)?; 102 if &header != BNDL_MAGIC { 103 return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid header")); 104 } 105 106 let m = unsafe { Mmap::map(&file)? }; 107 108 // Calculate footer position. Subtraction is now safe due to the check above. 
109 let footer_pos = m.len() - FOOTER_SIZE; 110 let footer = Footer::read_from_bytes(&m[footer_pos..]) 111 .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "Failed to read footer"))?; 112 113 if footer.magic() != FOOTER_MAGIC { 114 return Err(io::Error::new( 115 io::ErrorKind::InvalidData, 116 "Invalid footer, the file may be corrupt", 117 )); 118 } 119 120 let data_end = footer.index_offset(); 121 let count = footer.entry_count(); 122 let mut index = BTreeMap::new(); 123 124 let mut cursor = data_end as usize; 125 for _ in 0..count { 126 // Ensure there is enough data left for an Entry header 127 if cursor + ENTRY_SIZE > footer_pos { 128 break; 129 } 130 131 let entry = match Entry::read_from_bytes(&m[cursor..cursor + ENTRY_SIZE]) { 132 Ok(e) => e, 133 Err(_) => break, // Corrupted entry, stop reading 134 }; 135 let n_start = cursor + ENTRY_SIZE; 136 137 // Validate that the filename exists within the mapped bounds 138 if n_start + entry.name_len() > footer_pos { 139 break; 140 } 141 142 let name = 143 String::from_utf8_lossy(&m[n_start..n_start + entry.name_len()]).into_owned(); 144 index.insert(name, entry); 145 146 let total = ENTRY_SIZE + entry.name_len(); 147 cursor += (total + (BNDL_ALIGN - 1)) & !(BNDL_ALIGN - 1); 148 } 149 150 Ok(Self { 151 path, 152 file, 153 mmap: Some(m), 154 index, 155 data_end, 156 }) 157 } 158 159 fn should_auto_compress(&self, compress: Compress, len: usize) -> bool { 160 compress == Compress::Zstd || (compress == Compress::Auto && len > AUTO_COMPRESS_THRESHOLD) 161 } 162 163 /// Adds data to the archive with the given name. 164 /// 165 /// If an entry with the same name exists, it will be shadowed. Call [`save()`](Bindle::save) to commit changes. 166 pub fn add(&mut self, name: &str, data: &[u8], compress: Compress) -> io::Result<()> { 167 let mut stream = self.writer(name, compress)?; 168 stream.write_all(data)?; 169 stream.close()?; 170 Ok(()) 171 } 172 173 /// Adds a file from the filesystem to the archive. 
174 /// 175 /// Reads the file at `path` and stores it with the given `name`. Call [`save()`](Bindle::save) to commit changes. 176 pub fn add_file( 177 &mut self, 178 name: &str, 179 path: impl AsRef<Path>, 180 compress: Compress, 181 ) -> io::Result<()> { 182 let mut stream = self.writer(name, compress)?; 183 let mut src = std::fs::File::open(path)?; 184 std::io::copy(&mut src, &mut stream)?; 185 Ok(()) 186 } 187 188 /// Commits all pending changes by writing the index and footer to disk. 189 /// 190 /// Must be called after add/remove operations to make changes persistent. 191 pub fn save(&mut self) -> io::Result<()> { 192 self.file.lock()?; 193 self.file.seek(SeekFrom::Start(self.data_end))?; 194 let index_start = self.data_end; 195 196 // Use buffered writer to batch index writes 197 { 198 let mut writer = BufWriter::new(&mut self.file); 199 for (name, entry) in &self.index { 200 writer.write_all(entry.as_bytes())?; 201 writer.write_all(name.as_bytes())?; 202 let pad = pad::<BNDL_ALIGN, usize>(ENTRY_SIZE + name.len()); 203 if pad > 0 { 204 write_padding(&mut writer, pad)?; 205 } 206 } 207 208 let footer = Footer::new(index_start, self.index.len() as u32, FOOTER_MAGIC); 209 writer.write_all(footer.as_bytes())?; 210 writer.flush()?; 211 } // Drop writer here to release borrow 212 213 // Truncate file to current position to remove any old data 214 let current_pos = self.file.stream_position()?; 215 self.file.set_len(current_pos)?; 216 217 let mmap = unsafe { Mmap::map(&self.file)? }; 218 self.mmap = Some(mmap); 219 self.file.lock_shared()?; 220 Ok(()) 221 } 222 223 /// Reclaims space by removing shadowed data. 224 /// 225 /// Rebuilds the archive with only live entries, removing old versions of updated files. 
226 pub fn vacuum(&mut self) -> io::Result<()> { 227 let temp_path = self.path.with_extension("tmp"); 228 229 // Create temp file and keep handle to reuse after rename 230 let mut temp_file = OpenOptions::new() 231 .write(true) 232 .read(true) 233 .create(true) 234 .truncate(true) 235 .open(&temp_path)?; 236 237 temp_file.lock()?; 238 temp_file.write_all(BNDL_MAGIC)?; 239 let mut current_offset = HEADER_SIZE as u64; 240 241 // Copy only live entries from original to temp 242 for entry in self.index.values_mut() { 243 self.file.seek(SeekFrom::Start(entry.offset()))?; 244 temp_file.seek(SeekFrom::Start(current_offset))?; 245 246 // Stream data without allocating full buffer 247 let mut limited = (&mut self.file).take(entry.compressed_size()); 248 io::copy(&mut limited, &mut temp_file)?; 249 250 entry.set_offset(current_offset); 251 let pad = pad::<8, u64>(entry.compressed_size()); 252 if pad > 0 { 253 write_padding(&mut temp_file, pad as usize)?; 254 } 255 current_offset += entry.compressed_size() + pad; 256 } 257 258 // Write the index and footer 259 let index_start = current_offset; 260 for (name, entry) in &self.index { 261 temp_file.write_all(entry.as_bytes())?; 262 temp_file.write_all(name.as_bytes())?; 263 let pad = pad::<BNDL_ALIGN, usize>(ENTRY_SIZE + name.len()); 264 if pad > 0 { 265 write_padding(&mut temp_file, pad)?; 266 } 267 } 268 269 let footer = Footer::new(index_start, self.index.len() as u32, FOOTER_MAGIC); 270 temp_file.write_all(footer.as_bytes())?; 271 temp_file.sync_all()?; 272 273 // Acquire exclusive lock just before rename to prevent concurrent access 274 self.file.lock()?; 275 276 // Release locks and close current file 277 drop(self.mmap.take()); 278 let _ = self.file.unlock(); 279 280 // Atomically replace original with temp 281 std::fs::rename(&temp_path, &self.path)?; 282 283 // Reuse temp_file handle (still valid after rename) 284 temp_file.lock_shared()?; 285 let mmap = unsafe { Mmap::map(&temp_file)? 
}; 286 287 let footer_pos = mmap.len() - FOOTER_SIZE; 288 let footer = Footer::read_from_bytes(&mmap[footer_pos..]).map_err(|_| { 289 io::Error::new( 290 io::ErrorKind::InvalidData, 291 "Failed to read footer after vacuum", 292 ) 293 })?; 294 295 self.file = temp_file; 296 self.mmap = Some(mmap); 297 self.data_end = footer.index_offset(); 298 299 Ok(()) 300 } 301 302 /// Reads an entry from the archive, decompressing if needed. 303 /// 304 /// Returns `None` if the entry doesn't exist or if CRC32 verification fails. 305 pub fn read<'a>(&'a self, name: &str) -> Option<Cow<'a, [u8]>> { 306 let entry = self.index.get(name)?; 307 let mmap = self.mmap.as_ref()?; 308 309 let data = if entry.compression_type() == Compress::Zstd { 310 let compressed_data = mmap.get( 311 entry.offset() as usize..(entry.offset() + entry.compressed_size()) as usize, 312 )?; 313 let mut out = Vec::with_capacity(entry.uncompressed_size() as usize); 314 zstd::Decoder::new(compressed_data) 315 .ok()? 316 .read_to_end(&mut out) 317 .ok()?; 318 Cow::Owned(out) 319 } else { 320 let uncompressed_data = mmap.get( 321 entry.offset() as usize..(entry.offset() + entry.uncompressed_size()) as usize, 322 )?; 323 Cow::Borrowed(uncompressed_data) 324 }; 325 326 // Verify CRC32 327 let computed_crc = crc32fast::hash(&data); 328 if computed_crc != entry.crc32() { 329 return None; 330 } 331 332 Some(data) 333 } 334 335 /// Reads an entry into a provided buffer, avoiding allocation. 336 /// 337 /// Decompresses if needed and verifies CRC32. Returns the number of bytes read. 338 /// If the buffer is too small, only reads up to buffer.len() bytes. 
339 /// 340 /// # Example 341 /// 342 /// ```no_run 343 /// use bindle_file::Bindle; 344 /// 345 /// let archive = Bindle::open("data.bndl")?; 346 /// let mut buffer = vec![0u8; 1024]; 347 /// let bytes_read = archive.read_into("file.txt", &mut buffer)?; 348 /// # Ok::<(), std::io::Error>(()) 349 /// ``` 350 pub fn read_into(&self, name: &str, buffer: &mut [u8]) -> io::Result<usize> { 351 let mut reader = self.reader(name)?; 352 let bytes_read = reader.read(buffer)?; 353 reader.verify_crc32()?; 354 Ok(bytes_read) 355 } 356 357 /// Reads an entry and writes it to the given writer. 358 /// 359 /// Returns the number of bytes written. Verifies CRC32 after reading. 360 pub fn read_to<W: std::io::Write>(&self, name: &str, mut w: W) -> std::io::Result<u64> { 361 let mut reader = self.reader(name)?; 362 let bytes_copied = std::io::copy(&mut reader, &mut w)?; 363 reader.verify_crc32()?; 364 Ok(bytes_copied) 365 } 366 367 /// Returns a streaming reader for an entry. 368 /// 369 /// Automatically decompresses if the entry is compressed. Call [`Reader::verify_crc32()`] after reading to verify integrity. 
370 pub fn reader<'a>(&'a self, name: &str) -> io::Result<Reader<'a>> { 371 let entry = self 372 .index 373 .get(name) 374 .ok_or_else(|| io::Error::new(io::ErrorKind::NotFound, "Entry not found"))?; 375 376 let start = entry.offset() as usize; 377 let end = start + entry.compressed_size() as usize; 378 let mmap = self 379 .mmap 380 .as_ref() 381 .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing mmap"))?; 382 let data_slice = &mmap[start..end]; 383 384 let cursor = io::Cursor::new(data_slice); 385 386 if entry.compression_type() == Compress::Zstd { 387 // Zstd streaming decoder 388 let decoder = zstd::Decoder::new(cursor)?; 389 Ok(Reader { 390 decoder: Either::Left(decoder), 391 crc32_hasher: Hasher::new(), 392 expected_crc32: entry.crc32(), 393 }) 394 } else { 395 Ok(Reader { 396 decoder: Either::Right(cursor), 397 crc32_hasher: Hasher::new(), 398 expected_crc32: entry.crc32(), 399 }) 400 } 401 } 402 403 /// Returns the number of entries in the archive. 404 pub fn len(&self) -> usize { 405 self.index.len() 406 } 407 408 /// Returns true if the archive contains no entries. 409 pub fn is_empty(&self) -> bool { 410 self.index.is_empty() 411 } 412 413 /// Returns a reference to the archive index. 414 /// 415 /// The index maps entry names to their metadata. 416 pub fn index(&self) -> &BTreeMap<String, Entry> { 417 &self.index 418 } 419 420 /// Removes all entries from the index. 421 /// 422 /// Call [`save()`](Bindle::save) to commit. Data remains in the file until [`vacuum()`](Bindle::vacuum) is called. 423 pub fn clear(&mut self) { 424 self.index.clear() 425 } 426 427 /// Returns true if an entry with the given name exists. 428 pub fn exists(&self, name: &str) -> bool { 429 self.index.contains_key(name) 430 } 431 432 /// Removes an entry from the index. 433 /// 434 /// Returns true if the entry existed. Data remains in the file until [`vacuum()`](Bindle::vacuum) is called. 
435 pub fn remove(&mut self, name: &str) -> bool { 436 self.index.remove(name).is_some() 437 } 438 439 /// Recursively adds all files from a directory to the archive. 440 /// 441 /// File paths are stored relative to the source directory. Call [`save()`](Bindle::save) to commit. 442 pub fn pack<P: AsRef<Path>>(&mut self, src_dir: P, compress: Compress) -> io::Result<()> { 443 self.pack_recursive(src_dir.as_ref(), src_dir.as_ref(), compress) 444 } 445 446 fn pack_recursive( 447 &mut self, 448 base: &Path, 449 current: &Path, 450 compress: Compress, 451 ) -> io::Result<()> { 452 if current.is_dir() { 453 for entry in std::fs::read_dir(current)? { 454 self.pack_recursive(base, &entry?.path(), compress)?; 455 } 456 } else { 457 let name = current 458 .strip_prefix(base) 459 .map_err(|e| io::Error::new(io::ErrorKind::Other, e))? 460 .to_str() 461 .unwrap_or_default(); 462 self.add_file(&name, current, compress)?; 463 } 464 Ok(()) 465 } 466 467 /// Extracts all entries to a destination directory. 468 /// 469 /// Creates subdirectories as needed to match the stored paths. 
470 pub fn unpack<P: AsRef<Path>>(&self, dest: P) -> io::Result<()> { 471 let dest_path = dest.as_ref(); 472 std::fs::create_dir_all(dest_path)?; 473 474 // Collect all unique parent directories 475 let mut dirs = std::collections::HashSet::new(); 476 for (name, _) in &self.index { 477 if let Some(parent) = Path::new(name).parent() { 478 // Only add non-empty parent paths 479 if parent != Path::new("") { 480 dirs.insert(dest_path.join(parent)); 481 } 482 } 483 } 484 485 // Create all directories upfront (sorted for parent-first order) 486 if !dirs.is_empty() { 487 let mut dirs: Vec<_> = dirs.into_iter().collect(); 488 dirs.sort(); 489 for dir in dirs { 490 std::fs::create_dir_all(&dir)?; 491 } 492 } 493 494 // Sort entries by physical offset for sequential reads (better cache locality) 495 let mut entries: Vec<_> = self.index.iter().collect(); 496 entries.sort_by_key(|(_, entry)| entry.offset()); 497 498 // Extract files without per-file directory checks 499 for (name, _) in entries { 500 let file_path = dest_path.join(name); 501 let mut reader = self.reader(name)?; 502 let mut file = File::create(&file_path)?; 503 io::copy(&mut reader, &mut file)?; 504 reader.verify_crc32()?; 505 } 506 Ok(()) 507 } 508 509 /// Creates a streaming writer for adding an entry. 510 /// 511 /// The writer must be closed and then [`save()`](Bindle::save) must be called to commit the entry. 512 pub fn writer<'a>(&'a mut self, name: &str, compress: Compress) -> io::Result<Writer<'a>> { 513 self.file.lock()?; 514 // Only seek if not already at the correct position 515 let current_pos = self.file.stream_position()?; 516 if current_pos != self.data_end { 517 self.file.seek(SeekFrom::Start(self.data_end))?; 518 } 519 let compress = self.should_auto_compress(compress, 0); 520 let start_offset = self.data_end; 521 let encoder = if compress { 522 let f = self.file.try_clone()?; 523 Some(zstd::Encoder::new(f, 3)?) 
524 } else { 525 None 526 }; 527 Ok(Writer { 528 name: name.to_string(), 529 bindle: self, 530 encoder, 531 start_offset, 532 uncompressed_size: 0, 533 crc32_hasher: Hasher::new(), 534 }) 535 } 536} 537 538impl Drop for Bindle { 539 fn drop(&mut self) { 540 let _ = self.file.unlock(); 541 } 542}