bindle — an efficient binary archive format.
Benchmark script (358 lines, 11 kB).
#!/usr/bin/env -S uv run
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///
"""
Benchmark comparing bindle vs tar/tar.gz/zip for archive operations.

Measures:
- Archive creation time
- Archive size
- Extraction/read time
"""

import random
import subprocess
import tempfile
import time
from pathlib import Path


def format_size(bytes: int) -> str:
    """Format a byte count as a human readable string (e.g. "1.5 KB")."""
    # Work on a float copy: integer //= division truncated fractional
    # units, so 1536 bytes rendered as "1.0 KB" instead of "1.5 KB".
    size = float(bytes)
    for unit in ["B", "KB", "MB", "GB"]:
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} TB"


def format_time(seconds: float) -> str:
    """Format seconds as human readable string (µs / ms / s)."""
    if seconds < 0.001:
        return f"{seconds * 1_000_000:.1f} µs"
    elif seconds < 1:
        return f"{seconds * 1000:.1f} ms"
    else:
        return f"{seconds:.3f} s"


def verify_extraction(src_dir: Path, extract_dir: Path) -> None:
    """Verify extracted files match source files.

    Compares the relative-path file sets, then sizes, then full contents.

    Raises:
        ValueError: if the file sets, any size, or any content differ.
    """
    src_files = {f.relative_to(src_dir): f for f in src_dir.rglob("*") if f.is_file()}
    extract_files = {f.relative_to(extract_dir): f for f in extract_dir.rglob("*") if f.is_file()}

    # Check file count
    if len(src_files) != len(extract_files):
        raise ValueError(f"File count mismatch: {len(src_files)} source, {len(extract_files)} extracted")

    # Check each file exists and has correct size
    for rel_path, src_file in src_files.items():
        if rel_path not in extract_files:
            raise ValueError(f"Missing file in extraction: {rel_path}")

        extract_file = extract_files[rel_path]
        src_size = src_file.stat().st_size
        extract_size = extract_file.stat().st_size

        # Cheap size check first; only read contents when sizes agree.
        if src_size != extract_size:
            raise ValueError(f"Size mismatch for {rel_path}: {src_size} vs {extract_size}")

        # Verify content matches
        if src_file.read_bytes() != extract_file.read_bytes():
            raise ValueError(f"Content mismatch for {rel_path}")


def create_test_data(base_dir: Path) -> None:
    """Create a variety of test files under *base_dir*.

    The mix covers highly compressible text, repetitive binary data,
    one large file, and incompressible pseudo-random data.
    """
    base_dir.mkdir(parents=True, exist_ok=True)

    # Small text files (highly compressible)
    for i in range(100):
        (base_dir / f"text_{i}.txt").write_text(
            f"This is test file {i}\n" * 100
        )

    # Medium files with repetitive data
    for i in range(10):
        (base_dir / f"medium_{i}.dat").write_bytes(
            bytes([i % 256] * 100_000)
        )

    # Large file with repetitive data
    (base_dir / "large.dat").write_bytes(b"X" * 10_000_000)

    # Binary-like data (less compressible); fixed seed keeps runs comparable
    random.seed(42)
    (base_dir / "random.bin").write_bytes(
        bytes(random.randint(0, 255) for _ in range(1_000_000))
    )


def _benchmark_bindle(
    bindle_bin: Path, src_dir: Path, archive_path: Path, extra_args: list[str]
) -> tuple[float, int, float]:
    """Shared pack/unpack/verify driver for the two bindle benchmarks."""
    # Pack
    start = time.perf_counter()
    subprocess.run(
        [str(bindle_bin), "pack", str(archive_path), str(src_dir), *extra_args],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    pack_time = time.perf_counter() - start

    size = archive_path.stat().st_size

    # Unpack (directory creation happens outside the timed region)
    extract_dir = archive_path.parent / f"extract_{archive_path.stem}"
    extract_dir.mkdir(exist_ok=True)
    start = time.perf_counter()
    subprocess.run(
        [str(bindle_bin), "unpack", str(archive_path), str(extract_dir)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    unpack_time = time.perf_counter() - start

    # Verify extraction (not timed)
    verify_extraction(src_dir, extract_dir)

    return pack_time, size, unpack_time


def benchmark_bindle_uncompressed(bindle_bin: Path, src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark bindle without compression."""
    return _benchmark_bindle(bindle_bin, src_dir, archive_path, [])


def benchmark_bindle_compressed(bindle_bin: Path, src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark bindle with zstd compression."""
    return _benchmark_bindle(bindle_bin, src_dir, archive_path, ["--compress"])


def _benchmark_tar_cli(
    src_dir: Path, archive_path: Path, create_flags: str, extract_flags: str
) -> tuple[float, int, float]:
    """Shared create/extract/verify driver for the tar-based benchmarks."""
    # Create
    start = time.perf_counter()
    subprocess.run(
        ["tar", create_flags, str(archive_path), "-C", str(src_dir), "."],
        capture_output=True,
        check=True,
    )
    pack_time = time.perf_counter() - start

    size = archive_path.stat().st_size

    # Extract
    extract_dir = archive_path.parent / f"extract_{archive_path.stem}"
    extract_dir.mkdir(exist_ok=True)
    start = time.perf_counter()
    subprocess.run(
        ["tar", extract_flags, str(archive_path), "-C", str(extract_dir)],
        capture_output=True,
        check=True,
    )
    unpack_time = time.perf_counter() - start

    # Verify extraction (not timed)
    verify_extraction(src_dir, extract_dir)

    return pack_time, size, unpack_time


def benchmark_tar(src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark tar (uncompressed) using CLI."""
    return _benchmark_tar_cli(src_dir, archive_path, "-cf", "-xf")


def benchmark_tar_gz(src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark tar.gz using CLI."""
    return _benchmark_tar_cli(src_dir, archive_path, "-czf", "-xzf")


def benchmark_zip(src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark zip using CLI."""
    # Create — zip stores paths relative to the working directory, so run
    # it with cwd=src_dir.  Using a list (not `sh -c` with interpolated
    # paths) is robust against spaces/shell metacharacters in the paths.
    start = time.perf_counter()
    subprocess.run(
        ["zip", "-r", "-q", str(archive_path), "."],
        cwd=src_dir,
        capture_output=True,
        check=True,
    )
    pack_time = time.perf_counter() - start

    size = archive_path.stat().st_size

    # Extract
    extract_dir = archive_path.parent / f"extract_{archive_path.stem}"
    extract_dir.mkdir(exist_ok=True)
    start = time.perf_counter()
    subprocess.run(
        ["unzip", "-o", "-q", str(archive_path), "-d", str(extract_dir)],
        capture_output=True,
        check=True,
    )
    unpack_time = time.perf_counter() - start

    # Verify extraction (not timed)
    verify_extraction(src_dir, extract_dir)

    return pack_time, size, unpack_time


def main():
    """Build bindle, generate test data, and benchmark all formats."""
    project_root = Path(__file__).parent.parent

    print("Building bindle in release mode...")
    subprocess.run(
        ["cargo", "build", "--release", "--features", "cli"],
        cwd=project_root,
        capture_output=True,
        check=True,
    )

    # Get the built binary path
    bindle_bin = project_root / "target" / "release" / "bindle"
    if not bindle_bin.exists():
        raise FileNotFoundError(f"Built binary not found at {bindle_bin}")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Ensure directories exist and warm up filesystem
        test_data = tmpdir / "test_data"
        test_data.mkdir(parents=True, exist_ok=True)

        # Warm up: write and delete a small file to initialize filesystem
        warmup_file = tmpdir / "warmup"
        warmup_file.write_bytes(b"warmup" * 1000)
        warmup_file.unlink()

        # Create test data
        print("Creating test dataset...")
        create_test_data(test_data)

        # Warm up: read all test files to initialize filesystem caches
        for f in test_data.rglob("*"):
            if f.is_file():
                _ = f.read_bytes()

        # Calculate total size and count — both restricted to regular files
        # so the reported count matches what the size sums over.
        total_size = sum(f.stat().st_size for f in test_data.rglob("*") if f.is_file())
        file_count = sum(1 for f in test_data.rglob("*") if f.is_file())

        print(f"Test dataset: {file_count} files, {format_size(total_size)}\n")

        benchmarks = [
            ("bindle (uncompressed)", lambda run: benchmark_bindle_uncompressed(
                bindle_bin, test_data, tmpdir / f"test_{run}.bndl"
            )),
            ("bindle (zstd)", lambda run: benchmark_bindle_compressed(
                bindle_bin, test_data, tmpdir / f"test_zstd_{run}.bndl"
            )),
            ("tar", lambda run: benchmark_tar(
                test_data, tmpdir / f"test_{run}.tar"
            )),
            ("tar.gz", lambda run: benchmark_tar_gz(
                test_data, tmpdir / f"test_{run}.tar.gz"
            )),
            ("zip", lambda run: benchmark_zip(
                test_data, tmpdir / f"test_{run}.zip"
            )),
        ]

        results = []
        num_runs = 4  # Run each test 4 times, discard first, average remaining 3

        for name, bench_fn in benchmarks:
            print(f"Benchmarking {name}...", flush=True)
            try:
                pack_times = []
                unpack_times = []
                size = 0

                for run in range(num_runs):
                    pack_time, run_size, unpack_time = bench_fn(run)
                    pack_times.append(pack_time)
                    unpack_times.append(unpack_time)
                    size = run_size

                # Discard first run (cold caches), average the rest
                avg_pack = sum(pack_times[1:]) / (num_runs - 1)
                avg_unpack = sum(unpack_times[1:]) / (num_runs - 1)

                results.append((name, avg_pack, size, avg_unpack))
            except subprocess.CalledProcessError as e:
                print(f"  ERROR: Command failed with exit code {e.returncode}")
                if e.stderr:
                    print(f"  stderr: {e.stderr.decode()}")
                results.append((name, 0, 0, 0))
            except Exception as e:
                print(f"  ERROR: {e}")
                results.append((name, 0, 0, 0))

        # Print results
        print("\n" + "=" * 90)
        print(f"{'Format':<22} {'Pack Time':<15} {'Size':<15} {'Unpack Time':<15} {'Ratio':>10}")
        print("=" * 90)

        for name, pack_time, size, unpack_time in results:
            if size > 0:
                ratio = (size / total_size) * 100
                print(
                    f"{name:<22} {format_time(pack_time):<15} "
                    f"{format_size(size):<15} {format_time(unpack_time):<15} "
                    f"{ratio:>9.1f}%"
                )
            else:
                print(f"{name:<22} {'FAILED'}")

        print("=" * 90)


if __name__ == "__main__":
    main()