# bindle: an efficient binary archive format
1#!/usr/bin/env -S uv run
2# /// script
3# requires-python = ">=3.11"
4# dependencies = []
5# ///
6"""
7Benchmark comparing bindle vs tar/tar.gz/zip for archive operations.
8
9Measures:
10- Archive creation time
11- Archive size
12- Extraction/read time
13"""
14
15import subprocess
16import tempfile
17import time
18from pathlib import Path
19
20
def format_size(bytes: int) -> str:
    """Format a byte count as a human-readable string, e.g. "1.5 KB".

    Units are 1024-based. The parameter name shadows the ``bytes`` builtin
    but is kept for backward compatibility with keyword callers; work is
    done on a float local instead.
    """
    size = float(bytes)
    for unit in ["B", "KB", "MB", "GB"]:
        if size < 1024:
            return f"{size:.1f} {unit}"
        # True division (not //=) so fractional values like 1.5 KB survive.
        size /= 1024
    return f"{size:.1f} TB"
28
29
def format_time(seconds: float) -> str:
    """Render a duration in the most natural unit: µs, ms, or s."""
    if seconds >= 1:
        return f"{seconds:.3f} s"
    if seconds >= 0.001:
        return f"{seconds * 1000:.1f} ms"
    return f"{seconds * 1_000_000:.1f} µs"
38
39
def verify_extraction(src_dir: Path, extract_dir: Path) -> None:
    """Raise ValueError unless extract_dir mirrors src_dir exactly.

    Compares the set of regular files (keyed by path relative to each
    root), then each file's size and full byte content.
    """

    def _index(root: Path) -> dict[Path, Path]:
        # Map each file's root-relative path to its absolute path.
        return {p.relative_to(root): p for p in root.rglob("*") if p.is_file()}

    originals = _index(src_dir)
    extracted = _index(extract_dir)

    if len(originals) != len(extracted):
        raise ValueError(f"File count mismatch: {len(originals)} source, {len(extracted)} extracted")

    for rel_path, original in originals.items():
        if rel_path not in extracted:
            raise ValueError(f"Missing file in extraction: {rel_path}")

        copy = extracted[rel_path]
        want = original.stat().st_size
        got = copy.stat().st_size
        if want != got:
            raise ValueError(f"Size mismatch for {rel_path}: {want} vs {got}")

        # Sizes can match while bytes differ, so compare full contents too.
        if original.read_bytes() != copy.read_bytes():
            raise ValueError(f"Content mismatch for {rel_path}")
64
65
def create_test_data(base_dir: Path) -> None:
    """Populate base_dir with a mix of compressible and incompressible files."""
    base_dir.mkdir(parents=True, exist_ok=True)

    # 100 small, highly compressible text files.
    for idx in range(100):
        content = f"This is test file {idx}\n" * 100
        (base_dir / f"text_{idx}.txt").write_text(content)

    # 10 medium files, each one byte value repeated (trivially compressible).
    for idx in range(10):
        payload = bytes([idx % 256] * 100_000)
        (base_dir / f"medium_{idx}.dat").write_bytes(payload)

    # One large file of a single repeated byte.
    (base_dir / "large.dat").write_bytes(b"X" * 10_000_000)

    # One pseudo-random (poorly compressible) file; fixed seed keeps the
    # dataset identical across benchmark invocations.
    import random
    random.seed(42)
    noise = bytes(random.randint(0, 255) for _ in range(1_000_000))
    (base_dir / "random.bin").write_bytes(noise)
91
92
def benchmark_bindle_uncompressed(bindle_bin: Path, src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark bindle without compression.

    Returns (pack_seconds, archive_size_bytes, unpack_seconds).
    """

    def _run(*args: str) -> None:
        # Discard tool output so timing measures the work, not the console.
        subprocess.run(
            [str(bindle_bin), *args],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

    t0 = time.perf_counter()
    _run("pack", str(archive_path), str(src_dir))
    pack_time = time.perf_counter() - t0

    archive_size = archive_path.stat().st_size

    target = archive_path.parent / f"extract_{archive_path.stem}"
    target.mkdir(exist_ok=True)

    t0 = time.perf_counter()
    _run("unpack", str(archive_path), str(target))
    unpack_time = time.perf_counter() - t0

    # Correctness check deliberately sits outside the timed sections.
    verify_extraction(src_dir, target)

    return pack_time, archive_size, unpack_time
123
124
def benchmark_bindle_compressed(bindle_bin: Path, src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark bindle with zstd compression.

    Returns (pack_seconds, archive_size_bytes, unpack_seconds).
    """

    def _run(*args: str) -> None:
        # Discard tool output so timing measures the work, not the console.
        subprocess.run(
            [str(bindle_bin), *args],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

    t0 = time.perf_counter()
    _run("pack", str(archive_path), str(src_dir), "--compress")
    pack_time = time.perf_counter() - t0

    archive_size = archive_path.stat().st_size

    target = archive_path.parent / f"extract_{archive_path.stem}"
    target.mkdir(exist_ok=True)

    t0 = time.perf_counter()
    _run("unpack", str(archive_path), str(target))
    unpack_time = time.perf_counter() - t0

    # Correctness check deliberately sits outside the timed sections.
    verify_extraction(src_dir, target)

    return pack_time, archive_size, unpack_time
155
156
def benchmark_tar(src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark tar (uncompressed) using the CLI.

    Returns (pack_seconds, archive_size_bytes, unpack_seconds).
    """

    def _tar(*args: str) -> None:
        subprocess.run(["tar", *args], capture_output=True, check=True)

    # Pack from inside src_dir (-C) so archive members are relative paths.
    t0 = time.perf_counter()
    _tar("-cf", str(archive_path), "-C", str(src_dir), ".")
    pack_time = time.perf_counter() - t0

    archive_size = archive_path.stat().st_size

    dest = archive_path.parent / f"extract_{archive_path.stem}"
    dest.mkdir(exist_ok=True)

    t0 = time.perf_counter()
    _tar("-xf", str(archive_path), "-C", str(dest))
    unpack_time = time.perf_counter() - t0

    # Correctness check deliberately sits outside the timed sections.
    verify_extraction(src_dir, dest)

    return pack_time, archive_size, unpack_time
185
186
def benchmark_tar_gz(src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark tar.gz using the CLI.

    Returns (pack_seconds, archive_size_bytes, unpack_seconds).
    """

    def _tar(*args: str) -> None:
        subprocess.run(["tar", *args], capture_output=True, check=True)

    # -z adds gzip on top of the plain tar benchmark's flags.
    t0 = time.perf_counter()
    _tar("-czf", str(archive_path), "-C", str(src_dir), ".")
    pack_time = time.perf_counter() - t0

    archive_size = archive_path.stat().st_size

    dest = archive_path.parent / f"extract_{archive_path.stem}"
    dest.mkdir(exist_ok=True)

    t0 = time.perf_counter()
    _tar("-xzf", str(archive_path), "-C", str(dest))
    unpack_time = time.perf_counter() - t0

    # Correctness check deliberately sits outside the timed sections.
    verify_extraction(src_dir, dest)

    return pack_time, archive_size, unpack_time
215
216
def benchmark_zip(src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark zip using the CLI.

    Returns (pack_seconds, archive_size_bytes, unpack_seconds).
    """
    # zip stores paths relative to the CWD, so run it with cwd=src_dir
    # instead of interpolating paths into an `sh -c` string — the old
    # f-string broke on paths containing spaces or shell metacharacters.
    # The archive path is resolved because it is opened relative to cwd.
    start = time.perf_counter()
    subprocess.run(
        ["zip", "-r", "-q", str(archive_path.resolve()), "."],
        cwd=src_dir,
        capture_output=True,
        check=True,
    )
    pack_time = time.perf_counter() - start

    size = archive_path.stat().st_size

    # Extract
    extract_dir = archive_path.parent / f"extract_{archive_path.stem}"
    extract_dir.mkdir(exist_ok=True)
    start = time.perf_counter()
    subprocess.run(
        ["unzip", "-o", "-q", str(archive_path), "-d", str(extract_dir)],
        capture_output=True,
        check=True,
    )
    unpack_time = time.perf_counter() - start

    # Verify extraction (not timed)
    verify_extraction(src_dir, extract_dir)

    return pack_time, size, unpack_time
245
246
def main():
    """Build bindle, generate a test dataset, and benchmark archive formats."""
    project_root = Path(__file__).parent.parent

    print("Building bindle in release mode...")
    subprocess.run(
        ["cargo", "build", "--release", "--features", "cli"],
        cwd=project_root,
        capture_output=True,
        check=True,
    )

    # Locate the binary cargo just built.
    bindle_bin = project_root / "target" / "release" / "bindle"
    if not bindle_bin.exists():
        raise FileNotFoundError(f"Built binary not found at {bindle_bin}")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Ensure directories exist and warm up filesystem
        test_data = tmpdir / "test_data"
        test_data.mkdir(parents=True, exist_ok=True)

        # Warm up: write and delete a small file to initialize filesystem
        warmup_file = tmpdir / "warmup"
        warmup_file.write_bytes(b"warmup" * 1000)
        warmup_file.unlink()

        # Create test data
        print("Creating test dataset...")
        create_test_data(test_data)

        # Warm up: read all test files to initialize filesystem caches
        for f in test_data.rglob("*"):
            if f.is_file():
                _ = f.read_bytes()

        # Dataset stats; count files only so the count matches total_size.
        total_size = sum(f.stat().st_size for f in test_data.rglob("*") if f.is_file())
        file_count = sum(1 for f in test_data.rglob("*") if f.is_file())

        print(f"Test dataset: {file_count} files, {format_size(total_size)}\n")

        # Archive basenames are unique per format: the benchmark helpers
        # derive their extraction dir from the archive *stem*, so shared
        # names like "test_0.bndl" / "test_0.tar" / "test_0.zip" would all
        # extract into the same "extract_test_0" directory, letting earlier
        # formats' output pollute later formats' unpack timings.
        benchmarks = [
            ("bindle (uncompressed)", lambda run: benchmark_bindle_uncompressed(
                bindle_bin, test_data, tmpdir / f"bindle_{run}.bndl"
            )),
            ("bindle (zstd)", lambda run: benchmark_bindle_compressed(
                bindle_bin, test_data, tmpdir / f"bindle_zstd_{run}.bndl"
            )),
            ("tar", lambda run: benchmark_tar(
                test_data, tmpdir / f"tar_{run}.tar"
            )),
            ("tar.gz", lambda run: benchmark_tar_gz(
                test_data, tmpdir / f"targz_{run}.tar.gz"
            )),
            ("zip", lambda run: benchmark_zip(
                test_data, tmpdir / f"zip_{run}.zip"
            )),
        ]

        results = []
        num_runs = 4  # Run each test 4 times, discard first, average remaining 3

        for name, bench_fn in benchmarks:
            print(f"Benchmarking {name}...", flush=True)
            try:
                pack_times = []
                unpack_times = []
                size = 0

                for run in range(num_runs):
                    pack_time, run_size, unpack_time = bench_fn(run)
                    pack_times.append(pack_time)
                    unpack_times.append(unpack_time)
                    size = run_size

                # Discard the first (cold-cache) run, average the rest
                avg_pack = sum(pack_times[1:]) / (num_runs - 1)
                avg_unpack = sum(unpack_times[1:]) / (num_runs - 1)

                results.append((name, avg_pack, size, avg_unpack))
            except subprocess.CalledProcessError as e:
                print(f"  ERROR: Command failed with exit code {e.returncode}")
                # stderr is None for tools run with DEVNULL; only decode when captured
                if e.stderr:
                    print(f"  stderr: {e.stderr.decode()}")
                results.append((name, 0, 0, 0))
            except Exception as e:
                print(f"  ERROR: {e}")
                results.append((name, 0, 0, 0))

        # Print results table; ratio is archive size relative to raw input.
        print("\n" + "=" * 90)
        print(f"{'Format':<22} {'Pack Time':<15} {'Size':<15} {'Unpack Time':<15} {'Ratio':>10}")
        print("=" * 90)

        for name, pack_time, size, unpack_time in results:
            if size > 0:
                ratio = (size / total_size) * 100
                print(
                    f"{name:<22} {format_time(pack_time):<15} "
                    f"{format_size(size):<15} {format_time(unpack_time):<15} "
                    f"{ratio:>9.1f}%"
                )
            else:
                print(f"{name:<22} {'FAILED'}")

        print("=" * 90)
355
356
# Script entry point: run the full benchmark suite when executed directly.
if __name__ == "__main__":
    main()