bindle — an efficient binary archive format.
Benchmark script (358 lines, 11 kB).
#!/usr/bin/env -S uv run
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///
"""
Benchmark comparing bindle vs tar/tar.gz/zip for archive operations.

Measures:
- Archive creation time
- Archive size
- Extraction/read time
"""

import random
import subprocess
import tempfile
import time
from pathlib import Path


def format_size(bytes: int) -> str:
    """Format a byte count as a human readable string (e.g. "1.5 KB")."""
    # Work on a float copy: integer //= division truncated fractional
    # units, so 1536 bytes rendered as "1.0 KB" instead of "1.5 KB".
    size = float(bytes)
    for unit in ["B", "KB", "MB", "GB"]:
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} TB"


def format_time(seconds: float) -> str:
    """Format seconds as human readable string (µs / ms / s)."""
    if seconds < 0.001:
        return f"{seconds * 1_000_000:.1f} µs"
    elif seconds < 1:
        return f"{seconds * 1000:.1f} ms"
    else:
        return f"{seconds:.3f} s"


def verify_extraction(src_dir: Path, extract_dir: Path) -> None:
    """Verify extracted files match source files.

    Compares the relative-path file sets, then sizes, then full contents.

    Raises:
        ValueError: if the file sets, any size, or any content differ.
    """
    src_files = {f.relative_to(src_dir): f for f in src_dir.rglob("*") if f.is_file()}
    extract_files = {f.relative_to(extract_dir): f for f in extract_dir.rglob("*") if f.is_file()}

    # Check file count
    if len(src_files) != len(extract_files):
        raise ValueError(f"File count mismatch: {len(src_files)} source, {len(extract_files)} extracted")

    # Check each file exists and has correct size
    for rel_path, src_file in src_files.items():
        if rel_path not in extract_files:
            raise ValueError(f"Missing file in extraction: {rel_path}")

        extract_file = extract_files[rel_path]
        src_size = src_file.stat().st_size
        extract_size = extract_file.stat().st_size

        # Cheap size check first; only read contents when sizes agree.
        if src_size != extract_size:
            raise ValueError(f"Size mismatch for {rel_path}: {src_size} vs {extract_size}")

        # Verify content matches
        if src_file.read_bytes() != extract_file.read_bytes():
            raise ValueError(f"Content mismatch for {rel_path}")


def create_test_data(base_dir: Path) -> None:
    """Create a variety of test files under *base_dir*.

    The mix covers highly compressible text, repetitive binary data,
    one large file, and incompressible pseudo-random data.
    """
    base_dir.mkdir(parents=True, exist_ok=True)

    # Small text files (highly compressible)
    for i in range(100):
        (base_dir / f"text_{i}.txt").write_text(
            f"This is test file {i}\n" * 100
        )

    # Medium files with repetitive data
    for i in range(10):
        (base_dir / f"medium_{i}.dat").write_bytes(
            bytes([i % 256] * 100_000)
        )

    # Large file with repetitive data
    (base_dir / "large.dat").write_bytes(b"X" * 10_000_000)

    # Binary-like data (less compressible); fixed seed keeps runs comparable
    random.seed(42)
    (base_dir / "random.bin").write_bytes(
        bytes(random.randint(0, 255) for _ in range(1_000_000))
    )


def _benchmark_bindle(
    bindle_bin: Path, src_dir: Path, archive_path: Path, extra_args: list[str]
) -> tuple[float, int, float]:
    """Shared pack/unpack/verify driver for the two bindle benchmarks."""
    # Pack
    start = time.perf_counter()
    subprocess.run(
        [str(bindle_bin), "pack", str(archive_path), str(src_dir), *extra_args],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    pack_time = time.perf_counter() - start

    size = archive_path.stat().st_size

    # Unpack (directory creation happens outside the timed region)
    extract_dir = archive_path.parent / f"extract_{archive_path.stem}"
    extract_dir.mkdir(exist_ok=True)
    start = time.perf_counter()
    subprocess.run(
        [str(bindle_bin), "unpack", str(archive_path), str(extract_dir)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    unpack_time = time.perf_counter() - start

    # Verify extraction (not timed)
    verify_extraction(src_dir, extract_dir)

    return pack_time, size, unpack_time


def benchmark_bindle_uncompressed(bindle_bin: Path, src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark bindle without compression."""
    return _benchmark_bindle(bindle_bin, src_dir, archive_path, [])


def benchmark_bindle_compressed(bindle_bin: Path, src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark bindle with zstd compression."""
    return _benchmark_bindle(bindle_bin, src_dir, archive_path, ["--compress"])


def _benchmark_tar_cli(
    src_dir: Path, archive_path: Path, create_flags: str, extract_flags: str
) -> tuple[float, int, float]:
    """Shared create/extract/verify driver for the tar-based benchmarks."""
    # Create
    start = time.perf_counter()
    subprocess.run(
        ["tar", create_flags, str(archive_path), "-C", str(src_dir), "."],
        capture_output=True,
        check=True,
    )
    pack_time = time.perf_counter() - start

    size = archive_path.stat().st_size

    # Extract
    extract_dir = archive_path.parent / f"extract_{archive_path.stem}"
    extract_dir.mkdir(exist_ok=True)
    start = time.perf_counter()
    subprocess.run(
        ["tar", extract_flags, str(archive_path), "-C", str(extract_dir)],
        capture_output=True,
        check=True,
    )
    unpack_time = time.perf_counter() - start

    # Verify extraction (not timed)
    verify_extraction(src_dir, extract_dir)

    return pack_time, size, unpack_time


def benchmark_tar(src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark tar (uncompressed) using CLI."""
    return _benchmark_tar_cli(src_dir, archive_path, "-cf", "-xf")


def benchmark_tar_gz(src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark tar.gz using CLI."""
    return _benchmark_tar_cli(src_dir, archive_path, "-czf", "-xzf")


def benchmark_zip(src_dir: Path, archive_path: Path) -> tuple[float, int, float]:
    """Benchmark zip using CLI."""
    # Create — zip stores paths relative to the working directory, so run
    # it with cwd=src_dir.  Using a list (not `sh -c` with interpolated
    # paths) is robust against spaces/shell metacharacters in the paths.
    start = time.perf_counter()
    subprocess.run(
        ["zip", "-r", "-q", str(archive_path), "."],
        cwd=src_dir,
        capture_output=True,
        check=True,
    )
    pack_time = time.perf_counter() - start

    size = archive_path.stat().st_size

    # Extract
    extract_dir = archive_path.parent / f"extract_{archive_path.stem}"
    extract_dir.mkdir(exist_ok=True)
    start = time.perf_counter()
    subprocess.run(
        ["unzip", "-o", "-q", str(archive_path), "-d", str(extract_dir)],
        capture_output=True,
        check=True,
    )
    unpack_time = time.perf_counter() - start

    # Verify extraction (not timed)
    verify_extraction(src_dir, extract_dir)

    return pack_time, size, unpack_time


def main():
    """Build bindle, generate test data, and benchmark all formats."""
    project_root = Path(__file__).parent.parent

    print("Building bindle in release mode...")
    subprocess.run(
        ["cargo", "build", "--release", "--features", "cli"],
        cwd=project_root,
        capture_output=True,
        check=True,
    )

    # Get the built binary path
    bindle_bin = project_root / "target" / "release" / "bindle"
    if not bindle_bin.exists():
        raise FileNotFoundError(f"Built binary not found at {bindle_bin}")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Ensure directories exist and warm up filesystem
        test_data = tmpdir / "test_data"
        test_data.mkdir(parents=True, exist_ok=True)

        # Warm up: write and delete a small file to initialize filesystem
        warmup_file = tmpdir / "warmup"
        warmup_file.write_bytes(b"warmup" * 1000)
        warmup_file.unlink()

        # Create test data
        print("Creating test dataset...")
        create_test_data(test_data)

        # Warm up: read all test files to initialize filesystem caches
        for f in test_data.rglob("*"):
            if f.is_file():
                _ = f.read_bytes()

        # Calculate total size and count — both restricted to regular files
        # so the reported count matches what the size sums over.
        total_size = sum(f.stat().st_size for f in test_data.rglob("*") if f.is_file())
        file_count = sum(1 for f in test_data.rglob("*") if f.is_file())

        print(f"Test dataset: {file_count} files, {format_size(total_size)}\n")

        benchmarks = [
            ("bindle (uncompressed)", lambda run: benchmark_bindle_uncompressed(
                bindle_bin, test_data, tmpdir / f"test_{run}.bndl"
            )),
            ("bindle (zstd)", lambda run: benchmark_bindle_compressed(
                bindle_bin, test_data, tmpdir / f"test_zstd_{run}.bndl"
            )),
            ("tar", lambda run: benchmark_tar(
                test_data, tmpdir / f"test_{run}.tar"
            )),
            ("tar.gz", lambda run: benchmark_tar_gz(
                test_data, tmpdir / f"test_{run}.tar.gz"
            )),
            ("zip", lambda run: benchmark_zip(
                test_data, tmpdir / f"test_{run}.zip"
            )),
        ]

        results = []
        num_runs = 4  # Run each test 4 times, discard first, average remaining 3

        for name, bench_fn in benchmarks:
            print(f"Benchmarking {name}...", flush=True)
            try:
                pack_times = []
                unpack_times = []
                size = 0

                for run in range(num_runs):
                    pack_time, run_size, unpack_time = bench_fn(run)
                    pack_times.append(pack_time)
                    unpack_times.append(unpack_time)
                    size = run_size

                # Discard first run (cold caches), average the rest
                avg_pack = sum(pack_times[1:]) / (num_runs - 1)
                avg_unpack = sum(unpack_times[1:]) / (num_runs - 1)

                results.append((name, avg_pack, size, avg_unpack))
            except subprocess.CalledProcessError as e:
                print(f"  ERROR: Command failed with exit code {e.returncode}")
                if e.stderr:
                    print(f"  stderr: {e.stderr.decode()}")
                results.append((name, 0, 0, 0))
            except Exception as e:
                print(f"  ERROR: {e}")
                results.append((name, 0, 0, 0))

        # Print results
        print("\n" + "=" * 90)
        print(f"{'Format':<22} {'Pack Time':<15} {'Size':<15} {'Unpack Time':<15} {'Ratio':>10}")
        print("=" * 90)

        for name, pack_time, size, unpack_time in results:
            if size > 0:
                ratio = (size / total_size) * 100
                print(
                    f"{name:<22} {format_time(pack_time):<15} "
                    f"{format_size(size):<15} {format_time(unpack_time):<15} "
                    f"{ratio:>9.1f}%"
                )
            else:
                print(f"{name:<22} {'FAILED'}")

        print("=" * 90)


if __name__ == "__main__":
    main()