pydantic model generator for atproto lexicons

refactor: move lexicon hashing to rust

- hash_lexicons is now implemented in rust with the sha2 crate
- exposed to python via pyo3
- update justfile to use uvx maturin

Changed files
+180 -36
python
pmgfal
src
+111
Cargo.lock
··· 34 34 checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" 35 35 36 36 [[package]] 37 + name = "block-buffer" 38 + version = "0.10.4" 39 + source = "registry+https://github.com/rust-lang/crates.io-index" 40 + checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" 41 + dependencies = [ 42 + "generic-array", 43 + ] 44 + 45 + [[package]] 37 46 name = "bumpalo" 38 47 version = "3.19.0" 39 48 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 72 81 version = "0.8.7" 73 82 source = "registry+https://github.com/rust-lang/crates.io-index" 74 83 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" 84 + 85 + [[package]] 86 + name = "cpufeatures" 87 + version = "0.2.17" 88 + source = "registry+https://github.com/rust-lang/crates.io-index" 89 + checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" 90 + dependencies = [ 91 + "libc", 92 + ] 93 + 94 + [[package]] 95 + name = "crypto-common" 96 + version = "0.1.7" 97 + source = "registry+https://github.com/rust-lang/crates.io-index" 98 + checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" 99 + dependencies = [ 100 + "generic-array", 101 + "typenum", 102 + ] 75 103 76 104 [[package]] 77 105 name = "darling" ··· 119 147 ] 120 148 121 149 [[package]] 150 + name = "digest" 151 + version = "0.10.7" 152 + source = "registry+https://github.com/rust-lang/crates.io-index" 153 + checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" 154 + dependencies = [ 155 + "block-buffer", 156 + "crypto-common", 157 + ] 158 + 159 + [[package]] 122 160 name = "find-msvc-tools" 123 161 version = "0.1.5" 124 162 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 131 169 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" 132 170 133 171 [[package]] 172 + name = "generic-array" 173 + version = "0.14.7" 174 + source = 
"registry+https://github.com/rust-lang/crates.io-index" 175 + checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" 176 + dependencies = [ 177 + "typenum", 178 + "version_check", 179 + ] 180 + 181 + [[package]] 134 182 name = "hashbrown" 135 183 version = "0.12.3" 136 184 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 268 316 dependencies = [ 269 317 "atrium-lex", 270 318 "heck", 319 + "hex", 271 320 "pyo3", 272 321 "serde", 273 322 "serde_json", 323 + "sha2", 274 324 "thiserror", 325 + "walkdir", 275 326 ] 276 327 277 328 [[package]] ··· 378 429 checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 379 430 380 431 [[package]] 432 + name = "same-file" 433 + version = "1.0.6" 434 + source = "registry+https://github.com/rust-lang/crates.io-index" 435 + checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" 436 + dependencies = [ 437 + "winapi-util", 438 + ] 439 + 440 + [[package]] 381 441 name = "serde" 382 442 version = "1.0.228" 383 443 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 460 520 ] 461 521 462 522 [[package]] 523 + name = "sha2" 524 + version = "0.10.9" 525 + source = "registry+https://github.com/rust-lang/crates.io-index" 526 + checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" 527 + dependencies = [ 528 + "cfg-if", 529 + "cpufeatures", 530 + "digest", 531 + ] 532 + 533 + [[package]] 463 534 name = "shlex" 464 535 version = "1.3.0" 465 536 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 540 611 ] 541 612 542 613 [[package]] 614 + name = "typenum" 615 + version = "1.19.0" 616 + source = "registry+https://github.com/rust-lang/crates.io-index" 617 + checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" 618 + 619 + [[package]] 543 620 name = "unicode-ident" 544 621 version = "1.0.22" 545 622 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 552 629 
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" 553 630 554 631 [[package]] 632 + name = "version_check" 633 + version = "0.9.5" 634 + source = "registry+https://github.com/rust-lang/crates.io-index" 635 + checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" 636 + 637 + [[package]] 638 + name = "walkdir" 639 + version = "2.5.0" 640 + source = "registry+https://github.com/rust-lang/crates.io-index" 641 + checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" 642 + dependencies = [ 643 + "same-file", 644 + "winapi-util", 645 + ] 646 + 647 + [[package]] 555 648 name = "wasm-bindgen" 556 649 version = "0.2.106" 557 650 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 597 690 ] 598 691 599 692 [[package]] 693 + name = "winapi-util" 694 + version = "0.1.11" 695 + source = "registry+https://github.com/rust-lang/crates.io-index" 696 + checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" 697 + dependencies = [ 698 + "windows-sys", 699 + ] 700 + 701 + [[package]] 600 702 name = "windows-core" 601 703 version = "0.62.2" 602 704 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 654 756 dependencies = [ 655 757 "windows-link", 656 758 ] 759 + 760 + [[package]] 761 + name = "windows-sys" 762 + version = "0.61.2" 763 + source = "registry+https://github.com/rust-lang/crates.io-index" 764 + checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" 765 + dependencies = [ 766 + "windows-link", 767 + ]
+3
Cargo.toml
··· 16 16 serde_json = "1.0" 17 17 heck = "0.5" 18 18 thiserror = "2.0" 19 + sha2 = "0.10" 20 + hex = "0.4" 21 + walkdir = "2.5" 19 22 20 23 [profile.release] 21 24 lto = true
+20 -11
justfile
··· 1 - # build rust extension 1 + # build rust extension in dev mode 2 + dev: 3 + uvx maturin develop 4 + 5 + # run tests 6 + test: dev 7 + uv run pytest -v 8 + 9 + # build release wheels 2 10 build: 3 - uv run maturin develop 11 + uvx maturin build --release 4 12 5 - # run tests (requires build first) 6 - test: build 7 - uv run pytest 8 - 9 - # lint python 13 + # lint 10 14 lint: 11 - uv run ruff check 12 - uv run ruff format --check 15 + uv run ruff check . 16 + uv run ruff format --check . 13 17 14 - # format python 18 + # format 15 19 fmt: 16 - uv run ruff format 20 + uv run ruff check --fix . 21 + uv run ruff format . 22 + 23 + # clean build artifacts 24 + clean: 25 + rm -rf target dist *.egg-info
+4 -25
python/pmgfal/__init__.py
··· 3 3 from __future__ import annotations 4 4 5 5 import argparse 6 - import hashlib 7 6 import os 8 7 import shutil 9 8 import sys 10 9 from pathlib import Path 11 10 12 - from pmgfal._pmgfal import __version__, generate 11 + from pmgfal._pmgfal import __version__, generate, hash_lexicons 13 12 14 - __all__ = ["__version__", "generate", "main", "get_cache_dir"] 13 + __all__ = ["__version__", "generate", "hash_lexicons", "main", "get_cache_dir"] 15 14 16 15 17 16 def get_cache_dir() -> Path: ··· 23 22 else: 24 23 base = Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache")) 25 24 return base / "pmgfal" 26 - 27 - 28 - def hash_lexicons(lexicon_dir: Path, prefix: str | None = None) -> str: 29 - """compute a hash of all lexicon files in a directory.""" 30 - hasher = hashlib.sha256() 31 - 32 - # include version in hash so cache invalidates on upgrades 33 - hasher.update(__version__.encode()) 34 - 35 - # include prefix in hash 36 - if prefix: 37 - hasher.update(prefix.encode()) 38 - 39 - # hash all json files in sorted order for determinism 40 - json_files = sorted(lexicon_dir.rglob("*.json")) 41 - for path in json_files: 42 - hasher.update(path.name.encode()) 43 - hasher.update(path.read_bytes()) 44 - 45 - return hasher.hexdigest()[:16] 46 25 47 26 48 27 def main(args: list[str] | None = None) -> int: ··· 99 78 return 1 100 79 101 80 try: 102 - # compute hash of lexicons 103 - lexicon_hash = hash_lexicons(lexicon_dir, parsed.prefix) 81 + # compute hash of lexicons (in rust) 82 + lexicon_hash = hash_lexicons(str(lexicon_dir), parsed.prefix) 104 83 cache_dir = get_cache_dir() / lexicon_hash 105 84 106 85 # check cache
+42
src/lib.rs
··· 5 5 mod parser; 6 6 mod types; 7 7 8 + use std::fs; 8 9 use std::path::Path; 9 10 10 11 use pyo3::prelude::*; 12 + use sha2::{Digest, Sha256}; 13 + 14 + /// compute a hash of all lexicon files in a directory 15 + #[pyfunction] 16 + #[pyo3(signature = (lexicon_dir, namespace_prefix=None))] 17 + fn hash_lexicons(lexicon_dir: &str, namespace_prefix: Option<&str>) -> PyResult<String> { 18 + let lexicon_path = Path::new(lexicon_dir); 19 + 20 + let mut hasher = Sha256::new(); 21 + 22 + // include version in hash so cache invalidates on upgrades 23 + hasher.update(env!("CARGO_PKG_VERSION").as_bytes()); 24 + 25 + // include prefix in hash 26 + if let Some(prefix) = namespace_prefix { 27 + hasher.update(prefix.as_bytes()); 28 + } 29 + 30 + // collect and sort json files for deterministic hashing 31 + let mut json_files: Vec<_> = walkdir::WalkDir::new(lexicon_path) 32 + .into_iter() 33 + .filter_map(|e| e.ok()) 34 + .filter(|e| e.path().extension().is_some_and(|ext| ext == "json")) 35 + .collect(); 36 + 37 + json_files.sort_by(|a, b| a.path().cmp(b.path())); 38 + 39 + for entry in json_files { 40 + let path = entry.path(); 41 + if let Some(name) = path.file_name() { 42 + hasher.update(name.as_encoded_bytes()); 43 + } 44 + if let Ok(content) = fs::read(path) { 45 + hasher.update(&content); 46 + } 47 + } 48 + 49 + let result = hasher.finalize(); 50 + Ok(hex::encode(&result[..8])) // 16 hex chars 51 + } 11 52 12 53 /// generate pydantic models from lexicon files 13 54 #[pyfunction] ··· 32 73 #[pymodule] 33 74 fn _pmgfal(m: &Bound<'_, PyModule>) -> PyResult<()> { 34 75 m.add_function(wrap_pyfunction!(generate, m)?)?; 76 + m.add_function(wrap_pyfunction!(hash_lexicons, m)?)?; 35 77 m.add("__version__", env!("CARGO_PKG_VERSION"))?; 36 78 Ok(()) 37 79 }