A better Rust ATProto crate

Better lexicon parsing errors

Orual 2c6880ce 7e5406c7

Changed files
+369 -27
crates
jacquard-lexicon
src
tests
fixtures
error_cases
test_lexicons
+1
Cargo.lock
··· 2506 "serde", 2507 "serde_ipld_dagcbor", 2508 "serde_json", 2509 "serde_repr", 2510 "serde_with", 2511 "sha2",
··· 2506 "serde", 2507 "serde_ipld_dagcbor", 2508 "serde_json", 2509 + "serde_path_to_error", 2510 "serde_repr", 2511 "serde_with", 2512 "sha2",
+1
crates/jacquard-lexicon/Cargo.toml
··· 27 proc-macro2 = { workspace = true, optional = true } 28 quote = { workspace = true, optional = true } 29 serde.workspace = true 30 serde_ipld_dagcbor.workspace = true 31 serde_json.workspace = true 32 serde_repr.workspace = true
··· 27 proc-macro2 = { workspace = true, optional = true } 28 quote = { workspace = true, optional = true } 29 serde.workspace = true 30 + serde_path_to_error = "0.1" 31 serde_ipld_dagcbor.workspace = true 32 serde_json.workspace = true 33 serde_repr.workspace = true
+321 -8
crates/jacquard-lexicon/src/corpus.rs
··· 1 - use crate::ref_utils::RefPath; 2 - use crate::error::Result; 3 use crate::lexicon::{LexUserType, LexiconDoc}; 4 use jacquard_common::{into_static::IntoStatic, smol_str::SmolStr}; 5 use std::collections::BTreeMap; 6 use std::fs; 7 use std::path::Path; 8 9 /// Registry of all loaded lexicons for reference resolution 10 #[derive(Debug, Clone)] 11 pub struct LexiconCorpus { ··· 32 for schema_path in schemas { 33 let content = fs::read_to_string(schema_path.as_ref())?; 34 35 - // Try to parse as lexicon doc - skip files that aren't lexicon schemas 36 - let doc: LexiconDoc = match serde_json::from_str(&content) { 37 - Ok(doc) => doc, 38 - Err(_) => continue, // Skip non-lexicon JSON files 39 - }; 40 41 let nsid = SmolStr::from(doc.id.to_string()); 42 - corpus.docs.insert(nsid.clone(), doc.into_static()); 43 corpus.sources.insert(nsid, content); 44 } 45 ··· 166 // Non-existing refs 167 assert!(!corpus.ref_exists("com.example.fake")); 168 assert!(!corpus.ref_exists("app.bsky.feed.post#nonexistent")); 169 } 170 }
··· 1 + use crate::error::{CodegenError, Result}; 2 use crate::lexicon::{LexUserType, LexiconDoc}; 3 + use crate::ref_utils::RefPath; 4 use jacquard_common::{into_static::IntoStatic, smol_str::SmolStr}; 5 use std::collections::BTreeMap; 6 use std::fs; 7 use std::path::Path; 8 9 + /// Check if content looks like a lexicon file. 10 + /// 11 + /// A file is considered a lexicon if it contains a `"lexicon"` key at the top level 12 + /// or one level down (for some wrapper formats). This allows us to distinguish 13 + /// "not a lexicon at all" (skip silently) from "broken lexicon" (report error). 14 + fn is_lexicon_content(content: &str) -> bool { 15 + // Quick string scan first (fast path for non-JSON or unrelated JSON) 16 + if !content.contains("\"lexicon\"") { 17 + return false; 18 + } 19 + 20 + // Parse to Value and check structure 21 + if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) { 22 + // Top-level lexicon field 23 + if value.get("lexicon").is_some() { 24 + return true; 25 + } 26 + // One level down (some wrapper formats) 27 + if let Some(obj) = value.as_object() { 28 + for v in obj.values() { 29 + if v.get("lexicon").is_some() { 30 + return true; 31 + } 32 + } 33 + } 34 + } 35 + false 36 + } 37 + 38 + /// Raw lexicon doc for two-phase parsing - defs are kept as raw JSON Values 39 + /// so we can deserialize each separately with better error tracking. 40 + #[derive(Debug, serde::Deserialize)] 41 + struct RawLexiconDoc<'s> { 42 + pub lexicon: crate::lexicon::Lexicon, 43 + #[serde(borrow)] 44 + pub id: jacquard_common::CowStr<'s>, 45 + pub revision: Option<u32>, 46 + #[serde(borrow)] 47 + pub description: Option<jacquard_common::CowStr<'s>>, 48 + pub defs: BTreeMap<SmolStr, serde_json::Value>, 49 + } 50 + 51 + /// Helper to create a parse error with path context. 
52 + fn make_parse_error( 53 + file_path: &Path, 54 + json_path: &str, 55 + message: String, 56 + content: &str, 57 + ) -> CodegenError { 58 + CodegenError::ParseError { 59 + path: file_path.to_path_buf(), 60 + json_path: Some(json_path.to_string()), 61 + message, 62 + src: Some(content.to_string()), 63 + span: None, 64 + } 65 + } 66 + 67 + /// Recursively parse properties with path tracking. 68 + /// Returns parsed properties or an error with the full path. 69 + fn parse_properties_deep( 70 + props_value: &serde_json::Value, 71 + base_path: &str, 72 + file_path: &Path, 73 + content: &str, 74 + ) -> std::result::Result<BTreeMap<SmolStr, crate::lexicon::LexObjectProperty<'static>>, CodegenError> 75 + { 76 + let props_obj = props_value.as_object().ok_or_else(|| { 77 + make_parse_error( 78 + file_path, 79 + base_path, 80 + "expected object for properties".to_string(), 81 + content, 82 + ) 83 + })?; 84 + 85 + let mut parsed_props = BTreeMap::new(); 86 + for (prop_name, prop_value) in props_obj { 87 + let prop_path = format!("{}.{}", base_path, prop_name); 88 + 89 + // Try to parse this property 90 + let parsed: crate::lexicon::LexObjectProperty = 91 + serde_path_to_error::deserialize(prop_value).map_err(|e| { 92 + let inner_path = e.path().to_string(); 93 + let full_path = if inner_path.is_empty() { 94 + prop_path.clone() 95 + } else { 96 + format!("{}.{}", prop_path, inner_path) 97 + }; 98 + make_parse_error(file_path, &full_path, e.inner().to_string(), content) 99 + })?; 100 + 101 + parsed_props.insert(SmolStr::new(prop_name), parsed.into_static()); 102 + } 103 + 104 + Ok(parsed_props) 105 + } 106 + 107 + /// Parse an object-like def with deep property tracking. 
108 + fn parse_object_deep( 109 + value: &serde_json::Value, 110 + base_path: &str, 111 + file_path: &Path, 112 + content: &str, 113 + ) -> std::result::Result<crate::lexicon::LexObject<'static>, CodegenError> { 114 + use crate::lexicon::LexObject; 115 + 116 + let obj = value.as_object().ok_or_else(|| { 117 + make_parse_error(file_path, base_path, "expected object".to_string(), content) 118 + })?; 119 + 120 + // Parse properties deeply if present 121 + let properties = if let Some(props) = obj.get("properties") { 122 + let props_path = format!("{}.properties", base_path); 123 + parse_properties_deep(props, &props_path, file_path, content)? 124 + } else { 125 + BTreeMap::new() 126 + }; 127 + 128 + // Parse the rest of the object normally 129 + let description = obj 130 + .get("description") 131 + .and_then(|v| v.as_str()) 132 + .map(|s| jacquard_common::CowStr::copy_from_str(s)); 133 + let required: Option<Vec<SmolStr>> = obj 134 + .get("required") 135 + .map(|v| serde_json::from_value(v.clone())) 136 + .transpose() 137 + .map_err(|e| make_parse_error(file_path, &format!("{}.required", base_path), e.to_string(), content))?; 138 + let nullable: Option<Vec<SmolStr>> = obj 139 + .get("nullable") 140 + .map(|v| serde_json::from_value(v.clone())) 141 + .transpose() 142 + .map_err(|e| make_parse_error(file_path, &format!("{}.nullable", base_path), e.to_string(), content))?; 143 + 144 + Ok(LexObject { 145 + description, 146 + required, 147 + nullable, 148 + properties, 149 + }) 150 + } 151 + 152 + /// Parse a def with deep path tracking for nested structures. 
153 + fn parse_def_deep( 154 + def_name: &str, 155 + value: &serde_json::Value, 156 + file_path: &Path, 157 + content: &str, 158 + ) -> std::result::Result<LexUserType<'static>, CodegenError> { 159 + let base_path = format!("defs.{}", def_name); 160 + 161 + // Check the type field to determine how to parse 162 + let type_str = value 163 + .get("type") 164 + .and_then(|v| v.as_str()) 165 + .unwrap_or("object"); 166 + 167 + match type_str { 168 + "object" => { 169 + let obj = parse_object_deep(value, &base_path, file_path, content)?; 170 + Ok(LexUserType::Object(obj)) 171 + } 172 + "record" => { 173 + // Records have a nested record.properties structure 174 + if let Some(record_value) = value.get("record") { 175 + let record_path = format!("{}.record", base_path); 176 + let inner_obj = parse_object_deep(record_value, &record_path, file_path, content)?; 177 + 178 + // Parse the rest of the record 179 + let obj = value.as_object().ok_or_else(|| { 180 + make_parse_error(file_path, &base_path, "expected object".to_string(), content) 181 + })?; 182 + 183 + let description = obj 184 + .get("description") 185 + .and_then(|v| v.as_str()) 186 + .map(|s| jacquard_common::CowStr::copy_from_str(s)); 187 + let key: Option<jacquard_common::CowStr<'static>> = obj 188 + .get("key") 189 + .and_then(|v| v.as_str()) 190 + .map(|s| jacquard_common::CowStr::copy_from_str(s)); 191 + 192 + Ok(LexUserType::Record(crate::lexicon::LexRecord { 193 + description, 194 + key, 195 + record: crate::lexicon::LexRecordRecord::Object(inner_obj), 196 + })) 197 + } else { 198 + // Fallback to normal parsing if no record field 199 + serde_path_to_error::deserialize(value) 200 + .map(|v: LexUserType| v.into_static()) 201 + .map_err(|e| make_parse_error(file_path, &base_path, e.inner().to_string(), content)) 202 + } 203 + } 204 + // For other types (query, procedure, etc.), use the simpler approach for now 205 + // Could be extended later 206 + _ => serde_path_to_error::deserialize(value) 207 + .map(|v: 
LexUserType| v.into_static()) 208 + .map_err(|e| { 209 + let inner_path = e.path().to_string(); 210 + let full_path = if inner_path.is_empty() { 211 + base_path 212 + } else { 213 + format!("{}.{}", base_path, inner_path) 214 + }; 215 + make_parse_error(file_path, &full_path, e.inner().to_string(), content) 216 + }), 217 + } 218 + } 219 + 220 + /// Parse a lexicon with rich error context using deep recursive parsing. 221 + /// 222 + /// This parses the document structure recursively, tracking paths through: 223 + /// - defs → def_name → properties → prop_name → nested fields 224 + /// 225 + /// This gives us detailed error paths like "defs.main.properties.count.default" 226 + fn parse_lexicon_with_context( 227 + content: &str, 228 + path: &Path, 229 + ) -> std::result::Result<LexiconDoc<'static>, CodegenError> { 230 + // Phase 1: Parse the top-level structure with defs as raw Values 231 + let raw_doc: RawLexiconDoc = serde_json::from_str(content).map_err(|e| { 232 + CodegenError::ParseError { 233 + path: path.to_path_buf(), 234 + json_path: None, 235 + message: e.to_string(), 236 + src: Some(content.to_string()), 237 + span: None, 238 + } 239 + })?; 240 + 241 + // Phase 2: Parse each def with deep path tracking 242 + let mut parsed_defs = BTreeMap::new(); 243 + for (def_name, def_value) in raw_doc.defs { 244 + let parsed_def = parse_def_deep(&def_name, &def_value, path, content)?; 245 + parsed_defs.insert(def_name, parsed_def); 246 + } 247 + 248 + // Reconstruct the full LexiconDoc 249 + Ok(LexiconDoc { 250 + lexicon: raw_doc.lexicon, 251 + id: raw_doc.id.into_static(), 252 + revision: raw_doc.revision, 253 + description: raw_doc.description.map(|d| d.into_static()), 254 + defs: parsed_defs, 255 + }) 256 + } 257 + 258 /// Registry of all loaded lexicons for reference resolution 259 #[derive(Debug, Clone)] 260 pub struct LexiconCorpus { ··· 281 for schema_path in schemas { 282 let content = fs::read_to_string(schema_path.as_ref())?; 283 284 + // Check if this file 
is trying to be a lexicon 285 + if !is_lexicon_content(&content) { 286 + // Not a lexicon, skip silently 287 + continue; 288 + } 289 + 290 + // This IS a lexicon - parse with good error reporting 291 + let doc = parse_lexicon_with_context(&content, schema_path.as_ref())?; 292 293 let nsid = SmolStr::from(doc.id.to_string()); 294 + corpus.docs.insert(nsid.clone(), doc); 295 corpus.sources.insert(nsid, content); 296 } 297 ··· 418 // Non-existing refs 419 assert!(!corpus.ref_exists("com.example.fake")); 420 assert!(!corpus.ref_exists("app.bsky.feed.post#nonexistent")); 421 + } 422 + 423 + #[test] 424 + fn test_non_lexicon_json_skipped_silently() { 425 + // The test_lexicons directory contains not_a_lexicon.json which should be skipped 426 + let corpus = LexiconCorpus::load_from_dir("tests/fixtures/test_lexicons") 427 + .expect("should succeed even with non-lexicon JSON files"); 428 + 429 + // The non-lexicon file should not be in the corpus 430 + assert!(corpus.get("some random config").is_none()); 431 + 432 + // But valid lexicons should still load 433 + assert!(corpus.get("app.bsky.feed.post").is_some()); 434 + } 435 + 436 + #[test] 437 + fn test_is_lexicon_content_detection() { 438 + // Not a lexicon - no "lexicon" key 439 + assert!(!is_lexicon_content(r#"{"name": "test", "version": "1.0"}"#)); 440 + 441 + // Not a lexicon - invalid JSON 442 + assert!(!is_lexicon_content("not json at all")); 443 + 444 + // Is a lexicon - has "lexicon" at top level 445 + assert!(is_lexicon_content(r#"{"lexicon": 1, "id": "test.foo"}"#)); 446 + 447 + // Is a lexicon - has "lexicon" one level down 448 + assert!(is_lexicon_content( 449 + r#"{"wrapper": {"lexicon": 1, "id": "test.foo"}}"# 450 + )); 451 + } 452 + 453 + #[test] 454 + fn test_broken_lexicon_returns_error_with_path() { 455 + let result = LexiconCorpus::load_from_dir("tests/fixtures/error_cases"); 456 + 457 + // Should fail because broken_lexicon.json is a lexicon (has "lexicon" key) 458 + // but has invalid structure 459 + 
let err = result.expect_err("should fail on broken lexicon"); 460 + let err_str = err.to_string(); 461 + 462 + // Error should include the full path to the broken property 463 + assert!( 464 + err_str.contains("defs.main.properties.count"), 465 + "error should contain path to the broken property, got: {}", 466 + err_str 467 + ); 468 + 469 + // Error should also include the actual error message 470 + assert!( 471 + err_str.contains("expected i64"), 472 + "error should describe the type mismatch, got: {}", 473 + err_str 474 + ); 475 + 476 + // Error should mention the file 477 + assert!( 478 + err_str.contains("broken_lexicon.json"), 479 + "error should mention the file, got: {}", 480 + err_str 481 + ); 482 } 483 }
+24 -19
crates/jacquard-lexicon/src/error.rs
··· 3 use std::path::PathBuf; 4 use thiserror::Error; 5 6 /// Errors that can occur during lexicon code generation 7 #[derive(Debug, Error, Diagnostic)] 8 #[non_exhaustive] ··· 12 Io(#[from] io::Error), 13 14 /// Failed to parse lexicon JSON 15 - #[error("Failed to parse lexicon JSON in {}", path.display())] 16 #[diagnostic( 17 code(lexicon::parse_error), 18 help("Check that the lexicon file is valid JSON and follows the lexicon schema") 19 )] 20 ParseError { 21 - #[source] 22 - source: serde_json::Error, 23 /// Path to the file that failed to parse 24 path: PathBuf, 25 /// Source text that failed to parse 26 #[source_code] 27 src: Option<String>, ··· 90 91 impl CodegenError { 92 /// Create a parse error with context 93 - pub fn parse_error(source: serde_json::Error, path: impl Into<PathBuf>) -> Self { 94 Self::ParseError { 95 - source, 96 path: path.into(), 97 src: None, 98 span: None, 99 } 100 } 101 102 - /// Create a parse error with source text 103 - pub fn parse_error_with_source( 104 - source: serde_json::Error, 105 path: impl Into<PathBuf>, 106 src: String, 107 ) -> Self { 108 - // Try to extract error location from serde_json error 109 - let span = if let Some(line) = source.line().checked_sub(1) { 110 - let col = source.column().saturating_sub(1); 111 - // Approximate byte offset (not perfect but good enough for display) 112 - Some((line * 80 + col, 1).into()) 113 - } else { 114 - None 115 - }; 116 - 117 Self::ParseError { 118 - source, 119 path: path.into(), 120 src: Some(src), 121 - span, 122 } 123 } 124
··· 3 use std::path::PathBuf; 4 use thiserror::Error; 5 6 + fn format_parse_error(path: &PathBuf, json_path: Option<&str>, message: &str) -> String { 7 + match json_path { 8 + Some(jp) if !jp.is_empty() => { 9 + format!("failed to parse lexicon {}: at {}: {}", path.display(), jp, message) 10 + } 11 + _ => format!("failed to parse lexicon {}: {}", path.display(), message), 12 + } 13 + } 14 + 15 /// Errors that can occur during lexicon code generation 16 #[derive(Debug, Error, Diagnostic)] 17 #[non_exhaustive] ··· 21 Io(#[from] io::Error), 22 23 /// Failed to parse lexicon JSON 24 + #[error("{}", format_parse_error(path, json_path.as_deref(), message))] 25 #[diagnostic( 26 code(lexicon::parse_error), 27 help("Check that the lexicon file is valid JSON and follows the lexicon schema") 28 )] 29 ParseError { 30 /// Path to the file that failed to parse 31 path: PathBuf, 32 + /// JSON path where the error occurred (from serde_path_to_error) 33 + json_path: Option<String>, 34 + /// The underlying error message 35 + message: String, 36 /// Source text that failed to parse 37 #[source_code] 38 src: Option<String>, ··· 101 102 impl CodegenError { 103 /// Create a parse error with context 104 + pub fn parse_error(message: impl Into<String>, path: impl Into<PathBuf>) -> Self { 105 Self::ParseError { 106 path: path.into(), 107 + json_path: None, 108 + message: message.into(), 109 src: None, 110 span: None, 111 } 112 } 113 114 + /// Create a parse error with source text and JSON path 115 + pub fn parse_error_with_context( 116 + message: impl Into<String>, 117 path: impl Into<PathBuf>, 118 + json_path: Option<String>, 119 src: String, 120 ) -> Self { 121 Self::ParseError { 122 path: path.into(), 123 + json_path, 124 + message: message.into(), 125 src: Some(src), 126 + span: None, 127 } 128 } 129
+15
crates/jacquard-lexicon/tests/fixtures/error_cases/broken_lexicon.json
···
··· 1 + { 2 + "lexicon": 1, 3 + "id": "test.broken.lexicon", 4 + "defs": { 5 + "main": { 6 + "type": "object", 7 + "properties": { 8 + "count": { 9 + "type": "integer", 10 + "default": "not_a_number" 11 + } 12 + } 13 + } 14 + } 15 + }
+7
crates/jacquard-lexicon/tests/fixtures/test_lexicons/not_a_lexicon.json
···
··· 1 + { 2 + "name": "some random config", 3 + "version": "1.0.0", 4 + "settings": { 5 + "enabled": true 6 + } 7 + }