+1
Cargo.lock
+1
Cargo.lock
+1
crates/jacquard-lexicon/Cargo.toml
+1
crates/jacquard-lexicon/Cargo.toml
+321
-8
crates/jacquard-lexicon/src/corpus.rs
+321
-8
crates/jacquard-lexicon/src/corpus.rs
···
1
-
use crate::ref_utils::RefPath;
2
-
use crate::error::Result;
3
use crate::lexicon::{LexUserType, LexiconDoc};
4
use jacquard_common::{into_static::IntoStatic, smol_str::SmolStr};
5
use std::collections::BTreeMap;
6
use std::fs;
7
use std::path::Path;
8
9
/// Registry of all loaded lexicons for reference resolution
10
#[derive(Debug, Clone)]
11
pub struct LexiconCorpus {
···
32
for schema_path in schemas {
33
let content = fs::read_to_string(schema_path.as_ref())?;
34
35
-
// Try to parse as lexicon doc - skip files that aren't lexicon schemas
36
-
let doc: LexiconDoc = match serde_json::from_str(&content) {
37
-
Ok(doc) => doc,
38
-
Err(_) => continue, // Skip non-lexicon JSON files
39
-
};
40
41
let nsid = SmolStr::from(doc.id.to_string());
42
-
corpus.docs.insert(nsid.clone(), doc.into_static());
43
corpus.sources.insert(nsid, content);
44
}
45
···
166
// Non-existing refs
167
assert!(!corpus.ref_exists("com.example.fake"));
168
assert!(!corpus.ref_exists("app.bsky.feed.post#nonexistent"));
169
}
170
}
···
1
+
use crate::error::{CodegenError, Result};
2
use crate::lexicon::{LexUserType, LexiconDoc};
3
+
use crate::ref_utils::RefPath;
4
use jacquard_common::{into_static::IntoStatic, smol_str::SmolStr};
5
use std::collections::BTreeMap;
6
use std::fs;
7
use std::path::Path;
8
9
+
/// Check if content looks like a lexicon file.
10
+
///
11
+
/// A file is considered a lexicon if it contains a `"lexicon"` key at the top level
12
+
/// or one level down (for some wrapper formats). This allows us to distinguish
13
+
/// "not a lexicon at all" (skip silently) from "broken lexicon" (report error).
14
+
fn is_lexicon_content(content: &str) -> bool {
15
+
// Quick string scan first (fast path for non-JSON or unrelated JSON)
16
+
if !content.contains("\"lexicon\"") {
17
+
return false;
18
+
}
19
+
20
+
// Parse to Value and check structure
21
+
if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
22
+
// Top-level lexicon field
23
+
if value.get("lexicon").is_some() {
24
+
return true;
25
+
}
26
+
// One level down (some wrapper formats)
27
+
if let Some(obj) = value.as_object() {
28
+
for v in obj.values() {
29
+
if v.get("lexicon").is_some() {
30
+
return true;
31
+
}
32
+
}
33
+
}
34
+
}
35
+
false
36
+
}
37
+
38
+
/// Raw lexicon doc for two-phase parsing - defs are kept as raw JSON Values
39
+
/// so we can deserialize each separately with better error tracking.
40
+
#[derive(Debug, serde::Deserialize)]
41
+
struct RawLexiconDoc<'s> {
42
+
pub lexicon: crate::lexicon::Lexicon,
43
+
#[serde(borrow)]
44
+
pub id: jacquard_common::CowStr<'s>,
45
+
pub revision: Option<u32>,
46
+
#[serde(borrow)]
47
+
pub description: Option<jacquard_common::CowStr<'s>>,
48
+
pub defs: BTreeMap<SmolStr, serde_json::Value>,
49
+
}
50
+
51
+
/// Helper to create a parse error with path context.
52
+
fn make_parse_error(
53
+
file_path: &Path,
54
+
json_path: &str,
55
+
message: String,
56
+
content: &str,
57
+
) -> CodegenError {
58
+
CodegenError::ParseError {
59
+
path: file_path.to_path_buf(),
60
+
json_path: Some(json_path.to_string()),
61
+
message,
62
+
src: Some(content.to_string()),
63
+
span: None,
64
+
}
65
+
}
66
+
67
+
/// Recursively parse properties with path tracking.
68
+
/// Returns parsed properties or an error with the full path.
69
+
fn parse_properties_deep(
70
+
props_value: &serde_json::Value,
71
+
base_path: &str,
72
+
file_path: &Path,
73
+
content: &str,
74
+
) -> std::result::Result<BTreeMap<SmolStr, crate::lexicon::LexObjectProperty<'static>>, CodegenError>
75
+
{
76
+
let props_obj = props_value.as_object().ok_or_else(|| {
77
+
make_parse_error(
78
+
file_path,
79
+
base_path,
80
+
"expected object for properties".to_string(),
81
+
content,
82
+
)
83
+
})?;
84
+
85
+
let mut parsed_props = BTreeMap::new();
86
+
for (prop_name, prop_value) in props_obj {
87
+
let prop_path = format!("{}.{}", base_path, prop_name);
88
+
89
+
// Try to parse this property
90
+
let parsed: crate::lexicon::LexObjectProperty =
91
+
serde_path_to_error::deserialize(prop_value).map_err(|e| {
92
+
let inner_path = e.path().to_string();
93
+
let full_path = if inner_path.is_empty() {
94
+
prop_path.clone()
95
+
} else {
96
+
format!("{}.{}", prop_path, inner_path)
97
+
};
98
+
make_parse_error(file_path, &full_path, e.inner().to_string(), content)
99
+
})?;
100
+
101
+
parsed_props.insert(SmolStr::new(prop_name), parsed.into_static());
102
+
}
103
+
104
+
Ok(parsed_props)
105
+
}
106
+
107
+
/// Parse an object-like def with deep property tracking.
108
+
fn parse_object_deep(
109
+
value: &serde_json::Value,
110
+
base_path: &str,
111
+
file_path: &Path,
112
+
content: &str,
113
+
) -> std::result::Result<crate::lexicon::LexObject<'static>, CodegenError> {
114
+
use crate::lexicon::LexObject;
115
+
116
+
let obj = value.as_object().ok_or_else(|| {
117
+
make_parse_error(file_path, base_path, "expected object".to_string(), content)
118
+
})?;
119
+
120
+
// Parse properties deeply if present
121
+
let properties = if let Some(props) = obj.get("properties") {
122
+
let props_path = format!("{}.properties", base_path);
123
+
parse_properties_deep(props, &props_path, file_path, content)?
124
+
} else {
125
+
BTreeMap::new()
126
+
};
127
+
128
+
// Parse the rest of the object normally
129
+
let description = obj
130
+
.get("description")
131
+
.and_then(|v| v.as_str())
132
+
.map(|s| jacquard_common::CowStr::copy_from_str(s));
133
+
let required: Option<Vec<SmolStr>> = obj
134
+
.get("required")
135
+
.map(|v| serde_json::from_value(v.clone()))
136
+
.transpose()
137
+
.map_err(|e| make_parse_error(file_path, &format!("{}.required", base_path), e.to_string(), content))?;
138
+
let nullable: Option<Vec<SmolStr>> = obj
139
+
.get("nullable")
140
+
.map(|v| serde_json::from_value(v.clone()))
141
+
.transpose()
142
+
.map_err(|e| make_parse_error(file_path, &format!("{}.nullable", base_path), e.to_string(), content))?;
143
+
144
+
Ok(LexObject {
145
+
description,
146
+
required,
147
+
nullable,
148
+
properties,
149
+
})
150
+
}
151
+
152
+
/// Parse a def with deep path tracking for nested structures.
153
+
fn parse_def_deep(
154
+
def_name: &str,
155
+
value: &serde_json::Value,
156
+
file_path: &Path,
157
+
content: &str,
158
+
) -> std::result::Result<LexUserType<'static>, CodegenError> {
159
+
let base_path = format!("defs.{}", def_name);
160
+
161
+
// Check the type field to determine how to parse
162
+
let type_str = value
163
+
.get("type")
164
+
.and_then(|v| v.as_str())
165
+
.unwrap_or("object");
166
+
167
+
match type_str {
168
+
"object" => {
169
+
let obj = parse_object_deep(value, &base_path, file_path, content)?;
170
+
Ok(LexUserType::Object(obj))
171
+
}
172
+
"record" => {
173
+
// Records have a nested record.properties structure
174
+
if let Some(record_value) = value.get("record") {
175
+
let record_path = format!("{}.record", base_path);
176
+
let inner_obj = parse_object_deep(record_value, &record_path, file_path, content)?;
177
+
178
+
// Parse the rest of the record
179
+
let obj = value.as_object().ok_or_else(|| {
180
+
make_parse_error(file_path, &base_path, "expected object".to_string(), content)
181
+
})?;
182
+
183
+
let description = obj
184
+
.get("description")
185
+
.and_then(|v| v.as_str())
186
+
.map(|s| jacquard_common::CowStr::copy_from_str(s));
187
+
let key: Option<jacquard_common::CowStr<'static>> = obj
188
+
.get("key")
189
+
.and_then(|v| v.as_str())
190
+
.map(|s| jacquard_common::CowStr::copy_from_str(s));
191
+
192
+
Ok(LexUserType::Record(crate::lexicon::LexRecord {
193
+
description,
194
+
key,
195
+
record: crate::lexicon::LexRecordRecord::Object(inner_obj),
196
+
}))
197
+
} else {
198
+
// Fallback to normal parsing if no record field
199
+
serde_path_to_error::deserialize(value)
200
+
.map(|v: LexUserType| v.into_static())
201
+
.map_err(|e| make_parse_error(file_path, &base_path, e.inner().to_string(), content))
202
+
}
203
+
}
204
+
// For other types (query, procedure, etc.), use the simpler approach for now
205
+
// Could be extended later
206
+
_ => serde_path_to_error::deserialize(value)
207
+
.map(|v: LexUserType| v.into_static())
208
+
.map_err(|e| {
209
+
let inner_path = e.path().to_string();
210
+
let full_path = if inner_path.is_empty() {
211
+
base_path
212
+
} else {
213
+
format!("{}.{}", base_path, inner_path)
214
+
};
215
+
make_parse_error(file_path, &full_path, e.inner().to_string(), content)
216
+
}),
217
+
}
218
+
}
219
+
220
+
/// Parse a lexicon with rich error context using deep recursive parsing.
221
+
///
222
+
/// This parses the document structure recursively, tracking paths through:
223
+
/// - defs → def_name → properties → prop_name → nested fields
224
+
///
225
+
/// This gives us detailed error paths like "defs.main.properties.count.default"
226
+
fn parse_lexicon_with_context(
227
+
content: &str,
228
+
path: &Path,
229
+
) -> std::result::Result<LexiconDoc<'static>, CodegenError> {
230
+
// Phase 1: Parse the top-level structure with defs as raw Values
231
+
let raw_doc: RawLexiconDoc = serde_json::from_str(content).map_err(|e| {
232
+
CodegenError::ParseError {
233
+
path: path.to_path_buf(),
234
+
json_path: None,
235
+
message: e.to_string(),
236
+
src: Some(content.to_string()),
237
+
span: None,
238
+
}
239
+
})?;
240
+
241
+
// Phase 2: Parse each def with deep path tracking
242
+
let mut parsed_defs = BTreeMap::new();
243
+
for (def_name, def_value) in raw_doc.defs {
244
+
let parsed_def = parse_def_deep(&def_name, &def_value, path, content)?;
245
+
parsed_defs.insert(def_name, parsed_def);
246
+
}
247
+
248
+
// Reconstruct the full LexiconDoc
249
+
Ok(LexiconDoc {
250
+
lexicon: raw_doc.lexicon,
251
+
id: raw_doc.id.into_static(),
252
+
revision: raw_doc.revision,
253
+
description: raw_doc.description.map(|d| d.into_static()),
254
+
defs: parsed_defs,
255
+
})
256
+
}
257
+
258
/// Registry of all loaded lexicons for reference resolution
259
#[derive(Debug, Clone)]
260
pub struct LexiconCorpus {
···
281
for schema_path in schemas {
282
let content = fs::read_to_string(schema_path.as_ref())?;
283
284
+
// Check if this file is trying to be a lexicon
285
+
if !is_lexicon_content(&content) {
286
+
// Not a lexicon, skip silently
287
+
continue;
288
+
}
289
+
290
+
// This IS a lexicon - parse with good error reporting
291
+
let doc = parse_lexicon_with_context(&content, schema_path.as_ref())?;
292
293
let nsid = SmolStr::from(doc.id.to_string());
294
+
corpus.docs.insert(nsid.clone(), doc);
295
corpus.sources.insert(nsid, content);
296
}
297
···
418
// Non-existing refs
419
assert!(!corpus.ref_exists("com.example.fake"));
420
assert!(!corpus.ref_exists("app.bsky.feed.post#nonexistent"));
421
+
}
422
+
423
+
#[test]
fn test_non_lexicon_json_skipped_silently() {
    // The fixture directory holds not_a_lexicon.json next to real lexicons;
    // loading it must still succeed.
    let loaded = LexiconCorpus::load_from_dir("tests/fixtures/test_lexicons");
    let corpus = loaded.expect("should succeed even with non-lexicon JSON files");

    // Nothing from the non-lexicon file may end up in the corpus...
    let stray_entry = corpus.get("some random config");
    assert!(stray_entry.is_none());

    // ...while the genuine lexicons are still present.
    let post_doc = corpus.get("app.bsky.feed.post");
    assert!(post_doc.is_some());
}
435
+
436
+
#[test]
fn test_is_lexicon_content_detection() {
    // (input, whether it should be recognised as a lexicon)
    let cases: [(&str, bool); 4] = [
        // No "lexicon" key at all.
        (r#"{"name": "test", "version": "1.0"}"#, false),
        // Not even JSON.
        ("not json at all", false),
        // "lexicon" on the root object.
        (r#"{"lexicon": 1, "id": "test.foo"}"#, true),
        // "lexicon" one level down, inside a wrapper.
        (r#"{"wrapper": {"lexicon": 1, "id": "test.foo"}}"#, true),
    ];

    for &(input, expected) in cases.iter() {
        assert_eq!(is_lexicon_content(input), expected);
    }
}
452
+
453
+
#[test]
fn test_broken_lexicon_returns_error_with_path() {
    // broken_lexicon.json has a "lexicon" key, so it counts as a lexicon, but
    // its structure is invalid; loading the directory must therefore fail.
    let err_str = LexiconCorpus::load_from_dir("tests/fixtures/error_cases")
        .expect_err("should fail on broken lexicon")
        .to_string();
    let mentions = |needle: &str| err_str.contains(needle);

    // Full path down to the offending property.
    assert!(
        mentions("defs.main.properties.count"),
        "error should contain path to the broken property, got: {}",
        err_str
    );

    // The underlying deserializer message.
    assert!(
        mentions("expected i64"),
        "error should describe the type mismatch, got: {}",
        err_str
    );

    // The file the error came from.
    assert!(
        mentions("broken_lexicon.json"),
        "error should mention the file, got: {}",
        err_str
    );
}
483
}
+24
-19
crates/jacquard-lexicon/src/error.rs
+24
-19
crates/jacquard-lexicon/src/error.rs
···
3
use std::path::PathBuf;
4
use thiserror::Error;
5
6
/// Errors that can occur during lexicon code generation
7
#[derive(Debug, Error, Diagnostic)]
8
#[non_exhaustive]
···
12
Io(#[from] io::Error),
13
14
/// Failed to parse lexicon JSON
15
-
#[error("Failed to parse lexicon JSON in {}", path.display())]
16
#[diagnostic(
17
code(lexicon::parse_error),
18
help("Check that the lexicon file is valid JSON and follows the lexicon schema")
19
)]
20
ParseError {
21
-
#[source]
22
-
source: serde_json::Error,
23
/// Path to the file that failed to parse
24
path: PathBuf,
25
/// Source text that failed to parse
26
#[source_code]
27
src: Option<String>,
···
90
91
impl CodegenError {
92
/// Create a parse error with context
93
-
pub fn parse_error(source: serde_json::Error, path: impl Into<PathBuf>) -> Self {
94
Self::ParseError {
95
-
source,
96
path: path.into(),
97
src: None,
98
span: None,
99
}
100
}
101
102
-
/// Create a parse error with source text
103
-
pub fn parse_error_with_source(
104
-
source: serde_json::Error,
105
path: impl Into<PathBuf>,
106
src: String,
107
) -> Self {
108
-
// Try to extract error location from serde_json error
109
-
let span = if let Some(line) = source.line().checked_sub(1) {
110
-
let col = source.column().saturating_sub(1);
111
-
// Approximate byte offset (not perfect but good enough for display)
112
-
Some((line * 80 + col, 1).into())
113
-
} else {
114
-
None
115
-
};
116
-
117
Self::ParseError {
118
-
source,
119
path: path.into(),
120
src: Some(src),
121
-
span,
122
}
123
}
124
···
3
use std::path::PathBuf;
4
use thiserror::Error;
5
6
+
/// Render the display text for a lexicon parse error.
///
/// The JSON path is included as `at <json_path>` only when it is present and
/// non-empty; otherwise the message follows the file path directly.
///
/// `path` is taken as `&Path`; callers holding a `&PathBuf` can pass it
/// unchanged because it dereferences to `&Path`.
fn format_parse_error(path: &std::path::Path, json_path: Option<&str>, message: &str) -> String {
    match json_path {
        Some(jp) if !jp.is_empty() => {
            format!("failed to parse lexicon {}: at {}: {}", path.display(), jp, message)
        }
        _ => format!("failed to parse lexicon {}: {}", path.display(), message),
    }
}
14
+
15
/// Errors that can occur during lexicon code generation
16
#[derive(Debug, Error, Diagnostic)]
17
#[non_exhaustive]
···
21
Io(#[from] io::Error),
22
23
/// Failed to parse lexicon JSON
24
+
#[error("{}", format_parse_error(path, json_path.as_deref(), message))]
25
#[diagnostic(
26
code(lexicon::parse_error),
27
help("Check that the lexicon file is valid JSON and follows the lexicon schema")
28
)]
29
ParseError {
30
/// Path to the file that failed to parse
31
path: PathBuf,
32
+
/// JSON path where the error occurred (from serde_path_to_error)
33
+
json_path: Option<String>,
34
+
/// The underlying error message
35
+
message: String,
36
/// Source text that failed to parse
37
#[source_code]
38
src: Option<String>,
···
101
102
impl CodegenError {
103
/// Create a parse error with context
104
+
pub fn parse_error(message: impl Into<String>, path: impl Into<PathBuf>) -> Self {
105
Self::ParseError {
106
path: path.into(),
107
+
json_path: None,
108
+
message: message.into(),
109
src: None,
110
span: None,
111
}
112
}
113
114
+
/// Create a parse error with source text and JSON path
115
+
pub fn parse_error_with_context(
116
+
message: impl Into<String>,
117
path: impl Into<PathBuf>,
118
+
json_path: Option<String>,
119
src: String,
120
) -> Self {
121
Self::ParseError {
122
path: path.into(),
123
+
json_path,
124
+
message: message.into(),
125
src: Some(src),
126
+
span: None,
127
}
128
}
129
+15
crates/jacquard-lexicon/tests/fixtures/error_cases/broken_lexicon.json
+15
crates/jacquard-lexicon/tests/fixtures/error_cases/broken_lexicon.json