OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Structured expected messages from Nu validator. *)
2
3type t = {
4 message: string;
5 error_code: Htmlrw_check.Error_code.t option;
6 line: int option;
7 column: int option;
8 element: string option;
9 attribute: string option;
10 severity: [`Error | `Warning | `Info] option;
11}
12
13type match_quality =
14 | Exact_match
15 | Code_match
16 | Message_match
17 | Substring_match
18 | Severity_mismatch
19 | No_match
20
21type strictness = {
22 require_exact_message: bool;
23 require_error_code: bool;
24 require_location: bool;
25 require_severity: bool;
26}
27
28let lenient = {
29 require_exact_message = false;
30 require_error_code = false;
31 require_location = false;
32 require_severity = false;
33}
34
35(** Practical strict mode: requires exact message text but not typed error codes *)
36let exact_message = {
37 require_exact_message = true;
38 require_error_code = false;
39 require_location = false;
40 require_severity = false;
41}
42
43(** Full strict mode: all checks enabled (requires typed error code migration) *)
44let strict = {
45 require_exact_message = true;
46 require_error_code = true;
47 require_location = true;
48 require_severity = true;
49}
50
51(** Unicode ellipsis character *)
52let ellipsis = "\xe2\x80\xa6"
53
54(** Normalize Unicode curly quotes to ASCII for comparison *)
55let normalize_quotes s =
56 let buf = Buffer.create (String.length s) in
57 let i = ref 0 in
58 while !i < String.length s do
59 let c = s.[!i] in
60 if !i + 2 < String.length s && c = '\xe2' then begin
61 let c1 = s.[!i + 1] in
62 let c2 = s.[!i + 2] in
63 if c1 = '\x80' && (c2 = '\x9c' || c2 = '\x9d') then begin
64 Buffer.add_char buf '"';
65 i := !i + 3
66 end else begin
67 Buffer.add_char buf c;
68 incr i
69 end
70 end else begin
71 Buffer.add_char buf c;
72 incr i
73 end
74 done;
75 Buffer.contents buf
76
77(** Unicode curly quotes *)
78let left_curly_quote = "\xe2\x80\x9c"
79let right_curly_quote = "\xe2\x80\x9d"
80
81(** Check if expected message (with potential ellipsis truncation) matches actual.
82 When expected has ellipsis followed by text in curly quotes, we check if actual
83 has a value that ends with that text.
84 This handles Nu validator's message truncation for long attribute values. *)
85let truncation_aware_match expected actual =
86 (* Look for pattern: left_curly_quote + ellipsis in expected *)
87 let quote_ellipsis = left_curly_quote ^ ellipsis in
88 try
89 let pos = Str.search_forward (Str.regexp_string quote_ellipsis) expected 0 in
90 (* Found quote+ellipsis pattern - extract what comes after ellipsis until closing curly quote *)
91 let start_after_ellipsis = pos + String.length quote_ellipsis in
92 let end_quote_pos =
93 try Str.search_forward (Str.regexp_string right_curly_quote) expected start_after_ellipsis
94 with Not_found -> String.length expected
95 in
96 let truncated_suffix = String.sub expected start_after_ellipsis (end_quote_pos - start_after_ellipsis) in
97
98 (* Build expected prefix (everything before the truncated quote) and suffix (everything after) *)
99 let prefix = String.sub expected 0 pos in
100 let suffix_start = end_quote_pos + String.length right_curly_quote in
101 let suffix =
102 if suffix_start < String.length expected then
103 String.sub expected suffix_start (String.length expected - suffix_start)
104 else ""
105 in
106
107 (* Check if actual starts with prefix and ends with suffix *)
108 let actual_starts_with_prefix =
109 String.length actual >= String.length prefix &&
110 String.sub actual 0 (String.length prefix) = prefix
111 in
112 let actual_ends_with_suffix =
113 String.length actual >= String.length suffix &&
114 String.sub actual (String.length actual - String.length suffix) (String.length suffix) = suffix
115 in
116
117 (* If prefix and suffix match, extract the middle (the quoted value in actual) *)
118 if actual_starts_with_prefix && actual_ends_with_suffix then begin
119 (* Find the quoted value in actual at the same position *)
120 let actual_quote_start = String.length prefix in
121 try
122 (* Check actual has left curly quote at expected position *)
123 if String.sub actual actual_quote_start (String.length left_curly_quote) = left_curly_quote then begin
124 let actual_value_start = actual_quote_start + String.length left_curly_quote in
125 let actual_value_end =
126 Str.search_forward (Str.regexp_string right_curly_quote) actual actual_value_start
127 in
128 let actual_value = String.sub actual actual_value_start (actual_value_end - actual_value_start) in
129 (* Check if actual value ends with the truncated suffix from expected *)
130 String.length actual_value >= String.length truncated_suffix &&
131 String.sub actual_value (String.length actual_value - String.length truncated_suffix) (String.length truncated_suffix) = truncated_suffix
132 end else false
133 with _ -> false
134 end else false
135 with Not_found ->
136 (* No ellipsis truncation pattern found *)
137 false
138
139(** Pattern matchers for Nu validator messages.
140 Each returns (error_code option, element option, attribute option) *)
141
142let pattern_element_not_allowed msg =
143 (* "Element "X" not allowed as child of element "Y"..." *)
144 let re = Str.regexp {|Element "\([^"]+\)" not allowed as child of element "\([^"]+\)"|} in
145 if Str.string_match re msg 0 then
146 let child = Str.matched_group 1 msg in
147 let parent = Str.matched_group 2 msg in
148 Some ((`Element (`Not_allowed_as_child (`Child child, `Parent parent)) : Htmlrw_check.Error_code.t),
149 Some child, None)
150 else None
151
152let pattern_attr_not_allowed_on_element msg =
153 (* "Attribute "X" not allowed on element "Y"..." *)
154 let re = Str.regexp {|Attribute "\([^"]+\)" not allowed on element "\([^"]+\)"|} in
155 if Str.string_match re msg 0 then
156 let attr = Str.matched_group 1 msg in
157 let element = Str.matched_group 2 msg in
158 Some ((`Attr (`Not_allowed (`Attr attr, `Elem element)) : Htmlrw_check.Error_code.t),
159 Some element, Some attr)
160 else None
161
162let pattern_attr_not_allowed_here msg =
163 (* "Attribute "X" not allowed here." *)
164 let re = Str.regexp {|Attribute "\([^"]+\)" not allowed here|} in
165 if Str.string_match re msg 0 then
166 let attr = Str.matched_group 1 msg in
167 Some ((`Attr (`Not_allowed_here (`Attr attr)) : Htmlrw_check.Error_code.t),
168 None, Some attr)
169 else None
170
171let pattern_missing_required_attr msg =
172 (* "Element "X" is missing required attribute "Y"." *)
173 let re = Str.regexp {|Element "\([^"]+\)" is missing required attribute "\([^"]+\)"|} in
174 if Str.string_match re msg 0 then
175 let element = Str.matched_group 1 msg in
176 let attr = Str.matched_group 2 msg in
177 Some ((`Attr (`Missing (`Elem element, `Attr attr)) : Htmlrw_check.Error_code.t),
178 Some element, Some attr)
179 else None
180
181let pattern_missing_required_child msg =
182 (* "Element "X" is missing required child element "Y"." *)
183 let re = Str.regexp {|Element "\([^"]+\)" is missing required child element "\([^"]+\)"|} in
184 if Str.string_match re msg 0 then
185 let parent = Str.matched_group 1 msg in
186 let child = Str.matched_group 2 msg in
187 Some ((`Element (`Missing_child (`Parent parent, `Child child)) : Htmlrw_check.Error_code.t),
188 Some parent, None)
189 else None
190
191let pattern_duplicate_id msg =
192 (* "Duplicate ID "X"." *)
193 let re = Str.regexp {|Duplicate ID "\([^"]+\)"|} in
194 if Str.string_match re msg 0 then
195 let id = Str.matched_group 1 msg in
196 Some ((`Attr (`Duplicate_id (`Id id)) : Htmlrw_check.Error_code.t),
197 None, None)
198 else None
199
200let pattern_obsolete_element msg =
201 (* "The "X" element is obsolete." *)
202 let re = Str.regexp {|The "\([^"]+\)" element is obsolete|} in
203 if Str.string_match re msg 0 then
204 let element = Str.matched_group 1 msg in
205 Some ((`Element (`Obsolete (`Elem element, `Suggestion "")) : Htmlrw_check.Error_code.t),
206 Some element, None)
207 else None
208
209let pattern_obsolete_attr msg =
210 (* "The "X" attribute on the "Y" element is obsolete." *)
211 let re = Str.regexp {|The "\([^"]+\)" attribute on the "\([^"]+\)" element is obsolete|} in
212 if Str.string_match re msg 0 then
213 let attr = Str.matched_group 1 msg in
214 let element = Str.matched_group 2 msg in
215 Some ((`Element (`Obsolete_attr (`Elem element, `Attr attr, `Suggestion None)) : Htmlrw_check.Error_code.t),
216 Some element, Some attr)
217 else None
218
219let pattern_stray_end_tag msg =
220 (* "Stray end tag "X"." *)
221 let re = Str.regexp {|Stray end tag "\([^"]+\)"|} in
222 if Str.string_match re msg 0 then
223 let tag = Str.matched_group 1 msg in
224 Some ((`Tag (`Stray_end (`Tag tag)) : Htmlrw_check.Error_code.t),
225 Some tag, None)
226 else None
227
228let pattern_stray_start_tag msg =
229 (* "Stray start tag "X"." *)
230 let re = Str.regexp {|Stray start tag "\([^"]+\)"|} in
231 if Str.string_match re msg 0 then
232 let tag = Str.matched_group 1 msg in
233 Some ((`Tag (`Stray_start (`Tag tag)) : Htmlrw_check.Error_code.t),
234 Some tag, None)
235 else None
236
237let pattern_unnecessary_role msg =
238 (* "The "X" role is unnecessary for..." *)
239 let re = Str.regexp {|The "\([^"]+\)" role is unnecessary for \(.*\)|} in
240 if Str.string_match re msg 0 then
241 let role = Str.matched_group 1 msg in
242 let reason = Str.matched_group 2 msg in
243 Some ((`Aria (`Unnecessary_role (`Role role, `Elem "", `Reason reason)) : Htmlrw_check.Error_code.t),
244 None, None)
245 else None
246
247let pattern_bad_role msg =
248 (* "Bad value "X" for attribute "role" on element "Y"." *)
249 let re = Str.regexp {|Bad value "\([^"]+\)" for attribute "role" on element "\([^"]+\)"|} in
250 if Str.string_match re msg 0 then
251 let role = Str.matched_group 1 msg in
252 let element = Str.matched_group 2 msg in
253 Some ((`Aria (`Bad_role (`Elem element, `Role role)) : Htmlrw_check.Error_code.t),
254 Some element, Some "role")
255 else None
256
257let pattern_aria_must_not_be_specified msg =
258 (* "The "X" attribute must not be specified on any "Y" element unless..." *)
259 let re = Str.regexp {|The "\([^"]+\)" attribute must not be specified on any "\([^"]+\)" element unless \(.*\)|} in
260 if Str.string_match re msg 0 then
261 let attr = Str.matched_group 1 msg in
262 let element = Str.matched_group 2 msg in
263 let condition = Str.matched_group 3 msg in
264 Some ((`Aria (`Must_not_specify (`Attr attr, `Elem element, `Condition condition)) : Htmlrw_check.Error_code.t),
265 Some element, Some attr)
266 else None
267
268let pattern_aria_must_not_be_used msg =
269 (* "The "X" attribute must not be used on an "Y" element which has..." *)
270 let re = Str.regexp {|The "\([^"]+\)" attribute must not be used on an "\([^"]+\)" element which has \(.*\)|} in
271 if Str.string_match re msg 0 then
272 let attr = Str.matched_group 1 msg in
273 let element = Str.matched_group 2 msg in
274 let condition = Str.matched_group 3 msg in
275 Some ((`Aria (`Must_not_use (`Attr attr, `Elem element, `Condition condition)) : Htmlrw_check.Error_code.t),
276 Some element, Some attr)
277 else None
278
279let pattern_bad_attr_value msg =
280 (* "Bad value "X" for attribute "Y" on element "Z": ..." *)
281 let re = Str.regexp {|Bad value "\([^"]*\)" for attribute "\([^"]+\)" on element "\([^"]+\)"|} in
282 if Str.string_match re msg 0 then
283 let value = Str.matched_group 1 msg in
284 let attr = Str.matched_group 2 msg in
285 let element = Str.matched_group 3 msg in
286 (* Extract reason after the colon if present *)
287 let reason =
288 try
289 let colon_pos = String.index_from msg (Str.match_end ()) ':' in
290 String.trim (String.sub msg (colon_pos + 1) (String.length msg - colon_pos - 1))
291 with Not_found -> ""
292 in
293 Some ((`Attr (`Bad_value (`Elem element, `Attr attr, `Value value, `Reason reason)) : Htmlrw_check.Error_code.t),
294 Some element, Some attr)
295 else None
296
297let pattern_end_tag_implied msg =
298 (* "End tag "X" implied, but there were open elements." *)
299 let re = Str.regexp {|End tag "\([^"]+\)" implied, but there were open elements|} in
300 if Str.string_match re msg 0 then
301 let tag = Str.matched_group 1 msg in
302 Some ((`Tag (`End_implied_open (`Tag tag)) : Htmlrw_check.Error_code.t),
303 Some tag, None)
304 else None
305
306let pattern_no_element_in_scope msg =
307 (* "No "X" element in scope but a "X" end tag seen." *)
308 let re = Str.regexp {|No "\([^"]+\)" element in scope but a "\([^"]+\)" end tag seen|} in
309 if Str.string_match re msg 0 then
310 let tag = Str.matched_group 1 msg in
311 Some ((`Tag (`Not_in_scope (`Tag tag)) : Htmlrw_check.Error_code.t),
312 Some tag, None)
313 else None
314
315let pattern_start_tag_in_table msg =
316 (* "Start tag "X" seen in "table"." *)
317 let re = Str.regexp {|Start tag "\([^"]+\)" seen in "table"|} in
318 if Str.string_match re msg 0 then
319 let tag = Str.matched_group 1 msg in
320 Some ((`Tag (`Start_in_table (`Tag tag)) : Htmlrw_check.Error_code.t),
321 Some tag, None)
322 else None
323
324(** All pattern matchers in priority order *)
325let patterns = [
326 pattern_element_not_allowed;
327 pattern_attr_not_allowed_on_element;
328 pattern_attr_not_allowed_here;
329 pattern_missing_required_attr;
330 pattern_missing_required_child;
331 pattern_duplicate_id;
332 pattern_obsolete_element;
333 pattern_obsolete_attr;
334 pattern_stray_end_tag;
335 pattern_stray_start_tag;
336 pattern_unnecessary_role;
337 pattern_bad_role;
338 pattern_aria_must_not_be_specified;
339 pattern_aria_must_not_be_used;
340 pattern_bad_attr_value;
341 pattern_end_tag_implied;
342 pattern_no_element_in_scope;
343 pattern_start_tag_in_table;
344]
345
346(** Try to recognize the error code from a message *)
347let recognize_error_code msg =
348 let normalized = normalize_quotes msg in
349 let rec try_patterns = function
350 | [] -> (None, None, None)
351 | p :: rest ->
352 match p normalized with
353 | Some (code, elem, attr) -> (Some code, elem, attr)
354 | None -> try_patterns rest
355 in
356 try_patterns patterns
357
358(** Infer severity from message patterns *)
359let infer_severity msg =
360 let normalized = String.lowercase_ascii msg in
361 if String.sub normalized 0 (min 8 (String.length normalized)) = "consider" then
362 Some `Info
363 else if String.sub normalized 0 (min 3 (String.length normalized)) = "the"
364 && (try let _ = Str.search_forward (Str.regexp_string "is unnecessary") normalized 0 in true
365 with Not_found -> false) then
366 Some `Warning
367 else
368 Some `Error
369
370let parse message =
371 let (error_code, element, attribute) = recognize_error_code message in
372 let severity = infer_severity message in
373 {
374 message;
375 error_code;
376 line = None;
377 column = None;
378 element;
379 attribute;
380 severity;
381 }
382
383let parse_json_value ~get_string ~get_int ~message_field =
384 let message = match message_field with
385 | Some m -> m
386 | None -> match get_string "message" with Some m -> m | None -> ""
387 in
388 let base = parse message in
389 { base with
390 line = (match get_int "line" with Some l -> Some l | None -> base.line);
391 column = (match get_int "column" with Some c -> Some c | None -> base.column);
392 element = (match get_string "element" with Some e -> Some e | None -> base.element);
393 attribute = (match get_string "attribute" with Some a -> Some a | None -> base.attribute);
394 }
395
396(** Compare error codes for semantic equality *)
397let error_codes_match code1 code2 =
398 (* Use structural equality for all polymorphic variant error codes *)
399 code1 = code2
400
401let matches ~strictness ~expected ~actual =
402 let expected_norm = normalize_quotes expected.message in
403 let actual_norm = normalize_quotes actual.Htmlrw_check.text in
404
405 (* Check severity match *)
406 let severity_matches =
407 match (expected.severity, actual.Htmlrw_check.severity) with
408 | (None, _) -> true
409 | (Some `Error, Htmlrw_check.Error) -> true
410 | (Some `Warning, Htmlrw_check.Warning) -> true
411 | (Some `Info, Htmlrw_check.Info) -> true
412 | _ -> false
413 in
414
415 (* Check location match *)
416 let location_matches =
417 match (expected.line, expected.column, actual.Htmlrw_check.location) with
418 | (None, None, _) -> true
419 | (Some el, Some ec, Some loc) -> loc.line = el && loc.column = ec
420 | (Some el, None, Some loc) -> loc.line = el
421 | _ -> false
422 in
423
424 (* Check error code match *)
425 let code_matches =
426 match (expected.error_code, actual.Htmlrw_check.error_code) with
427 | (None, _) -> true (* No expected code to match *)
428 | (Some ec, Htmlrw_check.Conformance ac) -> error_codes_match ec ac
429 | (Some _, Htmlrw_check.Parse _) -> false (* Expected conformance but got parse error *)
430 in
431
432 (* Check message text *)
433 let exact_text_match = actual_norm = expected_norm in
434 (* Truncation-aware match: expected may have ellipsis where actual has full value *)
435 let truncation_match = truncation_aware_match expected.message actual.Htmlrw_check.text in
436 let substring_match =
437 try let _ = Str.search_forward (Str.regexp_string expected_norm) actual_norm 0 in true
438 with Not_found -> false
439 in
440
441 (* Determine match quality *)
442 if not severity_matches && strictness.require_severity then
443 Severity_mismatch
444 else if exact_text_match && code_matches && (location_matches || not strictness.require_location) then
445 Exact_match
446 else if code_matches && expected.error_code <> None then
447 Code_match
448 else if exact_text_match then
449 Message_match
450 else if truncation_match then
451 Message_match (* Treat truncation match same as message match *)
452 else if substring_match && not strictness.require_exact_message then
453 Substring_match
454 else
455 No_match
456
457let is_acceptable ~strictness quality =
458 match quality with
459 | Exact_match -> true
460 | Code_match -> not strictness.require_exact_message
461 | Message_match -> not strictness.require_error_code
462 | Substring_match -> not strictness.require_exact_message
463 | Severity_mismatch -> not strictness.require_severity
464 | No_match -> false
465
466let match_quality_to_string = function
467 | Exact_match -> "exact"
468 | Code_match -> "code"
469 | Message_match -> "message"
470 | Substring_match -> "substring"
471 | Severity_mismatch -> "severity-mismatch"
472 | No_match -> "no-match"