OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Structured expected messages from Nu validator. *)
2
3type t = {
4 message: string;
5 error_code: Html5_checker.Error_code.t option;
6 line: int option;
7 column: int option;
8 element: string option;
9 attribute: string option;
10 severity: [`Error | `Warning | `Info] option;
11}
12
13type match_quality =
14 | Exact_match
15 | Code_match
16 | Message_match
17 | Substring_match
18 | Severity_mismatch
19 | No_match
20
21type strictness = {
22 require_exact_message: bool;
23 require_error_code: bool;
24 require_location: bool;
25 require_severity: bool;
26}
27
28let lenient = {
29 require_exact_message = false;
30 require_error_code = false;
31 require_location = false;
32 require_severity = false;
33}
34
35(** Practical strict mode: requires exact message text but not typed error codes *)
36let exact_message = {
37 require_exact_message = true;
38 require_error_code = false;
39 require_location = false;
40 require_severity = false;
41}
42
43(** Full strict mode: all checks enabled (requires typed error code migration) *)
44let strict = {
45 require_exact_message = true;
46 require_error_code = true;
47 require_location = true;
48 require_severity = true;
49}
50
51(** Normalize Unicode curly quotes to ASCII for comparison *)
52let normalize_quotes s =
53 let buf = Buffer.create (String.length s) in
54 let i = ref 0 in
55 while !i < String.length s do
56 let c = s.[!i] in
57 if !i + 2 < String.length s && c = '\xe2' then begin
58 let c1 = s.[!i + 1] in
59 let c2 = s.[!i + 2] in
60 if c1 = '\x80' && (c2 = '\x9c' || c2 = '\x9d') then begin
61 Buffer.add_char buf '"';
62 i := !i + 3
63 end else begin
64 Buffer.add_char buf c;
65 incr i
66 end
67 end else begin
68 Buffer.add_char buf c;
69 incr i
70 end
71 done;
72 Buffer.contents buf
73
74(** Pattern matchers for Nu validator messages.
75 Each returns (error_code option, element option, attribute option) *)
76
77let pattern_element_not_allowed msg =
78 (* "Element "X" not allowed as child of element "Y"..." *)
79 let re = Str.regexp {|Element "\([^"]+\)" not allowed as child of element "\([^"]+\)"|} in
80 if Str.string_match re msg 0 then
81 let child = Str.matched_group 1 msg in
82 let parent = Str.matched_group 2 msg in
83 Some (Html5_checker.Error_code.Element_not_allowed_as_child { child; parent },
84 Some child, None)
85 else None
86
87let pattern_attr_not_allowed_on_element msg =
88 (* "Attribute "X" not allowed on element "Y"..." *)
89 let re = Str.regexp {|Attribute "\([^"]+\)" not allowed on element "\([^"]+\)"|} in
90 if Str.string_match re msg 0 then
91 let attr = Str.matched_group 1 msg in
92 let element = Str.matched_group 2 msg in
93 Some (Html5_checker.Error_code.Attr_not_allowed_on_element { attr; element },
94 Some element, Some attr)
95 else None
96
97let pattern_attr_not_allowed_here msg =
98 (* "Attribute "X" not allowed here." *)
99 let re = Str.regexp {|Attribute "\([^"]+\)" not allowed here|} in
100 if Str.string_match re msg 0 then
101 let attr = Str.matched_group 1 msg in
102 Some (Html5_checker.Error_code.Attr_not_allowed_here { attr },
103 None, Some attr)
104 else None
105
106let pattern_missing_required_attr msg =
107 (* "Element "X" is missing required attribute "Y"." *)
108 let re = Str.regexp {|Element "\([^"]+\)" is missing required attribute "\([^"]+\)"|} in
109 if Str.string_match re msg 0 then
110 let element = Str.matched_group 1 msg in
111 let attr = Str.matched_group 2 msg in
112 Some (Html5_checker.Error_code.Missing_required_attr { element; attr },
113 Some element, Some attr)
114 else None
115
116let pattern_missing_required_child msg =
117 (* "Element "X" is missing required child element "Y"." *)
118 let re = Str.regexp {|Element "\([^"]+\)" is missing required child element "\([^"]+\)"|} in
119 if Str.string_match re msg 0 then
120 let parent = Str.matched_group 1 msg in
121 let child = Str.matched_group 2 msg in
122 Some (Html5_checker.Error_code.Missing_required_child { parent; child },
123 Some parent, None)
124 else None
125
126let pattern_duplicate_id msg =
127 (* "Duplicate ID "X"." *)
128 let re = Str.regexp {|Duplicate ID "\([^"]+\)"|} in
129 if Str.string_match re msg 0 then
130 let id = Str.matched_group 1 msg in
131 Some (Html5_checker.Error_code.Duplicate_id { id },
132 None, None)
133 else None
134
135let pattern_obsolete_element msg =
136 (* "The "X" element is obsolete." *)
137 let re = Str.regexp {|The "\([^"]+\)" element is obsolete|} in
138 if Str.string_match re msg 0 then
139 let element = Str.matched_group 1 msg in
140 Some (Html5_checker.Error_code.Obsolete_element { element; suggestion = "" },
141 Some element, None)
142 else None
143
144let pattern_obsolete_attr msg =
145 (* "The "X" attribute on the "Y" element is obsolete." *)
146 let re = Str.regexp {|The "\([^"]+\)" attribute on the "\([^"]+\)" element is obsolete|} in
147 if Str.string_match re msg 0 then
148 let attr = Str.matched_group 1 msg in
149 let element = Str.matched_group 2 msg in
150 Some (Html5_checker.Error_code.Obsolete_attr { attr; element; suggestion = None },
151 Some element, Some attr)
152 else None
153
154let pattern_stray_end_tag msg =
155 (* "Stray end tag "X"." *)
156 let re = Str.regexp {|Stray end tag "\([^"]+\)"|} in
157 if Str.string_match re msg 0 then
158 let tag = Str.matched_group 1 msg in
159 Some (Html5_checker.Error_code.Stray_end_tag { tag },
160 Some tag, None)
161 else None
162
163let pattern_stray_start_tag msg =
164 (* "Stray start tag "X"." *)
165 let re = Str.regexp {|Stray start tag "\([^"]+\)"|} in
166 if Str.string_match re msg 0 then
167 let tag = Str.matched_group 1 msg in
168 Some (Html5_checker.Error_code.Stray_start_tag { tag },
169 Some tag, None)
170 else None
171
172let pattern_unnecessary_role msg =
173 (* "The "X" role is unnecessary for..." *)
174 let re = Str.regexp {|The "\([^"]+\)" role is unnecessary for \(.*\)|} in
175 if Str.string_match re msg 0 then
176 let role = Str.matched_group 1 msg in
177 let reason = Str.matched_group 2 msg in
178 Some (Html5_checker.Error_code.Unnecessary_role { role; element = ""; reason },
179 None, None)
180 else None
181
182let pattern_bad_role msg =
183 (* "Bad value "X" for attribute "role" on element "Y"." *)
184 let re = Str.regexp {|Bad value "\([^"]+\)" for attribute "role" on element "\([^"]+\)"|} in
185 if Str.string_match re msg 0 then
186 let role = Str.matched_group 1 msg in
187 let element = Str.matched_group 2 msg in
188 Some (Html5_checker.Error_code.Bad_role { element; role },
189 Some element, Some "role")
190 else None
191
192let pattern_aria_must_not_be_specified msg =
193 (* "The "X" attribute must not be specified on any "Y" element unless..." *)
194 let re = Str.regexp {|The "\([^"]+\)" attribute must not be specified on any "\([^"]+\)" element unless \(.*\)|} in
195 if Str.string_match re msg 0 then
196 let attr = Str.matched_group 1 msg in
197 let element = Str.matched_group 2 msg in
198 let condition = Str.matched_group 3 msg in
199 Some (Html5_checker.Error_code.Aria_must_not_be_specified { attr; element; condition },
200 Some element, Some attr)
201 else None
202
203let pattern_aria_must_not_be_used msg =
204 (* "The "X" attribute must not be used on an "Y" element which has..." *)
205 let re = Str.regexp {|The "\([^"]+\)" attribute must not be used on an "\([^"]+\)" element which has \(.*\)|} in
206 if Str.string_match re msg 0 then
207 let attr = Str.matched_group 1 msg in
208 let element = Str.matched_group 2 msg in
209 let condition = Str.matched_group 3 msg in
210 Some (Html5_checker.Error_code.Aria_must_not_be_used { attr; element; condition },
211 Some element, Some attr)
212 else None
213
214let pattern_bad_attr_value msg =
215 (* "Bad value "X" for attribute "Y" on element "Z": ..." *)
216 let re = Str.regexp {|Bad value "\([^"]*\)" for attribute "\([^"]+\)" on element "\([^"]+\)"|} in
217 if Str.string_match re msg 0 then
218 let value = Str.matched_group 1 msg in
219 let attr = Str.matched_group 2 msg in
220 let element = Str.matched_group 3 msg in
221 (* Extract reason after the colon if present *)
222 let reason =
223 try
224 let colon_pos = String.index_from msg (Str.match_end ()) ':' in
225 String.trim (String.sub msg (colon_pos + 1) (String.length msg - colon_pos - 1))
226 with Not_found -> ""
227 in
228 Some (Html5_checker.Error_code.Bad_attr_value { element; attr; value; reason },
229 Some element, Some attr)
230 else None
231
232let pattern_end_tag_implied msg =
233 (* "End tag "X" implied, but there were open elements." *)
234 let re = Str.regexp {|End tag "\([^"]+\)" implied, but there were open elements|} in
235 if Str.string_match re msg 0 then
236 let tag = Str.matched_group 1 msg in
237 Some (Html5_checker.Error_code.End_tag_implied_open_elements { tag },
238 Some tag, None)
239 else None
240
241let pattern_no_element_in_scope msg =
242 (* "No "X" element in scope but a "X" end tag seen." *)
243 let re = Str.regexp {|No "\([^"]+\)" element in scope but a "\([^"]+\)" end tag seen|} in
244 if Str.string_match re msg 0 then
245 let tag = Str.matched_group 1 msg in
246 Some (Html5_checker.Error_code.No_element_in_scope { tag },
247 Some tag, None)
248 else None
249
250let pattern_start_tag_in_table msg =
251 (* "Start tag "X" seen in "table"." *)
252 let re = Str.regexp {|Start tag "\([^"]+\)" seen in "table"|} in
253 if Str.string_match re msg 0 then
254 let tag = Str.matched_group 1 msg in
255 Some (Html5_checker.Error_code.Start_tag_in_table { tag },
256 Some tag, None)
257 else None
258
259(** All pattern matchers in priority order *)
260let patterns = [
261 pattern_element_not_allowed;
262 pattern_attr_not_allowed_on_element;
263 pattern_attr_not_allowed_here;
264 pattern_missing_required_attr;
265 pattern_missing_required_child;
266 pattern_duplicate_id;
267 pattern_obsolete_element;
268 pattern_obsolete_attr;
269 pattern_stray_end_tag;
270 pattern_stray_start_tag;
271 pattern_unnecessary_role;
272 pattern_bad_role;
273 pattern_aria_must_not_be_specified;
274 pattern_aria_must_not_be_used;
275 pattern_bad_attr_value;
276 pattern_end_tag_implied;
277 pattern_no_element_in_scope;
278 pattern_start_tag_in_table;
279]
280
281(** Try to recognize the error code from a message *)
282let recognize_error_code msg =
283 let normalized = normalize_quotes msg in
284 let rec try_patterns = function
285 | [] -> (None, None, None)
286 | p :: rest ->
287 match p normalized with
288 | Some (code, elem, attr) -> (Some code, elem, attr)
289 | None -> try_patterns rest
290 in
291 try_patterns patterns
292
293(** Infer severity from message patterns *)
294let infer_severity msg =
295 let normalized = String.lowercase_ascii msg in
296 if String.sub normalized 0 (min 8 (String.length normalized)) = "consider" then
297 Some `Info
298 else if String.sub normalized 0 (min 3 (String.length normalized)) = "the"
299 && (try let _ = Str.search_forward (Str.regexp_string "is unnecessary") normalized 0 in true
300 with Not_found -> false) then
301 Some `Warning
302 else
303 Some `Error
304
305let parse message =
306 let (error_code, element, attribute) = recognize_error_code message in
307 let severity = infer_severity message in
308 {
309 message;
310 error_code;
311 line = None;
312 column = None;
313 element;
314 attribute;
315 severity;
316 }
317
318let parse_json_value ~get_string ~get_int ~message_field =
319 let message = match message_field with
320 | Some m -> m
321 | None -> match get_string "message" with Some m -> m | None -> ""
322 in
323 let base = parse message in
324 { base with
325 line = (match get_int "line" with Some l -> Some l | None -> base.line);
326 column = (match get_int "column" with Some c -> Some c | None -> base.column);
327 element = (match get_string "element" with Some e -> Some e | None -> base.element);
328 attribute = (match get_string "attribute" with Some a -> Some a | None -> base.attribute);
329 }
330
331(** Compare error codes for semantic equality *)
332let error_codes_match code1 code2 =
333 match (code1, code2) with
334 | (Html5_checker.Error_code.Element_not_allowed_as_child { child = c1; parent = p1 },
335 Html5_checker.Error_code.Element_not_allowed_as_child { child = c2; parent = p2 }) ->
336 String.lowercase_ascii c1 = String.lowercase_ascii c2 &&
337 String.lowercase_ascii p1 = String.lowercase_ascii p2
338 | (Html5_checker.Error_code.Attr_not_allowed_on_element { attr = a1; element = e1 },
339 Html5_checker.Error_code.Attr_not_allowed_on_element { attr = a2; element = e2 }) ->
340 String.lowercase_ascii a1 = String.lowercase_ascii a2 &&
341 String.lowercase_ascii e1 = String.lowercase_ascii e2
342 | (Html5_checker.Error_code.Missing_required_attr { element = e1; attr = a1 },
343 Html5_checker.Error_code.Missing_required_attr { element = e2; attr = a2 }) ->
344 String.lowercase_ascii e1 = String.lowercase_ascii e2 &&
345 String.lowercase_ascii a1 = String.lowercase_ascii a2
346 | (Html5_checker.Error_code.Duplicate_id { id = i1 },
347 Html5_checker.Error_code.Duplicate_id { id = i2 }) ->
348 i1 = i2
349 | (Html5_checker.Error_code.Stray_end_tag { tag = t1 },
350 Html5_checker.Error_code.Stray_end_tag { tag = t2 }) ->
351 String.lowercase_ascii t1 = String.lowercase_ascii t2
352 | (Html5_checker.Error_code.Stray_start_tag { tag = t1 },
353 Html5_checker.Error_code.Stray_start_tag { tag = t2 }) ->
354 String.lowercase_ascii t1 = String.lowercase_ascii t2
355 (* For other cases, fall back to structural equality *)
356 | (c1, c2) -> c1 = c2
357
358let matches ~strictness ~expected ~actual =
359 let expected_norm = normalize_quotes expected.message in
360 let actual_norm = normalize_quotes actual.Html5_checker.Message.message in
361
362 (* Check severity match *)
363 let severity_matches =
364 match (expected.severity, actual.Html5_checker.Message.severity) with
365 | (None, _) -> true
366 | (Some `Error, Html5_checker.Message.Error) -> true
367 | (Some `Warning, Html5_checker.Message.Warning) -> true
368 | (Some `Info, Html5_checker.Message.Info) -> true
369 | _ -> false
370 in
371
372 (* Check location match *)
373 let location_matches =
374 match (expected.line, expected.column, actual.Html5_checker.Message.location) with
375 | (None, None, _) -> true
376 | (Some el, Some ec, Some loc) -> loc.line = el && loc.column = ec
377 | (Some el, None, Some loc) -> loc.line = el
378 | _ -> false
379 in
380
381 (* Check error code match *)
382 let code_matches =
383 match (expected.error_code, actual.Html5_checker.Message.error_code) with
384 | (None, _) -> true (* No expected code to match *)
385 | (Some ec, Some ac) -> error_codes_match ec ac
386 | (Some _, None) -> false (* Expected typed but got untyped *)
387 in
388
389 (* Check message text *)
390 let exact_text_match = actual_norm = expected_norm in
391 let substring_match =
392 try let _ = Str.search_forward (Str.regexp_string expected_norm) actual_norm 0 in true
393 with Not_found -> false
394 in
395
396 (* Determine match quality *)
397 if not severity_matches && strictness.require_severity then
398 Severity_mismatch
399 else if exact_text_match && code_matches && (location_matches || not strictness.require_location) then
400 Exact_match
401 else if code_matches && expected.error_code <> None then
402 Code_match
403 else if exact_text_match then
404 Message_match
405 else if substring_match && not strictness.require_exact_message then
406 Substring_match
407 else
408 No_match
409
410let is_acceptable ~strictness quality =
411 match quality with
412 | Exact_match -> true
413 | Code_match -> not strictness.require_exact_message
414 | Message_match -> not strictness.require_error_code
415 | Substring_match -> not strictness.require_exact_message
416 | Severity_mismatch -> not strictness.require_severity
417 | No_match -> false
418
419let match_quality_to_string = function
420 | Exact_match -> "exact"
421 | Code_match -> "code"
422 | Message_match -> "message"
423 | Substring_match -> "substring"
424 | Severity_mismatch -> "severity-mismatch"
425 | No_match -> "no-match"