+34
-11
lib/htmlrw_check/htmlrw_check.ml
+34
-11
lib/htmlrw_check/htmlrw_check.ml
···
5
5
6
6
module Error_code = Error_code
7
7
8
-
(* Public types - defined here to avoid re-exporting internal modules *)
8
+
(* Public types *)
9
9
10
10
type severity = Error | Warning | Info
11
11
···
17
17
system_id : string option;
18
18
}
19
19
20
+
type error_code =
21
+
| Parse of Html5rw.Parse_error_code.t
22
+
| Conformance of Error_code.t
23
+
20
24
type message = {
21
25
severity : severity;
22
26
text : string;
23
-
code : string;
24
-
error_code : Error_code.t option;
27
+
error_code : error_code;
25
28
location : location option;
26
29
element : string option;
27
30
attribute : string option;
···
49
52
system_id = loc.system_id;
50
53
}
51
54
55
+
let convert_error_code = function
56
+
| Message.Parse_error code -> Parse code
57
+
| Message.Conformance_error code -> Conformance code
58
+
52
59
let convert_message (m : Message.t) : message = {
53
60
severity = convert_severity m.severity;
54
61
text = m.message;
55
-
code = m.code;
56
-
error_code = m.error_code;
62
+
error_code = convert_error_code m.error_code;
57
63
location = Option.map convert_location m.location;
58
64
element = m.element;
59
65
attribute = m.attribute;
···
77
83
78
84
match Xhtml_parser.parse_xhtml content with
79
85
| Ok root ->
80
-
(* Run all registered checkers via DOM traversal *)
81
86
let registry = Checker_registry.default () in
82
87
Dom_walker.walk_registry registry collector root;
83
88
let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in
···
111
116
{ doc; msgs; sys_id = system_id }
112
117
end
113
118
119
+
let check_string ?system_id html =
120
+
let reader = Bytesrw.Bytes.Reader.of_string html in
121
+
check ?system_id reader
122
+
114
123
let check_parsed ?(collect_parse_errors = true) ?system_id doc =
115
124
let collector = Message_collector.create () in
116
125
···
138
147
let infos t =
139
148
List.filter (fun msg -> msg.severity = Info) t.msgs
140
149
150
+
let parse_errors t =
151
+
List.filter (fun msg ->
152
+
match msg.error_code with Parse _ -> true | Conformance _ -> false
153
+
) t.msgs
154
+
155
+
let conformance_errors t =
156
+
List.filter (fun msg ->
157
+
match msg.error_code with Parse _ -> false | Conformance _ -> true
158
+
) t.msgs
159
+
141
160
let has_errors t =
142
161
List.exists (fun msg -> msg.severity = Error) t.msgs
143
162
···
163
182
system_id = loc.system_id;
164
183
}
165
184
185
+
let unconvert_error_code = function
186
+
| Parse code -> Message.Parse_error code
187
+
| Conformance code -> Message.Conformance_error code
188
+
166
189
let unconvert_message (m : message) : Message.t = {
167
190
severity = unconvert_severity m.severity;
168
191
message = m.text;
169
-
code = m.code;
170
-
error_code = m.error_code;
192
+
error_code = unconvert_error_code m.error_code;
171
193
location = Option.map unconvert_location m.location;
172
194
element = m.element;
173
195
attribute = m.attribute;
···
193
215
| Warning -> "warning"
194
216
| Info -> "info"
195
217
218
+
let error_code_to_string = function
219
+
| Parse code -> Html5rw.Parse_error_code.to_string code
220
+
| Conformance code -> Error_code.code_string code
221
+
196
222
let pp_severity fmt sev =
197
223
Format.pp_print_string fmt (severity_to_string sev)
198
224
···
207
233
match msg.location with
208
234
| Some loc -> Format.fprintf fmt " (at %a)" pp_location loc
209
235
| None -> ()
210
-
211
-
let message_to_string msg =
212
-
Format.asprintf "%a" pp_message msg
+135
-247
lib/htmlrw_check/htmlrw_check.mli
+135
-247
lib/htmlrw_check/htmlrw_check.mli
···
5
5
6
6
(** HTML5 Conformance Checker
7
7
8
-
This module validates HTML5 documents against the
9
-
{{:https://html.spec.whatwg.org/} WHATWG HTML Living Standard},
10
-
reporting conformance errors, warnings, and suggestions.
8
+
Validates HTML5 documents against the
9
+
{{:https://html.spec.whatwg.org/} WHATWG HTML Living Standard}.
11
10
12
11
{2 Quick Start}
13
12
14
13
{[
15
-
(* Validate HTML from a string *)
16
-
let html = "<html><body><img></body></html>" in
17
-
let reader = Bytesrw.Bytes.Reader.of_string html in
18
-
let result = Htmlrw_check.check reader in
14
+
let result = Htmlrw_check.check_string "<html><body><img></body></html>" in
19
15
20
-
if Htmlrw_check.has_errors result then begin
21
-
List.iter (fun msg ->
22
-
Printf.printf "%s: %s\n"
23
-
(Htmlrw_check.severity_to_string msg.Htmlrw_check.severity)
24
-
msg.Htmlrw_check.text
25
-
) (Htmlrw_check.errors result)
26
-
end
16
+
if Htmlrw_check.has_errors result then
17
+
print_endline (Htmlrw_check.to_text result)
18
+
else
19
+
print_endline "Valid HTML5!"
27
20
]}
28
21
29
-
{2 What Gets Checked}
22
+
{2 Handling Specific Errors}
30
23
31
-
The checker validates:
24
+
Use pattern matching on {!field-message.error_code} for fine-grained control:
32
25
33
-
- {b Parse errors}: Malformed HTML syntax (missing end tags, invalid
34
-
nesting, etc.) per the WHATWG parsing specification
35
-
- {b Content model}: Elements appearing in contexts where they're not
36
-
allowed (e.g., [<div>] inside [<p>])
37
-
- {b Attributes}: Missing required attributes, disallowed attributes,
38
-
and invalid attribute values
39
-
- {b Accessibility}: ARIA role/attribute misuse, missing alt text on
40
-
images, form labeling issues
41
-
- {b Document structure}: Missing DOCTYPE, duplicate IDs, heading
42
-
hierarchy issues
43
-
- {b Internationalization}: Missing or mismatched lang attributes
26
+
{[
27
+
List.iter (fun msg ->
28
+
match msg.Htmlrw_check.error_code with
29
+
| Parse code ->
30
+
Printf.printf "Syntax error: %s\n"
31
+
(Html5rw.Parse_error_code.to_string code)
32
+
| Conformance code ->
33
+
match code with
34
+
| `Img `Missing_alt ->
35
+
Printf.printf "Accessibility: %s needs alt text\n"
36
+
(Option.value ~default:"image" msg.element)
37
+
| `Attr (`Duplicate_id _) ->
38
+
Printf.printf "Duplicate ID found\n"
39
+
| _ ->
40
+
Printf.printf "Error: %s\n" msg.text
41
+
) (Htmlrw_check.errors result)
42
+
]}
44
43
45
-
{2 Output Formats}
44
+
{2 CI Integration}
46
45
47
-
Results can be formatted as:
48
-
- {b Text}: Human-readable messages for terminal output
49
-
- {b JSON}: Machine-readable format compatible with Nu HTML Validator
50
-
- {b GNU}: Error format for IDE integration
46
+
{[
47
+
let validate_file path =
48
+
let ic = open_in path in
49
+
let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
50
+
let result = Htmlrw_check.check ~system_id:path reader in
51
+
close_in ic;
52
+
if Htmlrw_check.has_errors result then begin
53
+
print_string (Htmlrw_check.to_gnu result);
54
+
exit 1
55
+
end
56
+
]}
51
57
52
-
@see <https://html.spec.whatwg.org/>
53
-
WHATWG HTML Living Standard
54
-
@see <https://validator.w3.org/nu/>
55
-
Nu HTML Checker (reference validator) *)
58
+
{2 What Gets Checked}
56
59
57
-
(** {1:types Types} *)
60
+
- {b Parse errors}: Malformed syntax per WHATWG parsing specification
61
+
- {b Content model}: Invalid element nesting (e.g., [<div>] inside [<p>])
62
+
- {b Attributes}: Missing required, disallowed, or invalid attributes
63
+
- {b Accessibility}: ARIA misuse, missing alt text, form labeling
64
+
- {b Structure}: Missing DOCTYPE, duplicate IDs, heading hierarchy
65
+
- {b Internationalization}: Missing or mismatched lang attributes
58
66
59
-
(** Message severity level.
67
+
@see <https://html.spec.whatwg.org/> WHATWG HTML Living Standard
68
+
@see <https://validator.w3.org/nu/> Nu HTML Checker *)
60
69
61
-
- [Error]: Conformance error - the document violates the HTML5 spec
62
-
- [Warning]: Likely problem - should be reviewed but may be intentional
63
-
- [Info]: Suggestion - best practice recommendation *)
64
-
type severity = Error | Warning | Info
65
70
66
-
(** Source location of a validation issue.
71
+
(** {1:types Types} *)
72
+
73
+
(** Message severity level. *)
74
+
type severity =
75
+
| Error (** Conformance violation - document is invalid *)
76
+
| Warning (** Likely problem - may be intentional *)
77
+
| Info (** Suggestion for improvement *)
67
78
68
-
Locations use 1-based line and column numbers matching typical editor
69
-
conventions. The [system_id] field contains the file path or URL if one
70
-
was provided to the checker. *)
79
+
(** Source location in the document. Line and column are 1-indexed. *)
71
80
type location = {
72
81
line : int;
73
-
(** Line number (1-indexed) where the issue was found. *)
74
-
75
82
column : int;
76
-
(** Column number (1-indexed) within the line. *)
77
-
78
83
end_line : int option;
79
-
(** End line for issues spanning multiple lines. *)
80
-
81
84
end_column : int option;
82
-
(** End column for range-based issues. *)
83
-
84
-
system_id : string option;
85
-
(** File path or URL, if provided to the checker. *)
85
+
system_id : string option; (** File path or URL if provided *)
86
86
}
87
87
88
-
(** A validation message describing a conformance issue.
88
+
(** Typed error code. Pattern match to handle specific errors.
89
89
90
-
Each message contains:
91
-
- The {!field-severity} indicating how serious the issue is
92
-
- Human-readable {!field-text} explaining the problem
93
-
- Machine-readable {!field-code} for programmatic handling
94
-
- Optional {!field-error_code} for fine-grained pattern matching
95
-
- Source {!field-location} when available
96
-
- Context ({!field-element}, {!field-attribute}) when relevant *)
90
+
{[
91
+
match msg.error_code with
92
+
| Parse Html5rw.Parse_error_code.Eof_in_tag ->
93
+
(* Unclosed tag at end of file *)
94
+
| Conformance (`Img `Missing_alt) ->
95
+
(* Image without alt attribute *)
96
+
| _ -> ()
97
+
]} *)
98
+
type error_code =
99
+
| Parse of Html5rw.Parse_error_code.t
100
+
(** Syntax error from the HTML5 parser.
101
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors> *)
102
+
| Conformance of Error_code.t
103
+
(** Semantic error from conformance checking. *)
104
+
105
+
(** A validation message. *)
97
106
type message = {
98
107
severity : severity;
99
-
(** Severity level of this message. *)
100
-
101
-
text : string;
102
-
(** Human-readable description of the issue.
103
-
104
-
The text follows Nu HTML Validator message conventions, using
105
-
Unicode quotes around element/attribute names:
106
-
["Element \xe2\x80\x9cdiv\xe2\x80\x9d not allowed as child..."] *)
107
-
108
-
code : string;
109
-
(** Machine-readable error code in kebab-case.
110
-
111
-
Examples: ["missing-alt"], ["duplicate-id"], ["unexpected-end-tag"].
112
-
Useful for filtering or categorizing errors programmatically. *)
113
-
114
-
error_code : Error_code.t option;
115
-
(** Typed error code for pattern matching.
116
-
117
-
When present, allows fine-grained handling of specific errors:
118
-
{[
119
-
match msg.error_code with
120
-
| Some (`Img `Missing_alt) -> suggest_alt_text ()
121
-
| Some (`Attr (`Duplicate_id (`Id id))) -> highlight_duplicate id
122
-
| _ -> show_generic_error msg
123
-
]} *)
124
-
125
-
location : location option;
126
-
(** Source location where the issue was detected.
127
-
128
-
[None] for document-level issues or when location tracking is
129
-
unavailable (e.g., for some content model errors). *)
130
-
131
-
element : string option;
132
-
(** Element name relevant to this message (e.g., ["img"], ["div"]).
133
-
134
-
Lowercase, without angle brackets. *)
135
-
136
-
attribute : string option;
137
-
(** Attribute name relevant to this message (e.g., ["alt"], ["href"]).
138
-
139
-
Lowercase. Only present for attribute-related errors. *)
140
-
141
-
extract : string option;
142
-
(** Source excerpt showing context around the error.
143
-
144
-
Typically a few characters before and after the problematic location.
145
-
Useful for displaying the error in context. *)
108
+
text : string; (** Human-readable description *)
109
+
error_code : error_code; (** Typed code for pattern matching *)
110
+
location : location option; (** Source location if available *)
111
+
element : string option; (** Relevant element (lowercase) *)
112
+
attribute : string option; (** Relevant attribute (lowercase) *)
113
+
extract : string option; (** Source excerpt for context *)
146
114
}
147
115
148
-
(** Validation result containing all messages and the parsed document.
149
-
150
-
Use {!messages}, {!errors}, {!warnings}, and {!infos} to access
151
-
the validation messages. Use {!document} to access the parsed DOM. *)
116
+
(** Validation result. Use accessors below to inspect. *)
152
117
type t
153
118
154
-
(** {1:validation Validation Functions} *)
155
119
156
-
(** Validate HTML from a reader.
120
+
(** {1:validation Validation} *)
157
121
158
-
Parses the HTML input and runs all conformance checks, returning
159
-
a result containing any validation messages.
122
+
(** Validate HTML from a string.
160
123
161
-
{b Example:}
162
124
{[
163
-
let ic = open_in "page.html" in
164
-
let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
165
-
let result = Htmlrw_check.check ~system_id:"page.html" reader in
166
-
close_in ic;
167
-
125
+
let result = Htmlrw_check.check_string html in
168
126
if Htmlrw_check.has_errors result then
169
-
print_endline (Htmlrw_check.to_text result)
127
+
prerr_endline (Htmlrw_check.to_text result)
170
128
]}
171
129
172
-
@param collect_parse_errors If [true] (default), include HTML parse
173
-
errors in the results. Set to [false] to only get conformance
174
-
checker errors (content model, attributes, etc.).
175
-
@param system_id File path or URL for the document. Used in error
176
-
messages and the {!location} field. Does not affect validation. *)
130
+
@param system_id File path or URL for error messages. *)
131
+
val check_string : ?system_id:string -> string -> t
132
+
133
+
(** Validate HTML from a reader.
134
+
135
+
@param collect_parse_errors Include syntax errors (default: [true]).
136
+
@param system_id File path or URL for error messages. *)
177
137
val check :
178
138
?collect_parse_errors:bool ->
179
139
?system_id:string ->
180
140
Bytesrw.Bytes.Reader.t ->
181
141
t
182
142
183
-
(** Validate an already-parsed HTML document.
184
-
185
-
Runs conformance checks on an existing {!Html5rw.t} parse result.
186
-
Useful when you've already parsed the document and want to validate
187
-
it without re-parsing.
188
-
189
-
{b Example:}
190
-
{[
191
-
let doc = Html5rw.parse reader in
192
-
(* ... manipulate the DOM ... *)
193
-
let result = Htmlrw_check.check_parsed doc in
194
-
]}
143
+
(** Validate an already-parsed document.
195
144
196
-
@param collect_parse_errors If [true] (default), include any parse
197
-
errors that were collected during the original parse.
198
-
@param system_id File path or URL for error reporting. *)
145
+
Useful when you've parsed the HTML separately and want to run
146
+
conformance checks without re-parsing. *)
199
147
val check_parsed :
200
148
?collect_parse_errors:bool ->
201
149
?system_id:string ->
202
150
Html5rw.t ->
203
151
t
204
152
205
-
(** {1:results Result Accessors} *)
206
153
207
-
(** Get all validation messages.
154
+
(** {1:results Results} *)
208
155
209
-
Returns messages in the order they were generated, which roughly
210
-
corresponds to document order for element-related errors. *)
156
+
(** All messages in document order. *)
211
157
val messages : t -> message list
212
158
213
-
(** Get only error messages.
214
-
215
-
Errors indicate conformance violations - the document does not
216
-
comply with the HTML5 specification. *)
159
+
(** Only error-severity messages. *)
217
160
val errors : t -> message list
218
161
219
-
(** Get only warning messages.
220
-
221
-
Warnings indicate likely problems that may be intentional in
222
-
some cases (e.g., deprecated features still in use). *)
162
+
(** Only warning-severity messages. *)
223
163
val warnings : t -> message list
224
164
225
-
(** Get only informational messages.
226
-
227
-
Info messages are suggestions for best practices that don't
228
-
affect conformance. *)
165
+
(** Only info-severity messages. *)
229
166
val infos : t -> message list
230
167
231
-
(** Test if any errors were found.
168
+
(** Only syntax errors from the parser. *)
169
+
val parse_errors : t -> message list
232
170
233
-
Equivalent to [errors result <> []] but more efficient. *)
234
-
val has_errors : t -> bool
171
+
(** Only semantic errors from conformance checking. *)
172
+
val conformance_errors : t -> message list
235
173
236
-
(** Test if any warnings were found.
174
+
(** [true] if any errors were found. *)
175
+
val has_errors : t -> bool
237
176
238
-
Equivalent to [warnings result <> []] but more efficient. *)
177
+
(** [true] if any warnings were found. *)
239
178
val has_warnings : t -> bool
240
179
241
-
(** Get the parsed document.
242
-
243
-
Returns the DOM tree that was validated. For {!check}, this is the
244
-
newly parsed document. For {!check_parsed}, this is the document
245
-
that was passed in. *)
180
+
(** The parsed document. *)
246
181
val document : t -> Html5rw.t
247
182
248
-
(** Get the system identifier.
249
-
250
-
Returns the file path or URL that was passed to {!check} or
251
-
{!check_parsed}, or [None] if not provided. *)
183
+
(** The system identifier (file path or URL) if provided. *)
252
184
val system_id : t -> string option
253
185
186
+
254
187
(** {1:formatting Output Formatting} *)
255
188
256
-
(** Format messages as human-readable text.
189
+
(** Human-readable text format.
257
190
258
-
Produces multi-line output suitable for terminal display:
259
191
{v
260
-
Error: Element "img" is missing required attribute "alt".
261
-
At line 5, column 3
262
-
<img src="photo.jpg">
263
-
v}
264
-
265
-
Messages are formatted with severity, description, location,
266
-
and source excerpt when available. *)
192
+
file.html:5.3: error [missing-alt]: Element "img" is missing required attribute "alt".
193
+
v} *)
267
194
val to_text : t -> string
268
195
269
-
(** Format messages as JSON.
196
+
(** JSON format compatible with Nu HTML Validator.
270
197
271
-
Produces JSON output compatible with the Nu HTML Validator format:
272
198
{v
273
-
{
274
-
"messages": [
275
-
{
276
-
"type": "error",
277
-
"message": "Element \"img\" is missing required attribute \"alt\".",
278
-
"lastLine": 5,
279
-
"lastColumn": 3
280
-
}
281
-
]
282
-
}
283
-
v}
284
-
285
-
Useful for machine processing and integration with other tools. *)
199
+
{"messages":[{"type":"error","message":"...","firstLine":5,"firstColumn":3}]}
200
+
v} *)
286
201
val to_json : t -> string
287
202
288
-
(** Format messages in GNU error format.
203
+
(** GNU error format for IDE integration.
289
204
290
-
Produces one-line-per-error output for IDE integration:
291
205
{v
292
-
page.html:5:3: error: Element "img" is missing required attribute "alt".
293
-
v}
294
-
295
-
This format is recognized by many editors and build tools. *)
206
+
file.html:5:3: error: Element "img" is missing required attribute "alt".
207
+
v} *)
296
208
val to_gnu : t -> string
297
209
298
-
(** {1:utilities Utility Functions} *)
299
210
300
-
(** Convert severity to lowercase string.
211
+
(** {1:utilities Utilities} *)
301
212
302
-
Returns ["error"], ["warning"], or ["info"]. *)
213
+
(** ["error"], ["warning"], or ["info"]. *)
303
214
val severity_to_string : severity -> string
304
215
305
-
(** Pretty-print a severity value. *)
216
+
(** String representation of an error code. *)
217
+
val error_code_to_string : error_code -> string
218
+
219
+
(** Pretty-printer for severity. *)
306
220
val pp_severity : Format.formatter -> severity -> unit
307
221
308
-
(** Pretty-print a location. *)
222
+
(** Pretty-printer for location. *)
309
223
val pp_location : Format.formatter -> location -> unit
310
224
311
-
(** Pretty-print a message.
312
-
313
-
Includes severity, text, and location if available. *)
225
+
(** Pretty-printer for message. *)
314
226
val pp_message : Format.formatter -> message -> unit
315
227
316
-
(** Convert a message to a single-line string.
317
228
318
-
Includes severity and message text. *)
319
-
val message_to_string : message -> string
320
-
321
-
(** {1:error_codes Error Codes}
229
+
(** {1:error_codes Error Code Types}
322
230
323
-
The {!Error_code} module provides typed error codes for programmatic
324
-
handling of validation issues. Use pattern matching to handle specific
325
-
errors:
231
+
For pattern matching on conformance errors. Parse errors use
232
+
{!Html5rw.Parse_error_code}.
326
233
327
234
{[
328
-
let handle_message msg =
329
-
match msg.Htmlrw_check.error_code with
330
-
| Some (`Img `Missing_alt) ->
331
-
(* Image accessibility issue *)
332
-
suggest_alt_text msg
333
-
| Some (`Attr (`Duplicate_id (`Id id))) ->
334
-
(* Duplicate ID found *)
335
-
highlight_all_with_id id
336
-
| Some (`Aria _) ->
337
-
(* Any ARIA-related error *)
338
-
show_aria_help ()
339
-
| _ ->
340
-
(* Generic handling *)
341
-
display_error msg
342
-
]}
343
-
344
-
The error codes are organized into categories:
345
-
- [`Attr _]: Attribute errors (missing, invalid, duplicate)
346
-
- [`Element _]: Element/content model errors
347
-
- [`Aria _]: ARIA accessibility errors
348
-
- [`Img _]: Image-related errors
349
-
- [`Table _]: Table structure errors
350
-
- And more...
351
-
352
-
See {!Error_code} for the complete type definition. *)
235
+
match code with
236
+
| `Attr (`Missing_required_attr _) -> ...
237
+
| `Img `Missing_alt -> ...
238
+
| `Aria _ -> ... (* Any ARIA error *)
239
+
| _ -> ...
240
+
]} *)
353
241
module Error_code = Error_code
+23
-18
lib/htmlrw_check/message.ml
+23
-18
lib/htmlrw_check/message.ml
···
10
10
system_id : string option;
11
11
}
12
12
13
+
type error_code =
14
+
| Parse_error of Html5rw.Parse_error_code.t
15
+
| Conformance_error of Error_code.t
16
+
13
17
type t = {
14
18
severity : severity;
15
19
message : string;
16
-
code : string;
17
-
error_code : Error_code.t option;
20
+
error_code : error_code;
18
21
location : location option;
19
22
element : string option;
20
23
attribute : string option;
···
24
27
let make_location ~line ~column ?end_line ?end_column ?system_id () =
25
28
{ line; column; end_line; end_column; system_id }
26
29
27
-
(** Create a message from a typed error code *)
28
-
let of_error_code ?location ?element ?attribute ?extract error_code =
30
+
(** Create a message from a conformance error code *)
31
+
let of_conformance_error ?location ?element ?attribute ?extract error_code =
29
32
let severity = match Error_code.severity error_code with
30
33
| Error_code.Error -> Error
31
34
| Error_code.Warning -> Warning
···
34
37
{
35
38
severity;
36
39
message = Error_code.to_message error_code;
37
-
code = Error_code.code_string error_code;
38
-
error_code = Some error_code;
40
+
error_code = Conformance_error error_code;
39
41
location;
40
42
element;
41
43
attribute;
42
44
extract;
43
45
}
44
46
45
-
(** Create a message with manual message text (for backwards compatibility during migration) *)
46
-
let make ~severity ~message ?(code="generic") ?location ?element ?attribute ?extract () =
47
-
{ severity; message; code; error_code = None; location; element; attribute; extract }
48
-
49
-
let error ~message ?(code="generic") ?location ?element ?attribute ?extract () =
50
-
make ~severity:Error ~message ~code ?location ?element ?attribute ?extract ()
51
-
52
-
let warning ~message ?(code="generic") ?location ?element ?attribute ?extract () =
53
-
make ~severity:Warning ~message ~code ?location ?element ?attribute ?extract ()
47
+
(** Create a message from a parse error code *)
48
+
let of_parse_error ?location ?element ?attribute ?extract ~message code =
49
+
{
50
+
severity = Error; (* Parse errors are always errors *)
51
+
message;
52
+
error_code = Parse_error code;
53
+
location;
54
+
element;
55
+
attribute;
56
+
extract;
57
+
}
54
58
55
-
let info ~message ?(code="generic") ?location ?element ?attribute ?extract () =
56
-
make ~severity:Info ~message ~code ?location ?element ?attribute ?extract ()
59
+
let error_code_to_string = function
60
+
| Parse_error code -> Html5rw.Parse_error_code.to_string code
61
+
| Conformance_error code -> Error_code.code_string code
57
62
58
63
let severity_to_string = function
59
64
| Error -> "error"
···
82
87
Format.fprintf fmt ": "
83
88
| None -> ());
84
89
pp_severity fmt msg.severity;
85
-
Format.fprintf fmt " [%s]" msg.code;
90
+
Format.fprintf fmt " [%s]" (error_code_to_string msg.error_code);
86
91
Format.fprintf fmt ": %s" msg.message;
87
92
(match msg.element with
88
93
| Some elem -> Format.fprintf fmt " (element: %s)" elem
+18
-42
lib/htmlrw_check/message.mli
+18
-42
lib/htmlrw_check/message.mli
···
18
18
system_id : string option; (** File path or URL *)
19
19
}
20
20
21
+
(** Unified error code type covering both parse errors and conformance errors. *)
22
+
type error_code =
23
+
| Parse_error of Html5rw.Parse_error_code.t
24
+
(** Parse error from the HTML5 tokenizer/parser *)
25
+
| Conformance_error of Error_code.t
26
+
(** Conformance error from semantic validation *)
27
+
21
28
(** A validation message. *)
22
29
type t = {
23
30
severity : severity;
24
31
message : string; (** Human-readable description *)
25
-
code : string; (** Machine-readable error code *)
26
-
error_code : Error_code.t option; (** Typed error code if available *)
32
+
error_code : error_code; (** Typed error code *)
27
33
location : location option;
28
34
element : string option; (** Element name if relevant *)
29
35
attribute : string option; (** Attribute name if relevant *)
···
32
38
33
39
(** {1 Constructors} *)
34
40
35
-
(** Create a message from a typed error code (preferred method). *)
36
-
val of_error_code :
41
+
(** Create a message from a conformance error code. *)
42
+
val of_conformance_error :
37
43
?location:location ->
38
44
?element:string ->
39
45
?attribute:string ->
···
41
47
Error_code.t ->
42
48
t
43
49
44
-
(** Create a validation message with specified severity (legacy). *)
45
-
val make :
46
-
severity:severity ->
47
-
message:string ->
48
-
?code:string ->
49
-
?location:location ->
50
-
?element:string ->
51
-
?attribute:string ->
52
-
?extract:string ->
53
-
unit ->
54
-
t
55
-
56
-
(** Create an error message (legacy). *)
57
-
val error :
58
-
message:string ->
59
-
?code:string ->
50
+
(** Create a message from a parse error code. *)
51
+
val of_parse_error :
60
52
?location:location ->
61
53
?element:string ->
62
54
?attribute:string ->
63
55
?extract:string ->
64
-
unit ->
65
-
t
66
-
67
-
(** Create a warning message (legacy). *)
68
-
val warning :
69
56
message:string ->
70
-
?code:string ->
71
-
?location:location ->
72
-
?element:string ->
73
-
?attribute:string ->
74
-
?extract:string ->
75
-
unit ->
76
-
t
77
-
78
-
(** Create an informational message (legacy). *)
79
-
val info :
80
-
message:string ->
81
-
?code:string ->
82
-
?location:location ->
83
-
?element:string ->
84
-
?attribute:string ->
85
-
?extract:string ->
86
-
unit ->
57
+
Html5rw.Parse_error_code.t ->
87
58
t
88
59
89
60
(** Create a location record. *)
···
95
66
?system_id:string ->
96
67
unit ->
97
68
location
69
+
70
+
(** {1 Utilities} *)
71
+
72
+
(** Get the string representation of an error code. *)
73
+
val error_code_to_string : error_code -> string
98
74
99
75
(** {1 Formatting} *)
100
76
+2
-27
lib/htmlrw_check/message_collector.ml
+2
-27
lib/htmlrw_check/message_collector.ml
···
13
13
14
14
let add t msg = t.messages <- msg :: t.messages
15
15
16
-
(** Add a message from a typed error code *)
16
+
(** Add a message from a typed conformance error code *)
17
17
let add_typed t ?location ?element ?attribute ?extract error_code =
18
18
(* Use provided location, or fall back to current_location *)
19
19
let loc = match location with
20
20
| Some _ -> location
21
21
| None -> t.current_location
22
22
in
23
-
let msg = Message.of_error_code ?location:loc ?element ?attribute ?extract error_code in
24
-
add t msg
25
-
26
-
(** Add an error from a typed error code *)
27
-
let add_error_code t ?location ?element ?attribute ?extract error_code =
28
-
add_typed t ?location ?element ?attribute ?extract error_code
29
-
30
-
(** Legacy: Add an error with manual message text *)
31
-
let add_error t ~message ?code ?location ?element ?attribute ?extract () =
32
-
let msg =
33
-
Message.error ~message ?code ?location ?element ?attribute ?extract ()
34
-
in
35
-
add t msg
36
-
37
-
(** Legacy: Add a warning with manual message text *)
38
-
let add_warning t ~message ?code ?location ?element ?attribute ?extract () =
39
-
let msg =
40
-
Message.warning ~message ?code ?location ?element ?attribute ?extract ()
41
-
in
42
-
add t msg
43
-
44
-
(** Legacy: Add an info message with manual message text *)
45
-
let add_info t ~message ?code ?location ?element ?attribute ?extract () =
46
-
let msg =
47
-
Message.info ~message ?code ?location ?element ?attribute ?extract ()
48
-
in
23
+
let msg = Message.of_conformance_error ?location:loc ?element ?attribute ?extract error_code in
49
24
add t msg
50
25
51
26
let messages t = List.rev t.messages
+5
-52
lib/htmlrw_check/message_collector.mli
+5
-52
lib/htmlrw_check/message_collector.mli
···
20
20
(** Get the current location. *)
21
21
val get_current_location : t -> Message.location option
22
22
23
-
(** {1 Adding Messages - Typed Error Codes (Preferred)} *)
24
-
25
-
(** Add a message from a typed error code. *)
26
-
val add_typed :
27
-
t ->
28
-
?location:Message.location ->
29
-
?element:string ->
30
-
?attribute:string ->
31
-
?extract:string ->
32
-
Error_code.t ->
33
-
unit
34
-
35
-
(** Add an error from a typed error code. Alias for add_typed. *)
36
-
val add_error_code :
37
-
t ->
38
-
?location:Message.location ->
39
-
?element:string ->
40
-
?attribute:string ->
41
-
?extract:string ->
42
-
Error_code.t ->
43
-
unit
44
-
45
-
(** {1 Adding Messages - Legacy (for migration)} *)
23
+
(** {1 Adding Messages} *)
46
24
47
25
(** Add a message to the collector. *)
48
26
val add : t -> Message.t -> unit
49
27
50
-
(** Add an error message to the collector (legacy). *)
51
-
val add_error :
28
+
(** Add a message from a typed conformance error code.
29
+
Uses the current location if no explicit location is provided. *)
30
+
val add_typed :
52
31
t ->
53
-
message:string ->
54
-
?code:string ->
55
-
?location:Message.location ->
56
-
?element:string ->
57
-
?attribute:string ->
58
-
?extract:string ->
59
-
unit ->
60
-
unit
61
-
62
-
(** Add a warning message to the collector (legacy). *)
63
-
val add_warning :
64
-
t ->
65
-
message:string ->
66
-
?code:string ->
67
32
?location:Message.location ->
68
33
?element:string ->
69
34
?attribute:string ->
70
35
?extract:string ->
71
-
unit ->
72
-
unit
73
-
74
-
(** Add an info message to the collector (legacy). *)
75
-
val add_info :
76
-
t ->
77
-
message:string ->
78
-
?code:string ->
79
-
?location:Message.location ->
80
-
?element:string ->
81
-
?attribute:string ->
82
-
?extract:string ->
83
-
unit ->
36
+
Error_code.t ->
84
37
unit
85
38
86
39
(** {1 Retrieving Messages} *)
+3
-3
lib/htmlrw_check/message_format.ml
+3
-3
lib/htmlrw_check/message_format.ml
···
24
24
match system_id with Some s -> s | None -> "input")
25
25
in
26
26
let severity_str = Message.severity_to_string msg.Message.severity in
27
-
let code_str = " [" ^ msg.Message.code ^ "]" in
27
+
let code_str = " [" ^ Message.error_code_to_string msg.Message.error_code ^ "]" in
28
28
let elem_str =
29
29
match msg.Message.element with
30
30
| Some e -> " (element: " ^ e ^ ")"
···
59
59
match system_id with Some s -> s ^ ":0:0" | None -> "input:0:0")
60
60
in
61
61
let severity_str = Message.severity_to_string msg.Message.severity in
62
-
let code_str = " [" ^ msg.Message.code ^ "]" in
62
+
let code_str = " [" ^ Message.error_code_to_string msg.Message.error_code ^ "]" in
63
63
Buffer.add_string buf
64
64
(Printf.sprintf "%s: %s%s: %s\n" loc_str severity_str code_str
65
65
msg.Message.message))
···
72
72
let message_text = String (msg.Message.message, Meta.none) in
73
73
let base = [ (("type", Meta.none), severity); (("message", Meta.none), message_text) ] in
74
74
let with_code =
75
-
(("subType", Meta.none), String (msg.Message.code, Meta.none)) :: base
75
+
(("subType", Meta.none), String (Message.error_code_to_string msg.Message.error_code, Meta.none)) :: base
76
76
in
77
77
let with_location =
78
78
match msg.Message.location with
+79
-84
lib/htmlrw_check/parse_error_bridge.ml
+79
-84
lib/htmlrw_check/parse_error_bridge.ml
···
3
3
SPDX-License-Identifier: MIT
4
4
---------------------------------------------------------------------------*)
5
5
6
+
(** Generate human-readable message for a parse error code *)
7
+
let message_of_parse_error code =
8
+
let code_str = Html5rw.Parse_error_code.to_string code in
9
+
match code with
10
+
| Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus ->
11
+
"Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag."
12
+
| Html5rw.Parse_error_code.Null_character_reference ->
13
+
"Character reference expands to zero."
14
+
| Html5rw.Parse_error_code.Tree_construction_error s ->
15
+
(* Check for control-character/noncharacter/surrogate with codepoint info *)
16
+
(try
17
+
if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then
18
+
let colon_pos = String.index s ':' in
19
+
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
20
+
let cp = int_of_string ("0x" ^ cp_str) in
21
+
Printf.sprintf "Forbidden code point U+%04x." cp
22
+
else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then
23
+
let colon_pos = String.index s ':' in
24
+
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
25
+
let cp = int_of_string ("0x" ^ cp_str) in
26
+
Printf.sprintf "Forbidden code point U+%04x." cp
27
+
else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then
28
+
let colon_pos = String.index s ':' in
29
+
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
30
+
let cp = int_of_string ("0x" ^ cp_str) in
31
+
Printf.sprintf "Forbidden code point U+%04x." cp
32
+
(* Character reference errors *)
33
+
else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then
34
+
let cp_str = String.sub s 28 (String.length s - 28) in
35
+
let cp = int_of_string ("0x" ^ cp_str) in
36
+
if cp = 0x0D then
37
+
"A numeric character reference expanded to carriage return."
38
+
else
39
+
Printf.sprintf "Character reference expands to a control character (U+%04x)." cp
40
+
else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then
41
+
let colon_pos = String.index s ':' in
42
+
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
43
+
let cp = int_of_string ("0x" ^ cp_str) in
44
+
(* U+FDD0-U+FDEF are "permanently unassigned" *)
45
+
if cp >= 0xFDD0 && cp <= 0xFDEF then
46
+
"Character reference expands to a permanently unassigned code point."
47
+
(* Astral noncharacters (planes 1-16) *)
48
+
else if cp >= 0x10000 then
49
+
Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp
50
+
else
51
+
Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp
52
+
else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then
53
+
"Character reference outside the permissible Unicode range."
54
+
else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then
55
+
let colon_pos = String.index s ':' in
56
+
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
57
+
let cp = int_of_string ("0x" ^ cp_str) in
58
+
Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp
59
+
else if s = "no-p-element-in-scope" then
60
+
"No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen."
61
+
else if s = "end-tag-p-implied-but-open-elements" then
62
+
"End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements."
63
+
else if s = "end-tag-br" then
64
+
"End tag \xe2\x80\x9cbr\xe2\x80\x9d."
65
+
else if s = "expected-closing-tag-but-got-eof" then
66
+
"End of file seen and there were open elements."
67
+
else if String.length s > 28 && String.sub s 0 28 = "bad-start-tag-in-head-noscri" then
68
+
let colon_pos = String.index s ':' in
69
+
let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
70
+
Printf.sprintf "Bad start tag in \xe2\x80\x9c%s\xe2\x80\x9d in \xe2\x80\x9cnoscript\xe2\x80\x9d in \xe2\x80\x9chead\xe2\x80\x9d." element
71
+
else if String.length s > 19 && String.sub s 0 19 = "unexpected-end-tag:" then
72
+
let element = String.sub s 19 (String.length s - 19) in
73
+
Printf.sprintf "Stray end tag \xe2\x80\x9c%s\xe2\x80\x9d." element
74
+
else if String.length s > 19 && String.sub s 0 19 = "start-tag-in-table:" then
75
+
let tag = String.sub s 19 (String.length s - 19) in
76
+
Printf.sprintf "Start tag \xe2\x80\x9c%s\xe2\x80\x9d seen in \xe2\x80\x9ctable\xe2\x80\x9d." tag
77
+
else
78
+
Printf.sprintf "Parse error: %s" s
79
+
with _ -> Printf.sprintf "Parse error: %s" s)
80
+
| _ -> Printf.sprintf "Parse error: %s" code_str
81
+
6
82
let of_parse_error ?system_id err =
7
83
let code = Html5rw.error_code err in
8
84
let line = Html5rw.error_line err in
9
85
let column = Html5rw.error_column err in
10
-
let location =
11
-
Message.make_location ~line ~column ?system_id ()
12
-
in
13
-
let code_str = Html5rw.Parse_error_code.to_string code in
14
-
let (message, final_code) = match code with
15
-
| Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus ->
16
-
("Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag.", code_str)
17
-
| Html5rw.Parse_error_code.Null_character_reference ->
18
-
("Character reference expands to zero.", "null-character-reference")
19
-
| Html5rw.Parse_error_code.Tree_construction_error s ->
20
-
(* Check for control-character/noncharacter/surrogate with codepoint info *)
21
-
(try
22
-
if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then
23
-
let colon_pos = String.index s ':' in
24
-
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
25
-
let cp = int_of_string ("0x" ^ cp_str) in
26
-
(Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
27
-
else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then
28
-
let colon_pos = String.index s ':' in
29
-
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
30
-
let cp = int_of_string ("0x" ^ cp_str) in
31
-
(Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
32
-
else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then
33
-
let colon_pos = String.index s ':' in
34
-
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
35
-
let cp = int_of_string ("0x" ^ cp_str) in
36
-
(Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
37
-
(* Character reference errors *)
38
-
else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then
39
-
let cp_str = String.sub s 28 (String.length s - 28) in
40
-
let cp = int_of_string ("0x" ^ cp_str) in
41
-
if cp = 0x0D then
42
-
("A numeric character reference expanded to carriage return.", "control-character-reference")
43
-
else
44
-
(Printf.sprintf "Character reference expands to a control character (U+%04x)." cp, "control-character-reference")
45
-
else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then
46
-
let colon_pos = String.index s ':' in
47
-
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
48
-
let cp = int_of_string ("0x" ^ cp_str) in
49
-
(* U+FDD0-U+FDEF are "permanently unassigned" *)
50
-
if cp >= 0xFDD0 && cp <= 0xFDEF then
51
-
("Character reference expands to a permanently unassigned code point.", "noncharacter-character-reference")
52
-
(* Astral noncharacters (planes 1-16) *)
53
-
else if cp >= 0x10000 then
54
-
(Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp, "noncharacter-character-reference")
55
-
else
56
-
(Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp, "noncharacter-character-reference")
57
-
else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then
58
-
let colon_pos = String.index s ':' in
59
-
let _ = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
60
-
("Character reference outside the permissible Unicode range.", "character-reference-outside-unicode-range")
61
-
else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then
62
-
let colon_pos = String.index s ':' in
63
-
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
64
-
let cp = int_of_string ("0x" ^ cp_str) in
65
-
(Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp, "surrogate-character-reference")
66
-
else if s = "no-p-element-in-scope" then
67
-
("No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen.", "no-p-element-in-scope")
68
-
else if s = "end-tag-p-implied-but-open-elements" then
69
-
("End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements.", "end-tag-p-implied")
70
-
else if s = "end-tag-br" then
71
-
("End tag \xe2\x80\x9cbr\xe2\x80\x9d.", "end-tag-br")
72
-
else if s = "expected-closing-tag-but-got-eof" then
73
-
("End of file seen and there were open elements.", "eof-in-open-element")
74
-
else if String.length s > 28 && String.sub s 0 28 = "bad-start-tag-in-head-noscri" then
75
-
let colon_pos = String.index s ':' in
76
-
let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
77
-
(Printf.sprintf "Bad start tag in \xe2\x80\x9c%s\xe2\x80\x9d in \xe2\x80\x9cnoscript\xe2\x80\x9d in \xe2\x80\x9chead\xe2\x80\x9d." element, "bad-start-tag-in-head-noscript")
78
-
else if String.length s > 19 && String.sub s 0 19 = "unexpected-end-tag:" then
79
-
let element = String.sub s 19 (String.length s - 19) in
80
-
(Printf.sprintf "Stray end tag \xe2\x80\x9c%s\xe2\x80\x9d." element, "stray-end-tag")
81
-
else if String.length s > 19 && String.sub s 0 19 = "start-tag-in-table:" then
82
-
let tag = String.sub s 19 (String.length s - 19) in
83
-
(Printf.sprintf "Start tag \xe2\x80\x9c%s\xe2\x80\x9d seen in \xe2\x80\x9ctable\xe2\x80\x9d." tag, "start-tag-in-table")
84
-
else
85
-
(Printf.sprintf "Parse error: %s" s, s)
86
-
with _ -> (Printf.sprintf "Parse error: %s" s, s))
87
-
| _ -> (Printf.sprintf "Parse error: %s" code_str, code_str)
88
-
in
89
-
Message.error
90
-
~message
91
-
~code:final_code
92
-
~location
93
-
()
86
+
let location = Message.make_location ~line ~column ?system_id () in
87
+
let message = message_of_parse_error code in
88
+
Message.of_parse_error ~location ~message code
94
89
95
90
let collect_parse_errors ?system_id result =
96
91
let errors = Html5rw.errors result in
+2
-2
test/expected_message.ml
+2
-2
test/expected_message.ml
···
360
360
let code_matches =
361
361
match (expected.error_code, actual.Htmlrw_check.error_code) with
362
362
| (None, _) -> true (* No expected code to match *)
363
-
| (Some ec, Some ac) -> error_codes_match ec ac
364
-
| (Some _, None) -> false (* Expected typed but got untyped *)
363
+
| (Some ec, Htmlrw_check.Conformance ac) -> error_codes_match ec ac
364
+
| (Some _, Htmlrw_check.Parse _) -> false (* Expected conformance but got parse error *)
365
365
in
366
366
367
367
(* Check message text *)