+3 .gitmodules
+1 -1 dune-project
+1 -1 html5rw.opam

+16 -2 lib/parser/tree_builder.ml
···
 and process_foreign_content t token =
   match token with
-  | Token.Character "\x00" ->
+  | Token.Character data when String.contains data '\x00' ->
+    (* Replace NUL characters with U+FFFD replacement character *)
     parse_error t "unexpected-null-character";
-    insert_character t "\xEF\xBF\xBD"
+    let buf = Buffer.create (String.length data) in
+    let has_non_ws_non_nul = ref false in
+    String.iter (fun c ->
+      if c = '\x00' then Buffer.add_string buf "\xEF\xBF\xBD"
+      else begin
+        Buffer.add_char buf c;
+        if not (c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r') then
+          has_non_ws_non_nul := true
+      end
+    ) data;
+    let replaced = Buffer.contents buf in
+    insert_character t replaced;
+    (* Only set frameset_ok to false if there's actual non-whitespace non-NUL content *)
+    if !has_non_ws_non_nul then t.frameset_ok <- false
   | Token.Character data when is_whitespace data ->
     insert_character t data
   | Token.Character data ->
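For reference, a self-contained sketch of the replacement behaviour in the branch above (plain stdlib OCaml, not code from this patch): every NUL byte becomes the UTF-8 encoding of U+FFFD, and the second component reports whether anything other than whitespace or NUL was seen, which is what gates the frameset_ok reset.

(* Sketch only: mirrors the NUL -> U+FFFD replacement added above. *)
let replace_nuls data =
  let buf = Buffer.create (String.length data) in
  let has_non_ws_non_nul = ref false in
  String.iter (fun c ->
    if c = '\x00' then Buffer.add_string buf "\xEF\xBF\xBD"
    else begin
      Buffer.add_char buf c;
      if not (c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r') then
        has_non_ws_non_nul := true
    end) data;
  (Buffer.contents buf, !has_non_ws_non_nul)

let () =
  assert (replace_nuls "a\x00b" = ("a\xEF\xBF\xBDb", true));
  assert (replace_nuls " \x00\n" = (" \xEF\xBF\xBD\n", false))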

+13 -3 lib/tokenizer/stream.ml
···
   mutable column : int;
   (* Track if we just saw CR (for CR/LF normalization) *)
   mutable last_was_cr : bool;
+  (* Track if we need to skip the next LF from raw stream (set after peek of CR) *)
+  mutable skip_next_lf : bool;
 }

 (* Create a stream from a Bytes.Reader.t *)
···
   line = 1;
   column = 0;
   last_was_cr = false;
+  skip_next_lf = false;
 }

 (* Create a stream from a string - discouraged, prefer create_from_reader *)
···
     None
   | Some '\r' ->
     t.last_was_cr <- true;
-    Some '\n' (* CR becomes LF *)
+    (* Immediately consume following LF if present (CRLF -> single LF) *)
+    (match read_raw_char t with
+     | Some '\n' -> () (* Consume the LF that follows CR *)
+     | Some c -> push_back_char t c (* Put non-LF char back *)
+     | None -> ());
+    Some '\n' (* CR (or CRLF) becomes single LF *)
   | Some '\n' when t.last_was_cr ->
     (* Skip LF after CR - it was already converted *)
     t.last_was_cr <- false;
···
       Bytes.Slice.is_eod next)))

 let peek t =
+  (* Save last_was_cr state before reading *)
+  let saved_last_was_cr = t.last_was_cr in
   match read_normalized_char t with
   | None -> None
   | Some c ->
     push_back_char t c;
-    (* Undo last_was_cr if we pushed back a CR-converted LF *)
-    if c = '\n' then t.last_was_cr <- false;
+    (* Restore the last_was_cr state so advance handles CR/LF correctly *)
+    t.last_was_cr <- saved_last_was_cr;
     Some c

 (* Read n characters into a list, returns (chars_read, all_read_successfully) *)
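As a sanity check on the newline handling above, here is a minimal self-contained sketch (not the stream module's API) of the normalization the CR branch is aiming for: CR and CRLF each collapse to a single LF, and a bare LF passes through unchanged.

(* Sketch only: string-level view of the CR/LF normalization. *)
let normalize_newlines s =
  let buf = Buffer.create (String.length s) in
  let n = String.length s in
  let i = ref 0 in
  while !i < n do
    (match s.[!i] with
     | '\r' ->
       Buffer.add_char buf '\n';
       (* Skip the LF of a CRLF pair so it is not emitted twice. *)
       if !i + 1 < n && s.[!i + 1] = '\n' then incr i
     | c -> Buffer.add_char buf c);
    incr i
  done;
  Buffer.contents buf

let () =
  assert (normalize_newlines "a\r\nb\rc\nd" = "a\nb\nc\nd")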

+62 -23 lib/tokenizer/tokenizer.ml
···
   let emit_current_tag () =
     finish_attribute t;
     let name = Buffer.contents t.current_tag_name in
+    let attrs = List.rev t.current_attrs in
+    (* Check for end tag with attributes or self-closing flag *)
+    if t.current_tag_kind = Token.End then begin
+      if attrs <> [] then
+        error t "end-tag-with-attributes";
+      if t.current_tag_self_closing then
+        error t "end-tag-with-trailing-solidus"
+    end;
     let tag = {
       Token.kind = t.current_tag_kind;
       name;
-      attrs = List.rev t.current_attrs;
+      attrs;
       self_closing = t.current_tag_self_closing;
     } in
     if t.current_tag_kind = Token.Start then
···

   let emit_current_comment () =
     emit (Token.Comment (Buffer.contents t.current_comment))
+  in
+
+  (* Check for control characters and emit error if needed *)
+  let check_control_char c =
+    let code = Char.code c in
+    (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F-U+009F *)
+    (* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *)
+    if (code >= 0x01 && code <= 0x08) ||
+       code = 0x0B ||
+       (code >= 0x0E && code <= 0x1F) ||
+       (code >= 0x7F && code <= 0x9F) then
+      error t "control-character-in-input-stream"
+  in
+
+  (* Emit char with control character check *)
+  let emit_char_checked c =
+    check_control_char c;
+    emit_char t c
   in

   let rec process_state () =
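The range check above, written as a standalone predicate for reference (a sketch independent of the tokenizer's error plumbing), with a few spot checks:

(* Sketch only: true for U+0001-U+0008, U+000B, U+000E-U+001F and
   U+007F-U+009F; tab, LF, FF and CR are allowed. *)
let is_disallowed_control c =
  let code = Char.code c in
  (code >= 0x01 && code <= 0x08) ||
  code = 0x0B ||
  (code >= 0x0E && code <= 0x1F) ||
  (code >= 0x7F && code <= 0x9F)

let () =
  assert (is_disallowed_control '\x01');
  assert (is_disallowed_control '\x7F');
  assert (not (is_disallowed_control '\t'));
  assert (not (is_disallowed_control 'a'))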
···
       flush_code_points_consumed_as_char_ref t;
       t.state <- t.return_state;
       handle_eof ()
-    | State.Named_character_reference
-    | State.Numeric_character_reference
+    | State.Named_character_reference ->
+      flush_code_points_consumed_as_char_ref t;
+      t.state <- t.return_state;
+      handle_eof ()
+    | State.Numeric_character_reference ->
+      (* At EOF with just "&#" - no digits follow *)
+      error t "absence-of-digits-in-numeric-character-reference";
+      flush_code_points_consumed_as_char_ref t;
+      t.state <- t.return_state;
+      handle_eof ()
     | State.Hexadecimal_character_reference_start
-    | State.Decimal_character_reference_start
-    | State.Numeric_character_reference_end ->
+    | State.Decimal_character_reference_start ->
+      error t "absence-of-digits-in-numeric-character-reference";
       flush_code_points_consumed_as_char_ref t;
       t.state <- t.return_state;
+      handle_eof ()
+    | State.Numeric_character_reference_end ->
+      (* We have collected digits, just need to finalize the character reference *)
+      step ();
       handle_eof ()
     | State.Ambiguous_ampersand ->
       (* Buffer was already flushed when entering this state, just transition *)
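To summarize the EOF branches above: only the numeric-reference states that have not yet consumed a digit report absence-of-digits-in-numeric-character-reference, while the named-reference and end states flush and return without that error. An illustrative sketch (the variant below is a stand-in, not the tokenizer's State.t):

(* Sketch only: which character-reference states report the
   absence-of-digits error when the input ends. *)
type cr_state = Named | Numeric | Hex_start | Dec_start | Numeric_end

let reports_absence_of_digits = function
  | Numeric | Hex_start | Dec_start -> true
  | Named | Numeric_end -> false

let () =
  assert (reports_absence_of_digits Numeric);
  assert (reports_absence_of_digits Hex_start);
  assert (reports_absence_of_digits Dec_start);
  assert (not (reports_absence_of_digits Named));
  assert (not (reports_absence_of_digits Numeric_end))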
···
       error t "unexpected-null-character";
       ignore (S.process t.sink (Token.Character "\x00"))
     | Some c ->
-      emit_char t c
+      emit_char_checked c
     | None -> ()

   and state_rcdata () =
···
       error t "unexpected-null-character";
       emit_str t "\xEF\xBF\xBD"
     | Some c ->
-      emit_char t c
+      emit_char_checked c
     | None -> ()

   and state_rawtext () =
···
       error t "unexpected-null-character";
       emit_str t "\xEF\xBF\xBD"
     | Some c ->
-      emit_char t c
+      emit_char_checked c
     | None -> ()

   and state_script_data () =
···
       error t "unexpected-null-character";
       emit_str t "\xEF\xBF\xBD"
     | Some c ->
-      emit_char t c
+      emit_char_checked c
     | None -> ()

   and state_plaintext () =
···
       error t "unexpected-null-character";
       emit_str t "\xEF\xBF\xBD"
     | Some c ->
-      emit_char t c
+      emit_char_checked c
     | None -> ()

   and state_tag_open () =
···
       error t "unexpected-null-character";
       emit_str t "\xEF\xBF\xBD"
     | Some c ->
-      emit_char t c
+      emit_char_checked c
     | None -> ()

   and state_script_data_escaped_dash () =
···
       emit_str t "\xEF\xBF\xBD"
     | Some c ->
       t.state <- State.Script_data_escaped;
-      emit_char t c
+      emit_char_checked c
     | None -> ()

   and state_script_data_escaped_dash_dash () =
···
       emit_str t "\xEF\xBF\xBD"
     | Some c ->
       t.state <- State.Script_data_escaped;
-      emit_char t c
+      emit_char_checked c
     | None -> ()

   and state_script_data_escaped_less_than_sign () =
···
       error t "unexpected-null-character";
       emit_str t "\xEF\xBF\xBD"
     | Some c ->
-      emit_char t c
+      emit_char_checked c
     | None -> ()

   and state_script_data_double_escaped_dash () =
···
       emit_str t "\xEF\xBF\xBD"
     | Some c ->
       t.state <- State.Script_data_double_escaped;
-      emit_char t c
+      emit_char_checked c
     | None -> ()

   and state_script_data_double_escaped_dash_dash () =
···
       emit_str t "\xEF\xBF\xBD"
     | Some c ->
       t.state <- State.Script_data_double_escaped;
-      emit_char t c
+      emit_char_checked c
     | None -> ()

   and state_script_data_double_escaped_less_than_sign () =
···
     match Stream.consume t.stream with
     | Some ']' ->
       t.state <- State.Cdata_section_bracket
-    | Some '\x00' ->
-      error t "unexpected-null-character";
-      emit_str t "\xEF\xBF\xBD"
     | Some c ->
+      (* CDATA section emits all characters as-is, including NUL *)
       emit_char t c
     | None -> ()

···
         t.state <- t.return_state
       end
     | None ->
-      (* No match - check if we should report ambiguous ampersand *)
+      (* No match - check if we should report unknown-named-character-reference *)
       if String.length entity_name > 0 then begin
-        t.state <- State.Ambiguous_ampersand;
-        (* Reset position - we need to emit the ampersand and chars *)
-        flush_code_points_consumed_as_char_ref t
+        (* If we have a semicolon, it's definitely an unknown named character reference *)
+        if has_semicolon then
+          error t "unknown-named-character-reference";
+        (* Emit all the chars we consumed *)
+        flush_code_points_consumed_as_char_ref t;
+        t.state <- t.return_state
       end else begin
         flush_code_points_consumed_as_char_ref t;
         t.state <- t.return_state
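The decision encoded above, as a tiny standalone sketch (names are illustrative, not the tokenizer's API): an unmatched "&name" only raises unknown-named-character-reference when the consumed name was terminated by a semicolon; otherwise the consumed characters are simply flushed without an error.

(* Sketch only: when does an unmatched named reference report an error? *)
let unknown_named_ref_error ~entity_name ~has_semicolon =
  String.length entity_name > 0 && has_semicolon

let () =
  assert (unknown_named_ref_error ~entity_name:"noSuchEntity" ~has_semicolon:true);
  assert (not (unknown_named_ref_error ~entity_name:"noSuchEntity" ~has_semicolon:false));
  assert (not (unknown_named_ref_error ~entity_name:"" ~has_semicolon:true))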

+40 test/dune
···
  (name test_html5lib)
  (libraries bytesrw html5rw.parser html5rw.dom))

+(rule
+ (alias runtest)
+ (deps
+  (glob_files ../html5lib-tests/tree-construction/*.dat))
+ (action
+  (run %{exe:test_html5lib.exe} ../html5lib-tests/tree-construction)))
+
+(executable
+ (name test_tokenizer)
+ (libraries bytesrw html5rw.tokenizer jsont jsont.bytesrw))
+
+(rule
+ (alias runtest)
+ (deps
+  (glob_files ../html5lib-tests/tokenizer/*.test))
+ (action
+  (run %{exe:test_tokenizer.exe} ../html5lib-tests/tokenizer)))
+
+(executable
+ (name test_encoding)
+ (libraries html5rw.encoding))
+
+(rule
+ (alias runtest)
+ (deps
+  (glob_files ../html5lib-tests/encoding/*.dat))
+ (action
+  (run %{exe:test_encoding.exe} ../html5lib-tests/encoding)))
+
+(executable
+ (name test_serializer)
+ (libraries html5rw.dom jsont jsont.bytesrw))
+
+(rule
+ (alias runtest)
+ (deps
+  (glob_files ../html5lib-tests/serializer/*.test))
+ (action
+  (run %{exe:test_serializer.exe} ../html5lib-tests/serializer)))
+
 (executable
  (name debug_fragment)
  (libraries bytesrw html5rw.parser html5rw.dom))

+167 test/test_encoding.ml
···
+(* Test runner for html5lib-tests encoding tests *)
+
+module Encoding = Html5rw_encoding
+
+type test_case = {
+  input : string;
+  expected_encoding : string;
+}
+
+(* Normalize encoding name for comparison *)
+let normalize_encoding_name s =
+  String.lowercase_ascii (String.trim s)
+
+(* Convert our encoding type to canonical test name *)
+let encoding_to_test_name = function
+  | Encoding.Utf8 -> "utf-8"
+  | Encoding.Utf16le -> "utf-16le"
+  | Encoding.Utf16be -> "utf-16be"
+  | Encoding.Windows_1252 -> "windows-1252"
+  | Encoding.Iso_8859_2 -> "iso-8859-2"
+  | Encoding.Euc_jp -> "euc-jp"
+
+(* Parse a single test case from lines *)
+let parse_test_case lines =
+  let rec parse acc = function
+    | [] -> acc
+    | line :: rest when String.length line > 0 && line.[0] = '#' ->
+      let section = String.trim line in
+      let content, remaining = collect_section rest in
+      parse ((section, content) :: acc) remaining
+    | _ :: rest -> parse acc rest
+  and collect_section lines =
+    let rec loop acc = function
+      | [] -> (List.rev acc, [])
+      | line :: rest when String.length line > 0 && line.[0] = '#' ->
+        (List.rev acc, line :: rest)
+      | line :: rest -> loop (line :: acc) rest
+    in
+    loop [] lines
+  in
+  let sections = parse [] lines in
+
+  let get_section name =
+    match List.assoc_opt name sections with
+    | Some lines -> String.concat "\n" lines
+    | None -> ""
+  in
+
+  let data = get_section "#data" in
+  let encoding = get_section "#encoding" in
+
+  { input = data; expected_encoding = String.trim encoding }
+
+(* Parse a .dat file into test cases *)
+let parse_dat_file content =
+  let lines = String.split_on_char '\n' content in
+  (* Split on empty lines followed by #data *)
+  let rec split_tests current acc = function
+    | [] ->
+      if current = [] then List.rev acc
+      else List.rev (List.rev current :: acc)
+    | "" :: "#data" :: rest ->
+      let new_acc = if current = [] then acc else (List.rev current :: acc) in
+      split_tests ["#data"] new_acc rest
+    | line :: rest ->
+      split_tests (line :: current) acc rest
+  in
+  let test_groups = split_tests [] [] lines in
+  List.filter_map (fun lines ->
+    if List.exists (fun l -> l = "#data") lines then
+      Some (parse_test_case lines)
+    else None
+  ) test_groups
+
+(* Run a single encoding test *)
+let run_test test =
+  try
+    (* Detect encoding from the input bytes *)
+    let (_, detected_encoding) = Encoding.decode (Bytes.of_string test.input) () in
+    let detected_name = encoding_to_test_name detected_encoding in
+    let expected_name = normalize_encoding_name test.expected_encoding in
+
+    (* Compare - allow some flexibility in naming *)
+    let match_encoding det exp =
+      det = exp ||
+      (det = "windows-1252" && (exp = "windows-1252" || exp = "cp1252" || exp = "iso-8859-1")) ||
+      (det = "iso-8859-2" && (exp = "iso-8859-2" || exp = "iso8859-2" || exp = "latin2")) ||
+      (det = "utf-8" && (exp = "utf-8" || exp = "utf8")) ||
+      (det = "euc-jp" && (exp = "euc-jp" || exp = "eucjp"))
+    in
+
+    (match_encoding detected_name expected_name, detected_name, expected_name)
+  with e ->
+    (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected_encoding)
+
+(* Run all tests in a file *)
+let run_file path =
+  let ic = open_in path in
+  let content = really_input_string ic (in_channel_length ic) in
+  close_in ic;
+
+  let tests = parse_dat_file content in
+  let filename = Filename.basename path in
+
+  let passed = ref 0 in
+  let failed = ref 0 in
+  let errors = ref [] in
+
+  List.iteri (fun i test ->
+    if String.trim test.expected_encoding = "" then
+      (* Skip tests without expected encoding *)
+      ()
+    else begin
+      let (success, detected, expected) = run_test test in
+      if success then
+        incr passed
+      else begin
+        incr failed;
+        if List.length !errors < 5 then
+          errors := (i + 1, test.input, detected, expected) :: !errors
+      end
+    end
+  ) tests;
+
+  (!passed, !failed, List.rev !errors, filename)
+
+let () =
+  let test_dir = Sys.argv.(1) in
+  let files = Sys.readdir test_dir |> Array.to_list in
+  let dat_files = List.filter (fun f ->
+    Filename.check_suffix f ".dat" &&
+    not (String.contains f '/')
+  ) files in
+
+  let total_passed = ref 0 in
+  let total_failed = ref 0 in
+  let all_errors = ref [] in
+
+  List.iter (fun file ->
+    let path = Filename.concat test_dir file in
+    if Sys.is_directory path then () else begin
+      let (passed, failed, errors, filename) = run_file path in
+      total_passed := !total_passed + passed;
+      total_failed := !total_failed + failed;
+      if errors <> [] then
+        all_errors := (filename, errors) :: !all_errors;
+      Printf.printf "%s: %d passed, %d failed\n" filename passed failed
+    end
+  ) (List.sort String.compare dat_files);
+
+  Printf.printf "\n=== Summary ===\n";
+  Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;
+
+  if !all_errors <> [] then begin
+    Printf.printf "\n=== First failures ===\n";
+    List.iter (fun (filename, errors) ->
+      List.iter (fun (test_num, input, detected, expected) ->
+        Printf.printf "\n--- %s test %d ---\n" filename test_num;
+        Printf.printf "Input (first 200 chars): %s\n"
+          (String.escaped (String.sub input 0 (min 200 (String.length input))));
+        Printf.printf "Expected encoding: %s\n" expected;
+        Printf.printf "Detected encoding: %s\n" detected
+      ) errors
+    ) (List.rev !all_errors)
+  end;
+
+  exit (if !total_failed > 0 then 1 else 0)
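For context, a minimal sketch (not part of the runner) of the .dat layout parse_dat_file expects: each test is a "#data" section followed by an "#encoding" section, and tests are separated by a blank line before the next "#data". The sample below is illustrative, not taken from the html5lib suite.

(* Sketch only: two-test sample in the expected #data/#encoding layout. *)
let sample =
  "#data\n<meta charset=\"utf-8\">\n#encoding\nutf-8\n\n"
  ^ "#data\n<meta charset=\"windows-1252\">\n#encoding\nwindows-1252\n"

let () =
  let count =
    List.length
      (List.filter (fun l -> l = "#data") (String.split_on_char '\n' sample))
  in
  assert (count = 2)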

+297 test/test_serializer.ml
···
+(* Test runner for html5lib-tests serializer tests *)
+
+module Dom = Html5rw_dom
+
+(* Extract values from JSON *)
+let json_string = function
+  | Jsont.String (s, _) -> s
+  | _ -> failwith "Expected string"
+
+let json_string_opt = function
+  | Jsont.Null _ -> None
+  | Jsont.String (s, _) -> Some s
+  | _ -> failwith "Expected string or null"
+
+let json_array = function
+  | Jsont.Array (arr, _) -> arr
+  | _ -> failwith "Expected array"
+
+let json_object = function
+  | Jsont.Object (obj, _) -> obj
+  | _ -> failwith "Expected object"
+
+let json_mem name obj =
+  match List.find_opt (fun ((n, _), _) -> n = name) obj with
+  | Some (_, v) -> Some v
+  | None -> None
+
+let json_mem_exn name obj =
+  match json_mem name obj with
+  | Some v -> v
+  | None -> failwith ("Missing member: " ^ name)
+
+(* Test case *)
+type test_case = {
+  description : string;
+  input : Jsont.json list;
+  expected : string list;
+}
+
+let parse_test_case json =
+  let obj = json_object json in
+  let description = json_string (json_mem_exn "description" obj) in
+  let input = json_array (json_mem_exn "input" obj) in
+  let expected = List.map json_string (json_array (json_mem_exn "expected" obj)) in
+  { description; input; expected }
+
+(* Build a DOM node from test input token *)
+let build_node_from_token token =
+  let arr = json_array token in
+  match arr with
+  | [] -> None
+  | type_json :: rest ->
+    let token_type = json_string type_json in
+    match token_type, rest with
+    | "StartTag", [_ns_json; name_json; attrs_json] ->
+      let name = json_string name_json in
+      let attrs_list = json_array attrs_json in
+      let attrs = List.map (fun attr_json ->
+        let attr_obj = json_object attr_json in
+        let attr_name = json_string (json_mem_exn "name" attr_obj) in
+        let value = json_string (json_mem_exn "value" attr_obj) in
+        (attr_name, value)
+      ) attrs_list in
+      Some (Dom.create_element name ~attrs ())
+
+    | "StartTag", [name_json; attrs_json] ->
+      let name = json_string name_json in
+      let attrs_obj = json_object attrs_json in
+      let attrs = List.map (fun ((n, _), v) -> (n, json_string v)) attrs_obj in
+      Some (Dom.create_element name ~attrs ())
+
+    | "EmptyTag", [name_json; attrs_json] ->
+      let name = json_string name_json in
+      let attrs_obj = json_object attrs_json in
+      let attrs = List.map (fun ((n, _), v) -> (n, json_string v)) attrs_obj in
+      Some (Dom.create_element name ~attrs ())
+
+    | "Characters", [text_json] ->
+      let text = json_string text_json in
+      Some (Dom.create_text text)
+
+    | "Comment", [text_json] ->
+      let text = json_string text_json in
+      Some (Dom.create_comment text)
+
+    | "Doctype", [name_json] ->
+      let name = json_string name_json in
+      Some (Dom.create_doctype ~name ())
+
+    | "Doctype", [name_json; public_json] ->
+      let name = json_string name_json in
+      let public_id = json_string_opt public_json in
+      (match public_id with
+       | Some pub -> Some (Dom.create_doctype ~name ~public_id:pub ())
+       | None -> Some (Dom.create_doctype ~name ()))
+
+    | "Doctype", [name_json; public_json; system_json] ->
+      let name = json_string name_json in
+      let public_id = json_string_opt public_json in
+      let system_id = json_string_opt system_json in
+      (match public_id, system_id with
+       | Some pub, Some sys -> Some (Dom.create_doctype ~name ~public_id:pub ~system_id:sys ())
+       | Some pub, None -> Some (Dom.create_doctype ~name ~public_id:pub ())
+       | None, Some sys -> Some (Dom.create_doctype ~name ~system_id:sys ())
+       | None, None -> Some (Dom.create_doctype ~name ()))
+
+    | _ -> None
+
+(* Serialize a single node to HTML (simplified, matches test expectations) *)
+let escape_text text =
+  let buf = Buffer.create (String.length text) in
+  String.iter (fun c ->
+    match c with
+    | '&' -> Buffer.add_string buf "&amp;"
+    | '<' -> Buffer.add_string buf "&lt;"
+    | '>' -> Buffer.add_string buf "&gt;"
+    | c -> Buffer.add_char buf c
+  ) text;
+  Buffer.contents buf
+
+let can_unquote_attr_value value =
+  if String.length value = 0 then false
+  else
+    let valid = ref true in
+    String.iter (fun c ->
+      if c = '>' || c = '"' || c = '\'' || c = '=' || c = '`' ||
+         c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' then
+        valid := false
+    ) value;
+    !valid
+
+let choose_quote value =
+  if String.contains value '"' && not (String.contains value '\'') then '\''
+  else '"'
+
+let escape_attr_value value quote_char =
+  let buf = Buffer.create (String.length value) in
+  String.iter (fun c ->
+    match c with
+    | '&' -> Buffer.add_string buf "&amp;"
+    | '"' when quote_char = '"' -> Buffer.add_string buf "&quot;"
+    | c -> Buffer.add_char buf c
+  ) value;
+  Buffer.contents buf
+
+let serialize_node node =
+  match node.Dom.name with
+  | "#text" ->
+    (* Check if parent is a raw text element *)
+    escape_text node.Dom.data
+  | "#comment" ->
+    "<!--" ^ node.Dom.data ^ "-->"
+  | "!doctype" ->
+    let buf = Buffer.create 64 in
+    Buffer.add_string buf "<!DOCTYPE ";
+    (match node.Dom.doctype with
+     | Some dt ->
+       Buffer.add_string buf (Option.value ~default:"html" dt.Dom.name);
+       (match dt.Dom.public_id with
+        | Some pub when pub <> "" ->
+          Buffer.add_string buf " PUBLIC \"";
+          Buffer.add_string buf pub;
+          Buffer.add_char buf '"';
+          (match dt.Dom.system_id with
+           | Some sys ->
+             Buffer.add_string buf " \"";
+             Buffer.add_string buf sys;
+             Buffer.add_char buf '"'
+           | None -> ())
+        | _ ->
+          match dt.Dom.system_id with
+          | Some sys when sys <> "" ->
+            Buffer.add_string buf " SYSTEM \"";
+            Buffer.add_string buf sys;
+            Buffer.add_char buf '"'
+          | _ -> ())
+     | None -> Buffer.add_string buf "html");
+    Buffer.add_char buf '>';
+    Buffer.contents buf
+  | _ ->
+    (* Element *)
+    let buf = Buffer.create 64 in
+    Buffer.add_char buf '<';
+    Buffer.add_string buf node.Dom.name;
+    List.iter (fun (key, value) ->
+      Buffer.add_char buf ' ';
+      Buffer.add_string buf key;
+      if can_unquote_attr_value value then begin
+        Buffer.add_char buf '=';
+        Buffer.add_string buf value
+      end else begin
+        let quote = choose_quote value in
+        Buffer.add_char buf '=';
+        Buffer.add_char buf quote;
+        Buffer.add_string buf (escape_attr_value value quote);
+        Buffer.add_char buf quote
+      end
+    ) node.Dom.attrs;
+    Buffer.add_char buf '>';
+    Buffer.contents buf
+
+(* Run a single test *)
+let run_test test =
+  try
+    (* Build nodes from input tokens *)
+    let nodes = List.filter_map build_node_from_token test.input in
+
+    (* Serialize *)
+    let serialized = String.concat "" (List.map serialize_node nodes) in
+
+    (* Check if it matches any expected output *)
+    let matches = List.exists (fun exp -> serialized = exp) test.expected in
+
+    (matches, serialized, test.expected)
+  with e ->
+    (false, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), test.expected)
+
+(* Run all tests in a file *)
+let run_file path =
+  let content =
+    let ic = open_in path in
+    let n = in_channel_length ic in
+    let s = really_input_string ic n in
+    close_in ic;
+    s
+  in
+
+  let json = match Jsont_bytesrw.decode_string Jsont.json content with
+    | Ok j -> j
+    | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e)
+  in
+
+  let obj = json_object json in
+  let tests_json = match json_mem "tests" obj with
+    | Some t -> json_array t
+    | None -> []
+  in
+
+  let filename = Filename.basename path in
+  let passed = ref 0 in
+  let failed = ref 0 in
+  let first_failures = ref [] in
+
+  List.iteri (fun i test_json ->
+    try
+      let test = parse_test_case test_json in
+      let (success, actual, expected) = run_test test in
+
+      if success then
+        incr passed
+      else begin
+        incr failed;
+        if List.length !first_failures < 3 then
+          first_failures := (i + 1, test.description, actual, expected) :: !first_failures
+      end
+    with e ->
+      incr failed;
+      Printf.eprintf "Exception parsing test %d: %s\n" (i + 1) (Printexc.to_string e)
+  ) tests_json;
+
+  (!passed, !failed, List.rev !first_failures, filename)
+
+let () =
+  let test_dir = Sys.argv.(1) in
+  let files = Sys.readdir test_dir |> Array.to_list in
+  let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in
+
+  let total_passed = ref 0 in
+  let total_failed = ref 0 in
+  let all_failures = ref [] in
+
+  List.iter (fun file ->
+    let path = Filename.concat test_dir file in
+    let (passed, failed, failures, filename) = run_file path in
+    total_passed := !total_passed + passed;
+    total_failed := !total_failed + failed;
+    if failures <> [] then
+      all_failures := (filename, failures) :: !all_failures;
+    Printf.printf "%s: %d passed, %d failed\n" filename passed failed
+  ) (List.sort String.compare test_files);
+
+  Printf.printf "\n=== Summary ===\n";
+  Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;
+
+  if !all_failures <> [] then begin
+    Printf.printf "\n=== First failures ===\n";
+    List.iter (fun (filename, failures) ->
+      List.iter (fun (test_num, desc, actual, expected) ->
+        Printf.printf "\n--- %s test %d ---\n" filename test_num;
+        Printf.printf "Description: %s\n" desc;
+        Printf.printf "Expected: %s\n" (String.concat " | " expected);
+        Printf.printf "Actual: %s\n" actual
+      ) failures
+    ) (List.rev !all_failures)
+  end;
+
+  exit (if !total_failed > 0 then 1 else 0)
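A quick standalone illustration of the attribute-quoting rules used by serialize_node above (a sketch mirroring can_unquote_attr_value and choose_quote, not the test runner itself): values free of whitespace and quoting-sensitive characters are emitted unquoted, and a value containing a double quote but no single quote is wrapped in single quotes.

(* Sketch only: unquoted vs. quoted attribute values, and quote choice. *)
let can_unquote value =
  if value = "" then false
  else begin
    let ok = ref true in
    String.iter (fun c ->
      if c = '>' || c = '"' || c = '\'' || c = '=' || c = '`' ||
         c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' then
        ok := false) value;
    !ok
  end

let choose_quote value =
  if String.contains value '"' && not (String.contains value '\'') then '\''
  else '"'

let () =
  assert (can_unquote "simple");
  assert (not (can_unquote "two words"));
  assert (choose_quote "say \"hi\"" = '\'');
  assert (choose_quote "it's fine" = '"')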

+386 test/test_tokenizer.ml
···
+(* Test runner for html5lib-tests tokenizer tests *)
+
+open Bytesrw
+
+module Tokenizer = Html5rw_tokenizer
+
+(* Token collector sink - collects all tokens into a list *)
+module TokenCollector = struct
+  type t = {
+    mutable tokens : Tokenizer.Token.t list;
+  }
+
+  let create () = { tokens = [] }
+
+  let process t token =
+    t.tokens <- token :: t.tokens;
+    `Continue
+
+  let adjusted_current_node_in_html_namespace _ = true
+
+  let get_tokens t = List.rev t.tokens
+end
+
+(* Test case representation *)
+type test_error = {
+  code : string;
+  line : int;
+  col : int;
+}
+
+type test_case = {
+  description : string;
+  input : string;
+  output : Jsont.json list;
+  errors : test_error list;
+  initial_states : string list;
+  last_start_tag : string option;
+  double_escaped : bool;
+}
+
+(* Unescape double-escaped strings from tests *)
+let unescape_double s =
+  let b = Buffer.create (String.length s) in
+  let i = ref 0 in
+  while !i < String.length s do
+    if !i + 1 < String.length s && s.[!i] = '\\' then begin
+      match s.[!i + 1] with
+      | 'u' when !i + 5 < String.length s ->
+        let hex = String.sub s (!i + 2) 4 in
+        (try
+           let code = int_of_string ("0x" ^ hex) in
+           if code < 128 then Buffer.add_char b (Char.chr code)
+           else begin
+             (* UTF-8 encode *)
+             if code < 0x800 then begin
+               Buffer.add_char b (Char.chr (0xC0 lor (code lsr 6)));
+               Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F)))
+             end else begin
+               Buffer.add_char b (Char.chr (0xE0 lor (code lsr 12)));
+               Buffer.add_char b (Char.chr (0x80 lor ((code lsr 6) land 0x3F)));
+               Buffer.add_char b (Char.chr (0x80 lor (code land 0x3F)))
+             end
+           end;
+           i := !i + 6
+         with _ ->
+           Buffer.add_char b s.[!i];
+           incr i)
+      | 'n' -> Buffer.add_char b '\n'; i := !i + 2
+      | 'r' -> Buffer.add_char b '\r'; i := !i + 2
+      | 't' -> Buffer.add_char b '\t'; i := !i + 2
+      | '\\' -> Buffer.add_char b '\\'; i := !i + 2
+      | _ -> Buffer.add_char b s.[!i]; incr i
+    end else begin
+      Buffer.add_char b s.[!i];
+      incr i
+    end
+  done;
+  Buffer.contents b
+
+(* Extract string from JSON node *)
+let json_string = function
+  | Jsont.String (s, _) -> s
+  | _ -> failwith "Expected string"
+
+let json_bool = function
+  | Jsont.Bool (b, _) -> b
+  | _ -> failwith "Expected bool"
+
+let json_int = function
+  | Jsont.Number (n, _) -> int_of_float n
+  | _ -> failwith "Expected number"
+
+let json_array = function
+  | Jsont.Array (arr, _) -> arr
+  | _ -> failwith "Expected array"
+
+let json_object = function
+  | Jsont.Object (obj, _) -> obj
+  | _ -> failwith "Expected object"
+
+let json_mem name obj =
+  match List.find_opt (fun ((n, _), _) -> n = name) obj with
+  | Some (_, v) -> Some v
+  | None -> None
+
+let json_mem_exn name obj =
+  match json_mem name obj with
+  | Some v -> v
+  | None -> failwith ("Missing member: " ^ name)
+
+(* Parse test error from JSON *)
+let parse_test_error json =
+  let obj = json_object json in
+  {
+    code = json_string (json_mem_exn "code" obj);
+    line = json_int (json_mem_exn "line" obj);
+    col = json_int (json_mem_exn "col" obj);
+  }
+
+(* Parse a single test case from JSON *)
+let parse_test_case json =
+  let obj = json_object json in
+  let description = json_string (json_mem_exn "description" obj) in
+  let input = json_string (json_mem_exn "input" obj) in
+  let output = json_array (json_mem_exn "output" obj) in
+  let errors = match json_mem "errors" obj with
+    | Some e -> List.map parse_test_error (json_array e)
+    | None -> []
+  in
+  let initial_states = match json_mem "initialStates" obj with
+    | Some s -> List.map json_string (json_array s)
+    | None -> ["Data state"]
+  in
+  let last_start_tag = match json_mem "lastStartTag" obj with
+    | Some s -> Some (json_string s)
+    | None -> None
+  in
+  let double_escaped = match json_mem "doubleEscaped" obj with
+    | Some b -> json_bool b
+    | None -> false
+  in
+  { description; input; output; errors; initial_states; last_start_tag; double_escaped }
+
+(* Convert state name to State.t *)
+let state_of_string = function
+  | "Data state" -> Tokenizer.State.Data
+  | "PLAINTEXT state" -> Tokenizer.State.Plaintext
+  | "RCDATA state" -> Tokenizer.State.Rcdata
+  | "RAWTEXT state" -> Tokenizer.State.Rawtext
+  | "Script data state" -> Tokenizer.State.Script_data
+  | "CDATA section state" -> Tokenizer.State.Cdata_section
+  | s -> failwith ("Unknown state: " ^ s)
+
+(* Convert our token to test format for comparison *)
+let token_to_test_json (tok : Tokenizer.Token.t) : Jsont.json list =
+  let str s = Jsont.String (s, Jsont.Meta.none) in
+  let arr l = Jsont.Array (l, Jsont.Meta.none) in
+  match tok with
+  | Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } ->
+    let name_json = match name with
+      | Some n -> str n
+      | None -> Jsont.Null ((), Jsont.Meta.none)
+    in
+    let public_json = match public_id with
+      | Some p -> str p
+      | None -> Jsont.Null ((), Jsont.Meta.none)
+    in
+    let system_json = match system_id with
+      | Some s -> str s
+      | None -> Jsont.Null ((), Jsont.Meta.none)
+    in
+    let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in
+    [arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]]
+  | Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } ->
+    let attrs_obj = Jsont.Object (
+      List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs),
+      Jsont.Meta.none
+    ) in
+    if self_closing then
+      [arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]]
+    else
+      [arr [str "StartTag"; str name; attrs_obj]]
+  | Tokenizer.Token.Tag { kind = End; name; _ } ->
+    [arr [str "EndTag"; str name]]
+  | Tokenizer.Token.Comment data ->
+    [arr [str "Comment"; str data]]
+  | Tokenizer.Token.Character data ->
+    (* Split into individual characters for comparison - but actually
+       the tests expect consecutive characters to be merged *)
+    [arr [str "Character"; str data]]
+  | Tokenizer.Token.EOF -> []
+
+(* Compare JSON values for equality *)
+let rec json_equal a b =
+  match a, b with
+  | Jsont.Null _, Jsont.Null _ -> true
+  | Jsont.Bool (a, _), Jsont.Bool (b, _) -> a = b
+  | Jsont.Number (a, _), Jsont.Number (b, _) -> a = b
+  | Jsont.String (a, _), Jsont.String (b, _) -> a = b
+  | Jsont.Array (a, _), Jsont.Array (b, _) ->
+    List.length a = List.length b &&
+    List.for_all2 json_equal a b
+  | Jsont.Object (a, _), Jsont.Object (b, _) ->
+    let a_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) a in
+    let b_sorted = List.sort (fun ((n1, _), _) ((n2, _), _) -> String.compare n1 n2) b in
+    List.length a_sorted = List.length b_sorted &&
+    List.for_all2 (fun ((n1, _), v1) ((n2, _), v2) -> n1 = n2 && json_equal v1 v2) a_sorted b_sorted
+  | _ -> false
+
+(* Merge consecutive Character tokens *)
+let merge_character_tokens tokens =
+  let rec loop acc = function
+    | [] -> List.rev acc
+    | Tokenizer.Token.Character s1 :: Tokenizer.Token.Character s2 :: rest ->
+      loop acc (Tokenizer.Token.Character (s1 ^ s2) :: rest)
+    | tok :: rest -> loop (tok :: acc) rest
+  in
+  loop [] tokens
+
+(* Run a single test *)
+let run_test test initial_state =
+  let input = if test.double_escaped then unescape_double test.input else test.input in
+
+  let collector = TokenCollector.create () in
+  let tokenizer = Tokenizer.create (module TokenCollector) collector ~collect_errors:true () in
+
+  (* Set initial state *)
+  Tokenizer.set_state tokenizer initial_state;
+
+  (* Set last start tag if specified *)
+  (match test.last_start_tag with
+   | Some tag -> Tokenizer.set_last_start_tag tokenizer tag
+   | None -> ());
+
+  (* Run tokenizer *)
+  let reader = Bytes.Reader.of_string input in
+  Tokenizer.run tokenizer (module TokenCollector) reader;
+
+  (* Get results *)
+  let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in
+  let actual_tokens = List.concat_map token_to_test_json tokens in
+
+  (* Unescape expected output if double_escaped *)
+  let expected_output = if test.double_escaped then
+    let rec unescape_json = function
+      | Jsont.String (s, m) -> Jsont.String (unescape_double s, m)
+      | Jsont.Array (arr, m) -> Jsont.Array (List.map unescape_json arr, m)
+      | Jsont.Object (obj, m) ->
+        Jsont.Object (List.map (fun (n, v) -> (n, unescape_json v)) obj, m)
+      | other -> other
+    in
+    List.map unescape_json test.output
+  else test.output
+  in
+
+  (* Merge consecutive Character tokens in expected output too *)
+  let rec merge_expected = function
+    | [] -> []
+    | [x] -> [x]
+    | Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s1, m1)], am1) ::
+      Jsont.Array ([Jsont.String ("Character", _); Jsont.String (s2, _)], _) ::
+      rest ->
+      merge_expected (Jsont.Array ([Jsont.String ("Character", Jsont.Meta.none); Jsont.String (s1 ^ s2, m1)], am1) :: rest)
+    | x :: rest -> x :: merge_expected rest
+  in
+  let expected = merge_expected expected_output in
+
+  (* Compare *)
+  let tokens_match =
+    List.length actual_tokens = List.length expected &&
+    List.for_all2 json_equal actual_tokens expected
+  in
+
+  let actual_errors = Tokenizer.get_errors tokenizer in
+  let errors_count_match = List.length actual_errors = List.length test.errors in
+
+  (tokens_match && errors_count_match, actual_tokens, expected, actual_errors, test.errors)
+
+(* Format JSON for display *)
+let rec json_to_string = function
+  | Jsont.Null _ -> "null"
+  | Jsont.Bool (b, _) -> string_of_bool b
+  | Jsont.Number (n, _) -> Printf.sprintf "%g" n
+  | Jsont.String (s, _) -> Printf.sprintf "%S" s
+  | Jsont.Array (arr, _) ->
+    "[" ^ String.concat ", " (List.map json_to_string arr) ^ "]"
+  | Jsont.Object (obj, _) ->
+    "{" ^ String.concat ", " (List.map (fun ((n, _), v) -> Printf.sprintf "%S: %s" n (json_to_string v)) obj) ^ "}"
+
+(* Run all tests in a file *)
+let run_file path =
+  let content =
+    let ic = open_in path in
+    let n = in_channel_length ic in
+    let s = really_input_string ic n in
+    close_in ic;
+    s
+  in
+
+  (* Parse JSON *)
+  let json = match Jsont_bytesrw.decode_string Jsont.json content with
+    | Ok j -> j
+    | Error e -> failwith (Printf.sprintf "JSON parse error in %s: %s" path e)
+  in
+
+  let obj = json_object json in
+
+  (* Handle both {"tests": [...]} and {"xmlViolationTests": [...], "tests": [...]} formats *)
+  let test_arrays =
+    let tests = match json_mem "tests" obj with
+      | Some t -> json_array t
+      | None -> []
+    in
+    let xml_tests = match json_mem "xmlViolationTests" obj with
+      | Some t -> json_array t
+      | None -> []
+    in
+    tests @ xml_tests
+  in
+
+  let filename = Filename.basename path in
+  let passed = ref 0 in
+  let failed = ref 0 in
+  let first_failures = ref [] in
+
+  List.iteri (fun i test_json ->
+    let test = parse_test_case test_json in
+
+    (* Run for each initial state *)
+    List.iter (fun state_name ->
+      try
+        let state = state_of_string state_name in
+        let (success, actual, expected, actual_errors, expected_errors) = run_test test state in
+
+        if success then
+          incr passed
+        else begin
+          incr failed;
+          if List.length !first_failures < 3 then
+            first_failures := (i + 1, test.description, state_name, actual, expected, actual_errors, expected_errors) :: !first_failures
+        end
+      with e ->
+        incr failed;
+        if List.length !first_failures < 3 then
+          first_failures := (i + 1, test.description, state_name, [], [], [], []) :: !first_failures;
+        Printf.eprintf "Exception in test %d (%s): %s\n" (i + 1) test.description (Printexc.to_string e)
+    ) test.initial_states
+  ) test_arrays;
+
+  (!passed, !failed, List.rev !first_failures, filename)
+
+let () =
+  let test_dir = Sys.argv.(1) in
+  let files = Sys.readdir test_dir |> Array.to_list in
+  let test_files = List.filter (fun f -> Filename.check_suffix f ".test") files in
+
+  let total_passed = ref 0 in
+  let total_failed = ref 0 in
+  let all_failures = ref [] in
+
+  List.iter (fun file ->
+    let path = Filename.concat test_dir file in
+    let (passed, failed, failures, filename) = run_file path in
+    total_passed := !total_passed + passed;
+    total_failed := !total_failed + failed;
+    if failures <> [] then
+      all_failures := (filename, failures) :: !all_failures;
+    Printf.printf "%s: %d passed, %d failed\n" filename passed failed
+  ) (List.sort String.compare test_files);
+
+  Printf.printf "\n=== Summary ===\n";
+  Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;
+
+  if !all_failures <> [] then begin
+    Printf.printf "\n=== First failures ===\n";
+    List.iter (fun (filename, failures) ->
+      List.iter (fun (test_num, desc, state, actual, expected, actual_errs, expected_errs) ->
+        Printf.printf "\n--- %s test %d (%s) in %s ---\n" filename test_num state desc;
+        Printf.printf "Expected tokens: [%s]\n" (String.concat "; " (List.map json_to_string expected));
+        Printf.printf "Actual tokens: [%s]\n" (String.concat "; " (List.map json_to_string actual));
+        Printf.printf "Expected %d errors, got %d\n" (List.length expected_errs) (List.length actual_errs)
+      ) failures
+    ) (List.rev !all_failures)
+  end;
+
+  exit (if !total_failed > 0 then 1 else 0)
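Finally, a standalone sketch of the character-token merging the runner applies to both actual and expected output (simplified token type, not the tokenizer's): adjacent character tokens are concatenated so "a" followed by "b" compares equal to a single "ab" token.

(* Sketch only: merge adjacent character tokens before comparison. *)
type tok = Character of string | Other of string

let merge tokens =
  let rec loop acc = function
    | [] -> List.rev acc
    | Character a :: Character b :: rest -> loop acc (Character (a ^ b) :: rest)
    | t :: rest -> loop (t :: acc) rest
  in
  loop [] tokens

let () =
  assert (merge [Character "a"; Character "b"; Other "eof"]
          = [Character "ab"; Other "eof"])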