+2
-2
lib/encoding/decode.ml
+2
-2
lib/encoding/decode.ml
···
186
186
match Prescan.prescan_for_meta_charset data with
187
187
| Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
188
188
| None ->
189
-
(* Default to UTF-8 *)
190
-
(decode_with_encoding data Encoding.Utf8 ~bom_len:0, Encoding.Utf8)
189
+
(* Default to Windows-1252 per HTML5 spec when no encoding detected *)
190
+
(decode_with_encoding data Encoding.Windows_1252 ~bom_len:0, Encoding.Windows_1252)
+8
-5
lib/encoding/prescan.ml
+8
-5
lib/encoding/prescan.ml
···
97
97
if !j + 2 < len then
98
98
i := !j + 3
99
99
else
100
-
result := None (* Unclosed comment, stop scanning *)
100
+
i := len (* Unclosed comment - stop scanning *)
101
101
end
102
102
(* Check for end tag - skip it *)
103
103
else if !i + 1 < len && Bytes.get data (!i + 1) = '/' then begin
104
104
let j = ref (!i + 2) in
105
105
let in_quote = ref None in
106
-
while !j < len && !j < max_total && !non_comment < max_non_comment do
106
+
let done_tag = ref false in
107
+
while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do
107
108
let c = Bytes.get data !j in
108
109
match !in_quote with
109
110
| None ->
···
114
115
end else if c = '>' then begin
115
116
incr j;
116
117
incr non_comment;
117
-
j := len (* Exit loop *)
118
+
done_tag := true
118
119
end else begin
119
120
incr j;
120
121
incr non_comment
···
138
139
if tag_name <> "meta" then begin
139
140
(* Skip non-meta tag *)
140
141
let in_quote = ref None in
141
-
while !j < len && !j < max_total && !non_comment < max_non_comment do
142
+
let done_tag = ref false in
143
+
while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do
142
144
let c = Bytes.get data !j in
143
145
match !in_quote with
144
146
| None ->
···
149
151
end else if c = '>' then begin
150
152
incr j;
151
153
incr non_comment;
152
-
j := len
154
+
done_tag := true
153
155
end else begin
154
156
incr j;
155
157
incr non_comment
···
240
242
| None -> ());
241
243
242
244
(* Check for http-equiv="content-type" with content *)
245
+
(* Note: http-equiv value must be exactly "content-type" (case-insensitive) *)
243
246
if !result = None then
244
247
(match !http_equiv, !content with
245
248
| Some he, Some ct when String.lowercase_ascii he = "content-type" ->
+108
lib/tokenizer/stream.ml
+108
lib/tokenizer/stream.ml
···
22
22
mutable last_was_cr : bool;
23
23
(* Track if we need to skip the next LF from raw stream (set after peek of CR) *)
24
24
mutable skip_next_lf : bool;
25
+
(* Error callback for surrogate/noncharacter detection *)
26
+
mutable error_callback : (string -> unit) option;
25
27
}
26
28
27
29
(* Create a stream from a Bytes.Reader.t *)
···
36
38
column = 0;
37
39
last_was_cr = false;
38
40
skip_next_lf = false;
41
+
error_callback = None;
39
42
}
40
43
44
+
let set_error_callback t cb =
45
+
t.error_callback <- Some cb
46
+
47
+
(* Check if a Unicode codepoint is a surrogate *)
48
+
let is_surrogate cp = cp >= 0xD800 && cp <= 0xDFFF
49
+
50
+
(* Check if a Unicode codepoint is a noncharacter *)
51
+
let is_noncharacter cp =
52
+
(* U+FDD0 to U+FDEF *)
53
+
(cp >= 0xFDD0 && cp <= 0xFDEF) ||
54
+
(* U+FFFE and U+FFFF in each plane (0-16) *)
55
+
((cp land 0xFFFF) = 0xFFFE || (cp land 0xFFFF) = 0xFFFF)
56
+
41
57
(* Create a stream from a string - discouraged, prefer create_from_reader *)
42
58
let create input =
43
59
create_from_reader (Bytes.Reader.of_string input)
···
78
94
let push_back_char t c =
79
95
t.lookahead <- c :: t.lookahead
80
96
97
+
(* Check for surrogates and noncharacters in UTF-8 sequences.
98
+
Called after reading a lead byte, peeks continuation bytes to decode codepoint. *)
99
+
let check_utf8_codepoint t lead_byte =
100
+
let b0 = Char.code lead_byte in
101
+
if b0 < 0x80 then
102
+
(* ASCII - no surrogates or noncharacters possible in this range except control chars *)
103
+
()
104
+
else if b0 >= 0xC2 && b0 <= 0xDF then begin
105
+
(* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *)
106
+
(* Check for C1 control characters U+0080-U+009F *)
107
+
match read_raw_char t with
108
+
| Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
109
+
let b1 = Char.code c1 in
110
+
let cp = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) in
111
+
push_back_char t c1;
112
+
(* C1 controls: U+0080 to U+009F *)
113
+
if cp >= 0x80 && cp <= 0x9F then
114
+
(match t.error_callback with
115
+
| Some cb -> cb "control-character-in-input-stream"
116
+
| None -> ())
117
+
| Some c1 ->
118
+
push_back_char t c1
119
+
| None -> ()
120
+
end else if b0 >= 0xE0 && b0 <= 0xEF then begin
121
+
(* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx -> U+0800 to U+FFFF *)
122
+
(* Need to peek 2 continuation bytes *)
123
+
match read_raw_char t with
124
+
| Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
125
+
let b1 = Char.code c1 in
126
+
(match read_raw_char t with
127
+
| Some c2 when (Char.code c2 land 0xC0) = 0x80 ->
128
+
let b2 = Char.code c2 in
129
+
let cp = ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in
130
+
push_back_char t c2;
131
+
push_back_char t c1;
132
+
(* Check for surrogates and noncharacters *)
133
+
(match t.error_callback with
134
+
| Some cb ->
135
+
if is_surrogate cp then cb "surrogate-in-input-stream"
136
+
else if is_noncharacter cp then cb "noncharacter-in-input-stream"
137
+
| None -> ())
138
+
| Some c2 ->
139
+
push_back_char t c2;
140
+
push_back_char t c1
141
+
| None ->
142
+
push_back_char t c1)
143
+
| Some c1 ->
144
+
push_back_char t c1
145
+
| None -> ()
146
+
end else if b0 >= 0xF0 && b0 <= 0xF4 then begin
147
+
(* 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> U+10000 to U+10FFFF *)
148
+
match read_raw_char t with
149
+
| Some c1 when (Char.code c1 land 0xC0) = 0x80 ->
150
+
let b1 = Char.code c1 in
151
+
(match read_raw_char t with
152
+
| Some c2 when (Char.code c2 land 0xC0) = 0x80 ->
153
+
let b2 = Char.code c2 in
154
+
(match read_raw_char t with
155
+
| Some c3 when (Char.code c3 land 0xC0) = 0x80 ->
156
+
let b3 = Char.code c3 in
157
+
let cp = ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor
158
+
((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in
159
+
push_back_char t c3;
160
+
push_back_char t c2;
161
+
push_back_char t c1;
162
+
(* Check for noncharacters (no surrogates in 4-byte range) *)
163
+
(match t.error_callback with
164
+
| Some cb ->
165
+
if is_noncharacter cp then cb "noncharacter-in-input-stream"
166
+
| None -> ())
167
+
| Some c3 ->
168
+
push_back_char t c3;
169
+
push_back_char t c2;
170
+
push_back_char t c1
171
+
| None ->
172
+
push_back_char t c2;
173
+
push_back_char t c1)
174
+
| Some c2 ->
175
+
push_back_char t c2;
176
+
push_back_char t c1
177
+
| None ->
178
+
push_back_char t c1)
179
+
| Some c1 ->
180
+
push_back_char t c1
181
+
| None -> ()
182
+
end
183
+
81
184
(* Read next char with CR/LF normalization *)
82
185
let rec read_normalized_char t =
186
+
(* Track if we're reading from lookahead - if so, we've already checked this byte *)
187
+
let from_lookahead = t.lookahead <> [] in
83
188
match read_raw_char t with
84
189
| None ->
85
190
t.last_was_cr <- false;
···
98
203
read_normalized_char t
99
204
| Some c ->
100
205
t.last_was_cr <- false;
206
+
(* Only check for surrogates/noncharacters when reading fresh from stream,
207
+
not when re-reading from lookahead (to avoid duplicate errors) *)
208
+
if not from_lookahead then check_utf8_codepoint t c;
101
209
Some c
102
210
103
211
let is_eof t =
+105
-9
lib/tokenizer/tokenizer.ml
+105
-9
lib/tokenizer/tokenizer.ml
···
39
39
mutable pending_chars : Buffer.t;
40
40
mutable errors : Errors.t list;
41
41
collect_errors : bool;
42
+
xml_mode : bool; (* XML violation mode: transform chars for XML compatibility *)
42
43
}
43
44
44
-
let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) () = {
45
+
let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) ?(xml_mode=false) () = {
45
46
stream = Stream.create "";
46
47
sink;
47
48
state = State.Data;
···
63
64
pending_chars = Buffer.create 256;
64
65
errors = [];
65
66
collect_errors;
67
+
xml_mode;
66
68
}
67
69
68
70
let error t code =
···
73
75
74
76
(* emit functions are defined locally inside run *)
75
77
78
+
(* XML mode character transformation: form feed → space *)
76
79
let emit_char t c =
77
-
Buffer.add_char t.pending_chars c
80
+
if t.xml_mode && c = '\x0C' then
81
+
Buffer.add_char t.pending_chars ' '
82
+
else
83
+
Buffer.add_char t.pending_chars c
78
84
85
+
(* XML mode string transformation: U+FFFF → U+FFFD, form feed → space *)
79
86
let emit_str t s =
80
-
Buffer.add_string t.pending_chars s
87
+
if t.xml_mode then begin
88
+
(* Transform: \xEF\xBF\xBF (U+FFFF) → \xEF\xBF\xBD (U+FFFD), \x0C → space *)
89
+
let len = String.length s in
90
+
let i = ref 0 in
91
+
while !i < len do
92
+
let c = s.[!i] in
93
+
if c = '\x0C' then begin
94
+
Buffer.add_char t.pending_chars ' ';
95
+
incr i
96
+
end else if c = '\xEF' && !i + 2 < len && s.[!i+1] = '\xBF' && s.[!i+2] = '\xBF' then begin
97
+
(* U+FFFF → U+FFFD *)
98
+
Buffer.add_string t.pending_chars "\xEF\xBF\xBD";
99
+
i := !i + 3
100
+
end else begin
101
+
Buffer.add_char t.pending_chars c;
102
+
incr i
103
+
end
104
+
done
105
+
end else
106
+
Buffer.add_string t.pending_chars s
81
107
82
108
let start_new_tag t kind =
83
109
Buffer.clear t.current_tag_name;
···
130
156
let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) =
131
157
t.stream <- Stream.create_from_reader reader;
132
158
t.errors <- [];
159
+
(* Set up error callback for surrogate/noncharacter detection in stream *)
160
+
(* In XML mode, we don't report noncharacter errors - we transform them instead *)
161
+
if not t.xml_mode then
162
+
Stream.set_error_callback t.stream (fun code -> error t code);
163
+
164
+
(* XML mode transformation for pending chars: U+FFFF → U+FFFD *)
165
+
let transform_xml_chars data =
166
+
let len = String.length data in
167
+
let buf = Buffer.create len in
168
+
let i = ref 0 in
169
+
while !i < len do
170
+
let c = data.[!i] in
171
+
if c = '\xEF' && !i + 2 < len && data.[!i+1] = '\xBF' && data.[!i+2] = '\xBF' then begin
172
+
(* U+FFFF → U+FFFD *)
173
+
Buffer.add_string buf "\xEF\xBF\xBD";
174
+
i := !i + 3
175
+
end else begin
176
+
Buffer.add_char buf c;
177
+
incr i
178
+
end
179
+
done;
180
+
Buffer.contents buf
181
+
in
133
182
134
183
(* Local emit functions with access to S *)
135
184
let emit_pending_chars () =
136
185
if Buffer.length t.pending_chars > 0 then begin
137
186
let data = Buffer.contents t.pending_chars in
138
187
Buffer.clear t.pending_chars;
188
+
let data = if t.xml_mode then transform_xml_chars data else data in
139
189
ignore (S.process t.sink (Token.Character data))
140
190
end
141
191
in
···
180
230
in
181
231
182
232
let emit_current_comment () =
183
-
emit (Token.Comment (Buffer.contents t.current_comment))
233
+
let content = Buffer.contents t.current_comment in
234
+
let content =
235
+
if t.xml_mode then begin
236
+
(* XML mode: transform -- to - - in comments *)
237
+
let buf = Buffer.create (String.length content + 10) in
238
+
let len = String.length content in
239
+
let i = ref 0 in
240
+
while !i < len do
241
+
if !i + 1 < len && content.[!i] = '-' && content.[!i+1] = '-' then begin
242
+
Buffer.add_string buf "- -";
243
+
i := !i + 2
244
+
end else begin
245
+
Buffer.add_char buf content.[!i];
246
+
incr i
247
+
end
248
+
done;
249
+
Buffer.contents buf
250
+
end else content
251
+
in
252
+
emit (Token.Comment content)
184
253
in
185
254
186
255
(* Check for control characters and emit error if needed *)
256
+
(* Only checks ASCII control chars; C1 controls (U+0080-U+009F) are 2-byte in UTF-8 *)
187
257
let check_control_char c =
188
258
let code = Char.code c in
189
-
(* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F-U+009F *)
259
+
(* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F *)
190
260
(* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *)
261
+
(* Note: U+0080-U+009F (C1 controls) are 2-byte UTF-8 sequences starting with 0xC2 *)
262
+
(* Note: We only check single-byte control chars here; multi-byte checks are TODO *)
191
263
if (code >= 0x01 && code <= 0x08) ||
192
264
code = 0x0B ||
193
265
(code >= 0x0E && code <= 0x1F) ||
194
-
(code >= 0x7F && code <= 0x9F) then
266
+
code = 0x7F then
195
267
error t "control-character-in-input-stream"
196
268
in
269
+
197
270
198
271
(* Emit char with control character check *)
199
272
let emit_char_checked c =
···
294
367
| State.Script_data_escaped
295
368
| State.Script_data_escaped_dash
296
369
| State.Script_data_escaped_dash_dash ->
370
+
error t "eof-in-script-html-comment-like-text";
297
371
emit_pending_chars ();
298
372
ignore (S.process t.sink Token.EOF)
299
373
| State.Script_data_escaped_less_than_sign ->
···
313
387
| State.Script_data_double_escaped
314
388
| State.Script_data_double_escaped_dash
315
389
| State.Script_data_double_escaped_dash_dash ->
390
+
error t "eof-in-script-html-comment-like-text";
316
391
emit_pending_chars ();
317
392
ignore (S.process t.sink Token.EOF)
318
393
| State.Script_data_double_escaped_less_than_sign ->
···
647
722
error t "unexpected-null-character";
648
723
Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
649
724
| Some c ->
725
+
check_control_char c;
650
726
Buffer.add_char t.current_tag_name (ascii_lower c)
651
727
| None -> ()
652
728
···
1015
1091
Buffer.add_char t.current_attr_name (Option.get c_opt)
1016
1092
| Some c ->
1017
1093
Stream.advance t.stream;
1094
+
check_control_char c;
1018
1095
Buffer.add_char t.current_attr_name (ascii_lower c)
1019
1096
1020
1097
and state_after_attribute_name () =
···
1065
1142
error t "unexpected-null-character";
1066
1143
Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
1067
1144
| Some c ->
1145
+
check_control_char c;
1068
1146
Buffer.add_char t.current_attr_value c
1069
1147
| None -> ()
1070
1148
···
1079
1157
error t "unexpected-null-character";
1080
1158
Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
1081
1159
| Some c ->
1160
+
check_control_char c;
1082
1161
Buffer.add_char t.current_attr_value c
1083
1162
| None -> ()
1084
1163
···
1105
1184
Buffer.add_char t.current_attr_value (Option.get c_opt)
1106
1185
| Some c ->
1107
1186
Stream.advance t.stream;
1187
+
check_control_char c;
1108
1188
Buffer.add_char t.current_attr_value c
1109
1189
| None -> ()
1110
1190
···
1146
1226
error t "unexpected-null-character";
1147
1227
Buffer.add_string t.current_comment "\xEF\xBF\xBD"
1148
1228
| Some c ->
1229
+
check_control_char c;
1149
1230
Buffer.add_char t.current_comment c
1150
1231
| None -> ()
1151
1232
···
1212
1293
error t "unexpected-null-character";
1213
1294
Buffer.add_string t.current_comment "\xEF\xBF\xBD"
1214
1295
| Some c ->
1296
+
check_control_char c;
1215
1297
Buffer.add_char t.current_comment c
1216
1298
| None -> ()
1217
1299
···
1327
1409
| None -> ()
1328
1410
| Some c ->
1329
1411
Stream.advance t.stream;
1412
+
check_control_char c;
1330
1413
start_new_doctype t;
1331
1414
t.current_doctype_name <- Some (Buffer.create 8);
1332
1415
Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c);
···
1343
1426
error t "unexpected-null-character";
1344
1427
Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"
1345
1428
| Some c ->
1429
+
check_control_char c;
1346
1430
Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c)
1347
1431
| None -> ()
1348
1432
···
1356
1440
emit_current_doctype ()
1357
1441
| None -> ()
1358
1442
| Some _ ->
1443
+
(* Don't check control char here - bogus_doctype will check when it consumes *)
1359
1444
if Stream.matches_ci t.stream "PUBLIC" then begin
1360
1445
ignore (Stream.consume_exact_ci t.stream "PUBLIC");
1361
1446
t.state <- State.After_doctype_public_keyword
···
1391
1476
emit_current_doctype ()
1392
1477
| None -> ()
1393
1478
| Some _ ->
1479
+
(* Don't check control char here - bogus_doctype will check when it consumes *)
1394
1480
error t "missing-quote-before-doctype-public-identifier";
1395
1481
t.current_doctype_force_quirks <- true;
1396
1482
t.state <- State.Bogus_doctype
···
1432
1518
t.state <- State.Data;
1433
1519
emit_current_doctype ()
1434
1520
| Some c ->
1521
+
check_control_char c;
1435
1522
Buffer.add_char (Option.get t.current_doctype_public) c
1436
1523
| None -> ()
1437
1524
···
1448
1535
t.state <- State.Data;
1449
1536
emit_current_doctype ()
1450
1537
| Some c ->
1538
+
check_control_char c;
1451
1539
Buffer.add_char (Option.get t.current_doctype_public) c
1452
1540
| None -> ()
1453
1541
···
1472
1560
t.state <- State.Doctype_system_identifier_single_quoted
1473
1561
| None -> ()
1474
1562
| Some _ ->
1563
+
(* Don't check control char here - bogus_doctype will check when it consumes *)
1475
1564
error t "missing-quote-before-doctype-system-identifier";
1476
1565
t.current_doctype_force_quirks <- true;
1477
1566
t.state <- State.Bogus_doctype
···
1494
1583
t.state <- State.Doctype_system_identifier_single_quoted
1495
1584
| None -> ()
1496
1585
| Some _ ->
1586
+
(* Don't check control char here - bogus_doctype will check when it consumes *)
1497
1587
error t "missing-quote-before-doctype-system-identifier";
1498
1588
t.current_doctype_force_quirks <- true;
1499
1589
t.state <- State.Bogus_doctype
···
1521
1611
emit_current_doctype ()
1522
1612
| None -> ()
1523
1613
| Some _ ->
1614
+
(* Don't check control char here - bogus_doctype will check when it consumes *)
1524
1615
error t "missing-quote-before-doctype-system-identifier";
1525
1616
t.current_doctype_force_quirks <- true;
1526
1617
t.state <- State.Bogus_doctype
···
1545
1636
emit_current_doctype ()
1546
1637
| None -> ()
1547
1638
| Some _ ->
1639
+
(* Don't check control char here - bogus_doctype will check when it consumes *)
1548
1640
error t "missing-quote-before-doctype-system-identifier";
1549
1641
t.current_doctype_force_quirks <- true;
1550
1642
t.state <- State.Bogus_doctype
···
1562
1654
t.state <- State.Data;
1563
1655
emit_current_doctype ()
1564
1656
| Some c ->
1657
+
check_control_char c;
1565
1658
Buffer.add_char (Option.get t.current_doctype_system) c
1566
1659
| None -> ()
1567
1660
···
1578
1671
t.state <- State.Data;
1579
1672
emit_current_doctype ()
1580
1673
| Some c ->
1674
+
check_control_char c;
1581
1675
Buffer.add_char (Option.get t.current_doctype_system) c
1582
1676
| None -> ()
1583
1677
···
1591
1685
emit_current_doctype ()
1592
1686
| None -> ()
1593
1687
| Some _ ->
1688
+
(* Don't check control char here - bogus_doctype will check when it consumes *)
1594
1689
error t "unexpected-character-after-doctype-system-identifier";
1595
1690
t.state <- State.Bogus_doctype
1596
1691
···
1601
1696
emit_current_doctype ()
1602
1697
| Some '\x00' ->
1603
1698
error t "unexpected-null-character"
1604
-
| Some _ -> ()
1699
+
| Some c ->
1700
+
check_control_char c (* Check all chars in bogus doctype *)
1605
1701
| None -> ()
1606
1702
1607
1703
and state_cdata_section () =
···
1609
1705
| Some ']' ->
1610
1706
t.state <- State.Cdata_section_bracket
1611
1707
| Some c ->
1612
-
(* CDATA section emits all characters as-is, including NUL *)
1613
-
emit_char t c
1708
+
(* CDATA section emits all characters as-is, including NUL, but still check for control chars *)
1709
+
emit_char_checked c
1614
1710
| None -> ()
1615
1711
1616
1712
and state_cdata_section_bracket () =
+577
-66
test/test_serializer.ml
+577
-66
test/test_serializer.ml
···
12
12
| Jsont.String (s, _) -> Some s
13
13
| _ -> failwith "Expected string or null"
14
14
15
+
let json_bool = function
16
+
| Jsont.Bool (b, _) -> b
17
+
| _ -> failwith "Expected bool"
18
+
15
19
let json_array = function
16
20
| Jsont.Array (arr, _) -> arr
17
21
| _ -> failwith "Expected array"
···
30
34
| Some v -> v
31
35
| None -> failwith ("Missing member: " ^ name)
32
36
37
+
(* Serialization options *)
38
+
type serialize_options = {
39
+
quote_char : char;
40
+
quote_char_explicit : bool; (* Was quote_char explicitly set? *)
41
+
minimize_boolean_attributes : bool;
42
+
use_trailing_solidus : bool;
43
+
escape_lt_in_attrs : bool;
44
+
escape_rcdata : bool;
45
+
strip_whitespace : bool;
46
+
inject_meta_charset : bool;
47
+
encoding : string option;
48
+
omit_optional_tags : bool;
49
+
}
50
+
51
+
let default_options = {
52
+
quote_char = '"';
53
+
quote_char_explicit = false;
54
+
minimize_boolean_attributes = true;
55
+
use_trailing_solidus = false;
56
+
escape_lt_in_attrs = false;
57
+
escape_rcdata = false;
58
+
strip_whitespace = false;
59
+
inject_meta_charset = false;
60
+
encoding = None;
61
+
omit_optional_tags = true; (* HTML5 default *)
62
+
}
63
+
64
+
(* Parse options from JSON *)
65
+
let parse_options json_opt =
66
+
match json_opt with
67
+
| None -> default_options
68
+
| Some json ->
69
+
let obj = json_object json in
70
+
let get_bool name default =
71
+
match json_mem name obj with
72
+
| Some j -> (try json_bool j with _ -> default)
73
+
| None -> default
74
+
in
75
+
let get_string name =
76
+
match json_mem name obj with
77
+
| Some (Jsont.String (s, _)) -> Some s
78
+
| _ -> None
79
+
in
80
+
let quote_char_opt =
81
+
match json_mem "quote_char" obj with
82
+
| Some (Jsont.String (s, _)) when String.length s = 1 -> Some s.[0]
83
+
| _ -> None
84
+
in
85
+
{
86
+
quote_char = Option.value ~default:'"' quote_char_opt;
87
+
quote_char_explicit = Option.is_some quote_char_opt;
88
+
minimize_boolean_attributes = get_bool "minimize_boolean_attributes" (get_bool "quote_attr_values" true);
89
+
use_trailing_solidus = get_bool "use_trailing_solidus" false;
90
+
escape_lt_in_attrs = get_bool "escape_lt_in_attrs" false;
91
+
escape_rcdata = get_bool "escape_rcdata" false;
92
+
strip_whitespace = get_bool "strip_whitespace" false;
93
+
inject_meta_charset = get_bool "inject_meta_charset" false;
94
+
encoding = get_string "encoding";
95
+
omit_optional_tags = get_bool "omit_optional_tags" true;
96
+
}
97
+
33
98
(* Test case *)
34
99
type test_case = {
35
100
description : string;
36
101
input : Jsont.json list;
37
102
expected : string list;
103
+
options : serialize_options;
38
104
}
39
105
40
106
let parse_test_case json =
···
42
108
let description = json_string (json_mem_exn "description" obj) in
43
109
let input = json_array (json_mem_exn "input" obj) in
44
110
let expected = List.map json_string (json_array (json_mem_exn "expected" obj)) in
45
-
{ description; input; expected }
111
+
let options = parse_options (json_mem "options" obj) in
112
+
{ description; input; expected; options }
113
+
114
+
(* Parse attrs that can be either array [{name, value}] or object {name: value} or empty {} *)
115
+
let parse_attrs attrs_json =
116
+
match attrs_json with
117
+
| Jsont.Array (arr, _) ->
118
+
List.map (fun attr_json ->
119
+
let attr_obj = json_object attr_json in
120
+
let attr_name = json_string (json_mem_exn "name" attr_obj) in
121
+
let value = json_string (json_mem_exn "value" attr_obj) in
122
+
(attr_name, value)
123
+
) arr
124
+
| Jsont.Object (obj, _) ->
125
+
List.map (fun ((n, _), v) -> (n, json_string v)) obj
126
+
| _ -> []
46
127
47
-
(* Build a DOM node from test input token *)
48
-
let build_node_from_token token =
128
+
(* Void elements that don't need end tags *)
129
+
let is_void_element name =
130
+
List.mem (String.lowercase_ascii name)
131
+
["area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
132
+
"link"; "meta"; "param"; "source"; "track"; "wbr"]
133
+
134
+
(* Raw text elements whose content should not be escaped *)
135
+
let is_raw_text_element name =
136
+
List.mem (String.lowercase_ascii name) ["script"; "style"]
137
+
138
+
(* Elements where whitespace should be preserved *)
139
+
let is_whitespace_preserving_element name =
140
+
List.mem (String.lowercase_ascii name) ["pre"; "textarea"; "script"; "style"]
141
+
142
+
(* Block elements that close a p tag *)
143
+
let p_closing_elements = [
144
+
"address"; "article"; "aside"; "blockquote"; "datagrid"; "dialog"; "dir";
145
+
"div"; "dl"; "fieldset"; "footer"; "form"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6";
146
+
"header"; "hgroup"; "hr"; "main"; "menu"; "nav"; "ol"; "p"; "pre"; "section";
147
+
"table"; "ul"
148
+
]
149
+
150
+
let is_p_closing_element name =
151
+
List.mem (String.lowercase_ascii name) p_closing_elements
152
+
153
+
(* Collapse runs of whitespace to single space *)
154
+
let collapse_whitespace text =
155
+
let len = String.length text in
156
+
let buf = Buffer.create len in
157
+
let in_whitespace = ref false in
158
+
for i = 0 to len - 1 do
159
+
let c = text.[i] in
160
+
if c = '\t' || c = '\r' || c = '\n' || c = '\x0C' || c = ' ' then begin
161
+
if not !in_whitespace then begin
162
+
Buffer.add_char buf ' ';
163
+
in_whitespace := true
164
+
end
165
+
end else begin
166
+
Buffer.add_char buf c;
167
+
in_whitespace := false
168
+
end
169
+
done;
170
+
Buffer.contents buf
171
+
172
+
(* Token types for stream-based serialization *)
173
+
type token_type =
174
+
| StartTag of string * (string * string) list (* name, attrs *)
175
+
| EndTag of string (* name *)
176
+
| EmptyTag of string * (string * string) list (* name, attrs *)
177
+
| TextNode of string
178
+
| CommentNode of string
179
+
| DoctypeNode of Dom.node
180
+
181
+
type token_info = {
182
+
token : token_type option;
183
+
node : Dom.node option; (* Legacy for compatibility *)
184
+
tag_name : string option;
185
+
is_empty_tag : bool;
186
+
}
187
+
188
+
let build_token_info token =
49
189
let arr = json_array token in
50
190
match arr with
51
-
| [] -> None
191
+
| [] -> { token = None; node = None; tag_name = None; is_empty_tag = false }
52
192
| type_json :: rest ->
53
-
let token_type = json_string type_json in
54
-
match token_type, rest with
193
+
let token_type_str = json_string type_json in
194
+
match token_type_str, rest with
55
195
| "StartTag", [_ns_json; name_json; attrs_json] ->
56
196
let name = json_string name_json in
57
-
let attrs_list = json_array attrs_json in
58
-
let attrs = List.map (fun attr_json ->
59
-
let attr_obj = json_object attr_json in
60
-
let attr_name = json_string (json_mem_exn "name" attr_obj) in
61
-
let value = json_string (json_mem_exn "value" attr_obj) in
62
-
(attr_name, value)
63
-
) attrs_list in
64
-
Some (Dom.create_element name ~attrs ())
197
+
let attrs = parse_attrs attrs_json in
198
+
{ token = Some (StartTag (name, attrs));
199
+
node = Some (Dom.create_element name ~attrs ());
200
+
tag_name = Some name;
201
+
is_empty_tag = false }
65
202
66
203
| "StartTag", [name_json; attrs_json] ->
67
204
let name = json_string name_json in
68
-
let attrs_obj = json_object attrs_json in
69
-
let attrs = List.map (fun ((n, _), v) -> (n, json_string v)) attrs_obj in
70
-
Some (Dom.create_element name ~attrs ())
205
+
let attrs = parse_attrs attrs_json in
206
+
{ token = Some (StartTag (name, attrs));
207
+
node = Some (Dom.create_element name ~attrs ());
208
+
tag_name = Some name;
209
+
is_empty_tag = false }
71
210
72
211
| "EmptyTag", [name_json; attrs_json] ->
73
212
let name = json_string name_json in
74
-
let attrs_obj = json_object attrs_json in
75
-
let attrs = List.map (fun ((n, _), v) -> (n, json_string v)) attrs_obj in
76
-
Some (Dom.create_element name ~attrs ())
213
+
let attrs = parse_attrs attrs_json in
214
+
{ token = Some (EmptyTag (name, attrs));
215
+
node = Some (Dom.create_element name ~attrs ());
216
+
tag_name = Some name;
217
+
is_empty_tag = true }
218
+
219
+
| "EndTag", [_ns_json; name_json] ->
220
+
let name = json_string name_json in
221
+
{ token = Some (EndTag name);
222
+
node = None;
223
+
tag_name = Some name;
224
+
is_empty_tag = false }
225
+
226
+
| "EndTag", [name_json] ->
227
+
let name = json_string name_json in
228
+
{ token = Some (EndTag name);
229
+
node = None;
230
+
tag_name = Some name;
231
+
is_empty_tag = false }
77
232
78
233
| "Characters", [text_json] ->
79
234
let text = json_string text_json in
80
-
Some (Dom.create_text text)
235
+
{ token = Some (TextNode text);
236
+
node = Some (Dom.create_text text);
237
+
tag_name = None;
238
+
is_empty_tag = false }
81
239
82
240
| "Comment", [text_json] ->
83
241
let text = json_string text_json in
84
-
Some (Dom.create_comment text)
242
+
{ token = Some (CommentNode text);
243
+
node = Some (Dom.create_comment text);
244
+
tag_name = None;
245
+
is_empty_tag = false }
85
246
86
247
| "Doctype", [name_json] ->
87
248
let name = json_string name_json in
88
-
Some (Dom.create_doctype ~name ())
249
+
let node = Dom.create_doctype ~name () in
250
+
{ token = Some (DoctypeNode node);
251
+
node = Some node;
252
+
tag_name = None;
253
+
is_empty_tag = false }
89
254
90
255
| "Doctype", [name_json; public_json] ->
91
256
let name = json_string name_json in
92
257
let public_id = json_string_opt public_json in
93
-
(match public_id with
94
-
| Some pub -> Some (Dom.create_doctype ~name ~public_id:pub ())
95
-
| None -> Some (Dom.create_doctype ~name ()))
258
+
let node = match public_id with
259
+
| Some pub -> Dom.create_doctype ~name ~public_id:pub ()
260
+
| None -> Dom.create_doctype ~name ()
261
+
in
262
+
{ token = Some (DoctypeNode node);
263
+
node = Some node;
264
+
tag_name = None;
265
+
is_empty_tag = false }
96
266
97
267
| "Doctype", [name_json; public_json; system_json] ->
98
268
let name = json_string name_json in
99
269
let public_id = json_string_opt public_json in
100
270
let system_id = json_string_opt system_json in
101
-
(match public_id, system_id with
102
-
| Some pub, Some sys -> Some (Dom.create_doctype ~name ~public_id:pub ~system_id:sys ())
103
-
| Some pub, None -> Some (Dom.create_doctype ~name ~public_id:pub ())
104
-
| None, Some sys -> Some (Dom.create_doctype ~name ~system_id:sys ())
105
-
| None, None -> Some (Dom.create_doctype ~name ()))
271
+
let node = match public_id, system_id with
272
+
| Some pub, Some sys -> Dom.create_doctype ~name ~public_id:pub ~system_id:sys ()
273
+
| Some pub, None -> Dom.create_doctype ~name ~public_id:pub ()
274
+
| None, Some sys -> Dom.create_doctype ~name ~system_id:sys ()
275
+
| None, None -> Dom.create_doctype ~name ()
276
+
in
277
+
{ token = Some (DoctypeNode node);
278
+
node = Some node;
279
+
tag_name = None;
280
+
is_empty_tag = false }
106
281
107
-
| _ -> None
282
+
| _ -> { token = None; node = None; tag_name = None; is_empty_tag = false }
108
283
109
-
(* Serialize a single node to HTML (simplified, matches test expectations) *)
284
+
(* Serialize a single node to HTML with options *)
110
285
let escape_text text =
111
286
let buf = Buffer.create (String.length text) in
112
287
String.iter (fun c ->
···
129
304
) value;
130
305
!valid
131
306
132
-
let choose_quote value =
133
-
if String.contains value '"' && not (String.contains value '\'') then '\''
134
-
else '"'
135
-
136
-
let escape_attr_value value quote_char =
307
+
let escape_attr_value value quote_char escape_lt =
137
308
let buf = Buffer.create (String.length value) in
138
309
String.iter (fun c ->
139
310
match c with
140
311
| '&' -> Buffer.add_string buf "&"
141
312
| '"' when quote_char = '"' -> Buffer.add_string buf """
313
+
| '\'' when quote_char = '\'' -> Buffer.add_string buf "'"
314
+
| '<' when escape_lt -> Buffer.add_string buf "<"
142
315
| c -> Buffer.add_char buf c
143
316
) value;
144
317
Buffer.contents buf
145
318
146
-
let serialize_node node =
319
+
let serialize_node opts ~in_raw_text node =
147
320
match node.Dom.name with
148
321
| "#text" ->
149
-
(* Check if parent is a raw text element *)
150
-
escape_text node.Dom.data
322
+
if in_raw_text && not opts.escape_rcdata then
323
+
node.Dom.data
324
+
else
325
+
escape_text node.Dom.data
151
326
| "#comment" ->
152
327
"<!--" ^ node.Dom.data ^ "-->"
153
328
| "!doctype" ->
···
177
352
| None -> Buffer.add_string buf "html");
178
353
Buffer.add_char buf '>';
179
354
Buffer.contents buf
180
-
| _ ->
181
-
(* Element *)
182
-
let buf = Buffer.create 64 in
183
-
Buffer.add_char buf '<';
184
-
Buffer.add_string buf node.Dom.name;
185
-
List.iter (fun (key, value) ->
186
-
Buffer.add_char buf ' ';
187
-
Buffer.add_string buf key;
188
-
if can_unquote_attr_value value then begin
189
-
Buffer.add_char buf '=';
190
-
Buffer.add_string buf value
191
-
end else begin
192
-
let quote = choose_quote value in
193
-
Buffer.add_char buf '=';
194
-
Buffer.add_char buf quote;
195
-
Buffer.add_string buf (escape_attr_value value quote);
196
-
Buffer.add_char buf quote
197
-
end
198
-
) node.Dom.attrs;
199
-
Buffer.add_char buf '>';
200
-
Buffer.contents buf
355
+
| _ -> failwith "serialize_node called with element"
356
+
357
+
let choose_quote value default_quote explicit =
358
+
(* If quote_char was explicitly set, always use it *)
359
+
if explicit then default_quote
360
+
else
361
+
(* Otherwise, if value contains the default quote but not the other, use the other *)
362
+
let has_double = String.contains value '"' in
363
+
let has_single = String.contains value '\'' in
364
+
if has_double && not has_single then '\''
365
+
else if has_single && not has_double then '"'
366
+
else default_quote
367
+
368
+
(* Serialize an element tag (start tag) *)
369
+
let serialize_start_tag opts ~is_empty_tag name attrs =
370
+
let buf = Buffer.create 64 in
371
+
Buffer.add_char buf '<';
372
+
Buffer.add_string buf name;
373
+
(* Sort attributes alphabetically for consistent output *)
374
+
let sorted_attrs = List.sort (fun (a, _) (b, _) -> String.compare a b) attrs in
375
+
List.iter (fun (key, value) ->
376
+
Buffer.add_char buf ' ';
377
+
Buffer.add_string buf key;
378
+
let should_minimize =
379
+
opts.minimize_boolean_attributes &&
380
+
String.lowercase_ascii key = String.lowercase_ascii value
381
+
in
382
+
if should_minimize then
383
+
()
384
+
else if String.length value = 0 then begin
385
+
Buffer.add_char buf '=';
386
+
Buffer.add_char buf opts.quote_char;
387
+
Buffer.add_char buf opts.quote_char
388
+
end else if can_unquote_attr_value value then begin
389
+
Buffer.add_char buf '=';
390
+
Buffer.add_string buf value
391
+
end else begin
392
+
let quote = choose_quote value opts.quote_char opts.quote_char_explicit in
393
+
Buffer.add_char buf '=';
394
+
Buffer.add_char buf quote;
395
+
Buffer.add_string buf (escape_attr_value value quote opts.escape_lt_in_attrs);
396
+
Buffer.add_char buf quote
397
+
end
398
+
) sorted_attrs;
399
+
if opts.use_trailing_solidus && (is_empty_tag || is_void_element name) then
400
+
Buffer.add_string buf " /";
401
+
Buffer.add_char buf '>';
402
+
Buffer.contents buf
403
+
404
+
(* Check if text starts with ASCII whitespace *)
405
+
let text_starts_with_space text =
406
+
String.length text > 0 &&
407
+
let c = text.[0] in
408
+
c = '\t' || c = '\n' || c = '\x0C' || c = '\r' || c = ' '
409
+
410
+
(* Optional tag omission helpers *)
411
+
type next_token =
412
+
| NTComment
413
+
| NTSpace (* Text starting with space *)
414
+
| NTText (* Text not starting with space *)
415
+
| NTStartTag of string
416
+
| NTEmptyTag of string
417
+
| NTEndTag of string
418
+
| NTEOF
419
+
420
+
let classify_next tokens idx =
421
+
if idx >= Array.length tokens then NTEOF
422
+
else match tokens.(idx).token with
423
+
| None -> NTEOF
424
+
| Some (CommentNode _) -> NTComment
425
+
| Some (TextNode text) ->
426
+
if text_starts_with_space text then NTSpace else NTText
427
+
| Some (StartTag (name, _)) -> NTStartTag (String.lowercase_ascii name)
428
+
| Some (EmptyTag (name, _)) -> NTEmptyTag (String.lowercase_ascii name)
429
+
| Some (EndTag name) -> NTEndTag (String.lowercase_ascii name)
430
+
| Some (DoctypeNode _) -> NTEOF (* Treat doctype as if nothing follows *)
431
+
432
+
(* Should we omit a start tag? *)
433
+
let should_omit_start_tag opts name attrs next =
434
+
if not opts.omit_optional_tags then false
435
+
else
436
+
let name = String.lowercase_ascii name in
437
+
match name, next with
438
+
(* html start tag: omit if not followed by comment or space, AND has no attributes *)
439
+
| "html", NTComment -> false
440
+
| "html", NTSpace -> false
441
+
| "html", _ -> attrs = [] (* only omit if no attributes *)
442
+
(* head start tag: omit if followed by element (start/empty tag) *)
443
+
| "head", NTStartTag _ -> true
444
+
| "head", NTEmptyTag _ -> true
445
+
| "head", NTEndTag "head" -> true (* empty head *)
446
+
| "head", NTEOF -> true
447
+
| "head", _ -> false
448
+
(* body start tag: omit if not followed by comment or space, AND has no attributes *)
449
+
| "body", NTComment -> false
450
+
| "body", NTSpace -> false
451
+
| "body", _ -> attrs = [] (* only omit if no attributes *)
452
+
(* colgroup start tag: omit if followed by col element *)
453
+
| "colgroup", NTStartTag "col" -> true
454
+
| "colgroup", NTEmptyTag "col" -> true
455
+
| "colgroup", _ -> false
456
+
(* tbody start tag: omit if first child is tr *)
457
+
| "tbody", NTStartTag "tr" -> true
458
+
| "tbody", _ -> false
459
+
| _ -> false
460
+
461
+
(* Should we omit an end tag? *)
462
+
let should_omit_end_tag opts name next =
463
+
if not opts.omit_optional_tags then false
464
+
else
465
+
let name = String.lowercase_ascii name in
466
+
match name, next with
467
+
(* html end tag: omit if not followed by comment or space *)
468
+
| "html", NTComment -> false
469
+
| "html", NTSpace -> false
470
+
| "html", _ -> true
471
+
(* head end tag: omit if not followed by comment or space *)
472
+
| "head", NTComment -> false
473
+
| "head", NTSpace -> false
474
+
| "head", _ -> true
475
+
(* body end tag: omit if not followed by comment or space *)
476
+
| "body", NTComment -> false
477
+
| "body", NTSpace -> false
478
+
| "body", _ -> true
479
+
(* li end tag: omit if followed by li start tag or parent end tag *)
480
+
| "li", NTStartTag "li" -> true
481
+
| "li", NTEndTag _ -> true
482
+
| "li", NTEOF -> true
483
+
| "li", _ -> false
484
+
(* dt end tag: omit if followed by dt or dd start tag (NOT end tag or EOF!) *)
485
+
| "dt", NTStartTag "dt" -> true
486
+
| "dt", NTStartTag "dd" -> true
487
+
| "dt", _ -> false
488
+
(* dd end tag: omit if followed by dd or dt start tag, or end tag, or EOF *)
489
+
| "dd", NTStartTag "dd" -> true
490
+
| "dd", NTStartTag "dt" -> true
491
+
| "dd", NTEndTag _ -> true
492
+
| "dd", NTEOF -> true
493
+
| "dd", _ -> false
494
+
(* p end tag: omit if followed by block element (start or empty tag), end tag, or EOF *)
495
+
| "p", NTStartTag next_name when is_p_closing_element next_name -> true
496
+
| "p", NTEmptyTag next_name when is_p_closing_element next_name -> true
497
+
| "p", NTEndTag _ -> true
498
+
| "p", NTEOF -> true
499
+
| "p", _ -> false
500
+
(* optgroup end tag: omit if followed by optgroup start tag, end tag, or EOF *)
501
+
| "optgroup", NTStartTag "optgroup" -> true
502
+
| "optgroup", NTEndTag _ -> true
503
+
| "optgroup", NTEOF -> true
504
+
| "optgroup", _ -> false
505
+
(* option end tag: omit if followed by option/optgroup start tag, end tag, or EOF *)
506
+
| "option", NTStartTag "option" -> true
507
+
| "option", NTStartTag "optgroup" -> true
508
+
| "option", NTEndTag _ -> true
509
+
| "option", NTEOF -> true
510
+
| "option", _ -> false
511
+
(* colgroup end tag: omit if not followed by comment, space, or another colgroup *)
512
+
| "colgroup", NTComment -> false
513
+
| "colgroup", NTSpace -> false
514
+
| "colgroup", NTStartTag "colgroup" -> false (* keep end tag when another colgroup follows *)
515
+
| "colgroup", _ -> true
516
+
(* thead end tag: omit if followed by tbody or tfoot start tag *)
517
+
| "thead", NTStartTag "tbody" -> true
518
+
| "thead", NTStartTag "tfoot" -> true
519
+
| "thead", _ -> false
520
+
(* tbody end tag: omit if followed by tbody/tfoot start tag, end tag, or EOF *)
521
+
| "tbody", NTStartTag "tbody" -> true
522
+
| "tbody", NTStartTag "tfoot" -> true
523
+
| "tbody", NTEndTag _ -> true
524
+
| "tbody", NTEOF -> true
525
+
| "tbody", _ -> false
526
+
(* tfoot end tag: omit if followed by tbody start tag, end tag, or EOF *)
527
+
| "tfoot", NTStartTag "tbody" -> true
528
+
| "tfoot", NTEndTag _ -> true
529
+
| "tfoot", NTEOF -> true
530
+
| "tfoot", _ -> false
531
+
(* tr end tag: omit if followed by tr start tag, end tag, or EOF *)
532
+
| "tr", NTStartTag "tr" -> true
533
+
| "tr", NTEndTag _ -> true
534
+
| "tr", NTEOF -> true
535
+
| "tr", _ -> false
536
+
(* td end tag: omit if followed by td/th start tag, end tag, or EOF *)
537
+
| "td", NTStartTag "td" -> true
538
+
| "td", NTStartTag "th" -> true
539
+
| "td", NTEndTag _ -> true
540
+
| "td", NTEOF -> true
541
+
| "td", _ -> false
542
+
(* th end tag: omit if followed by th/td start tag, end tag, or EOF *)
543
+
| "th", NTStartTag "th" -> true
544
+
| "th", NTStartTag "td" -> true
545
+
| "th", NTEndTag _ -> true
546
+
| "th", NTEOF -> true
547
+
| "th", _ -> false
548
+
| _ -> false
201
549
202
550
(* Run a single test *)
203
551
let run_test test =
204
552
try
205
-
(* Build nodes from input tokens *)
206
-
let nodes = List.filter_map build_node_from_token test.input in
553
+
(* Build token infos from input *)
554
+
let token_infos = Array.of_list (List.map build_token_info test.input) in
555
+
let num_tokens = Array.length token_infos in
556
+
557
+
(* Handle inject_meta_charset option *)
558
+
let inject_meta = test.options.inject_meta_charset in
559
+
let encoding = test.options.encoding in
560
+
561
+
(* Serialize with context tracking *)
562
+
let buf = Buffer.create 256 in
563
+
let in_raw_text = ref false in
564
+
let preserve_whitespace = ref false in
565
+
let element_stack : string list ref = ref [] in
566
+
let in_head = ref false in
567
+
let meta_charset_injected = ref false in
568
+
let prev_was_section_end = ref false in (* Track if prev token was thead/tbody/tfoot end *)
207
569
208
-
(* Serialize *)
209
-
let serialized = String.concat "" (List.map serialize_node nodes) in
570
+
for i = 0 to num_tokens - 1 do
571
+
let info = token_infos.(i) in
572
+
let next = classify_next token_infos (i + 1) in
573
+
574
+
match info.token with
575
+
| None -> ()
576
+
577
+
| Some (StartTag (name, attrs)) ->
578
+
let name_lower = String.lowercase_ascii name in
579
+
580
+
(* Track head element *)
581
+
if name_lower = "head" then in_head := true;
582
+
583
+
(* For inject_meta_charset, we need to check if there's any charset meta coming up *)
584
+
(* If yes, don't inject at head start; if no, inject at head start *)
585
+
let should_inject_at_head =
586
+
if not inject_meta || name_lower <> "head" then false
587
+
else match encoding with
588
+
| None -> false
589
+
| Some _ ->
590
+
(* Look ahead to see if there's a charset meta or http-equiv content-type *)
591
+
let has_charset_meta = ref false in
592
+
for j = i + 1 to num_tokens - 1 do
593
+
match token_infos.(j).token with
594
+
| Some (EmptyTag (n, a)) when String.lowercase_ascii n = "meta" ->
595
+
let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") a in
596
+
let has_http_equiv_ct = List.exists (fun (k, v) ->
597
+
String.lowercase_ascii k = "http-equiv" && String.lowercase_ascii v = "content-type") a in
598
+
if has_charset || has_http_equiv_ct then has_charset_meta := true
599
+
| Some (EndTag n) when String.lowercase_ascii n = "head" -> ()
600
+
| _ -> ()
601
+
done;
602
+
not !has_charset_meta
603
+
in
604
+
605
+
(* Special case: tbody start tag cannot be omitted if preceded by section end tag *)
606
+
let can_omit_start =
607
+
if name_lower = "tbody" && !prev_was_section_end then false
608
+
else should_omit_start_tag test.options name attrs next
609
+
in
610
+
prev_was_section_end := false; (* Reset for next iteration *)
611
+
612
+
if should_inject_at_head then begin
613
+
meta_charset_injected := true;
614
+
(* Don't output head start tag if we should omit it *)
615
+
if not can_omit_start then
616
+
Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs);
617
+
Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" (Option.get encoding));
618
+
element_stack := name :: !element_stack;
619
+
if is_raw_text_element name then in_raw_text := true;
620
+
if is_whitespace_preserving_element name then preserve_whitespace := true
621
+
end else if not can_omit_start then begin
622
+
Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs);
623
+
element_stack := name :: !element_stack;
624
+
if is_raw_text_element name then in_raw_text := true;
625
+
if is_whitespace_preserving_element name then preserve_whitespace := true
626
+
end else begin
627
+
element_stack := name :: !element_stack;
628
+
if is_raw_text_element name then in_raw_text := true;
629
+
if is_whitespace_preserving_element name then preserve_whitespace := true
630
+
end
631
+
632
+
| Some (EmptyTag (name, attrs)) ->
633
+
let name_lower = String.lowercase_ascii name in
634
+
prev_was_section_end := false; (* Reset for next iteration *)
635
+
636
+
(* Handle meta charset replacement *)
637
+
if inject_meta && !in_head && name_lower = "meta" then begin
638
+
let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") attrs in
639
+
let has_http_equiv_ct =
640
+
List.exists (fun (k, v) ->
641
+
String.lowercase_ascii k = "http-equiv" &&
642
+
String.lowercase_ascii v = "content-type"
643
+
) attrs
644
+
in
645
+
if has_charset then begin
646
+
(* Replace charset value *)
647
+
match encoding with
648
+
| Some enc ->
649
+
Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" enc)
650
+
| None -> () (* No encoding, skip the meta tag *)
651
+
end else if has_http_equiv_ct then begin
652
+
(* Replace charset in content value *)
653
+
match encoding with
654
+
| Some enc ->
655
+
let new_attrs = List.map (fun (k, v) ->
656
+
if String.lowercase_ascii k = "content" then
657
+
let new_content = Printf.sprintf "text/html; charset=%s" enc in
658
+
(k, new_content)
659
+
else (k, v)
660
+
) attrs in
661
+
Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name new_attrs)
662
+
| None ->
663
+
Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
664
+
end else begin
665
+
(* Regular meta tag, output as normal *)
666
+
Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
667
+
end
668
+
end else
669
+
Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs)
670
+
671
+
| Some (EndTag name) ->
672
+
let name_lower = String.lowercase_ascii name in
673
+
674
+
(* Track head element *)
675
+
if name_lower = "head" then in_head := false;
676
+
677
+
(* Pop from element stack *)
678
+
(match !element_stack with
679
+
| top :: rest when String.lowercase_ascii top = name_lower ->
680
+
element_stack := rest;
681
+
if is_raw_text_element name then in_raw_text := false;
682
+
if is_whitespace_preserving_element name then preserve_whitespace := false
683
+
| _ -> ());
684
+
685
+
let is_section_end = List.mem name_lower ["thead"; "tbody"; "tfoot"] in
686
+
let omit = should_omit_end_tag test.options name next in
687
+
688
+
if not omit then begin
689
+
Buffer.add_string buf "</";
690
+
Buffer.add_string buf name;
691
+
Buffer.add_char buf '>'
692
+
end;
693
+
694
+
(* Track if we omitted a section end tag - next tbody can't be omitted *)
695
+
prev_was_section_end := is_section_end && omit
696
+
697
+
| Some (TextNode text) ->
698
+
prev_was_section_end := false;
699
+
let processed_text =
700
+
if !in_raw_text && not test.options.escape_rcdata then
701
+
text
702
+
else if test.options.strip_whitespace && not !preserve_whitespace then
703
+
escape_text (collapse_whitespace text)
704
+
else
705
+
escape_text text
706
+
in
707
+
Buffer.add_string buf processed_text
708
+
709
+
| Some (CommentNode text) ->
710
+
prev_was_section_end := false;
711
+
Buffer.add_string buf "<!--";
712
+
Buffer.add_string buf text;
713
+
Buffer.add_string buf "-->"
714
+
715
+
| Some (DoctypeNode node) ->
716
+
prev_was_section_end := false;
717
+
Buffer.add_string buf (serialize_node test.options ~in_raw_text:false node)
718
+
done;
719
+
720
+
let serialized = Buffer.contents buf in
210
721
211
722
(* Check if it matches any expected output *)
212
723
let matches = List.exists (fun exp -> serialized = exp) test.expected in
+18
-17
test/test_tokenizer.ml
+18
-17
test/test_tokenizer.ml
···
36
36
initial_states : string list;
37
37
last_start_tag : string option;
38
38
double_escaped : bool;
39
+
xml_mode : bool;
39
40
}
40
41
41
42
(* Unescape double-escaped strings from tests *)
···
118
119
}
119
120
120
121
(* Parse a single test case from JSON *)
121
-
let parse_test_case json =
122
+
let parse_test_case ~xml_mode json =
122
123
let obj = json_object json in
123
124
let description = json_string (json_mem_exn "description" obj) in
124
125
let input = json_string (json_mem_exn "input" obj) in
···
139
140
| Some b -> json_bool b
140
141
| None -> false
141
142
in
142
-
{ description; input; output; errors; initial_states; last_start_tag; double_escaped }
143
+
{ description; input; output; errors; initial_states; last_start_tag; double_escaped; xml_mode }
143
144
144
145
(* Convert state name to State.t *)
145
146
let state_of_string = function
···
222
223
let input = if test.double_escaped then unescape_double test.input else test.input in
223
224
224
225
let collector = TokenCollector.create () in
225
-
let tokenizer = Tokenizer.create (module TokenCollector) collector ~collect_errors:true () in
226
+
let tokenizer = Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in
226
227
227
228
(* Set initial state *)
228
229
Tokenizer.set_state tokenizer initial_state;
···
305
306
306
307
let obj = json_object json in
307
308
308
-
(* Handle both {"tests": [...]} and {"xmlViolationTests": [...], "tests": [...]} formats *)
309
-
let test_arrays =
310
-
let tests = match json_mem "tests" obj with
311
-
| Some t -> json_array t
312
-
| None -> []
313
-
in
314
-
let xml_tests = match json_mem "xmlViolationTests" obj with
315
-
| Some t -> json_array t
316
-
| None -> []
317
-
in
318
-
tests @ xml_tests
309
+
(* Handle both {"tests": [...]} and {"xmlViolationTests": [...]} formats *)
310
+
let regular_tests =
311
+
match json_mem "tests" obj with
312
+
| Some t -> List.map (parse_test_case ~xml_mode:false) (json_array t)
313
+
| None -> []
314
+
in
315
+
let xml_tests =
316
+
match json_mem "xmlViolationTests" obj with
317
+
| Some t -> List.map (parse_test_case ~xml_mode:true) (json_array t)
318
+
| None -> []
319
319
in
320
+
let all_tests = regular_tests @ xml_tests in
320
321
321
322
let filename = Filename.basename path in
322
323
let passed = ref 0 in
323
324
let failed = ref 0 in
324
325
let first_failures = ref [] in
325
326
326
-
List.iteri (fun i test_json ->
327
-
let test = parse_test_case test_json in
327
+
List.iteri (fun i test ->
328
+
(* test is already parsed *)
328
329
329
330
(* Run for each initial state *)
330
331
List.iter (fun state_name ->
···
345
346
first_failures := (i + 1, test.description, state_name, [], [], [], []) :: !first_failures;
346
347
Printf.eprintf "Exception in test %d (%s): %s\n" (i + 1) test.description (Printexc.to_string e)
347
348
) test.initial_states
348
-
) test_arrays;
349
+
) all_tests;
349
350
350
351
(!passed, !failed, List.rev !first_failures, filename)
351
352