OCaml HTML5 parser/serialiser based on Python's JustHTML

100%

Changed files
+818 -99
lib
test
+2 -2
lib/encoding/decode.ml
··· 186 186 match Prescan.prescan_for_meta_charset data with 187 187 | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc) 188 188 | None -> 189 - (* Default to UTF-8 *) 190 - (decode_with_encoding data Encoding.Utf8 ~bom_len:0, Encoding.Utf8) 189 + (* Default to Windows-1252 per HTML5 spec when no encoding detected *) 190 + (decode_with_encoding data Encoding.Windows_1252 ~bom_len:0, Encoding.Windows_1252)
+8 -5
lib/encoding/prescan.ml
··· 97 97 if !j + 2 < len then 98 98 i := !j + 3 99 99 else 100 - result := None (* Unclosed comment, stop scanning *) 100 + i := len (* Unclosed comment - stop scanning *) 101 101 end 102 102 (* Check for end tag - skip it *) 103 103 else if !i + 1 < len && Bytes.get data (!i + 1) = '/' then begin 104 104 let j = ref (!i + 2) in 105 105 let in_quote = ref None in 106 - while !j < len && !j < max_total && !non_comment < max_non_comment do 106 + let done_tag = ref false in 107 + while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do 107 108 let c = Bytes.get data !j in 108 109 match !in_quote with 109 110 | None -> ··· 114 115 end else if c = '>' then begin 115 116 incr j; 116 117 incr non_comment; 117 - j := len (* Exit loop *) 118 + done_tag := true 118 119 end else begin 119 120 incr j; 120 121 incr non_comment ··· 138 139 if tag_name <> "meta" then begin 139 140 (* Skip non-meta tag *) 140 141 let in_quote = ref None in 141 - while !j < len && !j < max_total && !non_comment < max_non_comment do 142 + let done_tag = ref false in 143 + while not !done_tag && !j < len && !j < max_total && !non_comment < max_non_comment do 142 144 let c = Bytes.get data !j in 143 145 match !in_quote with 144 146 | None -> ··· 149 151 end else if c = '>' then begin 150 152 incr j; 151 153 incr non_comment; 152 - j := len 154 + done_tag := true 153 155 end else begin 154 156 incr j; 155 157 incr non_comment ··· 240 242 | None -> ()); 241 243 242 244 (* Check for http-equiv="content-type" with content *) 245 + (* Note: http-equiv value must be exactly "content-type" (case-insensitive) *) 243 246 if !result = None then 244 247 (match !http_equiv, !content with 245 248 | Some he, Some ct when String.lowercase_ascii he = "content-type" ->
+108
lib/tokenizer/stream.ml
··· 22 22 mutable last_was_cr : bool; 23 23 (* Track if we need to skip the next LF from raw stream (set after peek of CR) *) 24 24 mutable skip_next_lf : bool; 25 + (* Error callback for surrogate/noncharacter detection *) 26 + mutable error_callback : (string -> unit) option; 25 27 } 26 28 27 29 (* Create a stream from a Bytes.Reader.t *) ··· 36 38 column = 0; 37 39 last_was_cr = false; 38 40 skip_next_lf = false; 41 + error_callback = None; 39 42 } 40 43 44 + let set_error_callback t cb = 45 + t.error_callback <- Some cb 46 + 47 + (* Check if a Unicode codepoint is a surrogate *) 48 + let is_surrogate cp = cp >= 0xD800 && cp <= 0xDFFF 49 + 50 + (* Check if a Unicode codepoint is a noncharacter *) 51 + let is_noncharacter cp = 52 + (* U+FDD0 to U+FDEF *) 53 + (cp >= 0xFDD0 && cp <= 0xFDEF) || 54 + (* U+FFFE and U+FFFF in each plane (0-16) *) 55 + ((cp land 0xFFFF) = 0xFFFE || (cp land 0xFFFF) = 0xFFFF) 56 + 41 57 (* Create a stream from a string - discouraged, prefer create_from_reader *) 42 58 let create input = 43 59 create_from_reader (Bytes.Reader.of_string input) ··· 78 94 let push_back_char t c = 79 95 t.lookahead <- c :: t.lookahead 80 96 97 + (* Check for surrogates and noncharacters in UTF-8 sequences. 98 + Called after reading a lead byte, peeks continuation bytes to decode codepoint. *) 99 + let check_utf8_codepoint t lead_byte = 100 + let b0 = Char.code lead_byte in 101 + if b0 < 0x80 then 102 + (* ASCII - no surrogates or noncharacters possible in this range except control chars *) 103 + () 104 + else if b0 >= 0xC2 && b0 <= 0xDF then begin 105 + (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *) 106 + (* Check for C1 control characters U+0080-U+009F *) 107 + match read_raw_char t with 108 + | Some c1 when (Char.code c1 land 0xC0) = 0x80 -> 109 + let b1 = Char.code c1 in 110 + let cp = ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) in 111 + push_back_char t c1; 112 + (* C1 controls: U+0080 to U+009F *) 113 + if cp >= 0x80 && cp <= 0x9F then 114 + (match t.error_callback with 115 + | Some cb -> cb "control-character-in-input-stream" 116 + | None -> ()) 117 + | Some c1 -> 118 + push_back_char t c1 119 + | None -> () 120 + end else if b0 >= 0xE0 && b0 <= 0xEF then begin 121 + (* 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx -> U+0800 to U+FFFF *) 122 + (* Need to peek 2 continuation bytes *) 123 + match read_raw_char t with 124 + | Some c1 when (Char.code c1 land 0xC0) = 0x80 -> 125 + let b1 = Char.code c1 in 126 + (match read_raw_char t with 127 + | Some c2 when (Char.code c2 land 0xC0) = 0x80 -> 128 + let b2 = Char.code c2 in 129 + let cp = ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in 130 + push_back_char t c2; 131 + push_back_char t c1; 132 + (* Check for surrogates and noncharacters *) 133 + (match t.error_callback with 134 + | Some cb -> 135 + if is_surrogate cp then cb "surrogate-in-input-stream" 136 + else if is_noncharacter cp then cb "noncharacter-in-input-stream" 137 + | None -> ()) 138 + | Some c2 -> 139 + push_back_char t c2; 140 + push_back_char t c1 141 + | None -> 142 + push_back_char t c1) 143 + | Some c1 -> 144 + push_back_char t c1 145 + | None -> () 146 + end else if b0 >= 0xF0 && b0 <= 0xF4 then begin 147 + (* 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> U+10000 to U+10FFFF *) 148 + match read_raw_char t with 149 + | Some c1 when (Char.code c1 land 0xC0) = 0x80 -> 150 + let b1 = Char.code c1 in 151 + (match read_raw_char t with 152 + | Some c2 when (Char.code c2 land 0xC0) = 0x80 -> 153 + let b2 = Char.code c2 in 154 + (match read_raw_char t with 155 + | Some c3 when (Char.code c3 land 0xC0) = 0x80 -> 156 + let b3 = Char.code c3 in 157 + let cp = ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor 158 + ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in 159 + push_back_char t c3; 160 + push_back_char t c2; 161 + push_back_char t c1; 162 + (* Check for noncharacters (no surrogates in 4-byte range) *) 163 + (match t.error_callback with 164 + | Some cb -> 165 + if is_noncharacter cp then cb "noncharacter-in-input-stream" 166 + | None -> ()) 167 + | Some c3 -> 168 + push_back_char t c3; 169 + push_back_char t c2; 170 + push_back_char t c1 171 + | None -> 172 + push_back_char t c2; 173 + push_back_char t c1) 174 + | Some c2 -> 175 + push_back_char t c2; 176 + push_back_char t c1 177 + | None -> 178 + push_back_char t c1) 179 + | Some c1 -> 180 + push_back_char t c1 181 + | None -> () 182 + end 183 + 81 184 (* Read next char with CR/LF normalization *) 82 185 let rec read_normalized_char t = 186 + (* Track if we're reading from lookahead - if so, we've already checked this byte *) 187 + let from_lookahead = t.lookahead <> [] in 83 188 match read_raw_char t with 84 189 | None -> 85 190 t.last_was_cr <- false; ··· 98 203 read_normalized_char t 99 204 | Some c -> 100 205 t.last_was_cr <- false; 206 + (* Only check for surrogates/noncharacters when reading fresh from stream, 207 + not when re-reading from lookahead (to avoid duplicate errors) *) 208 + if not from_lookahead then check_utf8_codepoint t c; 101 209 Some c 102 210 103 211 let is_eof t =
+105 -9
lib/tokenizer/tokenizer.ml
··· 39 39 mutable pending_chars : Buffer.t; 40 40 mutable errors : Errors.t list; 41 41 collect_errors : bool; 42 + xml_mode : bool; (* XML violation mode: transform chars for XML compatibility *) 42 43 } 43 44 44 - let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) () = { 45 + let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) ?(xml_mode=false) () = { 45 46 stream = Stream.create ""; 46 47 sink; 47 48 state = State.Data; ··· 63 64 pending_chars = Buffer.create 256; 64 65 errors = []; 65 66 collect_errors; 67 + xml_mode; 66 68 } 67 69 68 70 let error t code = ··· 73 75 74 76 (* emit functions are defined locally inside run *) 75 77 78 + (* XML mode character transformation: form feed → space *) 76 79 let emit_char t c = 77 - Buffer.add_char t.pending_chars c 80 + if t.xml_mode && c = '\x0C' then 81 + Buffer.add_char t.pending_chars ' ' 82 + else 83 + Buffer.add_char t.pending_chars c 78 84 85 + (* XML mode string transformation: U+FFFF → U+FFFD, form feed → space *) 79 86 let emit_str t s = 80 - Buffer.add_string t.pending_chars s 87 + if t.xml_mode then begin 88 + (* Transform: \xEF\xBF\xBF (U+FFFF) → \xEF\xBF\xBD (U+FFFD), \x0C → space *) 89 + let len = String.length s in 90 + let i = ref 0 in 91 + while !i < len do 92 + let c = s.[!i] in 93 + if c = '\x0C' then begin 94 + Buffer.add_char t.pending_chars ' '; 95 + incr i 96 + end else if c = '\xEF' && !i + 2 < len && s.[!i+1] = '\xBF' && s.[!i+2] = '\xBF' then begin 97 + (* U+FFFF → U+FFFD *) 98 + Buffer.add_string t.pending_chars "\xEF\xBF\xBD"; 99 + i := !i + 3 100 + end else begin 101 + Buffer.add_char t.pending_chars c; 102 + incr i 103 + end 104 + done 105 + end else 106 + Buffer.add_string t.pending_chars s 81 107 82 108 let start_new_tag t kind = 83 109 Buffer.clear t.current_tag_name; ··· 130 156 let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) = 131 157 t.stream <- Stream.create_from_reader reader; 132 158 t.errors <- []; 159 + (* Set up error callback for surrogate/noncharacter detection in stream *) 160 + (* In XML mode, we don't report noncharacter errors - we transform them instead *) 161 + if not t.xml_mode then 162 + Stream.set_error_callback t.stream (fun code -> error t code); 163 + 164 + (* XML mode transformation for pending chars: U+FFFF → U+FFFD *) 165 + let transform_xml_chars data = 166 + let len = String.length data in 167 + let buf = Buffer.create len in 168 + let i = ref 0 in 169 + while !i < len do 170 + let c = data.[!i] in 171 + if c = '\xEF' && !i + 2 < len && data.[!i+1] = '\xBF' && data.[!i+2] = '\xBF' then begin 172 + (* U+FFFF → U+FFFD *) 173 + Buffer.add_string buf "\xEF\xBF\xBD"; 174 + i := !i + 3 175 + end else begin 176 + Buffer.add_char buf c; 177 + incr i 178 + end 179 + done; 180 + Buffer.contents buf 181 + in 133 182 134 183 (* Local emit functions with access to S *) 135 184 let emit_pending_chars () = 136 185 if Buffer.length t.pending_chars > 0 then begin 137 186 let data = Buffer.contents t.pending_chars in 138 187 Buffer.clear t.pending_chars; 188 + let data = if t.xml_mode then transform_xml_chars data else data in 139 189 ignore (S.process t.sink (Token.Character data)) 140 190 end 141 191 in ··· 180 230 in 181 231 182 232 let emit_current_comment () = 183 - emit (Token.Comment (Buffer.contents t.current_comment)) 233 + let content = Buffer.contents t.current_comment in 234 + let content = 235 + if t.xml_mode then begin 236 + (* XML mode: transform -- to - - in comments *) 237 + let buf = Buffer.create (String.length content + 10) in 238 + let len = String.length content in 239 + let i = ref 0 in 240 + while !i < len do 241 + if !i + 1 < len && content.[!i] = '-' && content.[!i+1] = '-' then begin 242 + Buffer.add_string buf "- -"; 243 + i := !i + 2 244 + end else begin 245 + Buffer.add_char buf content.[!i]; 246 + incr i 247 + end 248 + done; 249 + Buffer.contents buf 250 + end else content 251 + in 252 + emit (Token.Comment content) 184 253 in 185 254 186 255 (* Check for control characters and emit error if needed *) 256 + (* Only checks ASCII control chars; C1 controls (U+0080-U+009F) are 2-byte in UTF-8 *) 187 257 let check_control_char c = 188 258 let code = Char.code c in 189 - (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F-U+009F *) 259 + (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F *) 190 260 (* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *) 261 + (* Note: U+0080-U+009F (C1 controls) are 2-byte UTF-8 sequences starting with 0xC2 *) 262 + (* Note: We only check single-byte control chars here; multi-byte checks are TODO *) 191 263 if (code >= 0x01 && code <= 0x08) || 192 264 code = 0x0B || 193 265 (code >= 0x0E && code <= 0x1F) || 194 - (code >= 0x7F && code <= 0x9F) then 266 + code = 0x7F then 195 267 error t "control-character-in-input-stream" 196 268 in 269 + 197 270 198 271 (* Emit char with control character check *) 199 272 let emit_char_checked c = ··· 294 367 | State.Script_data_escaped 295 368 | State.Script_data_escaped_dash 296 369 | State.Script_data_escaped_dash_dash -> 370 + error t "eof-in-script-html-comment-like-text"; 297 371 emit_pending_chars (); 298 372 ignore (S.process t.sink Token.EOF) 299 373 | State.Script_data_escaped_less_than_sign -> ··· 313 387 | State.Script_data_double_escaped 314 388 | State.Script_data_double_escaped_dash 315 389 | State.Script_data_double_escaped_dash_dash -> 390 + error t "eof-in-script-html-comment-like-text"; 316 391 emit_pending_chars (); 317 392 ignore (S.process t.sink Token.EOF) 318 393 | State.Script_data_double_escaped_less_than_sign -> ··· 647 722 error t "unexpected-null-character"; 648 723 Buffer.add_string t.current_tag_name "\xEF\xBF\xBD" 649 724 | Some c -> 725 + check_control_char c; 650 726 Buffer.add_char t.current_tag_name (ascii_lower c) 651 727 | None -> () 652 728 ··· 1015 1091 Buffer.add_char t.current_attr_name (Option.get c_opt) 1016 1092 | Some c -> 1017 1093 Stream.advance t.stream; 1094 + check_control_char c; 1018 1095 Buffer.add_char t.current_attr_name (ascii_lower c) 1019 1096 1020 1097 and state_after_attribute_name () = ··· 1065 1142 error t "unexpected-null-character"; 1066 1143 Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1067 1144 | Some c -> 1145 + check_control_char c; 1068 1146 Buffer.add_char t.current_attr_value c 1069 1147 | None -> () 1070 1148 ··· 1079 1157 error t "unexpected-null-character"; 1080 1158 Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1081 1159 | Some c -> 1160 + check_control_char c; 1082 1161 Buffer.add_char t.current_attr_value c 1083 1162 | None -> () 1084 1163 ··· 1105 1184 Buffer.add_char t.current_attr_value (Option.get c_opt) 1106 1185 | Some c -> 1107 1186 Stream.advance t.stream; 1187 + check_control_char c; 1108 1188 Buffer.add_char t.current_attr_value c 1109 1189 | None -> () 1110 1190 ··· 1146 1226 error t "unexpected-null-character"; 1147 1227 Buffer.add_string t.current_comment "\xEF\xBF\xBD" 1148 1228 | Some c -> 1229 + check_control_char c; 1149 1230 Buffer.add_char t.current_comment c 1150 1231 | None -> () 1151 1232 ··· 1212 1293 error t "unexpected-null-character"; 1213 1294 Buffer.add_string t.current_comment "\xEF\xBF\xBD" 1214 1295 | Some c -> 1296 + check_control_char c; 1215 1297 Buffer.add_char t.current_comment c 1216 1298 | None -> () 1217 1299 ··· 1327 1409 | None -> () 1328 1410 | Some c -> 1329 1411 Stream.advance t.stream; 1412 + check_control_char c; 1330 1413 start_new_doctype t; 1331 1414 t.current_doctype_name <- Some (Buffer.create 8); 1332 1415 Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c); ··· 1343 1426 error t "unexpected-null-character"; 1344 1427 Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD" 1345 1428 | Some c -> 1429 + check_control_char c; 1346 1430 Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c) 1347 1431 | None -> () 1348 1432 ··· 1356 1440 emit_current_doctype () 1357 1441 | None -> () 1358 1442 | Some _ -> 1443 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1359 1444 if Stream.matches_ci t.stream "PUBLIC" then begin 1360 1445 ignore (Stream.consume_exact_ci t.stream "PUBLIC"); 1361 1446 t.state <- State.After_doctype_public_keyword ··· 1391 1476 emit_current_doctype () 1392 1477 | None -> () 1393 1478 | Some _ -> 1479 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1394 1480 error t "missing-quote-before-doctype-public-identifier"; 1395 1481 t.current_doctype_force_quirks <- true; 1396 1482 t.state <- State.Bogus_doctype ··· 1432 1518 t.state <- State.Data; 1433 1519 emit_current_doctype () 1434 1520 | Some c -> 1521 + check_control_char c; 1435 1522 Buffer.add_char (Option.get t.current_doctype_public) c 1436 1523 | None -> () 1437 1524 ··· 1448 1535 t.state <- State.Data; 1449 1536 emit_current_doctype () 1450 1537 | Some c -> 1538 + check_control_char c; 1451 1539 Buffer.add_char (Option.get t.current_doctype_public) c 1452 1540 | None -> () 1453 1541 ··· 1472 1560 t.state <- State.Doctype_system_identifier_single_quoted 1473 1561 | None -> () 1474 1562 | Some _ -> 1563 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1475 1564 error t "missing-quote-before-doctype-system-identifier"; 1476 1565 t.current_doctype_force_quirks <- true; 1477 1566 t.state <- State.Bogus_doctype ··· 1494 1583 t.state <- State.Doctype_system_identifier_single_quoted 1495 1584 | None -> () 1496 1585 | Some _ -> 1586 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1497 1587 error t "missing-quote-before-doctype-system-identifier"; 1498 1588 t.current_doctype_force_quirks <- true; 1499 1589 t.state <- State.Bogus_doctype ··· 1521 1611 emit_current_doctype () 1522 1612 | None -> () 1523 1613 | Some _ -> 1614 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1524 1615 error t "missing-quote-before-doctype-system-identifier"; 1525 1616 t.current_doctype_force_quirks <- true; 1526 1617 t.state <- State.Bogus_doctype ··· 1545 1636 emit_current_doctype () 1546 1637 | None -> () 1547 1638 | Some _ -> 1639 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1548 1640 error t "missing-quote-before-doctype-system-identifier"; 1549 1641 t.current_doctype_force_quirks <- true; 1550 1642 t.state <- State.Bogus_doctype ··· 1562 1654 t.state <- State.Data; 1563 1655 emit_current_doctype () 1564 1656 | Some c -> 1657 + check_control_char c; 1565 1658 Buffer.add_char (Option.get t.current_doctype_system) c 1566 1659 | None -> () 1567 1660 ··· 1578 1671 t.state <- State.Data; 1579 1672 emit_current_doctype () 1580 1673 | Some c -> 1674 + check_control_char c; 1581 1675 Buffer.add_char (Option.get t.current_doctype_system) c 1582 1676 | None -> () 1583 1677 ··· 1591 1685 emit_current_doctype () 1592 1686 | None -> () 1593 1687 | Some _ -> 1688 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1594 1689 error t "unexpected-character-after-doctype-system-identifier"; 1595 1690 t.state <- State.Bogus_doctype 1596 1691 ··· 1601 1696 emit_current_doctype () 1602 1697 | Some '\x00' -> 1603 1698 error t "unexpected-null-character" 1604 - | Some _ -> () 1699 + | Some c -> 1700 + check_control_char c (* Check all chars in bogus doctype *) 1605 1701 | None -> () 1606 1702 1607 1703 and state_cdata_section () = ··· 1609 1705 | Some ']' -> 1610 1706 t.state <- State.Cdata_section_bracket 1611 1707 | Some c -> 1612 - (* CDATA section emits all characters as-is, including NUL *) 1613 - emit_char t c 1708 + (* CDATA section emits all characters as-is, including NUL, but still check for control chars *) 1709 + emit_char_checked c 1614 1710 | None -> () 1615 1711 1616 1712 and state_cdata_section_bracket () =
+577 -66
test/test_serializer.ml
··· 12 12 | Jsont.String (s, _) -> Some s 13 13 | _ -> failwith "Expected string or null" 14 14 15 + let json_bool = function 16 + | Jsont.Bool (b, _) -> b 17 + | _ -> failwith "Expected bool" 18 + 15 19 let json_array = function 16 20 | Jsont.Array (arr, _) -> arr 17 21 | _ -> failwith "Expected array" ··· 30 34 | Some v -> v 31 35 | None -> failwith ("Missing member: " ^ name) 32 36 37 + (* Serialization options *) 38 + type serialize_options = { 39 + quote_char : char; 40 + quote_char_explicit : bool; (* Was quote_char explicitly set? *) 41 + minimize_boolean_attributes : bool; 42 + use_trailing_solidus : bool; 43 + escape_lt_in_attrs : bool; 44 + escape_rcdata : bool; 45 + strip_whitespace : bool; 46 + inject_meta_charset : bool; 47 + encoding : string option; 48 + omit_optional_tags : bool; 49 + } 50 + 51 + let default_options = { 52 + quote_char = '"'; 53 + quote_char_explicit = false; 54 + minimize_boolean_attributes = true; 55 + use_trailing_solidus = false; 56 + escape_lt_in_attrs = false; 57 + escape_rcdata = false; 58 + strip_whitespace = false; 59 + inject_meta_charset = false; 60 + encoding = None; 61 + omit_optional_tags = true; (* HTML5 default *) 62 + } 63 + 64 + (* Parse options from JSON *) 65 + let parse_options json_opt = 66 + match json_opt with 67 + | None -> default_options 68 + | Some json -> 69 + let obj = json_object json in 70 + let get_bool name default = 71 + match json_mem name obj with 72 + | Some j -> (try json_bool j with _ -> default) 73 + | None -> default 74 + in 75 + let get_string name = 76 + match json_mem name obj with 77 + | Some (Jsont.String (s, _)) -> Some s 78 + | _ -> None 79 + in 80 + let quote_char_opt = 81 + match json_mem "quote_char" obj with 82 + | Some (Jsont.String (s, _)) when String.length s = 1 -> Some s.[0] 83 + | _ -> None 84 + in 85 + { 86 + quote_char = Option.value ~default:'"' quote_char_opt; 87 + quote_char_explicit = Option.is_some quote_char_opt; 88 + minimize_boolean_attributes = get_bool "minimize_boolean_attributes" (get_bool "quote_attr_values" true); 89 + use_trailing_solidus = get_bool "use_trailing_solidus" false; 90 + escape_lt_in_attrs = get_bool "escape_lt_in_attrs" false; 91 + escape_rcdata = get_bool "escape_rcdata" false; 92 + strip_whitespace = get_bool "strip_whitespace" false; 93 + inject_meta_charset = get_bool "inject_meta_charset" false; 94 + encoding = get_string "encoding"; 95 + omit_optional_tags = get_bool "omit_optional_tags" true; 96 + } 97 + 33 98 (* Test case *) 34 99 type test_case = { 35 100 description : string; 36 101 input : Jsont.json list; 37 102 expected : string list; 103 + options : serialize_options; 38 104 } 39 105 40 106 let parse_test_case json = ··· 42 108 let description = json_string (json_mem_exn "description" obj) in 43 109 let input = json_array (json_mem_exn "input" obj) in 44 110 let expected = List.map json_string (json_array (json_mem_exn "expected" obj)) in 45 - { description; input; expected } 111 + let options = parse_options (json_mem "options" obj) in 112 + { description; input; expected; options } 113 + 114 + (* Parse attrs that can be either array [{name, value}] or object {name: value} or empty {} *) 115 + let parse_attrs attrs_json = 116 + match attrs_json with 117 + | Jsont.Array (arr, _) -> 118 + List.map (fun attr_json -> 119 + let attr_obj = json_object attr_json in 120 + let attr_name = json_string (json_mem_exn "name" attr_obj) in 121 + let value = json_string (json_mem_exn "value" attr_obj) in 122 + (attr_name, value) 123 + ) arr 124 + | Jsont.Object (obj, _) -> 125 + List.map (fun ((n, _), v) -> (n, json_string v)) obj 126 + | _ -> [] 46 127 47 - (* Build a DOM node from test input token *) 48 - let build_node_from_token token = 128 + (* Void elements that don't need end tags *) 129 + let is_void_element name = 130 + List.mem (String.lowercase_ascii name) 131 + ["area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input"; 132 + "link"; "meta"; "param"; "source"; "track"; "wbr"] 133 + 134 + (* Raw text elements whose content should not be escaped *) 135 + let is_raw_text_element name = 136 + List.mem (String.lowercase_ascii name) ["script"; "style"] 137 + 138 + (* Elements where whitespace should be preserved *) 139 + let is_whitespace_preserving_element name = 140 + List.mem (String.lowercase_ascii name) ["pre"; "textarea"; "script"; "style"] 141 + 142 + (* Block elements that close a p tag *) 143 + let p_closing_elements = [ 144 + "address"; "article"; "aside"; "blockquote"; "datagrid"; "dialog"; "dir"; 145 + "div"; "dl"; "fieldset"; "footer"; "form"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; 146 + "header"; "hgroup"; "hr"; "main"; "menu"; "nav"; "ol"; "p"; "pre"; "section"; 147 + "table"; "ul" 148 + ] 149 + 150 + let is_p_closing_element name = 151 + List.mem (String.lowercase_ascii name) p_closing_elements 152 + 153 + (* Collapse runs of whitespace to single space *) 154 + let collapse_whitespace text = 155 + let len = String.length text in 156 + let buf = Buffer.create len in 157 + let in_whitespace = ref false in 158 + for i = 0 to len - 1 do 159 + let c = text.[i] in 160 + if c = '\t' || c = '\r' || c = '\n' || c = '\x0C' || c = ' ' then begin 161 + if not !in_whitespace then begin 162 + Buffer.add_char buf ' '; 163 + in_whitespace := true 164 + end 165 + end else begin 166 + Buffer.add_char buf c; 167 + in_whitespace := false 168 + end 169 + done; 170 + Buffer.contents buf 171 + 172 + (* Token types for stream-based serialization *) 173 + type token_type = 174 + | StartTag of string * (string * string) list (* name, attrs *) 175 + | EndTag of string (* name *) 176 + | EmptyTag of string * (string * string) list (* name, attrs *) 177 + | TextNode of string 178 + | CommentNode of string 179 + | DoctypeNode of Dom.node 180 + 181 + type token_info = { 182 + token : token_type option; 183 + node : Dom.node option; (* Legacy for compatibility *) 184 + tag_name : string option; 185 + is_empty_tag : bool; 186 + } 187 + 188 + let build_token_info token = 49 189 let arr = json_array token in 50 190 match arr with 51 - | [] -> None 191 + | [] -> { token = None; node = None; tag_name = None; is_empty_tag = false } 52 192 | type_json :: rest -> 53 - let token_type = json_string type_json in 54 - match token_type, rest with 193 + let token_type_str = json_string type_json in 194 + match token_type_str, rest with 55 195 | "StartTag", [_ns_json; name_json; attrs_json] -> 56 196 let name = json_string name_json in 57 - let attrs_list = json_array attrs_json in 58 - let attrs = List.map (fun attr_json -> 59 - let attr_obj = json_object attr_json in 60 - let attr_name = json_string (json_mem_exn "name" attr_obj) in 61 - let value = json_string (json_mem_exn "value" attr_obj) in 62 - (attr_name, value) 63 - ) attrs_list in 64 - Some (Dom.create_element name ~attrs ()) 197 + let attrs = parse_attrs attrs_json in 198 + { token = Some (StartTag (name, attrs)); 199 + node = Some (Dom.create_element name ~attrs ()); 200 + tag_name = Some name; 201 + is_empty_tag = false } 65 202 66 203 | "StartTag", [name_json; attrs_json] -> 67 204 let name = json_string name_json in 68 - let attrs_obj = json_object attrs_json in 69 - let attrs = List.map (fun ((n, _), v) -> (n, json_string v)) attrs_obj in 70 - Some (Dom.create_element name ~attrs ()) 205 + let attrs = parse_attrs attrs_json in 206 + { token = Some (StartTag (name, attrs)); 207 + node = Some (Dom.create_element name ~attrs ()); 208 + tag_name = Some name; 209 + is_empty_tag = false } 71 210 72 211 | "EmptyTag", [name_json; attrs_json] -> 73 212 let name = json_string name_json in 74 - let attrs_obj = json_object attrs_json in 75 - let attrs = List.map (fun ((n, _), v) -> (n, json_string v)) attrs_obj in 76 - Some (Dom.create_element name ~attrs ()) 213 + let attrs = parse_attrs attrs_json in 214 + { token = Some (EmptyTag (name, attrs)); 215 + node = Some (Dom.create_element name ~attrs ()); 216 + tag_name = Some name; 217 + is_empty_tag = true } 218 + 219 + | "EndTag", [_ns_json; name_json] -> 220 + let name = json_string name_json in 221 + { token = Some (EndTag name); 222 + node = None; 223 + tag_name = Some name; 224 + is_empty_tag = false } 225 + 226 + | "EndTag", [name_json] -> 227 + let name = json_string name_json in 228 + { token = Some (EndTag name); 229 + node = None; 230 + tag_name = Some name; 231 + is_empty_tag = false } 77 232 78 233 | "Characters", [text_json] -> 79 234 let text = json_string text_json in 80 - Some (Dom.create_text text) 235 + { token = Some (TextNode text); 236 + node = Some (Dom.create_text text); 237 + tag_name = None; 238 + is_empty_tag = false } 81 239 82 240 | "Comment", [text_json] -> 83 241 let text = json_string text_json in 84 - Some (Dom.create_comment text) 242 + { token = Some (CommentNode text); 243 + node = Some (Dom.create_comment text); 244 + tag_name = None; 245 + is_empty_tag = false } 85 246 86 247 | "Doctype", [name_json] -> 87 248 let name = json_string name_json in 88 - Some (Dom.create_doctype ~name ()) 249 + let node = Dom.create_doctype ~name () in 250 + { token = Some (DoctypeNode node); 251 + node = Some node; 252 + tag_name = None; 253 + is_empty_tag = false } 89 254 90 255 | "Doctype", [name_json; public_json] -> 91 256 let name = json_string name_json in 92 257 let public_id = json_string_opt public_json in 93 - (match public_id with 94 - | Some pub -> Some (Dom.create_doctype ~name ~public_id:pub ()) 95 - | None -> Some (Dom.create_doctype ~name ())) 258 + let node = match public_id with 259 + | Some pub -> Dom.create_doctype ~name ~public_id:pub () 260 + | None -> Dom.create_doctype ~name () 261 + in 262 + { token = Some (DoctypeNode node); 263 + node = Some node; 264 + tag_name = None; 265 + is_empty_tag = false } 96 266 97 267 | "Doctype", [name_json; public_json; system_json] -> 98 268 let name = json_string name_json in 99 269 let public_id = json_string_opt public_json in 100 270 let system_id = json_string_opt system_json in 101 - (match public_id, system_id with 102 - | Some pub, Some sys -> Some (Dom.create_doctype ~name ~public_id:pub ~system_id:sys ()) 103 - | Some pub, None -> Some (Dom.create_doctype ~name ~public_id:pub ()) 104 - | None, Some sys -> Some (Dom.create_doctype ~name ~system_id:sys ()) 105 - | None, None -> Some (Dom.create_doctype ~name ())) 271 + let node = match public_id, system_id with 272 + | Some pub, Some sys -> Dom.create_doctype ~name ~public_id:pub ~system_id:sys () 273 + | Some pub, None -> Dom.create_doctype ~name ~public_id:pub () 274 + | None, Some sys -> Dom.create_doctype ~name ~system_id:sys () 275 + | None, None -> Dom.create_doctype ~name () 276 + in 277 + { token = Some (DoctypeNode node); 278 + node = Some node; 279 + tag_name = None; 280 + is_empty_tag = false } 106 281 107 - | _ -> None 282 + | _ -> { token = None; node = None; tag_name = None; is_empty_tag = false } 108 283 109 - (* Serialize a single node to HTML (simplified, matches test expectations) *) 284 + (* Serialize a single node to HTML with options *) 110 285 let escape_text text = 111 286 let buf = Buffer.create (String.length text) in 112 287 String.iter (fun c -> ··· 129 304 ) value; 130 305 !valid 131 306 132 - let choose_quote value = 133 - if String.contains value '"' && not (String.contains value '\'') then '\'' 134 - else '"' 135 - 136 - let escape_attr_value value quote_char = 307 + let escape_attr_value value quote_char escape_lt = 137 308 let buf = Buffer.create (String.length value) in 138 309 String.iter (fun c -> 139 310 match c with 140 311 | '&' -> Buffer.add_string buf "&amp;" 141 312 | '"' when quote_char = '"' -> Buffer.add_string buf "&quot;" 313 + | '\'' when quote_char = '\'' -> Buffer.add_string buf "&#39;" 314 + | '<' when escape_lt -> Buffer.add_string buf "&lt;" 142 315 | c -> Buffer.add_char buf c 143 316 ) value; 144 317 Buffer.contents buf 145 318 146 - let serialize_node node = 319 + let serialize_node opts ~in_raw_text node = 147 320 match node.Dom.name with 148 321 | "#text" -> 149 - (* Check if parent is a raw text element *) 150 - escape_text node.Dom.data 322 + if in_raw_text && not opts.escape_rcdata then 323 + node.Dom.data 324 + else 325 + escape_text node.Dom.data 151 326 | "#comment" -> 152 327 "<!--" ^ node.Dom.data ^ "-->" 153 328 | "!doctype" -> ··· 177 352 | None -> Buffer.add_string buf "html"); 178 353 Buffer.add_char buf '>'; 179 354 Buffer.contents buf 180 - | _ -> 181 - (* Element *) 182 - let buf = Buffer.create 64 in 183 - Buffer.add_char buf '<'; 184 - Buffer.add_string buf node.Dom.name; 185 - List.iter (fun (key, value) -> 186 - Buffer.add_char buf ' '; 187 - Buffer.add_string buf key; 188 - if can_unquote_attr_value value then begin 189 - Buffer.add_char buf '='; 190 - Buffer.add_string buf value 191 - end else begin 192 - let quote = choose_quote value in 193 - Buffer.add_char buf '='; 194 - Buffer.add_char buf quote; 195 - Buffer.add_string buf (escape_attr_value value quote); 196 - Buffer.add_char buf quote 197 - end 198 - ) node.Dom.attrs; 199 - Buffer.add_char buf '>'; 200 - Buffer.contents buf 355 + | _ -> failwith "serialize_node called with element" 356 + 357 + let choose_quote value default_quote explicit = 358 + (* If quote_char was explicitly set, always use it *) 359 + if explicit then default_quote 360 + else 361 + (* Otherwise, if value contains the default quote but not the other, use the other *) 362 + let has_double = String.contains value '"' in 363 + let has_single = String.contains value '\'' in 364 + if has_double && not has_single then '\'' 365 + else if has_single && not has_double then '"' 366 + else default_quote 367 + 368 + (* Serialize an element tag (start tag) *) 369 + let serialize_start_tag opts ~is_empty_tag name attrs = 370 + let buf = Buffer.create 64 in 371 + Buffer.add_char buf '<'; 372 + Buffer.add_string buf name; 373 + (* Sort attributes alphabetically for consistent output *) 374 + let sorted_attrs = List.sort (fun (a, _) (b, _) -> String.compare a b) attrs in 375 + List.iter (fun (key, value) -> 376 + Buffer.add_char buf ' '; 377 + Buffer.add_string buf key; 378 + let should_minimize = 379 + opts.minimize_boolean_attributes && 380 + String.lowercase_ascii key = String.lowercase_ascii value 381 + in 382 + if should_minimize then 383 + () 384 + else if String.length value = 0 then begin 385 + Buffer.add_char buf '='; 386 + Buffer.add_char buf opts.quote_char; 387 + Buffer.add_char buf opts.quote_char 388 + end else if can_unquote_attr_value value then begin 389 + Buffer.add_char buf '='; 390 + Buffer.add_string buf value 391 + end else begin 392 + let quote = choose_quote value opts.quote_char opts.quote_char_explicit in 393 + Buffer.add_char buf '='; 394 + Buffer.add_char buf quote; 395 + Buffer.add_string buf (escape_attr_value value quote opts.escape_lt_in_attrs); 396 + Buffer.add_char buf quote 397 + end 398 + ) sorted_attrs; 399 + if opts.use_trailing_solidus && (is_empty_tag || is_void_element name) then 400 + Buffer.add_string buf " /"; 401 + Buffer.add_char buf '>'; 402 + Buffer.contents buf 403 + 404 + (* Check if text starts with ASCII whitespace *) 405 + let text_starts_with_space text = 406 + String.length text > 0 && 407 + let c = text.[0] in 408 + c = '\t' || c = '\n' || c = '\x0C' || c = '\r' || c = ' ' 409 + 410 + (* Optional tag omission helpers *) 411 + type next_token = 412 + | NTComment 413 + | NTSpace (* Text starting with space *) 414 + | NTText (* Text not starting with space *) 415 + | NTStartTag of string 416 + | NTEmptyTag of string 417 + | NTEndTag of string 418 + | NTEOF 419 + 420 + let classify_next tokens idx = 421 + if idx >= Array.length tokens then NTEOF 422 + else match tokens.(idx).token with 423 + | None -> NTEOF 424 + | Some (CommentNode _) -> NTComment 425 + | Some (TextNode text) -> 426 + if text_starts_with_space text then NTSpace else NTText 427 + | Some (StartTag (name, _)) -> NTStartTag (String.lowercase_ascii name) 428 + | Some (EmptyTag (name, _)) -> NTEmptyTag (String.lowercase_ascii name) 429 + | Some (EndTag name) -> NTEndTag (String.lowercase_ascii name) 430 + | Some (DoctypeNode _) -> NTEOF (* Treat doctype as if nothing follows *) 431 + 432 + (* Should we omit a start tag? *) 433 + let should_omit_start_tag opts name attrs next = 434 + if not opts.omit_optional_tags then false 435 + else 436 + let name = String.lowercase_ascii name in 437 + match name, next with 438 + (* html start tag: omit if not followed by comment or space, AND has no attributes *) 439 + | "html", NTComment -> false 440 + | "html", NTSpace -> false 441 + | "html", _ -> attrs = [] (* only omit if no attributes *) 442 + (* head start tag: omit if followed by element (start/empty tag) *) 443 + | "head", NTStartTag _ -> true 444 + | "head", NTEmptyTag _ -> true 445 + | "head", NTEndTag "head" -> true (* empty head *) 446 + | "head", NTEOF -> true 447 + | "head", _ -> false 448 + (* body start tag: omit if not followed by comment or space, AND has no attributes *) 449 + | "body", NTComment -> false 450 + | "body", NTSpace -> false 451 + | "body", _ -> attrs = [] (* only omit if no attributes *) 452 + (* colgroup start tag: omit if followed by col element *) 453 + | "colgroup", NTStartTag "col" -> true 454 + | "colgroup", NTEmptyTag "col" -> true 455 + | "colgroup", _ -> false 456 + (* tbody start tag: omit if first child is tr *) 457 + | "tbody", NTStartTag "tr" -> true 458 + | "tbody", _ -> false 459 + | _ -> false 460 + 461 + (* Should we omit an end tag? *) 462 + let should_omit_end_tag opts name next = 463 + if not opts.omit_optional_tags then false 464 + else 465 + let name = String.lowercase_ascii name in 466 + match name, next with 467 + (* html end tag: omit if not followed by comment or space *) 468 + | "html", NTComment -> false 469 + | "html", NTSpace -> false 470 + | "html", _ -> true 471 + (* head end tag: omit if not followed by comment or space *) 472 + | "head", NTComment -> false 473 + | "head", NTSpace -> false 474 + | "head", _ -> true 475 + (* body end tag: omit if not followed by comment or space *) 476 + | "body", NTComment -> false 477 + | "body", NTSpace -> false 478 + | "body", _ -> true 479 + (* li end tag: omit if followed by li start tag or parent end tag *) 480 + | "li", NTStartTag "li" -> true 481 + | "li", NTEndTag _ -> true 482 + | "li", NTEOF -> true 483 + | "li", _ -> false 484 + (* dt end tag: omit if followed by dt or dd start tag (NOT end tag or EOF!) *) 485 + | "dt", NTStartTag "dt" -> true 486 + | "dt", NTStartTag "dd" -> true 487 + | "dt", _ -> false 488 + (* dd end tag: omit if followed by dd or dt start tag, or end tag, or EOF *) 489 + | "dd", NTStartTag "dd" -> true 490 + | "dd", NTStartTag "dt" -> true 491 + | "dd", NTEndTag _ -> true 492 + | "dd", NTEOF -> true 493 + | "dd", _ -> false 494 + (* p end tag: omit if followed by block element (start or empty tag), end tag, or EOF *) 495 + | "p", NTStartTag next_name when is_p_closing_element next_name -> true 496 + | "p", NTEmptyTag next_name when is_p_closing_element next_name -> true 497 + | "p", NTEndTag _ -> true 498 + | "p", NTEOF -> true 499 + | "p", _ -> false 500 + (* optgroup end tag: omit if followed by optgroup start tag, end tag, or EOF *) 501 + | "optgroup", NTStartTag "optgroup" -> true 502 + | "optgroup", NTEndTag _ -> true 503 + | "optgroup", NTEOF -> true 504 + | "optgroup", _ -> false 505 + (* option end tag: omit if followed by option/optgroup start tag, end tag, or EOF *) 506 + | "option", NTStartTag "option" -> true 507 + | "option", NTStartTag "optgroup" -> true 508 + | "option", NTEndTag _ -> true 509 + | "option", NTEOF -> true 510 + | "option", _ -> false 511 + (* colgroup end tag: omit if not followed by comment, space, or another colgroup *) 512 + | "colgroup", NTComment -> false 513 + | "colgroup", NTSpace -> false 514 + | "colgroup", NTStartTag "colgroup" -> false (* keep end tag when another colgroup follows *) 515 + | "colgroup", _ -> true 516 + (* thead end tag: omit if followed by tbody or tfoot start tag *) 517 + | "thead", NTStartTag "tbody" -> true 518 + | "thead", NTStartTag "tfoot" -> true 519 + | "thead", _ -> false 520 + (* tbody end tag: omit if followed by tbody/tfoot start tag, end tag, or EOF *) 521 + | "tbody", NTStartTag "tbody" -> true 522 + | "tbody", NTStartTag "tfoot" -> true 523 + | "tbody", NTEndTag _ -> true 524 + | "tbody", NTEOF -> true 525 + | "tbody", _ -> false 526 + (* tfoot end tag: omit if followed by tbody start tag, end tag, or EOF *) 527 + | "tfoot", NTStartTag "tbody" -> true 528 + | "tfoot", NTEndTag _ -> true 529 + | "tfoot", NTEOF -> true 530 + | "tfoot", _ -> false 531 + (* tr end tag: omit if followed by tr start tag, end tag, or EOF *) 532 + | "tr", NTStartTag "tr" -> true 533 + | "tr", NTEndTag _ -> true 534 + | "tr", NTEOF -> true 535 + | "tr", _ -> false 536 + (* td end tag: omit if followed by td/th start tag, end tag, or EOF *) 537 + | "td", NTStartTag "td" -> true 538 + | "td", NTStartTag "th" -> true 539 + | "td", NTEndTag _ -> true 540 + | "td", NTEOF -> true 541 + | "td", _ -> false 542 + (* th end tag: omit if followed by th/td start tag, end tag, or EOF *) 543 + | "th", NTStartTag "th" -> true 544 + | "th", NTStartTag "td" -> true 545 + | "th", NTEndTag _ -> true 546 + | "th", NTEOF -> true 547 + | "th", _ -> false 548 + | _ -> false 201 549 202 550 (* Run a single test *) 203 551 let run_test test = 204 552 try 205 - (* Build nodes from input tokens *) 206 - let nodes = List.filter_map build_node_from_token test.input in 553 + (* Build token infos from input *) 554 + let token_infos = Array.of_list (List.map build_token_info test.input) in 555 + let num_tokens = Array.length token_infos in 556 + 557 + (* Handle inject_meta_charset option *) 558 + let inject_meta = test.options.inject_meta_charset in 559 + let encoding = test.options.encoding in 560 + 561 + (* Serialize with context tracking *) 562 + let buf = Buffer.create 256 in 563 + let in_raw_text = ref false in 564 + let preserve_whitespace = ref false in 565 + let element_stack : string list ref = ref [] in 566 + let in_head = ref false in 567 + let meta_charset_injected = ref false in 568 + let prev_was_section_end = ref false in (* Track if prev token was thead/tbody/tfoot end *) 207 569 208 - (* Serialize *) 209 - let serialized = String.concat "" (List.map serialize_node nodes) in 570 + for i = 0 to num_tokens - 1 do 571 + let info = token_infos.(i) in 572 + let next = classify_next token_infos (i + 1) in 573 + 574 + match info.token with 575 + | None -> () 576 + 577 + | Some (StartTag (name, attrs)) -> 578 + let name_lower = String.lowercase_ascii name in 579 + 580 + (* Track head element *) 581 + if name_lower = "head" then in_head := true; 582 + 583 + (* For inject_meta_charset, we need to check if there's any charset meta coming up *) 584 + (* If yes, don't inject at head start; if no, inject at head start *) 585 + let should_inject_at_head = 586 + if not inject_meta || name_lower <> "head" then false 587 + else match encoding with 588 + | None -> false 589 + | Some _ -> 590 + (* Look ahead to see if there's a charset meta or http-equiv content-type *) 591 + let has_charset_meta = ref false in 592 + for j = i + 1 to num_tokens - 1 do 593 + match token_infos.(j).token with 594 + | Some (EmptyTag (n, a)) when String.lowercase_ascii n = "meta" -> 595 + let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") a in 596 + let has_http_equiv_ct = List.exists (fun (k, v) -> 597 + String.lowercase_ascii k = "http-equiv" && String.lowercase_ascii v = "content-type") a in 598 + if has_charset || has_http_equiv_ct then has_charset_meta := true 599 + | Some (EndTag n) when String.lowercase_ascii n = "head" -> () 600 + | _ -> () 601 + done; 602 + not !has_charset_meta 603 + in 604 + 605 + (* Special case: tbody start tag cannot be omitted if preceded by section end tag *) 606 + let can_omit_start = 607 + if name_lower = "tbody" && !prev_was_section_end then false 608 + else should_omit_start_tag test.options name attrs next 609 + in 610 + prev_was_section_end := false; (* Reset for next iteration *) 611 + 612 + if should_inject_at_head then begin 613 + meta_charset_injected := true; 614 + (* Don't output head start tag if we should omit it *) 615 + if not can_omit_start then 616 + Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs); 617 + Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" (Option.get encoding)); 618 + element_stack := name :: !element_stack; 619 + if is_raw_text_element name then in_raw_text := true; 620 + if is_whitespace_preserving_element name then preserve_whitespace := true 621 + end else if not can_omit_start then begin 622 + Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:false name attrs); 623 + element_stack := name :: !element_stack; 624 + if is_raw_text_element name then in_raw_text := true; 625 + if is_whitespace_preserving_element name then preserve_whitespace := true 626 + end else begin 627 + element_stack := name :: !element_stack; 628 + if is_raw_text_element name then in_raw_text := true; 629 + if is_whitespace_preserving_element name then preserve_whitespace := true 630 + end 631 + 632 + | Some (EmptyTag (name, attrs)) -> 633 + let name_lower = String.lowercase_ascii name in 634 + prev_was_section_end := false; (* Reset for next iteration *) 635 + 636 + (* Handle meta charset replacement *) 637 + if inject_meta && !in_head && name_lower = "meta" then begin 638 + let has_charset = List.exists (fun (k, _) -> String.lowercase_ascii k = "charset") attrs in 639 + let has_http_equiv_ct = 640 + List.exists (fun (k, v) -> 641 + String.lowercase_ascii k = "http-equiv" && 642 + String.lowercase_ascii v = "content-type" 643 + ) attrs 644 + in 645 + if has_charset then begin 646 + (* Replace charset value *) 647 + match encoding with 648 + | Some enc -> 649 + Buffer.add_string buf (Printf.sprintf "<meta charset=%s>" enc) 650 + | None -> () (* No encoding, skip the meta tag *) 651 + end else if has_http_equiv_ct then begin 652 + (* Replace charset in content value *) 653 + match encoding with 654 + | Some enc -> 655 + let new_attrs = List.map (fun (k, v) -> 656 + if String.lowercase_ascii k = "content" then 657 + let new_content = Printf.sprintf "text/html; charset=%s" enc in 658 + (k, new_content) 659 + else (k, v) 660 + ) attrs in 661 + Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name new_attrs) 662 + | None -> 663 + Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs) 664 + end else begin 665 + (* Regular meta tag, output as normal *) 666 + Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs) 667 + end 668 + end else 669 + Buffer.add_string buf (serialize_start_tag test.options ~is_empty_tag:true name attrs) 670 + 671 + | Some (EndTag name) -> 672 + let name_lower = String.lowercase_ascii name in 673 + 674 + (* Track head element *) 675 + if name_lower = "head" then in_head := false; 676 + 677 + (* Pop from element stack *) 678 + (match !element_stack with 679 + | top :: rest when String.lowercase_ascii top = name_lower -> 680 + element_stack := rest; 681 + if is_raw_text_element name then in_raw_text := false; 682 + if is_whitespace_preserving_element name then preserve_whitespace := false 683 + | _ -> ()); 684 + 685 + let is_section_end = List.mem name_lower ["thead"; "tbody"; "tfoot"] in 686 + let omit = should_omit_end_tag test.options name next in 687 + 688 + if not omit then begin 689 + Buffer.add_string buf "</"; 690 + Buffer.add_string buf name; 691 + Buffer.add_char buf '>' 692 + end; 693 + 694 + (* Track if we omitted a section end tag - next tbody can't be omitted *) 695 + prev_was_section_end := is_section_end && omit 696 + 697 + | Some (TextNode text) -> 698 + prev_was_section_end := false; 699 + let processed_text = 700 + if !in_raw_text && not test.options.escape_rcdata then 701 + text 702 + else if test.options.strip_whitespace && not !preserve_whitespace then 703 + escape_text (collapse_whitespace text) 704 + else 705 + escape_text text 706 + in 707 + Buffer.add_string buf processed_text 708 + 709 + | Some (CommentNode text) -> 710 + prev_was_section_end := false; 711 + Buffer.add_string buf "<!--"; 712 + Buffer.add_string buf text; 713 + Buffer.add_string buf "-->" 714 + 715 + | Some (DoctypeNode node) -> 716 + prev_was_section_end := false; 717 + Buffer.add_string buf (serialize_node test.options ~in_raw_text:false node) 718 + done; 719 + 720 + let serialized = Buffer.contents buf in 210 721 211 722 (* Check if it matches any expected output *) 212 723 let matches = List.exists (fun exp -> serialized = exp) test.expected in
+18 -17
test/test_tokenizer.ml
··· 36 36 initial_states : string list; 37 37 last_start_tag : string option; 38 38 double_escaped : bool; 39 + xml_mode : bool; 39 40 } 40 41 41 42 (* Unescape double-escaped strings from tests *) ··· 118 119 } 119 120 120 121 (* Parse a single test case from JSON *) 121 - let parse_test_case json = 122 + let parse_test_case ~xml_mode json = 122 123 let obj = json_object json in 123 124 let description = json_string (json_mem_exn "description" obj) in 124 125 let input = json_string (json_mem_exn "input" obj) in ··· 139 140 | Some b -> json_bool b 140 141 | None -> false 141 142 in 142 - { description; input; output; errors; initial_states; last_start_tag; double_escaped } 143 + { description; input; output; errors; initial_states; last_start_tag; double_escaped; xml_mode } 143 144 144 145 (* Convert state name to State.t *) 145 146 let state_of_string = function ··· 222 223 let input = if test.double_escaped then unescape_double test.input else test.input in 223 224 224 225 let collector = TokenCollector.create () in 225 - let tokenizer = Tokenizer.create (module TokenCollector) collector ~collect_errors:true () in 226 + let tokenizer = Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in 226 227 227 228 (* Set initial state *) 228 229 Tokenizer.set_state tokenizer initial_state; ··· 305 306 306 307 let obj = json_object json in 307 308 308 - (* Handle both {"tests": [...]} and {"xmlViolationTests": [...], "tests": [...]} formats *) 309 - let test_arrays = 310 - let tests = match json_mem "tests" obj with 311 - | Some t -> json_array t 312 - | None -> [] 313 - in 314 - let xml_tests = match json_mem "xmlViolationTests" obj with 315 - | Some t -> json_array t 316 - | None -> [] 317 - in 318 - tests @ xml_tests 309 + (* Handle both {"tests": [...]} and {"xmlViolationTests": [...]} formats *) 310 + let regular_tests = 311 + match json_mem "tests" obj with 312 + | Some t -> List.map (parse_test_case ~xml_mode:false) (json_array t) 313 + | None -> [] 314 + in 315 + let xml_tests = 316 + match json_mem "xmlViolationTests" obj with 317 + | Some t -> List.map (parse_test_case ~xml_mode:true) (json_array t) 318 + | None -> [] 319 319 in 320 + let all_tests = regular_tests @ xml_tests in 320 321 321 322 let filename = Filename.basename path in 322 323 let passed = ref 0 in 323 324 let failed = ref 0 in 324 325 let first_failures = ref [] in 325 326 326 - List.iteri (fun i test_json -> 327 - let test = parse_test_case test_json in 327 + List.iteri (fun i test -> 328 + (* test is already parsed *) 328 329 329 330 (* Run for each initial state *) 330 331 List.iter (fun state_name -> ··· 345 346 first_failures := (i + 1, test.description, state_name, [], [], [], []) :: !first_failures; 346 347 Printf.eprintf "Exception in test %d (%s): %s\n" (i + 1) test.description (Printexc.to_string e) 347 348 ) test.initial_states 348 - ) test_arrays; 349 + ) all_tests; 349 350 350 351 (!passed, !failed, List.rev !first_failures, filename) 351 352