OCaml HTML5 parser/serialiser based on Python's JustHTML

Rearrange to be wrapped under one Html5rw module

-4
lib/dom/dune
··· 1 - (library 2 - (name html5rw_dom) 3 - (public_name html5rw.dom) 4 - (libraries bytesrw))
+5 -5
lib/dom/html5rw_dom.ml lib/html5rw/dom/dom.ml
··· 10 10 and serialization. 11 11 *) 12 12 13 - include Node 13 + include Dom_node 14 14 15 - let to_html = Serialize.to_html 16 - let to_writer = Serialize.to_writer 17 - let to_test_format = Serialize.to_test_format 18 - let to_text = Serialize.to_text 15 + let to_html = Dom_serialize.to_html 16 + let to_writer = Dom_serialize.to_writer 17 + let to_test_format = Dom_serialize.to_test_format 18 + let to_text = Dom_serialize.to_text
+3 -3
lib/dom/html5rw_dom.mli lib/html5rw/dom/dom.mli
··· 128 128 @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode> 129 129 WHATWG: DOCTYPE handling during parsing 130 130 *) 131 - type doctype_data = Node.doctype_data = { 131 + type doctype_data = Dom_node.doctype_data = { 132 132 name : string option; (** The DOCTYPE name, e.g., "html" *) 133 133 public_id : string option; (** Public identifier (legacy, rarely used) *) 134 134 system_id : string option; (** System identifier (legacy, rarely used) *) ··· 172 172 @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode> 173 173 WHATWG: How the parser determines quirks mode 174 174 *) 175 - type quirks_mode = Node.quirks_mode = No_quirks | Quirks | Limited_quirks 175 + type quirks_mode = Dom_node.quirks_mode = No_quirks | Quirks | Limited_quirks 176 176 177 177 (** A DOM node in the parsed document tree. 178 178 ··· 240 240 @see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element> 241 241 WHATWG: The template element 242 242 *) 243 - type node = Node.node = { 243 + type node = Dom_node.node = { 244 244 mutable name : string; 245 245 (** Tag name for elements, or special name for other node types. 246 246
lib/dom/node.ml lib/html5rw/dom/dom_node.ml
lib/dom/node.mli lib/html5rw/dom/dom_node.mli
+1 -1
lib/dom/serialize.ml lib/html5rw/dom/dom_serialize.ml
··· 6 6 (* HTML5 DOM serialization *) 7 7 8 8 open Bytesrw 9 - open Node 9 + open Dom_node 10 10 11 11 (* Void elements that don't have end tags *) 12 12 let void_elements = [
+3 -3
lib/encoding/bom.ml lib/html5rw/encoding/encoding_bom.ml
··· 6 6 Bytes.get data 0 = '\xEF' && 7 7 Bytes.get data 1 = '\xBB' && 8 8 Bytes.get data 2 = '\xBF' then 9 - Some (Encoding.Utf8, 3) 9 + Some (Encoding_types.Utf8, 3) 10 10 else if len >= 2 && 11 11 Bytes.get data 0 = '\xFF' && 12 12 Bytes.get data 1 = '\xFE' then 13 - Some (Encoding.Utf16le, 2) 13 + Some (Encoding_types.Utf16le, 2) 14 14 else if len >= 2 && 15 15 Bytes.get data 0 = '\xFE' && 16 16 Bytes.get data 1 = '\xFF' then 17 - Some (Encoding.Utf16be, 2) 17 + Some (Encoding_types.Utf16be, 2) 18 18 else 19 19 None
+10 -10
lib/encoding/decode.ml lib/html5rw/encoding/encoding_decode.ml
··· 51 51 52 52 let decode_with_encoding data enc ~bom_len = 53 53 match enc with 54 - | Encoding.Utf8 -> 54 + | Encoding_types.Utf8 -> 55 55 (* UTF-8: Just validate and replace errors with replacement character *) 56 56 let len = Bytes.length data in 57 57 let buf = Buffer.create len in ··· 74 74 loop (); 75 75 Buffer.contents buf 76 76 77 - | Encoding.Utf16le -> decode_utf16 data ~is_le:true ~bom_len 78 - | Encoding.Utf16be -> decode_utf16 data ~is_le:false ~bom_len 77 + | Encoding_types.Utf16le -> decode_utf16 data ~is_le:true ~bom_len 78 + | Encoding_types.Utf16be -> decode_utf16 data ~is_le:false ~bom_len 79 79 80 - | Encoding.Windows_1252 -> 80 + | Encoding_types.Windows_1252 -> 81 81 (* Windows-1252 mapping table for 0x80-0x9F range *) 82 82 let len = Bytes.length data in 83 83 let buf = Buffer.create len in ··· 98 98 done; 99 99 Buffer.contents buf 100 100 101 - | Encoding.Iso_8859_2 -> 101 + | Encoding_types.Iso_8859_2 -> 102 102 (* Use uuuu for ISO-8859-2 decoding *) 103 103 let len = Bytes.length data in 104 104 let buf = Buffer.create len in ··· 109 109 ) () s; 110 110 Buffer.contents buf 111 111 112 - | Encoding.Euc_jp -> 112 + | Encoding_types.Euc_jp -> 113 113 (* For EUC-JP, use uutf with best effort *) 114 114 let len = Bytes.length data in 115 115 let buf = Buffer.create len in ··· 126 126 127 127 let decode data ?transport_encoding () = 128 128 (* Step 1: Check for BOM *) 129 - let bom_result = Bom.sniff data in 129 + let bom_result = Encoding_bom.sniff data in 130 130 match bom_result with 131 131 | Some (enc, bom_len) -> 132 132 (decode_with_encoding data enc ~bom_len, enc) ··· 134 134 (* Step 2: Check transport encoding (e.g., HTTP Content-Type) *) 135 135 let enc_from_transport = 136 136 match transport_encoding with 137 - | Some te -> Labels.normalize_label te 137 + | Some te -> Encoding_labels.normalize_label te 138 138 | None -> None 139 139 in 140 140 match enc_from_transport with 141 141 | Some enc -> (decode_with_encoding data enc 
~bom_len:0, enc) 142 142 | None -> 143 143 (* Step 3: Prescan for meta charset *) 144 - match Prescan.prescan_for_meta_charset data with 144 + match Encoding_prescan.prescan_for_meta_charset data with 145 145 | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc) 146 146 | None -> 147 147 (* Default to Windows-1252 per HTML5 spec when no encoding detected *) 148 - (decode_with_encoding data Encoding.Windows_1252 ~bom_len:0, Encoding.Windows_1252) 148 + (decode_with_encoding data Encoding_types.Windows_1252 ~bom_len:0, Encoding_types.Windows_1252)
-4
lib/encoding/dune
··· 1 - (library 2 - (name html5rw_encoding) 3 - (public_name html5rw.encoding) 4 - (libraries astring bytesrw uutf uuuu))
lib/encoding/encoding.ml lib/html5rw/encoding/encoding_types.ml
+6 -6
lib/encoding/html5rw_encoding.ml lib/html5rw/encoding/encoding.ml
··· 30 30 encodings, but this implementation focuses on the most common ones. 31 31 Other encodings are mapped to their closest equivalent. 32 32 *) 33 - type encoding = Encoding.t = 33 + type encoding = Encoding_types.t = 34 34 | Utf8 (** UTF-8 encoding (default) *) 35 35 | Utf16le (** UTF-16 little-endian *) 36 36 | Utf16be (** UTF-16 big-endian *) ··· 44 44 45 45 Returns the WHATWG canonical name, e.g., ["utf-8"], ["utf-16le"]. 46 46 *) 47 - let encoding_to_string = Encoding.to_string 47 + let encoding_to_string = Encoding_types.to_string 48 48 49 49 (** Detect encoding from a byte order mark. 50 50 ··· 54 54 @return [(Some (encoding, skip_bytes))] if a BOM is found, 55 55 [None] otherwise. 56 56 *) 57 - let sniff_bom = Bom.sniff 57 + let sniff_bom = Encoding_bom.sniff 58 58 59 59 (** Normalize an encoding label to its canonical form. 60 60 ··· 69 69 normalize_label "latin1" (* Some Windows_1252 *) 70 70 ]} 71 71 *) 72 - let normalize_label = Labels.normalize_label 72 + let normalize_label = Encoding_labels.normalize_label 73 73 74 74 (** Prescan bytes to find a meta charset declaration. 75 75 ··· 78 78 79 79 @return [Some encoding] if a meta charset is found, [None] otherwise. 80 80 *) 81 - let prescan_for_meta_charset = Prescan.prescan_for_meta_charset 81 + let prescan_for_meta_charset = Encoding_prescan.prescan_for_meta_charset 82 82 83 83 (** {1 Decoding} *) 84 84 ··· 98 98 (* html is now a UTF-8 string, enc is the detected encoding *) 99 99 ]} 100 100 *) 101 - let decode = Decode.decode 101 + let decode = Encoding_decode.decode
+1 -1
lib/encoding/html5rw_encoding.mli lib/html5rw/encoding/encoding.mli
··· 30 30 encodings, but this implementation focuses on the most common ones. 31 31 Other encodings are mapped to their closest equivalent. 32 32 *) 33 - type encoding = Encoding.t = 33 + type encoding = Encoding_types.t = 34 34 | Utf8 (** UTF-8 encoding (default) *) 35 35 | Utf16le (** UTF-16 little-endian *) 36 36 | Utf16be (** UTF-16 big-endian *)
+10 -10
lib/encoding/labels.ml lib/html5rw/encoding/encoding_labels.ml
··· 8 8 else 9 9 (* Security: never allow utf-7 *) 10 10 if s = "utf-7" || s = "utf7" || s = "x-utf-7" then 11 - Some Encoding.Windows_1252 11 + Some Encoding_types.Windows_1252 12 12 else if s = "utf-8" || s = "utf8" then 13 - Some Encoding.Utf8 13 + Some Encoding_types.Utf8 14 14 (* HTML treats latin-1 labels as windows-1252 *) 15 15 else if s = "iso-8859-1" || s = "iso8859-1" || s = "latin1" || 16 16 s = "latin-1" || s = "l1" || s = "cp819" || s = "ibm819" then 17 - Some Encoding.Windows_1252 17 + Some Encoding_types.Windows_1252 18 18 else if s = "windows-1252" || s = "windows1252" || s = "cp1252" || s = "x-cp1252" then 19 - Some Encoding.Windows_1252 19 + Some Encoding_types.Windows_1252 20 20 else if s = "iso-8859-2" || s = "iso8859-2" || s = "latin2" || s = "latin-2" then 21 - Some Encoding.Iso_8859_2 21 + Some Encoding_types.Iso_8859_2 22 22 else if s = "euc-jp" || s = "eucjp" then 23 - Some Encoding.Euc_jp 23 + Some Encoding_types.Euc_jp 24 24 else if s = "utf-16" || s = "utf16" then 25 - Some Encoding.Utf16le (* Default to LE for ambiguous utf-16 *) 25 + Some Encoding_types.Utf16le (* Default to LE for ambiguous utf-16 *) 26 26 else if s = "utf-16le" || s = "utf16le" then 27 - Some Encoding.Utf16le 27 + Some Encoding_types.Utf16le 28 28 else if s = "utf-16be" || s = "utf16be" then 29 - Some Encoding.Utf16be 29 + Some Encoding_types.Utf16be 30 30 else 31 31 None 32 32 ··· 37 37 (* Per HTML meta charset handling: ignore UTF-16/UTF-32 declarations and 38 38 treat them as UTF-8 *) 39 39 match enc with 40 - | Encoding.Utf16le | Encoding.Utf16be -> Some Encoding.Utf8 40 + | Encoding_types.Utf16le | Encoding_types.Utf16be -> Some Encoding_types.Utf8 41 41 | other -> Some other
+2 -2
lib/encoding/prescan.ml lib/html5rw/encoding/encoding_prescan.ml
··· 229 229 (* Check for charset *) 230 230 (match !charset with 231 231 | Some cs -> 232 - (match Labels.normalize_meta_declared cs with 232 + (match Encoding_labels.normalize_meta_declared cs with 233 233 | Some enc -> result := Some enc 234 234 | None -> ()) 235 235 | None -> ()); ··· 241 241 | Some he, Some ct when String.lowercase_ascii he = "content-type" -> 242 242 (match extract_charset_from_content ct with 243 243 | Some extracted -> 244 - (match Labels.normalize_meta_declared extracted with 244 + (match Encoding_labels.normalize_meta_declared extracted with 245 245 | Some enc -> result := Some enc 246 246 | None -> ()) 247 247 | None -> ())
+8 -8
lib/entities/decode.ml lib/html5rw/entities/entities_decode.ml
··· 54 54 let digit_text = String.sub text digit_start (!j - digit_start) in 55 55 56 56 if String.length digit_text > 0 then begin 57 - match Numeric_ref.decode digit_text ~is_hex with 57 + match Entities_numeric_ref.decode digit_text ~is_hex with 58 58 | Some decoded -> 59 59 Buffer.add_string buf decoded; 60 60 i := if has_semicolon then !j + 1 else !j ··· 84 84 (* Try exact match first (with semicolon expected) *) 85 85 let decoded = 86 86 if has_semicolon then 87 - Entity_table.lookup entity_name 87 + Entities_entity_table.lookup entity_name 88 88 else 89 89 None 90 90 in ··· 101 101 if k <= 0 then None 102 102 else 103 103 let prefix = String.sub entity_name 0 k in 104 - if Entity_table.is_legacy prefix then 105 - match Entity_table.lookup prefix with 104 + if Entities_entity_table.is_legacy prefix then 105 + match Entities_entity_table.lookup prefix with 106 106 | Some value -> Some (value, k) 107 107 | None -> try_prefix (k - 1) 108 108 else ··· 118 118 i := !j + 1 119 119 end else if not has_semicolon then begin 120 120 (* Try without semicolon for legacy compatibility *) 121 - if Entity_table.is_legacy entity_name then 122 - match Entity_table.lookup entity_name with 121 + if Entities_entity_table.is_legacy entity_name then 122 + match Entities_entity_table.lookup entity_name with 123 123 | Some value -> 124 124 (* Legacy entities without semicolon have strict rules in attributes *) 125 125 let next_char = if !j < len then Some text.[!j] else None in ··· 145 145 if k <= 0 then None 146 146 else 147 147 let prefix = String.sub entity_name 0 k in 148 - if Entity_table.is_legacy prefix then 149 - match Entity_table.lookup prefix with 148 + if Entities_entity_table.is_legacy prefix then 149 + match Entities_entity_table.lookup prefix with 150 150 | Some value -> Some (value, k) 151 151 | None -> try_prefix (k - 1) 152 152 else
-9
lib/entities/dune
··· 1 - (library 2 - (name html5rw_entities) 3 - (public_name html5rw.entities) 4 - (libraries astring uutf)) 5 - 6 - (rule 7 - (target entity_table.ml) 8 - (deps ../../data/entities.json) 9 - (action (run ../../gen/gen_entities.exe %{deps} %{target})))
+6 -6
lib/entities/html5rw_entities.ml lib/html5rw/entities/entities.ml
··· 49 49 (* Returns: "<script>" *) 50 50 ]} 51 51 *) 52 - let decode = Decode.decode_entities_in_text 52 + let decode = Entities_decode.decode_entities_in_text 53 53 54 54 (** Decode a numeric character reference. 55 55 ··· 60 60 specification (e.g., control characters in the 0x80-0x9F range 61 61 are mapped to Windows-1252 equivalents). 62 62 *) 63 - let decode_numeric = Numeric_ref.decode 63 + let decode_numeric = Entities_numeric_ref.decode 64 64 65 65 (** Look up a named character reference. 66 66 ··· 73 73 lookup "bogus" (* None *) 74 74 ]} 75 75 *) 76 - let lookup = Entity_table.lookup 76 + let lookup = Entities_entity_table.lookup 77 77 78 78 (** Check if an entity is a legacy entity. 79 79 ··· 87 87 is_legacy "Aacute" (* false - requires semicolon *) 88 88 ]} 89 89 *) 90 - let is_legacy = Entity_table.is_legacy 90 + let is_legacy = Entities_entity_table.is_legacy 91 91 92 92 (** Convert a Unicode codepoint to its UTF-8 encoding. 93 93 94 94 @param codepoint The Unicode codepoint (0 to 0x10FFFF) 95 95 @return The UTF-8 encoded string 96 96 *) 97 - let codepoint_to_utf8 = Numeric_ref.codepoint_to_utf8 97 + let codepoint_to_utf8 = Entities_numeric_ref.codepoint_to_utf8 98 98 99 99 (** {1 Sub-modules} *) 100 100 101 101 (** Numeric character reference handling. *) 102 - module Numeric_ref = Numeric_ref 102 + module Numeric_ref = Entities_numeric_ref
lib/entities/numeric_ref.ml lib/html5rw/entities/entities_numeric_ref.ml
+3 -8
lib/html5rw/dune
··· 1 + (include_subdirs unqualified) 2 + 1 3 (library 2 4 (name html5rw) 3 5 (public_name html5rw) 4 - (libraries 5 - bytesrw 6 - html5rw.parser 7 - html5rw.dom 8 - html5rw.tokenizer 9 - html5rw.encoding 10 - html5rw.selector 11 - html5rw.entities)) 6 + (libraries astring bytesrw uutf uuuu))
+4
lib/html5rw/entities/dune
··· 1 + (rule 2 + (target entities_entity_table.ml) 3 + (deps ../../../data/entities.json) 4 + (action (run ../../../gen/gen_entities.exe %{deps} %{target})))
+6 -6
lib/html5rw/html5rw.ml
··· 37 37 (** {1 Sub-modules} *) 38 38 39 39 (** DOM types and manipulation functions *) 40 - module Dom = Html5rw_dom 40 + module Dom = Dom 41 41 42 42 (** HTML5 tokenizer *) 43 - module Tokenizer = Html5rw_tokenizer 43 + module Tokenizer = Tokenizer 44 44 45 45 (** Encoding detection and decoding *) 46 - module Encoding = Html5rw_encoding 46 + module Encoding = Encoding 47 47 48 48 (** CSS selector engine *) 49 - module Selector = Html5rw_selector 49 + module Selector = Selector 50 50 51 51 (** HTML entity decoding *) 52 - module Entities = Html5rw_entities 52 + module Entities = Entities 53 53 54 54 (** Low-level parser access *) 55 - module Parser = Html5rw_parser 55 + module Parser = Parser 56 56 57 57 (** {1 Core Types} *) 58 58
+6 -6
lib/html5rw/html5rw.mli
··· 132 132 133 133 @see <https://html.spec.whatwg.org/multipage/dom.html> 134 134 WHATWG: The elements of HTML *) 135 - module Dom = Html5rw_dom 135 + module Dom = Dom 136 136 137 137 (** HTML5 tokenizer. 138 138 ··· 146 146 147 147 @see <https://html.spec.whatwg.org/multipage/parsing.html#tokenization> 148 148 WHATWG: Tokenization *) 149 - module Tokenizer = Html5rw_tokenizer 149 + module Tokenizer = Tokenizer 150 150 151 151 (** Encoding detection and decoding. 152 152 ··· 163 163 WHATWG: Determining the character encoding 164 164 @see <https://encoding.spec.whatwg.org/> 165 165 WHATWG Encoding Standard *) 166 - module Encoding = Html5rw_encoding 166 + module Encoding = Encoding 167 167 168 168 (** CSS selector engine. 169 169 ··· 180 180 181 181 @see <https://www.w3.org/TR/selectors-4/> 182 182 W3C Selectors Level 4 specification *) 183 - module Selector = Html5rw_selector 183 + module Selector = Selector 184 184 185 185 (** HTML entity decoding. 186 186 ··· 196 196 197 197 @see <https://html.spec.whatwg.org/multipage/named-characters.html> 198 198 WHATWG: Named character references *) 199 - module Entities = Html5rw_entities 199 + module Entities = Entities 200 200 201 201 (** Low-level parser access. 202 202 ··· 210 210 211 211 @see <https://html.spec.whatwg.org/multipage/parsing.html#tree-construction> 212 212 WHATWG: Tree construction *) 213 - module Parser = Html5rw_parser 213 + module Parser = Parser 214 214 215 215 (** {1 Core Types} *) 216 216
+41
lib/html5rw/parser/parser.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (* html5rw.parser - HTML5 parser with bytesrw-only API *) 7 + 8 + module Dom = Dom 9 + module Tokenizer = Tokenizer 10 + module Encoding = Encoding 11 + module Constants = Parser_constants 12 + module Insertion_mode = Parser_insertion_mode 13 + module Tree_builder = Parser_tree_builder 14 + 15 + type parse_error = Parser_impl.parse_error 16 + type fragment_context = Parser_impl.fragment_context 17 + type t = Parser_impl.t 18 + 19 + (* parse_error accessors *) 20 + let error_code (e : parse_error) = e.Parser_tree_builder.code 21 + let error_line (e : parse_error) = e.Parser_tree_builder.line 22 + let error_column (e : parse_error) = e.Parser_tree_builder.column 23 + 24 + (* fragment_context constructor and accessors *) 25 + let make_fragment_context ~tag_name ?(namespace=None) () : fragment_context = 26 + { Parser_tree_builder.tag_name; namespace } 27 + 28 + let fragment_context_tag (ctx : fragment_context) = ctx.Parser_tree_builder.tag_name 29 + let fragment_context_namespace (ctx : fragment_context) = ctx.Parser_tree_builder.namespace 30 + 31 + let parse = Parser_impl.parse 32 + let parse_bytes = Parser_impl.parse_bytes 33 + let query = Parser_impl.query 34 + let to_writer = Parser_impl.to_writer 35 + let to_string = Parser_impl.to_string 36 + let to_text = Parser_impl.to_text 37 + let to_test_format = Parser_impl.to_test_format 38 + 39 + let root t = t.Parser_impl.root 40 + let errors t = t.Parser_impl.errors 41 + let encoding t = t.Parser_impl.encoding
+16
lib/html5rw/tokenizer/tokenizer.ml
··· 1 + (* html5rw.tokenizer - HTML5 tokenizer with bytesrw-only API *) 2 + 3 + module Token = Tokenizer_token 4 + module State = Tokenizer_state 5 + module Errors = Tokenizer_errors 6 + module Stream = Tokenizer_stream 7 + 8 + module type SINK = Tokenizer_impl.SINK 9 + 10 + type 'a t = 'a Tokenizer_impl.t 11 + 12 + let create = Tokenizer_impl.create 13 + let run = Tokenizer_impl.run 14 + let get_errors = Tokenizer_impl.get_errors 15 + let set_state = Tokenizer_impl.set_state 16 + let set_last_start_tag = Tokenizer_impl.set_last_start_tag
+1975
lib/html5rw/tokenizer/tokenizer_impl.ml
··· 1 + (* HTML5 Tokenizer - implements WHATWG tokenization algorithm *) 2 + 3 + (* Character classification using Astring *) 4 + let is_ascii_alpha = Astring.Char.Ascii.is_letter 5 + let is_ascii_digit = Astring.Char.Ascii.is_digit 6 + let is_ascii_hex = Astring.Char.Ascii.is_hex_digit 7 + let is_ascii_alnum = Astring.Char.Ascii.is_alphanum 8 + let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' 9 + let ascii_lower = Astring.Char.Ascii.lowercase 10 + 11 + (* Token sink interface *) 12 + module type SINK = sig 13 + type t 14 + val process : t -> Tokenizer_token.t -> [ `Continue | `SwitchTo of Tokenizer_state.t ] 15 + val adjusted_current_node_in_html_namespace : t -> bool 16 + end 17 + 18 + type 'sink t = { 19 + mutable stream : Tokenizer_stream.t; 20 + sink : 'sink; 21 + mutable state : Tokenizer_state.t; 22 + mutable return_state : Tokenizer_state.t; 23 + mutable char_ref_code : int; 24 + mutable temp_buffer : Buffer.t; 25 + mutable last_start_tag : string; 26 + mutable current_tag_name : Buffer.t; 27 + mutable current_tag_kind : Tokenizer_token.tag_kind; 28 + mutable current_tag_self_closing : bool; 29 + mutable current_attr_name : Buffer.t; 30 + mutable current_attr_value : Buffer.t; 31 + mutable current_attrs : (string * string) list; 32 + mutable current_doctype_name : Buffer.t option; 33 + mutable current_doctype_public : Buffer.t option; 34 + mutable current_doctype_system : Buffer.t option; 35 + mutable current_doctype_force_quirks : bool; 36 + mutable current_comment : Buffer.t; 37 + mutable pending_chars : Buffer.t; 38 + mutable errors : Tokenizer_errors.t list; 39 + collect_errors : bool; 40 + xml_mode : bool; (* XML violation mode: transform chars for XML compatibility *) 41 + } 42 + 43 + let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) ?(xml_mode=false) () = { 44 + stream = Tokenizer_stream.create ""; 45 + sink; 46 + state = Tokenizer_state.Data; 47 + return_state = Tokenizer_state.Data; 48 
+ char_ref_code = 0; 49 + temp_buffer = Buffer.create 64; 50 + last_start_tag = ""; 51 + current_tag_name = Buffer.create 32; 52 + current_tag_kind = Tokenizer_token.Start; 53 + current_tag_self_closing = false; 54 + current_attr_name = Buffer.create 32; 55 + current_attr_value = Buffer.create 64; 56 + current_attrs = []; 57 + current_doctype_name = None; 58 + current_doctype_public = None; 59 + current_doctype_system = None; 60 + current_doctype_force_quirks = false; 61 + current_comment = Buffer.create 64; 62 + pending_chars = Buffer.create 256; 63 + errors = []; 64 + collect_errors; 65 + xml_mode; 66 + } 67 + 68 + let error t code = 69 + if t.collect_errors then begin 70 + let (line, column) = Tokenizer_stream.position t.stream in 71 + t.errors <- Tokenizer_errors.make ~code ~line ~column :: t.errors 72 + end 73 + 74 + (* emit functions are defined locally inside run *) 75 + 76 + (* XML mode character transformation: form feed → space *) 77 + let emit_char t c = 78 + if t.xml_mode && c = '\x0C' then 79 + Buffer.add_char t.pending_chars ' ' 80 + else 81 + Buffer.add_char t.pending_chars c 82 + 83 + (* XML mode string transformation: U+FFFF → U+FFFD, form feed → space *) 84 + let emit_str t s = 85 + if t.xml_mode then begin 86 + (* Transform: \xEF\xBF\xBF (U+FFFF) → \xEF\xBF\xBD (U+FFFD), \x0C → space *) 87 + let len = String.length s in 88 + let i = ref 0 in 89 + while !i < len do 90 + let c = s.[!i] in 91 + if c = '\x0C' then begin 92 + Buffer.add_char t.pending_chars ' '; 93 + incr i 94 + end else if c = '\xEF' && !i + 2 < len && s.[!i+1] = '\xBF' && s.[!i+2] = '\xBF' then begin 95 + (* U+FFFF → U+FFFD *) 96 + Buffer.add_string t.pending_chars "\xEF\xBF\xBD"; 97 + i := !i + 3 98 + end else begin 99 + Buffer.add_char t.pending_chars c; 100 + incr i 101 + end 102 + done 103 + end else 104 + Buffer.add_string t.pending_chars s 105 + 106 + let start_new_tag t kind = 107 + Buffer.clear t.current_tag_name; 108 + t.current_tag_kind <- kind; 109 + 
t.current_tag_self_closing <- false; 110 + t.current_attrs <- [] 111 + 112 + let start_new_attribute t = 113 + (* Save previous attribute if any *) 114 + let name = Buffer.contents t.current_attr_name in 115 + if String.length name > 0 then begin 116 + let value = Buffer.contents t.current_attr_value in 117 + (* Check for duplicates - only add if not already present *) 118 + if not (List.exists (fun (n, _) -> n = name) t.current_attrs) then 119 + t.current_attrs <- (name, value) :: t.current_attrs 120 + else 121 + error t "duplicate-attribute" 122 + end; 123 + Buffer.clear t.current_attr_name; 124 + Buffer.clear t.current_attr_value 125 + 126 + let finish_attribute t = 127 + start_new_attribute t 128 + 129 + let start_new_doctype t = 130 + t.current_doctype_name <- None; 131 + t.current_doctype_public <- None; 132 + t.current_doctype_system <- None; 133 + t.current_doctype_force_quirks <- false 134 + 135 + (* emit_current_tag, emit_current_doctype, emit_current_comment are defined locally inside run *) 136 + 137 + let is_appropriate_end_tag t = 138 + let name = Buffer.contents t.current_tag_name in 139 + String.length t.last_start_tag > 0 && name = t.last_start_tag 140 + 141 + let flush_code_points_consumed_as_char_ref t = 142 + let s = Buffer.contents t.temp_buffer in 143 + match t.return_state with 144 + | Tokenizer_state.Attribute_value_double_quoted 145 + | Tokenizer_state.Attribute_value_single_quoted 146 + | Tokenizer_state.Attribute_value_unquoted -> 147 + Buffer.add_string t.current_attr_value s 148 + | _ -> 149 + emit_str t s 150 + 151 + open Bytesrw 152 + 153 + (* Main tokenization loop *) 154 + let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) = 155 + t.stream <- Tokenizer_stream.create_from_reader reader; 156 + t.errors <- []; 157 + (* Set up error callback for surrogate/noncharacter detection in stream *) 158 + (* In XML mode, we don't report noncharacter errors - we transform them instead *) 159 + if not t.xml_mode then 
160 + Tokenizer_stream.set_error_callback t.stream (fun code -> error t code); 161 + 162 + (* XML mode transformation for pending chars: U+FFFF → U+FFFD *) 163 + let transform_xml_chars data = 164 + let len = String.length data in 165 + let buf = Buffer.create len in 166 + let i = ref 0 in 167 + while !i < len do 168 + let c = data.[!i] in 169 + if c = '\xEF' && !i + 2 < len && data.[!i+1] = '\xBF' && data.[!i+2] = '\xBF' then begin 170 + (* U+FFFF → U+FFFD *) 171 + Buffer.add_string buf "\xEF\xBF\xBD"; 172 + i := !i + 3 173 + end else begin 174 + Buffer.add_char buf c; 175 + incr i 176 + end 177 + done; 178 + Buffer.contents buf 179 + in 180 + 181 + (* Local emit functions with access to S *) 182 + let emit_pending_chars () = 183 + if Buffer.length t.pending_chars > 0 then begin 184 + let data = Buffer.contents t.pending_chars in 185 + Buffer.clear t.pending_chars; 186 + let data = if t.xml_mode then transform_xml_chars data else data in 187 + ignore (S.process t.sink (Tokenizer_token.Character data)) 188 + end 189 + in 190 + 191 + let emit token = 192 + emit_pending_chars (); 193 + match S.process t.sink token with 194 + | `Continue -> () 195 + | `SwitchTo new_state -> t.state <- new_state 196 + in 197 + 198 + let emit_current_tag () = 199 + finish_attribute t; 200 + let name = Buffer.contents t.current_tag_name in 201 + let attrs = List.rev t.current_attrs in 202 + (* Check for end tag with attributes or self-closing flag *) 203 + if t.current_tag_kind = Tokenizer_token.End then begin 204 + if attrs <> [] then 205 + error t "end-tag-with-attributes"; 206 + if t.current_tag_self_closing then 207 + error t "end-tag-with-trailing-solidus" 208 + end; 209 + let tag = { 210 + Tokenizer_token.kind = t.current_tag_kind; 211 + name; 212 + attrs; 213 + self_closing = t.current_tag_self_closing; 214 + } in 215 + if t.current_tag_kind = Tokenizer_token.Start then 216 + t.last_start_tag <- name; 217 + emit (Tokenizer_token.Tag tag) 218 + in 219 + 220 + let 
emit_current_doctype () = 221 + let doctype = { 222 + Tokenizer_token.name = Option.map Buffer.contents t.current_doctype_name; 223 + public_id = Option.map Buffer.contents t.current_doctype_public; 224 + system_id = Option.map Buffer.contents t.current_doctype_system; 225 + force_quirks = t.current_doctype_force_quirks; 226 + } in 227 + emit (Tokenizer_token.Doctype doctype) 228 + in 229 + 230 + let emit_current_comment () = 231 + let content = Buffer.contents t.current_comment in 232 + let content = 233 + if t.xml_mode then begin 234 + (* XML mode: transform -- to - - in comments *) 235 + let buf = Buffer.create (String.length content + 10) in 236 + let len = String.length content in 237 + let i = ref 0 in 238 + while !i < len do 239 + if !i + 1 < len && content.[!i] = '-' && content.[!i+1] = '-' then begin 240 + Buffer.add_string buf "- -"; 241 + i := !i + 2 242 + end else begin 243 + Buffer.add_char buf content.[!i]; 244 + incr i 245 + end 246 + done; 247 + Buffer.contents buf 248 + end else content 249 + in 250 + emit (Tokenizer_token.Comment content) 251 + in 252 + 253 + (* Check for control characters and emit error if needed *) 254 + (* Only checks ASCII control chars; C1 controls (U+0080-U+009F) are 2-byte in UTF-8 *) 255 + let check_control_char c = 256 + let code = Char.code c in 257 + (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F *) 258 + (* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *) 259 + (* Note: U+0080-U+009F (C1 controls) are 2-byte UTF-8 sequences starting with 0xC2 *) 260 + (* Note: We only check single-byte control chars here; multi-byte checks are TODO *) 261 + if (code >= 0x01 && code <= 0x08) || 262 + code = 0x0B || 263 + (code >= 0x0E && code <= 0x1F) || 264 + code = 0x7F then 265 + error t "control-character-in-input-stream" 266 + in 267 + 268 + 269 + (* Emit char with control character check *) 270 + let emit_char_checked c = 271 + check_control_char c; 272 + emit_char t c 273 + in 274 + 275 + let rec 
process_state () = 276 + if Tokenizer_stream.is_eof t.stream && t.state <> Tokenizer_state.Data then begin 277 + (* Handle EOF in various states *) 278 + handle_eof () 279 + end else if Tokenizer_stream.is_eof t.stream then begin 280 + emit_pending_chars (); 281 + ignore (S.process t.sink Tokenizer_token.EOF) 282 + end else begin 283 + step (); 284 + process_state () 285 + end 286 + 287 + and handle_eof () = 288 + match t.state with 289 + | Tokenizer_state.Data -> 290 + emit_pending_chars (); 291 + ignore (S.process t.sink Tokenizer_token.EOF) 292 + | Tokenizer_state.Tag_open -> 293 + error t "eof-before-tag-name"; 294 + emit_char t '<'; 295 + emit_pending_chars (); 296 + ignore (S.process t.sink Tokenizer_token.EOF) 297 + | Tokenizer_state.End_tag_open -> 298 + error t "eof-before-tag-name"; 299 + emit_str t "</"; 300 + emit_pending_chars (); 301 + ignore (S.process t.sink Tokenizer_token.EOF) 302 + | Tokenizer_state.Tag_name 303 + | Tokenizer_state.Before_attribute_name 304 + | Tokenizer_state.Attribute_name 305 + | Tokenizer_state.After_attribute_name 306 + | Tokenizer_state.Before_attribute_value 307 + | Tokenizer_state.Attribute_value_double_quoted 308 + | Tokenizer_state.Attribute_value_single_quoted 309 + | Tokenizer_state.Attribute_value_unquoted 310 + | Tokenizer_state.After_attribute_value_quoted 311 + | Tokenizer_state.Self_closing_start_tag -> 312 + error t "eof-in-tag"; 313 + emit_pending_chars (); 314 + ignore (S.process t.sink Tokenizer_token.EOF) 315 + | Tokenizer_state.Rawtext -> 316 + emit_pending_chars (); 317 + ignore (S.process t.sink Tokenizer_token.EOF) 318 + | Tokenizer_state.Rawtext_less_than_sign -> 319 + emit_char t '<'; 320 + emit_pending_chars (); 321 + ignore (S.process t.sink Tokenizer_token.EOF) 322 + | Tokenizer_state.Rawtext_end_tag_open -> 323 + emit_str t "</"; 324 + emit_pending_chars (); 325 + ignore (S.process t.sink Tokenizer_token.EOF) 326 + | Tokenizer_state.Rawtext_end_tag_name -> 327 + emit_str t "</"; 328 + emit_str t 
(Buffer.contents t.temp_buffer); 329 + emit_pending_chars (); 330 + ignore (S.process t.sink Tokenizer_token.EOF) 331 + | Tokenizer_state.Rcdata -> 332 + emit_pending_chars (); 333 + ignore (S.process t.sink Tokenizer_token.EOF) 334 + | Tokenizer_state.Rcdata_less_than_sign -> 335 + emit_char t '<'; 336 + emit_pending_chars (); 337 + ignore (S.process t.sink Tokenizer_token.EOF) 338 + | Tokenizer_state.Rcdata_end_tag_open -> 339 + emit_str t "</"; 340 + emit_pending_chars (); 341 + ignore (S.process t.sink Tokenizer_token.EOF) 342 + | Tokenizer_state.Rcdata_end_tag_name -> 343 + emit_str t "</"; 344 + emit_str t (Buffer.contents t.temp_buffer); 345 + emit_pending_chars (); 346 + ignore (S.process t.sink Tokenizer_token.EOF) 347 + | Tokenizer_state.Script_data -> 348 + emit_pending_chars (); 349 + ignore (S.process t.sink Tokenizer_token.EOF) 350 + | Tokenizer_state.Script_data_less_than_sign -> 351 + emit_char t '<'; 352 + emit_pending_chars (); 353 + ignore (S.process t.sink Tokenizer_token.EOF) 354 + | Tokenizer_state.Script_data_end_tag_open -> 355 + emit_str t "</"; 356 + emit_pending_chars (); 357 + ignore (S.process t.sink Tokenizer_token.EOF) 358 + | Tokenizer_state.Script_data_end_tag_name -> 359 + emit_str t "</"; 360 + emit_str t (Buffer.contents t.temp_buffer); 361 + emit_pending_chars (); 362 + ignore (S.process t.sink Tokenizer_token.EOF) 363 + | Tokenizer_state.Script_data_escape_start 364 + | Tokenizer_state.Script_data_escape_start_dash 365 + | Tokenizer_state.Script_data_escaped 366 + | Tokenizer_state.Script_data_escaped_dash 367 + | Tokenizer_state.Script_data_escaped_dash_dash -> 368 + error t "eof-in-script-html-comment-like-text"; 369 + emit_pending_chars (); 370 + ignore (S.process t.sink Tokenizer_token.EOF) 371 + | Tokenizer_state.Script_data_escaped_less_than_sign -> 372 + emit_char t '<'; 373 + emit_pending_chars (); 374 + ignore (S.process t.sink Tokenizer_token.EOF) 375 + | Tokenizer_state.Script_data_escaped_end_tag_open -> 376 + 
emit_str t "</"; 377 + emit_pending_chars (); 378 + ignore (S.process t.sink Tokenizer_token.EOF) 379 + | Tokenizer_state.Script_data_escaped_end_tag_name -> 380 + emit_str t "</"; 381 + emit_str t (Buffer.contents t.temp_buffer); 382 + emit_pending_chars (); 383 + ignore (S.process t.sink Tokenizer_token.EOF) 384 + | Tokenizer_state.Script_data_double_escape_start 385 + | Tokenizer_state.Script_data_double_escaped 386 + | Tokenizer_state.Script_data_double_escaped_dash 387 + | Tokenizer_state.Script_data_double_escaped_dash_dash -> 388 + error t "eof-in-script-html-comment-like-text"; 389 + emit_pending_chars (); 390 + ignore (S.process t.sink Tokenizer_token.EOF) 391 + | Tokenizer_state.Script_data_double_escaped_less_than_sign -> 392 + (* '<' was already emitted when entering this state from Script_data_double_escaped *) 393 + emit_pending_chars (); 394 + ignore (S.process t.sink Tokenizer_token.EOF) 395 + | Tokenizer_state.Script_data_double_escape_end -> 396 + emit_pending_chars (); 397 + ignore (S.process t.sink Tokenizer_token.EOF) 398 + | Tokenizer_state.Plaintext -> 399 + emit_pending_chars (); 400 + ignore (S.process t.sink Tokenizer_token.EOF) 401 + | Tokenizer_state.Comment_start 402 + | Tokenizer_state.Comment_start_dash 403 + | Tokenizer_state.Comment 404 + | Tokenizer_state.Comment_less_than_sign 405 + | Tokenizer_state.Comment_less_than_sign_bang 406 + | Tokenizer_state.Comment_less_than_sign_bang_dash 407 + | Tokenizer_state.Comment_less_than_sign_bang_dash_dash 408 + | Tokenizer_state.Comment_end_dash 409 + | Tokenizer_state.Comment_end 410 + | Tokenizer_state.Comment_end_bang -> 411 + error t "eof-in-comment"; 412 + emit_current_comment (); 413 + emit_pending_chars (); 414 + ignore (S.process t.sink Tokenizer_token.EOF) 415 + | Tokenizer_state.Bogus_comment -> 416 + emit_current_comment (); 417 + emit_pending_chars (); 418 + ignore (S.process t.sink Tokenizer_token.EOF) 419 + | Tokenizer_state.Markup_declaration_open -> 420 + error t 
"incorrectly-opened-comment"; 421 + Buffer.clear t.current_comment; 422 + emit_current_comment (); 423 + emit_pending_chars (); 424 + ignore (S.process t.sink Tokenizer_token.EOF) 425 + | Tokenizer_state.Doctype 426 + | Tokenizer_state.Before_doctype_name -> 427 + error t "eof-in-doctype"; 428 + start_new_doctype t; 429 + t.current_doctype_force_quirks <- true; 430 + emit_current_doctype (); 431 + emit_pending_chars (); 432 + ignore (S.process t.sink Tokenizer_token.EOF) 433 + | Tokenizer_state.Doctype_name 434 + | Tokenizer_state.After_doctype_name 435 + | Tokenizer_state.After_doctype_public_keyword 436 + | Tokenizer_state.Before_doctype_public_identifier 437 + | Tokenizer_state.Doctype_public_identifier_double_quoted 438 + | Tokenizer_state.Doctype_public_identifier_single_quoted 439 + | Tokenizer_state.After_doctype_public_identifier 440 + | Tokenizer_state.Between_doctype_public_and_system_identifiers 441 + | Tokenizer_state.After_doctype_system_keyword 442 + | Tokenizer_state.Before_doctype_system_identifier 443 + | Tokenizer_state.Doctype_system_identifier_double_quoted 444 + | Tokenizer_state.Doctype_system_identifier_single_quoted 445 + | Tokenizer_state.After_doctype_system_identifier -> 446 + error t "eof-in-doctype"; 447 + t.current_doctype_force_quirks <- true; 448 + emit_current_doctype (); 449 + emit_pending_chars (); 450 + ignore (S.process t.sink Tokenizer_token.EOF) 451 + | Tokenizer_state.Bogus_doctype -> 452 + emit_current_doctype (); 453 + emit_pending_chars (); 454 + ignore (S.process t.sink Tokenizer_token.EOF) 455 + | Tokenizer_state.Cdata_section -> 456 + error t "eof-in-cdata"; 457 + emit_pending_chars (); 458 + ignore (S.process t.sink Tokenizer_token.EOF) 459 + | Tokenizer_state.Cdata_section_bracket -> 460 + error t "eof-in-cdata"; 461 + emit_char t ']'; 462 + emit_pending_chars (); 463 + ignore (S.process t.sink Tokenizer_token.EOF) 464 + | Tokenizer_state.Cdata_section_end -> 465 + error t "eof-in-cdata"; 466 + emit_str t "]]"; 467 + 
emit_pending_chars (); 468 + ignore (S.process t.sink Tokenizer_token.EOF) 469 + | Tokenizer_state.Character_reference -> 470 + (* state_character_reference never ran, so initialize temp_buffer with & *) 471 + Buffer.clear t.temp_buffer; 472 + Buffer.add_char t.temp_buffer '&'; 473 + flush_code_points_consumed_as_char_ref t; 474 + t.state <- t.return_state; 475 + handle_eof () 476 + | Tokenizer_state.Named_character_reference -> 477 + flush_code_points_consumed_as_char_ref t; 478 + t.state <- t.return_state; 479 + handle_eof () 480 + | Tokenizer_state.Numeric_character_reference -> 481 + (* At EOF with just "&#" - no digits follow *) 482 + error t "absence-of-digits-in-numeric-character-reference"; 483 + flush_code_points_consumed_as_char_ref t; 484 + t.state <- t.return_state; 485 + handle_eof () 486 + | Tokenizer_state.Hexadecimal_character_reference_start 487 + | Tokenizer_state.Decimal_character_reference_start -> 488 + error t "absence-of-digits-in-numeric-character-reference"; 489 + flush_code_points_consumed_as_char_ref t; 490 + t.state <- t.return_state; 491 + handle_eof () 492 + | Tokenizer_state.Numeric_character_reference_end -> 493 + (* We have collected digits, just need to finalize the character reference *) 494 + step (); 495 + handle_eof () 496 + | Tokenizer_state.Ambiguous_ampersand -> 497 + (* Buffer was already flushed when entering this state, just transition *) 498 + t.state <- t.return_state; 499 + handle_eof () 500 + | Tokenizer_state.Hexadecimal_character_reference 501 + | Tokenizer_state.Decimal_character_reference -> 502 + (* At EOF with collected digits - convert the numeric reference *) 503 + error t "missing-semicolon-after-character-reference"; 504 + let code = t.char_ref_code in 505 + let replacement_char = "\xEF\xBF\xBD" in 506 + let result = 507 + if code = 0 then begin 508 + error t "null-character-reference"; 509 + replacement_char 510 + end else if code > 0x10FFFF then begin 511 + error t 
"character-reference-outside-unicode-range"; 512 + replacement_char 513 + end else if code >= 0xD800 && code <= 0xDFFF then begin 514 + error t "surrogate-character-reference"; 515 + replacement_char 516 + end else 517 + Entities.Numeric_ref.codepoint_to_utf8 code 518 + in 519 + Buffer.clear t.temp_buffer; 520 + Buffer.add_string t.temp_buffer result; 521 + flush_code_points_consumed_as_char_ref t; 522 + t.state <- t.return_state; 523 + handle_eof () 524 + 525 + and step () = 526 + match t.state with 527 + | Tokenizer_state.Data -> state_data () 528 + | Tokenizer_state.Rcdata -> state_rcdata () 529 + | Tokenizer_state.Rawtext -> state_rawtext () 530 + | Tokenizer_state.Script_data -> state_script_data () 531 + | Tokenizer_state.Plaintext -> state_plaintext () 532 + | Tokenizer_state.Tag_open -> state_tag_open () 533 + | Tokenizer_state.End_tag_open -> state_end_tag_open () 534 + | Tokenizer_state.Tag_name -> state_tag_name () 535 + | Tokenizer_state.Rcdata_less_than_sign -> state_rcdata_less_than_sign () 536 + | Tokenizer_state.Rcdata_end_tag_open -> state_rcdata_end_tag_open () 537 + | Tokenizer_state.Rcdata_end_tag_name -> state_rcdata_end_tag_name () 538 + | Tokenizer_state.Rawtext_less_than_sign -> state_rawtext_less_than_sign () 539 + | Tokenizer_state.Rawtext_end_tag_open -> state_rawtext_end_tag_open () 540 + | Tokenizer_state.Rawtext_end_tag_name -> state_rawtext_end_tag_name () 541 + | Tokenizer_state.Script_data_less_than_sign -> state_script_data_less_than_sign () 542 + | Tokenizer_state.Script_data_end_tag_open -> state_script_data_end_tag_open () 543 + | Tokenizer_state.Script_data_end_tag_name -> state_script_data_end_tag_name () 544 + | Tokenizer_state.Script_data_escape_start -> state_script_data_escape_start () 545 + | Tokenizer_state.Script_data_escape_start_dash -> state_script_data_escape_start_dash () 546 + | Tokenizer_state.Script_data_escaped -> state_script_data_escaped () 547 + | Tokenizer_state.Script_data_escaped_dash -> 
state_script_data_escaped_dash () 548 + | Tokenizer_state.Script_data_escaped_dash_dash -> state_script_data_escaped_dash_dash () 549 + | Tokenizer_state.Script_data_escaped_less_than_sign -> state_script_data_escaped_less_than_sign () 550 + | Tokenizer_state.Script_data_escaped_end_tag_open -> state_script_data_escaped_end_tag_open () 551 + | Tokenizer_state.Script_data_escaped_end_tag_name -> state_script_data_escaped_end_tag_name () 552 + | Tokenizer_state.Script_data_double_escape_start -> state_script_data_double_escape_start () 553 + | Tokenizer_state.Script_data_double_escaped -> state_script_data_double_escaped () 554 + | Tokenizer_state.Script_data_double_escaped_dash -> state_script_data_double_escaped_dash () 555 + | Tokenizer_state.Script_data_double_escaped_dash_dash -> state_script_data_double_escaped_dash_dash () 556 + | Tokenizer_state.Script_data_double_escaped_less_than_sign -> state_script_data_double_escaped_less_than_sign () 557 + | Tokenizer_state.Script_data_double_escape_end -> state_script_data_double_escape_end () 558 + | Tokenizer_state.Before_attribute_name -> state_before_attribute_name () 559 + | Tokenizer_state.Attribute_name -> state_attribute_name () 560 + | Tokenizer_state.After_attribute_name -> state_after_attribute_name () 561 + | Tokenizer_state.Before_attribute_value -> state_before_attribute_value () 562 + | Tokenizer_state.Attribute_value_double_quoted -> state_attribute_value_double_quoted () 563 + | Tokenizer_state.Attribute_value_single_quoted -> state_attribute_value_single_quoted () 564 + | Tokenizer_state.Attribute_value_unquoted -> state_attribute_value_unquoted () 565 + | Tokenizer_state.After_attribute_value_quoted -> state_after_attribute_value_quoted () 566 + | Tokenizer_state.Self_closing_start_tag -> state_self_closing_start_tag () 567 + | Tokenizer_state.Bogus_comment -> state_bogus_comment () 568 + | Tokenizer_state.Markup_declaration_open -> state_markup_declaration_open () 569 + | 
Tokenizer_state.Comment_start -> state_comment_start () 570 + | Tokenizer_state.Comment_start_dash -> state_comment_start_dash () 571 + | Tokenizer_state.Comment -> state_comment () 572 + | Tokenizer_state.Comment_less_than_sign -> state_comment_less_than_sign () 573 + | Tokenizer_state.Comment_less_than_sign_bang -> state_comment_less_than_sign_bang () 574 + | Tokenizer_state.Comment_less_than_sign_bang_dash -> state_comment_less_than_sign_bang_dash () 575 + | Tokenizer_state.Comment_less_than_sign_bang_dash_dash -> state_comment_less_than_sign_bang_dash_dash () 576 + | Tokenizer_state.Comment_end_dash -> state_comment_end_dash () 577 + | Tokenizer_state.Comment_end -> state_comment_end () 578 + | Tokenizer_state.Comment_end_bang -> state_comment_end_bang () 579 + | Tokenizer_state.Doctype -> state_doctype () 580 + | Tokenizer_state.Before_doctype_name -> state_before_doctype_name () 581 + | Tokenizer_state.Doctype_name -> state_doctype_name () 582 + | Tokenizer_state.After_doctype_name -> state_after_doctype_name () 583 + | Tokenizer_state.After_doctype_public_keyword -> state_after_doctype_public_keyword () 584 + | Tokenizer_state.Before_doctype_public_identifier -> state_before_doctype_public_identifier () 585 + | Tokenizer_state.Doctype_public_identifier_double_quoted -> state_doctype_public_identifier_double_quoted () 586 + | Tokenizer_state.Doctype_public_identifier_single_quoted -> state_doctype_public_identifier_single_quoted () 587 + | Tokenizer_state.After_doctype_public_identifier -> state_after_doctype_public_identifier () 588 + | Tokenizer_state.Between_doctype_public_and_system_identifiers -> state_between_doctype_public_and_system_identifiers () 589 + | Tokenizer_state.After_doctype_system_keyword -> state_after_doctype_system_keyword () 590 + | Tokenizer_state.Before_doctype_system_identifier -> state_before_doctype_system_identifier () 591 + | Tokenizer_state.Doctype_system_identifier_double_quoted -> state_doctype_system_identifier_double_quoted 
() 592 + | Tokenizer_state.Doctype_system_identifier_single_quoted -> state_doctype_system_identifier_single_quoted () 593 + | Tokenizer_state.After_doctype_system_identifier -> state_after_doctype_system_identifier () 594 + | Tokenizer_state.Bogus_doctype -> state_bogus_doctype () 595 + | Tokenizer_state.Cdata_section -> state_cdata_section () 596 + | Tokenizer_state.Cdata_section_bracket -> state_cdata_section_bracket () 597 + | Tokenizer_state.Cdata_section_end -> state_cdata_section_end () 598 + | Tokenizer_state.Character_reference -> state_character_reference () 599 + | Tokenizer_state.Named_character_reference -> state_named_character_reference () 600 + | Tokenizer_state.Ambiguous_ampersand -> state_ambiguous_ampersand () 601 + | Tokenizer_state.Numeric_character_reference -> state_numeric_character_reference () 602 + | Tokenizer_state.Hexadecimal_character_reference_start -> state_hexadecimal_character_reference_start () 603 + | Tokenizer_state.Decimal_character_reference_start -> state_decimal_character_reference_start () 604 + | Tokenizer_state.Hexadecimal_character_reference -> state_hexadecimal_character_reference () 605 + | Tokenizer_state.Decimal_character_reference -> state_decimal_character_reference () 606 + | Tokenizer_state.Numeric_character_reference_end -> state_numeric_character_reference_end () 607 + 608 + (* State implementations *) 609 + and state_data () = 610 + match Tokenizer_stream.consume t.stream with 611 + | Some '&' -> 612 + t.return_state <- Tokenizer_state.Data; 613 + t.state <- Tokenizer_state.Character_reference 614 + | Some '<' -> 615 + t.state <- Tokenizer_state.Tag_open 616 + | Some '\x00' -> 617 + (* Emit pending chars first, then emit null separately for proper tree builder handling *) 618 + emit_pending_chars (); 619 + error t "unexpected-null-character"; 620 + ignore (S.process t.sink (Tokenizer_token.Character "\x00")) 621 + | Some c -> 622 + emit_char_checked c 623 + | None -> () 624 + 625 + and state_rcdata () = 626 + 
match Tokenizer_stream.consume t.stream with 627 + | Some '&' -> 628 + t.return_state <- Tokenizer_state.Rcdata; 629 + t.state <- Tokenizer_state.Character_reference 630 + | Some '<' -> 631 + t.state <- Tokenizer_state.Rcdata_less_than_sign 632 + | Some '\x00' -> 633 + error t "unexpected-null-character"; 634 + emit_str t "\xEF\xBF\xBD" 635 + | Some c -> 636 + emit_char_checked c 637 + | None -> () 638 + 639 + and state_rawtext () = 640 + match Tokenizer_stream.consume t.stream with 641 + | Some '<' -> 642 + t.state <- Tokenizer_state.Rawtext_less_than_sign 643 + | Some '\x00' -> 644 + error t "unexpected-null-character"; 645 + emit_str t "\xEF\xBF\xBD" 646 + | Some c -> 647 + emit_char_checked c 648 + | None -> () 649 + 650 + and state_script_data () = 651 + match Tokenizer_stream.consume t.stream with 652 + | Some '<' -> 653 + t.state <- Tokenizer_state.Script_data_less_than_sign 654 + | Some '\x00' -> 655 + error t "unexpected-null-character"; 656 + emit_str t "\xEF\xBF\xBD" 657 + | Some c -> 658 + emit_char_checked c 659 + | None -> () 660 + 661 + and state_plaintext () = 662 + match Tokenizer_stream.consume t.stream with 663 + | Some '\x00' -> 664 + error t "unexpected-null-character"; 665 + emit_str t "\xEF\xBF\xBD" 666 + | Some c -> 667 + emit_char_checked c 668 + | None -> () 669 + 670 + and state_tag_open () = 671 + match Tokenizer_stream.peek t.stream with 672 + | Some '!' -> 673 + Tokenizer_stream.advance t.stream; 674 + t.state <- Tokenizer_state.Markup_declaration_open 675 + | Some '/' -> 676 + Tokenizer_stream.advance t.stream; 677 + t.state <- Tokenizer_state.End_tag_open 678 + | Some c when is_ascii_alpha c -> 679 + start_new_tag t Tokenizer_token.Start; 680 + t.state <- Tokenizer_state.Tag_name 681 + | Some '?' 
-> 682 + error t "unexpected-question-mark-instead-of-tag-name"; 683 + Buffer.clear t.current_comment; 684 + t.state <- Tokenizer_state.Bogus_comment 685 + | None -> 686 + error t "eof-before-tag-name"; 687 + emit_char t '<' 688 + | Some _ -> 689 + error t "invalid-first-character-of-tag-name"; 690 + emit_char t '<'; 691 + t.state <- Tokenizer_state.Data 692 + 693 + and state_end_tag_open () = 694 + match Tokenizer_stream.peek t.stream with 695 + | Some c when is_ascii_alpha c -> 696 + start_new_tag t Tokenizer_token.End; 697 + t.state <- Tokenizer_state.Tag_name 698 + | Some '>' -> 699 + Tokenizer_stream.advance t.stream; 700 + error t "missing-end-tag-name"; 701 + t.state <- Tokenizer_state.Data 702 + | None -> 703 + error t "eof-before-tag-name"; 704 + emit_str t "</" 705 + | Some _ -> 706 + error t "invalid-first-character-of-tag-name"; 707 + Buffer.clear t.current_comment; 708 + t.state <- Tokenizer_state.Bogus_comment 709 + 710 + and state_tag_name () = 711 + match Tokenizer_stream.consume t.stream with 712 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 713 + t.state <- Tokenizer_state.Before_attribute_name 714 + | Some '/' -> 715 + t.state <- Tokenizer_state.Self_closing_start_tag 716 + | Some '>' -> 717 + t.state <- Tokenizer_state.Data; 718 + emit_current_tag () 719 + | Some '\x00' -> 720 + error t "unexpected-null-character"; 721 + Buffer.add_string t.current_tag_name "\xEF\xBF\xBD" 722 + | Some c -> 723 + check_control_char c; 724 + Buffer.add_char t.current_tag_name (ascii_lower c) 725 + | None -> () 726 + 727 + and state_rcdata_less_than_sign () = 728 + match Tokenizer_stream.peek t.stream with 729 + | Some '/' -> 730 + Tokenizer_stream.advance t.stream; 731 + Buffer.clear t.temp_buffer; 732 + t.state <- Tokenizer_state.Rcdata_end_tag_open 733 + | _ -> 734 + emit_char t '<'; 735 + t.state <- Tokenizer_state.Rcdata 736 + 737 + and state_rcdata_end_tag_open () = 738 + match Tokenizer_stream.peek t.stream with 739 + | Some c when is_ascii_alpha c -> 740 + 
start_new_tag t Tokenizer_token.End; 741 + t.state <- Tokenizer_state.Rcdata_end_tag_name 742 + | _ -> 743 + emit_str t "</"; 744 + t.state <- Tokenizer_state.Rcdata 745 + 746 + and state_rcdata_end_tag_name () = 747 + match Tokenizer_stream.peek t.stream with 748 + | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 749 + Tokenizer_stream.advance t.stream; 750 + t.state <- Tokenizer_state.Before_attribute_name 751 + | Some '/' when is_appropriate_end_tag t -> 752 + Tokenizer_stream.advance t.stream; 753 + t.state <- Tokenizer_state.Self_closing_start_tag 754 + | Some '>' when is_appropriate_end_tag t -> 755 + Tokenizer_stream.advance t.stream; 756 + t.state <- Tokenizer_state.Data; 757 + emit_current_tag () 758 + | Some c when is_ascii_alpha c -> 759 + Tokenizer_stream.advance t.stream; 760 + Buffer.add_char t.current_tag_name (ascii_lower c); 761 + Buffer.add_char t.temp_buffer c 762 + | _ -> 763 + emit_str t "</"; 764 + emit_str t (Buffer.contents t.temp_buffer); 765 + t.state <- Tokenizer_state.Rcdata 766 + 767 + and state_rawtext_less_than_sign () = 768 + match Tokenizer_stream.peek t.stream with 769 + | Some '/' -> 770 + Tokenizer_stream.advance t.stream; 771 + Buffer.clear t.temp_buffer; 772 + t.state <- Tokenizer_state.Rawtext_end_tag_open 773 + | _ -> 774 + emit_char t '<'; 775 + t.state <- Tokenizer_state.Rawtext 776 + 777 + and state_rawtext_end_tag_open () = 778 + match Tokenizer_stream.peek t.stream with 779 + | Some c when is_ascii_alpha c -> 780 + start_new_tag t Tokenizer_token.End; 781 + t.state <- Tokenizer_state.Rawtext_end_tag_name 782 + | _ -> 783 + emit_str t "</"; 784 + t.state <- Tokenizer_state.Rawtext 785 + 786 + and state_rawtext_end_tag_name () = 787 + match Tokenizer_stream.peek t.stream with 788 + | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 789 + Tokenizer_stream.advance t.stream; 790 + t.state <- Tokenizer_state.Before_attribute_name 791 + | Some '/' when is_appropriate_end_tag t -> 792 + 
Tokenizer_stream.advance t.stream; 793 + t.state <- Tokenizer_state.Self_closing_start_tag 794 + | Some '>' when is_appropriate_end_tag t -> 795 + Tokenizer_stream.advance t.stream; 796 + t.state <- Tokenizer_state.Data; 797 + emit_current_tag () 798 + | Some c when is_ascii_alpha c -> 799 + Tokenizer_stream.advance t.stream; 800 + Buffer.add_char t.current_tag_name (ascii_lower c); 801 + Buffer.add_char t.temp_buffer c 802 + | _ -> 803 + emit_str t "</"; 804 + emit_str t (Buffer.contents t.temp_buffer); 805 + t.state <- Tokenizer_state.Rawtext 806 + 807 + and state_script_data_less_than_sign () = 808 + match Tokenizer_stream.peek t.stream with 809 + | Some '/' -> 810 + Tokenizer_stream.advance t.stream; 811 + Buffer.clear t.temp_buffer; 812 + t.state <- Tokenizer_state.Script_data_end_tag_open 813 + | Some '!' -> 814 + Tokenizer_stream.advance t.stream; 815 + t.state <- Tokenizer_state.Script_data_escape_start; 816 + emit_str t "<!" 817 + | _ -> 818 + emit_char t '<'; 819 + t.state <- Tokenizer_state.Script_data 820 + 821 + and state_script_data_end_tag_open () = 822 + match Tokenizer_stream.peek t.stream with 823 + | Some c when is_ascii_alpha c -> 824 + start_new_tag t Tokenizer_token.End; 825 + t.state <- Tokenizer_state.Script_data_end_tag_name 826 + | _ -> 827 + emit_str t "</"; 828 + t.state <- Tokenizer_state.Script_data 829 + 830 + and state_script_data_end_tag_name () = 831 + match Tokenizer_stream.peek t.stream with 832 + | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 833 + Tokenizer_stream.advance t.stream; 834 + t.state <- Tokenizer_state.Before_attribute_name 835 + | Some '/' when is_appropriate_end_tag t -> 836 + Tokenizer_stream.advance t.stream; 837 + t.state <- Tokenizer_state.Self_closing_start_tag 838 + | Some '>' when is_appropriate_end_tag t -> 839 + Tokenizer_stream.advance t.stream; 840 + t.state <- Tokenizer_state.Data; 841 + emit_current_tag () 842 + | Some c when is_ascii_alpha c -> 843 + Tokenizer_stream.advance 
t.stream; 844 + Buffer.add_char t.current_tag_name (ascii_lower c); 845 + Buffer.add_char t.temp_buffer c 846 + | _ -> 847 + emit_str t "</"; 848 + emit_str t (Buffer.contents t.temp_buffer); 849 + t.state <- Tokenizer_state.Script_data 850 + 851 + and state_script_data_escape_start () = 852 + match Tokenizer_stream.peek t.stream with 853 + | Some '-' -> 854 + Tokenizer_stream.advance t.stream; 855 + t.state <- Tokenizer_state.Script_data_escape_start_dash; 856 + emit_char t '-' 857 + | _ -> 858 + t.state <- Tokenizer_state.Script_data 859 + 860 + and state_script_data_escape_start_dash () = 861 + match Tokenizer_stream.peek t.stream with 862 + | Some '-' -> 863 + Tokenizer_stream.advance t.stream; 864 + t.state <- Tokenizer_state.Script_data_escaped_dash_dash; 865 + emit_char t '-' 866 + | _ -> 867 + t.state <- Tokenizer_state.Script_data 868 + 869 + and state_script_data_escaped () = 870 + match Tokenizer_stream.consume t.stream with 871 + | Some '-' -> 872 + t.state <- Tokenizer_state.Script_data_escaped_dash; 873 + emit_char t '-' 874 + | Some '<' -> 875 + t.state <- Tokenizer_state.Script_data_escaped_less_than_sign 876 + | Some '\x00' -> 877 + error t "unexpected-null-character"; 878 + emit_str t "\xEF\xBF\xBD" 879 + | Some c -> 880 + emit_char_checked c 881 + | None -> () 882 + 883 + and state_script_data_escaped_dash () = 884 + match Tokenizer_stream.consume t.stream with 885 + | Some '-' -> 886 + t.state <- Tokenizer_state.Script_data_escaped_dash_dash; 887 + emit_char t '-' 888 + | Some '<' -> 889 + t.state <- Tokenizer_state.Script_data_escaped_less_than_sign 890 + | Some '\x00' -> 891 + error t "unexpected-null-character"; 892 + t.state <- Tokenizer_state.Script_data_escaped; 893 + emit_str t "\xEF\xBF\xBD" 894 + | Some c -> 895 + t.state <- Tokenizer_state.Script_data_escaped; 896 + emit_char_checked c 897 + | None -> () 898 + 899 + and state_script_data_escaped_dash_dash () = 900 + match Tokenizer_stream.consume t.stream with 901 + | Some '-' -> 902 + 
emit_char t '-' 903 + | Some '<' -> 904 + t.state <- Tokenizer_state.Script_data_escaped_less_than_sign 905 + | Some '>' -> 906 + t.state <- Tokenizer_state.Script_data; 907 + emit_char t '>' 908 + | Some '\x00' -> 909 + error t "unexpected-null-character"; 910 + t.state <- Tokenizer_state.Script_data_escaped; 911 + emit_str t "\xEF\xBF\xBD" 912 + | Some c -> 913 + t.state <- Tokenizer_state.Script_data_escaped; 914 + emit_char_checked c 915 + | None -> () 916 + 917 + and state_script_data_escaped_less_than_sign () = 918 + match Tokenizer_stream.peek t.stream with 919 + | Some '/' -> 920 + Tokenizer_stream.advance t.stream; 921 + Buffer.clear t.temp_buffer; 922 + t.state <- Tokenizer_state.Script_data_escaped_end_tag_open 923 + | Some c when is_ascii_alpha c -> 924 + Buffer.clear t.temp_buffer; 925 + emit_char t '<'; 926 + t.state <- Tokenizer_state.Script_data_double_escape_start 927 + | _ -> 928 + emit_char t '<'; 929 + t.state <- Tokenizer_state.Script_data_escaped 930 + 931 + and state_script_data_escaped_end_tag_open () = 932 + match Tokenizer_stream.peek t.stream with 933 + | Some c when is_ascii_alpha c -> 934 + start_new_tag t Tokenizer_token.End; 935 + t.state <- Tokenizer_state.Script_data_escaped_end_tag_name 936 + | _ -> 937 + emit_str t "</"; 938 + t.state <- Tokenizer_state.Script_data_escaped 939 + 940 + and state_script_data_escaped_end_tag_name () = 941 + match Tokenizer_stream.peek t.stream with 942 + | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 943 + Tokenizer_stream.advance t.stream; 944 + t.state <- Tokenizer_state.Before_attribute_name 945 + | Some '/' when is_appropriate_end_tag t -> 946 + Tokenizer_stream.advance t.stream; 947 + t.state <- Tokenizer_state.Self_closing_start_tag 948 + | Some '>' when is_appropriate_end_tag t -> 949 + Tokenizer_stream.advance t.stream; 950 + t.state <- Tokenizer_state.Data; 951 + emit_current_tag () 952 + | Some c when is_ascii_alpha c -> 953 + Tokenizer_stream.advance t.stream; 954 + 
Buffer.add_char t.current_tag_name (ascii_lower c); 955 + Buffer.add_char t.temp_buffer c 956 + | _ -> 957 + emit_str t "</"; 958 + emit_str t (Buffer.contents t.temp_buffer); 959 + t.state <- Tokenizer_state.Script_data_escaped 960 + 961 + and state_script_data_double_escape_start () = 962 + match Tokenizer_stream.peek t.stream with 963 + | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt -> 964 + Tokenizer_stream.advance t.stream; 965 + let c = Option.get c_opt in 966 + if Buffer.contents t.temp_buffer = "script" then 967 + t.state <- Tokenizer_state.Script_data_double_escaped 968 + else 969 + t.state <- Tokenizer_state.Script_data_escaped; 970 + emit_char t c 971 + | Some c when is_ascii_alpha c -> 972 + Tokenizer_stream.advance t.stream; 973 + Buffer.add_char t.temp_buffer (ascii_lower c); 974 + emit_char t c 975 + | _ -> 976 + t.state <- Tokenizer_state.Script_data_escaped 977 + 978 + and state_script_data_double_escaped () = 979 + match Tokenizer_stream.consume t.stream with 980 + | Some '-' -> 981 + t.state <- Tokenizer_state.Script_data_double_escaped_dash; 982 + emit_char t '-' 983 + | Some '<' -> 984 + t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign; 985 + emit_char t '<' 986 + | Some '\x00' -> 987 + error t "unexpected-null-character"; 988 + emit_str t "\xEF\xBF\xBD" 989 + | Some c -> 990 + emit_char_checked c 991 + | None -> () 992 + 993 + and state_script_data_double_escaped_dash () = 994 + match Tokenizer_stream.consume t.stream with 995 + | Some '-' -> 996 + t.state <- Tokenizer_state.Script_data_double_escaped_dash_dash; 997 + emit_char t '-' 998 + | Some '<' -> 999 + t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign; 1000 + emit_char t '<' 1001 + | Some '\x00' -> 1002 + error t "unexpected-null-character"; 1003 + t.state <- Tokenizer_state.Script_data_double_escaped; 1004 + emit_str t "\xEF\xBF\xBD" 1005 + | Some c -> 1006 + t.state <- Tokenizer_state.Script_data_double_escaped; 1007 + emit_char_checked 
c 1008 + | None -> () 1009 + 1010 + and state_script_data_double_escaped_dash_dash () = 1011 + match Tokenizer_stream.consume t.stream with 1012 + | Some '-' -> 1013 + emit_char t '-' 1014 + | Some '<' -> 1015 + t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign; 1016 + emit_char t '<' 1017 + | Some '>' -> 1018 + t.state <- Tokenizer_state.Script_data; 1019 + emit_char t '>' 1020 + | Some '\x00' -> 1021 + error t "unexpected-null-character"; 1022 + t.state <- Tokenizer_state.Script_data_double_escaped; 1023 + emit_str t "\xEF\xBF\xBD" 1024 + | Some c -> 1025 + t.state <- Tokenizer_state.Script_data_double_escaped; 1026 + emit_char_checked c 1027 + | None -> () 1028 + 1029 + and state_script_data_double_escaped_less_than_sign () = 1030 + match Tokenizer_stream.peek t.stream with 1031 + | Some '/' -> 1032 + Tokenizer_stream.advance t.stream; 1033 + Buffer.clear t.temp_buffer; 1034 + t.state <- Tokenizer_state.Script_data_double_escape_end; 1035 + emit_char t '/' 1036 + | _ -> 1037 + t.state <- Tokenizer_state.Script_data_double_escaped 1038 + 1039 + and state_script_data_double_escape_end () = 1040 + match Tokenizer_stream.peek t.stream with 1041 + | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt -> 1042 + Tokenizer_stream.advance t.stream; 1043 + let c = Option.get c_opt in 1044 + if Buffer.contents t.temp_buffer = "script" then 1045 + t.state <- Tokenizer_state.Script_data_escaped 1046 + else 1047 + t.state <- Tokenizer_state.Script_data_double_escaped; 1048 + emit_char t c 1049 + | Some c when is_ascii_alpha c -> 1050 + Tokenizer_stream.advance t.stream; 1051 + Buffer.add_char t.temp_buffer (ascii_lower c); 1052 + emit_char t c 1053 + | _ -> 1054 + t.state <- Tokenizer_state.Script_data_double_escaped 1055 + 1056 + and state_before_attribute_name () = 1057 + match Tokenizer_stream.peek t.stream with 1058 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1059 + Tokenizer_stream.advance t.stream 1060 + | Some '/' | Some '>' | None -> 1061 + t.state 
<- Tokenizer_state.After_attribute_name 1062 + | Some '=' -> 1063 + Tokenizer_stream.advance t.stream; 1064 + error t "unexpected-equals-sign-before-attribute-name"; 1065 + start_new_attribute t; 1066 + Buffer.add_char t.current_attr_name '='; 1067 + t.state <- Tokenizer_state.Attribute_name 1068 + | Some _ -> 1069 + start_new_attribute t; 1070 + t.state <- Tokenizer_state.Attribute_name 1071 + 1072 + and state_attribute_name () = 1073 + match Tokenizer_stream.peek t.stream with 1074 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1075 + Tokenizer_stream.advance t.stream; 1076 + t.state <- Tokenizer_state.After_attribute_name 1077 + | Some '/' | Some '>' | None -> 1078 + t.state <- Tokenizer_state.After_attribute_name 1079 + | Some '=' -> 1080 + Tokenizer_stream.advance t.stream; 1081 + t.state <- Tokenizer_state.Before_attribute_value 1082 + | Some '\x00' -> 1083 + Tokenizer_stream.advance t.stream; 1084 + error t "unexpected-null-character"; 1085 + Buffer.add_string t.current_attr_name "\xEF\xBF\xBD" 1086 + | Some ('"' | '\'' | '<') as c_opt -> 1087 + Tokenizer_stream.advance t.stream; 1088 + error t "unexpected-character-in-attribute-name"; 1089 + Buffer.add_char t.current_attr_name (Option.get c_opt) 1090 + | Some c -> 1091 + Tokenizer_stream.advance t.stream; 1092 + check_control_char c; 1093 + Buffer.add_char t.current_attr_name (ascii_lower c) 1094 + 1095 + and state_after_attribute_name () = 1096 + match Tokenizer_stream.peek t.stream with 1097 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1098 + Tokenizer_stream.advance t.stream 1099 + | Some '/' -> 1100 + Tokenizer_stream.advance t.stream; 1101 + t.state <- Tokenizer_state.Self_closing_start_tag 1102 + | Some '=' -> 1103 + Tokenizer_stream.advance t.stream; 1104 + t.state <- Tokenizer_state.Before_attribute_value 1105 + | Some '>' -> 1106 + Tokenizer_stream.advance t.stream; 1107 + t.state <- Tokenizer_state.Data; 1108 + emit_current_tag () 1109 + | None -> () 1110 + | Some _ -> 1111 + start_new_attribute t; 1112 + 
t.state <- Tokenizer_state.Attribute_name 1113 + 1114 + and state_before_attribute_value () = 1115 + match Tokenizer_stream.peek t.stream with 1116 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1117 + Tokenizer_stream.advance t.stream 1118 + | Some '"' -> 1119 + Tokenizer_stream.advance t.stream; 1120 + t.state <- Tokenizer_state.Attribute_value_double_quoted 1121 + | Some '\'' -> 1122 + Tokenizer_stream.advance t.stream; 1123 + t.state <- Tokenizer_state.Attribute_value_single_quoted 1124 + | Some '>' -> 1125 + Tokenizer_stream.advance t.stream; 1126 + error t "missing-attribute-value"; 1127 + t.state <- Tokenizer_state.Data; 1128 + emit_current_tag () 1129 + | _ -> 1130 + t.state <- Tokenizer_state.Attribute_value_unquoted 1131 + 1132 + and state_attribute_value_double_quoted () = 1133 + match Tokenizer_stream.consume t.stream with 1134 + | Some '"' -> 1135 + t.state <- Tokenizer_state.After_attribute_value_quoted 1136 + | Some '&' -> 1137 + t.return_state <- Tokenizer_state.Attribute_value_double_quoted; 1138 + t.state <- Tokenizer_state.Character_reference 1139 + | Some '\x00' -> 1140 + error t "unexpected-null-character"; 1141 + Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1142 + | Some c -> 1143 + check_control_char c; 1144 + Buffer.add_char t.current_attr_value c 1145 + | None -> () 1146 + 1147 + and state_attribute_value_single_quoted () = 1148 + match Tokenizer_stream.consume t.stream with 1149 + | Some '\'' -> 1150 + t.state <- Tokenizer_state.After_attribute_value_quoted 1151 + | Some '&' -> 1152 + t.return_state <- Tokenizer_state.Attribute_value_single_quoted; 1153 + t.state <- Tokenizer_state.Character_reference 1154 + | Some '\x00' -> 1155 + error t "unexpected-null-character"; 1156 + Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1157 + | Some c -> 1158 + check_control_char c; 1159 + Buffer.add_char t.current_attr_value c 1160 + | None -> () 1161 + 1162 + and state_attribute_value_unquoted () = 1163 + match Tokenizer_stream.peek t.stream 
with 1164 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1165 + Tokenizer_stream.advance t.stream; 1166 + t.state <- Tokenizer_state.Before_attribute_name 1167 + | Some '&' -> 1168 + Tokenizer_stream.advance t.stream; 1169 + t.return_state <- Tokenizer_state.Attribute_value_unquoted; 1170 + t.state <- Tokenizer_state.Character_reference 1171 + | Some '>' -> 1172 + Tokenizer_stream.advance t.stream; 1173 + t.state <- Tokenizer_state.Data; 1174 + emit_current_tag () 1175 + | Some '\x00' -> 1176 + Tokenizer_stream.advance t.stream; 1177 + error t "unexpected-null-character"; 1178 + Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1179 + | Some ('"' | '\'' | '<' | '=' | '`') as c_opt -> 1180 + Tokenizer_stream.advance t.stream; 1181 + error t "unexpected-character-in-unquoted-attribute-value"; 1182 + Buffer.add_char t.current_attr_value (Option.get c_opt) 1183 + | Some c -> 1184 + Tokenizer_stream.advance t.stream; 1185 + check_control_char c; 1186 + Buffer.add_char t.current_attr_value c 1187 + | None -> () 1188 + 1189 + and state_after_attribute_value_quoted () = 1190 + match Tokenizer_stream.peek t.stream with 1191 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1192 + Tokenizer_stream.advance t.stream; 1193 + t.state <- Tokenizer_state.Before_attribute_name 1194 + | Some '/' -> 1195 + Tokenizer_stream.advance t.stream; 1196 + t.state <- Tokenizer_state.Self_closing_start_tag 1197 + | Some '>' -> 1198 + Tokenizer_stream.advance t.stream; 1199 + t.state <- Tokenizer_state.Data; 1200 + emit_current_tag () 1201 + | None -> () 1202 + | Some _ -> 1203 + error t "missing-whitespace-between-attributes"; 1204 + t.state <- Tokenizer_state.Before_attribute_name 1205 + 1206 + and state_self_closing_start_tag () = 1207 + match Tokenizer_stream.peek t.stream with 1208 + | Some '>' -> 1209 + Tokenizer_stream.advance t.stream; 1210 + t.current_tag_self_closing <- true; 1211 + t.state <- Tokenizer_state.Data; 1212 + emit_current_tag () 1213 + | None -> () 1214 + | Some _ -> 1215 + error t 
"unexpected-solidus-in-tag"; 1216 + t.state <- Tokenizer_state.Before_attribute_name 1217 + 1218 + and state_bogus_comment () = 1219 + match Tokenizer_stream.consume t.stream with 1220 + | Some '>' -> 1221 + t.state <- Tokenizer_state.Data; 1222 + emit_current_comment () 1223 + | Some '\x00' -> 1224 + error t "unexpected-null-character"; 1225 + Buffer.add_string t.current_comment "\xEF\xBF\xBD" 1226 + | Some c -> 1227 + check_control_char c; 1228 + Buffer.add_char t.current_comment c 1229 + | None -> () 1230 + 1231 + and state_markup_declaration_open () = 1232 + if Tokenizer_stream.matches_ci t.stream "--" then begin 1233 + ignore (Tokenizer_stream.consume_exact_ci t.stream "--"); 1234 + Buffer.clear t.current_comment; 1235 + t.state <- Tokenizer_state.Comment_start 1236 + end else if Tokenizer_stream.matches_ci t.stream "DOCTYPE" then begin 1237 + ignore (Tokenizer_stream.consume_exact_ci t.stream "DOCTYPE"); 1238 + t.state <- Tokenizer_state.Doctype 1239 + end else if Tokenizer_stream.matches_ci t.stream "[CDATA[" then begin 1240 + ignore (Tokenizer_stream.consume_exact_ci t.stream "[CDATA["); 1241 + (* CDATA only allowed in foreign content *) 1242 + if S.adjusted_current_node_in_html_namespace t.sink then begin 1243 + error t "cdata-in-html-content"; 1244 + Buffer.clear t.current_comment; 1245 + Buffer.add_string t.current_comment "[CDATA["; 1246 + t.state <- Tokenizer_state.Bogus_comment 1247 + end else 1248 + t.state <- Tokenizer_state.Cdata_section 1249 + end else begin 1250 + error t "incorrectly-opened-comment"; 1251 + Buffer.clear t.current_comment; 1252 + t.state <- Tokenizer_state.Bogus_comment 1253 + end 1254 + 1255 + and state_comment_start () = 1256 + match Tokenizer_stream.peek t.stream with 1257 + | Some '-' -> 1258 + Tokenizer_stream.advance t.stream; 1259 + t.state <- Tokenizer_state.Comment_start_dash 1260 + | Some '>' -> 1261 + Tokenizer_stream.advance t.stream; 1262 + error t "abrupt-closing-of-empty-comment"; 1263 + t.state <- 
Tokenizer_state.Data; 1264 + emit_current_comment () 1265 + | _ -> 1266 + t.state <- Tokenizer_state.Comment 1267 + 1268 + and state_comment_start_dash () = 1269 + match Tokenizer_stream.peek t.stream with 1270 + | Some '-' -> 1271 + Tokenizer_stream.advance t.stream; 1272 + t.state <- Tokenizer_state.Comment_end 1273 + | Some '>' -> 1274 + Tokenizer_stream.advance t.stream; 1275 + error t "abrupt-closing-of-empty-comment"; 1276 + t.state <- Tokenizer_state.Data; 1277 + emit_current_comment () 1278 + | None -> () 1279 + | Some _ -> 1280 + Buffer.add_char t.current_comment '-'; 1281 + t.state <- Tokenizer_state.Comment 1282 + 1283 + and state_comment () = 1284 + match Tokenizer_stream.consume t.stream with 1285 + | Some '<' -> 1286 + Buffer.add_char t.current_comment '<'; 1287 + t.state <- Tokenizer_state.Comment_less_than_sign 1288 + | Some '-' -> 1289 + t.state <- Tokenizer_state.Comment_end_dash 1290 + | Some '\x00' -> 1291 + error t "unexpected-null-character"; 1292 + Buffer.add_string t.current_comment "\xEF\xBF\xBD" 1293 + | Some c -> 1294 + check_control_char c; 1295 + Buffer.add_char t.current_comment c 1296 + | None -> () 1297 + 1298 + and state_comment_less_than_sign () = 1299 + match Tokenizer_stream.peek t.stream with 1300 + | Some '!' 
-> 1301 + Tokenizer_stream.advance t.stream; 1302 + Buffer.add_char t.current_comment '!'; 1303 + t.state <- Tokenizer_state.Comment_less_than_sign_bang 1304 + | Some '<' -> 1305 + Tokenizer_stream.advance t.stream; 1306 + Buffer.add_char t.current_comment '<' 1307 + | _ -> 1308 + t.state <- Tokenizer_state.Comment 1309 + 1310 + and state_comment_less_than_sign_bang () = 1311 + match Tokenizer_stream.peek t.stream with 1312 + | Some '-' -> 1313 + Tokenizer_stream.advance t.stream; 1314 + t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash 1315 + | _ -> 1316 + t.state <- Tokenizer_state.Comment 1317 + 1318 + and state_comment_less_than_sign_bang_dash () = 1319 + match Tokenizer_stream.peek t.stream with 1320 + | Some '-' -> 1321 + Tokenizer_stream.advance t.stream; 1322 + t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash_dash 1323 + | _ -> 1324 + t.state <- Tokenizer_state.Comment_end_dash 1325 + 1326 + and state_comment_less_than_sign_bang_dash_dash () = 1327 + match Tokenizer_stream.peek t.stream with 1328 + | Some '>' | None -> 1329 + t.state <- Tokenizer_state.Comment_end 1330 + | Some _ -> 1331 + error t "nested-comment"; 1332 + t.state <- Tokenizer_state.Comment_end 1333 + 1334 + and state_comment_end_dash () = 1335 + match Tokenizer_stream.peek t.stream with 1336 + | Some '-' -> 1337 + Tokenizer_stream.advance t.stream; 1338 + t.state <- Tokenizer_state.Comment_end 1339 + | None -> () 1340 + | Some _ -> 1341 + Buffer.add_char t.current_comment '-'; 1342 + t.state <- Tokenizer_state.Comment 1343 + 1344 + and state_comment_end () = 1345 + match Tokenizer_stream.peek t.stream with 1346 + | Some '>' -> 1347 + Tokenizer_stream.advance t.stream; 1348 + t.state <- Tokenizer_state.Data; 1349 + emit_current_comment () 1350 + | Some '!' 
-> 1351 + Tokenizer_stream.advance t.stream; 1352 + t.state <- Tokenizer_state.Comment_end_bang 1353 + | Some '-' -> 1354 + Tokenizer_stream.advance t.stream; 1355 + Buffer.add_char t.current_comment '-' 1356 + | None -> () 1357 + | Some _ -> 1358 + Buffer.add_string t.current_comment "--"; 1359 + t.state <- Tokenizer_state.Comment 1360 + 1361 + and state_comment_end_bang () = 1362 + match Tokenizer_stream.peek t.stream with 1363 + | Some '-' -> 1364 + Tokenizer_stream.advance t.stream; 1365 + Buffer.add_string t.current_comment "--!"; 1366 + t.state <- Tokenizer_state.Comment_end_dash 1367 + | Some '>' -> 1368 + Tokenizer_stream.advance t.stream; 1369 + error t "incorrectly-closed-comment"; 1370 + t.state <- Tokenizer_state.Data; 1371 + emit_current_comment () 1372 + | None -> () 1373 + | Some _ -> 1374 + Buffer.add_string t.current_comment "--!"; 1375 + t.state <- Tokenizer_state.Comment 1376 + 1377 + and state_doctype () = 1378 + match Tokenizer_stream.peek t.stream with 1379 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1380 + Tokenizer_stream.advance t.stream; 1381 + t.state <- Tokenizer_state.Before_doctype_name 1382 + | Some '>' -> 1383 + t.state <- Tokenizer_state.Before_doctype_name 1384 + | None -> () 1385 + | Some _ -> 1386 + error t "missing-whitespace-before-doctype-name"; 1387 + t.state <- Tokenizer_state.Before_doctype_name 1388 + 1389 + and state_before_doctype_name () = 1390 + match Tokenizer_stream.peek t.stream with 1391 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1392 + Tokenizer_stream.advance t.stream 1393 + | Some '\x00' -> 1394 + Tokenizer_stream.advance t.stream; 1395 + error t "unexpected-null-character"; 1396 + start_new_doctype t; 1397 + t.current_doctype_name <- Some (Buffer.create 8); 1398 + Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"; 1399 + t.state <- Tokenizer_state.Doctype_name 1400 + | Some '>' -> 1401 + Tokenizer_stream.advance t.stream; 1402 + error t "missing-doctype-name"; 1403 + start_new_doctype t; 1404 + 
t.current_doctype_force_quirks <- true; 1405 + t.state <- Tokenizer_state.Data; 1406 + emit_current_doctype () 1407 + | None -> () 1408 + | Some c -> 1409 + Tokenizer_stream.advance t.stream; 1410 + check_control_char c; 1411 + start_new_doctype t; 1412 + t.current_doctype_name <- Some (Buffer.create 8); 1413 + Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c); 1414 + t.state <- Tokenizer_state.Doctype_name 1415 + 1416 + and state_doctype_name () = 1417 + match Tokenizer_stream.consume t.stream with 1418 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1419 + t.state <- Tokenizer_state.After_doctype_name 1420 + | Some '>' -> 1421 + t.state <- Tokenizer_state.Data; 1422 + emit_current_doctype () 1423 + | Some '\x00' -> 1424 + error t "unexpected-null-character"; 1425 + Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD" 1426 + | Some c -> 1427 + check_control_char c; 1428 + Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c) 1429 + | None -> () 1430 + 1431 + and state_after_doctype_name () = 1432 + match Tokenizer_stream.peek t.stream with 1433 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1434 + Tokenizer_stream.advance t.stream 1435 + | Some '>' -> 1436 + Tokenizer_stream.advance t.stream; 1437 + t.state <- Tokenizer_state.Data; 1438 + emit_current_doctype () 1439 + | None -> () 1440 + | Some _ -> 1441 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1442 + if Tokenizer_stream.matches_ci t.stream "PUBLIC" then begin 1443 + ignore (Tokenizer_stream.consume_exact_ci t.stream "PUBLIC"); 1444 + t.state <- Tokenizer_state.After_doctype_public_keyword 1445 + end else if Tokenizer_stream.matches_ci t.stream "SYSTEM" then begin 1446 + ignore (Tokenizer_stream.consume_exact_ci t.stream "SYSTEM"); 1447 + t.state <- Tokenizer_state.After_doctype_system_keyword 1448 + end else begin 1449 + error t "invalid-character-sequence-after-doctype-name"; 1450 + t.current_doctype_force_quirks <- true; 1451 + t.state 
<- Tokenizer_state.Bogus_doctype 1452 + end 1453 + 1454 + and state_after_doctype_public_keyword () = 1455 + match Tokenizer_stream.peek t.stream with 1456 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1457 + Tokenizer_stream.advance t.stream; 1458 + t.state <- Tokenizer_state.Before_doctype_public_identifier 1459 + | Some '"' -> 1460 + Tokenizer_stream.advance t.stream; 1461 + error t "missing-whitespace-after-doctype-public-keyword"; 1462 + t.current_doctype_public <- Some (Buffer.create 32); 1463 + t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted 1464 + | Some '\'' -> 1465 + Tokenizer_stream.advance t.stream; 1466 + error t "missing-whitespace-after-doctype-public-keyword"; 1467 + t.current_doctype_public <- Some (Buffer.create 32); 1468 + t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted 1469 + | Some '>' -> 1470 + Tokenizer_stream.advance t.stream; 1471 + error t "missing-doctype-public-identifier"; 1472 + t.current_doctype_force_quirks <- true; 1473 + t.state <- Tokenizer_state.Data; 1474 + emit_current_doctype () 1475 + | None -> () 1476 + | Some _ -> 1477 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1478 + error t "missing-quote-before-doctype-public-identifier"; 1479 + t.current_doctype_force_quirks <- true; 1480 + t.state <- Tokenizer_state.Bogus_doctype 1481 + 1482 + and state_before_doctype_public_identifier () = 1483 + match Tokenizer_stream.peek t.stream with 1484 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1485 + Tokenizer_stream.advance t.stream 1486 + | Some '"' -> 1487 + Tokenizer_stream.advance t.stream; 1488 + t.current_doctype_public <- Some (Buffer.create 32); 1489 + t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted 1490 + | Some '\'' -> 1491 + Tokenizer_stream.advance t.stream; 1492 + t.current_doctype_public <- Some (Buffer.create 32); 1493 + t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted 1494 + | Some '>' -> 1495 + Tokenizer_stream.advance 
t.stream; 1496 + error t "missing-doctype-public-identifier"; 1497 + t.current_doctype_force_quirks <- true; 1498 + t.state <- Tokenizer_state.Data; 1499 + emit_current_doctype () 1500 + | None -> () 1501 + | Some _ -> 1502 + error t "missing-quote-before-doctype-public-identifier"; 1503 + t.current_doctype_force_quirks <- true; 1504 + t.state <- Tokenizer_state.Bogus_doctype 1505 + 1506 + and state_doctype_public_identifier_double_quoted () = 1507 + match Tokenizer_stream.consume t.stream with 1508 + | Some '"' -> 1509 + t.state <- Tokenizer_state.After_doctype_public_identifier 1510 + | Some '\x00' -> 1511 + error t "unexpected-null-character"; 1512 + Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD" 1513 + | Some '>' -> 1514 + error t "abrupt-doctype-public-identifier"; 1515 + t.current_doctype_force_quirks <- true; 1516 + t.state <- Tokenizer_state.Data; 1517 + emit_current_doctype () 1518 + | Some c -> 1519 + check_control_char c; 1520 + Buffer.add_char (Option.get t.current_doctype_public) c 1521 + | None -> () 1522 + 1523 + and state_doctype_public_identifier_single_quoted () = 1524 + match Tokenizer_stream.consume t.stream with 1525 + | Some '\'' -> 1526 + t.state <- Tokenizer_state.After_doctype_public_identifier 1527 + | Some '\x00' -> 1528 + error t "unexpected-null-character"; 1529 + Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD" 1530 + | Some '>' -> 1531 + error t "abrupt-doctype-public-identifier"; 1532 + t.current_doctype_force_quirks <- true; 1533 + t.state <- Tokenizer_state.Data; 1534 + emit_current_doctype () 1535 + | Some c -> 1536 + check_control_char c; 1537 + Buffer.add_char (Option.get t.current_doctype_public) c 1538 + | None -> () 1539 + 1540 + and state_after_doctype_public_identifier () = 1541 + match Tokenizer_stream.peek t.stream with 1542 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1543 + Tokenizer_stream.advance t.stream; 1544 + t.state <- 
Tokenizer_state.Between_doctype_public_and_system_identifiers 1545 + | Some '>' -> 1546 + Tokenizer_stream.advance t.stream; 1547 + t.state <- Tokenizer_state.Data; 1548 + emit_current_doctype () 1549 + | Some '"' -> 1550 + Tokenizer_stream.advance t.stream; 1551 + error t "missing-whitespace-between-doctype-public-and-system-identifiers"; 1552 + t.current_doctype_system <- Some (Buffer.create 32); 1553 + t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted 1554 + | Some '\'' -> 1555 + Tokenizer_stream.advance t.stream; 1556 + error t "missing-whitespace-between-doctype-public-and-system-identifiers"; 1557 + t.current_doctype_system <- Some (Buffer.create 32); 1558 + t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted 1559 + | None -> () 1560 + | Some _ -> 1561 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1562 + error t "missing-quote-before-doctype-system-identifier"; 1563 + t.current_doctype_force_quirks <- true; 1564 + t.state <- Tokenizer_state.Bogus_doctype 1565 + 1566 + and state_between_doctype_public_and_system_identifiers () = 1567 + match Tokenizer_stream.peek t.stream with 1568 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1569 + Tokenizer_stream.advance t.stream 1570 + | Some '>' -> 1571 + Tokenizer_stream.advance t.stream; 1572 + t.state <- Tokenizer_state.Data; 1573 + emit_current_doctype () 1574 + | Some '"' -> 1575 + Tokenizer_stream.advance t.stream; 1576 + t.current_doctype_system <- Some (Buffer.create 32); 1577 + t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted 1578 + | Some '\'' -> 1579 + Tokenizer_stream.advance t.stream; 1580 + t.current_doctype_system <- Some (Buffer.create 32); 1581 + t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted 1582 + | None -> () 1583 + | Some _ -> 1584 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1585 + error t "missing-quote-before-doctype-system-identifier"; 1586 + 
t.current_doctype_force_quirks <- true; 1587 + t.state <- Tokenizer_state.Bogus_doctype 1588 + 1589 + and state_after_doctype_system_keyword () = 1590 + match Tokenizer_stream.peek t.stream with 1591 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1592 + Tokenizer_stream.advance t.stream; 1593 + t.state <- Tokenizer_state.Before_doctype_system_identifier 1594 + | Some '"' -> 1595 + Tokenizer_stream.advance t.stream; 1596 + error t "missing-whitespace-after-doctype-system-keyword"; 1597 + t.current_doctype_system <- Some (Buffer.create 32); 1598 + t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted 1599 + | Some '\'' -> 1600 + Tokenizer_stream.advance t.stream; 1601 + error t "missing-whitespace-after-doctype-system-keyword"; 1602 + t.current_doctype_system <- Some (Buffer.create 32); 1603 + t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted 1604 + | Some '>' -> 1605 + Tokenizer_stream.advance t.stream; 1606 + error t "missing-doctype-system-identifier"; 1607 + t.current_doctype_force_quirks <- true; 1608 + t.state <- Tokenizer_state.Data; 1609 + emit_current_doctype () 1610 + | None -> () 1611 + | Some _ -> 1612 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1613 + error t "missing-quote-before-doctype-system-identifier"; 1614 + t.current_doctype_force_quirks <- true; 1615 + t.state <- Tokenizer_state.Bogus_doctype 1616 + 1617 + and state_before_doctype_system_identifier () = 1618 + match Tokenizer_stream.peek t.stream with 1619 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1620 + Tokenizer_stream.advance t.stream 1621 + | Some '"' -> 1622 + Tokenizer_stream.advance t.stream; 1623 + t.current_doctype_system <- Some (Buffer.create 32); 1624 + t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted 1625 + | Some '\'' -> 1626 + Tokenizer_stream.advance t.stream; 1627 + t.current_doctype_system <- Some (Buffer.create 32); 1628 + t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted 1629 + | 
Some '>' -> 1630 + Tokenizer_stream.advance t.stream; 1631 + error t "missing-doctype-system-identifier"; 1632 + t.current_doctype_force_quirks <- true; 1633 + t.state <- Tokenizer_state.Data; 1634 + emit_current_doctype () 1635 + | None -> () 1636 + | Some _ -> 1637 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1638 + error t "missing-quote-before-doctype-system-identifier"; 1639 + t.current_doctype_force_quirks <- true; 1640 + t.state <- Tokenizer_state.Bogus_doctype 1641 + 1642 + and state_doctype_system_identifier_double_quoted () = 1643 + match Tokenizer_stream.consume t.stream with 1644 + | Some '"' -> 1645 + t.state <- Tokenizer_state.After_doctype_system_identifier 1646 + | Some '\x00' -> 1647 + error t "unexpected-null-character"; 1648 + Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD" 1649 + | Some '>' -> 1650 + error t "abrupt-doctype-system-identifier"; 1651 + t.current_doctype_force_quirks <- true; 1652 + t.state <- Tokenizer_state.Data; 1653 + emit_current_doctype () 1654 + | Some c -> 1655 + check_control_char c; 1656 + Buffer.add_char (Option.get t.current_doctype_system) c 1657 + | None -> () 1658 + 1659 + and state_doctype_system_identifier_single_quoted () = 1660 + match Tokenizer_stream.consume t.stream with 1661 + | Some '\'' -> 1662 + t.state <- Tokenizer_state.After_doctype_system_identifier 1663 + | Some '\x00' -> 1664 + error t "unexpected-null-character"; 1665 + Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD" 1666 + | Some '>' -> 1667 + error t "abrupt-doctype-system-identifier"; 1668 + t.current_doctype_force_quirks <- true; 1669 + t.state <- Tokenizer_state.Data; 1670 + emit_current_doctype () 1671 + | Some c -> 1672 + check_control_char c; 1673 + Buffer.add_char (Option.get t.current_doctype_system) c 1674 + | None -> () 1675 + 1676 + and state_after_doctype_system_identifier () = 1677 + match Tokenizer_stream.peek t.stream with 1678 + | Some ('\t' | '\n' | 
'\x0C' | ' ') -> 1679 + Tokenizer_stream.advance t.stream 1680 + | Some '>' -> 1681 + Tokenizer_stream.advance t.stream; 1682 + t.state <- Tokenizer_state.Data; 1683 + emit_current_doctype () 1684 + | None -> () 1685 + | Some _ -> 1686 + (* Don't check control char here - bogus_doctype will check when it consumes *) 1687 + error t "unexpected-character-after-doctype-system-identifier"; 1688 + t.state <- Tokenizer_state.Bogus_doctype 1689 + 1690 + and state_bogus_doctype () = 1691 + match Tokenizer_stream.consume t.stream with 1692 + | Some '>' -> 1693 + t.state <- Tokenizer_state.Data; 1694 + emit_current_doctype () 1695 + | Some '\x00' -> 1696 + error t "unexpected-null-character" 1697 + | Some c -> 1698 + check_control_char c (* Check all chars in bogus doctype *) 1699 + | None -> () 1700 + 1701 + and state_cdata_section () = 1702 + match Tokenizer_stream.consume t.stream with 1703 + | Some ']' -> 1704 + t.state <- Tokenizer_state.Cdata_section_bracket 1705 + | Some c -> 1706 + (* CDATA section emits all characters as-is, including NUL, but still check for control chars *) 1707 + emit_char_checked c 1708 + | None -> () 1709 + 1710 + and state_cdata_section_bracket () = 1711 + match Tokenizer_stream.peek t.stream with 1712 + | Some ']' -> 1713 + Tokenizer_stream.advance t.stream; 1714 + t.state <- Tokenizer_state.Cdata_section_end 1715 + | _ -> 1716 + emit_char t ']'; 1717 + t.state <- Tokenizer_state.Cdata_section 1718 + 1719 + and state_cdata_section_end () = 1720 + match Tokenizer_stream.peek t.stream with 1721 + | Some ']' -> 1722 + Tokenizer_stream.advance t.stream; 1723 + emit_char t ']' 1724 + | Some '>' -> 1725 + Tokenizer_stream.advance t.stream; 1726 + t.state <- Tokenizer_state.Data 1727 + | _ -> 1728 + emit_str t "]]"; 1729 + t.state <- Tokenizer_state.Cdata_section 1730 + 1731 + and state_character_reference () = 1732 + Buffer.clear t.temp_buffer; 1733 + Buffer.add_char t.temp_buffer '&'; 1734 + match Tokenizer_stream.peek t.stream with 1735 + | Some 
c when is_ascii_alnum c -> 1736 + t.state <- Tokenizer_state.Named_character_reference 1737 + | Some '#' -> 1738 + Tokenizer_stream.advance t.stream; 1739 + Buffer.add_char t.temp_buffer '#'; 1740 + t.state <- Tokenizer_state.Numeric_character_reference 1741 + | _ -> 1742 + flush_code_points_consumed_as_char_ref t; 1743 + t.state <- t.return_state 1744 + 1745 + and state_named_character_reference () = 1746 + (* Collect alphanumeric characters *) 1747 + let rec collect () = 1748 + match Tokenizer_stream.peek t.stream with 1749 + | Some c when is_ascii_alnum c -> 1750 + Tokenizer_stream.advance t.stream; 1751 + Buffer.add_char t.temp_buffer c; 1752 + collect () 1753 + | _ -> () 1754 + in 1755 + collect (); 1756 + 1757 + let has_semicolon = 1758 + match Tokenizer_stream.peek t.stream with 1759 + | Some ';' -> Tokenizer_stream.advance t.stream; Buffer.add_char t.temp_buffer ';'; true 1760 + | _ -> false 1761 + in 1762 + 1763 + (* Try to match entity - buffer contains "&name" or "&name;" *) 1764 + let buf_contents = Buffer.contents t.temp_buffer in 1765 + let name_start = 1 in (* Skip '&' *) 1766 + let name_end = String.length buf_contents - (if has_semicolon then 1 else 0) in 1767 + let entity_name = String.sub buf_contents name_start (name_end - name_start) in 1768 + 1769 + (* Try progressively shorter matches *) 1770 + (* Only match if: 1771 + 1. Full match with semicolon, OR 1772 + 2. 
Legacy entity (can be used without semicolon) *) 1773 + let rec try_match len = 1774 + if len <= 0 then None 1775 + else 1776 + let prefix = String.sub entity_name 0 len in 1777 + let is_full = len = String.length entity_name in 1778 + let would_have_semi = has_semicolon && is_full in 1779 + (* Only use this match if it has semicolon or is a legacy entity *) 1780 + if would_have_semi || Entities.is_legacy prefix then 1781 + match Entities.lookup prefix with 1782 + | Some decoded -> Some (decoded, len) 1783 + | None -> try_match (len - 1) 1784 + else 1785 + try_match (len - 1) 1786 + in 1787 + 1788 + match try_match (String.length entity_name) with 1789 + | Some (decoded, matched_len) -> 1790 + let full_match = matched_len = String.length entity_name in 1791 + let ends_with_semi = has_semicolon && full_match in 1792 + 1793 + (* Check attribute context restrictions *) 1794 + let in_attribute = match t.return_state with 1795 + | Tokenizer_state.Attribute_value_double_quoted 1796 + | Tokenizer_state.Attribute_value_single_quoted 1797 + | Tokenizer_state.Attribute_value_unquoted -> true 1798 + | _ -> false 1799 + in 1800 + 1801 + let next_char = 1802 + if full_match && not has_semicolon then 1803 + Tokenizer_stream.peek t.stream 1804 + else if not full_match then 1805 + Some entity_name.[matched_len] 1806 + else None 1807 + in 1808 + 1809 + let blocked = in_attribute && not ends_with_semi && 1810 + match next_char with 1811 + | Some '=' -> true 1812 + | Some c when is_ascii_alnum c -> true 1813 + | _ -> false 1814 + in 1815 + 1816 + if blocked then begin 1817 + flush_code_points_consumed_as_char_ref t; 1818 + t.state <- t.return_state 1819 + end else begin 1820 + if not ends_with_semi then 1821 + error t "missing-semicolon-after-character-reference"; 1822 + Buffer.clear t.temp_buffer; 1823 + Buffer.add_string t.temp_buffer decoded; 1824 + flush_code_points_consumed_as_char_ref t; 1825 + (* Emit unconsumed chars after partial match *) 1826 + if not full_match then begin 
1827 + let unconsumed = String.sub entity_name matched_len (String.length entity_name - matched_len) in 1828 + emit_str t unconsumed; 1829 + (* If there was a semicolon in input but we didn't use the full match, emit the semicolon too *) 1830 + if has_semicolon then 1831 + emit_char t ';' 1832 + end; 1833 + t.state <- t.return_state 1834 + end 1835 + | None -> 1836 + (* No match - check if we should report unknown-named-character-reference *) 1837 + if String.length entity_name > 0 then begin 1838 + (* If we have a semicolon, it's definitely an unknown named character reference *) 1839 + if has_semicolon then 1840 + error t "unknown-named-character-reference"; 1841 + (* Emit all the chars we consumed *) 1842 + flush_code_points_consumed_as_char_ref t; 1843 + t.state <- t.return_state 1844 + end else begin 1845 + flush_code_points_consumed_as_char_ref t; 1846 + t.state <- t.return_state 1847 + end 1848 + 1849 + and state_ambiguous_ampersand () = 1850 + match Tokenizer_stream.peek t.stream with 1851 + | Some c when is_ascii_alnum c -> 1852 + Tokenizer_stream.advance t.stream; 1853 + (match t.return_state with 1854 + | Tokenizer_state.Attribute_value_double_quoted 1855 + | Tokenizer_state.Attribute_value_single_quoted 1856 + | Tokenizer_state.Attribute_value_unquoted -> 1857 + Buffer.add_char t.current_attr_value c 1858 + | _ -> 1859 + emit_char t c) 1860 + | Some ';' -> 1861 + error t "unknown-named-character-reference"; 1862 + t.state <- t.return_state 1863 + | _ -> 1864 + t.state <- t.return_state 1865 + 1866 + and state_numeric_character_reference () = 1867 + t.char_ref_code <- 0; 1868 + match Tokenizer_stream.peek t.stream with 1869 + | Some (('x' | 'X') as c) -> 1870 + Tokenizer_stream.advance t.stream; 1871 + Buffer.add_char t.temp_buffer c; 1872 + t.state <- Tokenizer_state.Hexadecimal_character_reference_start 1873 + | _ -> 1874 + t.state <- Tokenizer_state.Decimal_character_reference_start 1875 + 1876 + and state_hexadecimal_character_reference_start () = 
1877 + match Tokenizer_stream.peek t.stream with 1878 + | Some c when is_ascii_hex c -> 1879 + t.state <- Tokenizer_state.Hexadecimal_character_reference 1880 + | _ -> 1881 + error t "absence-of-digits-in-numeric-character-reference"; 1882 + flush_code_points_consumed_as_char_ref t; 1883 + t.state <- t.return_state 1884 + 1885 + and state_decimal_character_reference_start () = 1886 + match Tokenizer_stream.peek t.stream with 1887 + | Some c when is_ascii_digit c -> 1888 + t.state <- Tokenizer_state.Decimal_character_reference 1889 + | _ -> 1890 + error t "absence-of-digits-in-numeric-character-reference"; 1891 + flush_code_points_consumed_as_char_ref t; 1892 + t.state <- t.return_state 1893 + 1894 + and state_hexadecimal_character_reference () = 1895 + match Tokenizer_stream.peek t.stream with 1896 + | Some c when is_ascii_digit c -> 1897 + Tokenizer_stream.advance t.stream; 1898 + t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code '0'); 1899 + if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1900 + | Some c when c >= 'A' && c <= 'F' -> 1901 + Tokenizer_stream.advance t.stream; 1902 + t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'A' + 10); 1903 + if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1904 + | Some c when c >= 'a' && c <= 'f' -> 1905 + Tokenizer_stream.advance t.stream; 1906 + t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'a' + 10); 1907 + if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1908 + | Some ';' -> 1909 + Tokenizer_stream.advance t.stream; 1910 + t.state <- Tokenizer_state.Numeric_character_reference_end 1911 + | _ -> 1912 + error t "missing-semicolon-after-character-reference"; 1913 + t.state <- Tokenizer_state.Numeric_character_reference_end 1914 + 1915 + and state_decimal_character_reference () = 1916 + match Tokenizer_stream.peek t.stream with 1917 + | Some c when is_ascii_digit c -> 1918 + Tokenizer_stream.advance t.stream; 1919 
+ t.char_ref_code <- t.char_ref_code * 10 + (Char.code c - Char.code '0'); 1920 + if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1921 + | Some ';' -> 1922 + Tokenizer_stream.advance t.stream; 1923 + t.state <- Tokenizer_state.Numeric_character_reference_end 1924 + | _ -> 1925 + error t "missing-semicolon-after-character-reference"; 1926 + t.state <- Tokenizer_state.Numeric_character_reference_end 1927 + 1928 + and state_numeric_character_reference_end () = 1929 + let code = t.char_ref_code in 1930 + let replacement_char = "\xEF\xBF\xBD" in 1931 + 1932 + let result = 1933 + if code = 0 then begin 1934 + error t "null-character-reference"; 1935 + replacement_char 1936 + end else if code > 0x10FFFF then begin 1937 + error t "character-reference-outside-unicode-range"; 1938 + replacement_char 1939 + end else if code >= 0xD800 && code <= 0xDFFF then begin 1940 + error t "surrogate-character-reference"; 1941 + replacement_char 1942 + end else if (code >= 0xFDD0 && code <= 0xFDEF) || 1943 + List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF; 1944 + 0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF; 1945 + 0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF; 1946 + 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF; 1947 + 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF; 1948 + 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin 1949 + error t "noncharacter-character-reference"; 1950 + Entities.Numeric_ref.codepoint_to_utf8 code 1951 + end else if (code >= 0x01 && code <= 0x08) || code = 0x0B || 1952 + (code >= 0x0D && code <= 0x1F) || 1953 + (code >= 0x7F && code <= 0x9F) then begin 1954 + error t "control-character-reference"; 1955 + (* Apply Windows-1252 replacement table for 0x80-0x9F *) 1956 + match Entities.Numeric_ref.find_replacement code with 1957 + | Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement 1958 + | None -> Entities.Numeric_ref.codepoint_to_utf8 code 1959 + end else 1960 + 
Entities.Numeric_ref.codepoint_to_utf8 code 1961 + in 1962 + 1963 + Buffer.clear t.temp_buffer; 1964 + Buffer.add_string t.temp_buffer result; 1965 + flush_code_points_consumed_as_char_ref t; 1966 + t.state <- t.return_state 1967 + 1968 + in 1969 + process_state () 1970 + 1971 + let get_errors t = List.rev t.errors 1972 + 1973 + let set_state t state = t.state <- state 1974 + 1975 + let set_last_start_tag t name = t.last_start_tag <- name
lib/parser/constants.ml lib/html5rw/parser/parser_constants.ml
-4
lib/parser/dune
··· 1 - (library 2 - (name html5rw_parser) 3 - (public_name html5rw.parser) 4 - (libraries astring bytesrw html5rw.tokenizer html5rw.dom html5rw.encoding html5rw.selector))
-41
lib/parser/html5rw_parser.ml
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 - SPDX-License-Identifier: MIT 4 - ---------------------------------------------------------------------------*) 5 - 6 - (* html5rw.parser - HTML5 parser with bytesrw-only API *) 7 - 8 - module Dom = Html5rw_dom 9 - module Tokenizer = Html5rw_tokenizer 10 - module Encoding = Html5rw_encoding 11 - module Constants = Constants 12 - module Insertion_mode = Insertion_mode 13 - module Tree_builder = Tree_builder 14 - 15 - type parse_error = Parser.parse_error 16 - type fragment_context = Parser.fragment_context 17 - type t = Parser.t 18 - 19 - (* parse_error accessors *) 20 - let error_code (e : parse_error) = e.Tree_builder.code 21 - let error_line (e : parse_error) = e.Tree_builder.line 22 - let error_column (e : parse_error) = e.Tree_builder.column 23 - 24 - (* fragment_context constructor and accessors *) 25 - let make_fragment_context ~tag_name ?(namespace=None) () : fragment_context = 26 - { Tree_builder.tag_name; namespace } 27 - 28 - let fragment_context_tag (ctx : fragment_context) = ctx.Tree_builder.tag_name 29 - let fragment_context_namespace (ctx : fragment_context) = ctx.Tree_builder.namespace 30 - 31 - let parse = Parser.parse 32 - let parse_bytes = Parser.parse_bytes 33 - let query = Parser.query 34 - let to_writer = Parser.to_writer 35 - let to_string = Parser.to_string 36 - let to_text = Parser.to_text 37 - let to_test_format = Parser.to_test_format 38 - 39 - let root t = t.Parser.root 40 - let errors t = t.Parser.errors 41 - let encoding t = t.Parser.encoding
+5 -5
lib/parser/html5rw_parser.mli lib/html5rw/parser/parser.mli
··· 115 115 (** {1 Sub-modules} *) 116 116 117 117 (** DOM types and manipulation. *) 118 - module Dom = Html5rw_dom 118 + module Dom = Dom 119 119 120 120 (** HTML5 tokenizer. 121 121 ··· 125 125 126 126 @see <https://html.spec.whatwg.org/multipage/parsing.html#tokenization> 127 127 WHATWG: Tokenization *) 128 - module Tokenizer = Html5rw_tokenizer 128 + module Tokenizer = Tokenizer 129 129 130 130 (** Character encoding detection and conversion. 131 131 132 132 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding> 133 133 WHATWG: Determining the character encoding *) 134 - module Encoding = Html5rw_encoding 134 + module Encoding = Encoding 135 135 136 136 (** HTML element constants and categories. 137 137 ··· 480 480 481 481 {b Supported selectors:} 482 482 483 - See {!Html5rw_selector} for the complete list. Key selectors include: 483 + See {!Selector} for the complete list. Key selectors include: 484 484 - Type: [div], [p], [a] 485 485 - ID: [#myid] 486 486 - Class: [.myclass] ··· 488 488 - Pseudo-class: [:first-child], [:nth-child(2)] 489 489 - Combinators: [div p] (descendant), [div > p] (child) 490 490 491 - @raise Html5rw_selector.Selector_error if the selector is invalid 491 + @raise Selector.Selector_error if the selector is invalid 492 492 493 493 @see <https://www.w3.org/TR/selectors-4/> 494 494 W3C: Selectors Level 4 *)
lib/parser/insertion_mode.ml lib/html5rw/parser/parser_insertion_mode.ml
+22 -22
lib/parser/parser.ml lib/html5rw/parser/parser_impl.ml
··· 7 7 8 8 open Bytesrw 9 9 10 - module Dom = Html5rw_dom 11 - module Tokenizer = Html5rw_tokenizer 12 - module Encoding = Html5rw_encoding 10 + module Dom = Dom 11 + module Tokenizer = Tokenizer 12 + module Encoding = Encoding 13 13 14 - type parse_error = Tree_builder.parse_error 14 + type parse_error = Parser_tree_builder.parse_error 15 15 16 - type fragment_context = Tree_builder.fragment_context 16 + type fragment_context = Parser_tree_builder.fragment_context 17 17 18 18 type t = { 19 19 root : Dom.node; ··· 23 23 24 24 (* Token sink that feeds tokens to tree builder *) 25 25 module TreeBuilderSink = struct 26 - type t = Tree_builder.t 26 + type t = Parser_tree_builder.t 27 27 28 28 let process tb token = 29 - Tree_builder.process_token tb token; 29 + Parser_tree_builder.process_token tb token; 30 30 (* Check if we need to switch tokenizer state based on current element *) 31 31 (* Only switch for HTML namespace elements - SVG/MathML use different rules *) 32 - match Tree_builder.current_node tb with 32 + match Parser_tree_builder.current_node tb with 33 33 | Some node when node.Dom.namespace = None || node.Dom.namespace = Some "html" -> 34 34 let name = node.Dom.name in 35 35 if List.mem name ["textarea"; "title"] then 36 - `SwitchTo Tokenizer.State.Rcdata 36 + `SwitchTo Tokenizer_state.Rcdata 37 37 else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then 38 - `SwitchTo Tokenizer.State.Rawtext 38 + `SwitchTo Tokenizer_state.Rawtext 39 39 else if name = "script" then 40 - `SwitchTo Tokenizer.State.Script_data 40 + `SwitchTo Tokenizer_state.Script_data 41 41 else if name = "plaintext" then 42 - `SwitchTo Tokenizer.State.Plaintext 42 + `SwitchTo Tokenizer_state.Plaintext 43 43 else 44 44 `Continue 45 45 | _ -> `Continue 46 46 47 47 let adjusted_current_node_in_html_namespace tb = 48 - Tree_builder.adjusted_current_node_in_html_namespace tb 48 + Parser_tree_builder.adjusted_current_node_in_html_namespace tb 49 49 end 50 50 51 51 (* Core 
parsing function that takes a Bytes.Reader.t *) 52 52 let parse ?(collect_errors=false) ?fragment_context (reader : Bytes.Reader.t) = 53 - let tb = Tree_builder.create ~collect_errors ?fragment_context () in 53 + let tb = Parser_tree_builder.create ~collect_errors ?fragment_context () in 54 54 let tokenizer = Tokenizer.create (module TreeBuilderSink) tb ~collect_errors () in 55 55 56 56 (* Set tokenizer state for fragment parsing *) ··· 62 62 | Some ctx when ctx.namespace = None || ctx.namespace = Some "html" -> 63 63 let name = String.lowercase_ascii ctx.tag_name in 64 64 if List.mem name ["title"; "textarea"] then 65 - Tokenizer.set_state tokenizer Tokenizer.State.Rcdata 65 + Tokenizer.set_state tokenizer Tokenizer_state.Rcdata 66 66 else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then 67 - Tokenizer.set_state tokenizer Tokenizer.State.Rawtext 67 + Tokenizer.set_state tokenizer Tokenizer_state.Rawtext 68 68 else if name = "script" then 69 - Tokenizer.set_state tokenizer Tokenizer.State.Script_data 69 + Tokenizer.set_state tokenizer Tokenizer_state.Script_data 70 70 else if name = "plaintext" then 71 - Tokenizer.set_state tokenizer Tokenizer.State.Plaintext 71 + Tokenizer.set_state tokenizer Tokenizer_state.Plaintext 72 72 | _ -> ()); 73 73 74 74 Tokenizer.run tokenizer (module TreeBuilderSink) reader; 75 75 76 - let root = Tree_builder.finish tb in 76 + let root = Parser_tree_builder.finish tb in 77 77 let tokenizer_errors = Tokenizer.get_errors tokenizer in 78 - let tree_errors = Tree_builder.get_errors tb in 78 + let tree_errors = Parser_tree_builder.get_errors tb in 79 79 let all_errors = List.map (fun e -> 80 - { Tree_builder.code = e.Tokenizer.Errors.code; 80 + { Parser_tree_builder.code = e.Tokenizer.Errors.code; 81 81 line = e.Tokenizer.Errors.line; 82 82 column = e.Tokenizer.Errors.column } 83 83 ) tokenizer_errors @ tree_errors in ··· 92 92 { result with encoding = Some enc } 93 93 94 94 let query t selector = 95 - 
Html5rw_selector.query t.root selector 95 + Selector.query t.root selector 96 96 97 97 (* Serialize to a Bytes.Writer.t *) 98 98 let to_writer ?(pretty=true) ?(indent_size=2) t (writer : Bytes.Writer.t) =
+172 -172
lib/parser/tree_builder.ml lib/html5rw/parser/parser_tree_builder.ml
··· 1 1 (* HTML5 Tree Builder *) 2 2 3 - module Dom = Html5rw_dom 4 - module Token = Html5rw_tokenizer.Token 5 - module State = Html5rw_tokenizer.State 3 + module Dom = Dom 4 + module Token = Tokenizer_token 5 + module State = Tokenizer_state 6 6 7 7 type fragment_context = { 8 8 tag_name : string; ··· 25 25 26 26 type t = { 27 27 mutable document : Dom.node; 28 - mutable mode : Insertion_mode.t; 29 - mutable original_mode : Insertion_mode.t option; 28 + mutable mode : Parser_insertion_mode.t; 29 + mutable original_mode : Parser_insertion_mode.t option; 30 30 mutable open_elements : Dom.node list; 31 31 mutable active_formatting : formatting_entry list; 32 32 mutable head_element : Dom.node option; ··· 35 35 mutable ignore_lf : bool; 36 36 mutable foster_parenting : bool; 37 37 mutable pending_table_chars : string list; 38 - mutable template_modes : Insertion_mode.t list; 38 + mutable template_modes : Parser_insertion_mode.t list; 39 39 mutable quirks_mode : Dom.quirks_mode; 40 40 mutable errors : parse_error list; 41 41 collect_errors : bool; ··· 49 49 let doc = if is_fragment then Dom.create_document_fragment () else Dom.create_document () in 50 50 let t = { 51 51 document = doc; 52 - mode = Insertion_mode.Initial; 52 + mode = Parser_insertion_mode.Initial; 53 53 original_mode = None; 54 54 open_elements = []; 55 55 active_formatting = []; ··· 86 86 | _ -> ()); 87 87 (* Set initial mode based on context *) 88 88 t.mode <- ( 89 - if name = "html" then Insertion_mode.Before_head 89 + if name = "html" then Parser_insertion_mode.Before_head 90 90 else if List.mem name ["tbody"; "thead"; "tfoot"] && (ns = None || ns = Some "html") then 91 - Insertion_mode.In_table_body 91 + Parser_insertion_mode.In_table_body 92 92 else if name = "tr" && (ns = None || ns = Some "html") then 93 - Insertion_mode.In_row 93 + Parser_insertion_mode.In_row 94 94 else if List.mem name ["td"; "th"] && (ns = None || ns = Some "html") then 95 - Insertion_mode.In_cell 95 + 
Parser_insertion_mode.In_cell 96 96 else if name = "caption" && (ns = None || ns = Some "html") then 97 - Insertion_mode.In_caption 97 + Parser_insertion_mode.In_caption 98 98 else if name = "colgroup" && (ns = None || ns = Some "html") then 99 - Insertion_mode.In_column_group 99 + Parser_insertion_mode.In_column_group 100 100 else if name = "table" && (ns = None || ns = Some "html") then 101 - Insertion_mode.In_table 101 + Parser_insertion_mode.In_table 102 102 else if name = "template" && (ns = None || ns = Some "html") then begin 103 - t.template_modes <- [Insertion_mode.In_template]; 104 - Insertion_mode.In_template 103 + t.template_modes <- [Parser_insertion_mode.In_template]; 104 + Parser_insertion_mode.In_template 105 105 end 106 106 else 107 - Insertion_mode.In_body 107 + Parser_insertion_mode.In_body 108 108 ); 109 109 t.frameset_ok <- false 110 110 | None -> ()); ··· 136 136 let is_special_element node = 137 137 let name = String.lowercase_ascii node.Dom.name in 138 138 match node.Dom.namespace with 139 - | None | Some "html" -> Constants.is_special name 139 + | None | Some "html" -> Parser_constants.is_special name 140 140 | Some "mathml" -> List.mem name ["mi"; "mo"; "mn"; "ms"; "mtext"; "annotation-xml"] 141 141 | Some "svg" -> List.mem name ["foreignobject"; "desc"; "title"] 142 142 | _ -> false ··· 213 213 let insert_foreign_element t (tag : Token.tag) namespace = 214 214 let attrs = 215 215 if namespace = Some "svg" then 216 - Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs tag.attrs) 216 + Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs tag.attrs) 217 217 else 218 - Constants.adjust_foreign_attrs tag.attrs 218 + Parser_constants.adjust_foreign_attrs tag.attrs 219 219 in 220 220 let name = 221 - if namespace = Some "svg" then Constants.adjust_svg_tag_name tag.name 221 + if namespace = Some "svg" then Parser_constants.adjust_svg_tag_name tag.name 222 222 else tag.name 223 223 in 224 224 let node = insert_element t 
name ~namespace attrs in ··· 282 282 let is_html_integration_point node = 283 283 (* SVG foreignObject, desc, and title are always HTML integration points *) 284 284 if node.Dom.namespace = Some "svg" && 285 - List.mem node.Dom.name Constants.svg_html_integration then true 285 + List.mem node.Dom.name Parser_constants.svg_html_integration then true 286 286 (* annotation-xml is an HTML integration point only with specific encoding values *) 287 287 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then 288 288 match List.assoc_opt "encoding" node.Dom.attrs with ··· 313 313 check t.open_elements 314 314 315 315 let has_element_in_scope t name = 316 - has_element_in_scope_impl t [name] Constants.default_scope ~check_integration_points:true 316 + has_element_in_scope_impl t [name] Parser_constants.default_scope ~check_integration_points:true 317 317 318 318 let has_element_in_button_scope t name = 319 - has_element_in_scope_impl t [name] Constants.button_scope ~check_integration_points:true 319 + has_element_in_scope_impl t [name] Parser_constants.button_scope ~check_integration_points:true 320 320 321 321 let has_element_in_list_item_scope t name = 322 - has_element_in_scope_impl t [name] Constants.list_item_scope ~check_integration_points:true 322 + has_element_in_scope_impl t [name] Parser_constants.list_item_scope ~check_integration_points:true 323 323 324 324 let has_element_in_table_scope t name = 325 - has_element_in_scope_impl t [name] Constants.table_scope ~check_integration_points:false 325 + has_element_in_scope_impl t [name] Parser_constants.table_scope ~check_integration_points:false 326 326 327 327 let has_element_in_select_scope t name = 328 328 let rec check = function 329 329 | [] -> false 330 330 | n :: rest -> 331 331 if n.Dom.name = name then true 332 - else if not (List.mem n.Dom.name Constants.select_scope_exclude) then false 332 + else if not (List.mem n.Dom.name Parser_constants.select_scope_exclude) then false 333 
333 else check rest 334 334 in 335 335 check t.open_elements ··· 338 338 let generate_implied_end_tags t ?except () = 339 339 let rec loop () = 340 340 match current_node t with 341 - | Some n when List.mem n.Dom.name Constants.implied_end_tags -> 341 + | Some n when List.mem n.Dom.name Parser_constants.implied_end_tags -> 342 342 (match except with 343 343 | Some ex when n.Dom.name = ex -> () 344 344 | _ -> pop_current t; loop ()) ··· 349 349 let generate_all_implied_end_tags t = 350 350 let rec loop () = 351 351 match current_node t with 352 - | Some n when List.mem n.Dom.name Constants.thoroughly_implied_end_tags -> 352 + | Some n when List.mem n.Dom.name Parser_constants.thoroughly_implied_end_tags -> 353 353 pop_current t; loop () 354 354 | _ -> () 355 355 in ··· 662 662 (* Reset insertion mode *) 663 663 let reset_insertion_mode t = 664 664 let rec check_node last = function 665 - | [] -> t.mode <- Insertion_mode.In_body 665 + | [] -> t.mode <- Parser_insertion_mode.In_body 666 666 | node :: rest -> 667 667 let is_last = rest = [] in 668 668 let node_to_check = ··· 678 678 let rec find_table_or_template = function 679 679 | [] -> () 680 680 | n :: rest -> 681 - if n.Dom.name = "template" then t.mode <- Insertion_mode.In_select 682 - else if n.Dom.name = "table" then t.mode <- Insertion_mode.In_select_in_table 681 + if n.Dom.name = "template" then t.mode <- Parser_insertion_mode.In_select 682 + else if n.Dom.name = "table" then t.mode <- Parser_insertion_mode.In_select_in_table 683 683 else find_table_or_template rest 684 684 in 685 685 find_table_or_template rest 686 686 end; 687 - if t.mode <> Insertion_mode.In_select_in_table then 688 - t.mode <- Insertion_mode.In_select 687 + if t.mode <> Parser_insertion_mode.In_select_in_table then 688 + t.mode <- Parser_insertion_mode.In_select 689 689 end else if List.mem name ["td"; "th"] && not is_last then 690 - t.mode <- Insertion_mode.In_cell 690 + t.mode <- Parser_insertion_mode.In_cell 691 691 else if name = 
"tr" then 692 - t.mode <- Insertion_mode.In_row 692 + t.mode <- Parser_insertion_mode.In_row 693 693 else if List.mem name ["tbody"; "thead"; "tfoot"] then 694 - t.mode <- Insertion_mode.In_table_body 694 + t.mode <- Parser_insertion_mode.In_table_body 695 695 else if name = "caption" then 696 - t.mode <- Insertion_mode.In_caption 696 + t.mode <- Parser_insertion_mode.In_caption 697 697 else if name = "colgroup" then 698 - t.mode <- Insertion_mode.In_column_group 698 + t.mode <- Parser_insertion_mode.In_column_group 699 699 else if name = "table" then 700 - t.mode <- Insertion_mode.In_table 700 + t.mode <- Parser_insertion_mode.In_table 701 701 else if name = "template" then 702 - t.mode <- (match t.template_modes with m :: _ -> m | [] -> Insertion_mode.In_template) 702 + t.mode <- (match t.template_modes with m :: _ -> m | [] -> Parser_insertion_mode.In_template) 703 703 else if name = "head" && not is_last then 704 - t.mode <- Insertion_mode.In_head 704 + t.mode <- Parser_insertion_mode.In_head 705 705 else if name = "body" then 706 - t.mode <- Insertion_mode.In_body 706 + t.mode <- Parser_insertion_mode.In_body 707 707 else if name = "frameset" then 708 - t.mode <- Insertion_mode.In_frameset 708 + t.mode <- Parser_insertion_mode.In_frameset 709 709 else if name = "html" then 710 - t.mode <- (if t.head_element = None then Insertion_mode.Before_head else Insertion_mode.After_head) 710 + t.mode <- (if t.head_element = None then Parser_insertion_mode.Before_head else Parser_insertion_mode.After_head) 711 711 else if is_last then 712 - t.mode <- Insertion_mode.In_body 712 + t.mode <- Parser_insertion_mode.In_body 713 713 else 714 714 check_node last rest 715 715 in ··· 737 737 let sys = Option.map String.lowercase_ascii dt.system_id in 738 738 let is_quirky = 739 739 (match pub with 740 - | Some p -> List.mem p Constants.quirky_public_matches || 740 + | Some p -> List.mem p Parser_constants.quirky_public_matches || 741 741 List.exists (fun prefix -> String.length p 
>= String.length prefix && 742 - String.sub p 0 (String.length prefix) = prefix) Constants.quirky_public_prefixes 742 + String.sub p 0 (String.length prefix) = prefix) Parser_constants.quirky_public_prefixes 743 743 | None -> false) || 744 744 (match sys with 745 - | Some s -> List.mem s Constants.quirky_system_matches 745 + | Some s -> List.mem s Parser_constants.quirky_system_matches 746 746 | None -> false) 747 747 in 748 748 if is_quirky then t.quirks_mode <- Dom.Quirks ··· 751 751 match pub with 752 752 | Some p -> List.exists (fun prefix -> String.length p >= String.length prefix && 753 753 String.sub p 0 (String.length prefix) = prefix) 754 - Constants.limited_quirky_public_prefixes 754 + Parser_constants.limited_quirky_public_prefixes 755 755 | None -> false 756 756 in 757 757 if is_limited_quirky then t.quirks_mode <- Dom.Limited_quirks 758 758 end 759 759 end; 760 - t.mode <- Insertion_mode.Before_html 760 + t.mode <- Parser_insertion_mode.Before_html 761 761 | _ -> 762 762 parse_error t "expected-doctype-but-got-other"; 763 763 t.quirks_mode <- Dom.Quirks; 764 - t.mode <- Insertion_mode.Before_html; 764 + t.mode <- Parser_insertion_mode.Before_html; 765 765 process_token t token 766 766 767 767 and process_before_html t token = ··· 772 772 | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } -> 773 773 let html = insert_element t "html" attrs in 774 774 t.open_elements <- [html]; 775 - t.mode <- Insertion_mode.Before_head 775 + t.mode <- Parser_insertion_mode.Before_head 776 776 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] -> 777 777 let html = insert_element t "html" [] in 778 778 t.open_elements <- [html]; 779 - t.mode <- Insertion_mode.Before_head; 779 + t.mode <- Parser_insertion_mode.Before_head; 780 780 process_token t token 781 781 | Token.Tag { kind = Token.End; _ } -> 782 782 parse_error t "unexpected-end-tag" 783 783 | _ -> 784 784 let html = insert_element t "html" [] in 785 785 
t.open_elements <- [html]; 786 - t.mode <- Insertion_mode.Before_head; 786 + t.mode <- Parser_insertion_mode.Before_head; 787 787 process_token t token 788 788 789 789 and process_before_head t token = ··· 797 797 let head = insert_element t "head" attrs in 798 798 t.open_elements <- head :: t.open_elements; 799 799 t.head_element <- Some head; 800 - t.mode <- Insertion_mode.In_head 800 + t.mode <- Parser_insertion_mode.In_head 801 801 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] -> 802 802 let head = insert_element t "head" [] in 803 803 t.open_elements <- head :: t.open_elements; 804 804 t.head_element <- Some head; 805 - t.mode <- Insertion_mode.In_head; 805 + t.mode <- Parser_insertion_mode.In_head; 806 806 process_token t token 807 807 | Token.Tag { kind = Token.End; _ } -> 808 808 parse_error t "unexpected-end-tag" ··· 810 810 let head = insert_element t "head" [] in 811 811 t.open_elements <- head :: t.open_elements; 812 812 t.head_element <- Some head; 813 - t.mode <- Insertion_mode.In_head; 813 + t.mode <- Parser_insertion_mode.In_head; 814 814 process_token t token 815 815 816 816 and process_in_head t token = ··· 834 834 | Some n when n.Dom.children <> [] -> insert_character t leading_ws 835 835 | _ -> ()); 836 836 pop_current t; 837 - t.mode <- Insertion_mode.After_head; 837 + t.mode <- Parser_insertion_mode.After_head; 838 838 process_token t (Token.Character remaining) 839 839 | Token.Comment data -> 840 840 insert_comment t data ··· 848 848 | Token.Tag { kind = Token.Start; name = "title"; _ } -> 849 849 ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs = []; self_closing = false }); 850 850 t.original_mode <- Some t.mode; 851 - t.mode <- Insertion_mode.Text 851 + t.mode <- Parser_insertion_mode.Text 852 852 | Token.Tag { kind = Token.Start; name; _ } 853 853 when List.mem name ["noframes"; "style"] -> 854 854 ignore (insert_element_for_token t { kind = Token.Start; name; 
attrs = []; self_closing = false }); 855 855 t.original_mode <- Some t.mode; 856 - t.mode <- Insertion_mode.Text 856 + t.mode <- Parser_insertion_mode.Text 857 857 | Token.Tag { kind = Token.Start; name = "noscript"; _ } -> 858 858 (* Scripting is disabled: parse noscript content as HTML *) 859 859 ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs = []; self_closing = false }); 860 - t.mode <- Insertion_mode.In_head_noscript 860 + t.mode <- Parser_insertion_mode.In_head_noscript 861 861 | Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } -> 862 862 ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing }); 863 863 t.original_mode <- Some t.mode; 864 - t.mode <- Insertion_mode.Text 864 + t.mode <- Parser_insertion_mode.Text 865 865 | Token.Tag { kind = Token.End; name = "head"; _ } -> 866 866 pop_current t; 867 - t.mode <- Insertion_mode.After_head 867 + t.mode <- Parser_insertion_mode.After_head 868 868 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] -> 869 869 pop_current t; 870 - t.mode <- Insertion_mode.After_head; 870 + t.mode <- Parser_insertion_mode.After_head; 871 871 process_token t token 872 872 | Token.Tag { kind = Token.Start; name = "template"; attrs; _ } -> 873 873 let node = Dom.create_template ~attrs () in ··· 876 876 t.open_elements <- node :: t.open_elements; 877 877 push_formatting_marker t; 878 878 t.frameset_ok <- false; 879 - t.mode <- Insertion_mode.In_template; 880 - t.template_modes <- Insertion_mode.In_template :: t.template_modes 879 + t.mode <- Parser_insertion_mode.In_template; 880 + t.template_modes <- Parser_insertion_mode.In_template :: t.template_modes 881 881 | Token.Tag { kind = Token.End; name = "template"; _ } -> 882 882 if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then 883 883 parse_error t "unexpected-end-tag" ··· 897 897 parse_error t 
"unexpected-end-tag" 898 898 | _ -> 899 899 pop_current t; 900 - t.mode <- Insertion_mode.After_head; 900 + t.mode <- Parser_insertion_mode.After_head; 901 901 process_token t token 902 902 903 903 and process_in_head_noscript t token = ··· 907 907 | Token.Character _ -> 908 908 parse_error t "unexpected-char-in-noscript"; 909 909 pop_current t; (* Pop noscript *) 910 - t.mode <- Insertion_mode.In_head; 910 + t.mode <- Parser_insertion_mode.In_head; 911 911 process_token t token 912 912 | Token.Comment _ -> 913 913 process_in_head t token ··· 924 924 | Token.Tag { kind = Token.Start; _ } -> 925 925 parse_error t "unexpected-start-tag"; 926 926 pop_current t; (* Pop noscript *) 927 - t.mode <- Insertion_mode.In_head; 927 + t.mode <- Parser_insertion_mode.In_head; 928 928 process_token t token 929 929 | Token.Tag { kind = Token.End; name = "noscript"; _ } -> 930 930 pop_current t; (* Pop noscript *) 931 - t.mode <- Insertion_mode.In_head 931 + t.mode <- Parser_insertion_mode.In_head 932 932 | Token.Tag { kind = Token.End; name = "br"; _ } -> 933 933 parse_error t "unexpected-end-tag"; 934 934 pop_current t; (* Pop noscript *) 935 - t.mode <- Insertion_mode.In_head; 935 + t.mode <- Parser_insertion_mode.In_head; 936 936 process_token t token 937 937 | Token.Tag { kind = Token.End; _ } -> 938 938 parse_error t "unexpected-end-tag" 939 939 | Token.EOF -> 940 940 parse_error t "expected-closing-tag-but-got-eof"; 941 941 pop_current t; (* Pop noscript *) 942 - t.mode <- Insertion_mode.In_head; 942 + t.mode <- Parser_insertion_mode.In_head; 943 943 process_token t token 944 944 945 945 and process_after_head t token = ··· 955 955 | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } -> 956 956 ignore (insert_element t "body" ~push:true attrs); 957 957 t.frameset_ok <- false; 958 - t.mode <- Insertion_mode.In_body 958 + t.mode <- Parser_insertion_mode.In_body 959 959 | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> 960 960 ignore (insert_element t 
"frameset" ~push:true attrs); 961 - t.mode <- Insertion_mode.In_frameset 961 + t.mode <- Parser_insertion_mode.In_frameset 962 962 | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } -> 963 963 (* Special handling for input type="hidden" - parse error, ignore *) 964 964 let input_type = List.assoc_opt "type" attrs in ··· 969 969 (* Non-hidden input creates body *) 970 970 let body = insert_element t "body" [] in 971 971 t.open_elements <- body :: t.open_elements; 972 - t.mode <- Insertion_mode.In_body; 972 + t.mode <- Parser_insertion_mode.In_body; 973 973 process_token t token) 974 974 | Token.Tag { kind = Token.Start; name; _ } 975 975 when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] -> ··· 985 985 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] -> 986 986 let body = insert_element t "body" [] in 987 987 t.open_elements <- body :: t.open_elements; 988 - t.mode <- Insertion_mode.In_body; 988 + t.mode <- Parser_insertion_mode.In_body; 989 989 process_token t token 990 990 | Token.Tag { kind = Token.Start; name = "head"; _ } -> 991 991 parse_error t "unexpected-start-tag" ··· 994 994 | _ -> 995 995 let body = insert_element t "body" [] in 996 996 t.open_elements <- body :: t.open_elements; 997 - t.mode <- Insertion_mode.In_body; 997 + t.mode <- Parser_insertion_mode.In_body; 998 998 process_token t token 999 999 1000 1000 and process_in_body t token = ··· 1060 1060 t.open_elements <- drop (idx + 1) t.open_elements; 1061 1061 (* Insert frameset element *) 1062 1062 ignore (insert_element t "frameset" ~push:true attrs); 1063 - t.mode <- Insertion_mode.In_frameset 1063 + t.mode <- Parser_insertion_mode.In_frameset 1064 1064 end 1065 1065 | Token.EOF -> 1066 1066 if t.template_modes <> [] then ··· 1079 1079 not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; 
"html"]) 1080 1080 ) t.open_elements in 1081 1081 if has_unclosed then parse_error t "end-tag-too-early"; 1082 - t.mode <- Insertion_mode.After_body 1082 + t.mode <- Parser_insertion_mode.After_body 1083 1083 end 1084 1084 | Token.Tag { kind = Token.End; name = "html"; _ } -> 1085 1085 if not (has_element_in_scope t "body") then 1086 1086 parse_error t "unexpected-end-tag" 1087 1087 else begin 1088 - t.mode <- Insertion_mode.After_body; 1088 + t.mode <- Parser_insertion_mode.After_body; 1089 1089 process_token t token 1090 1090 end 1091 1091 | Token.Tag { kind = Token.Start; name; attrs; _ } 1092 1092 when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] -> 1093 1093 if has_element_in_button_scope t "p" then close_p_element t; 1094 1094 ignore (insert_element t name ~push:true attrs) 1095 - | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Constants.heading_elements -> 1095 + | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Parser_constants.heading_elements -> 1096 1096 if has_element_in_button_scope t "p" then close_p_element t; 1097 1097 (match current_node t with 1098 - | Some n when List.mem n.Dom.name Constants.heading_elements -> 1098 + | Some n when List.mem n.Dom.name Parser_constants.heading_elements -> 1099 1099 parse_error t "unexpected-start-tag"; 1100 1100 pop_current t 1101 1101 | _ -> ()); ··· 1230 1230 | _ -> ()); 1231 1231 pop_until_tag t name 1232 1232 end 1233 - | Token.Tag { kind = Token.End; name; _ } when List.mem name Constants.heading_elements -> 1234 - if not (has_element_in_scope_impl t Constants.heading_elements Constants.default_scope ~check_integration_points:true) then 1233 + | Token.Tag { kind = Token.End; name; _ } when List.mem name Parser_constants.heading_elements -> 1234 + if not 
(has_element_in_scope_impl t Parser_constants.heading_elements Parser_constants.default_scope ~check_integration_points:true) then 1235 1235 parse_error t "unexpected-end-tag" 1236 1236 else begin 1237 1237 generate_implied_end_tags t (); 1238 1238 (match current_node t with 1239 1239 | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1240 1240 | _ -> ()); 1241 - pop_until_one_of t Constants.heading_elements 1241 + pop_until_one_of t Parser_constants.heading_elements 1242 1242 end 1243 1243 | Token.Tag { kind = Token.Start; name = "a"; attrs; _ } -> 1244 1244 (* Check for existing <a> in active formatting *) ··· 1310 1310 close_p_element t; 1311 1311 ignore (insert_element t "table" ~push:true attrs); 1312 1312 t.frameset_ok <- false; 1313 - t.mode <- Insertion_mode.In_table 1313 + t.mode <- Parser_insertion_mode.In_table 1314 1314 | Token.Tag { kind = Token.End; name = "br"; _ } -> 1315 1315 parse_error t "unexpected-end-tag"; 1316 1316 reconstruct_active_formatting t; ··· 1352 1352 t.ignore_lf <- true; 1353 1353 t.original_mode <- Some t.mode; 1354 1354 t.frameset_ok <- false; 1355 - t.mode <- Insertion_mode.Text 1355 + t.mode <- Parser_insertion_mode.Text 1356 1356 | Token.Tag { kind = Token.Start; name = "xmp"; _ } -> 1357 1357 if has_element_in_button_scope t "p" then close_p_element t; 1358 1358 reconstruct_active_formatting t; 1359 1359 t.frameset_ok <- false; 1360 1360 ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs = []; self_closing = false }); 1361 1361 t.original_mode <- Some t.mode; 1362 - t.mode <- Insertion_mode.Text 1362 + t.mode <- Parser_insertion_mode.Text 1363 1363 | Token.Tag { kind = Token.Start; name = "iframe"; _ } -> 1364 1364 t.frameset_ok <- false; 1365 1365 ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs = []; self_closing = false }); 1366 1366 t.original_mode <- Some t.mode; 1367 - t.mode <- Insertion_mode.Text 1367 + t.mode <- Parser_insertion_mode.Text 
1368 1368 | Token.Tag { kind = Token.Start; name = "noembed"; _ } -> 1369 1369 ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs = []; self_closing = false }); 1370 1370 t.original_mode <- Some t.mode; 1371 - t.mode <- Insertion_mode.Text 1371 + t.mode <- Parser_insertion_mode.Text 1372 1372 | Token.Tag { kind = Token.Start; name = "select"; attrs; _ } -> 1373 1373 reconstruct_active_formatting t; 1374 1374 ignore (insert_element t "select" ~push:true attrs); 1375 1375 t.frameset_ok <- false; 1376 - if List.mem t.mode [Insertion_mode.In_table; Insertion_mode.In_caption; Insertion_mode.In_table_body; Insertion_mode.In_row; Insertion_mode.In_cell] then 1377 - t.mode <- Insertion_mode.In_select_in_table 1376 + if List.mem t.mode [Parser_insertion_mode.In_table; Parser_insertion_mode.In_caption; Parser_insertion_mode.In_table_body; Parser_insertion_mode.In_row; Parser_insertion_mode.In_cell] then 1377 + t.mode <- Parser_insertion_mode.In_select_in_table 1378 1378 else 1379 - t.mode <- Insertion_mode.In_select 1379 + t.mode <- Parser_insertion_mode.In_select 1380 1380 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["optgroup"; "option"] -> 1381 1381 (match current_node t with 1382 1382 | Some n when n.Dom.name = "option" -> pop_current t ··· 1401 1401 ignore (insert_element t name ~push:true attrs) 1402 1402 | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } -> 1403 1403 reconstruct_active_formatting t; 1404 - let adjusted_attrs = Constants.adjust_mathml_attrs (Constants.adjust_foreign_attrs attrs) in 1404 + let adjusted_attrs = Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) in 1405 1405 ignore (insert_foreign_element t { kind = Token.Start; name = "math"; attrs = adjusted_attrs; self_closing } (Some "mathml")); 1406 1406 if self_closing then pop_current t 1407 1407 | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } -> 1408 1408 
reconstruct_active_formatting t; 1409 - let adjusted_attrs = Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs attrs) in 1409 + let adjusted_attrs = Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) in 1410 1410 ignore (insert_foreign_element t { kind = Token.Start; name = "svg"; attrs = adjusted_attrs; self_closing } (Some "svg")); 1411 1411 if self_closing then pop_current t 1412 1412 | Token.Tag { kind = Token.Start; name; attrs; _ } ··· 1448 1448 | Token.EOF -> 1449 1449 parse_error t "expected-closing-tag-but-got-eof"; 1450 1450 pop_current t; 1451 - t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_body; 1451 + t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body; 1452 1452 process_token t token 1453 1453 | Token.Tag { kind = Token.End; _ } -> 1454 1454 pop_current t; 1455 - t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_body 1455 + t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body 1456 1456 | _ -> () 1457 1457 1458 1458 and process_in_table t token = ··· 1460 1460 | Token.Character _ when (match current_node t with Some n -> List.mem n.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] | None -> false) -> 1461 1461 t.pending_table_chars <- []; 1462 1462 t.original_mode <- Some t.mode; 1463 - t.mode <- Insertion_mode.In_table_text; 1463 + t.mode <- Parser_insertion_mode.In_table_text; 1464 1464 process_token t token 1465 1465 | Token.Comment data -> 1466 1466 insert_comment t data ··· 1470 1470 clear_stack_back_to_table_context t; 1471 1471 push_formatting_marker t; 1472 1472 ignore (insert_element t "caption" ~push:true attrs); 1473 - t.mode <- Insertion_mode.In_caption 1473 + t.mode <- Parser_insertion_mode.In_caption 1474 1474 | Token.Tag { kind = Token.Start; name = "colgroup"; attrs; _ } -> 1475 1475 clear_stack_back_to_table_context t; 1476 1476 ignore (insert_element t "colgroup" ~push:true attrs); 1477 - t.mode <- 
Insertion_mode.In_column_group 1477 + t.mode <- Parser_insertion_mode.In_column_group 1478 1478 | Token.Tag { kind = Token.Start; name = "col"; _ } -> 1479 1479 clear_stack_back_to_table_context t; 1480 1480 ignore (insert_element t "colgroup" ~push:true []); 1481 - t.mode <- Insertion_mode.In_column_group; 1481 + t.mode <- Parser_insertion_mode.In_column_group; 1482 1482 process_token t token 1483 1483 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> 1484 1484 clear_stack_back_to_table_context t; 1485 1485 ignore (insert_element t name ~push:true attrs); 1486 - t.mode <- Insertion_mode.In_table_body 1486 + t.mode <- Parser_insertion_mode.In_table_body 1487 1487 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"; "tr"] -> 1488 1488 clear_stack_back_to_table_context t; 1489 1489 ignore (insert_element t "tbody" ~push:true []); 1490 - t.mode <- Insertion_mode.In_table_body; 1490 + t.mode <- Parser_insertion_mode.In_table_body; 1491 1491 process_token t token 1492 1492 | Token.Tag { kind = Token.Start; name = "table"; _ } -> 1493 1493 parse_error t "unexpected-start-tag"; ··· 1568 1568 t.foster_parenting <- false 1569 1569 end else 1570 1570 insert_character t pending; 1571 - t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_table; 1571 + t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_table; 1572 1572 process_token t token 1573 1573 1574 1574 and process_in_caption t token = ··· 1583 1583 | _ -> ()); 1584 1584 pop_until_tag t "caption"; 1585 1585 clear_active_formatting_to_marker t; 1586 - t.mode <- Insertion_mode.In_table 1586 + t.mode <- Parser_insertion_mode.In_table 1587 1587 end 1588 1588 | Token.Tag { kind = Token.Start; name; _ } 1589 1589 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> ··· 1593 1593 generate_implied_end_tags t (); 1594 1594 pop_until_tag t "caption"; 1595 1595 
clear_active_formatting_to_marker t; 1596 - t.mode <- Insertion_mode.In_table; 1596 + t.mode <- Parser_insertion_mode.In_table; 1597 1597 process_token t token 1598 1598 end 1599 1599 | Token.Tag { kind = Token.End; name = "table"; _ } -> ··· 1603 1603 generate_implied_end_tags t (); 1604 1604 pop_until_tag t "caption"; 1605 1605 clear_active_formatting_to_marker t; 1606 - t.mode <- Insertion_mode.In_table; 1606 + t.mode <- Parser_insertion_mode.In_table; 1607 1607 process_token t token 1608 1608 end 1609 1609 | Token.Tag { kind = Token.End; name; _ } ··· 1629 1629 (match current_node t with 1630 1630 | Some n when n.Dom.name = "colgroup" -> 1631 1631 pop_current t; 1632 - t.mode <- Insertion_mode.In_table; 1632 + t.mode <- Parser_insertion_mode.In_table; 1633 1633 process_token t (Token.Character remaining) 1634 1634 | _ -> 1635 1635 parse_error t "unexpected-token") ··· 1646 1646 | Token.Tag { kind = Token.End; name = "colgroup"; _ } -> 1647 1647 (match current_node t with 1648 1648 | Some n when n.Dom.name <> "colgroup" -> parse_error t "unexpected-end-tag" 1649 - | Some _ -> pop_current t; t.mode <- Insertion_mode.In_table 1649 + | Some _ -> pop_current t; t.mode <- Parser_insertion_mode.In_table 1650 1650 | None -> parse_error t "unexpected-end-tag") 1651 1651 | Token.Tag { kind = Token.End; name = "col"; _ } -> 1652 1652 parse_error t "unexpected-end-tag" ··· 1659 1659 (match current_node t with 1660 1660 | Some n when n.Dom.name = "colgroup" -> 1661 1661 pop_current t; 1662 - t.mode <- Insertion_mode.In_table; 1662 + t.mode <- Parser_insertion_mode.In_table; 1663 1663 process_token t token 1664 1664 | _ -> 1665 1665 parse_error t "unexpected-token") ··· 1669 1669 | Token.Tag { kind = Token.Start; name = "tr"; attrs; _ } -> 1670 1670 clear_stack_back_to_table_body_context t; 1671 1671 ignore (insert_element t "tr" ~push:true attrs); 1672 - t.mode <- Insertion_mode.In_row 1672 + t.mode <- Parser_insertion_mode.In_row 1673 1673 | Token.Tag { kind = Token.Start; 
name; _ } when List.mem name ["th"; "td"] -> 1674 1674 parse_error t "unexpected-start-tag"; 1675 1675 clear_stack_back_to_table_body_context t; 1676 1676 ignore (insert_element t "tr" ~push:true []); 1677 - t.mode <- Insertion_mode.In_row; 1677 + t.mode <- Parser_insertion_mode.In_row; 1678 1678 process_token t token 1679 1679 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> 1680 1680 if not (has_element_in_table_scope t name) then ··· 1682 1682 else begin 1683 1683 clear_stack_back_to_table_body_context t; 1684 1684 pop_current t; 1685 - t.mode <- Insertion_mode.In_table 1685 + t.mode <- Parser_insertion_mode.In_table 1686 1686 end 1687 1687 | Token.Tag { kind = Token.Start; name; _ } 1688 1688 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"] -> 1689 - if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Constants.table_scope ~check_integration_points:false) then 1689 + if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then 1690 1690 parse_error t "unexpected-start-tag" 1691 1691 else begin 1692 1692 clear_stack_back_to_table_body_context t; 1693 1693 pop_current t; 1694 - t.mode <- Insertion_mode.In_table; 1694 + t.mode <- Parser_insertion_mode.In_table; 1695 1695 process_token t token 1696 1696 end 1697 1697 | Token.Tag { kind = Token.End; name = "table"; _ } -> 1698 - if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Constants.table_scope ~check_integration_points:false) then 1698 + if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then 1699 1699 parse_error t "unexpected-end-tag" 1700 1700 else begin 1701 1701 clear_stack_back_to_table_body_context t; 1702 1702 pop_current t; 1703 - t.mode <- Insertion_mode.In_table; 1703 + t.mode <- Parser_insertion_mode.In_table; 1704 1704 process_token t token 1705 1705 end 1706 
1706 | Token.Tag { kind = Token.End; name; _ } ··· 1724 1724 | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["th"; "td"] -> 1725 1725 clear_stack_back_to_table_row_context t; 1726 1726 ignore (insert_element t name ~push:true attrs); 1727 - t.mode <- Insertion_mode.In_cell; 1727 + t.mode <- Parser_insertion_mode.In_cell; 1728 1728 push_formatting_marker t 1729 1729 | Token.Tag { kind = Token.End; name = "tr"; _ } -> 1730 1730 if not (has_element_in_table_scope t "tr") then ··· 1732 1732 else begin 1733 1733 clear_stack_back_to_table_row_context t; 1734 1734 pop_current t; 1735 - t.mode <- Insertion_mode.In_table_body 1735 + t.mode <- Parser_insertion_mode.In_table_body 1736 1736 end 1737 1737 | Token.Tag { kind = Token.Start; name; _ } 1738 1738 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"; "tr"] -> ··· 1741 1741 else begin 1742 1742 clear_stack_back_to_table_row_context t; 1743 1743 pop_current t; 1744 - t.mode <- Insertion_mode.In_table_body; 1744 + t.mode <- Parser_insertion_mode.In_table_body; 1745 1745 process_token t token 1746 1746 end 1747 1747 | Token.Tag { kind = Token.End; name = "table"; _ } -> ··· 1750 1750 else begin 1751 1751 clear_stack_back_to_table_row_context t; 1752 1752 pop_current t; 1753 - t.mode <- Insertion_mode.In_table_body; 1753 + t.mode <- Parser_insertion_mode.In_table_body; 1754 1754 process_token t token 1755 1755 end 1756 1756 | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] -> ··· 1761 1761 else begin 1762 1762 clear_stack_back_to_table_row_context t; 1763 1763 pop_current t; 1764 - t.mode <- Insertion_mode.In_table_body; 1764 + t.mode <- Parser_insertion_mode.In_table_body; 1765 1765 process_token t token 1766 1766 end 1767 1767 | Token.Tag { kind = Token.End; name; _ } ··· 1792 1792 | _ -> ()); 1793 1793 pop_until_html_tag t name; 1794 1794 clear_active_formatting_to_marker t; 1795 - t.mode <- Insertion_mode.In_row 1795 + t.mode <- 
Parser_insertion_mode.In_row 1796 1796 end 1797 1797 | Token.Tag { kind = Token.Start; name; _ } 1798 1798 when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1799 - if not (has_element_in_scope_impl t ["td"; "th"] Constants.table_scope ~check_integration_points:false) then 1799 + if not (has_element_in_scope_impl t ["td"; "th"] Parser_constants.table_scope ~check_integration_points:false) then 1800 1800 parse_error t "unexpected-start-tag" 1801 1801 else begin 1802 1802 close_cell t; ··· 1823 1823 | _ -> ()); 1824 1824 pop_until_html_one_of t ["td"; "th"]; 1825 1825 clear_active_formatting_to_marker t; 1826 - t.mode <- Insertion_mode.In_row 1826 + t.mode <- Parser_insertion_mode.In_row 1827 1827 1828 1828 and process_in_select t token = 1829 1829 match token with ··· 1927 1927 ignore (insert_element t name attrs) 1928 1928 (* Don't push to stack - void elements *) 1929 1929 (* Handle formatting elements in select *) 1930 - | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Constants.formatting_elements -> 1930 + | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Parser_constants.formatting_elements -> 1931 1931 reconstruct_active_formatting t; 1932 1932 let node = insert_element t name ~push:true attrs in 1933 1933 push_formatting_element t node name attrs 1934 - | Token.Tag { kind = Token.End; name; _ } when List.mem name Constants.formatting_elements -> 1934 + | Token.Tag { kind = Token.End; name; _ } when List.mem name Parser_constants.formatting_elements -> 1935 1935 (* Find select element and check if formatting element is inside select *) 1936 1936 let select_idx = ref None in 1937 1937 let fmt_idx = ref None in ··· 2021 2021 | Token.Tag { kind = Token.Start; name; _ } 2022 2022 when List.mem name ["caption"; "colgroup"; "tbody"; "tfoot"; "thead"] -> 2023 2023 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2024 - t.template_modes <- 
Insertion_mode.In_table :: t.template_modes; 2025 - t.mode <- Insertion_mode.In_table; 2024 + t.template_modes <- Parser_insertion_mode.In_table :: t.template_modes; 2025 + t.mode <- Parser_insertion_mode.In_table; 2026 2026 process_token t token 2027 2027 | Token.Tag { kind = Token.Start; name = "col"; _ } -> 2028 2028 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2029 - t.template_modes <- Insertion_mode.In_column_group :: t.template_modes; 2030 - t.mode <- Insertion_mode.In_column_group; 2029 + t.template_modes <- Parser_insertion_mode.In_column_group :: t.template_modes; 2030 + t.mode <- Parser_insertion_mode.In_column_group; 2031 2031 process_token t token 2032 2032 | Token.Tag { kind = Token.Start; name = "tr"; _ } -> 2033 2033 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2034 - t.template_modes <- Insertion_mode.In_table_body :: t.template_modes; 2035 - t.mode <- Insertion_mode.In_table_body; 2034 + t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes; 2035 + t.mode <- Parser_insertion_mode.In_table_body; 2036 2036 process_token t token 2037 2037 | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"] -> 2038 2038 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2039 - t.template_modes <- Insertion_mode.In_row :: t.template_modes; 2040 - t.mode <- Insertion_mode.In_row; 2039 + t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes; 2040 + t.mode <- Parser_insertion_mode.In_row; 2041 2041 process_token t token 2042 2042 | Token.Tag { kind = Token.Start; _ } -> 2043 2043 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2044 - t.template_modes <- Insertion_mode.In_body :: t.template_modes; 2045 - t.mode <- Insertion_mode.In_body; 2044 + t.template_modes <- Parser_insertion_mode.In_body :: t.template_modes; 2045 + t.mode <- Parser_insertion_mode.In_body; 2046 2046 process_token 
t token 2047 2047 | Token.Tag { kind = Token.End; _ } -> 2048 2048 parse_error t "unexpected-end-tag" ··· 2076 2076 if t.fragment_context <> None then 2077 2077 parse_error t "unexpected-end-tag" 2078 2078 else 2079 - t.mode <- Insertion_mode.After_after_body 2079 + t.mode <- Parser_insertion_mode.After_after_body 2080 2080 | Token.EOF -> 2081 2081 () (* Stop parsing *) 2082 2082 | _ -> 2083 2083 parse_error t "unexpected-token-after-body"; 2084 - t.mode <- Insertion_mode.In_body; 2084 + t.mode <- Parser_insertion_mode.In_body; 2085 2085 process_token t token 2086 2086 2087 2087 and process_in_frameset t token = ··· 2109 2109 pop_current t; 2110 2110 if t.fragment_context = None then 2111 2111 (match current_node t with 2112 - | Some n when n.Dom.name <> "frameset" -> t.mode <- Insertion_mode.After_frameset 2112 + | Some n when n.Dom.name <> "frameset" -> t.mode <- Parser_insertion_mode.After_frameset 2113 2113 | _ -> ())) 2114 2114 | Token.Tag { kind = Token.Start; name = "frame"; attrs; _ } -> 2115 2115 ignore (insert_element t "frame" ~push:true attrs); ··· 2140 2140 | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2141 2141 process_in_body t token 2142 2142 | Token.Tag { kind = Token.End; name = "html"; _ } -> 2143 - t.mode <- Insertion_mode.After_after_frameset 2143 + t.mode <- Parser_insertion_mode.After_after_frameset 2144 2144 | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> 2145 2145 process_in_head t token 2146 2146 | Token.EOF -> ··· 2162 2162 () (* Stop parsing *) 2163 2163 | _ -> 2164 2164 parse_error t "unexpected-token-after-after-body"; 2165 - t.mode <- Insertion_mode.In_body; 2165 + t.mode <- Parser_insertion_mode.In_body; 2166 2166 process_token t token 2167 2167 2168 2168 and process_after_after_frameset t token = ··· 2193 2193 let is_html_integration_point node = 2194 2194 (* SVG foreignObject, desc, and title are always HTML integration points *) 2195 2195 if node.Dom.namespace = Some "svg" && 2196 - List.mem node.Dom.name 
Constants.svg_html_integration then true 2196 + List.mem node.Dom.name Parser_constants.svg_html_integration then true 2197 2197 (* annotation-xml is an HTML integration point only with specific encoding values *) 2198 2198 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then 2199 2199 match List.assoc_opt "encoding" node.Dom.attrs with ··· 2275 2275 let is_html_integration_point node = 2276 2276 (* SVG foreignObject, desc, and title are always HTML integration points *) 2277 2277 if node.Dom.namespace = Some "svg" && 2278 - List.mem node.Dom.name Constants.svg_html_integration then true 2278 + List.mem node.Dom.name Parser_constants.svg_html_integration then true 2279 2279 (* annotation-xml is an HTML integration point only with specific encoding values *) 2280 2280 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then 2281 2281 match List.assoc_opt "encoding" node.Dom.attrs with ··· 2357 2357 | Token.Tag { kind = Token.Start; name; attrs; self_closing } -> 2358 2358 let name = 2359 2359 match adjusted_current_node t with 2360 - | Some n when n.Dom.namespace = Some "svg" -> Constants.adjust_svg_tag_name name 2360 + | Some n when n.Dom.namespace = Some "svg" -> Parser_constants.adjust_svg_tag_name name 2361 2361 | _ -> name 2362 2362 in 2363 2363 let attrs = 2364 2364 match adjusted_current_node t with 2365 2365 | Some n when n.Dom.namespace = Some "svg" -> 2366 - Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs attrs) 2366 + Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) 2367 2367 | Some n when n.Dom.namespace = Some "mathml" -> 2368 - Constants.adjust_mathml_attrs (Constants.adjust_foreign_attrs attrs) 2369 - | _ -> Constants.adjust_foreign_attrs attrs 2368 + Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) 2369 + | _ -> Parser_constants.adjust_foreign_attrs attrs 2370 2370 in 2371 2371 let namespace = 2372 2372 match 
adjusted_current_node t with ··· 2436 2436 2437 2437 and process_by_mode t token = 2438 2438 match t.mode with 2439 - | Insertion_mode.Initial -> process_initial t token 2440 - | Insertion_mode.Before_html -> process_before_html t token 2441 - | Insertion_mode.Before_head -> process_before_head t token 2442 - | Insertion_mode.In_head -> process_in_head t token 2443 - | Insertion_mode.In_head_noscript -> process_in_head_noscript t token 2444 - | Insertion_mode.After_head -> process_after_head t token 2445 - | Insertion_mode.In_body -> process_in_body t token 2446 - | Insertion_mode.Text -> process_text t token 2447 - | Insertion_mode.In_table -> process_in_table t token 2448 - | Insertion_mode.In_table_text -> process_in_table_text t token 2449 - | Insertion_mode.In_caption -> process_in_caption t token 2450 - | Insertion_mode.In_column_group -> process_in_column_group t token 2451 - | Insertion_mode.In_table_body -> process_in_table_body t token 2452 - | Insertion_mode.In_row -> process_in_row t token 2453 - | Insertion_mode.In_cell -> process_in_cell t token 2454 - | Insertion_mode.In_select -> process_in_select t token 2455 - | Insertion_mode.In_select_in_table -> process_in_select_in_table t token 2456 - | Insertion_mode.In_template -> process_in_template t token 2457 - | Insertion_mode.After_body -> process_after_body t token 2458 - | Insertion_mode.In_frameset -> process_in_frameset t token 2459 - | Insertion_mode.After_frameset -> process_after_frameset t token 2460 - | Insertion_mode.After_after_body -> process_after_after_body t token 2461 - | Insertion_mode.After_after_frameset -> process_after_after_frameset t token 2439 + | Parser_insertion_mode.Initial -> process_initial t token 2440 + | Parser_insertion_mode.Before_html -> process_before_html t token 2441 + | Parser_insertion_mode.Before_head -> process_before_head t token 2442 + | Parser_insertion_mode.In_head -> process_in_head t token 2443 + | Parser_insertion_mode.In_head_noscript -> 
process_in_head_noscript t token 2444 + | Parser_insertion_mode.After_head -> process_after_head t token 2445 + | Parser_insertion_mode.In_body -> process_in_body t token 2446 + | Parser_insertion_mode.Text -> process_text t token 2447 + | Parser_insertion_mode.In_table -> process_in_table t token 2448 + | Parser_insertion_mode.In_table_text -> process_in_table_text t token 2449 + | Parser_insertion_mode.In_caption -> process_in_caption t token 2450 + | Parser_insertion_mode.In_column_group -> process_in_column_group t token 2451 + | Parser_insertion_mode.In_table_body -> process_in_table_body t token 2452 + | Parser_insertion_mode.In_row -> process_in_row t token 2453 + | Parser_insertion_mode.In_cell -> process_in_cell t token 2454 + | Parser_insertion_mode.In_select -> process_in_select t token 2455 + | Parser_insertion_mode.In_select_in_table -> process_in_select_in_table t token 2456 + | Parser_insertion_mode.In_template -> process_in_template t token 2457 + | Parser_insertion_mode.After_body -> process_after_body t token 2458 + | Parser_insertion_mode.In_frameset -> process_in_frameset t token 2459 + | Parser_insertion_mode.After_frameset -> process_after_frameset t token 2460 + | Parser_insertion_mode.After_after_body -> process_after_after_body t token 2461 + | Parser_insertion_mode.After_after_frameset -> process_after_after_frameset t token 2462 2462 2463 2463 (* Populate selectedcontent elements with content from selected option *) 2464 2464 let find_elements name node =
-4
lib/selector/dune
··· 1 - (library 2 - (name html5rw_selector) 3 - (public_name html5rw.selector) 4 - (libraries astring html5rw.dom))
lib/selector/html5rw_selector.ml lib/html5rw/selector/selector.ml
+2 -2
lib/selector/html5rw_selector.mli lib/html5rw/selector/selector.mli
··· 131 131 @raise Selector_error if the selector is malformed. 132 132 *) 133 133 134 - val query : Html5rw_dom.node -> string -> Html5rw_dom.node list 134 + val query : Dom.node -> string -> Dom.node list 135 135 (** Query the DOM tree with a CSS selector. 136 136 137 137 Returns all nodes matching the selector in document order. ··· 143 143 ]} 144 144 *) 145 145 146 - val matches : Html5rw_dom.node -> string -> bool 146 + val matches : Dom.node -> string -> bool 147 147 (** Check if a node matches a CSS selector. 148 148 149 149 @raise Selector_error if the selector is malformed.
lib/selector/selector_ast.ml lib/html5rw/selector/selector_ast.ml
lib/selector/selector_lexer.ml lib/html5rw/selector/selector_lexer.ml
+1 -1
lib/selector/selector_match.ml lib/html5rw/selector/selector_match.ml
··· 1 1 (* CSS selector matching *) 2 2 3 - module Dom = Html5rw_dom 3 + module Dom = Dom 4 4 open Selector_ast 5 5 6 6 (* Use Astring for string operations *)
lib/selector/selector_parser.ml lib/html5rw/selector/selector_parser.ml
lib/selector/selector_token.ml lib/html5rw/selector/selector_token.ml
-4
lib/tokenizer/dune
··· 1 - (library 2 - (name html5rw_tokenizer) 3 - (public_name html5rw.tokenizer) 4 - (libraries astring bytesrw html5rw.entities))
lib/tokenizer/errors.ml lib/html5rw/tokenizer/tokenizer_errors.ml
-16
lib/tokenizer/html5rw_tokenizer.ml
··· 1 - (* html5rw.tokenizer - HTML5 tokenizer with bytesrw-only API *) 2 - 3 - module Token = Token 4 - module State = State 5 - module Errors = Errors 6 - module Stream = Stream 7 - 8 - module type SINK = Tokenizer.SINK 9 - 10 - type 'a t = 'a Tokenizer.t 11 - 12 - let create = Tokenizer.create 13 - let run = Tokenizer.run 14 - let get_errors = Tokenizer.get_errors 15 - let set_state = Tokenizer.set_state 16 - let set_last_start_tag = Tokenizer.set_last_start_tag
+10 -10
lib/tokenizer/html5rw_tokenizer.mli lib/html5rw/tokenizer/tokenizer.mli
··· 15 15 16 16 (** Token types produced by the tokenizer. *) 17 17 module Token : sig 18 - type tag_kind = Token.tag_kind = Start | End 18 + type tag_kind = Tokenizer_token.tag_kind = Start | End 19 19 20 - type doctype = Token.doctype = { 20 + type doctype = Tokenizer_token.doctype = { 21 21 name : string option; 22 22 public_id : string option; 23 23 system_id : string option; 24 24 force_quirks : bool; 25 25 } 26 26 27 - type tag = Token.tag = { 27 + type tag = Tokenizer_token.tag = { 28 28 kind : tag_kind; 29 29 name : string; 30 30 attrs : (string * string) list; 31 31 self_closing : bool; 32 32 } 33 33 34 - type t = Token.t = 34 + type t = Tokenizer_token.t = 35 35 | Tag of tag 36 36 | Character of string 37 37 | Comment of string ··· 54 54 55 55 (** Tokenizer states. *) 56 56 module State : sig 57 - type t = State.t = 57 + type t = Tokenizer_state.t = 58 58 | Data 59 59 | Rcdata 60 60 | Rawtext ··· 139 139 140 140 (** Parse error types. *) 141 141 module Errors : sig 142 - type t = Errors.t = { 142 + type t = Tokenizer_errors.t = { 143 143 code : string; 144 144 line : int; 145 145 column : int; ··· 151 151 152 152 (** Input stream with position tracking. *) 153 153 module Stream : sig 154 - type t = Stream.t 154 + type t = Tokenizer_stream.t 155 155 156 156 val create : string -> t 157 157 val create_from_reader : Bytesrw.Bytes.Reader.t -> t ··· 170 170 *) 171 171 module type SINK = sig 172 172 type t 173 - val process : t -> Token.t -> [ `Continue | `SwitchTo of State.t ] 173 + val process : t -> Tokenizer_token.t -> [ `Continue | `SwitchTo of Tokenizer_state.t ] 174 174 val adjusted_current_node_in_html_namespace : t -> bool 175 175 end 176 176 ··· 204 204 function for each token until EOF is reached. 205 205 *) 206 206 207 - val get_errors : 'sink t -> Errors.t list 207 + val get_errors : 'sink t -> Tokenizer_errors.t list 208 208 (** Get the list of parse errors encountered during tokenization. 
209 209 210 210 Only populated if [collect_errors:true] was passed to {!create}. 211 211 *) 212 212 213 - val set_state : 'sink t -> State.t -> unit 213 + val set_state : 'sink t -> Tokenizer_state.t -> unit 214 214 (** Set the tokenizer state. 215 215 216 216 Used by the tree builder to switch states for raw text elements.
lib/tokenizer/state.ml lib/html5rw/tokenizer/tokenizer_state.ml
lib/tokenizer/stream.ml lib/html5rw/tokenizer/tokenizer_stream.ml
lib/tokenizer/token.ml lib/html5rw/tokenizer/tokenizer_token.ml
-1975
lib/tokenizer/tokenizer.ml
··· 1 - (* HTML5 Tokenizer - implements WHATWG tokenization algorithm *) 2 - 3 - (* Character classification using Astring *) 4 - let is_ascii_alpha = Astring.Char.Ascii.is_letter 5 - let is_ascii_digit = Astring.Char.Ascii.is_digit 6 - let is_ascii_hex = Astring.Char.Ascii.is_hex_digit 7 - let is_ascii_alnum = Astring.Char.Ascii.is_alphanum 8 - let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' 9 - let ascii_lower = Astring.Char.Ascii.lowercase 10 - 11 - (* Token sink interface *) 12 - module type SINK = sig 13 - type t 14 - val process : t -> Token.t -> [ `Continue | `SwitchTo of State.t ] 15 - val adjusted_current_node_in_html_namespace : t -> bool 16 - end 17 - 18 - type 'sink t = { 19 - mutable stream : Stream.t; 20 - sink : 'sink; 21 - mutable state : State.t; 22 - mutable return_state : State.t; 23 - mutable char_ref_code : int; 24 - mutable temp_buffer : Buffer.t; 25 - mutable last_start_tag : string; 26 - mutable current_tag_name : Buffer.t; 27 - mutable current_tag_kind : Token.tag_kind; 28 - mutable current_tag_self_closing : bool; 29 - mutable current_attr_name : Buffer.t; 30 - mutable current_attr_value : Buffer.t; 31 - mutable current_attrs : (string * string) list; 32 - mutable current_doctype_name : Buffer.t option; 33 - mutable current_doctype_public : Buffer.t option; 34 - mutable current_doctype_system : Buffer.t option; 35 - mutable current_doctype_force_quirks : bool; 36 - mutable current_comment : Buffer.t; 37 - mutable pending_chars : Buffer.t; 38 - mutable errors : Errors.t list; 39 - collect_errors : bool; 40 - xml_mode : bool; (* XML violation mode: transform chars for XML compatibility *) 41 - } 42 - 43 - let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) ?(xml_mode=false) () = { 44 - stream = Stream.create ""; 45 - sink; 46 - state = State.Data; 47 - return_state = State.Data; 48 - char_ref_code = 0; 49 - temp_buffer = Buffer.create 64; 50 - last_start_tag = ""; 51 - 
current_tag_name = Buffer.create 32; 52 - current_tag_kind = Token.Start; 53 - current_tag_self_closing = false; 54 - current_attr_name = Buffer.create 32; 55 - current_attr_value = Buffer.create 64; 56 - current_attrs = []; 57 - current_doctype_name = None; 58 - current_doctype_public = None; 59 - current_doctype_system = None; 60 - current_doctype_force_quirks = false; 61 - current_comment = Buffer.create 64; 62 - pending_chars = Buffer.create 256; 63 - errors = []; 64 - collect_errors; 65 - xml_mode; 66 - } 67 - 68 - let error t code = 69 - if t.collect_errors then begin 70 - let (line, column) = Stream.position t.stream in 71 - t.errors <- Errors.make ~code ~line ~column :: t.errors 72 - end 73 - 74 - (* emit functions are defined locally inside run *) 75 - 76 - (* XML mode character transformation: form feed → space *) 77 - let emit_char t c = 78 - if t.xml_mode && c = '\x0C' then 79 - Buffer.add_char t.pending_chars ' ' 80 - else 81 - Buffer.add_char t.pending_chars c 82 - 83 - (* XML mode string transformation: U+FFFF → U+FFFD, form feed → space *) 84 - let emit_str t s = 85 - if t.xml_mode then begin 86 - (* Transform: \xEF\xBF\xBF (U+FFFF) → \xEF\xBF\xBD (U+FFFD), \x0C → space *) 87 - let len = String.length s in 88 - let i = ref 0 in 89 - while !i < len do 90 - let c = s.[!i] in 91 - if c = '\x0C' then begin 92 - Buffer.add_char t.pending_chars ' '; 93 - incr i 94 - end else if c = '\xEF' && !i + 2 < len && s.[!i+1] = '\xBF' && s.[!i+2] = '\xBF' then begin 95 - (* U+FFFF → U+FFFD *) 96 - Buffer.add_string t.pending_chars "\xEF\xBF\xBD"; 97 - i := !i + 3 98 - end else begin 99 - Buffer.add_char t.pending_chars c; 100 - incr i 101 - end 102 - done 103 - end else 104 - Buffer.add_string t.pending_chars s 105 - 106 - let start_new_tag t kind = 107 - Buffer.clear t.current_tag_name; 108 - t.current_tag_kind <- kind; 109 - t.current_tag_self_closing <- false; 110 - t.current_attrs <- [] 111 - 112 - let start_new_attribute t = 113 - (* Save previous attribute if 
any *) 114 - let name = Buffer.contents t.current_attr_name in 115 - if String.length name > 0 then begin 116 - let value = Buffer.contents t.current_attr_value in 117 - (* Check for duplicates - only add if not already present *) 118 - if not (List.exists (fun (n, _) -> n = name) t.current_attrs) then 119 - t.current_attrs <- (name, value) :: t.current_attrs 120 - else 121 - error t "duplicate-attribute" 122 - end; 123 - Buffer.clear t.current_attr_name; 124 - Buffer.clear t.current_attr_value 125 - 126 - let finish_attribute t = 127 - start_new_attribute t 128 - 129 - let start_new_doctype t = 130 - t.current_doctype_name <- None; 131 - t.current_doctype_public <- None; 132 - t.current_doctype_system <- None; 133 - t.current_doctype_force_quirks <- false 134 - 135 - (* emit_current_tag, emit_current_doctype, emit_current_comment are defined locally inside run *) 136 - 137 - let is_appropriate_end_tag t = 138 - let name = Buffer.contents t.current_tag_name in 139 - String.length t.last_start_tag > 0 && name = t.last_start_tag 140 - 141 - let flush_code_points_consumed_as_char_ref t = 142 - let s = Buffer.contents t.temp_buffer in 143 - match t.return_state with 144 - | State.Attribute_value_double_quoted 145 - | State.Attribute_value_single_quoted 146 - | State.Attribute_value_unquoted -> 147 - Buffer.add_string t.current_attr_value s 148 - | _ -> 149 - emit_str t s 150 - 151 - open Bytesrw 152 - 153 - (* Main tokenization loop *) 154 - let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) = 155 - t.stream <- Stream.create_from_reader reader; 156 - t.errors <- []; 157 - (* Set up error callback for surrogate/noncharacter detection in stream *) 158 - (* In XML mode, we don't report noncharacter errors - we transform them instead *) 159 - if not t.xml_mode then 160 - Stream.set_error_callback t.stream (fun code -> error t code); 161 - 162 - (* XML mode transformation for pending chars: U+FFFF → U+FFFD *) 163 - let transform_xml_chars data = 
164 - let len = String.length data in 165 - let buf = Buffer.create len in 166 - let i = ref 0 in 167 - while !i < len do 168 - let c = data.[!i] in 169 - if c = '\xEF' && !i + 2 < len && data.[!i+1] = '\xBF' && data.[!i+2] = '\xBF' then begin 170 - (* U+FFFF → U+FFFD *) 171 - Buffer.add_string buf "\xEF\xBF\xBD"; 172 - i := !i + 3 173 - end else begin 174 - Buffer.add_char buf c; 175 - incr i 176 - end 177 - done; 178 - Buffer.contents buf 179 - in 180 - 181 - (* Local emit functions with access to S *) 182 - let emit_pending_chars () = 183 - if Buffer.length t.pending_chars > 0 then begin 184 - let data = Buffer.contents t.pending_chars in 185 - Buffer.clear t.pending_chars; 186 - let data = if t.xml_mode then transform_xml_chars data else data in 187 - ignore (S.process t.sink (Token.Character data)) 188 - end 189 - in 190 - 191 - let emit token = 192 - emit_pending_chars (); 193 - match S.process t.sink token with 194 - | `Continue -> () 195 - | `SwitchTo new_state -> t.state <- new_state 196 - in 197 - 198 - let emit_current_tag () = 199 - finish_attribute t; 200 - let name = Buffer.contents t.current_tag_name in 201 - let attrs = List.rev t.current_attrs in 202 - (* Check for end tag with attributes or self-closing flag *) 203 - if t.current_tag_kind = Token.End then begin 204 - if attrs <> [] then 205 - error t "end-tag-with-attributes"; 206 - if t.current_tag_self_closing then 207 - error t "end-tag-with-trailing-solidus" 208 - end; 209 - let tag = { 210 - Token.kind = t.current_tag_kind; 211 - name; 212 - attrs; 213 - self_closing = t.current_tag_self_closing; 214 - } in 215 - if t.current_tag_kind = Token.Start then 216 - t.last_start_tag <- name; 217 - emit (Token.Tag tag) 218 - in 219 - 220 - let emit_current_doctype () = 221 - let doctype = { 222 - Token.name = Option.map Buffer.contents t.current_doctype_name; 223 - public_id = Option.map Buffer.contents t.current_doctype_public; 224 - system_id = Option.map Buffer.contents t.current_doctype_system; 
225 - force_quirks = t.current_doctype_force_quirks; 226 - } in 227 - emit (Token.Doctype doctype) 228 - in 229 - 230 - let emit_current_comment () = 231 - let content = Buffer.contents t.current_comment in 232 - let content = 233 - if t.xml_mode then begin 234 - (* XML mode: transform -- to - - in comments *) 235 - let buf = Buffer.create (String.length content + 10) in 236 - let len = String.length content in 237 - let i = ref 0 in 238 - while !i < len do 239 - if !i + 1 < len && content.[!i] = '-' && content.[!i+1] = '-' then begin 240 - Buffer.add_string buf "- -"; 241 - i := !i + 2 242 - end else begin 243 - Buffer.add_char buf content.[!i]; 244 - incr i 245 - end 246 - done; 247 - Buffer.contents buf 248 - end else content 249 - in 250 - emit (Token.Comment content) 251 - in 252 - 253 - (* Check for control characters and emit error if needed *) 254 - (* Only checks ASCII control chars; C1 controls (U+0080-U+009F) are 2-byte in UTF-8 *) 255 - let check_control_char c = 256 - let code = Char.code c in 257 - (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F *) 258 - (* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *) 259 - (* Note: U+0080-U+009F (C1 controls) are 2-byte UTF-8 sequences starting with 0xC2 *) 260 - (* Note: We only check single-byte control chars here; multi-byte checks are TODO *) 261 - if (code >= 0x01 && code <= 0x08) || 262 - code = 0x0B || 263 - (code >= 0x0E && code <= 0x1F) || 264 - code = 0x7F then 265 - error t "control-character-in-input-stream" 266 - in 267 - 268 - 269 - (* Emit char with control character check *) 270 - let emit_char_checked c = 271 - check_control_char c; 272 - emit_char t c 273 - in 274 - 275 - let rec process_state () = 276 - if Stream.is_eof t.stream && t.state <> State.Data then begin 277 - (* Handle EOF in various states *) 278 - handle_eof () 279 - end else if Stream.is_eof t.stream then begin 280 - emit_pending_chars (); 281 - ignore (S.process t.sink Token.EOF) 282 - end else begin 
283 - step (); 284 - process_state () 285 - end 286 - 287 - and handle_eof () = 288 - match t.state with 289 - | State.Data -> 290 - emit_pending_chars (); 291 - ignore (S.process t.sink Token.EOF) 292 - | State.Tag_open -> 293 - error t "eof-before-tag-name"; 294 - emit_char t '<'; 295 - emit_pending_chars (); 296 - ignore (S.process t.sink Token.EOF) 297 - | State.End_tag_open -> 298 - error t "eof-before-tag-name"; 299 - emit_str t "</"; 300 - emit_pending_chars (); 301 - ignore (S.process t.sink Token.EOF) 302 - | State.Tag_name 303 - | State.Before_attribute_name 304 - | State.Attribute_name 305 - | State.After_attribute_name 306 - | State.Before_attribute_value 307 - | State.Attribute_value_double_quoted 308 - | State.Attribute_value_single_quoted 309 - | State.Attribute_value_unquoted 310 - | State.After_attribute_value_quoted 311 - | State.Self_closing_start_tag -> 312 - error t "eof-in-tag"; 313 - emit_pending_chars (); 314 - ignore (S.process t.sink Token.EOF) 315 - | State.Rawtext -> 316 - emit_pending_chars (); 317 - ignore (S.process t.sink Token.EOF) 318 - | State.Rawtext_less_than_sign -> 319 - emit_char t '<'; 320 - emit_pending_chars (); 321 - ignore (S.process t.sink Token.EOF) 322 - | State.Rawtext_end_tag_open -> 323 - emit_str t "</"; 324 - emit_pending_chars (); 325 - ignore (S.process t.sink Token.EOF) 326 - | State.Rawtext_end_tag_name -> 327 - emit_str t "</"; 328 - emit_str t (Buffer.contents t.temp_buffer); 329 - emit_pending_chars (); 330 - ignore (S.process t.sink Token.EOF) 331 - | State.Rcdata -> 332 - emit_pending_chars (); 333 - ignore (S.process t.sink Token.EOF) 334 - | State.Rcdata_less_than_sign -> 335 - emit_char t '<'; 336 - emit_pending_chars (); 337 - ignore (S.process t.sink Token.EOF) 338 - | State.Rcdata_end_tag_open -> 339 - emit_str t "</"; 340 - emit_pending_chars (); 341 - ignore (S.process t.sink Token.EOF) 342 - | State.Rcdata_end_tag_name -> 343 - emit_str t "</"; 344 - emit_str t (Buffer.contents t.temp_buffer); 
345 - emit_pending_chars (); 346 - ignore (S.process t.sink Token.EOF) 347 - | State.Script_data -> 348 - emit_pending_chars (); 349 - ignore (S.process t.sink Token.EOF) 350 - | State.Script_data_less_than_sign -> 351 - emit_char t '<'; 352 - emit_pending_chars (); 353 - ignore (S.process t.sink Token.EOF) 354 - | State.Script_data_end_tag_open -> 355 - emit_str t "</"; 356 - emit_pending_chars (); 357 - ignore (S.process t.sink Token.EOF) 358 - | State.Script_data_end_tag_name -> 359 - emit_str t "</"; 360 - emit_str t (Buffer.contents t.temp_buffer); 361 - emit_pending_chars (); 362 - ignore (S.process t.sink Token.EOF) 363 - | State.Script_data_escape_start 364 - | State.Script_data_escape_start_dash 365 - | State.Script_data_escaped 366 - | State.Script_data_escaped_dash 367 - | State.Script_data_escaped_dash_dash -> 368 - error t "eof-in-script-html-comment-like-text"; 369 - emit_pending_chars (); 370 - ignore (S.process t.sink Token.EOF) 371 - | State.Script_data_escaped_less_than_sign -> 372 - emit_char t '<'; 373 - emit_pending_chars (); 374 - ignore (S.process t.sink Token.EOF) 375 - | State.Script_data_escaped_end_tag_open -> 376 - emit_str t "</"; 377 - emit_pending_chars (); 378 - ignore (S.process t.sink Token.EOF) 379 - | State.Script_data_escaped_end_tag_name -> 380 - emit_str t "</"; 381 - emit_str t (Buffer.contents t.temp_buffer); 382 - emit_pending_chars (); 383 - ignore (S.process t.sink Token.EOF) 384 - | State.Script_data_double_escape_start 385 - | State.Script_data_double_escaped 386 - | State.Script_data_double_escaped_dash 387 - | State.Script_data_double_escaped_dash_dash -> 388 - error t "eof-in-script-html-comment-like-text"; 389 - emit_pending_chars (); 390 - ignore (S.process t.sink Token.EOF) 391 - | State.Script_data_double_escaped_less_than_sign -> 392 - (* '<' was already emitted when entering this state from Script_data_double_escaped *) 393 - emit_pending_chars (); 394 - ignore (S.process t.sink Token.EOF) 395 - | 
State.Script_data_double_escape_end -> 396 - emit_pending_chars (); 397 - ignore (S.process t.sink Token.EOF) 398 - | State.Plaintext -> 399 - emit_pending_chars (); 400 - ignore (S.process t.sink Token.EOF) 401 - | State.Comment_start 402 - | State.Comment_start_dash 403 - | State.Comment 404 - | State.Comment_less_than_sign 405 - | State.Comment_less_than_sign_bang 406 - | State.Comment_less_than_sign_bang_dash 407 - | State.Comment_less_than_sign_bang_dash_dash 408 - | State.Comment_end_dash 409 - | State.Comment_end 410 - | State.Comment_end_bang -> 411 - error t "eof-in-comment"; 412 - emit_current_comment (); 413 - emit_pending_chars (); 414 - ignore (S.process t.sink Token.EOF) 415 - | State.Bogus_comment -> 416 - emit_current_comment (); 417 - emit_pending_chars (); 418 - ignore (S.process t.sink Token.EOF) 419 - | State.Markup_declaration_open -> 420 - error t "incorrectly-opened-comment"; 421 - Buffer.clear t.current_comment; 422 - emit_current_comment (); 423 - emit_pending_chars (); 424 - ignore (S.process t.sink Token.EOF) 425 - | State.Doctype 426 - | State.Before_doctype_name -> 427 - error t "eof-in-doctype"; 428 - start_new_doctype t; 429 - t.current_doctype_force_quirks <- true; 430 - emit_current_doctype (); 431 - emit_pending_chars (); 432 - ignore (S.process t.sink Token.EOF) 433 - | State.Doctype_name 434 - | State.After_doctype_name 435 - | State.After_doctype_public_keyword 436 - | State.Before_doctype_public_identifier 437 - | State.Doctype_public_identifier_double_quoted 438 - | State.Doctype_public_identifier_single_quoted 439 - | State.After_doctype_public_identifier 440 - | State.Between_doctype_public_and_system_identifiers 441 - | State.After_doctype_system_keyword 442 - | State.Before_doctype_system_identifier 443 - | State.Doctype_system_identifier_double_quoted 444 - | State.Doctype_system_identifier_single_quoted 445 - | State.After_doctype_system_identifier -> 446 - error t "eof-in-doctype"; 447 - t.current_doctype_force_quirks 
<- true; 448 - emit_current_doctype (); 449 - emit_pending_chars (); 450 - ignore (S.process t.sink Token.EOF) 451 - | State.Bogus_doctype -> 452 - emit_current_doctype (); 453 - emit_pending_chars (); 454 - ignore (S.process t.sink Token.EOF) 455 - | State.Cdata_section -> 456 - error t "eof-in-cdata"; 457 - emit_pending_chars (); 458 - ignore (S.process t.sink Token.EOF) 459 - | State.Cdata_section_bracket -> 460 - error t "eof-in-cdata"; 461 - emit_char t ']'; 462 - emit_pending_chars (); 463 - ignore (S.process t.sink Token.EOF) 464 - | State.Cdata_section_end -> 465 - error t "eof-in-cdata"; 466 - emit_str t "]]"; 467 - emit_pending_chars (); 468 - ignore (S.process t.sink Token.EOF) 469 - | State.Character_reference -> 470 - (* state_character_reference never ran, so initialize temp_buffer with & *) 471 - Buffer.clear t.temp_buffer; 472 - Buffer.add_char t.temp_buffer '&'; 473 - flush_code_points_consumed_as_char_ref t; 474 - t.state <- t.return_state; 475 - handle_eof () 476 - | State.Named_character_reference -> 477 - flush_code_points_consumed_as_char_ref t; 478 - t.state <- t.return_state; 479 - handle_eof () 480 - | State.Numeric_character_reference -> 481 - (* At EOF with just "&#" - no digits follow *) 482 - error t "absence-of-digits-in-numeric-character-reference"; 483 - flush_code_points_consumed_as_char_ref t; 484 - t.state <- t.return_state; 485 - handle_eof () 486 - | State.Hexadecimal_character_reference_start 487 - | State.Decimal_character_reference_start -> 488 - error t "absence-of-digits-in-numeric-character-reference"; 489 - flush_code_points_consumed_as_char_ref t; 490 - t.state <- t.return_state; 491 - handle_eof () 492 - | State.Numeric_character_reference_end -> 493 - (* We have collected digits, just need to finalize the character reference *) 494 - step (); 495 - handle_eof () 496 - | State.Ambiguous_ampersand -> 497 - (* Buffer was already flushed when entering this state, just transition *) 498 - t.state <- t.return_state; 499 - 
handle_eof () 500 - | State.Hexadecimal_character_reference 501 - | State.Decimal_character_reference -> 502 - (* At EOF with collected digits - convert the numeric reference *) 503 - error t "missing-semicolon-after-character-reference"; 504 - let code = t.char_ref_code in 505 - let replacement_char = "\xEF\xBF\xBD" in 506 - let result = 507 - if code = 0 then begin 508 - error t "null-character-reference"; 509 - replacement_char 510 - end else if code > 0x10FFFF then begin 511 - error t "character-reference-outside-unicode-range"; 512 - replacement_char 513 - end else if code >= 0xD800 && code <= 0xDFFF then begin 514 - error t "surrogate-character-reference"; 515 - replacement_char 516 - end else 517 - Html5rw_entities.Numeric_ref.codepoint_to_utf8 code 518 - in 519 - Buffer.clear t.temp_buffer; 520 - Buffer.add_string t.temp_buffer result; 521 - flush_code_points_consumed_as_char_ref t; 522 - t.state <- t.return_state; 523 - handle_eof () 524 - 525 - and step () = 526 - match t.state with 527 - | State.Data -> state_data () 528 - | State.Rcdata -> state_rcdata () 529 - | State.Rawtext -> state_rawtext () 530 - | State.Script_data -> state_script_data () 531 - | State.Plaintext -> state_plaintext () 532 - | State.Tag_open -> state_tag_open () 533 - | State.End_tag_open -> state_end_tag_open () 534 - | State.Tag_name -> state_tag_name () 535 - | State.Rcdata_less_than_sign -> state_rcdata_less_than_sign () 536 - | State.Rcdata_end_tag_open -> state_rcdata_end_tag_open () 537 - | State.Rcdata_end_tag_name -> state_rcdata_end_tag_name () 538 - | State.Rawtext_less_than_sign -> state_rawtext_less_than_sign () 539 - | State.Rawtext_end_tag_open -> state_rawtext_end_tag_open () 540 - | State.Rawtext_end_tag_name -> state_rawtext_end_tag_name () 541 - | State.Script_data_less_than_sign -> state_script_data_less_than_sign () 542 - | State.Script_data_end_tag_open -> state_script_data_end_tag_open () 543 - | State.Script_data_end_tag_name -> 
state_script_data_end_tag_name () 544 - | State.Script_data_escape_start -> state_script_data_escape_start () 545 - | State.Script_data_escape_start_dash -> state_script_data_escape_start_dash () 546 - | State.Script_data_escaped -> state_script_data_escaped () 547 - | State.Script_data_escaped_dash -> state_script_data_escaped_dash () 548 - | State.Script_data_escaped_dash_dash -> state_script_data_escaped_dash_dash () 549 - | State.Script_data_escaped_less_than_sign -> state_script_data_escaped_less_than_sign () 550 - | State.Script_data_escaped_end_tag_open -> state_script_data_escaped_end_tag_open () 551 - | State.Script_data_escaped_end_tag_name -> state_script_data_escaped_end_tag_name () 552 - | State.Script_data_double_escape_start -> state_script_data_double_escape_start () 553 - | State.Script_data_double_escaped -> state_script_data_double_escaped () 554 - | State.Script_data_double_escaped_dash -> state_script_data_double_escaped_dash () 555 - | State.Script_data_double_escaped_dash_dash -> state_script_data_double_escaped_dash_dash () 556 - | State.Script_data_double_escaped_less_than_sign -> state_script_data_double_escaped_less_than_sign () 557 - | State.Script_data_double_escape_end -> state_script_data_double_escape_end () 558 - | State.Before_attribute_name -> state_before_attribute_name () 559 - | State.Attribute_name -> state_attribute_name () 560 - | State.After_attribute_name -> state_after_attribute_name () 561 - | State.Before_attribute_value -> state_before_attribute_value () 562 - | State.Attribute_value_double_quoted -> state_attribute_value_double_quoted () 563 - | State.Attribute_value_single_quoted -> state_attribute_value_single_quoted () 564 - | State.Attribute_value_unquoted -> state_attribute_value_unquoted () 565 - | State.After_attribute_value_quoted -> state_after_attribute_value_quoted () 566 - | State.Self_closing_start_tag -> state_self_closing_start_tag () 567 - | State.Bogus_comment -> state_bogus_comment () 568 - | 
State.Markup_declaration_open -> state_markup_declaration_open () 569 - | State.Comment_start -> state_comment_start () 570 - | State.Comment_start_dash -> state_comment_start_dash () 571 - | State.Comment -> state_comment () 572 - | State.Comment_less_than_sign -> state_comment_less_than_sign () 573 - | State.Comment_less_than_sign_bang -> state_comment_less_than_sign_bang () 574 - | State.Comment_less_than_sign_bang_dash -> state_comment_less_than_sign_bang_dash () 575 - | State.Comment_less_than_sign_bang_dash_dash -> state_comment_less_than_sign_bang_dash_dash () 576 - | State.Comment_end_dash -> state_comment_end_dash () 577 - | State.Comment_end -> state_comment_end () 578 - | State.Comment_end_bang -> state_comment_end_bang () 579 - | State.Doctype -> state_doctype () 580 - | State.Before_doctype_name -> state_before_doctype_name () 581 - | State.Doctype_name -> state_doctype_name () 582 - | State.After_doctype_name -> state_after_doctype_name () 583 - | State.After_doctype_public_keyword -> state_after_doctype_public_keyword () 584 - | State.Before_doctype_public_identifier -> state_before_doctype_public_identifier () 585 - | State.Doctype_public_identifier_double_quoted -> state_doctype_public_identifier_double_quoted () 586 - | State.Doctype_public_identifier_single_quoted -> state_doctype_public_identifier_single_quoted () 587 - | State.After_doctype_public_identifier -> state_after_doctype_public_identifier () 588 - | State.Between_doctype_public_and_system_identifiers -> state_between_doctype_public_and_system_identifiers () 589 - | State.After_doctype_system_keyword -> state_after_doctype_system_keyword () 590 - | State.Before_doctype_system_identifier -> state_before_doctype_system_identifier () 591 - | State.Doctype_system_identifier_double_quoted -> state_doctype_system_identifier_double_quoted () 592 - | State.Doctype_system_identifier_single_quoted -> state_doctype_system_identifier_single_quoted () 593 - | State.After_doctype_system_identifier 
-> state_after_doctype_system_identifier () 594 - | State.Bogus_doctype -> state_bogus_doctype () 595 - | State.Cdata_section -> state_cdata_section () 596 - | State.Cdata_section_bracket -> state_cdata_section_bracket () 597 - | State.Cdata_section_end -> state_cdata_section_end () 598 - | State.Character_reference -> state_character_reference () 599 - | State.Named_character_reference -> state_named_character_reference () 600 - | State.Ambiguous_ampersand -> state_ambiguous_ampersand () 601 - | State.Numeric_character_reference -> state_numeric_character_reference () 602 - | State.Hexadecimal_character_reference_start -> state_hexadecimal_character_reference_start () 603 - | State.Decimal_character_reference_start -> state_decimal_character_reference_start () 604 - | State.Hexadecimal_character_reference -> state_hexadecimal_character_reference () 605 - | State.Decimal_character_reference -> state_decimal_character_reference () 606 - | State.Numeric_character_reference_end -> state_numeric_character_reference_end () 607 - 608 - (* State implementations *) 609 - and state_data () = 610 - match Stream.consume t.stream with 611 - | Some '&' -> 612 - t.return_state <- State.Data; 613 - t.state <- State.Character_reference 614 - | Some '<' -> 615 - t.state <- State.Tag_open 616 - | Some '\x00' -> 617 - (* Emit pending chars first, then emit null separately for proper tree builder handling *) 618 - emit_pending_chars (); 619 - error t "unexpected-null-character"; 620 - ignore (S.process t.sink (Token.Character "\x00")) 621 - | Some c -> 622 - emit_char_checked c 623 - | None -> () 624 - 625 - and state_rcdata () = 626 - match Stream.consume t.stream with 627 - | Some '&' -> 628 - t.return_state <- State.Rcdata; 629 - t.state <- State.Character_reference 630 - | Some '<' -> 631 - t.state <- State.Rcdata_less_than_sign 632 - | Some '\x00' -> 633 - error t "unexpected-null-character"; 634 - emit_str t "\xEF\xBF\xBD" 635 - | Some c -> 636 - emit_char_checked c 637 - | None 
-> () 638 - 639 - and state_rawtext () = 640 - match Stream.consume t.stream with 641 - | Some '<' -> 642 - t.state <- State.Rawtext_less_than_sign 643 - | Some '\x00' -> 644 - error t "unexpected-null-character"; 645 - emit_str t "\xEF\xBF\xBD" 646 - | Some c -> 647 - emit_char_checked c 648 - | None -> () 649 - 650 - and state_script_data () = 651 - match Stream.consume t.stream with 652 - | Some '<' -> 653 - t.state <- State.Script_data_less_than_sign 654 - | Some '\x00' -> 655 - error t "unexpected-null-character"; 656 - emit_str t "\xEF\xBF\xBD" 657 - | Some c -> 658 - emit_char_checked c 659 - | None -> () 660 - 661 - and state_plaintext () = 662 - match Stream.consume t.stream with 663 - | Some '\x00' -> 664 - error t "unexpected-null-character"; 665 - emit_str t "\xEF\xBF\xBD" 666 - | Some c -> 667 - emit_char_checked c 668 - | None -> () 669 - 670 - and state_tag_open () = 671 - match Stream.peek t.stream with 672 - | Some '!' -> 673 - Stream.advance t.stream; 674 - t.state <- State.Markup_declaration_open 675 - | Some '/' -> 676 - Stream.advance t.stream; 677 - t.state <- State.End_tag_open 678 - | Some c when is_ascii_alpha c -> 679 - start_new_tag t Token.Start; 680 - t.state <- State.Tag_name 681 - | Some '?' 
-> 682 - error t "unexpected-question-mark-instead-of-tag-name"; 683 - Buffer.clear t.current_comment; 684 - t.state <- State.Bogus_comment 685 - | None -> 686 - error t "eof-before-tag-name"; 687 - emit_char t '<' 688 - | Some _ -> 689 - error t "invalid-first-character-of-tag-name"; 690 - emit_char t '<'; 691 - t.state <- State.Data 692 - 693 - and state_end_tag_open () = 694 - match Stream.peek t.stream with 695 - | Some c when is_ascii_alpha c -> 696 - start_new_tag t Token.End; 697 - t.state <- State.Tag_name 698 - | Some '>' -> 699 - Stream.advance t.stream; 700 - error t "missing-end-tag-name"; 701 - t.state <- State.Data 702 - | None -> 703 - error t "eof-before-tag-name"; 704 - emit_str t "</" 705 - | Some _ -> 706 - error t "invalid-first-character-of-tag-name"; 707 - Buffer.clear t.current_comment; 708 - t.state <- State.Bogus_comment 709 - 710 - and state_tag_name () = 711 - match Stream.consume t.stream with 712 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 713 - t.state <- State.Before_attribute_name 714 - | Some '/' -> 715 - t.state <- State.Self_closing_start_tag 716 - | Some '>' -> 717 - t.state <- State.Data; 718 - emit_current_tag () 719 - | Some '\x00' -> 720 - error t "unexpected-null-character"; 721 - Buffer.add_string t.current_tag_name "\xEF\xBF\xBD" 722 - | Some c -> 723 - check_control_char c; 724 - Buffer.add_char t.current_tag_name (ascii_lower c) 725 - | None -> () 726 - 727 - and state_rcdata_less_than_sign () = 728 - match Stream.peek t.stream with 729 - | Some '/' -> 730 - Stream.advance t.stream; 731 - Buffer.clear t.temp_buffer; 732 - t.state <- State.Rcdata_end_tag_open 733 - | _ -> 734 - emit_char t '<'; 735 - t.state <- State.Rcdata 736 - 737 - and state_rcdata_end_tag_open () = 738 - match Stream.peek t.stream with 739 - | Some c when is_ascii_alpha c -> 740 - start_new_tag t Token.End; 741 - t.state <- State.Rcdata_end_tag_name 742 - | _ -> 743 - emit_str t "</"; 744 - t.state <- State.Rcdata 745 - 746 - and 
state_rcdata_end_tag_name () = 747 - match Stream.peek t.stream with 748 - | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 749 - Stream.advance t.stream; 750 - t.state <- State.Before_attribute_name 751 - | Some '/' when is_appropriate_end_tag t -> 752 - Stream.advance t.stream; 753 - t.state <- State.Self_closing_start_tag 754 - | Some '>' when is_appropriate_end_tag t -> 755 - Stream.advance t.stream; 756 - t.state <- State.Data; 757 - emit_current_tag () 758 - | Some c when is_ascii_alpha c -> 759 - Stream.advance t.stream; 760 - Buffer.add_char t.current_tag_name (ascii_lower c); 761 - Buffer.add_char t.temp_buffer c 762 - | _ -> 763 - emit_str t "</"; 764 - emit_str t (Buffer.contents t.temp_buffer); 765 - t.state <- State.Rcdata 766 - 767 - and state_rawtext_less_than_sign () = 768 - match Stream.peek t.stream with 769 - | Some '/' -> 770 - Stream.advance t.stream; 771 - Buffer.clear t.temp_buffer; 772 - t.state <- State.Rawtext_end_tag_open 773 - | _ -> 774 - emit_char t '<'; 775 - t.state <- State.Rawtext 776 - 777 - and state_rawtext_end_tag_open () = 778 - match Stream.peek t.stream with 779 - | Some c when is_ascii_alpha c -> 780 - start_new_tag t Token.End; 781 - t.state <- State.Rawtext_end_tag_name 782 - | _ -> 783 - emit_str t "</"; 784 - t.state <- State.Rawtext 785 - 786 - and state_rawtext_end_tag_name () = 787 - match Stream.peek t.stream with 788 - | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 789 - Stream.advance t.stream; 790 - t.state <- State.Before_attribute_name 791 - | Some '/' when is_appropriate_end_tag t -> 792 - Stream.advance t.stream; 793 - t.state <- State.Self_closing_start_tag 794 - | Some '>' when is_appropriate_end_tag t -> 795 - Stream.advance t.stream; 796 - t.state <- State.Data; 797 - emit_current_tag () 798 - | Some c when is_ascii_alpha c -> 799 - Stream.advance t.stream; 800 - Buffer.add_char t.current_tag_name (ascii_lower c); 801 - Buffer.add_char t.temp_buffer c 802 - | _ -> 
803 - emit_str t "</"; 804 - emit_str t (Buffer.contents t.temp_buffer); 805 - t.state <- State.Rawtext 806 - 807 - and state_script_data_less_than_sign () = 808 - match Stream.peek t.stream with 809 - | Some '/' -> 810 - Stream.advance t.stream; 811 - Buffer.clear t.temp_buffer; 812 - t.state <- State.Script_data_end_tag_open 813 - | Some '!' -> 814 - Stream.advance t.stream; 815 - t.state <- State.Script_data_escape_start; 816 - emit_str t "<!" 817 - | _ -> 818 - emit_char t '<'; 819 - t.state <- State.Script_data 820 - 821 - and state_script_data_end_tag_open () = 822 - match Stream.peek t.stream with 823 - | Some c when is_ascii_alpha c -> 824 - start_new_tag t Token.End; 825 - t.state <- State.Script_data_end_tag_name 826 - | _ -> 827 - emit_str t "</"; 828 - t.state <- State.Script_data 829 - 830 - and state_script_data_end_tag_name () = 831 - match Stream.peek t.stream with 832 - | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 833 - Stream.advance t.stream; 834 - t.state <- State.Before_attribute_name 835 - | Some '/' when is_appropriate_end_tag t -> 836 - Stream.advance t.stream; 837 - t.state <- State.Self_closing_start_tag 838 - | Some '>' when is_appropriate_end_tag t -> 839 - Stream.advance t.stream; 840 - t.state <- State.Data; 841 - emit_current_tag () 842 - | Some c when is_ascii_alpha c -> 843 - Stream.advance t.stream; 844 - Buffer.add_char t.current_tag_name (ascii_lower c); 845 - Buffer.add_char t.temp_buffer c 846 - | _ -> 847 - emit_str t "</"; 848 - emit_str t (Buffer.contents t.temp_buffer); 849 - t.state <- State.Script_data 850 - 851 - and state_script_data_escape_start () = 852 - match Stream.peek t.stream with 853 - | Some '-' -> 854 - Stream.advance t.stream; 855 - t.state <- State.Script_data_escape_start_dash; 856 - emit_char t '-' 857 - | _ -> 858 - t.state <- State.Script_data 859 - 860 - and state_script_data_escape_start_dash () = 861 - match Stream.peek t.stream with 862 - | Some '-' -> 863 - Stream.advance 
t.stream; 864 - t.state <- State.Script_data_escaped_dash_dash; 865 - emit_char t '-' 866 - | _ -> 867 - t.state <- State.Script_data 868 - 869 - and state_script_data_escaped () = 870 - match Stream.consume t.stream with 871 - | Some '-' -> 872 - t.state <- State.Script_data_escaped_dash; 873 - emit_char t '-' 874 - | Some '<' -> 875 - t.state <- State.Script_data_escaped_less_than_sign 876 - | Some '\x00' -> 877 - error t "unexpected-null-character"; 878 - emit_str t "\xEF\xBF\xBD" 879 - | Some c -> 880 - emit_char_checked c 881 - | None -> () 882 - 883 - and state_script_data_escaped_dash () = 884 - match Stream.consume t.stream with 885 - | Some '-' -> 886 - t.state <- State.Script_data_escaped_dash_dash; 887 - emit_char t '-' 888 - | Some '<' -> 889 - t.state <- State.Script_data_escaped_less_than_sign 890 - | Some '\x00' -> 891 - error t "unexpected-null-character"; 892 - t.state <- State.Script_data_escaped; 893 - emit_str t "\xEF\xBF\xBD" 894 - | Some c -> 895 - t.state <- State.Script_data_escaped; 896 - emit_char_checked c 897 - | None -> () 898 - 899 - and state_script_data_escaped_dash_dash () = 900 - match Stream.consume t.stream with 901 - | Some '-' -> 902 - emit_char t '-' 903 - | Some '<' -> 904 - t.state <- State.Script_data_escaped_less_than_sign 905 - | Some '>' -> 906 - t.state <- State.Script_data; 907 - emit_char t '>' 908 - | Some '\x00' -> 909 - error t "unexpected-null-character"; 910 - t.state <- State.Script_data_escaped; 911 - emit_str t "\xEF\xBF\xBD" 912 - | Some c -> 913 - t.state <- State.Script_data_escaped; 914 - emit_char_checked c 915 - | None -> () 916 - 917 - and state_script_data_escaped_less_than_sign () = 918 - match Stream.peek t.stream with 919 - | Some '/' -> 920 - Stream.advance t.stream; 921 - Buffer.clear t.temp_buffer; 922 - t.state <- State.Script_data_escaped_end_tag_open 923 - | Some c when is_ascii_alpha c -> 924 - Buffer.clear t.temp_buffer; 925 - emit_char t '<'; 926 - t.state <- 
State.Script_data_double_escape_start 927 - | _ -> 928 - emit_char t '<'; 929 - t.state <- State.Script_data_escaped 930 - 931 - and state_script_data_escaped_end_tag_open () = 932 - match Stream.peek t.stream with 933 - | Some c when is_ascii_alpha c -> 934 - start_new_tag t Token.End; 935 - t.state <- State.Script_data_escaped_end_tag_name 936 - | _ -> 937 - emit_str t "</"; 938 - t.state <- State.Script_data_escaped 939 - 940 - and state_script_data_escaped_end_tag_name () = 941 - match Stream.peek t.stream with 942 - | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 943 - Stream.advance t.stream; 944 - t.state <- State.Before_attribute_name 945 - | Some '/' when is_appropriate_end_tag t -> 946 - Stream.advance t.stream; 947 - t.state <- State.Self_closing_start_tag 948 - | Some '>' when is_appropriate_end_tag t -> 949 - Stream.advance t.stream; 950 - t.state <- State.Data; 951 - emit_current_tag () 952 - | Some c when is_ascii_alpha c -> 953 - Stream.advance t.stream; 954 - Buffer.add_char t.current_tag_name (ascii_lower c); 955 - Buffer.add_char t.temp_buffer c 956 - | _ -> 957 - emit_str t "</"; 958 - emit_str t (Buffer.contents t.temp_buffer); 959 - t.state <- State.Script_data_escaped 960 - 961 - and state_script_data_double_escape_start () = 962 - match Stream.peek t.stream with 963 - | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt -> 964 - Stream.advance t.stream; 965 - let c = Option.get c_opt in 966 - if Buffer.contents t.temp_buffer = "script" then 967 - t.state <- State.Script_data_double_escaped 968 - else 969 - t.state <- State.Script_data_escaped; 970 - emit_char t c 971 - | Some c when is_ascii_alpha c -> 972 - Stream.advance t.stream; 973 - Buffer.add_char t.temp_buffer (ascii_lower c); 974 - emit_char t c 975 - | _ -> 976 - t.state <- State.Script_data_escaped 977 - 978 - and state_script_data_double_escaped () = 979 - match Stream.consume t.stream with 980 - | Some '-' -> 981 - t.state <- 
State.Script_data_double_escaped_dash; 982 - emit_char t '-' 983 - | Some '<' -> 984 - t.state <- State.Script_data_double_escaped_less_than_sign; 985 - emit_char t '<' 986 - | Some '\x00' -> 987 - error t "unexpected-null-character"; 988 - emit_str t "\xEF\xBF\xBD" 989 - | Some c -> 990 - emit_char_checked c 991 - | None -> () 992 - 993 - and state_script_data_double_escaped_dash () = 994 - match Stream.consume t.stream with 995 - | Some '-' -> 996 - t.state <- State.Script_data_double_escaped_dash_dash; 997 - emit_char t '-' 998 - | Some '<' -> 999 - t.state <- State.Script_data_double_escaped_less_than_sign; 1000 - emit_char t '<' 1001 - | Some '\x00' -> 1002 - error t "unexpected-null-character"; 1003 - t.state <- State.Script_data_double_escaped; 1004 - emit_str t "\xEF\xBF\xBD" 1005 - | Some c -> 1006 - t.state <- State.Script_data_double_escaped; 1007 - emit_char_checked c 1008 - | None -> () 1009 - 1010 - and state_script_data_double_escaped_dash_dash () = 1011 - match Stream.consume t.stream with 1012 - | Some '-' -> 1013 - emit_char t '-' 1014 - | Some '<' -> 1015 - t.state <- State.Script_data_double_escaped_less_than_sign; 1016 - emit_char t '<' 1017 - | Some '>' -> 1018 - t.state <- State.Script_data; 1019 - emit_char t '>' 1020 - | Some '\x00' -> 1021 - error t "unexpected-null-character"; 1022 - t.state <- State.Script_data_double_escaped; 1023 - emit_str t "\xEF\xBF\xBD" 1024 - | Some c -> 1025 - t.state <- State.Script_data_double_escaped; 1026 - emit_char_checked c 1027 - | None -> () 1028 - 1029 - and state_script_data_double_escaped_less_than_sign () = 1030 - match Stream.peek t.stream with 1031 - | Some '/' -> 1032 - Stream.advance t.stream; 1033 - Buffer.clear t.temp_buffer; 1034 - t.state <- State.Script_data_double_escape_end; 1035 - emit_char t '/' 1036 - | _ -> 1037 - t.state <- State.Script_data_double_escaped 1038 - 1039 - and state_script_data_double_escape_end () = 1040 - match Stream.peek t.stream with 1041 - | Some ('\t' | '\n' | 
'\x0C' | ' ' | '/' | '>') as c_opt -> 1042 - Stream.advance t.stream; 1043 - let c = Option.get c_opt in 1044 - if Buffer.contents t.temp_buffer = "script" then 1045 - t.state <- State.Script_data_escaped 1046 - else 1047 - t.state <- State.Script_data_double_escaped; 1048 - emit_char t c 1049 - | Some c when is_ascii_alpha c -> 1050 - Stream.advance t.stream; 1051 - Buffer.add_char t.temp_buffer (ascii_lower c); 1052 - emit_char t c 1053 - | _ -> 1054 - t.state <- State.Script_data_double_escaped 1055 - 1056 - and state_before_attribute_name () = 1057 - match Stream.peek t.stream with 1058 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1059 - Stream.advance t.stream 1060 - | Some '/' | Some '>' | None -> 1061 - t.state <- State.After_attribute_name 1062 - | Some '=' -> 1063 - Stream.advance t.stream; 1064 - error t "unexpected-equals-sign-before-attribute-name"; 1065 - start_new_attribute t; 1066 - Buffer.add_char t.current_attr_name '='; 1067 - t.state <- State.Attribute_name 1068 - | Some _ -> 1069 - start_new_attribute t; 1070 - t.state <- State.Attribute_name 1071 - 1072 - and state_attribute_name () = 1073 - match Stream.peek t.stream with 1074 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1075 - Stream.advance t.stream; 1076 - t.state <- State.After_attribute_name 1077 - | Some '/' | Some '>' | None -> 1078 - t.state <- State.After_attribute_name 1079 - | Some '=' -> 1080 - Stream.advance t.stream; 1081 - t.state <- State.Before_attribute_value 1082 - | Some '\x00' -> 1083 - Stream.advance t.stream; 1084 - error t "unexpected-null-character"; 1085 - Buffer.add_string t.current_attr_name "\xEF\xBF\xBD" 1086 - | Some ('"' | '\'' | '<') as c_opt -> 1087 - Stream.advance t.stream; 1088 - error t "unexpected-character-in-attribute-name"; 1089 - Buffer.add_char t.current_attr_name (Option.get c_opt) 1090 - | Some c -> 1091 - Stream.advance t.stream; 1092 - check_control_char c; 1093 - Buffer.add_char t.current_attr_name (ascii_lower c) 1094 - 1095 - and 
state_after_attribute_name () = 1096 - match Stream.peek t.stream with 1097 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1098 - Stream.advance t.stream 1099 - | Some '/' -> 1100 - Stream.advance t.stream; 1101 - t.state <- State.Self_closing_start_tag 1102 - | Some '=' -> 1103 - Stream.advance t.stream; 1104 - t.state <- State.Before_attribute_value 1105 - | Some '>' -> 1106 - Stream.advance t.stream; 1107 - t.state <- State.Data; 1108 - emit_current_tag () 1109 - | None -> () 1110 - | Some _ -> 1111 - start_new_attribute t; 1112 - t.state <- State.Attribute_name 1113 - 1114 - and state_before_attribute_value () = 1115 - match Stream.peek t.stream with 1116 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1117 - Stream.advance t.stream 1118 - | Some '"' -> 1119 - Stream.advance t.stream; 1120 - t.state <- State.Attribute_value_double_quoted 1121 - | Some '\'' -> 1122 - Stream.advance t.stream; 1123 - t.state <- State.Attribute_value_single_quoted 1124 - | Some '>' -> 1125 - Stream.advance t.stream; 1126 - error t "missing-attribute-value"; 1127 - t.state <- State.Data; 1128 - emit_current_tag () 1129 - | _ -> 1130 - t.state <- State.Attribute_value_unquoted 1131 - 1132 - and state_attribute_value_double_quoted () = 1133 - match Stream.consume t.stream with 1134 - | Some '"' -> 1135 - t.state <- State.After_attribute_value_quoted 1136 - | Some '&' -> 1137 - t.return_state <- State.Attribute_value_double_quoted; 1138 - t.state <- State.Character_reference 1139 - | Some '\x00' -> 1140 - error t "unexpected-null-character"; 1141 - Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1142 - | Some c -> 1143 - check_control_char c; 1144 - Buffer.add_char t.current_attr_value c 1145 - | None -> () 1146 - 1147 - and state_attribute_value_single_quoted () = 1148 - match Stream.consume t.stream with 1149 - | Some '\'' -> 1150 - t.state <- State.After_attribute_value_quoted 1151 - | Some '&' -> 1152 - t.return_state <- State.Attribute_value_single_quoted; 1153 - t.state <- 
State.Character_reference 1154 - | Some '\x00' -> 1155 - error t "unexpected-null-character"; 1156 - Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1157 - | Some c -> 1158 - check_control_char c; 1159 - Buffer.add_char t.current_attr_value c 1160 - | None -> () 1161 - 1162 - and state_attribute_value_unquoted () = 1163 - match Stream.peek t.stream with 1164 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1165 - Stream.advance t.stream; 1166 - t.state <- State.Before_attribute_name 1167 - | Some '&' -> 1168 - Stream.advance t.stream; 1169 - t.return_state <- State.Attribute_value_unquoted; 1170 - t.state <- State.Character_reference 1171 - | Some '>' -> 1172 - Stream.advance t.stream; 1173 - t.state <- State.Data; 1174 - emit_current_tag () 1175 - | Some '\x00' -> 1176 - Stream.advance t.stream; 1177 - error t "unexpected-null-character"; 1178 - Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1179 - | Some ('"' | '\'' | '<' | '=' | '`') as c_opt -> 1180 - Stream.advance t.stream; 1181 - error t "unexpected-character-in-unquoted-attribute-value"; 1182 - Buffer.add_char t.current_attr_value (Option.get c_opt) 1183 - | Some c -> 1184 - Stream.advance t.stream; 1185 - check_control_char c; 1186 - Buffer.add_char t.current_attr_value c 1187 - | None -> () 1188 - 1189 - and state_after_attribute_value_quoted () = 1190 - match Stream.peek t.stream with 1191 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1192 - Stream.advance t.stream; 1193 - t.state <- State.Before_attribute_name 1194 - | Some '/' -> 1195 - Stream.advance t.stream; 1196 - t.state <- State.Self_closing_start_tag 1197 - | Some '>' -> 1198 - Stream.advance t.stream; 1199 - t.state <- State.Data; 1200 - emit_current_tag () 1201 - | None -> () 1202 - | Some _ -> 1203 - error t "missing-whitespace-between-attributes"; 1204 - t.state <- State.Before_attribute_name 1205 - 1206 - and state_self_closing_start_tag () = 1207 - match Stream.peek t.stream with 1208 - | Some '>' -> 1209 - Stream.advance t.stream; 1210 - 
t.current_tag_self_closing <- true; 1211 - t.state <- State.Data; 1212 - emit_current_tag () 1213 - | None -> () 1214 - | Some _ -> 1215 - error t "unexpected-solidus-in-tag"; 1216 - t.state <- State.Before_attribute_name 1217 - 1218 - and state_bogus_comment () = 1219 - match Stream.consume t.stream with 1220 - | Some '>' -> 1221 - t.state <- State.Data; 1222 - emit_current_comment () 1223 - | Some '\x00' -> 1224 - error t "unexpected-null-character"; 1225 - Buffer.add_string t.current_comment "\xEF\xBF\xBD" 1226 - | Some c -> 1227 - check_control_char c; 1228 - Buffer.add_char t.current_comment c 1229 - | None -> () 1230 - 1231 - and state_markup_declaration_open () = 1232 - if Stream.matches_ci t.stream "--" then begin 1233 - ignore (Stream.consume_exact_ci t.stream "--"); 1234 - Buffer.clear t.current_comment; 1235 - t.state <- State.Comment_start 1236 - end else if Stream.matches_ci t.stream "DOCTYPE" then begin 1237 - ignore (Stream.consume_exact_ci t.stream "DOCTYPE"); 1238 - t.state <- State.Doctype 1239 - end else if Stream.matches_ci t.stream "[CDATA[" then begin 1240 - ignore (Stream.consume_exact_ci t.stream "[CDATA["); 1241 - (* CDATA only allowed in foreign content *) 1242 - if S.adjusted_current_node_in_html_namespace t.sink then begin 1243 - error t "cdata-in-html-content"; 1244 - Buffer.clear t.current_comment; 1245 - Buffer.add_string t.current_comment "[CDATA["; 1246 - t.state <- State.Bogus_comment 1247 - end else 1248 - t.state <- State.Cdata_section 1249 - end else begin 1250 - error t "incorrectly-opened-comment"; 1251 - Buffer.clear t.current_comment; 1252 - t.state <- State.Bogus_comment 1253 - end 1254 - 1255 - and state_comment_start () = 1256 - match Stream.peek t.stream with 1257 - | Some '-' -> 1258 - Stream.advance t.stream; 1259 - t.state <- State.Comment_start_dash 1260 - | Some '>' -> 1261 - Stream.advance t.stream; 1262 - error t "abrupt-closing-of-empty-comment"; 1263 - t.state <- State.Data; 1264 - emit_current_comment () 1265 - 
| _ -> 1266 - t.state <- State.Comment 1267 - 1268 - and state_comment_start_dash () = 1269 - match Stream.peek t.stream with 1270 - | Some '-' -> 1271 - Stream.advance t.stream; 1272 - t.state <- State.Comment_end 1273 - | Some '>' -> 1274 - Stream.advance t.stream; 1275 - error t "abrupt-closing-of-empty-comment"; 1276 - t.state <- State.Data; 1277 - emit_current_comment () 1278 - | None -> () 1279 - | Some _ -> 1280 - Buffer.add_char t.current_comment '-'; 1281 - t.state <- State.Comment 1282 - 1283 - and state_comment () = 1284 - match Stream.consume t.stream with 1285 - | Some '<' -> 1286 - Buffer.add_char t.current_comment '<'; 1287 - t.state <- State.Comment_less_than_sign 1288 - | Some '-' -> 1289 - t.state <- State.Comment_end_dash 1290 - | Some '\x00' -> 1291 - error t "unexpected-null-character"; 1292 - Buffer.add_string t.current_comment "\xEF\xBF\xBD" 1293 - | Some c -> 1294 - check_control_char c; 1295 - Buffer.add_char t.current_comment c 1296 - | None -> () 1297 - 1298 - and state_comment_less_than_sign () = 1299 - match Stream.peek t.stream with 1300 - | Some '!' 
-> 1301 - Stream.advance t.stream; 1302 - Buffer.add_char t.current_comment '!'; 1303 - t.state <- State.Comment_less_than_sign_bang 1304 - | Some '<' -> 1305 - Stream.advance t.stream; 1306 - Buffer.add_char t.current_comment '<' 1307 - | _ -> 1308 - t.state <- State.Comment 1309 - 1310 - and state_comment_less_than_sign_bang () = 1311 - match Stream.peek t.stream with 1312 - | Some '-' -> 1313 - Stream.advance t.stream; 1314 - t.state <- State.Comment_less_than_sign_bang_dash 1315 - | _ -> 1316 - t.state <- State.Comment 1317 - 1318 - and state_comment_less_than_sign_bang_dash () = 1319 - match Stream.peek t.stream with 1320 - | Some '-' -> 1321 - Stream.advance t.stream; 1322 - t.state <- State.Comment_less_than_sign_bang_dash_dash 1323 - | _ -> 1324 - t.state <- State.Comment_end_dash 1325 - 1326 - and state_comment_less_than_sign_bang_dash_dash () = 1327 - match Stream.peek t.stream with 1328 - | Some '>' | None -> 1329 - t.state <- State.Comment_end 1330 - | Some _ -> 1331 - error t "nested-comment"; 1332 - t.state <- State.Comment_end 1333 - 1334 - and state_comment_end_dash () = 1335 - match Stream.peek t.stream with 1336 - | Some '-' -> 1337 - Stream.advance t.stream; 1338 - t.state <- State.Comment_end 1339 - | None -> () 1340 - | Some _ -> 1341 - Buffer.add_char t.current_comment '-'; 1342 - t.state <- State.Comment 1343 - 1344 - and state_comment_end () = 1345 - match Stream.peek t.stream with 1346 - | Some '>' -> 1347 - Stream.advance t.stream; 1348 - t.state <- State.Data; 1349 - emit_current_comment () 1350 - | Some '!' 
-> 1351 - Stream.advance t.stream; 1352 - t.state <- State.Comment_end_bang 1353 - | Some '-' -> 1354 - Stream.advance t.stream; 1355 - Buffer.add_char t.current_comment '-' 1356 - | None -> () 1357 - | Some _ -> 1358 - Buffer.add_string t.current_comment "--"; 1359 - t.state <- State.Comment 1360 - 1361 - and state_comment_end_bang () = 1362 - match Stream.peek t.stream with 1363 - | Some '-' -> 1364 - Stream.advance t.stream; 1365 - Buffer.add_string t.current_comment "--!"; 1366 - t.state <- State.Comment_end_dash 1367 - | Some '>' -> 1368 - Stream.advance t.stream; 1369 - error t "incorrectly-closed-comment"; 1370 - t.state <- State.Data; 1371 - emit_current_comment () 1372 - | None -> () 1373 - | Some _ -> 1374 - Buffer.add_string t.current_comment "--!"; 1375 - t.state <- State.Comment 1376 - 1377 - and state_doctype () = 1378 - match Stream.peek t.stream with 1379 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1380 - Stream.advance t.stream; 1381 - t.state <- State.Before_doctype_name 1382 - | Some '>' -> 1383 - t.state <- State.Before_doctype_name 1384 - | None -> () 1385 - | Some _ -> 1386 - error t "missing-whitespace-before-doctype-name"; 1387 - t.state <- State.Before_doctype_name 1388 - 1389 - and state_before_doctype_name () = 1390 - match Stream.peek t.stream with 1391 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1392 - Stream.advance t.stream 1393 - | Some '\x00' -> 1394 - Stream.advance t.stream; 1395 - error t "unexpected-null-character"; 1396 - start_new_doctype t; 1397 - t.current_doctype_name <- Some (Buffer.create 8); 1398 - Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"; 1399 - t.state <- State.Doctype_name 1400 - | Some '>' -> 1401 - Stream.advance t.stream; 1402 - error t "missing-doctype-name"; 1403 - start_new_doctype t; 1404 - t.current_doctype_force_quirks <- true; 1405 - t.state <- State.Data; 1406 - emit_current_doctype () 1407 - | None -> () 1408 - | Some c -> 1409 - Stream.advance t.stream; 1410 - check_control_char c; 
1411 - start_new_doctype t; 1412 - t.current_doctype_name <- Some (Buffer.create 8); 1413 - Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c); 1414 - t.state <- State.Doctype_name 1415 - 1416 - and state_doctype_name () = 1417 - match Stream.consume t.stream with 1418 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1419 - t.state <- State.After_doctype_name 1420 - | Some '>' -> 1421 - t.state <- State.Data; 1422 - emit_current_doctype () 1423 - | Some '\x00' -> 1424 - error t "unexpected-null-character"; 1425 - Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD" 1426 - | Some c -> 1427 - check_control_char c; 1428 - Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c) 1429 - | None -> () 1430 - 1431 - and state_after_doctype_name () = 1432 - match Stream.peek t.stream with 1433 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1434 - Stream.advance t.stream 1435 - | Some '>' -> 1436 - Stream.advance t.stream; 1437 - t.state <- State.Data; 1438 - emit_current_doctype () 1439 - | None -> () 1440 - | Some _ -> 1441 - (* Don't check control char here - bogus_doctype will check when it consumes *) 1442 - if Stream.matches_ci t.stream "PUBLIC" then begin 1443 - ignore (Stream.consume_exact_ci t.stream "PUBLIC"); 1444 - t.state <- State.After_doctype_public_keyword 1445 - end else if Stream.matches_ci t.stream "SYSTEM" then begin 1446 - ignore (Stream.consume_exact_ci t.stream "SYSTEM"); 1447 - t.state <- State.After_doctype_system_keyword 1448 - end else begin 1449 - error t "invalid-character-sequence-after-doctype-name"; 1450 - t.current_doctype_force_quirks <- true; 1451 - t.state <- State.Bogus_doctype 1452 - end 1453 - 1454 - and state_after_doctype_public_keyword () = 1455 - match Stream.peek t.stream with 1456 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1457 - Stream.advance t.stream; 1458 - t.state <- State.Before_doctype_public_identifier 1459 - | Some '"' -> 1460 - Stream.advance t.stream; 1461 - error t 
"missing-whitespace-after-doctype-public-keyword"; 1462 - t.current_doctype_public <- Some (Buffer.create 32); 1463 - t.state <- State.Doctype_public_identifier_double_quoted 1464 - | Some '\'' -> 1465 - Stream.advance t.stream; 1466 - error t "missing-whitespace-after-doctype-public-keyword"; 1467 - t.current_doctype_public <- Some (Buffer.create 32); 1468 - t.state <- State.Doctype_public_identifier_single_quoted 1469 - | Some '>' -> 1470 - Stream.advance t.stream; 1471 - error t "missing-doctype-public-identifier"; 1472 - t.current_doctype_force_quirks <- true; 1473 - t.state <- State.Data; 1474 - emit_current_doctype () 1475 - | None -> () 1476 - | Some _ -> 1477 - (* Don't check control char here - bogus_doctype will check when it consumes *) 1478 - error t "missing-quote-before-doctype-public-identifier"; 1479 - t.current_doctype_force_quirks <- true; 1480 - t.state <- State.Bogus_doctype 1481 - 1482 - and state_before_doctype_public_identifier () = 1483 - match Stream.peek t.stream with 1484 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1485 - Stream.advance t.stream 1486 - | Some '"' -> 1487 - Stream.advance t.stream; 1488 - t.current_doctype_public <- Some (Buffer.create 32); 1489 - t.state <- State.Doctype_public_identifier_double_quoted 1490 - | Some '\'' -> 1491 - Stream.advance t.stream; 1492 - t.current_doctype_public <- Some (Buffer.create 32); 1493 - t.state <- State.Doctype_public_identifier_single_quoted 1494 - | Some '>' -> 1495 - Stream.advance t.stream; 1496 - error t "missing-doctype-public-identifier"; 1497 - t.current_doctype_force_quirks <- true; 1498 - t.state <- State.Data; 1499 - emit_current_doctype () 1500 - | None -> () 1501 - | Some _ -> 1502 - error t "missing-quote-before-doctype-public-identifier"; 1503 - t.current_doctype_force_quirks <- true; 1504 - t.state <- State.Bogus_doctype 1505 - 1506 - and state_doctype_public_identifier_double_quoted () = 1507 - match Stream.consume t.stream with 1508 - | Some '"' -> 1509 - t.state <- 
State.After_doctype_public_identifier 1510 - | Some '\x00' -> 1511 - error t "unexpected-null-character"; 1512 - Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD" 1513 - | Some '>' -> 1514 - error t "abrupt-doctype-public-identifier"; 1515 - t.current_doctype_force_quirks <- true; 1516 - t.state <- State.Data; 1517 - emit_current_doctype () 1518 - | Some c -> 1519 - check_control_char c; 1520 - Buffer.add_char (Option.get t.current_doctype_public) c 1521 - | None -> () 1522 - 1523 - and state_doctype_public_identifier_single_quoted () = 1524 - match Stream.consume t.stream with 1525 - | Some '\'' -> 1526 - t.state <- State.After_doctype_public_identifier 1527 - | Some '\x00' -> 1528 - error t "unexpected-null-character"; 1529 - Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD" 1530 - | Some '>' -> 1531 - error t "abrupt-doctype-public-identifier"; 1532 - t.current_doctype_force_quirks <- true; 1533 - t.state <- State.Data; 1534 - emit_current_doctype () 1535 - | Some c -> 1536 - check_control_char c; 1537 - Buffer.add_char (Option.get t.current_doctype_public) c 1538 - | None -> () 1539 - 1540 - and state_after_doctype_public_identifier () = 1541 - match Stream.peek t.stream with 1542 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1543 - Stream.advance t.stream; 1544 - t.state <- State.Between_doctype_public_and_system_identifiers 1545 - | Some '>' -> 1546 - Stream.advance t.stream; 1547 - t.state <- State.Data; 1548 - emit_current_doctype () 1549 - | Some '"' -> 1550 - Stream.advance t.stream; 1551 - error t "missing-whitespace-between-doctype-public-and-system-identifiers"; 1552 - t.current_doctype_system <- Some (Buffer.create 32); 1553 - t.state <- State.Doctype_system_identifier_double_quoted 1554 - | Some '\'' -> 1555 - Stream.advance t.stream; 1556 - error t "missing-whitespace-between-doctype-public-and-system-identifiers"; 1557 - t.current_doctype_system <- Some (Buffer.create 32); 1558 - t.state <- 
State.Doctype_system_identifier_single_quoted 1559 - | None -> () 1560 - | Some _ -> 1561 - (* Don't check control char here - bogus_doctype will check when it consumes *) 1562 - error t "missing-quote-before-doctype-system-identifier"; 1563 - t.current_doctype_force_quirks <- true; 1564 - t.state <- State.Bogus_doctype 1565 - 1566 - and state_between_doctype_public_and_system_identifiers () = 1567 - match Stream.peek t.stream with 1568 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1569 - Stream.advance t.stream 1570 - | Some '>' -> 1571 - Stream.advance t.stream; 1572 - t.state <- State.Data; 1573 - emit_current_doctype () 1574 - | Some '"' -> 1575 - Stream.advance t.stream; 1576 - t.current_doctype_system <- Some (Buffer.create 32); 1577 - t.state <- State.Doctype_system_identifier_double_quoted 1578 - | Some '\'' -> 1579 - Stream.advance t.stream; 1580 - t.current_doctype_system <- Some (Buffer.create 32); 1581 - t.state <- State.Doctype_system_identifier_single_quoted 1582 - | None -> () 1583 - | Some _ -> 1584 - (* Don't check control char here - bogus_doctype will check when it consumes *) 1585 - error t "missing-quote-before-doctype-system-identifier"; 1586 - t.current_doctype_force_quirks <- true; 1587 - t.state <- State.Bogus_doctype 1588 - 1589 - and state_after_doctype_system_keyword () = 1590 - match Stream.peek t.stream with 1591 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1592 - Stream.advance t.stream; 1593 - t.state <- State.Before_doctype_system_identifier 1594 - | Some '"' -> 1595 - Stream.advance t.stream; 1596 - error t "missing-whitespace-after-doctype-system-keyword"; 1597 - t.current_doctype_system <- Some (Buffer.create 32); 1598 - t.state <- State.Doctype_system_identifier_double_quoted 1599 - | Some '\'' -> 1600 - Stream.advance t.stream; 1601 - error t "missing-whitespace-after-doctype-system-keyword"; 1602 - t.current_doctype_system <- Some (Buffer.create 32); 1603 - t.state <- State.Doctype_system_identifier_single_quoted 1604 - | Some '>' -> 
1605 - Stream.advance t.stream; 1606 - error t "missing-doctype-system-identifier"; 1607 - t.current_doctype_force_quirks <- true; 1608 - t.state <- State.Data; 1609 - emit_current_doctype () 1610 - | None -> () 1611 - | Some _ -> 1612 - (* Don't check control char here - bogus_doctype will check when it consumes *) 1613 - error t "missing-quote-before-doctype-system-identifier"; 1614 - t.current_doctype_force_quirks <- true; 1615 - t.state <- State.Bogus_doctype 1616 - 1617 - and state_before_doctype_system_identifier () = 1618 - match Stream.peek t.stream with 1619 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1620 - Stream.advance t.stream 1621 - | Some '"' -> 1622 - Stream.advance t.stream; 1623 - t.current_doctype_system <- Some (Buffer.create 32); 1624 - t.state <- State.Doctype_system_identifier_double_quoted 1625 - | Some '\'' -> 1626 - Stream.advance t.stream; 1627 - t.current_doctype_system <- Some (Buffer.create 32); 1628 - t.state <- State.Doctype_system_identifier_single_quoted 1629 - | Some '>' -> 1630 - Stream.advance t.stream; 1631 - error t "missing-doctype-system-identifier"; 1632 - t.current_doctype_force_quirks <- true; 1633 - t.state <- State.Data; 1634 - emit_current_doctype () 1635 - | None -> () 1636 - | Some _ -> 1637 - (* Don't check control char here - bogus_doctype will check when it consumes *) 1638 - error t "missing-quote-before-doctype-system-identifier"; 1639 - t.current_doctype_force_quirks <- true; 1640 - t.state <- State.Bogus_doctype 1641 - 1642 - and state_doctype_system_identifier_double_quoted () = 1643 - match Stream.consume t.stream with 1644 - | Some '"' -> 1645 - t.state <- State.After_doctype_system_identifier 1646 - | Some '\x00' -> 1647 - error t "unexpected-null-character"; 1648 - Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD" 1649 - | Some '>' -> 1650 - error t "abrupt-doctype-system-identifier"; 1651 - t.current_doctype_force_quirks <- true; 1652 - t.state <- State.Data; 1653 - 
emit_current_doctype () 1654 - | Some c -> 1655 - check_control_char c; 1656 - Buffer.add_char (Option.get t.current_doctype_system) c 1657 - | None -> () 1658 - 1659 - and state_doctype_system_identifier_single_quoted () = 1660 - match Stream.consume t.stream with 1661 - | Some '\'' -> 1662 - t.state <- State.After_doctype_system_identifier 1663 - | Some '\x00' -> 1664 - error t "unexpected-null-character"; 1665 - Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD" 1666 - | Some '>' -> 1667 - error t "abrupt-doctype-system-identifier"; 1668 - t.current_doctype_force_quirks <- true; 1669 - t.state <- State.Data; 1670 - emit_current_doctype () 1671 - | Some c -> 1672 - check_control_char c; 1673 - Buffer.add_char (Option.get t.current_doctype_system) c 1674 - | None -> () 1675 - 1676 - and state_after_doctype_system_identifier () = 1677 - match Stream.peek t.stream with 1678 - | Some ('\t' | '\n' | '\x0C' | ' ') -> 1679 - Stream.advance t.stream 1680 - | Some '>' -> 1681 - Stream.advance t.stream; 1682 - t.state <- State.Data; 1683 - emit_current_doctype () 1684 - | None -> () 1685 - | Some _ -> 1686 - (* Don't check control char here - bogus_doctype will check when it consumes *) 1687 - error t "unexpected-character-after-doctype-system-identifier"; 1688 - t.state <- State.Bogus_doctype 1689 - 1690 - and state_bogus_doctype () = 1691 - match Stream.consume t.stream with 1692 - | Some '>' -> 1693 - t.state <- State.Data; 1694 - emit_current_doctype () 1695 - | Some '\x00' -> 1696 - error t "unexpected-null-character" 1697 - | Some c -> 1698 - check_control_char c (* Check all chars in bogus doctype *) 1699 - | None -> () 1700 - 1701 - and state_cdata_section () = 1702 - match Stream.consume t.stream with 1703 - | Some ']' -> 1704 - t.state <- State.Cdata_section_bracket 1705 - | Some c -> 1706 - (* CDATA section emits all characters as-is, including NUL, but still check for control chars *) 1707 - emit_char_checked c 1708 - | None -> () 1709 - 
1710 - and state_cdata_section_bracket () = 1711 - match Stream.peek t.stream with 1712 - | Some ']' -> 1713 - Stream.advance t.stream; 1714 - t.state <- State.Cdata_section_end 1715 - | _ -> 1716 - emit_char t ']'; 1717 - t.state <- State.Cdata_section 1718 - 1719 - and state_cdata_section_end () = 1720 - match Stream.peek t.stream with 1721 - | Some ']' -> 1722 - Stream.advance t.stream; 1723 - emit_char t ']' 1724 - | Some '>' -> 1725 - Stream.advance t.stream; 1726 - t.state <- State.Data 1727 - | _ -> 1728 - emit_str t "]]"; 1729 - t.state <- State.Cdata_section 1730 - 1731 - and state_character_reference () = 1732 - Buffer.clear t.temp_buffer; 1733 - Buffer.add_char t.temp_buffer '&'; 1734 - match Stream.peek t.stream with 1735 - | Some c when is_ascii_alnum c -> 1736 - t.state <- State.Named_character_reference 1737 - | Some '#' -> 1738 - Stream.advance t.stream; 1739 - Buffer.add_char t.temp_buffer '#'; 1740 - t.state <- State.Numeric_character_reference 1741 - | _ -> 1742 - flush_code_points_consumed_as_char_ref t; 1743 - t.state <- t.return_state 1744 - 1745 - and state_named_character_reference () = 1746 - (* Collect alphanumeric characters *) 1747 - let rec collect () = 1748 - match Stream.peek t.stream with 1749 - | Some c when is_ascii_alnum c -> 1750 - Stream.advance t.stream; 1751 - Buffer.add_char t.temp_buffer c; 1752 - collect () 1753 - | _ -> () 1754 - in 1755 - collect (); 1756 - 1757 - let has_semicolon = 1758 - match Stream.peek t.stream with 1759 - | Some ';' -> Stream.advance t.stream; Buffer.add_char t.temp_buffer ';'; true 1760 - | _ -> false 1761 - in 1762 - 1763 - (* Try to match entity - buffer contains "&name" or "&name;" *) 1764 - let buf_contents = Buffer.contents t.temp_buffer in 1765 - let name_start = 1 in (* Skip '&' *) 1766 - let name_end = String.length buf_contents - (if has_semicolon then 1 else 0) in 1767 - let entity_name = String.sub buf_contents name_start (name_end - name_start) in 1768 - 1769 - (* Try progressively 
shorter matches *) 1770 - (* Only match if: 1771 - 1. Full match with semicolon, OR 1772 - 2. Legacy entity (can be used without semicolon) *) 1773 - let rec try_match len = 1774 - if len <= 0 then None 1775 - else 1776 - let prefix = String.sub entity_name 0 len in 1777 - let is_full = len = String.length entity_name in 1778 - let would_have_semi = has_semicolon && is_full in 1779 - (* Only use this match if it has semicolon or is a legacy entity *) 1780 - if would_have_semi || Html5rw_entities.is_legacy prefix then 1781 - match Html5rw_entities.lookup prefix with 1782 - | Some decoded -> Some (decoded, len) 1783 - | None -> try_match (len - 1) 1784 - else 1785 - try_match (len - 1) 1786 - in 1787 - 1788 - match try_match (String.length entity_name) with 1789 - | Some (decoded, matched_len) -> 1790 - let full_match = matched_len = String.length entity_name in 1791 - let ends_with_semi = has_semicolon && full_match in 1792 - 1793 - (* Check attribute context restrictions *) 1794 - let in_attribute = match t.return_state with 1795 - | State.Attribute_value_double_quoted 1796 - | State.Attribute_value_single_quoted 1797 - | State.Attribute_value_unquoted -> true 1798 - | _ -> false 1799 - in 1800 - 1801 - let next_char = 1802 - if full_match && not has_semicolon then 1803 - Stream.peek t.stream 1804 - else if not full_match then 1805 - Some entity_name.[matched_len] 1806 - else None 1807 - in 1808 - 1809 - let blocked = in_attribute && not ends_with_semi && 1810 - match next_char with 1811 - | Some '=' -> true 1812 - | Some c when is_ascii_alnum c -> true 1813 - | _ -> false 1814 - in 1815 - 1816 - if blocked then begin 1817 - flush_code_points_consumed_as_char_ref t; 1818 - t.state <- t.return_state 1819 - end else begin 1820 - if not ends_with_semi then 1821 - error t "missing-semicolon-after-character-reference"; 1822 - Buffer.clear t.temp_buffer; 1823 - Buffer.add_string t.temp_buffer decoded; 1824 - flush_code_points_consumed_as_char_ref t; 1825 - (* Emit 
unconsumed chars after partial match *) 1826 - if not full_match then begin 1827 - let unconsumed = String.sub entity_name matched_len (String.length entity_name - matched_len) in 1828 - emit_str t unconsumed; 1829 - (* If there was a semicolon in input but we didn't use the full match, emit the semicolon too *) 1830 - if has_semicolon then 1831 - emit_char t ';' 1832 - end; 1833 - t.state <- t.return_state 1834 - end 1835 - | None -> 1836 - (* No match - check if we should report unknown-named-character-reference *) 1837 - if String.length entity_name > 0 then begin 1838 - (* If we have a semicolon, it's definitely an unknown named character reference *) 1839 - if has_semicolon then 1840 - error t "unknown-named-character-reference"; 1841 - (* Emit all the chars we consumed *) 1842 - flush_code_points_consumed_as_char_ref t; 1843 - t.state <- t.return_state 1844 - end else begin 1845 - flush_code_points_consumed_as_char_ref t; 1846 - t.state <- t.return_state 1847 - end 1848 - 1849 - and state_ambiguous_ampersand () = 1850 - match Stream.peek t.stream with 1851 - | Some c when is_ascii_alnum c -> 1852 - Stream.advance t.stream; 1853 - (match t.return_state with 1854 - | State.Attribute_value_double_quoted 1855 - | State.Attribute_value_single_quoted 1856 - | State.Attribute_value_unquoted -> 1857 - Buffer.add_char t.current_attr_value c 1858 - | _ -> 1859 - emit_char t c) 1860 - | Some ';' -> 1861 - error t "unknown-named-character-reference"; 1862 - t.state <- t.return_state 1863 - | _ -> 1864 - t.state <- t.return_state 1865 - 1866 - and state_numeric_character_reference () = 1867 - t.char_ref_code <- 0; 1868 - match Stream.peek t.stream with 1869 - | Some (('x' | 'X') as c) -> 1870 - Stream.advance t.stream; 1871 - Buffer.add_char t.temp_buffer c; 1872 - t.state <- State.Hexadecimal_character_reference_start 1873 - | _ -> 1874 - t.state <- State.Decimal_character_reference_start 1875 - 1876 - and state_hexadecimal_character_reference_start () = 1877 - match 
Stream.peek t.stream with 1878 - | Some c when is_ascii_hex c -> 1879 - t.state <- State.Hexadecimal_character_reference 1880 - | _ -> 1881 - error t "absence-of-digits-in-numeric-character-reference"; 1882 - flush_code_points_consumed_as_char_ref t; 1883 - t.state <- t.return_state 1884 - 1885 - and state_decimal_character_reference_start () = 1886 - match Stream.peek t.stream with 1887 - | Some c when is_ascii_digit c -> 1888 - t.state <- State.Decimal_character_reference 1889 - | _ -> 1890 - error t "absence-of-digits-in-numeric-character-reference"; 1891 - flush_code_points_consumed_as_char_ref t; 1892 - t.state <- t.return_state 1893 - 1894 - and state_hexadecimal_character_reference () = 1895 - match Stream.peek t.stream with 1896 - | Some c when is_ascii_digit c -> 1897 - Stream.advance t.stream; 1898 - t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code '0'); 1899 - if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1900 - | Some c when c >= 'A' && c <= 'F' -> 1901 - Stream.advance t.stream; 1902 - t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'A' + 10); 1903 - if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1904 - | Some c when c >= 'a' && c <= 'f' -> 1905 - Stream.advance t.stream; 1906 - t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'a' + 10); 1907 - if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1908 - | Some ';' -> 1909 - Stream.advance t.stream; 1910 - t.state <- State.Numeric_character_reference_end 1911 - | _ -> 1912 - error t "missing-semicolon-after-character-reference"; 1913 - t.state <- State.Numeric_character_reference_end 1914 - 1915 - and state_decimal_character_reference () = 1916 - match Stream.peek t.stream with 1917 - | Some c when is_ascii_digit c -> 1918 - Stream.advance t.stream; 1919 - t.char_ref_code <- t.char_ref_code * 10 + (Char.code c - Char.code '0'); 1920 - if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 
0x10FFFF + 1 1921 - | Some ';' -> 1922 - Stream.advance t.stream; 1923 - t.state <- State.Numeric_character_reference_end 1924 - | _ -> 1925 - error t "missing-semicolon-after-character-reference"; 1926 - t.state <- State.Numeric_character_reference_end 1927 - 1928 - and state_numeric_character_reference_end () = 1929 - let code = t.char_ref_code in 1930 - let replacement_char = "\xEF\xBF\xBD" in 1931 - 1932 - let result = 1933 - if code = 0 then begin 1934 - error t "null-character-reference"; 1935 - replacement_char 1936 - end else if code > 0x10FFFF then begin 1937 - error t "character-reference-outside-unicode-range"; 1938 - replacement_char 1939 - end else if code >= 0xD800 && code <= 0xDFFF then begin 1940 - error t "surrogate-character-reference"; 1941 - replacement_char 1942 - end else if (code >= 0xFDD0 && code <= 0xFDEF) || 1943 - List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF; 1944 - 0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF; 1945 - 0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF; 1946 - 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF; 1947 - 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF; 1948 - 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin 1949 - error t "noncharacter-character-reference"; 1950 - Html5rw_entities.Numeric_ref.codepoint_to_utf8 code 1951 - end else if (code >= 0x01 && code <= 0x08) || code = 0x0B || 1952 - (code >= 0x0D && code <= 0x1F) || 1953 - (code >= 0x7F && code <= 0x9F) then begin 1954 - error t "control-character-reference"; 1955 - (* Apply Windows-1252 replacement table for 0x80-0x9F *) 1956 - match Html5rw_entities.Numeric_ref.find_replacement code with 1957 - | Some replacement -> Html5rw_entities.Numeric_ref.codepoint_to_utf8 replacement 1958 - | None -> Html5rw_entities.Numeric_ref.codepoint_to_utf8 code 1959 - end else 1960 - Html5rw_entities.Numeric_ref.codepoint_to_utf8 code 1961 - in 1962 - 1963 - Buffer.clear t.temp_buffer; 1964 - Buffer.add_string t.temp_buffer result; 
1965 - flush_code_points_consumed_as_char_ref t; 1966 - t.state <- t.return_state 1967 - 1968 - in 1969 - process_state () 1970 - 1971 - let get_errors t = List.rev t.errors 1972 - 1973 - let set_state t state = t.state <- state 1974 - 1975 - let set_last_start_tag t name = t.last_start_tag <- name
+5 -5
test/dune
··· 5 5 (executable 6 6 (name test_all) 7 7 (modules test_all) 8 - (libraries bytesrw html5rw.parser html5rw.dom html5rw.tokenizer html5rw.encoding jsont jsont.bytesrw test_report)) 8 + (libraries bytesrw html5rw jsont jsont.bytesrw test_report)) 9 9 10 10 (executable 11 11 (name test_html5lib) 12 12 (modules test_html5lib) 13 - (libraries bytesrw html5rw.parser html5rw.dom test_report)) 13 + (libraries bytesrw html5rw test_report)) 14 14 15 15 (rule 16 16 (alias runtest) ··· 22 22 (executable 23 23 (name test_tokenizer) 24 24 (modules test_tokenizer) 25 - (libraries bytesrw html5rw.tokenizer jsont jsont.bytesrw test_report)) 25 + (libraries bytesrw html5rw jsont jsont.bytesrw test_report)) 26 26 27 27 (rule 28 28 (alias runtest) ··· 34 34 (executable 35 35 (name test_encoding) 36 36 (modules test_encoding) 37 - (libraries html5rw.encoding test_report)) 37 + (libraries html5rw test_report)) 38 38 39 39 (rule 40 40 (alias runtest) ··· 46 46 (executable 47 47 (name test_serializer) 48 48 (modules test_serializer) 49 - (libraries html5rw.dom jsont jsont.bytesrw test_report)) 49 + (libraries html5rw jsont jsont.bytesrw test_report)) 50 50 51 51 (rule 52 52 (alias runtest)
+37 -37
test/test_all.ml
··· 10 10 (* ============================================================ *) 11 11 12 12 module TreeConstruction = struct 13 - module Parser = Html5rw_parser 14 - module Dom = Html5rw_dom 13 + module Parser = Html5rw.Parser 14 + module Dom = Html5rw.Dom 15 15 16 16 type test_case = { 17 17 input : string; ··· 108 108 | [tag] -> (None, tag) 109 109 | _ -> (None, ctx_str) 110 110 in 111 - let context = Parser.make_fragment_context ~tag_name ~namespace () in 111 + let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in 112 112 let reader = Bytes.Reader.of_string test.input in 113 - Parser.parse ~collect_errors:true ~fragment_context:context reader 113 + Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader 114 114 | None -> 115 115 let reader = Bytes.Reader.of_string test.input in 116 - Parser.parse ~collect_errors:true reader 116 + Html5rw.Parser.parse ~collect_errors:true reader 117 117 in 118 - let actual_tree = Dom.to_test_format (Parser.root result) in 118 + let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in 119 119 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in 120 120 let actual = normalize_tree (strip_tree_prefix actual_tree) in 121 - let error_count = List.length (Parser.errors result) in 121 + let error_count = List.length (Html5rw.Parser.errors result) in 122 122 let expected_error_count = List.length test.expected_errors in 123 123 (expected = actual, expected, actual, error_count, expected_error_count) 124 124 with e -> ··· 190 190 (* ============================================================ *) 191 191 192 192 module Tokenizer_tests = struct 193 - module Tokenizer = Html5rw_tokenizer 193 + module Tokenizer = Html5rw.Tokenizer 194 194 195 195 module TokenCollector = struct 196 - type t = { mutable tokens : Tokenizer.Token.t list } 196 + type t = { mutable tokens : Html5rw.Tokenizer.Token.t list } 197 197 let create () = { tokens = [] } 198 198 let process t 
token = t.tokens <- token :: t.tokens; `Continue 199 199 let adjusted_current_node_in_html_namespace _ = true ··· 289 289 last_start_tag; double_escaped; xml_mode; raw_json } 290 290 291 291 let state_of_string = function 292 - | "Data state" -> Tokenizer.State.Data 293 - | "PLAINTEXT state" -> Tokenizer.State.Plaintext 294 - | "RCDATA state" -> Tokenizer.State.Rcdata 295 - | "RAWTEXT state" -> Tokenizer.State.Rawtext 296 - | "Script data state" -> Tokenizer.State.Script_data 297 - | "CDATA section state" -> Tokenizer.State.Cdata_section 292 + | "Data state" -> Html5rw.Tokenizer.State.Data 293 + | "PLAINTEXT state" -> Html5rw.Tokenizer.State.Plaintext 294 + | "RCDATA state" -> Html5rw.Tokenizer.State.Rcdata 295 + | "RAWTEXT state" -> Html5rw.Tokenizer.State.Rawtext 296 + | "Script data state" -> Html5rw.Tokenizer.State.Script_data 297 + | "CDATA section state" -> Html5rw.Tokenizer.State.Cdata_section 298 298 | s -> failwith ("Unknown state: " ^ s) 299 299 300 - let token_to_test_json (tok : Tokenizer.Token.t) : Jsont.json list = 300 + let token_to_test_json (tok : Html5rw.Tokenizer.Token.t) : Jsont.json list = 301 301 let str s = Jsont.String (s, Jsont.Meta.none) in 302 302 let arr l = Jsont.Array (l, Jsont.Meta.none) in 303 303 match tok with 304 - | Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } -> 304 + | Html5rw.Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } -> 305 305 let name_json = match name with Some n -> str n | None -> Jsont.Null ((), Jsont.Meta.none) in 306 306 let public_json = match public_id with Some p -> str p | None -> Jsont.Null ((), Jsont.Meta.none) in 307 307 let system_json = match system_id with Some s -> str s | None -> Jsont.Null ((), Jsont.Meta.none) in 308 308 let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in 309 309 [arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]] 310 - | Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } -> 310 + | 
Html5rw.Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } -> 311 311 let attrs_obj = Jsont.Object ( 312 312 List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs), 313 313 Jsont.Meta.none 314 314 ) in 315 315 if self_closing then [arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]] 316 316 else [arr [str "StartTag"; str name; attrs_obj]] 317 - | Tokenizer.Token.Tag { kind = End; name; _ } -> [arr [str "EndTag"; str name]] 318 - | Tokenizer.Token.Comment data -> [arr [str "Comment"; str data]] 319 - | Tokenizer.Token.Character data -> [arr [str "Character"; str data]] 320 - | Tokenizer.Token.EOF -> [] 317 + | Html5rw.Tokenizer.Token.Tag { kind = End; name; _ } -> [arr [str "EndTag"; str name]] 318 + | Html5rw.Tokenizer.Token.Comment data -> [arr [str "Comment"; str data]] 319 + | Html5rw.Tokenizer.Token.Character data -> [arr [str "Character"; str data]] 320 + | Html5rw.Tokenizer.Token.EOF -> [] 321 321 322 322 let rec json_equal a b = 323 323 match a, b with ··· 337 337 let merge_character_tokens tokens = 338 338 let rec loop acc = function 339 339 | [] -> List.rev acc 340 - | Tokenizer.Token.Character s1 :: Tokenizer.Token.Character s2 :: rest -> 341 - loop acc (Tokenizer.Token.Character (s1 ^ s2) :: rest) 340 + | Html5rw.Tokenizer.Token.Character s1 :: Html5rw.Tokenizer.Token.Character s2 :: rest -> 341 + loop acc (Html5rw.Tokenizer.Token.Character (s1 ^ s2) :: rest) 342 342 | tok :: rest -> loop (tok :: acc) rest 343 343 in loop [] tokens 344 344 345 345 let run_test test initial_state = 346 346 let input = if test.double_escaped then unescape_double test.input else test.input in 347 347 let collector = TokenCollector.create () in 348 - let tokenizer = Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in 349 - Tokenizer.set_state tokenizer initial_state; 350 - (match test.last_start_tag with Some tag -> Tokenizer.set_last_start_tag tokenizer tag | None -> 
()); 348 + let tokenizer = Html5rw.Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in 349 + Html5rw.Tokenizer.set_state tokenizer initial_state; 350 + (match test.last_start_tag with Some tag -> Html5rw.Tokenizer.set_last_start_tag tokenizer tag | None -> ()); 351 351 let reader = Bytes.Reader.of_string input in 352 - Tokenizer.run tokenizer (module TokenCollector) reader; 352 + Html5rw.Tokenizer.run tokenizer (module TokenCollector) reader; 353 353 let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in 354 354 let actual_tokens = List.concat_map token_to_test_json tokens in 355 355 let expected_output = if test.double_escaped then ··· 373 373 List.length actual_tokens = List.length expected && 374 374 List.for_all2 json_equal actual_tokens expected 375 375 in 376 - let actual_error_count = List.length (Tokenizer.get_errors tokenizer) in 376 + let actual_error_count = List.length (Html5rw.Tokenizer.get_errors tokenizer) in 377 377 let errors_count_match = actual_error_count = test.expected_error_count in 378 378 (tokens_match && errors_count_match, actual_tokens, expected, actual_error_count, test.expected_error_count) 379 379 ··· 465 465 (* ============================================================ *) 466 466 467 467 module Encoding_tests = struct 468 - module Encoding = Html5rw_encoding 468 + module Encoding = Html5rw.Encoding 469 469 470 470 type test_case = { 471 471 input : string; ··· 476 476 let normalize_encoding_name s = String.lowercase_ascii (String.trim s) 477 477 478 478 let encoding_to_test_name = function 479 - | Encoding.Utf8 -> "utf-8" 480 - | Encoding.Utf16le -> "utf-16le" 481 - | Encoding.Utf16be -> "utf-16be" 482 - | Encoding.Windows_1252 -> "windows-1252" 483 - | Encoding.Iso_8859_2 -> "iso-8859-2" 484 - | Encoding.Euc_jp -> "euc-jp" 479 + | Html5rw.Encoding.Utf8 -> "utf-8" 480 + | Html5rw.Encoding.Utf16le -> "utf-16le" 481 + | Html5rw.Encoding.Utf16be -> "utf-16be" 482 
+ | Html5rw.Encoding.Windows_1252 -> "windows-1252" 483 + | Html5rw.Encoding.Iso_8859_2 -> "iso-8859-2" 484 + | Html5rw.Encoding.Euc_jp -> "euc-jp" 485 485 486 486 let parse_test_case lines = 487 487 let raw_lines = String.concat "\n" lines in ··· 526 526 527 527 let run_test test = 528 528 try 529 - let (_, detected_encoding) = Encoding.decode (Bytes.of_string test.input) () in 529 + let (_, detected_encoding) = Html5rw.Encoding.decode (Bytes.of_string test.input) () in 530 530 let detected_name = encoding_to_test_name detected_encoding in 531 531 let expected_name = normalize_encoding_name test.expected_encoding in 532 532 let match_encoding det exp =
+8 -8
test/test_encoding.ml
··· 1 1 (* Test runner for html5lib-tests encoding tests *) 2 2 3 - module Encoding = Html5rw_encoding 3 + module Encoding = Html5rw.Encoding 4 4 module Report = Test_report 5 5 6 6 type test_case = { ··· 15 15 16 16 (* Convert our encoding type to canonical test name *) 17 17 let encoding_to_test_name = function 18 - | Encoding.Utf8 -> "utf-8" 19 - | Encoding.Utf16le -> "utf-16le" 20 - | Encoding.Utf16be -> "utf-16be" 21 - | Encoding.Windows_1252 -> "windows-1252" 22 - | Encoding.Iso_8859_2 -> "iso-8859-2" 23 - | Encoding.Euc_jp -> "euc-jp" 18 + | Html5rw.Encoding.Utf8 -> "utf-8" 19 + | Html5rw.Encoding.Utf16le -> "utf-16le" 20 + | Html5rw.Encoding.Utf16be -> "utf-16be" 21 + | Html5rw.Encoding.Windows_1252 -> "windows-1252" 22 + | Html5rw.Encoding.Iso_8859_2 -> "iso-8859-2" 23 + | Html5rw.Encoding.Euc_jp -> "euc-jp" 24 24 25 25 (* Parse a single test case from lines *) 26 26 let parse_test_case lines = ··· 79 79 let run_test test = 80 80 try 81 81 (* Detect encoding from the input bytes *) 82 - let (_, detected_encoding) = Encoding.decode (Bytes.of_string test.input) () in 82 + let (_, detected_encoding) = Html5rw.Encoding.decode (Bytes.of_string test.input) () in 83 83 let detected_name = encoding_to_test_name detected_encoding in 84 84 let expected_name = normalize_encoding_name test.expected_encoding in 85 85
+7 -7
test/test_html5lib.ml
··· 2 2 3 3 open Bytesrw 4 4 5 - module Parser = Html5rw_parser 6 - module Dom = Html5rw_dom 5 + module Parser = Html5rw.Parser 6 + module Dom = Html5rw.Dom 7 7 module Report = Test_report 8 8 9 9 type test_case = { ··· 119 119 | [tag] -> (None, tag) 120 120 | _ -> (None, ctx_str) 121 121 in 122 - let context = Parser.make_fragment_context ~tag_name ~namespace () in 122 + let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in 123 123 let reader = Bytes.Reader.of_string test.input in 124 - Parser.parse ~collect_errors:true ~fragment_context:context reader 124 + Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader 125 125 | None -> 126 126 let reader = Bytes.Reader.of_string test.input in 127 - Parser.parse ~collect_errors:true reader 127 + Html5rw.Parser.parse ~collect_errors:true reader 128 128 in 129 - let actual_tree = Dom.to_test_format (Parser.root result) in 129 + let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in 130 130 let expected = normalize_tree (strip_tree_prefix test.expected_tree) in 131 131 let actual = normalize_tree (strip_tree_prefix actual_tree) in 132 - let error_count = List.length (Parser.errors result) in 132 + let error_count = List.length (Html5rw.Parser.errors result) in 133 133 let expected_error_count = List.length test.expected_errors in 134 134 (expected = actual, expected, actual, error_count, expected_error_count) 135 135 with e ->
+9 -9
test/test_serializer.ml
··· 1 1 (* Test runner for html5lib-tests serializer tests *) 2 2 3 - module Dom = Html5rw_dom 3 + module Dom = Html5rw.Dom 4 4 module Report = Test_report 5 5 6 6 (* Extract values from JSON *) ··· 189 189 | EmptyTag of string * (string * string) list (* name, attrs *) 190 190 | TextNode of string 191 191 | CommentNode of string 192 - | DoctypeNode of Dom.node 192 + | DoctypeNode of Html5rw.Dom.node 193 193 194 194 type token_info = { 195 195 token : token_type option; ··· 235 235 236 236 | "Doctype", [name_json] -> 237 237 let name = json_string name_json in 238 - let node = Dom.create_doctype ~name () in 238 + let node = Html5rw.Dom.create_doctype ~name () in 239 239 { token = Some (DoctypeNode node) } 240 240 241 241 | "Doctype", [name_json; public_json] -> 242 242 let name = json_string name_json in 243 243 let public_id = json_string_opt public_json in 244 244 let node = match public_id with 245 - | Some pub -> Dom.create_doctype ~name ~public_id:pub () 246 - | None -> Dom.create_doctype ~name () 245 + | Some pub -> Html5rw.Dom.create_doctype ~name ~public_id:pub () 246 + | None -> Html5rw.Dom.create_doctype ~name () 247 247 in 248 248 { token = Some (DoctypeNode node) } 249 249 ··· 252 252 let public_id = json_string_opt public_json in 253 253 let system_id = json_string_opt system_json in 254 254 let node = match public_id, system_id with 255 - | Some pub, Some sys -> Dom.create_doctype ~name ~public_id:pub ~system_id:sys () 256 - | Some pub, None -> Dom.create_doctype ~name ~public_id:pub () 257 - | None, Some sys -> Dom.create_doctype ~name ~system_id:sys () 258 - | None, None -> Dom.create_doctype ~name () 255 + | Some pub, Some sys -> Html5rw.Dom.create_doctype ~name ~public_id:pub ~system_id:sys () 256 + | Some pub, None -> Html5rw.Dom.create_doctype ~name ~public_id:pub () 257 + | None, Some sys -> Html5rw.Dom.create_doctype ~name ~system_id:sys () 258 + | None, None -> Html5rw.Dom.create_doctype ~name () 259 259 in 260 260 { token = Some (DoctypeNode 
node) } 261 261
+22 -22
test/test_tokenizer.ml
··· 2 2 3 3 open Bytesrw 4 4 5 - module Tokenizer = Html5rw_tokenizer 5 + module Tokenizer = Html5rw.Tokenizer 6 6 module Report = Test_report 7 7 8 8 (* Token collector sink - collects all tokens into a list *) 9 9 module TokenCollector = struct 10 10 type t = { 11 - mutable tokens : Tokenizer.Token.t list; 11 + mutable tokens : Html5rw.Tokenizer.Token.t list; 12 12 } 13 13 14 14 let create () = { tokens = [] } ··· 139 139 140 140 (* Convert state name to State.t *) 141 141 let state_of_string = function 142 - | "Data state" -> Tokenizer.State.Data 143 - | "PLAINTEXT state" -> Tokenizer.State.Plaintext 144 - | "RCDATA state" -> Tokenizer.State.Rcdata 145 - | "RAWTEXT state" -> Tokenizer.State.Rawtext 146 - | "Script data state" -> Tokenizer.State.Script_data 147 - | "CDATA section state" -> Tokenizer.State.Cdata_section 142 + | "Data state" -> Html5rw.Tokenizer.State.Data 143 + | "PLAINTEXT state" -> Html5rw.Tokenizer.State.Plaintext 144 + | "RCDATA state" -> Html5rw.Tokenizer.State.Rcdata 145 + | "RAWTEXT state" -> Html5rw.Tokenizer.State.Rawtext 146 + | "Script data state" -> Html5rw.Tokenizer.State.Script_data 147 + | "CDATA section state" -> Html5rw.Tokenizer.State.Cdata_section 148 148 | s -> failwith ("Unknown state: " ^ s) 149 149 150 150 (* Convert our token to test format for comparison *) 151 - let token_to_test_json (tok : Tokenizer.Token.t) : Jsont.json list = 151 + let token_to_test_json (tok : Html5rw.Tokenizer.Token.t) : Jsont.json list = 152 152 let str s = Jsont.String (s, Jsont.Meta.none) in 153 153 let arr l = Jsont.Array (l, Jsont.Meta.none) in 154 154 match tok with 155 - | Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } -> 155 + | Html5rw.Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } -> 156 156 let name_json = match name with 157 157 | Some n -> str n 158 158 | None -> Jsont.Null ((), Jsont.Meta.none) ··· 167 167 in 168 168 let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in 169 169 
[arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]] 170 - | Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } -> 170 + | Html5rw.Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } -> 171 171 let attrs_obj = Jsont.Object ( 172 172 List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs), 173 173 Jsont.Meta.none ··· 176 176 [arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]] 177 177 else 178 178 [arr [str "StartTag"; str name; attrs_obj]] 179 - | Tokenizer.Token.Tag { kind = End; name; _ } -> 179 + | Html5rw.Tokenizer.Token.Tag { kind = End; name; _ } -> 180 180 [arr [str "EndTag"; str name]] 181 - | Tokenizer.Token.Comment data -> 181 + | Html5rw.Tokenizer.Token.Comment data -> 182 182 [arr [str "Comment"; str data]] 183 - | Tokenizer.Token.Character data -> 183 + | Html5rw.Tokenizer.Token.Character data -> 184 184 (* Split into individual characters for comparison - but actually 185 185 the tests expect consecutive characters to be merged *) 186 186 [arr [str "Character"; str data]] 187 - | Tokenizer.Token.EOF -> [] 187 + | Html5rw.Tokenizer.Token.EOF -> [] 188 188 189 189 (* Compare JSON values for equality *) 190 190 let rec json_equal a b = ··· 207 207 let merge_character_tokens tokens = 208 208 let rec loop acc = function 209 209 | [] -> List.rev acc 210 - | Tokenizer.Token.Character s1 :: Tokenizer.Token.Character s2 :: rest -> 211 - loop acc (Tokenizer.Token.Character (s1 ^ s2) :: rest) 210 + | Html5rw.Tokenizer.Token.Character s1 :: Html5rw.Tokenizer.Token.Character s2 :: rest -> 211 + loop acc (Html5rw.Tokenizer.Token.Character (s1 ^ s2) :: rest) 212 212 | tok :: rest -> loop (tok :: acc) rest 213 213 in 214 214 loop [] tokens ··· 218 218 let input = if test.double_escaped then unescape_double test.input else test.input in 219 219 220 220 let collector = TokenCollector.create () in 221 - let tokenizer = Tokenizer.create (module TokenCollector) collector 
~collect_errors:true ~xml_mode:test.xml_mode () in 221 + let tokenizer = Html5rw.Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in 222 222 223 223 (* Set initial state *) 224 - Tokenizer.set_state tokenizer initial_state; 224 + Html5rw.Tokenizer.set_state tokenizer initial_state; 225 225 226 226 (* Set last start tag if specified *) 227 227 (match test.last_start_tag with 228 - | Some tag -> Tokenizer.set_last_start_tag tokenizer tag 228 + | Some tag -> Html5rw.Tokenizer.set_last_start_tag tokenizer tag 229 229 | None -> ()); 230 230 231 231 (* Run tokenizer *) 232 232 let reader = Bytes.Reader.of_string input in 233 - Tokenizer.run tokenizer (module TokenCollector) reader; 233 + Html5rw.Tokenizer.run tokenizer (module TokenCollector) reader; 234 234 235 235 (* Get results *) 236 236 let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in ··· 267 267 List.for_all2 json_equal actual_tokens expected 268 268 in 269 269 270 - let actual_error_count = List.length (Tokenizer.get_errors tokenizer) in 270 + let actual_error_count = List.length (Html5rw.Tokenizer.get_errors tokenizer) in 271 271 let errors_count_match = actual_error_count = test.expected_error_count in 272 272 273 273 (tokens_match && errors_count_match, actual_tokens, expected, actual_error_count, test.expected_error_count)