lib/dom/dune  (deleted, -4)

lib/dom/html5rw_dom.ml → lib/html5rw/dom/dom.ml  (+5 -5)
@@ -10,9 +10,9 @@
    and serialization.
 *)
 
-include Node
+include Dom_node
 
-let to_html = Serialize.to_html
-let to_writer = Serialize.to_writer
-let to_test_format = Serialize.to_test_format
-let to_text = Serialize.to_text
+let to_html = Dom_serialize.to_html
+let to_writer = Dom_serialize.to_writer
+let to_test_format = Dom_serialize.to_test_format
+let to_text = Dom_serialize.to_text
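Note: the facade keeps the single-module surface while the implementation
splits into Dom_node and Dom_serialize; a minimal sketch of the re-exported
serializers (signatures assumed to take a node and return a string):

    let dump (n : Dom.node) =
      (* to_html serializes markup; to_text extracts the text content *)
      print_string (Dom.to_html n);
      print_string (Dom.to_text n)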
lib/dom/html5rw_dom.mli → lib/html5rw/dom/dom.mli  (+3 -3)
@@ -128,7 +128,7 @@
     @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
     WHATWG: DOCTYPE handling during parsing
 *)
-type doctype_data = Node.doctype_data = {
+type doctype_data = Dom_node.doctype_data = {
   name : string option;  (** The DOCTYPE name, e.g., "html" *)
   public_id : string option;  (** Public identifier (legacy, rarely used) *)
   system_id : string option;  (** System identifier (legacy, rarely used) *)
@@ -172,7 +172,7 @@
     @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
     WHATWG: How the parser determines quirks mode
 *)
-type quirks_mode = Node.quirks_mode = No_quirks | Quirks | Limited_quirks
+type quirks_mode = Dom_node.quirks_mode = No_quirks | Quirks | Limited_quirks
 
 (** A DOM node in the parsed document tree.
 
@@ -240,7 +240,7 @@
     @see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
     WHATWG: The template element
 *)
-type node = Node.node = {
+type node = Dom_node.node = {
   mutable name : string;
     (** Tag name for elements, or special name for other node types.
 
lib/dom/node.ml → lib/html5rw/dom/dom_node.ml  (rename only)

lib/dom/node.mli → lib/html5rw/dom/dom_node.mli  (rename only)
lib/dom/serialize.ml → lib/html5rw/dom/dom_serialize.ml  (+1 -1)
lib/encoding/bom.ml → lib/html5rw/encoding/encoding_bom.ml  (+3 -3)
@@ -6,14 +6,14 @@
      Bytes.get data 0 = '\xEF' &&
      Bytes.get data 1 = '\xBB' &&
      Bytes.get data 2 = '\xBF' then
-    Some (Encoding.Utf8, 3)
+    Some (Encoding_types.Utf8, 3)
   else if len >= 2 &&
      Bytes.get data 0 = '\xFF' &&
      Bytes.get data 1 = '\xFE' then
-    Some (Encoding.Utf16le, 2)
+    Some (Encoding_types.Utf16le, 2)
   else if len >= 2 &&
      Bytes.get data 0 = '\xFE' &&
      Bytes.get data 1 = '\xFF' then
-    Some (Encoding.Utf16be, 2)
+    Some (Encoding_types.Utf16be, 2)
   else
     None
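Note: a minimal sketch of what the renamed sniffer reports (signatures as
shown in this hunk; the byte literals are the standard UTF-8 BOM):

    (* sniff returns the detected encoding plus the number of BOM bytes
       to skip before decoding the rest of the buffer. *)
    let data = Bytes.of_string "\xEF\xBB\xBFhi" in
    match Encoding_bom.sniff data with
    | Some (Encoding_types.Utf8, 3) -> ()   (* expected *)
    | Some _ | None -> assert false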
lib/encoding/decode.ml → lib/html5rw/encoding/encoding_decode.ml  (+10 -10)
@@ -51,7 +51,7 @@
 
 let decode_with_encoding data enc ~bom_len =
   match enc with
-  | Encoding.Utf8 ->
+  | Encoding_types.Utf8 ->
     (* UTF-8: Just validate and replace errors with replacement character *)
     let len = Bytes.length data in
     let buf = Buffer.create len in
@@ -74,10 +74,10 @@
     loop ();
     Buffer.contents buf
 
-  | Encoding.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
-  | Encoding.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
+  | Encoding_types.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
+  | Encoding_types.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
 
-  | Encoding.Windows_1252 ->
+  | Encoding_types.Windows_1252 ->
     (* Windows-1252 mapping table for 0x80-0x9F range *)
     let len = Bytes.length data in
     let buf = Buffer.create len in
@@ -98,7 +98,7 @@
     done;
     Buffer.contents buf
 
-  | Encoding.Iso_8859_2 ->
+  | Encoding_types.Iso_8859_2 ->
     (* Use uuuu for ISO-8859-2 decoding *)
     let len = Bytes.length data in
     let buf = Buffer.create len in
@@ -109,7 +109,7 @@
     ) () s;
     Buffer.contents buf
 
-  | Encoding.Euc_jp ->
+  | Encoding_types.Euc_jp ->
     (* For EUC-JP, use uutf with best effort *)
     let len = Bytes.length data in
     let buf = Buffer.create len in
@@ -126,7 +126,7 @@
 
 let decode data ?transport_encoding () =
   (* Step 1: Check for BOM *)
-  let bom_result = Bom.sniff data in
+  let bom_result = Encoding_bom.sniff data in
   match bom_result with
   | Some (enc, bom_len) ->
     (decode_with_encoding data enc ~bom_len, enc)
@@ -134,15 +134,15 @@
   (* Step 2: Check transport encoding (e.g., HTTP Content-Type) *)
   let enc_from_transport =
     match transport_encoding with
-    | Some te -> Labels.normalize_label te
+    | Some te -> Encoding_labels.normalize_label te
     | None -> None
   in
   match enc_from_transport with
   | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
   | None ->
     (* Step 3: Prescan for meta charset *)
-    match Prescan.prescan_for_meta_charset data with
+    match Encoding_prescan.prescan_for_meta_charset data with
     | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
     | None ->
       (* Default to Windows-1252 per HTML5 spec when no encoding detected *)
-      (decode_with_encoding data Encoding.Windows_1252 ~bom_len:0, Encoding.Windows_1252)
+      (decode_with_encoding data Encoding_types.Windows_1252 ~bom_len:0, Encoding_types.Windows_1252)
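Note: a minimal sketch of the three-step cascade above, exercised end to end
(signatures as shown in this hunk; the HTML snippet is illustrative):

    (* No BOM and no transport encoding, so the meta prescan decides. *)
    let html = Bytes.of_string "<meta charset='utf-8'><p>ol\xC3\xA1</p>" in
    let (_text, enc) = Encoding_decode.decode html () in
    assert (enc = Encoding_types.Utf8);

    (* A transport label (e.g. from an HTTP Content-Type header) takes
       precedence over the prescan when no BOM is present. *)
    let (_, enc) = Encoding_decode.decode html ~transport_encoding:"latin1" () in
    assert (enc = Encoding_types.Windows_1252)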
lib/encoding/dune  (deleted, -4)

lib/encoding/encoding.ml → lib/html5rw/encoding/encoding_types.ml  (rename only)
lib/encoding/html5rw_encoding.ml → lib/html5rw/encoding/encoding.ml  (+6 -6)
@@ -30,7 +30,7 @@
     encodings, but this implementation focuses on the most common ones.
     Other encodings are mapped to their closest equivalent.
 *)
-type encoding = Encoding.t =
+type encoding = Encoding_types.t =
   | Utf8      (** UTF-8 encoding (default) *)
   | Utf16le   (** UTF-16 little-endian *)
   | Utf16be   (** UTF-16 big-endian *)
@@ -44,7 +44,7 @@
 
     Returns the WHATWG canonical name, e.g., ["utf-8"], ["utf-16le"].
 *)
-let encoding_to_string = Encoding.to_string
+let encoding_to_string = Encoding_types.to_string
 
 (** Detect encoding from a byte order mark.
 
@@ -54,7 +54,7 @@
     @return [(Some (encoding, skip_bytes))] if a BOM is found,
       [None] otherwise.
 *)
-let sniff_bom = Bom.sniff
+let sniff_bom = Encoding_bom.sniff
 
 (** Normalize an encoding label to its canonical form.
 
@@ -69,7 +69,7 @@
       normalize_label "latin1" (* Some Windows_1252 *)
     ]}
 *)
-let normalize_label = Labels.normalize_label
+let normalize_label = Encoding_labels.normalize_label
 
 (** Prescan bytes to find a meta charset declaration.
 
@@ -78,7 +78,7 @@
 
     @return [Some encoding] if a meta charset is found, [None] otherwise.
 *)
-let prescan_for_meta_charset = Prescan.prescan_for_meta_charset
+let prescan_for_meta_charset = Encoding_prescan.prescan_for_meta_charset
 
 (** {1 Decoding} *)
 
@@ -98,4 +98,4 @@
       (* html is now a UTF-8 string, enc is the detected encoding *)
     ]}
 *)
-let decode = Decode.decode
+let decode = Encoding_decode.decode
lib/encoding/html5rw_encoding.mli → lib/html5rw/encoding/encoding.mli  (+1 -1)
@@ -30,7 +30,7 @@
     encodings, but this implementation focuses on the most common ones.
     Other encodings are mapped to their closest equivalent.
 *)
-type encoding = Encoding.t =
+type encoding = Encoding_types.t =
   | Utf8      (** UTF-8 encoding (default) *)
   | Utf16le   (** UTF-16 little-endian *)
   | Utf16be   (** UTF-16 big-endian *)
lib/encoding/labels.ml → lib/html5rw/encoding/encoding_labels.ml  (+10 -10)
@@ -8,25 +8,25 @@
   else
   (* Security: never allow utf-7 *)
   if s = "utf-7" || s = "utf7" || s = "x-utf-7" then
-    Some Encoding.Windows_1252
+    Some Encoding_types.Windows_1252
   else if s = "utf-8" || s = "utf8" then
-    Some Encoding.Utf8
+    Some Encoding_types.Utf8
   (* HTML treats latin-1 labels as windows-1252 *)
   else if s = "iso-8859-1" || s = "iso8859-1" || s = "latin1" ||
           s = "latin-1" || s = "l1" || s = "cp819" || s = "ibm819" then
-    Some Encoding.Windows_1252
+    Some Encoding_types.Windows_1252
   else if s = "windows-1252" || s = "windows1252" || s = "cp1252" || s = "x-cp1252" then
-    Some Encoding.Windows_1252
+    Some Encoding_types.Windows_1252
   else if s = "iso-8859-2" || s = "iso8859-2" || s = "latin2" || s = "latin-2" then
-    Some Encoding.Iso_8859_2
+    Some Encoding_types.Iso_8859_2
   else if s = "euc-jp" || s = "eucjp" then
-    Some Encoding.Euc_jp
+    Some Encoding_types.Euc_jp
   else if s = "utf-16" || s = "utf16" then
-    Some Encoding.Utf16le (* Default to LE for ambiguous utf-16 *)
+    Some Encoding_types.Utf16le (* Default to LE for ambiguous utf-16 *)
   else if s = "utf-16le" || s = "utf16le" then
-    Some Encoding.Utf16le
+    Some Encoding_types.Utf16le
   else if s = "utf-16be" || s = "utf16be" then
-    Some Encoding.Utf16be
+    Some Encoding_types.Utf16be
   else
     None
 
@@ -37,5 +37,5 @@
   (* Per HTML meta charset handling: ignore UTF-16/UTF-32 declarations and
      treat them as UTF-8 *)
   match enc with
-  | Encoding.Utf16le | Encoding.Utf16be -> Some Encoding.Utf8
+  | Encoding_types.Utf16le | Encoding_types.Utf16be -> Some Encoding_types.Utf8
   | other -> Some other
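Note: a quick sketch of the label table above in action (behaviour as shown
in these hunks; the utf-7 branch is a security fallback, not a decoder):

    assert (Encoding_labels.normalize_label "latin1"
            = Some Encoding_types.Windows_1252);
    assert (Encoding_labels.normalize_label "utf-7"
            = Some Encoding_types.Windows_1252);
    (* A meta-declared utf-16 label is coerced to UTF-8, since bytes that
       survive the prescan are necessarily ASCII-compatible. *)
    assert (Encoding_labels.normalize_meta_declared "utf-16le"
            = Some Encoding_types.Utf8)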
lib/encoding/prescan.ml → lib/html5rw/encoding/encoding_prescan.ml  (+2 -2)
@@ -229,7 +229,7 @@
         (* Check for charset *)
         (match !charset with
          | Some cs ->
-            (match Labels.normalize_meta_declared cs with
+            (match Encoding_labels.normalize_meta_declared cs with
              | Some enc -> result := Some enc
              | None -> ())
          | None -> ());
@@ -241,7 +241,7 @@
          | Some he, Some ct when String.lowercase_ascii he = "content-type" ->
            (match extract_charset_from_content ct with
             | Some extracted ->
-               (match Labels.normalize_meta_declared extracted with
+               (match Encoding_labels.normalize_meta_declared extracted with
                 | Some enc -> result := Some enc
                 | None -> ())
            | None -> ())
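Note: the prescan handles both meta forms shown in the hunks above; a small
sketch (behaviour implied by this file, input strings illustrative):

    let direct = Bytes.of_string "<meta charset=\"euc-jp\">" in
    let http_equiv = Bytes.of_string
      "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=euc-jp\">" in
    assert (Encoding_prescan.prescan_for_meta_charset direct
            = Some Encoding_types.Euc_jp);
    assert (Encoding_prescan.prescan_for_meta_charset http_equiv
            = Some Encoding_types.Euc_jp)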
lib/entities/decode.ml → lib/html5rw/entities/entities_decode.ml  (+8 -8)
@@ -54,7 +54,7 @@
       let digit_text = String.sub text digit_start (!j - digit_start) in
 
       if String.length digit_text > 0 then begin
-        match Numeric_ref.decode digit_text ~is_hex with
+        match Entities_numeric_ref.decode digit_text ~is_hex with
         | Some decoded ->
           Buffer.add_string buf decoded;
          i := if has_semicolon then !j + 1 else !j
@@ -84,7 +84,7 @@
       (* Try exact match first (with semicolon expected) *)
       let decoded =
         if has_semicolon then
-          Entity_table.lookup entity_name
+          Entities_entity_table.lookup entity_name
         else
           None
       in
@@ -101,8 +101,8 @@
         if k <= 0 then None
         else
           let prefix = String.sub entity_name 0 k in
-          if Entity_table.is_legacy prefix then
-            match Entity_table.lookup prefix with
+          if Entities_entity_table.is_legacy prefix then
+            match Entities_entity_table.lookup prefix with
            | Some value -> Some (value, k)
            | None -> try_prefix (k - 1)
          else
@@ -118,8 +118,8 @@
         i := !j + 1
       end else if not has_semicolon then begin
         (* Try without semicolon for legacy compatibility *)
-        if Entity_table.is_legacy entity_name then
-          match Entity_table.lookup entity_name with
+        if Entities_entity_table.is_legacy entity_name then
+          match Entities_entity_table.lookup entity_name with
          | Some value ->
            (* Legacy entities without semicolon have strict rules in attributes *)
            let next_char = if !j < len then Some text.[!j] else None in
@@ -145,8 +145,8 @@
         if k <= 0 then None
         else
           let prefix = String.sub entity_name 0 k in
-          if Entity_table.is_legacy prefix then
-            match Entity_table.lookup prefix with
+          if Entities_entity_table.is_legacy prefix then
+            match Entities_entity_table.lookup prefix with
            | Some value -> Some (value, k)
            | None -> try_prefix (k - 1)
          else
lib/entities/dune  (deleted, -9)
lib/entities/html5rw_entities.ml → lib/html5rw/entities/entities.ml  (+6 -6)
@@ -49,5 +49,5 @@
       (* Returns: "<script>" *)
     ]}
 *)
-let decode = Decode.decode_entities_in_text
+let decode = Entities_decode.decode_entities_in_text
 
@@ -60,5 +60,5 @@
     specification (e.g., control characters in the 0x80-0x9F range
     are mapped to Windows-1252 equivalents).
 *)
-let decode_numeric = Numeric_ref.decode
+let decode_numeric = Entities_numeric_ref.decode
 
@@ -73,5 +73,5 @@
       lookup "bogus" (* None *)
     ]}
 *)
-let lookup = Entity_table.lookup
+let lookup = Entities_entity_table.lookup
 
@@ -87,16 +87,16 @@
       is_legacy "Aacute" (* false - requires semicolon *)
     ]}
 *)
-let is_legacy = Entity_table.is_legacy
+let is_legacy = Entities_entity_table.is_legacy
 
 (** Convert a Unicode codepoint to its UTF-8 encoding.
 
     @param codepoint The Unicode codepoint (0 to 0x10FFFF)
     @return The UTF-8 encoded string
 *)
-let codepoint_to_utf8 = Numeric_ref.codepoint_to_utf8
+let codepoint_to_utf8 = Entities_numeric_ref.codepoint_to_utf8
 
 (** {1 Sub-modules} *)
 
 (** Numeric character reference handling. *)
-module Numeric_ref = Numeric_ref
+module Numeric_ref = Entities_numeric_ref
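Note: the re-exports above keep the public entity API stable across the
rename; a short sketch of the calls they forward to (names as in this hunk,
results assuming the standard WHATWG entity table):

    assert (Entities.lookup "amp" = Some "&");
    assert (Entities.decode_numeric "3C" ~is_hex:true = Some "<");
    assert (Entities.codepoint_to_utf8 0x1F600 = "\xF0\x9F\x98\x80");
    (* "amp" is a legacy entity, so it also matches without a semicolon. *)
    assert (Entities.is_legacy "amp")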
lib/entities/numeric_ref.ml → lib/html5rw/entities/entities_numeric_ref.ml  (rename only)
lib/html5rw/dune  (+3 -8)

lib/html5rw/entities/dune  (new, +4)
lib/html5rw/html5rw.ml  (+6 -6)
@@ -37,22 +37,22 @@
 (** {1 Sub-modules} *)
 
 (** DOM types and manipulation functions *)
-module Dom = Html5rw_dom
+module Dom = Dom
 
 (** HTML5 tokenizer *)
-module Tokenizer = Html5rw_tokenizer
+module Tokenizer = Tokenizer
 
 (** Encoding detection and decoding *)
-module Encoding = Html5rw_encoding
+module Encoding = Encoding
 
 (** CSS selector engine *)
-module Selector = Html5rw_selector
+module Selector = Selector
 
 (** HTML entity decoding *)
-module Entities = Html5rw_entities
+module Entities = Entities
 
 (** Low-level parser access *)
-module Parser = Html5rw_parser
+module Parser = Parser
 
 (** {1 Core Types} *)
 
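Note: with these aliases the externally visible paths are unchanged while
the underlying wrapped libraries drop their Html5rw_ prefixes; a sketch of
caller code that compiles the same before and after (assuming parse_bytes
accepts an in-memory string and to_string serializes the document):

    let doc = Html5rw.Parser.parse_bytes "<p>hi</p>" in
    print_string (Html5rw.Parser.to_string doc)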
lib/html5rw/html5rw.mli  (+6 -6)
@@ -132,5 +132,5 @@
 
     @see <https://html.spec.whatwg.org/multipage/dom.html>
     WHATWG: The elements of HTML *)
-module Dom = Html5rw_dom
+module Dom = Dom
 
@@ -146,5 +146,5 @@
 
     @see <https://html.spec.whatwg.org/multipage/parsing.html#tokenization>
     WHATWG: Tokenization *)
-module Tokenizer = Html5rw_tokenizer
+module Tokenizer = Tokenizer
 
@@ -163,5 +163,5 @@
     WHATWG: Determining the character encoding
     @see <https://encoding.spec.whatwg.org/>
     WHATWG Encoding Standard *)
-module Encoding = Html5rw_encoding
+module Encoding = Encoding
 
@@ -180,5 +180,5 @@
 
     @see <https://www.w3.org/TR/selectors-4/>
     W3C Selectors Level 4 specification *)
-module Selector = Html5rw_selector
+module Selector = Selector
 
@@ -196,5 +196,5 @@
 
     @see <https://html.spec.whatwg.org/multipage/named-characters.html>
     WHATWG: Named character references *)
-module Entities = Html5rw_entities
+module Entities = Entities
 
@@ -210,7 +210,7 @@
 
     @see <https://html.spec.whatwg.org/multipage/parsing.html#tree-construction>
     WHATWG: Tree construction *)
-module Parser = Html5rw_parser
+module Parser = Parser
 
 (** {1 Core Types} *)
 
lib/html5rw/parser/parser.ml  (new, +41)
@@ -0,0 +1,41 @@
+(*---------------------------------------------------------------------------
+  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
+  SPDX-License-Identifier: MIT
+  ---------------------------------------------------------------------------*)
+
+(* html5rw.parser - HTML5 parser with bytesrw-only API *)
+
+module Dom = Dom
+module Tokenizer = Tokenizer
+module Encoding = Encoding
+module Constants = Parser_constants
+module Insertion_mode = Parser_insertion_mode
+module Tree_builder = Parser_tree_builder
+
+type parse_error = Parser_impl.parse_error
+type fragment_context = Parser_impl.fragment_context
+type t = Parser_impl.t
+
+(* parse_error accessors *)
+let error_code (e : parse_error) = e.Parser_tree_builder.code
+let error_line (e : parse_error) = e.Parser_tree_builder.line
+let error_column (e : parse_error) = e.Parser_tree_builder.column
+
+(* fragment_context constructor and accessors *)
+let make_fragment_context ~tag_name ?(namespace=None) () : fragment_context =
+  { Parser_tree_builder.tag_name; namespace }
+
+let fragment_context_tag (ctx : fragment_context) = ctx.Parser_tree_builder.tag_name
+let fragment_context_namespace (ctx : fragment_context) = ctx.Parser_tree_builder.namespace
+
+let parse = Parser_impl.parse
+let parse_bytes = Parser_impl.parse_bytes
+let query = Parser_impl.query
+let to_writer = Parser_impl.to_writer
+let to_string = Parser_impl.to_string
+let to_text = Parser_impl.to_text
+let to_test_format = Parser_impl.to_test_format
+
+let root t = t.Parser_impl.root
+let errors t = t.Parser_impl.errors
+let encoding t = t.Parser_impl.encoding
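Note: parser.ml is a thin facade over Parser_impl; a sketch of how the new
accessors compose (parse is assumed here to take a string, and error codes
are assumed to be strings as in the tokenizer's error type):

    let report html =
      let t = Parser.parse html in
      List.iter
        (fun e ->
          Printf.printf "%s at %d:%d\n"
            (Parser.error_code e) (Parser.error_line e) (Parser.error_column e))
        (Parser.errors t);
      Parser.to_string t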
lib/html5rw/tokenizer/tokenizer.ml  (new, +16)
@@ -0,0 +1,16 @@
+(* html5rw.tokenizer - HTML5 tokenizer with bytesrw-only API *)
+
+module Token = Tokenizer_token
+module State = Tokenizer_state
+module Errors = Tokenizer_errors
+module Stream = Tokenizer_stream
+
+module type SINK = Tokenizer_impl.SINK
+
+type 'a t = 'a Tokenizer_impl.t
+
+let create = Tokenizer_impl.create
+let run = Tokenizer_impl.run
+let get_errors = Tokenizer_impl.get_errors
+let set_state = Tokenizer_impl.set_state
+let set_last_start_tag = Tokenizer_impl.set_last_start_tag
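Note: SINK (defined in tokenizer_impl.ml below) is how the tree builder
receives tokens; a minimal counting sink, to show the shape of the
interface (a real sink would also answer the namespace query properly):

    module Count_sink = struct
      type t = int ref
      (* Count every token and never ask the tokenizer to switch state. *)
      let process n _token = incr n; `Continue
      let adjusted_current_node_in_html_namespace _ = true
    end

    let tok = Tokenizer.create (module Count_sink) (ref 0) ()
    (* then, given a Bytesrw reader:
       Tokenizer.run tok (module Count_sink) reader *)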
lib/html5rw/tokenizer/tokenizer_impl.ml  (new, +1975)
···
1
+
(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *)
2
+
3
+
(* Character classification using Astring *)
4
+
let is_ascii_alpha = Astring.Char.Ascii.is_letter
5
+
let is_ascii_digit = Astring.Char.Ascii.is_digit
6
+
let is_ascii_hex = Astring.Char.Ascii.is_hex_digit
7
+
let is_ascii_alnum = Astring.Char.Ascii.is_alphanum
8
+
let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r'
9
+
let ascii_lower = Astring.Char.Ascii.lowercase
10
+
11
+
(* Token sink interface *)
12
+
module type SINK = sig
13
+
type t
14
+
val process : t -> Tokenizer_token.t -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
15
+
val adjusted_current_node_in_html_namespace : t -> bool
16
+
end
17
+
18
+
type 'sink t = {
19
+
mutable stream : Tokenizer_stream.t;
20
+
sink : 'sink;
21
+
mutable state : Tokenizer_state.t;
22
+
mutable return_state : Tokenizer_state.t;
23
+
mutable char_ref_code : int;
24
+
mutable temp_buffer : Buffer.t;
25
+
mutable last_start_tag : string;
26
+
mutable current_tag_name : Buffer.t;
27
+
mutable current_tag_kind : Tokenizer_token.tag_kind;
28
+
mutable current_tag_self_closing : bool;
29
+
mutable current_attr_name : Buffer.t;
30
+
mutable current_attr_value : Buffer.t;
31
+
mutable current_attrs : (string * string) list;
32
+
mutable current_doctype_name : Buffer.t option;
33
+
mutable current_doctype_public : Buffer.t option;
34
+
mutable current_doctype_system : Buffer.t option;
35
+
mutable current_doctype_force_quirks : bool;
36
+
mutable current_comment : Buffer.t;
37
+
mutable pending_chars : Buffer.t;
38
+
mutable errors : Tokenizer_errors.t list;
39
+
collect_errors : bool;
40
+
xml_mode : bool; (* XML violation mode: transform chars for XML compatibility *)
41
+
}
42
+
43
+
let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) ?(xml_mode=false) () = {
44
+
stream = Tokenizer_stream.create "";
45
+
sink;
46
+
state = Tokenizer_state.Data;
47
+
return_state = Tokenizer_state.Data;
48
+
char_ref_code = 0;
49
+
temp_buffer = Buffer.create 64;
50
+
last_start_tag = "";
51
+
current_tag_name = Buffer.create 32;
52
+
current_tag_kind = Tokenizer_token.Start;
53
+
current_tag_self_closing = false;
54
+
current_attr_name = Buffer.create 32;
55
+
current_attr_value = Buffer.create 64;
56
+
current_attrs = [];
57
+
current_doctype_name = None;
58
+
current_doctype_public = None;
59
+
current_doctype_system = None;
60
+
current_doctype_force_quirks = false;
61
+
current_comment = Buffer.create 64;
62
+
pending_chars = Buffer.create 256;
63
+
errors = [];
64
+
collect_errors;
65
+
xml_mode;
66
+
}
67
+
68
+
let error t code =
69
+
if t.collect_errors then begin
70
+
let (line, column) = Tokenizer_stream.position t.stream in
71
+
t.errors <- Tokenizer_errors.make ~code ~line ~column :: t.errors
72
+
end
73
+
74
+
(* emit functions are defined locally inside run *)
75
+
76
+
(* XML mode character transformation: form feed → space *)
77
+
let emit_char t c =
78
+
if t.xml_mode && c = '\x0C' then
79
+
Buffer.add_char t.pending_chars ' '
80
+
else
81
+
Buffer.add_char t.pending_chars c
82
+
83
+
(* XML mode string transformation: U+FFFF → U+FFFD, form feed → space *)
84
+
let emit_str t s =
85
+
if t.xml_mode then begin
86
+
(* Transform: \xEF\xBF\xBF (U+FFFF) → \xEF\xBF\xBD (U+FFFD), \x0C → space *)
87
+
let len = String.length s in
88
+
let i = ref 0 in
89
+
while !i < len do
90
+
let c = s.[!i] in
91
+
if c = '\x0C' then begin
92
+
Buffer.add_char t.pending_chars ' ';
93
+
incr i
94
+
end else if c = '\xEF' && !i + 2 < len && s.[!i+1] = '\xBF' && s.[!i+2] = '\xBF' then begin
95
+
(* U+FFFF → U+FFFD *)
96
+
Buffer.add_string t.pending_chars "\xEF\xBF\xBD";
97
+
i := !i + 3
98
+
end else begin
99
+
Buffer.add_char t.pending_chars c;
100
+
incr i
101
+
end
102
+
done
103
+
end else
104
+
Buffer.add_string t.pending_chars s
105
+
106
+
let start_new_tag t kind =
107
+
Buffer.clear t.current_tag_name;
108
+
t.current_tag_kind <- kind;
109
+
t.current_tag_self_closing <- false;
110
+
t.current_attrs <- []
111
+
112
+
let start_new_attribute t =
113
+
(* Save previous attribute if any *)
114
+
let name = Buffer.contents t.current_attr_name in
115
+
if String.length name > 0 then begin
116
+
let value = Buffer.contents t.current_attr_value in
117
+
(* Check for duplicates - only add if not already present *)
118
+
if not (List.exists (fun (n, _) -> n = name) t.current_attrs) then
119
+
t.current_attrs <- (name, value) :: t.current_attrs
120
+
else
121
+
error t "duplicate-attribute"
122
+
end;
123
+
Buffer.clear t.current_attr_name;
124
+
Buffer.clear t.current_attr_value
125
+
126
+
let finish_attribute t =
127
+
start_new_attribute t
128
+
129
+
let start_new_doctype t =
130
+
t.current_doctype_name <- None;
131
+
t.current_doctype_public <- None;
132
+
t.current_doctype_system <- None;
133
+
t.current_doctype_force_quirks <- false
134
+
135
+
(* emit_current_tag, emit_current_doctype, emit_current_comment are defined locally inside run *)
136
+
137
+
let is_appropriate_end_tag t =
138
+
let name = Buffer.contents t.current_tag_name in
139
+
String.length t.last_start_tag > 0 && name = t.last_start_tag
140
+
141
+
let flush_code_points_consumed_as_char_ref t =
142
+
let s = Buffer.contents t.temp_buffer in
143
+
match t.return_state with
144
+
| Tokenizer_state.Attribute_value_double_quoted
145
+
| Tokenizer_state.Attribute_value_single_quoted
146
+
| Tokenizer_state.Attribute_value_unquoted ->
147
+
Buffer.add_string t.current_attr_value s
148
+
| _ ->
149
+
emit_str t s
150
+
151
+
open Bytesrw
152
+
153
+
(* Main tokenization loop *)
154
+
let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) =
155
+
t.stream <- Tokenizer_stream.create_from_reader reader;
156
+
t.errors <- [];
157
+
(* Set up error callback for surrogate/noncharacter detection in stream *)
158
+
(* In XML mode, we don't report noncharacter errors - we transform them instead *)
159
+
if not t.xml_mode then
160
+
Tokenizer_stream.set_error_callback t.stream (fun code -> error t code);
161
+
162
+
(* XML mode transformation for pending chars: U+FFFF → U+FFFD *)
163
+
let transform_xml_chars data =
164
+
let len = String.length data in
165
+
let buf = Buffer.create len in
166
+
let i = ref 0 in
167
+
while !i < len do
168
+
let c = data.[!i] in
169
+
if c = '\xEF' && !i + 2 < len && data.[!i+1] = '\xBF' && data.[!i+2] = '\xBF' then begin
170
+
(* U+FFFF → U+FFFD *)
171
+
Buffer.add_string buf "\xEF\xBF\xBD";
172
+
i := !i + 3
173
+
end else begin
174
+
Buffer.add_char buf c;
175
+
incr i
176
+
end
177
+
done;
178
+
Buffer.contents buf
179
+
in
180
+
181
+
(* Local emit functions with access to S *)
182
+
let emit_pending_chars () =
183
+
if Buffer.length t.pending_chars > 0 then begin
184
+
let data = Buffer.contents t.pending_chars in
185
+
Buffer.clear t.pending_chars;
186
+
let data = if t.xml_mode then transform_xml_chars data else data in
187
+
ignore (S.process t.sink (Tokenizer_token.Character data))
188
+
end
189
+
in
190
+
191
+
let emit token =
192
+
emit_pending_chars ();
193
+
match S.process t.sink token with
194
+
| `Continue -> ()
195
+
| `SwitchTo new_state -> t.state <- new_state
196
+
in
197
+
198
+
let emit_current_tag () =
199
+
finish_attribute t;
200
+
let name = Buffer.contents t.current_tag_name in
201
+
let attrs = List.rev t.current_attrs in
202
+
(* Check for end tag with attributes or self-closing flag *)
203
+
if t.current_tag_kind = Tokenizer_token.End then begin
204
+
if attrs <> [] then
205
+
error t "end-tag-with-attributes";
206
+
if t.current_tag_self_closing then
207
+
error t "end-tag-with-trailing-solidus"
208
+
end;
209
+
let tag = {
210
+
Tokenizer_token.kind = t.current_tag_kind;
211
+
name;
212
+
attrs;
213
+
self_closing = t.current_tag_self_closing;
214
+
} in
215
+
if t.current_tag_kind = Tokenizer_token.Start then
216
+
t.last_start_tag <- name;
217
+
emit (Tokenizer_token.Tag tag)
218
+
in
219
+
220
+
let emit_current_doctype () =
221
+
let doctype = {
222
+
Tokenizer_token.name = Option.map Buffer.contents t.current_doctype_name;
223
+
public_id = Option.map Buffer.contents t.current_doctype_public;
224
+
system_id = Option.map Buffer.contents t.current_doctype_system;
225
+
force_quirks = t.current_doctype_force_quirks;
226
+
} in
227
+
emit (Tokenizer_token.Doctype doctype)
228
+
in
229
+
230
+
let emit_current_comment () =
231
+
let content = Buffer.contents t.current_comment in
232
+
let content =
233
+
if t.xml_mode then begin
234
+
(* XML mode: transform -- to - - in comments *)
235
+
let buf = Buffer.create (String.length content + 10) in
236
+
let len = String.length content in
237
+
let i = ref 0 in
238
+
while !i < len do
239
+
if !i + 1 < len && content.[!i] = '-' && content.[!i+1] = '-' then begin
240
+
Buffer.add_string buf "- -";
241
+
i := !i + 2
242
+
end else begin
243
+
Buffer.add_char buf content.[!i];
244
+
incr i
245
+
end
246
+
done;
247
+
Buffer.contents buf
248
+
end else content
249
+
in
250
+
emit (Tokenizer_token.Comment content)
251
+
in
252
+
253
+
(* Check for control characters and emit error if needed *)
254
+
(* Only checks ASCII control chars; C1 controls (U+0080-U+009F) are 2-byte in UTF-8 *)
255
+
let check_control_char c =
256
+
let code = Char.code c in
257
+
(* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F *)
258
+
(* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *)
259
+
(* Note: U+0080-U+009F (C1 controls) are 2-byte UTF-8 sequences starting with 0xC2 *)
260
+
(* Note: We only check single-byte control chars here; multi-byte checks are TODO *)
261
+
if (code >= 0x01 && code <= 0x08) ||
262
+
code = 0x0B ||
263
+
(code >= 0x0E && code <= 0x1F) ||
264
+
code = 0x7F then
265
+
error t "control-character-in-input-stream"
266
+
in
267
+
268
+
269
+
(* Emit char with control character check *)
270
+
let emit_char_checked c =
271
+
check_control_char c;
272
+
emit_char t c
273
+
in
274
+
275
+
let rec process_state () =
276
+
if Tokenizer_stream.is_eof t.stream && t.state <> Tokenizer_state.Data then begin
277
+
(* Handle EOF in various states *)
278
+
handle_eof ()
279
+
end else if Tokenizer_stream.is_eof t.stream then begin
280
+
emit_pending_chars ();
281
+
ignore (S.process t.sink Tokenizer_token.EOF)
282
+
end else begin
283
+
step ();
284
+
process_state ()
285
+
end
286
+
287
+
and handle_eof () =
288
+
match t.state with
289
+
| Tokenizer_state.Data ->
290
+
emit_pending_chars ();
291
+
ignore (S.process t.sink Tokenizer_token.EOF)
292
+
| Tokenizer_state.Tag_open ->
293
+
error t "eof-before-tag-name";
294
+
emit_char t '<';
295
+
emit_pending_chars ();
296
+
ignore (S.process t.sink Tokenizer_token.EOF)
297
+
| Tokenizer_state.End_tag_open ->
298
+
error t "eof-before-tag-name";
299
+
emit_str t "</";
300
+
emit_pending_chars ();
301
+
ignore (S.process t.sink Tokenizer_token.EOF)
302
+
| Tokenizer_state.Tag_name
303
+
| Tokenizer_state.Before_attribute_name
304
+
| Tokenizer_state.Attribute_name
305
+
| Tokenizer_state.After_attribute_name
306
+
| Tokenizer_state.Before_attribute_value
307
+
| Tokenizer_state.Attribute_value_double_quoted
308
+
| Tokenizer_state.Attribute_value_single_quoted
309
+
| Tokenizer_state.Attribute_value_unquoted
310
+
| Tokenizer_state.After_attribute_value_quoted
311
+
| Tokenizer_state.Self_closing_start_tag ->
312
+
error t "eof-in-tag";
313
+
emit_pending_chars ();
314
+
ignore (S.process t.sink Tokenizer_token.EOF)
315
+
| Tokenizer_state.Rawtext ->
316
+
emit_pending_chars ();
317
+
ignore (S.process t.sink Tokenizer_token.EOF)
318
+
| Tokenizer_state.Rawtext_less_than_sign ->
319
+
emit_char t '<';
320
+
emit_pending_chars ();
321
+
ignore (S.process t.sink Tokenizer_token.EOF)
322
+
| Tokenizer_state.Rawtext_end_tag_open ->
323
+
emit_str t "</";
324
+
emit_pending_chars ();
325
+
ignore (S.process t.sink Tokenizer_token.EOF)
326
+
| Tokenizer_state.Rawtext_end_tag_name ->
327
+
emit_str t "</";
328
+
emit_str t (Buffer.contents t.temp_buffer);
329
+
emit_pending_chars ();
330
+
ignore (S.process t.sink Tokenizer_token.EOF)
331
+
| Tokenizer_state.Rcdata ->
332
+
emit_pending_chars ();
333
+
ignore (S.process t.sink Tokenizer_token.EOF)
334
+
| Tokenizer_state.Rcdata_less_than_sign ->
335
+
emit_char t '<';
336
+
emit_pending_chars ();
337
+
ignore (S.process t.sink Tokenizer_token.EOF)
338
+
| Tokenizer_state.Rcdata_end_tag_open ->
339
+
emit_str t "</";
340
+
emit_pending_chars ();
341
+
ignore (S.process t.sink Tokenizer_token.EOF)
342
+
| Tokenizer_state.Rcdata_end_tag_name ->
343
+
emit_str t "</";
344
+
emit_str t (Buffer.contents t.temp_buffer);
345
+
emit_pending_chars ();
346
+
ignore (S.process t.sink Tokenizer_token.EOF)
347
+
| Tokenizer_state.Script_data ->
348
+
emit_pending_chars ();
349
+
ignore (S.process t.sink Tokenizer_token.EOF)
350
+
| Tokenizer_state.Script_data_less_than_sign ->
351
+
emit_char t '<';
352
+
emit_pending_chars ();
353
+
ignore (S.process t.sink Tokenizer_token.EOF)
354
+
| Tokenizer_state.Script_data_end_tag_open ->
355
+
emit_str t "</";
356
+
emit_pending_chars ();
357
+
ignore (S.process t.sink Tokenizer_token.EOF)
358
+
| Tokenizer_state.Script_data_end_tag_name ->
359
+
emit_str t "</";
360
+
emit_str t (Buffer.contents t.temp_buffer);
361
+
emit_pending_chars ();
362
+
ignore (S.process t.sink Tokenizer_token.EOF)
363
+
| Tokenizer_state.Script_data_escape_start
364
+
| Tokenizer_state.Script_data_escape_start_dash
365
+
| Tokenizer_state.Script_data_escaped
366
+
| Tokenizer_state.Script_data_escaped_dash
367
+
| Tokenizer_state.Script_data_escaped_dash_dash ->
368
+
error t "eof-in-script-html-comment-like-text";
369
+
emit_pending_chars ();
370
+
ignore (S.process t.sink Tokenizer_token.EOF)
371
+
| Tokenizer_state.Script_data_escaped_less_than_sign ->
372
+
emit_char t '<';
373
+
emit_pending_chars ();
374
+
ignore (S.process t.sink Tokenizer_token.EOF)
375
+
| Tokenizer_state.Script_data_escaped_end_tag_open ->
376
+
emit_str t "</";
377
+
emit_pending_chars ();
378
+
ignore (S.process t.sink Tokenizer_token.EOF)
379
+
| Tokenizer_state.Script_data_escaped_end_tag_name ->
380
+
emit_str t "</";
381
+
emit_str t (Buffer.contents t.temp_buffer);
382
+
emit_pending_chars ();
383
+
ignore (S.process t.sink Tokenizer_token.EOF)
384
+
| Tokenizer_state.Script_data_double_escape_start
385
+
| Tokenizer_state.Script_data_double_escaped
386
+
| Tokenizer_state.Script_data_double_escaped_dash
387
+
| Tokenizer_state.Script_data_double_escaped_dash_dash ->
388
+
error t "eof-in-script-html-comment-like-text";
389
+
emit_pending_chars ();
390
+
ignore (S.process t.sink Tokenizer_token.EOF)
391
+
| Tokenizer_state.Script_data_double_escaped_less_than_sign ->
392
+
(* '<' was already emitted when entering this state from Script_data_double_escaped *)
393
+
emit_pending_chars ();
394
+
ignore (S.process t.sink Tokenizer_token.EOF)
395
+
| Tokenizer_state.Script_data_double_escape_end ->
396
+
emit_pending_chars ();
397
+
ignore (S.process t.sink Tokenizer_token.EOF)
398
+
| Tokenizer_state.Plaintext ->
399
+
emit_pending_chars ();
400
+
ignore (S.process t.sink Tokenizer_token.EOF)
401
+
| Tokenizer_state.Comment_start
402
+
| Tokenizer_state.Comment_start_dash
403
+
| Tokenizer_state.Comment
404
+
| Tokenizer_state.Comment_less_than_sign
405
+
| Tokenizer_state.Comment_less_than_sign_bang
406
+
| Tokenizer_state.Comment_less_than_sign_bang_dash
407
+
| Tokenizer_state.Comment_less_than_sign_bang_dash_dash
408
+
| Tokenizer_state.Comment_end_dash
409
+
| Tokenizer_state.Comment_end
410
+
| Tokenizer_state.Comment_end_bang ->
411
+
error t "eof-in-comment";
412
+
emit_current_comment ();
413
+
emit_pending_chars ();
414
+
ignore (S.process t.sink Tokenizer_token.EOF)
415
+
| Tokenizer_state.Bogus_comment ->
416
+
emit_current_comment ();
417
+
emit_pending_chars ();
418
+
ignore (S.process t.sink Tokenizer_token.EOF)
419
+
| Tokenizer_state.Markup_declaration_open ->
420
+
error t "incorrectly-opened-comment";
421
+
Buffer.clear t.current_comment;
422
+
emit_current_comment ();
423
+
emit_pending_chars ();
424
+
ignore (S.process t.sink Tokenizer_token.EOF)
425
+
| Tokenizer_state.Doctype
426
+
| Tokenizer_state.Before_doctype_name ->
427
+
error t "eof-in-doctype";
428
+
start_new_doctype t;
429
+
t.current_doctype_force_quirks <- true;
430
+
emit_current_doctype ();
431
+
emit_pending_chars ();
432
+
ignore (S.process t.sink Tokenizer_token.EOF)
433
+
| Tokenizer_state.Doctype_name
434
+
| Tokenizer_state.After_doctype_name
435
+
| Tokenizer_state.After_doctype_public_keyword
436
+
| Tokenizer_state.Before_doctype_public_identifier
437
+
| Tokenizer_state.Doctype_public_identifier_double_quoted
438
+
| Tokenizer_state.Doctype_public_identifier_single_quoted
439
+
| Tokenizer_state.After_doctype_public_identifier
440
+
| Tokenizer_state.Between_doctype_public_and_system_identifiers
441
+
| Tokenizer_state.After_doctype_system_keyword
442
+
| Tokenizer_state.Before_doctype_system_identifier
443
+
| Tokenizer_state.Doctype_system_identifier_double_quoted
444
+
| Tokenizer_state.Doctype_system_identifier_single_quoted
445
+
| Tokenizer_state.After_doctype_system_identifier ->
446
+
error t "eof-in-doctype";
447
+
t.current_doctype_force_quirks <- true;
448
+
emit_current_doctype ();
449
+
emit_pending_chars ();
450
+
ignore (S.process t.sink Tokenizer_token.EOF)
451
+
| Tokenizer_state.Bogus_doctype ->
452
+
emit_current_doctype ();
453
+
emit_pending_chars ();
454
+
ignore (S.process t.sink Tokenizer_token.EOF)
455
+
| Tokenizer_state.Cdata_section ->
456
+
error t "eof-in-cdata";
457
+
emit_pending_chars ();
458
+
ignore (S.process t.sink Tokenizer_token.EOF)
459
+
| Tokenizer_state.Cdata_section_bracket ->
460
+
error t "eof-in-cdata";
461
+
emit_char t ']';
462
+
emit_pending_chars ();
463
+
ignore (S.process t.sink Tokenizer_token.EOF)
464
+
| Tokenizer_state.Cdata_section_end ->
465
+
error t "eof-in-cdata";
466
+
emit_str t "]]";
467
+
emit_pending_chars ();
468
+
ignore (S.process t.sink Tokenizer_token.EOF)
469
+
| Tokenizer_state.Character_reference ->
470
+
(* state_character_reference never ran, so initialize temp_buffer with & *)
471
+
Buffer.clear t.temp_buffer;
472
+
Buffer.add_char t.temp_buffer '&';
473
+
flush_code_points_consumed_as_char_ref t;
474
+
t.state <- t.return_state;
475
+
handle_eof ()
476
+
| Tokenizer_state.Named_character_reference ->
477
+
flush_code_points_consumed_as_char_ref t;
478
+
t.state <- t.return_state;
479
+
handle_eof ()
480
+
| Tokenizer_state.Numeric_character_reference ->
481
+
(* At EOF with just "&#" - no digits follow *)
482
+
error t "absence-of-digits-in-numeric-character-reference";
483
+
flush_code_points_consumed_as_char_ref t;
484
+
t.state <- t.return_state;
485
+
handle_eof ()
486
+
| Tokenizer_state.Hexadecimal_character_reference_start
487
+
| Tokenizer_state.Decimal_character_reference_start ->
488
+
error t "absence-of-digits-in-numeric-character-reference";
489
+
flush_code_points_consumed_as_char_ref t;
490
+
t.state <- t.return_state;
491
+
handle_eof ()
492
+
| Tokenizer_state.Numeric_character_reference_end ->
493
+
(* We have collected digits, just need to finalize the character reference *)
494
+
step ();
495
+
handle_eof ()
496
+
| Tokenizer_state.Ambiguous_ampersand ->
497
+
(* Buffer was already flushed when entering this state, just transition *)
498
+
t.state <- t.return_state;
499
+
handle_eof ()
500
+
| Tokenizer_state.Hexadecimal_character_reference
501
+
| Tokenizer_state.Decimal_character_reference ->
502
+
(* At EOF with collected digits - convert the numeric reference *)
503
+
error t "missing-semicolon-after-character-reference";
504
+
let code = t.char_ref_code in
505
+
let replacement_char = "\xEF\xBF\xBD" in
506
+
let result =
507
+
if code = 0 then begin
508
+
error t "null-character-reference";
509
+
replacement_char
510
+
end else if code > 0x10FFFF then begin
511
+
error t "character-reference-outside-unicode-range";
512
+
replacement_char
513
+
end else if code >= 0xD800 && code <= 0xDFFF then begin
514
+
error t "surrogate-character-reference";
515
+
replacement_char
516
+
end else
517
+
Entities.Numeric_ref.codepoint_to_utf8 code
518
+
in
519
+
Buffer.clear t.temp_buffer;
520
+
Buffer.add_string t.temp_buffer result;
521
+
flush_code_points_consumed_as_char_ref t;
522
+
t.state <- t.return_state;
523
+
handle_eof ()
524
+
525
+
and step () =
526
+
match t.state with
527
+
| Tokenizer_state.Data -> state_data ()
528
+
| Tokenizer_state.Rcdata -> state_rcdata ()
529
+
| Tokenizer_state.Rawtext -> state_rawtext ()
530
+
| Tokenizer_state.Script_data -> state_script_data ()
531
+
| Tokenizer_state.Plaintext -> state_plaintext ()
532
+
| Tokenizer_state.Tag_open -> state_tag_open ()
533
+
| Tokenizer_state.End_tag_open -> state_end_tag_open ()
534
+
| Tokenizer_state.Tag_name -> state_tag_name ()
535
+
| Tokenizer_state.Rcdata_less_than_sign -> state_rcdata_less_than_sign ()
536
+
| Tokenizer_state.Rcdata_end_tag_open -> state_rcdata_end_tag_open ()
537
+
| Tokenizer_state.Rcdata_end_tag_name -> state_rcdata_end_tag_name ()
538
+
| Tokenizer_state.Rawtext_less_than_sign -> state_rawtext_less_than_sign ()
539
+
| Tokenizer_state.Rawtext_end_tag_open -> state_rawtext_end_tag_open ()
540
+
| Tokenizer_state.Rawtext_end_tag_name -> state_rawtext_end_tag_name ()
541
+
| Tokenizer_state.Script_data_less_than_sign -> state_script_data_less_than_sign ()
542
+
| Tokenizer_state.Script_data_end_tag_open -> state_script_data_end_tag_open ()
543
+
| Tokenizer_state.Script_data_end_tag_name -> state_script_data_end_tag_name ()
544
+
| Tokenizer_state.Script_data_escape_start -> state_script_data_escape_start ()
545
+
| Tokenizer_state.Script_data_escape_start_dash -> state_script_data_escape_start_dash ()
546
+
| Tokenizer_state.Script_data_escaped -> state_script_data_escaped ()
547
+
| Tokenizer_state.Script_data_escaped_dash -> state_script_data_escaped_dash ()
548
+
| Tokenizer_state.Script_data_escaped_dash_dash -> state_script_data_escaped_dash_dash ()
549
+
| Tokenizer_state.Script_data_escaped_less_than_sign -> state_script_data_escaped_less_than_sign ()
550
+
| Tokenizer_state.Script_data_escaped_end_tag_open -> state_script_data_escaped_end_tag_open ()
551
+
| Tokenizer_state.Script_data_escaped_end_tag_name -> state_script_data_escaped_end_tag_name ()
552
+
| Tokenizer_state.Script_data_double_escape_start -> state_script_data_double_escape_start ()
553
+
| Tokenizer_state.Script_data_double_escaped -> state_script_data_double_escaped ()
554
+
| Tokenizer_state.Script_data_double_escaped_dash -> state_script_data_double_escaped_dash ()
555
+
| Tokenizer_state.Script_data_double_escaped_dash_dash -> state_script_data_double_escaped_dash_dash ()
556
+
| Tokenizer_state.Script_data_double_escaped_less_than_sign -> state_script_data_double_escaped_less_than_sign ()
557
+
| Tokenizer_state.Script_data_double_escape_end -> state_script_data_double_escape_end ()
558
+
| Tokenizer_state.Before_attribute_name -> state_before_attribute_name ()
559
+
| Tokenizer_state.Attribute_name -> state_attribute_name ()
560
+
| Tokenizer_state.After_attribute_name -> state_after_attribute_name ()
561
+
| Tokenizer_state.Before_attribute_value -> state_before_attribute_value ()
562
+
| Tokenizer_state.Attribute_value_double_quoted -> state_attribute_value_double_quoted ()
563
+
| Tokenizer_state.Attribute_value_single_quoted -> state_attribute_value_single_quoted ()
564
+
| Tokenizer_state.Attribute_value_unquoted -> state_attribute_value_unquoted ()
565
+
| Tokenizer_state.After_attribute_value_quoted -> state_after_attribute_value_quoted ()
566
+
| Tokenizer_state.Self_closing_start_tag -> state_self_closing_start_tag ()
567
+
| Tokenizer_state.Bogus_comment -> state_bogus_comment ()
568
+
| Tokenizer_state.Markup_declaration_open -> state_markup_declaration_open ()
569
+
| Tokenizer_state.Comment_start -> state_comment_start ()
570
+
| Tokenizer_state.Comment_start_dash -> state_comment_start_dash ()
571
+
| Tokenizer_state.Comment -> state_comment ()
572
+
| Tokenizer_state.Comment_less_than_sign -> state_comment_less_than_sign ()
573
+
| Tokenizer_state.Comment_less_than_sign_bang -> state_comment_less_than_sign_bang ()
574
+
| Tokenizer_state.Comment_less_than_sign_bang_dash -> state_comment_less_than_sign_bang_dash ()
575
+
| Tokenizer_state.Comment_less_than_sign_bang_dash_dash -> state_comment_less_than_sign_bang_dash_dash ()
576
+
| Tokenizer_state.Comment_end_dash -> state_comment_end_dash ()
577
+
| Tokenizer_state.Comment_end -> state_comment_end ()
578
+
| Tokenizer_state.Comment_end_bang -> state_comment_end_bang ()
579
+
| Tokenizer_state.Doctype -> state_doctype ()
580
+
| Tokenizer_state.Before_doctype_name -> state_before_doctype_name ()
581
+
| Tokenizer_state.Doctype_name -> state_doctype_name ()
582
+
| Tokenizer_state.After_doctype_name -> state_after_doctype_name ()
583
+
| Tokenizer_state.After_doctype_public_keyword -> state_after_doctype_public_keyword ()
584
+
| Tokenizer_state.Before_doctype_public_identifier -> state_before_doctype_public_identifier ()
585
+
| Tokenizer_state.Doctype_public_identifier_double_quoted -> state_doctype_public_identifier_double_quoted ()
586
+
| Tokenizer_state.Doctype_public_identifier_single_quoted -> state_doctype_public_identifier_single_quoted ()
587
+
| Tokenizer_state.After_doctype_public_identifier -> state_after_doctype_public_identifier ()
588
+
| Tokenizer_state.Between_doctype_public_and_system_identifiers -> state_between_doctype_public_and_system_identifiers ()
589
+
| Tokenizer_state.After_doctype_system_keyword -> state_after_doctype_system_keyword ()
590
+
| Tokenizer_state.Before_doctype_system_identifier -> state_before_doctype_system_identifier ()
591
+
| Tokenizer_state.Doctype_system_identifier_double_quoted -> state_doctype_system_identifier_double_quoted ()
592
+
| Tokenizer_state.Doctype_system_identifier_single_quoted -> state_doctype_system_identifier_single_quoted ()
593
+
| Tokenizer_state.After_doctype_system_identifier -> state_after_doctype_system_identifier ()
594
+
| Tokenizer_state.Bogus_doctype -> state_bogus_doctype ()
595
+
| Tokenizer_state.Cdata_section -> state_cdata_section ()
596
+
| Tokenizer_state.Cdata_section_bracket -> state_cdata_section_bracket ()
597
+
| Tokenizer_state.Cdata_section_end -> state_cdata_section_end ()
598
+
| Tokenizer_state.Character_reference -> state_character_reference ()
599
+
| Tokenizer_state.Named_character_reference -> state_named_character_reference ()
600
+
| Tokenizer_state.Ambiguous_ampersand -> state_ambiguous_ampersand ()
601
+
| Tokenizer_state.Numeric_character_reference -> state_numeric_character_reference ()
602
+
| Tokenizer_state.Hexadecimal_character_reference_start -> state_hexadecimal_character_reference_start ()
603
+
| Tokenizer_state.Decimal_character_reference_start -> state_decimal_character_reference_start ()
604
+
| Tokenizer_state.Hexadecimal_character_reference -> state_hexadecimal_character_reference ()
605
+
| Tokenizer_state.Decimal_character_reference -> state_decimal_character_reference ()
606
+
| Tokenizer_state.Numeric_character_reference_end -> state_numeric_character_reference_end ()
607
+
608
+
(* State implementations *)
609
+
and state_data () =
610
+
match Tokenizer_stream.consume t.stream with
611
+
| Some '&' ->
612
+
t.return_state <- Tokenizer_state.Data;
613
+
t.state <- Tokenizer_state.Character_reference
614
+
| Some '<' ->
615
+
t.state <- Tokenizer_state.Tag_open
616
+
| Some '\x00' ->
617
+
(* Emit pending chars first, then emit null separately for proper tree builder handling *)
618
+
emit_pending_chars ();
619
+
error t "unexpected-null-character";
620
+
ignore (S.process t.sink (Tokenizer_token.Character "\x00"))
621
+
| Some c ->
622
+
emit_char_checked c
623
+
| None -> ()
624
+
625
+
and state_rcdata () =
626
+
match Tokenizer_stream.consume t.stream with
627
+
| Some '&' ->
628
+
t.return_state <- Tokenizer_state.Rcdata;
629
+
t.state <- Tokenizer_state.Character_reference
630
+
| Some '<' ->
631
+
t.state <- Tokenizer_state.Rcdata_less_than_sign
632
+
| Some '\x00' ->
633
+
error t "unexpected-null-character";
634
+
emit_str t "\xEF\xBF\xBD"
635
+
| Some c ->
636
+
emit_char_checked c
637
+
| None -> ()
638
+
639
+
and state_rawtext () =
640
+
match Tokenizer_stream.consume t.stream with
641
+
| Some '<' ->
642
+
t.state <- Tokenizer_state.Rawtext_less_than_sign
643
+
| Some '\x00' ->
644
+
error t "unexpected-null-character";
645
+
emit_str t "\xEF\xBF\xBD"
646
+
| Some c ->
647
+
emit_char_checked c
648
+
| None -> ()
649
+
650
+
and state_script_data () =
651
+
match Tokenizer_stream.consume t.stream with
652
+
| Some '<' ->
653
+
t.state <- Tokenizer_state.Script_data_less_than_sign
654
+
| Some '\x00' ->
655
+
error t "unexpected-null-character";
656
+
emit_str t "\xEF\xBF\xBD"
657
+
| Some c ->
658
+
emit_char_checked c
659
+
| None -> ()
660
+
661
+
and state_plaintext () =
662
+
match Tokenizer_stream.consume t.stream with
663
+
| Some '\x00' ->
664
+
error t "unexpected-null-character";
665
+
emit_str t "\xEF\xBF\xBD"
666
+
| Some c ->
667
+
emit_char_checked c
668
+
| None -> ()
669
+
670
+
and state_tag_open () =
671
+
match Tokenizer_stream.peek t.stream with
672
+
| Some '!' ->
673
+
Tokenizer_stream.advance t.stream;
674
+
t.state <- Tokenizer_state.Markup_declaration_open
675
+
| Some '/' ->
676
+
Tokenizer_stream.advance t.stream;
677
+
t.state <- Tokenizer_state.End_tag_open
678
+
| Some c when is_ascii_alpha c ->
679
+
start_new_tag t Tokenizer_token.Start;
680
+
t.state <- Tokenizer_state.Tag_name
681
+
| Some '?' ->
682
+
error t "unexpected-question-mark-instead-of-tag-name";
683
+
Buffer.clear t.current_comment;
684
+
t.state <- Tokenizer_state.Bogus_comment
685
+
| None ->
686
+
error t "eof-before-tag-name";
687
+
emit_char t '<'
688
+
| Some _ ->
689
+
error t "invalid-first-character-of-tag-name";
690
+
emit_char t '<';
691
+
t.state <- Tokenizer_state.Data
692
+
693
+
and state_end_tag_open () =
694
+
match Tokenizer_stream.peek t.stream with
695
+
| Some c when is_ascii_alpha c ->
696
+
start_new_tag t Tokenizer_token.End;
697
+
t.state <- Tokenizer_state.Tag_name
698
+
| Some '>' ->
699
+
Tokenizer_stream.advance t.stream;
700
+
error t "missing-end-tag-name";
701
+
t.state <- Tokenizer_state.Data
702
+
| None ->
703
+
error t "eof-before-tag-name";
704
+
emit_str t "</"
705
+
| Some _ ->
706
+
error t "invalid-first-character-of-tag-name";
707
+
Buffer.clear t.current_comment;
708
+
t.state <- Tokenizer_state.Bogus_comment
709
+
710
+
and state_tag_name () =
711
+
match Tokenizer_stream.consume t.stream with
712
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
713
+
t.state <- Tokenizer_state.Before_attribute_name
714
+
| Some '/' ->
715
+
t.state <- Tokenizer_state.Self_closing_start_tag
716
+
| Some '>' ->
717
+
t.state <- Tokenizer_state.Data;
718
+
emit_current_tag ()
719
+
| Some '\x00' ->
720
+
error t "unexpected-null-character";
721
+
Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
722
+
| Some c ->
723
+
check_control_char c;
724
+
Buffer.add_char t.current_tag_name (ascii_lower c)
725
+
| None -> ()
726
+
727
+
and state_rcdata_less_than_sign () =
728
+
match Tokenizer_stream.peek t.stream with
729
+
| Some '/' ->
730
+
Tokenizer_stream.advance t.stream;
731
+
Buffer.clear t.temp_buffer;
732
+
t.state <- Tokenizer_state.Rcdata_end_tag_open
733
+
| _ ->
734
+
emit_char t '<';
735
+
t.state <- Tokenizer_state.Rcdata
736
+
737
+
and state_rcdata_end_tag_open () =
738
+
match Tokenizer_stream.peek t.stream with
739
+
| Some c when is_ascii_alpha c ->
740
+
start_new_tag t Tokenizer_token.End;
741
+
t.state <- Tokenizer_state.Rcdata_end_tag_name
742
+
| _ ->
743
+
emit_str t "</";
744
+
t.state <- Tokenizer_state.Rcdata
745
+
746
+
and state_rcdata_end_tag_name () =
747
+
match Tokenizer_stream.peek t.stream with
748
+
| Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
749
+
Tokenizer_stream.advance t.stream;
750
+
t.state <- Tokenizer_state.Before_attribute_name
751
+
| Some '/' when is_appropriate_end_tag t ->
752
+
Tokenizer_stream.advance t.stream;
753
+
t.state <- Tokenizer_state.Self_closing_start_tag
754
+
| Some '>' when is_appropriate_end_tag t ->
755
+
Tokenizer_stream.advance t.stream;
756
+
t.state <- Tokenizer_state.Data;
757
+
emit_current_tag ()
758
+
| Some c when is_ascii_alpha c ->
759
+
Tokenizer_stream.advance t.stream;
760
+
Buffer.add_char t.current_tag_name (ascii_lower c);
761
+
Buffer.add_char t.temp_buffer c
762
+
| _ ->
763
+
emit_str t "</";
764
+
emit_str t (Buffer.contents t.temp_buffer);
765
+
t.state <- Tokenizer_state.Rcdata
766
+
767
+
and state_rawtext_less_than_sign () =
768
+
match Tokenizer_stream.peek t.stream with
769
+
| Some '/' ->
770
+
Tokenizer_stream.advance t.stream;
771
+
Buffer.clear t.temp_buffer;
772
+
t.state <- Tokenizer_state.Rawtext_end_tag_open
773
+
| _ ->
774
+
emit_char t '<';
775
+
t.state <- Tokenizer_state.Rawtext
776
+
777
+
and state_rawtext_end_tag_open () =
778
+
match Tokenizer_stream.peek t.stream with
779
+
| Some c when is_ascii_alpha c ->
780
+
start_new_tag t Tokenizer_token.End;
781
+
t.state <- Tokenizer_state.Rawtext_end_tag_name
782
+
| _ ->
783
+
emit_str t "</";
784
+
t.state <- Tokenizer_state.Rawtext
785
+
786
+
and state_rawtext_end_tag_name () =
787
+
match Tokenizer_stream.peek t.stream with
788
+
| Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
789
+
Tokenizer_stream.advance t.stream;
790
+
t.state <- Tokenizer_state.Before_attribute_name
791
+
| Some '/' when is_appropriate_end_tag t ->
792
+
Tokenizer_stream.advance t.stream;
793
+
t.state <- Tokenizer_state.Self_closing_start_tag
794
+
| Some '>' when is_appropriate_end_tag t ->
795
+
Tokenizer_stream.advance t.stream;
796
+
t.state <- Tokenizer_state.Data;
797
+
emit_current_tag ()
798
+
| Some c when is_ascii_alpha c ->
799
+
Tokenizer_stream.advance t.stream;
800
+
Buffer.add_char t.current_tag_name (ascii_lower c);
801
+
Buffer.add_char t.temp_buffer c
802
+
| _ ->
803
+
emit_str t "</";
804
+
emit_str t (Buffer.contents t.temp_buffer);
805
+
t.state <- Tokenizer_state.Rawtext
806
+
807
+
and state_script_data_less_than_sign () =
808
+
match Tokenizer_stream.peek t.stream with
809
+
| Some '/' ->
810
+
Tokenizer_stream.advance t.stream;
811
+
Buffer.clear t.temp_buffer;
812
+
t.state <- Tokenizer_state.Script_data_end_tag_open
813
+
| Some '!' ->
814
+
Tokenizer_stream.advance t.stream;
815
+
t.state <- Tokenizer_state.Script_data_escape_start;
816
+
emit_str t "<!"
817
+
| _ ->
818
+
emit_char t '<';
819
+
t.state <- Tokenizer_state.Script_data
820
+
821
+
and state_script_data_end_tag_open () =
822
+
match Tokenizer_stream.peek t.stream with
823
+
| Some c when is_ascii_alpha c ->
824
+
start_new_tag t Tokenizer_token.End;
825
+
t.state <- Tokenizer_state.Script_data_end_tag_name
826
+
| _ ->
827
+
emit_str t "</";
828
+
t.state <- Tokenizer_state.Script_data
829
+
830
+
and state_script_data_end_tag_name () =
831
+
match Tokenizer_stream.peek t.stream with
832
+
| Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
833
+
Tokenizer_stream.advance t.stream;
834
+
t.state <- Tokenizer_state.Before_attribute_name
835
+
| Some '/' when is_appropriate_end_tag t ->
836
+
Tokenizer_stream.advance t.stream;
837
+
t.state <- Tokenizer_state.Self_closing_start_tag
838
+
| Some '>' when is_appropriate_end_tag t ->
839
+
Tokenizer_stream.advance t.stream;
840
+
t.state <- Tokenizer_state.Data;
841
+
emit_current_tag ()
842
+
| Some c when is_ascii_alpha c ->
843
+
Tokenizer_stream.advance t.stream;
844
+
Buffer.add_char t.current_tag_name (ascii_lower c);
845
+
Buffer.add_char t.temp_buffer c
846
+
| _ ->
847
+
emit_str t "</";
848
+
emit_str t (Buffer.contents t.temp_buffer);
849
+
t.state <- Tokenizer_state.Script_data
850
+
851
+
  and state_script_data_escape_start () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Script_data_escape_start_dash;
      emit_char t '-'
    | _ ->
      t.state <- Tokenizer_state.Script_data

  and state_script_data_escape_start_dash () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Script_data_escaped_dash_dash;
      emit_char t '-'
    | _ ->
      t.state <- Tokenizer_state.Script_data

  and state_script_data_escaped () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      t.state <- Tokenizer_state.Script_data_escaped_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_script_data_escaped_dash () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      t.state <- Tokenizer_state.Script_data_escaped_dash_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_escaped_dash_dash () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
    | Some '>' ->
      t.state <- Tokenizer_state.Script_data;
      emit_char t '>'
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_escaped_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- Tokenizer_state.Script_data_escaped_end_tag_open
    | Some c when is_ascii_alpha c ->
      Buffer.clear t.temp_buffer;
      emit_char t '<';
      t.state <- Tokenizer_state.Script_data_double_escape_start
    | _ ->
      emit_char t '<';
      t.state <- Tokenizer_state.Script_data_escaped

  and state_script_data_escaped_end_tag_open () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Tokenizer_token.End;
      t.state <- Tokenizer_state.Script_data_escaped_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- Tokenizer_state.Script_data_escaped

  and state_script_data_escaped_end_tag_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- Tokenizer_state.Script_data_escaped
  and state_script_data_double_escape_start () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt ->
      Tokenizer_stream.advance t.stream;
      let c = Option.get c_opt in
      if Buffer.contents t.temp_buffer = "script" then
        t.state <- Tokenizer_state.Script_data_double_escaped
      else
        t.state <- Tokenizer_state.Script_data_escaped;
      emit_char t c
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer (ascii_lower c);
      emit_char t c
    | _ ->
      t.state <- Tokenizer_state.Script_data_escaped

  and state_script_data_double_escaped () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      t.state <- Tokenizer_state.Script_data_double_escaped_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign;
      emit_char t '<'
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_script_data_double_escaped_dash () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      t.state <- Tokenizer_state.Script_data_double_escaped_dash_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign;
      emit_char t '<'
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- Tokenizer_state.Script_data_double_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- Tokenizer_state.Script_data_double_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_double_escaped_dash_dash () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign;
      emit_char t '<'
    | Some '>' ->
      t.state <- Tokenizer_state.Script_data;
      emit_char t '>'
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- Tokenizer_state.Script_data_double_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- Tokenizer_state.Script_data_double_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_double_escaped_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- Tokenizer_state.Script_data_double_escape_end;
      emit_char t '/'
    | _ ->
      t.state <- Tokenizer_state.Script_data_double_escaped

  and state_script_data_double_escape_end () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt ->
      Tokenizer_stream.advance t.stream;
      let c = Option.get c_opt in
      if Buffer.contents t.temp_buffer = "script" then
        t.state <- Tokenizer_state.Script_data_escaped
      else
        t.state <- Tokenizer_state.Script_data_double_escaped;
      emit_char t c
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer (ascii_lower c);
      emit_char t c
    | _ ->
      t.state <- Tokenizer_state.Script_data_double_escaped
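
  (* "Double escaping" covers script text that itself contains a <script>
     open tag inside an HTML comment, e.g.

         <script><!--<script>x</script>y--></script>

     After "<!--" the tokenizer is in the escaped states; the inner
     "<script" spells "script" into [temp_buffer], so
     [state_script_data_double_escape_start] switches to
     Script_data_double_escaped. The first "</script>" is matched the same
     way in [state_script_data_double_escape_end] and only drops back to
     Script_data_escaped instead of closing the element; only the second
     "</script>", reached from plain script data after "-->", actually
     ends the script element. *)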
  and state_before_attribute_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '/' | Some '>' | None ->
      t.state <- Tokenizer_state.After_attribute_name
    | Some '=' ->
      Tokenizer_stream.advance t.stream;
      error t "unexpected-equals-sign-before-attribute-name";
      start_new_attribute t;
      Buffer.add_char t.current_attr_name '=';
      t.state <- Tokenizer_state.Attribute_name
    | Some _ ->
      start_new_attribute t;
      t.state <- Tokenizer_state.Attribute_name

  and state_attribute_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.After_attribute_name
    | Some '/' | Some '>' | None ->
      t.state <- Tokenizer_state.After_attribute_name
    | Some '=' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_value
    | Some '\x00' ->
      Tokenizer_stream.advance t.stream;
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_name "\xEF\xBF\xBD"
    | Some ('"' | '\'' | '<') as c_opt ->
      Tokenizer_stream.advance t.stream;
      error t "unexpected-character-in-attribute-name";
      Buffer.add_char t.current_attr_name (Option.get c_opt)
    | Some c ->
      Tokenizer_stream.advance t.stream;
      check_control_char c;
      Buffer.add_char t.current_attr_name (ascii_lower c)

  and state_after_attribute_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '=' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_value
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | None -> ()
    | Some _ ->
      start_new_attribute t;
      t.state <- Tokenizer_state.Attribute_name

  and state_before_attribute_value () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Attribute_value_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Attribute_value_single_quoted
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-attribute-value";
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | _ ->
      t.state <- Tokenizer_state.Attribute_value_unquoted

  and state_attribute_value_double_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '"' ->
      t.state <- Tokenizer_state.After_attribute_value_quoted
    | Some '&' ->
      t.return_state <- Tokenizer_state.Attribute_value_double_quoted;
      t.state <- Tokenizer_state.Character_reference
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_attr_value c
    | None -> ()

  and state_attribute_value_single_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '\'' ->
      t.state <- Tokenizer_state.After_attribute_value_quoted
    | Some '&' ->
      t.return_state <- Tokenizer_state.Attribute_value_single_quoted;
      t.state <- Tokenizer_state.Character_reference
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_attr_value c
    | None -> ()

  and state_attribute_value_unquoted () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '&' ->
      Tokenizer_stream.advance t.stream;
      t.return_state <- Tokenizer_state.Attribute_value_unquoted;
      t.state <- Tokenizer_state.Character_reference
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some '\x00' ->
      Tokenizer_stream.advance t.stream;
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
    | Some ('"' | '\'' | '<' | '=' | '`') as c_opt ->
      Tokenizer_stream.advance t.stream;
      error t "unexpected-character-in-unquoted-attribute-value";
      Buffer.add_char t.current_attr_value (Option.get c_opt)
    | Some c ->
      Tokenizer_stream.advance t.stream;
      check_control_char c;
      Buffer.add_char t.current_attr_value c
    | None -> ()
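
  (* Character references inside attribute values reuse the shared
     character-reference states: the '&' arms above stash the current state
     in [t.return_state], and [flush_code_points_consumed_as_char_ref]
     later routes the decoded text into [current_attr_value] rather than
     emitting character tokens. E.g. for

         <a title="a&amp;b">

     the tokenizer leaves Attribute_value_double_quoted at '&', decodes
     "amp;" to "&", appends it to the attribute value, and resumes in the
     same quoted-value state. *)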
  and state_after_attribute_value_quoted () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | None -> ()
    | Some _ ->
      error t "missing-whitespace-between-attributes";
      t.state <- Tokenizer_state.Before_attribute_name

  and state_self_closing_start_tag () =
    match Tokenizer_stream.peek t.stream with
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.current_tag_self_closing <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | None -> ()
    | Some _ ->
      error t "unexpected-solidus-in-tag";
      t.state <- Tokenizer_state.Before_attribute_name

  and state_bogus_comment () =
    match Tokenizer_stream.consume t.stream with
    | Some '>' ->
      t.state <- Tokenizer_state.Data;
      emit_current_comment ()
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_comment "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_comment c
    | None -> ()

  and state_markup_declaration_open () =
    if Tokenizer_stream.matches_ci t.stream "--" then begin
      ignore (Tokenizer_stream.consume_exact_ci t.stream "--");
      Buffer.clear t.current_comment;
      t.state <- Tokenizer_state.Comment_start
    end else if Tokenizer_stream.matches_ci t.stream "DOCTYPE" then begin
      ignore (Tokenizer_stream.consume_exact_ci t.stream "DOCTYPE");
      t.state <- Tokenizer_state.Doctype
    end else if Tokenizer_stream.matches_ci t.stream "[CDATA[" then begin
      ignore (Tokenizer_stream.consume_exact_ci t.stream "[CDATA[");
      (* CDATA only allowed in foreign content *)
      if S.adjusted_current_node_in_html_namespace t.sink then begin
        error t "cdata-in-html-content";
        Buffer.clear t.current_comment;
        Buffer.add_string t.current_comment "[CDATA[";
        t.state <- Tokenizer_state.Bogus_comment
      end else
        t.state <- Tokenizer_state.Cdata_section
    end else begin
      error t "incorrectly-opened-comment";
      Buffer.clear t.current_comment;
      t.state <- Tokenizer_state.Bogus_comment
    end
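
  (* CDATA sections are only honoured in foreign (SVG/MathML) content; in
     HTML content the spec downgrades them to a bogus comment, which is why
     the branch above consults [S.adjusted_current_node_in_html_namespace].
     Worked examples:

         <p><![CDATA[1<2]]></p>      cdata-in-html-content; yields a
                                     comment node "[CDATA[1<2]]"
         <svg><![CDATA[1<2]]></svg>  "1<2" becomes character data *)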
  and state_comment_start () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_start_dash
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "abrupt-closing-of-empty-comment";
      t.state <- Tokenizer_state.Data;
      emit_current_comment ()
    | _ ->
      t.state <- Tokenizer_state.Comment

  and state_comment_start_dash () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_end
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "abrupt-closing-of-empty-comment";
      t.state <- Tokenizer_state.Data;
      emit_current_comment ()
    | None -> ()
    | Some _ ->
      Buffer.add_char t.current_comment '-';
      t.state <- Tokenizer_state.Comment

  and state_comment () =
    match Tokenizer_stream.consume t.stream with
    | Some '<' ->
      Buffer.add_char t.current_comment '<';
      t.state <- Tokenizer_state.Comment_less_than_sign
    | Some '-' ->
      t.state <- Tokenizer_state.Comment_end_dash
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_comment "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_comment c
    | None -> ()

  and state_comment_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '!' ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_comment '!';
      t.state <- Tokenizer_state.Comment_less_than_sign_bang
    | Some '<' ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_comment '<'
    | _ ->
      t.state <- Tokenizer_state.Comment

  and state_comment_less_than_sign_bang () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash
    | _ ->
      t.state <- Tokenizer_state.Comment

  and state_comment_less_than_sign_bang_dash () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash_dash
    | _ ->
      t.state <- Tokenizer_state.Comment_end_dash

  and state_comment_less_than_sign_bang_dash_dash () =
    match Tokenizer_stream.peek t.stream with
    | Some '>' | None ->
      t.state <- Tokenizer_state.Comment_end
    | Some _ ->
      error t "nested-comment";
      t.state <- Tokenizer_state.Comment_end

  and state_comment_end_dash () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_end
    | None -> ()
    | Some _ ->
      Buffer.add_char t.current_comment '-';
      t.state <- Tokenizer_state.Comment

  and state_comment_end () =
    match Tokenizer_stream.peek t.stream with
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_comment ()
    | Some '!' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_end_bang
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_comment '-'
    | None -> ()
    | Some _ ->
      Buffer.add_string t.current_comment "--";
      t.state <- Tokenizer_state.Comment

  and state_comment_end_bang () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_string t.current_comment "--!";
      t.state <- Tokenizer_state.Comment_end_dash
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "incorrectly-closed-comment";
      t.state <- Tokenizer_state.Data;
      emit_current_comment ()
    | None -> ()
    | Some _ ->
      Buffer.add_string t.current_comment "--!";
      t.state <- Tokenizer_state.Comment
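
  (* Worked examples for the comment states above (input, resulting comment
     text, and the parse error where one is raised):

       <!-->          ""        abrupt-closing-of-empty-comment
       <!--->         ""        abrupt-closing-of-empty-comment
       <!--a--b-->    "a--b"    no error; the "--" is folded back in
       <!--a--!>      "a"       incorrectly-closed-comment
       <!--a<!--b-->  "a<!--b"  nested-comment (error only; text kept) *)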
  and state_doctype () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_doctype_name
    | Some '>' ->
      t.state <- Tokenizer_state.Before_doctype_name
    | None -> ()
    | Some _ ->
      error t "missing-whitespace-before-doctype-name";
      t.state <- Tokenizer_state.Before_doctype_name

  and state_before_doctype_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '\x00' ->
      Tokenizer_stream.advance t.stream;
      error t "unexpected-null-character";
      start_new_doctype t;
      t.current_doctype_name <- Some (Buffer.create 8);
      Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD";
      t.state <- Tokenizer_state.Doctype_name
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-doctype-name";
      start_new_doctype t;
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some c ->
      Tokenizer_stream.advance t.stream;
      check_control_char c;
      start_new_doctype t;
      t.current_doctype_name <- Some (Buffer.create 8);
      Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c);
      t.state <- Tokenizer_state.Doctype_name

  and state_doctype_name () =
    match Tokenizer_stream.consume t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      t.state <- Tokenizer_state.After_doctype_name
    | Some '>' ->
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c)
    | None -> ()

  and state_after_doctype_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      if Tokenizer_stream.matches_ci t.stream "PUBLIC" then begin
        ignore (Tokenizer_stream.consume_exact_ci t.stream "PUBLIC");
        t.state <- Tokenizer_state.After_doctype_public_keyword
      end else if Tokenizer_stream.matches_ci t.stream "SYSTEM" then begin
        ignore (Tokenizer_stream.consume_exact_ci t.stream "SYSTEM");
        t.state <- Tokenizer_state.After_doctype_system_keyword
      end else begin
        error t "invalid-character-sequence-after-doctype-name";
        t.current_doctype_force_quirks <- true;
        t.state <- Tokenizer_state.Bogus_doctype
      end

  and state_after_doctype_public_keyword () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_doctype_public_identifier
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-after-doctype-public-keyword";
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-after-doctype-public-keyword";
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_before_doctype_public_identifier () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      error t "missing-quote-before-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_doctype_public_identifier_double_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '"' ->
      t.state <- Tokenizer_state.After_doctype_public_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_public) c
    | None -> ()

  and state_doctype_public_identifier_single_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '\'' ->
      t.state <- Tokenizer_state.After_doctype_public_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_public) c
    | None -> ()

  and state_after_doctype_public_identifier () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Between_doctype_public_and_system_identifiers
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-between-doctype-public-and-system-identifiers";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-between-doctype-public-and-system-identifiers";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_between_doctype_public_and_system_identifiers () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_after_doctype_system_keyword () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_doctype_system_identifier
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-after-doctype-system-keyword";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-after-doctype-system-keyword";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_before_doctype_system_identifier () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_doctype_system_identifier_double_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '"' ->
      t.state <- Tokenizer_state.After_doctype_system_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_system) c
    | None -> ()

  and state_doctype_system_identifier_single_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '\'' ->
      t.state <- Tokenizer_state.After_doctype_system_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_system) c
    | None -> ()

  and state_after_doctype_system_identifier () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "unexpected-character-after-doctype-system-identifier";
      t.state <- Tokenizer_state.Bogus_doctype

  and state_bogus_doctype () =
    match Tokenizer_stream.consume t.stream with
    | Some '>' ->
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some '\x00' ->
      error t "unexpected-null-character"
    | Some c ->
      check_control_char c (* Check all chars in bogus doctype *)
    | None -> ()
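
  (* DOCTYPE error-recovery examples for the states above. Malformed
     DOCTYPEs set the force-quirks flag, which the tree builder later turns
     into quirks mode:

       <!DOCTYPE>              missing-doctype-name, force-quirks
       <!DOCTYPE html PUBLIC>  missing-doctype-public-identifier, force-quirks
       <!DOCTYPE html x>       invalid-character-sequence-after-doctype-name,
                               force-quirks, then Bogus_doctype swallows
                               everything up to '>'
       <!DOCTYPE html SYSTEM "about:legacy-compat">   valid, no quirks *)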
  and state_cdata_section () =
    match Tokenizer_stream.consume t.stream with
    | Some ']' ->
      t.state <- Tokenizer_state.Cdata_section_bracket
    | Some c ->
      (* CDATA section emits all characters as-is, including NUL, but still check for control chars *)
      emit_char_checked c
    | None -> ()

  and state_cdata_section_bracket () =
    match Tokenizer_stream.peek t.stream with
    | Some ']' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Cdata_section_end
    | _ ->
      emit_char t ']';
      t.state <- Tokenizer_state.Cdata_section

  and state_cdata_section_end () =
    match Tokenizer_stream.peek t.stream with
    | Some ']' ->
      Tokenizer_stream.advance t.stream;
      emit_char t ']'
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data
    | _ ->
      emit_str t "]]";
      t.state <- Tokenizer_state.Cdata_section
  and state_character_reference () =
    Buffer.clear t.temp_buffer;
    Buffer.add_char t.temp_buffer '&';
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alnum c ->
      t.state <- Tokenizer_state.Named_character_reference
    | Some '#' ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer '#';
      t.state <- Tokenizer_state.Numeric_character_reference
    | _ ->
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state
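
  (* [flush_code_points_consumed_as_char_ref] (defined earlier in this
     file) is the spec's "flush code points consumed as a character
     reference". A minimal sketch of its assumed shape: it drains
     [temp_buffer] into the attribute value when we came from an
     attribute-value state, and emits it as character data otherwise:

       let flush_code_points_consumed_as_char_ref t =
         let s = Buffer.contents t.temp_buffer in
         match t.return_state with
         | Tokenizer_state.Attribute_value_double_quoted
         | Tokenizer_state.Attribute_value_single_quoted
         | Tokenizer_state.Attribute_value_unquoted ->
           Buffer.add_string t.current_attr_value s
         | _ -> emit_str t s *)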
  and state_named_character_reference () =
    (* Collect alphanumeric characters *)
    let rec collect () =
      match Tokenizer_stream.peek t.stream with
      | Some c when is_ascii_alnum c ->
        Tokenizer_stream.advance t.stream;
        Buffer.add_char t.temp_buffer c;
        collect ()
      | _ -> ()
    in
    collect ();

    let has_semicolon =
      match Tokenizer_stream.peek t.stream with
      | Some ';' -> Tokenizer_stream.advance t.stream; Buffer.add_char t.temp_buffer ';'; true
      | _ -> false
    in

    (* Try to match entity - buffer contains "&name" or "&name;" *)
    let buf_contents = Buffer.contents t.temp_buffer in
    let name_start = 1 in (* Skip '&' *)
    let name_end = String.length buf_contents - (if has_semicolon then 1 else 0) in
    let entity_name = String.sub buf_contents name_start (name_end - name_start) in

    (* Try progressively shorter matches. Only match if:
       1. Full match with semicolon, OR
       2. Legacy entity (can be used without semicolon) *)
    let rec try_match len =
      if len <= 0 then None
      else
        let prefix = String.sub entity_name 0 len in
        let is_full = len = String.length entity_name in
        let would_have_semi = has_semicolon && is_full in
        (* Only use this match if it has semicolon or is a legacy entity *)
        if would_have_semi || Entities.is_legacy prefix then
          match Entities.lookup prefix with
          | Some decoded -> Some (decoded, len)
          | None -> try_match (len - 1)
        else
          try_match (len - 1)
    in

    match try_match (String.length entity_name) with
    | Some (decoded, matched_len) ->
      let full_match = matched_len = String.length entity_name in
      let ends_with_semi = has_semicolon && full_match in

      (* Check attribute context restrictions *)
      let in_attribute = match t.return_state with
        | Tokenizer_state.Attribute_value_double_quoted
        | Tokenizer_state.Attribute_value_single_quoted
        | Tokenizer_state.Attribute_value_unquoted -> true
        | _ -> false
      in

      let next_char =
        if full_match && not has_semicolon then
          Tokenizer_stream.peek t.stream
        else if not full_match then
          Some entity_name.[matched_len]
        else None
      in

      let blocked = in_attribute && not ends_with_semi &&
        match next_char with
        | Some '=' -> true
        | Some c when is_ascii_alnum c -> true
        | _ -> false
      in

      if blocked then begin
        flush_code_points_consumed_as_char_ref t;
        t.state <- t.return_state
      end else begin
        if not ends_with_semi then
          error t "missing-semicolon-after-character-reference";
        Buffer.clear t.temp_buffer;
        Buffer.add_string t.temp_buffer decoded;
        flush_code_points_consumed_as_char_ref t;
        (* Emit unconsumed chars after partial match *)
        if not full_match then begin
          let unconsumed = String.sub entity_name matched_len (String.length entity_name - matched_len) in
          emit_str t unconsumed;
          (* If there was a semicolon in input but we didn't use the full match, emit the semicolon too *)
          if has_semicolon then
            emit_char t ';'
        end;
        t.state <- t.return_state
      end
    | None ->
      (* No match - a consumed name ending in ';' is definitely an unknown
         named character reference; either way, emit everything we consumed *)
      if String.length entity_name > 0 && has_semicolon then
        error t "unknown-named-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state
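
  (* Worked examples for the longest-prefix matching above:

       &notin;   full match with semicolon -> U+2209
       &notit;   no entity "notit"; [try_match] shortens the candidate
                 until it hits the legacy entity "not" -> U+00AC, then the
                 unconsumed "it" and the ';' are re-emitted as "it;"
       &copy     legacy entity without semicolon -> U+00A9, plus
                 missing-semicolon-after-character-reference
       <a href="x&copy=1">   in an attribute value, "&copy" followed by
                 '=' hits the [blocked] case and is left verbatim, so the
                 query parameter survives undecoded. *)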
  and state_ambiguous_ampersand () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alnum c ->
      Tokenizer_stream.advance t.stream;
      (match t.return_state with
       | Tokenizer_state.Attribute_value_double_quoted
       | Tokenizer_state.Attribute_value_single_quoted
       | Tokenizer_state.Attribute_value_unquoted ->
         Buffer.add_char t.current_attr_value c
       | _ ->
         emit_char t c)
    | Some ';' ->
      error t "unknown-named-character-reference";
      t.state <- t.return_state
    | _ ->
      t.state <- t.return_state

  and state_numeric_character_reference () =
    t.char_ref_code <- 0;
    match Tokenizer_stream.peek t.stream with
    | Some (('x' | 'X') as c) ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer c;
      t.state <- Tokenizer_state.Hexadecimal_character_reference_start
    | _ ->
      t.state <- Tokenizer_state.Decimal_character_reference_start
  and state_hexadecimal_character_reference_start () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_hex c ->
      t.state <- Tokenizer_state.Hexadecimal_character_reference
    | _ ->
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

  and state_decimal_character_reference_start () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_digit c ->
      t.state <- Tokenizer_state.Decimal_character_reference
    | _ ->
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

  and state_hexadecimal_character_reference () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_digit c ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code '0');
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some c when c >= 'A' && c <= 'F' ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'A' + 10);
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some c when c >= 'a' && c <= 'f' ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'a' + 10);
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some ';' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Numeric_character_reference_end
    | _ ->
      error t "missing-semicolon-after-character-reference";
      t.state <- Tokenizer_state.Numeric_character_reference_end
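
  (* Both this state and the decimal one below clamp the accumulator to
     0x10FFFF + 1 after every digit. That keeps [char_ref_code] from
     overflowing OCaml's int on arbitrarily long references such as
     &#xFFFFFFFFFFFFFFFF; while still recording "out of Unicode range",
     which [state_numeric_character_reference_end] reports as
     character-reference-outside-unicode-range. A sketch of the idiom:

       let add_hex_digit code d =
         let code = (code * 16) + d in
         if code > 0x10FFFF then 0x10FFFF + 1 else code *)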
  and state_decimal_character_reference () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_digit c ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 10 + (Char.code c - Char.code '0');
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some ';' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Numeric_character_reference_end
    | _ ->
      error t "missing-semicolon-after-character-reference";
      t.state <- Tokenizer_state.Numeric_character_reference_end

  and state_numeric_character_reference_end () =
    let code = t.char_ref_code in
    let replacement_char = "\xEF\xBF\xBD" in

    let result =
      if code = 0 then begin
        error t "null-character-reference";
        replacement_char
      end else if code > 0x10FFFF then begin
        error t "character-reference-outside-unicode-range";
        replacement_char
      end else if code >= 0xD800 && code <= 0xDFFF then begin
        error t "surrogate-character-reference";
        replacement_char
      end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
                  List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
                                 0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF;
                                 0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF;
                                 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
                                 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
                                 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin
        error t "noncharacter-character-reference";
        Entities.Numeric_ref.codepoint_to_utf8 code
      end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
                  (code >= 0x0D && code <= 0x1F) ||
                  (code >= 0x7F && code <= 0x9F) then begin
        error t "control-character-reference";
        (* Apply Windows-1252 replacement table for 0x80-0x9F *)
        match Entities.Numeric_ref.find_replacement code with
        | Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
        | None -> Entities.Numeric_ref.codepoint_to_utf8 code
      end else
        Entities.Numeric_ref.codepoint_to_utf8 code
    in

    Buffer.clear t.temp_buffer;
    Buffer.add_string t.temp_buffer result;
    flush_code_points_consumed_as_char_ref t;
    t.state <- t.return_state
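
  (* Examples for the remapping above:

       &#0;        null-character-reference       -> U+FFFD
       &#x110000;  outside the Unicode range      -> U+FFFD
       &#xD800;    surrogate-character-reference  -> U+FFFD
       &#x80;      control-character-reference; the Windows-1252 table
                   then remaps 0x80               -> U+20AC (euro sign)
       &#xFFFE;    noncharacter-character-reference, but the code point
                   is kept as-is (error only, no replacement) *)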
  in
  process_state ()

let get_errors t = List.rev t.errors

let set_state t state = t.state <- state

let set_last_start_tag t name = t.last_start_tag <- name
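
(* [set_state] and [set_last_start_tag] exist for fragment parsing: the
   WHATWG fragment algorithm requires the driver to pre-seed the tokenizer
   when the context element is a rawtext-ish one. Hypothetical driver code
   (the real call sites live in the parser's fragment setup):

     set_state tokenizer Tokenizer_state.Script_data;
     set_last_start_tag tokenizer "script"

   so that a leading "</script>" in the fragment counts as an appropriate
   end tag for the context element rather than being swallowed as script
   text. *)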
lib/parser/constants.ml → lib/html5rw/parser/parser_constants.ml

lib/parser/dune (-4)
lib/parser/html5rw_parser.ml (deleted, -41)
···
-(*---------------------------------------------------------------------------
-   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
-   SPDX-License-Identifier: MIT
-  ---------------------------------------------------------------------------*)
-
-(* html5rw.parser - HTML5 parser with bytesrw-only API *)
-
-module Dom = Html5rw_dom
-module Tokenizer = Html5rw_tokenizer
-module Encoding = Html5rw_encoding
-module Constants = Constants
-module Insertion_mode = Insertion_mode
-module Tree_builder = Tree_builder
-
-type parse_error = Parser.parse_error
-type fragment_context = Parser.fragment_context
-type t = Parser.t
-
-(* parse_error accessors *)
-let error_code (e : parse_error) = e.Tree_builder.code
-let error_line (e : parse_error) = e.Tree_builder.line
-let error_column (e : parse_error) = e.Tree_builder.column
-
-(* fragment_context constructor and accessors *)
-let make_fragment_context ~tag_name ?(namespace=None) () : fragment_context =
-  { Tree_builder.tag_name; namespace }
-
-let fragment_context_tag (ctx : fragment_context) = ctx.Tree_builder.tag_name
-let fragment_context_namespace (ctx : fragment_context) = ctx.Tree_builder.namespace
-
-let parse = Parser.parse
-let parse_bytes = Parser.parse_bytes
-let query = Parser.query
-let to_writer = Parser.to_writer
-let to_string = Parser.to_string
-let to_text = Parser.to_text
-let to_test_format = Parser.to_test_format
-
-let root t = t.Parser.root
-let errors t = t.Parser.errors
-let encoding t = t.Parser.encoding
lib/parser/html5rw_parser.mli → lib/html5rw/parser/parser.mli (+5 -5)
···
 (** {1 Sub-modules} *)

 (** DOM types and manipulation. *)
-module Dom = Html5rw_dom
+module Dom = Dom

 (** HTML5 tokenizer.
···
 @see <https://html.spec.whatwg.org/multipage/parsing.html#tokenization>
 WHATWG: Tokenization *)
-module Tokenizer = Html5rw_tokenizer
+module Tokenizer = Tokenizer

 (** Character encoding detection and conversion.

 @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
 WHATWG: Determining the character encoding *)
-module Encoding = Html5rw_encoding
+module Encoding = Encoding

 (** HTML element constants and categories.
···
 {b Supported selectors:}

-See {!Html5rw_selector} for the complete list. Key selectors include:
+See {!Selector} for the complete list. Key selectors include:
 - Type: [div], [p], [a]
 - ID: [#myid]
 - Class: [.myclass]
···
 - Pseudo-class: [:first-child], [:nth-child(2)]
 - Combinators: [div p] (descendant), [div > p] (child)

-@raise Html5rw_selector.Selector_error if the selector is invalid
+@raise Selector.Selector_error if the selector is invalid

 @see <https://www.w3.org/TR/selectors-4/>
 W3C: Selectors Level 4 *)
lib/parser/insertion_mode.ml → lib/html5rw/parser/parser_insertion_mode.ml

lib/parser/parser.ml → lib/html5rw/parser/parser_impl.ml (+22 -22)
···
 open Bytesrw

-module Dom = Html5rw_dom
-module Tokenizer = Html5rw_tokenizer
-module Encoding = Html5rw_encoding
+module Dom = Dom
+module Tokenizer = Tokenizer
+module Encoding = Encoding

-type parse_error = Tree_builder.parse_error
+type parse_error = Parser_tree_builder.parse_error

-type fragment_context = Tree_builder.fragment_context
+type fragment_context = Parser_tree_builder.fragment_context

 type t = {
   root : Dom.node;
···

 (* Token sink that feeds tokens to tree builder *)
 module TreeBuilderSink = struct
-  type t = Tree_builder.t
+  type t = Parser_tree_builder.t

   let process tb token =
-    Tree_builder.process_token tb token;
+    Parser_tree_builder.process_token tb token;
     (* Check if we need to switch tokenizer state based on current element *)
     (* Only switch for HTML namespace elements - SVG/MathML use different rules *)
-    match Tree_builder.current_node tb with
+    match Parser_tree_builder.current_node tb with
     | Some node when node.Dom.namespace = None || node.Dom.namespace = Some "html" ->
       let name = node.Dom.name in
       if List.mem name ["textarea"; "title"] then
-        `SwitchTo Tokenizer.State.Rcdata
+        `SwitchTo Tokenizer_state.Rcdata
       else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then
-        `SwitchTo Tokenizer.State.Rawtext
+        `SwitchTo Tokenizer_state.Rawtext
       else if name = "script" then
-        `SwitchTo Tokenizer.State.Script_data
+        `SwitchTo Tokenizer_state.Script_data
       else if name = "plaintext" then
-        `SwitchTo Tokenizer.State.Plaintext
+        `SwitchTo Tokenizer_state.Plaintext
       else
         `Continue
     | _ -> `Continue

   let adjusted_current_node_in_html_namespace tb =
-    Tree_builder.adjusted_current_node_in_html_namespace tb
+    Parser_tree_builder.adjusted_current_node_in_html_namespace tb
 end

 (* Core parsing function that takes a Bytes.Reader.t *)
 let parse ?(collect_errors=false) ?fragment_context (reader : Bytes.Reader.t) =
-  let tb = Tree_builder.create ~collect_errors ?fragment_context () in
+  let tb = Parser_tree_builder.create ~collect_errors ?fragment_context () in
   let tokenizer = Tokenizer.create (module TreeBuilderSink) tb ~collect_errors () in

   (* Set tokenizer state for fragment parsing *)
···
   | Some ctx when ctx.namespace = None || ctx.namespace = Some "html" ->
     let name = String.lowercase_ascii ctx.tag_name in
     if List.mem name ["title"; "textarea"] then
-      Tokenizer.set_state tokenizer Tokenizer.State.Rcdata
+      Tokenizer.set_state tokenizer Tokenizer_state.Rcdata
     else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then
-      Tokenizer.set_state tokenizer Tokenizer.State.Rawtext
+      Tokenizer.set_state tokenizer Tokenizer_state.Rawtext
     else if name = "script" then
-      Tokenizer.set_state tokenizer Tokenizer.State.Script_data
+      Tokenizer.set_state tokenizer Tokenizer_state.Script_data
     else if name = "plaintext" then
-      Tokenizer.set_state tokenizer Tokenizer.State.Plaintext
+      Tokenizer.set_state tokenizer Tokenizer_state.Plaintext
   | _ -> ());

   Tokenizer.run tokenizer (module TreeBuilderSink) reader;

-  let root = Tree_builder.finish tb in
+  let root = Parser_tree_builder.finish tb in
   let tokenizer_errors = Tokenizer.get_errors tokenizer in
-  let tree_errors = Tree_builder.get_errors tb in
+  let tree_errors = Parser_tree_builder.get_errors tb in
   let all_errors = List.map (fun e ->
-    { Tree_builder.code = e.Tokenizer.Errors.code;
+    { Parser_tree_builder.code = e.Tokenizer.Errors.code;
       line = e.Tokenizer.Errors.line;
       column = e.Tokenizer.Errors.column }
   ) tokenizer_errors @ tree_errors in
···
   { result with encoding = Some enc }

 let query t selector =
-  Html5rw_selector.query t.root selector
+  Selector.query t.root selector

 (* Serialize to a Bytes.Writer.t *)
 let to_writer ?(pretty=true) ?(indent_size=2) t (writer : Bytes.Writer.t) =
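
A note on the `SwitchTo / Tokenizer.set_state dispatch in the two hunks
above: RCDATA, RAWTEXT, script data and PLAINTEXT elements contain text
rather than markup, so the tree builder must retune the tokenizer the
moment one of them is opened (and, for fragments, before the first byte is
read). A condensed restatement of the table both hunks encode, using this
patch's names:

    let state_for_element name =
      if List.mem name ["textarea"; "title"] then Some Tokenizer_state.Rcdata
      else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"]
      then Some Tokenizer_state.Rawtext
      else if name = "script" then Some Tokenizer_state.Script_data
      else if name = "plaintext" then Some Tokenizer_state.Plaintext
      else None

RCDATA still decodes character references but only ends at a matching end
tag; RAWTEXT and script data decode nothing, which is why "</b>" inside
<style> stays literal.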

lib/parser/tree_builder.ml → lib/html5rw/parser/parser_tree_builder.ml (+172 -172)
···
1
1
(* HTML5 Tree Builder *)
2
2
3
-
module Dom = Html5rw_dom
4
-
module Token = Html5rw_tokenizer.Token
5
-
module State = Html5rw_tokenizer.State
3
+
module Dom = Dom
4
+
module Token = Tokenizer_token
5
+
module State = Tokenizer_state
6
6
7
7
type fragment_context = {
8
8
tag_name : string;
···
25
25
26
26
type t = {
27
27
mutable document : Dom.node;
28
-
mutable mode : Insertion_mode.t;
29
-
mutable original_mode : Insertion_mode.t option;
28
+
mutable mode : Parser_insertion_mode.t;
29
+
mutable original_mode : Parser_insertion_mode.t option;
30
30
mutable open_elements : Dom.node list;
31
31
mutable active_formatting : formatting_entry list;
32
32
mutable head_element : Dom.node option;
···
35
35
mutable ignore_lf : bool;
36
36
mutable foster_parenting : bool;
37
37
mutable pending_table_chars : string list;
38
-
mutable template_modes : Insertion_mode.t list;
38
+
mutable template_modes : Parser_insertion_mode.t list;
39
39
mutable quirks_mode : Dom.quirks_mode;
40
40
mutable errors : parse_error list;
41
41
collect_errors : bool;
···
49
49
let doc = if is_fragment then Dom.create_document_fragment () else Dom.create_document () in
50
50
let t = {
51
51
document = doc;
52
-
mode = Insertion_mode.Initial;
52
+
mode = Parser_insertion_mode.Initial;
53
53
original_mode = None;
54
54
open_elements = [];
55
55
active_formatting = [];
···
86
86
| _ -> ());
87
87
(* Set initial mode based on context *)
88
88
t.mode <- (
89
-
if name = "html" then Insertion_mode.Before_head
89
+
if name = "html" then Parser_insertion_mode.Before_head
90
90
else if List.mem name ["tbody"; "thead"; "tfoot"] && (ns = None || ns = Some "html") then
91
-
Insertion_mode.In_table_body
91
+
Parser_insertion_mode.In_table_body
92
92
else if name = "tr" && (ns = None || ns = Some "html") then
93
-
Insertion_mode.In_row
93
+
Parser_insertion_mode.In_row
94
94
else if List.mem name ["td"; "th"] && (ns = None || ns = Some "html") then
95
-
Insertion_mode.In_cell
95
+
Parser_insertion_mode.In_cell
96
96
else if name = "caption" && (ns = None || ns = Some "html") then
97
-
Insertion_mode.In_caption
97
+
Parser_insertion_mode.In_caption
98
98
else if name = "colgroup" && (ns = None || ns = Some "html") then
99
-
Insertion_mode.In_column_group
99
+
Parser_insertion_mode.In_column_group
100
100
else if name = "table" && (ns = None || ns = Some "html") then
101
-
Insertion_mode.In_table
101
+
Parser_insertion_mode.In_table
102
102
else if name = "template" && (ns = None || ns = Some "html") then begin
103
-
t.template_modes <- [Insertion_mode.In_template];
104
-
Insertion_mode.In_template
103
+
t.template_modes <- [Parser_insertion_mode.In_template];
104
+
Parser_insertion_mode.In_template
105
105
end
106
106
else
107
-
Insertion_mode.In_body
107
+
Parser_insertion_mode.In_body
108
108
);
109
109
t.frameset_ok <- false
110
110
| None -> ());
···
136
136
let is_special_element node =
137
137
let name = String.lowercase_ascii node.Dom.name in
138
138
match node.Dom.namespace with
139
-
| None | Some "html" -> Constants.is_special name
139
+
| None | Some "html" -> Parser_constants.is_special name
140
140
| Some "mathml" -> List.mem name ["mi"; "mo"; "mn"; "ms"; "mtext"; "annotation-xml"]
141
141
| Some "svg" -> List.mem name ["foreignobject"; "desc"; "title"]
142
142
| _ -> false
···
213
213
let insert_foreign_element t (tag : Token.tag) namespace =
214
214
let attrs =
215
215
if namespace = Some "svg" then
216
-
Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs tag.attrs)
216
+
Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs tag.attrs)
217
217
else
218
-
Constants.adjust_foreign_attrs tag.attrs
218
+
Parser_constants.adjust_foreign_attrs tag.attrs
219
219
in
220
220
let name =
221
-
if namespace = Some "svg" then Constants.adjust_svg_tag_name tag.name
221
+
if namespace = Some "svg" then Parser_constants.adjust_svg_tag_name tag.name
222
222
else tag.name
223
223
in
224
224
let node = insert_element t name ~namespace attrs in
···
282
282
let is_html_integration_point node =
283
283
(* SVG foreignObject, desc, and title are always HTML integration points *)
284
284
if node.Dom.namespace = Some "svg" &&
285
-
List.mem node.Dom.name Constants.svg_html_integration then true
285
+
List.mem node.Dom.name Parser_constants.svg_html_integration then true
286
286
(* annotation-xml is an HTML integration point only with specific encoding values *)
287
287
else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
288
288
match List.assoc_opt "encoding" node.Dom.attrs with
···
313
313
check t.open_elements
314
314
315
315
let has_element_in_scope t name =
316
-
has_element_in_scope_impl t [name] Constants.default_scope ~check_integration_points:true
316
+
has_element_in_scope_impl t [name] Parser_constants.default_scope ~check_integration_points:true
317
317
318
318
let has_element_in_button_scope t name =
319
-
has_element_in_scope_impl t [name] Constants.button_scope ~check_integration_points:true
319
+
has_element_in_scope_impl t [name] Parser_constants.button_scope ~check_integration_points:true
320
320
321
321
let has_element_in_list_item_scope t name =
322
-
has_element_in_scope_impl t [name] Constants.list_item_scope ~check_integration_points:true
322
+
has_element_in_scope_impl t [name] Parser_constants.list_item_scope ~check_integration_points:true
323
323
324
324
let has_element_in_table_scope t name =
325
-
has_element_in_scope_impl t [name] Constants.table_scope ~check_integration_points:false
325
+
has_element_in_scope_impl t [name] Parser_constants.table_scope ~check_integration_points:false
326
326
327
327
let has_element_in_select_scope t name =
328
328
let rec check = function
329
329
| [] -> false
330
330
| n :: rest ->
331
331
if n.Dom.name = name then true
332
-
else if not (List.mem n.Dom.name Constants.select_scope_exclude) then false
332
+
else if not (List.mem n.Dom.name Parser_constants.select_scope_exclude) then false
333
333
else check rest
334
334
in
335
335
check t.open_elements
···
 let generate_implied_end_tags t ?except () =
   let rec loop () =
     match current_node t with
-    | Some n when List.mem n.Dom.name Constants.implied_end_tags ->
+    | Some n when List.mem n.Dom.name Parser_constants.implied_end_tags ->
       (match except with
        | Some ex when n.Dom.name = ex -> ()
        | _ -> pop_current t; loop ())
···
 let generate_all_implied_end_tags t =
   let rec loop () =
     match current_node t with
-    | Some n when List.mem n.Dom.name Constants.thoroughly_implied_end_tags ->
+    | Some n when List.mem n.Dom.name Parser_constants.thoroughly_implied_end_tags ->
       pop_current t; loop ()
     | _ -> ()
   in
···
 (* Reset insertion mode *)
 let reset_insertion_mode t =
   let rec check_node last = function
-    | [] -> t.mode <- Insertion_mode.In_body
+    | [] -> t.mode <- Parser_insertion_mode.In_body
     | node :: rest ->
       let is_last = rest = [] in
       let node_to_check =
···
         let rec find_table_or_template = function
           | [] -> ()
           | n :: rest ->
-            if n.Dom.name = "template" then t.mode <- Insertion_mode.In_select
-            else if n.Dom.name = "table" then t.mode <- Insertion_mode.In_select_in_table
+            if n.Dom.name = "template" then t.mode <- Parser_insertion_mode.In_select
+            else if n.Dom.name = "table" then t.mode <- Parser_insertion_mode.In_select_in_table
             else find_table_or_template rest
         in
         find_table_or_template rest
       end;
-      if t.mode <> Insertion_mode.In_select_in_table then
-        t.mode <- Insertion_mode.In_select
+      if t.mode <> Parser_insertion_mode.In_select_in_table then
+        t.mode <- Parser_insertion_mode.In_select
     end else if List.mem name ["td"; "th"] && not is_last then
-      t.mode <- Insertion_mode.In_cell
+      t.mode <- Parser_insertion_mode.In_cell
     else if name = "tr" then
-      t.mode <- Insertion_mode.In_row
+      t.mode <- Parser_insertion_mode.In_row
     else if List.mem name ["tbody"; "thead"; "tfoot"] then
-      t.mode <- Insertion_mode.In_table_body
+      t.mode <- Parser_insertion_mode.In_table_body
     else if name = "caption" then
-      t.mode <- Insertion_mode.In_caption
+      t.mode <- Parser_insertion_mode.In_caption
     else if name = "colgroup" then
-      t.mode <- Insertion_mode.In_column_group
+      t.mode <- Parser_insertion_mode.In_column_group
     else if name = "table" then
-      t.mode <- Insertion_mode.In_table
+      t.mode <- Parser_insertion_mode.In_table
     else if name = "template" then
-      t.mode <- (match t.template_modes with m :: _ -> m | [] -> Insertion_mode.In_template)
+      t.mode <- (match t.template_modes with m :: _ -> m | [] -> Parser_insertion_mode.In_template)
     else if name = "head" && not is_last then
-      t.mode <- Insertion_mode.In_head
+      t.mode <- Parser_insertion_mode.In_head
     else if name = "body" then
-      t.mode <- Insertion_mode.In_body
+      t.mode <- Parser_insertion_mode.In_body
     else if name = "frameset" then
-      t.mode <- Insertion_mode.In_frameset
+      t.mode <- Parser_insertion_mode.In_frameset
     else if name = "html" then
-      t.mode <- (if t.head_element = None then Insertion_mode.Before_head else Insertion_mode.After_head)
+      t.mode <- (if t.head_element = None then Parser_insertion_mode.Before_head else Parser_insertion_mode.After_head)
     else if is_last then
-      t.mode <- Insertion_mode.In_body
+      t.mode <- Parser_insertion_mode.In_body
     else
       check_node last rest
   in
···
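A worked example of the walk above (a sketch of expected outcomes, not output from this code):

  (* Hypothetical stacks of open elements, current node first, "html" last. *)
  (* ["td"; "tr"; "tbody"; "table"; "body"; "html"]  ->  In_cell  ("td", not last) *)
  (* ["tr"; "tbody"; "table"; "body"; "html"]        ->  In_row   *)
  (* ["body"; "html"]                                ->  In_body  *)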
     let sys = Option.map String.lowercase_ascii dt.system_id in
     let is_quirky =
       (match pub with
-       | Some p -> List.mem p Constants.quirky_public_matches ||
+       | Some p -> List.mem p Parser_constants.quirky_public_matches ||
                    List.exists (fun prefix -> String.length p >= String.length prefix &&
-                     String.sub p 0 (String.length prefix) = prefix) Constants.quirky_public_prefixes
+                     String.sub p 0 (String.length prefix) = prefix) Parser_constants.quirky_public_prefixes
        | None -> false) ||
       (match sys with
-       | Some s -> List.mem s Constants.quirky_system_matches
+       | Some s -> List.mem s Parser_constants.quirky_system_matches
        | None -> false)
     in
     if is_quirky then t.quirks_mode <- Dom.Quirks
···
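A sketch of what those renamed tables contain, taken from the WHATWG quirks-mode rules (representative entries only; the Parser_constants lists themselves are not shown in this diff, and the strings are lowercased to match the comparison above):

  let quirky_public_matches =
    [ "-//w3o//dtd w3 html strict 3.0//en//";
      "-/w3c/dtd html 4.0 transitional/en";
      "html" ]
  let quirky_system_matches =
    [ "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" ]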
       match pub with
       | Some p -> List.exists (fun prefix -> String.length p >= String.length prefix &&
                     String.sub p 0 (String.length prefix) = prefix)
-                    Constants.limited_quirky_public_prefixes
+                    Parser_constants.limited_quirky_public_prefixes
       | None -> false
     in
     if is_limited_quirky then t.quirks_mode <- Dom.Limited_quirks
   end
 end;
-    t.mode <- Insertion_mode.Before_html
+    t.mode <- Parser_insertion_mode.Before_html
   | _ ->
     parse_error t "expected-doctype-but-got-other";
     t.quirks_mode <- Dom.Quirks;
-    t.mode <- Insertion_mode.Before_html;
+    t.mode <- Parser_insertion_mode.Before_html;
     process_token t token

 and process_before_html t token =
···
   | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
     let html = insert_element t "html" attrs in
     t.open_elements <- [html];
-    t.mode <- Insertion_mode.Before_head
+    t.mode <- Parser_insertion_mode.Before_head
   | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
     let html = insert_element t "html" [] in
     t.open_elements <- [html];
-    t.mode <- Insertion_mode.Before_head;
+    t.mode <- Parser_insertion_mode.Before_head;
     process_token t token
   | Token.Tag { kind = Token.End; _ } ->
     parse_error t "unexpected-end-tag"
   | _ ->
     let html = insert_element t "html" [] in
     t.open_elements <- [html];
-    t.mode <- Insertion_mode.Before_head;
+    t.mode <- Parser_insertion_mode.Before_head;
     process_token t token

 and process_before_head t token =
···
     let head = insert_element t "head" attrs in
     t.open_elements <- head :: t.open_elements;
     t.head_element <- Some head;
-    t.mode <- Insertion_mode.In_head
+    t.mode <- Parser_insertion_mode.In_head
   | Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
     let head = insert_element t "head" [] in
     t.open_elements <- head :: t.open_elements;
     t.head_element <- Some head;
-    t.mode <- Insertion_mode.In_head;
+    t.mode <- Parser_insertion_mode.In_head;
     process_token t token
   | Token.Tag { kind = Token.End; _ } ->
     parse_error t "unexpected-end-tag"
···
     let head = insert_element t "head" [] in
     t.open_elements <- head :: t.open_elements;
     t.head_element <- Some head;
-    t.mode <- Insertion_mode.In_head;
+    t.mode <- Parser_insertion_mode.In_head;
     process_token t token

 and process_in_head t token =
···
      | Some n when n.Dom.children <> [] -> insert_character t leading_ws
      | _ -> ());
     pop_current t;
-    t.mode <- Insertion_mode.After_head;
+    t.mode <- Parser_insertion_mode.After_head;
     process_token t (Token.Character remaining)
   | Token.Comment data ->
     insert_comment t data
···
   | Token.Tag { kind = Token.Start; name = "title"; _ } ->
     ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs = []; self_closing = false });
     t.original_mode <- Some t.mode;
-    t.mode <- Insertion_mode.Text
+    t.mode <- Parser_insertion_mode.Text
   | Token.Tag { kind = Token.Start; name; _ }
     when List.mem name ["noframes"; "style"] ->
     ignore (insert_element_for_token t { kind = Token.Start; name; attrs = []; self_closing = false });
     t.original_mode <- Some t.mode;
-    t.mode <- Insertion_mode.Text
+    t.mode <- Parser_insertion_mode.Text
   | Token.Tag { kind = Token.Start; name = "noscript"; _ } ->
     (* Scripting is disabled: parse noscript content as HTML *)
     ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs = []; self_closing = false });
-    t.mode <- Insertion_mode.In_head_noscript
+    t.mode <- Parser_insertion_mode.In_head_noscript
   | Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } ->
     ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing });
     t.original_mode <- Some t.mode;
-    t.mode <- Insertion_mode.Text
+    t.mode <- Parser_insertion_mode.Text
   | Token.Tag { kind = Token.End; name = "head"; _ } ->
     pop_current t;
-    t.mode <- Insertion_mode.After_head
+    t.mode <- Parser_insertion_mode.After_head
   | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
     pop_current t;
-    t.mode <- Insertion_mode.After_head;
+    t.mode <- Parser_insertion_mode.After_head;
     process_token t token
   | Token.Tag { kind = Token.Start; name = "template"; attrs; _ } ->
     let node = Dom.create_template ~attrs () in
···
     t.open_elements <- node :: t.open_elements;
     push_formatting_marker t;
     t.frameset_ok <- false;
-    t.mode <- Insertion_mode.In_template;
-    t.template_modes <- Insertion_mode.In_template :: t.template_modes
+    t.mode <- Parser_insertion_mode.In_template;
+    t.template_modes <- Parser_insertion_mode.In_template :: t.template_modes
   | Token.Tag { kind = Token.End; name = "template"; _ } ->
     if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then
       parse_error t "unexpected-end-tag"
···
       parse_error t "unexpected-end-tag"
   | _ ->
     pop_current t;
-    t.mode <- Insertion_mode.After_head;
+    t.mode <- Parser_insertion_mode.After_head;
     process_token t token

 and process_in_head_noscript t token =
···
   | Token.Character _ ->
     parse_error t "unexpected-char-in-noscript";
     pop_current t; (* Pop noscript *)
-    t.mode <- Insertion_mode.In_head;
+    t.mode <- Parser_insertion_mode.In_head;
     process_token t token
   | Token.Comment _ ->
     process_in_head t token
···
   | Token.Tag { kind = Token.Start; _ } ->
     parse_error t "unexpected-start-tag";
     pop_current t; (* Pop noscript *)
-    t.mode <- Insertion_mode.In_head;
+    t.mode <- Parser_insertion_mode.In_head;
     process_token t token
   | Token.Tag { kind = Token.End; name = "noscript"; _ } ->
     pop_current t; (* Pop noscript *)
-    t.mode <- Insertion_mode.In_head
+    t.mode <- Parser_insertion_mode.In_head
   | Token.Tag { kind = Token.End; name = "br"; _ } ->
     parse_error t "unexpected-end-tag";
     pop_current t; (* Pop noscript *)
-    t.mode <- Insertion_mode.In_head;
+    t.mode <- Parser_insertion_mode.In_head;
     process_token t token
   | Token.Tag { kind = Token.End; _ } ->
     parse_error t "unexpected-end-tag"
   | Token.EOF ->
     parse_error t "expected-closing-tag-but-got-eof";
     pop_current t; (* Pop noscript *)
-    t.mode <- Insertion_mode.In_head;
+    t.mode <- Parser_insertion_mode.In_head;
     process_token t token

 and process_after_head t token =
···
   | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
     ignore (insert_element t "body" ~push:true attrs);
     t.frameset_ok <- false;
-    t.mode <- Insertion_mode.In_body
+    t.mode <- Parser_insertion_mode.In_body
   | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
     ignore (insert_element t "frameset" ~push:true attrs);
-    t.mode <- Insertion_mode.In_frameset
+    t.mode <- Parser_insertion_mode.In_frameset
   | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
     (* Special handling for input type="hidden" - parse error, ignore *)
     let input_type = List.assoc_opt "type" attrs in
···
        (* Non-hidden input creates body *)
        let body = insert_element t "body" [] in
        t.open_elements <- body :: t.open_elements;
-       t.mode <- Insertion_mode.In_body;
+       t.mode <- Parser_insertion_mode.In_body;
        process_token t token)
   | Token.Tag { kind = Token.Start; name; _ }
     when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
···
   | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
     let body = insert_element t "body" [] in
     t.open_elements <- body :: t.open_elements;
-    t.mode <- Insertion_mode.In_body;
+    t.mode <- Parser_insertion_mode.In_body;
     process_token t token
   | Token.Tag { kind = Token.Start; name = "head"; _ } ->
     parse_error t "unexpected-start-tag"
···
   | _ ->
     let body = insert_element t "body" [] in
     t.open_elements <- body :: t.open_elements;
-    t.mode <- Insertion_mode.In_body;
+    t.mode <- Parser_insertion_mode.In_body;
     process_token t token

 and process_in_body t token =
···
       t.open_elements <- drop (idx + 1) t.open_elements;
       (* Insert frameset element *)
       ignore (insert_element t "frameset" ~push:true attrs);
-      t.mode <- Insertion_mode.In_frameset
+      t.mode <- Parser_insertion_mode.In_frameset
     end
   | Token.EOF ->
     if t.template_modes <> [] then
···
         not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"])
       ) t.open_elements in
       if has_unclosed then parse_error t "end-tag-too-early";
-      t.mode <- Insertion_mode.After_body
+      t.mode <- Parser_insertion_mode.After_body
     end
   | Token.Tag { kind = Token.End; name = "html"; _ } ->
     if not (has_element_in_scope t "body") then
       parse_error t "unexpected-end-tag"
     else begin
-      t.mode <- Insertion_mode.After_body;
+      t.mode <- Parser_insertion_mode.After_body;
       process_token t token
     end
   | Token.Tag { kind = Token.Start; name; attrs; _ }
     when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] ->
     if has_element_in_button_scope t "p" then close_p_element t;
     ignore (insert_element t name ~push:true attrs)
-  | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Constants.heading_elements ->
+  | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Parser_constants.heading_elements ->
     if has_element_in_button_scope t "p" then close_p_element t;
     (match current_node t with
-     | Some n when List.mem n.Dom.name Constants.heading_elements ->
+     | Some n when List.mem n.Dom.name Parser_constants.heading_elements ->
       parse_error t "unexpected-start-tag";
       pop_current t
     | _ -> ());
···
     | _ -> ());
     pop_until_tag t name
   end
-  | Token.Tag { kind = Token.End; name; _ } when List.mem name Constants.heading_elements ->
-    if not (has_element_in_scope_impl t Constants.heading_elements Constants.default_scope ~check_integration_points:true) then
+  | Token.Tag { kind = Token.End; name; _ } when List.mem name Parser_constants.heading_elements ->
+    if not (has_element_in_scope_impl t Parser_constants.heading_elements Parser_constants.default_scope ~check_integration_points:true) then
      parse_error t "unexpected-end-tag"
    else begin
      generate_implied_end_tags t ();
      (match current_node t with
       | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
       | _ -> ());
-     pop_until_one_of t Constants.heading_elements
+     pop_until_one_of t Parser_constants.heading_elements
    end
  | Token.Tag { kind = Token.Start; name = "a"; attrs; _ } ->
    (* Check for existing <a> in active formatting *)
···
     close_p_element t;
     ignore (insert_element t "table" ~push:true attrs);
     t.frameset_ok <- false;
-    t.mode <- Insertion_mode.In_table
+    t.mode <- Parser_insertion_mode.In_table
   | Token.Tag { kind = Token.End; name = "br"; _ } ->
     parse_error t "unexpected-end-tag";
     reconstruct_active_formatting t;
···
     t.ignore_lf <- true;
     t.original_mode <- Some t.mode;
     t.frameset_ok <- false;
-    t.mode <- Insertion_mode.Text
+    t.mode <- Parser_insertion_mode.Text
   | Token.Tag { kind = Token.Start; name = "xmp"; _ } ->
     if has_element_in_button_scope t "p" then close_p_element t;
     reconstruct_active_formatting t;
     t.frameset_ok <- false;
     ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs = []; self_closing = false });
     t.original_mode <- Some t.mode;
-    t.mode <- Insertion_mode.Text
+    t.mode <- Parser_insertion_mode.Text
   | Token.Tag { kind = Token.Start; name = "iframe"; _ } ->
     t.frameset_ok <- false;
     ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs = []; self_closing = false });
     t.original_mode <- Some t.mode;
-    t.mode <- Insertion_mode.Text
+    t.mode <- Parser_insertion_mode.Text
   | Token.Tag { kind = Token.Start; name = "noembed"; _ } ->
     ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs = []; self_closing = false });
     t.original_mode <- Some t.mode;
-    t.mode <- Insertion_mode.Text
+    t.mode <- Parser_insertion_mode.Text
   | Token.Tag { kind = Token.Start; name = "select"; attrs; _ } ->
     reconstruct_active_formatting t;
     ignore (insert_element t "select" ~push:true attrs);
     t.frameset_ok <- false;
-    if List.mem t.mode [Insertion_mode.In_table; Insertion_mode.In_caption; Insertion_mode.In_table_body; Insertion_mode.In_row; Insertion_mode.In_cell] then
-      t.mode <- Insertion_mode.In_select_in_table
+    if List.mem t.mode [Parser_insertion_mode.In_table; Parser_insertion_mode.In_caption; Parser_insertion_mode.In_table_body; Parser_insertion_mode.In_row; Parser_insertion_mode.In_cell] then
+      t.mode <- Parser_insertion_mode.In_select_in_table
     else
-      t.mode <- Insertion_mode.In_select
+      t.mode <- Parser_insertion_mode.In_select
   | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["optgroup"; "option"] ->
     (match current_node t with
      | Some n when n.Dom.name = "option" -> pop_current t
···
     ignore (insert_element t name ~push:true attrs)
   | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } ->
     reconstruct_active_formatting t;
-    let adjusted_attrs = Constants.adjust_mathml_attrs (Constants.adjust_foreign_attrs attrs) in
+    let adjusted_attrs = Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs) in
     ignore (insert_foreign_element t { kind = Token.Start; name = "math"; attrs = adjusted_attrs; self_closing } (Some "mathml"));
     if self_closing then pop_current t
   | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } ->
     reconstruct_active_formatting t;
-    let adjusted_attrs = Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs attrs) in
+    let adjusted_attrs = Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs) in
     ignore (insert_foreign_element t { kind = Token.Start; name = "svg"; attrs = adjusted_attrs; self_closing } (Some "svg"));
     if self_closing then pop_current t
   | Token.Tag { kind = Token.Start; name; attrs; _ }
···
   | Token.EOF ->
     parse_error t "expected-closing-tag-but-got-eof";
     pop_current t;
-    t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_body;
+    t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body;
     process_token t token
   | Token.Tag { kind = Token.End; _ } ->
     pop_current t;
-    t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_body
+    t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_body
   | _ -> ()

 and process_in_table t token =
···
   | Token.Character _ when (match current_node t with Some n -> List.mem n.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] | None -> false) ->
     t.pending_table_chars <- [];
     t.original_mode <- Some t.mode;
-    t.mode <- Insertion_mode.In_table_text;
+    t.mode <- Parser_insertion_mode.In_table_text;
     process_token t token
   | Token.Comment data ->
     insert_comment t data
···
     clear_stack_back_to_table_context t;
     push_formatting_marker t;
     ignore (insert_element t "caption" ~push:true attrs);
-    t.mode <- Insertion_mode.In_caption
+    t.mode <- Parser_insertion_mode.In_caption
   | Token.Tag { kind = Token.Start; name = "colgroup"; attrs; _ } ->
     clear_stack_back_to_table_context t;
     ignore (insert_element t "colgroup" ~push:true attrs);
-    t.mode <- Insertion_mode.In_column_group
+    t.mode <- Parser_insertion_mode.In_column_group
   | Token.Tag { kind = Token.Start; name = "col"; _ } ->
     clear_stack_back_to_table_context t;
     ignore (insert_element t "colgroup" ~push:true []);
-    t.mode <- Insertion_mode.In_column_group;
+    t.mode <- Parser_insertion_mode.In_column_group;
     process_token t token
   | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
     clear_stack_back_to_table_context t;
     ignore (insert_element t name ~push:true attrs);
-    t.mode <- Insertion_mode.In_table_body
+    t.mode <- Parser_insertion_mode.In_table_body
   | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"; "tr"] ->
     clear_stack_back_to_table_context t;
     ignore (insert_element t "tbody" ~push:true []);
-    t.mode <- Insertion_mode.In_table_body;
+    t.mode <- Parser_insertion_mode.In_table_body;
     process_token t token
   | Token.Tag { kind = Token.Start; name = "table"; _ } ->
     parse_error t "unexpected-start-tag";
···
       t.foster_parenting <- false
     end else
       insert_character t pending;
-    t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_table;
+    t.mode <- Option.value t.original_mode ~default:Parser_insertion_mode.In_table;
     process_token t token

 and process_in_caption t token =
···
      | _ -> ());
     pop_until_tag t "caption";
     clear_active_formatting_to_marker t;
-    t.mode <- Insertion_mode.In_table
+    t.mode <- Parser_insertion_mode.In_table
   end
   | Token.Tag { kind = Token.Start; name; _ }
     when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
···
       generate_implied_end_tags t ();
       pop_until_tag t "caption";
       clear_active_formatting_to_marker t;
-      t.mode <- Insertion_mode.In_table;
+      t.mode <- Parser_insertion_mode.In_table;
       process_token t token
     end
   | Token.Tag { kind = Token.End; name = "table"; _ } ->
···
       generate_implied_end_tags t ();
       pop_until_tag t "caption";
       clear_active_formatting_to_marker t;
-      t.mode <- Insertion_mode.In_table;
+      t.mode <- Parser_insertion_mode.In_table;
       process_token t token
     end
   | Token.Tag { kind = Token.End; name; _ }
···
     (match current_node t with
      | Some n when n.Dom.name = "colgroup" ->
        pop_current t;
-       t.mode <- Insertion_mode.In_table;
+       t.mode <- Parser_insertion_mode.In_table;
        process_token t (Token.Character remaining)
      | _ ->
        parse_error t "unexpected-token")
···
   | Token.Tag { kind = Token.End; name = "colgroup"; _ } ->
     (match current_node t with
      | Some n when n.Dom.name <> "colgroup" -> parse_error t "unexpected-end-tag"
-     | Some _ -> pop_current t; t.mode <- Insertion_mode.In_table
+     | Some _ -> pop_current t; t.mode <- Parser_insertion_mode.In_table
      | None -> parse_error t "unexpected-end-tag")
   | Token.Tag { kind = Token.End; name = "col"; _ } ->
     parse_error t "unexpected-end-tag"
···
     (match current_node t with
      | Some n when n.Dom.name = "colgroup" ->
        pop_current t;
-       t.mode <- Insertion_mode.In_table;
+       t.mode <- Parser_insertion_mode.In_table;
        process_token t token
      | _ ->
        parse_error t "unexpected-token")
···
   | Token.Tag { kind = Token.Start; name = "tr"; attrs; _ } ->
     clear_stack_back_to_table_body_context t;
     ignore (insert_element t "tr" ~push:true attrs);
-    t.mode <- Insertion_mode.In_row
+    t.mode <- Parser_insertion_mode.In_row
   | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["th"; "td"] ->
     parse_error t "unexpected-start-tag";
     clear_stack_back_to_table_body_context t;
     ignore (insert_element t "tr" ~push:true []);
-    t.mode <- Insertion_mode.In_row;
+    t.mode <- Parser_insertion_mode.In_row;
     process_token t token
   | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
     if not (has_element_in_table_scope t name) then
···
     else begin
       clear_stack_back_to_table_body_context t;
       pop_current t;
-      t.mode <- Insertion_mode.In_table
+      t.mode <- Parser_insertion_mode.In_table
     end
   | Token.Tag { kind = Token.Start; name; _ }
     when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
-    if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Constants.table_scope ~check_integration_points:false) then
+    if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then
       parse_error t "unexpected-start-tag"
     else begin
       clear_stack_back_to_table_body_context t;
       pop_current t;
-      t.mode <- Insertion_mode.In_table;
+      t.mode <- Parser_insertion_mode.In_table;
       process_token t token
     end
   | Token.Tag { kind = Token.End; name = "table"; _ } ->
-    if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Constants.table_scope ~check_integration_points:false) then
+    if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Parser_constants.table_scope ~check_integration_points:false) then
       parse_error t "unexpected-end-tag"
     else begin
       clear_stack_back_to_table_body_context t;
       pop_current t;
-      t.mode <- Insertion_mode.In_table;
+      t.mode <- Parser_insertion_mode.In_table;
       process_token t token
     end
   | Token.Tag { kind = Token.End; name; _ }
···
   | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["th"; "td"] ->
     clear_stack_back_to_table_row_context t;
     ignore (insert_element t name ~push:true attrs);
-    t.mode <- Insertion_mode.In_cell;
+    t.mode <- Parser_insertion_mode.In_cell;
     push_formatting_marker t
   | Token.Tag { kind = Token.End; name = "tr"; _ } ->
     if not (has_element_in_table_scope t "tr") then
···
     else begin
       clear_stack_back_to_table_row_context t;
       pop_current t;
-      t.mode <- Insertion_mode.In_table_body
+      t.mode <- Parser_insertion_mode.In_table_body
     end
   | Token.Tag { kind = Token.Start; name; _ }
     when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"; "tr"] ->
···
     else begin
       clear_stack_back_to_table_row_context t;
       pop_current t;
-      t.mode <- Insertion_mode.In_table_body;
+      t.mode <- Parser_insertion_mode.In_table_body;
       process_token t token
     end
   | Token.Tag { kind = Token.End; name = "table"; _ } ->
···
     else begin
       clear_stack_back_to_table_row_context t;
       pop_current t;
-      t.mode <- Insertion_mode.In_table_body;
+      t.mode <- Parser_insertion_mode.In_table_body;
       process_token t token
     end
   | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
···
     else begin
       clear_stack_back_to_table_row_context t;
       pop_current t;
-      t.mode <- Insertion_mode.In_table_body;
+      t.mode <- Parser_insertion_mode.In_table_body;
       process_token t token
     end
   | Token.Tag { kind = Token.End; name; _ }
···
      | _ -> ());
     pop_until_html_tag t name;
     clear_active_formatting_to_marker t;
-    t.mode <- Insertion_mode.In_row
+    t.mode <- Parser_insertion_mode.In_row
   end
   | Token.Tag { kind = Token.Start; name; _ }
     when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
-    if not (has_element_in_scope_impl t ["td"; "th"] Constants.table_scope ~check_integration_points:false) then
+    if not (has_element_in_scope_impl t ["td"; "th"] Parser_constants.table_scope ~check_integration_points:false) then
       parse_error t "unexpected-start-tag"
     else begin
       close_cell t;
···
      | _ -> ());
     pop_until_html_one_of t ["td"; "th"];
     clear_active_formatting_to_marker t;
-    t.mode <- Insertion_mode.In_row
+    t.mode <- Parser_insertion_mode.In_row

 and process_in_select t token =
   match token with
···
     ignore (insert_element t name attrs)
     (* Don't push to stack - void elements *)
   (* Handle formatting elements in select *)
-  | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Constants.formatting_elements ->
+  | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Parser_constants.formatting_elements ->
     reconstruct_active_formatting t;
     let node = insert_element t name ~push:true attrs in
     push_formatting_element t node name attrs
-  | Token.Tag { kind = Token.End; name; _ } when List.mem name Constants.formatting_elements ->
+  | Token.Tag { kind = Token.End; name; _ } when List.mem name Parser_constants.formatting_elements ->
     (* Find select element and check if formatting element is inside select *)
     let select_idx = ref None in
     let fmt_idx = ref None in
···
   | Token.Tag { kind = Token.Start; name; _ }
     when List.mem name ["caption"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
     t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
-    t.template_modes <- Insertion_mode.In_table :: t.template_modes;
-    t.mode <- Insertion_mode.In_table;
+    t.template_modes <- Parser_insertion_mode.In_table :: t.template_modes;
+    t.mode <- Parser_insertion_mode.In_table;
     process_token t token
   | Token.Tag { kind = Token.Start; name = "col"; _ } ->
     t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
-    t.template_modes <- Insertion_mode.In_column_group :: t.template_modes;
-    t.mode <- Insertion_mode.In_column_group;
+    t.template_modes <- Parser_insertion_mode.In_column_group :: t.template_modes;
+    t.mode <- Parser_insertion_mode.In_column_group;
     process_token t token
   | Token.Tag { kind = Token.Start; name = "tr"; _ } ->
     t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
-    t.template_modes <- Insertion_mode.In_table_body :: t.template_modes;
-    t.mode <- Insertion_mode.In_table_body;
+    t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes;
+    t.mode <- Parser_insertion_mode.In_table_body;
     process_token t token
   | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"] ->
     t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
-    t.template_modes <- Insertion_mode.In_row :: t.template_modes;
-    t.mode <- Insertion_mode.In_row;
+    t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes;
+    t.mode <- Parser_insertion_mode.In_row;
     process_token t token
   | Token.Tag { kind = Token.Start; _ } ->
     t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
-    t.template_modes <- Insertion_mode.In_body :: t.template_modes;
-    t.mode <- Insertion_mode.In_body;
+    t.template_modes <- Parser_insertion_mode.In_body :: t.template_modes;
+    t.mode <- Parser_insertion_mode.In_body;
     process_token t token
   | Token.Tag { kind = Token.End; _ } ->
     parse_error t "unexpected-end-tag"
···
     if t.fragment_context <> None then
       parse_error t "unexpected-end-tag"
     else
-      t.mode <- Insertion_mode.After_after_body
+      t.mode <- Parser_insertion_mode.After_after_body
   | Token.EOF ->
     () (* Stop parsing *)
   | _ ->
     parse_error t "unexpected-token-after-body";
-    t.mode <- Insertion_mode.In_body;
+    t.mode <- Parser_insertion_mode.In_body;
     process_token t token

 and process_in_frameset t token =
···
     pop_current t;
     if t.fragment_context = None then
       (match current_node t with
-       | Some n when n.Dom.name <> "frameset" -> t.mode <- Insertion_mode.After_frameset
+       | Some n when n.Dom.name <> "frameset" -> t.mode <- Parser_insertion_mode.After_frameset
        | _ -> ()))
   | Token.Tag { kind = Token.Start; name = "frame"; attrs; _ } ->
     ignore (insert_element t "frame" ~push:true attrs);
···
   | Token.Tag { kind = Token.Start; name = "html"; _ } ->
     process_in_body t token
   | Token.Tag { kind = Token.End; name = "html"; _ } ->
-    t.mode <- Insertion_mode.After_after_frameset
+    t.mode <- Parser_insertion_mode.After_after_frameset
   | Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
     process_in_head t token
   | Token.EOF ->
···
     () (* Stop parsing *)
   | _ ->
     parse_error t "unexpected-token-after-after-body";
-    t.mode <- Insertion_mode.In_body;
+    t.mode <- Parser_insertion_mode.In_body;
     process_token t token

 and process_after_after_frameset t token =
···
 let is_html_integration_point node =
   (* SVG foreignObject, desc, and title are always HTML integration points *)
   if node.Dom.namespace = Some "svg" &&
-     List.mem node.Dom.name Constants.svg_html_integration then true
+     List.mem node.Dom.name Parser_constants.svg_html_integration then true
   (* annotation-xml is an HTML integration point only with specific encoding values *)
   else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
     match List.assoc_opt "encoding" node.Dom.attrs with
2275
 let is_html_integration_point node =
   (* SVG foreignObject, desc, and title are always HTML integration points *)
   if node.Dom.namespace = Some "svg" &&
-     List.mem node.Dom.name Constants.svg_html_integration then true
+     List.mem node.Dom.name Parser_constants.svg_html_integration then true
   (* annotation-xml is an HTML integration point only with specific encoding values *)
   else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
     match List.assoc_opt "encoding" node.Dom.attrs with
···
   | Token.Tag { kind = Token.Start; name; attrs; self_closing } ->
     let name =
       match adjusted_current_node t with
-      | Some n when n.Dom.namespace = Some "svg" -> Constants.adjust_svg_tag_name name
+      | Some n when n.Dom.namespace = Some "svg" -> Parser_constants.adjust_svg_tag_name name
       | _ -> name
     in
     let attrs =
       match adjusted_current_node t with
       | Some n when n.Dom.namespace = Some "svg" ->
-        Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs attrs)
+        Parser_constants.adjust_svg_attrs (Parser_constants.adjust_foreign_attrs attrs)
       | Some n when n.Dom.namespace = Some "mathml" ->
-        Constants.adjust_mathml_attrs (Constants.adjust_foreign_attrs attrs)
-      | _ -> Constants.adjust_foreign_attrs attrs
+        Parser_constants.adjust_mathml_attrs (Parser_constants.adjust_foreign_attrs attrs)
+      | _ -> Parser_constants.adjust_foreign_attrs attrs
     in
     let namespace =
       match adjusted_current_node t with
···
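adjust_svg_tag_name restores the camelCase SVG element names that HTML tokenization lowercases. A few representative entries from the spec's fixup table (illustrative; the renamed Parser_constants table is not shown in this diff, and the full spec table is longer):

  let adjust_svg_tag_name_sketch = function
    | "clippath" -> "clipPath"
    | "foreignobject" -> "foreignObject"
    | "lineargradient" -> "linearGradient"
    | "radialgradient" -> "radialGradient"
    | "textpath" -> "textPath"
    | name -> name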

 and process_by_mode t token =
   match t.mode with
-  | Insertion_mode.Initial -> process_initial t token
-  | Insertion_mode.Before_html -> process_before_html t token
-  | Insertion_mode.Before_head -> process_before_head t token
-  | Insertion_mode.In_head -> process_in_head t token
-  | Insertion_mode.In_head_noscript -> process_in_head_noscript t token
-  | Insertion_mode.After_head -> process_after_head t token
-  | Insertion_mode.In_body -> process_in_body t token
-  | Insertion_mode.Text -> process_text t token
-  | Insertion_mode.In_table -> process_in_table t token
-  | Insertion_mode.In_table_text -> process_in_table_text t token
-  | Insertion_mode.In_caption -> process_in_caption t token
-  | Insertion_mode.In_column_group -> process_in_column_group t token
-  | Insertion_mode.In_table_body -> process_in_table_body t token
-  | Insertion_mode.In_row -> process_in_row t token
-  | Insertion_mode.In_cell -> process_in_cell t token
-  | Insertion_mode.In_select -> process_in_select t token
-  | Insertion_mode.In_select_in_table -> process_in_select_in_table t token
-  | Insertion_mode.In_template -> process_in_template t token
-  | Insertion_mode.After_body -> process_after_body t token
-  | Insertion_mode.In_frameset -> process_in_frameset t token
-  | Insertion_mode.After_frameset -> process_after_frameset t token
-  | Insertion_mode.After_after_body -> process_after_after_body t token
-  | Insertion_mode.After_after_frameset -> process_after_after_frameset t token
+  | Parser_insertion_mode.Initial -> process_initial t token
+  | Parser_insertion_mode.Before_html -> process_before_html t token
+  | Parser_insertion_mode.Before_head -> process_before_head t token
+  | Parser_insertion_mode.In_head -> process_in_head t token
+  | Parser_insertion_mode.In_head_noscript -> process_in_head_noscript t token
+  | Parser_insertion_mode.After_head -> process_after_head t token
+  | Parser_insertion_mode.In_body -> process_in_body t token
+  | Parser_insertion_mode.Text -> process_text t token
+  | Parser_insertion_mode.In_table -> process_in_table t token
+  | Parser_insertion_mode.In_table_text -> process_in_table_text t token
+  | Parser_insertion_mode.In_caption -> process_in_caption t token
+  | Parser_insertion_mode.In_column_group -> process_in_column_group t token
+  | Parser_insertion_mode.In_table_body -> process_in_table_body t token
+  | Parser_insertion_mode.In_row -> process_in_row t token
+  | Parser_insertion_mode.In_cell -> process_in_cell t token
+  | Parser_insertion_mode.In_select -> process_in_select t token
+  | Parser_insertion_mode.In_select_in_table -> process_in_select_in_table t token
+  | Parser_insertion_mode.In_template -> process_in_template t token
+  | Parser_insertion_mode.After_body -> process_after_body t token
+  | Parser_insertion_mode.In_frameset -> process_in_frameset t token
+  | Parser_insertion_mode.After_frameset -> process_after_frameset t token
+  | Parser_insertion_mode.After_after_body -> process_after_after_body t token
+  | Parser_insertion_mode.After_after_frameset -> process_after_after_frameset t token

 (* Populate selectedcontent elements with content from selected option *)
 let find_elements name node =
lib/selector/dune (-4)
lib/selector/html5rw_selector.ml → lib/html5rw/selector/selector.ml
lib/selector/html5rw_selector.mli → lib/html5rw/selector/selector.mli (+2 -2)
···
     @raise Selector_error if the selector is malformed.
 *)

-val query : Html5rw_dom.node -> string -> Html5rw_dom.node list
+val query : Dom.node -> string -> Dom.node list
 (** Query the DOM tree with a CSS selector.

     Returns all nodes matching the selector in document order.
···
     ]}
 *)

-val matches : Html5rw_dom.node -> string -> bool
+val matches : Dom.node -> string -> bool
 (** Check if a node matches a CSS selector.

     @raise Selector_error if the selector is malformed.
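Hypothetical usage of the two renamed signatures ([doc] stands in for a Dom.node obtained from whatever parsing entry point the library exposes; that entry point is not part of this hunk):

  let links doc = Selector.query doc "a[href]"          (* all matching nodes, document order *)
  let is_nav_link node = Selector.matches node "nav a"  (* single-node membership test *)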
lib/selector/selector_ast.ml → lib/html5rw/selector/selector_ast.ml
lib/selector/selector_lexer.ml → lib/html5rw/selector/selector_lexer.ml
lib/selector/selector_match.ml → lib/html5rw/selector/selector_match.ml (+1 -1)
lib/selector/selector_parser.ml → lib/html5rw/selector/selector_parser.ml
lib/selector/selector_token.ml → lib/html5rw/selector/selector_token.ml
lib/tokenizer/dune (-4)
lib/tokenizer/errors.ml → lib/html5rw/tokenizer/tokenizer_errors.ml
lib/tokenizer/html5rw_tokenizer.ml (deleted, -16)
···
-(* html5rw.tokenizer - HTML5 tokenizer with bytesrw-only API *)
-
-module Token = Token
-module State = State
-module Errors = Errors
-module Stream = Stream
-
-module type SINK = Tokenizer.SINK
-
-type 'a t = 'a Tokenizer.t
-
-let create = Tokenizer.create
-let run = Tokenizer.run
-let get_errors = Tokenizer.get_errors
-let set_state = Tokenizer.set_state
-let set_last_start_tag = Tokenizer.set_last_start_tag
lib/tokenizer/html5rw_tokenizer.mli → lib/html5rw/tokenizer/tokenizer.mli (+10 -10)
···

 (** Token types produced by the tokenizer. *)
 module Token : sig
-  type tag_kind = Token.tag_kind = Start | End
+  type tag_kind = Tokenizer_token.tag_kind = Start | End

-  type doctype = Token.doctype = {
+  type doctype = Tokenizer_token.doctype = {
     name : string option;
     public_id : string option;
     system_id : string option;
     force_quirks : bool;
   }

-  type tag = Token.tag = {
+  type tag = Tokenizer_token.tag = {
     kind : tag_kind;
     name : string;
     attrs : (string * string) list;
     self_closing : bool;
   }

-  type t = Token.t =
+  type t = Tokenizer_token.t =
     | Tag of tag
     | Character of string
     | Comment of string
···

 (** Tokenizer states. *)
 module State : sig
-  type t = State.t =
+  type t = Tokenizer_state.t =
     | Data
     | Rcdata
     | Rawtext
···

 (** Parse error types. *)
 module Errors : sig
-  type t = Errors.t = {
+  type t = Tokenizer_errors.t = {
     code : string;
     line : int;
     column : int;
···

 (** Input stream with position tracking. *)
 module Stream : sig
-  type t = Stream.t
+  type t = Tokenizer_stream.t

   val create : string -> t
   val create_from_reader : Bytesrw.Bytes.Reader.t -> t
···
 *)
 module type SINK = sig
   type t
-  val process : t -> Token.t -> [ `Continue | `SwitchTo of State.t ]
+  val process : t -> Tokenizer_token.t -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
   val adjusted_current_node_in_html_namespace : t -> bool
 end

···
     function for each token until EOF is reached.
 *)

-val get_errors : 'sink t -> Errors.t list
+val get_errors : 'sink t -> Tokenizer_errors.t list
 (** Get the list of parse errors encountered during tokenization.

     Only populated if [collect_errors:true] was passed to {!create}.
 *)

-val set_state : 'sink t -> State.t -> unit
+val set_state : 'sink t -> Tokenizer_state.t -> unit
 (** Set the tokenizer state.

     Used by the tree builder to switch states for raw text elements.
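A minimal sketch of implementing SINK against the signature above, with a driver following the create/run signatures visible in the deleted Tokenizer implementation further down; all names besides the module types are illustrative:

  module Collect = struct
    type t = Tokenizer_token.t list ref
    (* Keep every token; never ask the tokenizer to switch state. *)
    let process acc tok = acc := tok :: !acc; `Continue
    (* No tree builder here: answering [true] means <![CDATA[ is handled
       as in plain HTML content rather than as foreign-content CDATA. *)
    let adjusted_current_node_in_html_namespace _ = true
  end

  (* Hypothetical driver; Bytesrw.Bytes.Reader.of_string is assumed. *)
  let tokens_of_string s =
    let acc = ref [] in
    let t = create (module Collect) acc () in
    run t (module Collect) (Bytesrw.Bytes.Reader.of_string s);
    List.rev !acc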
lib/tokenizer/state.ml → lib/html5rw/tokenizer/tokenizer_state.ml
lib/tokenizer/stream.ml → lib/html5rw/tokenizer/tokenizer_stream.ml
lib/tokenizer/token.ml → lib/html5rw/tokenizer/tokenizer_token.ml
lib/tokenizer/tokenizer.ml (deleted, -1975)
···
1
-
(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *)
2
-
3
-
(* Character classification using Astring *)
4
-
let is_ascii_alpha = Astring.Char.Ascii.is_letter
5
-
let is_ascii_digit = Astring.Char.Ascii.is_digit
6
-
let is_ascii_hex = Astring.Char.Ascii.is_hex_digit
7
-
let is_ascii_alnum = Astring.Char.Ascii.is_alphanum
8
-
let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r'
9
-
let ascii_lower = Astring.Char.Ascii.lowercase
10
-
11
-
(* Token sink interface *)
12
-
module type SINK = sig
13
-
type t
14
-
val process : t -> Token.t -> [ `Continue | `SwitchTo of State.t ]
15
-
val adjusted_current_node_in_html_namespace : t -> bool
16
-
end
17
-
18
-
type 'sink t = {
19
-
mutable stream : Stream.t;
20
-
sink : 'sink;
21
-
mutable state : State.t;
22
-
mutable return_state : State.t;
23
-
mutable char_ref_code : int;
24
-
mutable temp_buffer : Buffer.t;
25
-
mutable last_start_tag : string;
26
-
mutable current_tag_name : Buffer.t;
27
-
mutable current_tag_kind : Token.tag_kind;
28
-
mutable current_tag_self_closing : bool;
29
-
mutable current_attr_name : Buffer.t;
30
-
mutable current_attr_value : Buffer.t;
31
-
mutable current_attrs : (string * string) list;
32
-
mutable current_doctype_name : Buffer.t option;
33
-
mutable current_doctype_public : Buffer.t option;
34
-
mutable current_doctype_system : Buffer.t option;
35
-
mutable current_doctype_force_quirks : bool;
36
-
mutable current_comment : Buffer.t;
37
-
mutable pending_chars : Buffer.t;
38
-
mutable errors : Errors.t list;
39
-
collect_errors : bool;
40
-
xml_mode : bool; (* XML violation mode: transform chars for XML compatibility *)
41
-
}
42
-
43
-
let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) ?(xml_mode=false) () = {
44
-
stream = Stream.create "";
45
-
sink;
46
-
state = State.Data;
47
-
return_state = State.Data;
48
-
char_ref_code = 0;
49
-
temp_buffer = Buffer.create 64;
50
-
last_start_tag = "";
51
-
current_tag_name = Buffer.create 32;
52
-
current_tag_kind = Token.Start;
53
-
current_tag_self_closing = false;
54
-
current_attr_name = Buffer.create 32;
55
-
current_attr_value = Buffer.create 64;
56
-
current_attrs = [];
57
-
current_doctype_name = None;
58
-
current_doctype_public = None;
59
-
current_doctype_system = None;
60
-
current_doctype_force_quirks = false;
61
-
current_comment = Buffer.create 64;
62
-
pending_chars = Buffer.create 256;
63
-
errors = [];
64
-
collect_errors;
65
-
xml_mode;
66
-
}
67
-
68
-
let error t code =
69
-
if t.collect_errors then begin
70
-
let (line, column) = Stream.position t.stream in
71
-
t.errors <- Errors.make ~code ~line ~column :: t.errors
72
-
end
73
-
74
-
(* emit functions are defined locally inside run *)
75
-
76
-
(* XML mode character transformation: form feed → space *)
77
-
let emit_char t c =
78
-
if t.xml_mode && c = '\x0C' then
79
-
Buffer.add_char t.pending_chars ' '
80
-
else
81
-
Buffer.add_char t.pending_chars c
82
-
83
-
(* XML mode string transformation: U+FFFF → U+FFFD, form feed → space *)
84
-
let emit_str t s =
85
-
if t.xml_mode then begin
86
-
(* Transform: \xEF\xBF\xBF (U+FFFF) → \xEF\xBF\xBD (U+FFFD), \x0C → space *)
87
-
let len = String.length s in
88
-
let i = ref 0 in
89
-
while !i < len do
90
-
let c = s.[!i] in
91
-
if c = '\x0C' then begin
92
-
Buffer.add_char t.pending_chars ' ';
93
-
incr i
94
-
end else if c = '\xEF' && !i + 2 < len && s.[!i+1] = '\xBF' && s.[!i+2] = '\xBF' then begin
95
-
(* U+FFFF → U+FFFD *)
96
-
Buffer.add_string t.pending_chars "\xEF\xBF\xBD";
97
-
i := !i + 3
98
-
end else begin
99
-
Buffer.add_char t.pending_chars c;
100
-
incr i
101
-
end
102
-
done
103
-
end else
104
-
Buffer.add_string t.pending_chars s
105
-
106
-
let start_new_tag t kind =
107
-
Buffer.clear t.current_tag_name;
108
-
t.current_tag_kind <- kind;
109
-
t.current_tag_self_closing <- false;
110
-
t.current_attrs <- []
111
-
112
-
let start_new_attribute t =
113
-
(* Save previous attribute if any *)
114
-
let name = Buffer.contents t.current_attr_name in
115
-
if String.length name > 0 then begin
116
-
let value = Buffer.contents t.current_attr_value in
117
-
(* Check for duplicates - only add if not already present *)
118
-
if not (List.exists (fun (n, _) -> n = name) t.current_attrs) then
119
-
t.current_attrs <- (name, value) :: t.current_attrs
120
-
else
121
-
error t "duplicate-attribute"
122
-
end;
123
-
Buffer.clear t.current_attr_name;
124
-
Buffer.clear t.current_attr_value
125
-
126
-
let finish_attribute t =
127
-
start_new_attribute t
128
-
129
-
let start_new_doctype t =
130
-
t.current_doctype_name <- None;
131
-
t.current_doctype_public <- None;
132
-
t.current_doctype_system <- None;
133
-
t.current_doctype_force_quirks <- false
134
-
135
-
(* emit_current_tag, emit_current_doctype, emit_current_comment are defined locally inside run *)
136
-
137
-
let is_appropriate_end_tag t =
138
-
let name = Buffer.contents t.current_tag_name in
139
-
String.length t.last_start_tag > 0 && name = t.last_start_tag
140
-
141
-
let flush_code_points_consumed_as_char_ref t =
142
-
let s = Buffer.contents t.temp_buffer in
143
-
match t.return_state with
144
-
| State.Attribute_value_double_quoted
145
-
| State.Attribute_value_single_quoted
146
-
| State.Attribute_value_unquoted ->
147
-
Buffer.add_string t.current_attr_value s
148
-
| _ ->
149
-
emit_str t s
150
-
151
-
open Bytesrw
152
-
153
-
(* Main tokenization loop *)
154
-
let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) =
155
-
t.stream <- Stream.create_from_reader reader;
156
-
t.errors <- [];
157
-
(* Set up error callback for surrogate/noncharacter detection in stream *)
158
-
(* In XML mode, we don't report noncharacter errors - we transform them instead *)
159
-
if not t.xml_mode then
160
-
Stream.set_error_callback t.stream (fun code -> error t code);
161
-
162
-
(* XML mode transformation for pending chars: U+FFFF → U+FFFD *)
163
-
let transform_xml_chars data =
164
-
let len = String.length data in
165
-
let buf = Buffer.create len in
166
-
let i = ref 0 in
167
-
while !i < len do
168
-
let c = data.[!i] in
169
-
if c = '\xEF' && !i + 2 < len && data.[!i+1] = '\xBF' && data.[!i+2] = '\xBF' then begin
170
-
(* U+FFFF → U+FFFD *)
171
-
Buffer.add_string buf "\xEF\xBF\xBD";
172
-
i := !i + 3
173
-
end else begin
174
-
Buffer.add_char buf c;
175
-
incr i
176
-
end
177
-
done;
178
-
Buffer.contents buf
179
-
in
180
-
181
-
(* Local emit functions with access to S *)
182
-
let emit_pending_chars () =
183
-
if Buffer.length t.pending_chars > 0 then begin
184
-
let data = Buffer.contents t.pending_chars in
185
-
Buffer.clear t.pending_chars;
186
-
let data = if t.xml_mode then transform_xml_chars data else data in
187
-
ignore (S.process t.sink (Token.Character data))
188
-
end
189
-
in
190
-
191
-
let emit token =
192
-
emit_pending_chars ();
193
-
match S.process t.sink token with
194
-
| `Continue -> ()
195
-
| `SwitchTo new_state -> t.state <- new_state
196
-
in
197
-
198
-
let emit_current_tag () =
199
-
finish_attribute t;
200
-
let name = Buffer.contents t.current_tag_name in
201
-
let attrs = List.rev t.current_attrs in
202
-
(* Check for end tag with attributes or self-closing flag *)
203
-
if t.current_tag_kind = Token.End then begin
204
-
if attrs <> [] then
205
-
error t "end-tag-with-attributes";
206
-
if t.current_tag_self_closing then
207
-
error t "end-tag-with-trailing-solidus"
208
-
end;
209
-
let tag = {
210
-
Token.kind = t.current_tag_kind;
211
-
name;
212
-
attrs;
213
-
self_closing = t.current_tag_self_closing;
214
-
} in
215
-
if t.current_tag_kind = Token.Start then
216
-
t.last_start_tag <- name;
217
-
emit (Token.Tag tag)
218
-
in
219
-
220
-
let emit_current_doctype () =
221
-
let doctype = {
222
-
Token.name = Option.map Buffer.contents t.current_doctype_name;
223
-
public_id = Option.map Buffer.contents t.current_doctype_public;
224
-
system_id = Option.map Buffer.contents t.current_doctype_system;
225
-
force_quirks = t.current_doctype_force_quirks;
226
-
} in
227
-
emit (Token.Doctype doctype)
228
-
in
229
-
230
-
let emit_current_comment () =
231
-
let content = Buffer.contents t.current_comment in
232
-
let content =
233
-
if t.xml_mode then begin
234
-
(* XML mode: transform -- to - - in comments *)
235
-
let buf = Buffer.create (String.length content + 10) in
236
-
let len = String.length content in
237
-
let i = ref 0 in
238
-
while !i < len do
239
-
if !i + 1 < len && content.[!i] = '-' && content.[!i+1] = '-' then begin
240
-
Buffer.add_string buf "- -";
241
-
i := !i + 2
242
-
end else begin
243
-
Buffer.add_char buf content.[!i];
244
-
incr i
245
-
end
246
-
done;
247
-
Buffer.contents buf
248
-
end else content
249
-
in
250
-
emit (Token.Comment content)
251
-
in
252
-
253
-
(* Check for control characters and emit error if needed *)
254
-
(* Only checks ASCII control chars; C1 controls (U+0080-U+009F) are 2-byte in UTF-8 *)
255
-
let check_control_char c =
256
-
let code = Char.code c in
257
-
(* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F *)
258
-
(* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *)
259
-
(* Note: U+0080-U+009F (C1 controls) are 2-byte UTF-8 sequences starting with 0xC2 *)
260
-
(* Note: We only check single-byte control chars here; multi-byte checks are TODO *)
261
-
if (code >= 0x01 && code <= 0x08) ||
262
-
code = 0x0B ||
263
-
(code >= 0x0E && code <= 0x1F) ||
264
-
code = 0x7F then
265
-
error t "control-character-in-input-stream"
266
-
in
267
-
268
-
269
-
(* Emit char with control character check *)
270
-
let emit_char_checked c =
271
-
check_control_char c;
272
-
emit_char t c
273
-
in
274
-
275
-
let rec process_state () =
276
-
if Stream.is_eof t.stream && t.state <> State.Data then begin
277
-
(* Handle EOF in various states *)
278
-
handle_eof ()
279
-
end else if Stream.is_eof t.stream then begin
280
-
emit_pending_chars ();
281
-
ignore (S.process t.sink Token.EOF)
282
-
end else begin
283
-
step ();
284
-
process_state ()
285
-
end
286
-
287
-
  and handle_eof () =
    match t.state with
    | State.Data ->
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Tag_open ->
      error t "eof-before-tag-name";
      emit_char t '<';
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.End_tag_open ->
      error t "eof-before-tag-name";
      emit_str t "</";
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Tag_name
    | State.Before_attribute_name
    | State.Attribute_name
    | State.After_attribute_name
    | State.Before_attribute_value
    | State.Attribute_value_double_quoted
    | State.Attribute_value_single_quoted
    | State.Attribute_value_unquoted
    | State.After_attribute_value_quoted
    | State.Self_closing_start_tag ->
      error t "eof-in-tag";
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Rawtext ->
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Rawtext_less_than_sign ->
      emit_char t '<';
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Rawtext_end_tag_open ->
      emit_str t "</";
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Rawtext_end_tag_name ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Rcdata ->
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Rcdata_less_than_sign ->
      emit_char t '<';
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Rcdata_end_tag_open ->
      emit_str t "</";
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Rcdata_end_tag_name ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Script_data ->
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Script_data_less_than_sign ->
      emit_char t '<';
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Script_data_end_tag_open ->
      emit_str t "</";
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Script_data_end_tag_name ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Script_data_escape_start
    | State.Script_data_escape_start_dash
    | State.Script_data_escaped
    | State.Script_data_escaped_dash
    | State.Script_data_escaped_dash_dash ->
      error t "eof-in-script-html-comment-like-text";
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Script_data_escaped_less_than_sign ->
      emit_char t '<';
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Script_data_escaped_end_tag_open ->
      emit_str t "</";
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Script_data_escaped_end_tag_name ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Script_data_double_escape_start
    | State.Script_data_double_escaped
    | State.Script_data_double_escaped_dash
    | State.Script_data_double_escaped_dash_dash ->
      error t "eof-in-script-html-comment-like-text";
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Script_data_double_escaped_less_than_sign ->
      (* '<' was already emitted when entering this state from Script_data_double_escaped *)
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Script_data_double_escape_end ->
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Plaintext ->
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Comment_start
    | State.Comment_start_dash
    | State.Comment
    | State.Comment_less_than_sign
    | State.Comment_less_than_sign_bang
    | State.Comment_less_than_sign_bang_dash
    | State.Comment_less_than_sign_bang_dash_dash
    | State.Comment_end_dash
    | State.Comment_end
    | State.Comment_end_bang ->
      error t "eof-in-comment";
      emit_current_comment ();
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Bogus_comment ->
      emit_current_comment ();
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Markup_declaration_open ->
      error t "incorrectly-opened-comment";
      Buffer.clear t.current_comment;
      emit_current_comment ();
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Doctype
    | State.Before_doctype_name ->
      error t "eof-in-doctype";
      start_new_doctype t;
      t.current_doctype_force_quirks <- true;
      emit_current_doctype ();
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Doctype_name
    | State.After_doctype_name
    | State.After_doctype_public_keyword
    | State.Before_doctype_public_identifier
    | State.Doctype_public_identifier_double_quoted
    | State.Doctype_public_identifier_single_quoted
    | State.After_doctype_public_identifier
    | State.Between_doctype_public_and_system_identifiers
    | State.After_doctype_system_keyword
    | State.Before_doctype_system_identifier
    | State.Doctype_system_identifier_double_quoted
    | State.Doctype_system_identifier_single_quoted
    | State.After_doctype_system_identifier ->
      error t "eof-in-doctype";
      t.current_doctype_force_quirks <- true;
      emit_current_doctype ();
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Bogus_doctype ->
      emit_current_doctype ();
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Cdata_section ->
      error t "eof-in-cdata";
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Cdata_section_bracket ->
      error t "eof-in-cdata";
      emit_char t ']';
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Cdata_section_end ->
      error t "eof-in-cdata";
      emit_str t "]]";
      emit_pending_chars ();
      ignore (S.process t.sink Token.EOF)
    | State.Character_reference ->
      (* state_character_reference never ran, so initialize temp_buffer with & *)
      Buffer.clear t.temp_buffer;
      Buffer.add_char t.temp_buffer '&';
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state;
      handle_eof ()
    | State.Named_character_reference ->
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state;
      handle_eof ()
    | State.Numeric_character_reference ->
      (* At EOF with just "&#" - no digits follow *)
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state;
      handle_eof ()
    | State.Hexadecimal_character_reference_start
    | State.Decimal_character_reference_start ->
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state;
      handle_eof ()
    | State.Numeric_character_reference_end ->
      (* We have collected digits, just need to finalize the character reference *)
      step ();
      handle_eof ()
    | State.Ambiguous_ampersand ->
      (* Buffer was already flushed when entering this state, just transition *)
      t.state <- t.return_state;
      handle_eof ()
    | State.Hexadecimal_character_reference
    | State.Decimal_character_reference ->
      (* At EOF with collected digits - convert the numeric reference *)
      error t "missing-semicolon-after-character-reference";
      let code = t.char_ref_code in
      let replacement_char = "\xEF\xBF\xBD" in
      let result =
        if code = 0 then begin
          error t "null-character-reference";
          replacement_char
        end else if code > 0x10FFFF then begin
          error t "character-reference-outside-unicode-range";
          replacement_char
        end else if code >= 0xD800 && code <= 0xDFFF then begin
          error t "surrogate-character-reference";
          replacement_char
        end else
          Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
      in
      Buffer.clear t.temp_buffer;
      Buffer.add_string t.temp_buffer result;
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state;
      handle_eof ()

  and step () =
    match t.state with
    | State.Data -> state_data ()
    | State.Rcdata -> state_rcdata ()
    | State.Rawtext -> state_rawtext ()
    | State.Script_data -> state_script_data ()
    | State.Plaintext -> state_plaintext ()
    | State.Tag_open -> state_tag_open ()
    | State.End_tag_open -> state_end_tag_open ()
    | State.Tag_name -> state_tag_name ()
    | State.Rcdata_less_than_sign -> state_rcdata_less_than_sign ()
    | State.Rcdata_end_tag_open -> state_rcdata_end_tag_open ()
    | State.Rcdata_end_tag_name -> state_rcdata_end_tag_name ()
    | State.Rawtext_less_than_sign -> state_rawtext_less_than_sign ()
    | State.Rawtext_end_tag_open -> state_rawtext_end_tag_open ()
    | State.Rawtext_end_tag_name -> state_rawtext_end_tag_name ()
    | State.Script_data_less_than_sign -> state_script_data_less_than_sign ()
    | State.Script_data_end_tag_open -> state_script_data_end_tag_open ()
    | State.Script_data_end_tag_name -> state_script_data_end_tag_name ()
    | State.Script_data_escape_start -> state_script_data_escape_start ()
    | State.Script_data_escape_start_dash -> state_script_data_escape_start_dash ()
    | State.Script_data_escaped -> state_script_data_escaped ()
    | State.Script_data_escaped_dash -> state_script_data_escaped_dash ()
    | State.Script_data_escaped_dash_dash -> state_script_data_escaped_dash_dash ()
    | State.Script_data_escaped_less_than_sign -> state_script_data_escaped_less_than_sign ()
    | State.Script_data_escaped_end_tag_open -> state_script_data_escaped_end_tag_open ()
    | State.Script_data_escaped_end_tag_name -> state_script_data_escaped_end_tag_name ()
    | State.Script_data_double_escape_start -> state_script_data_double_escape_start ()
    | State.Script_data_double_escaped -> state_script_data_double_escaped ()
    | State.Script_data_double_escaped_dash -> state_script_data_double_escaped_dash ()
    | State.Script_data_double_escaped_dash_dash -> state_script_data_double_escaped_dash_dash ()
    | State.Script_data_double_escaped_less_than_sign -> state_script_data_double_escaped_less_than_sign ()
    | State.Script_data_double_escape_end -> state_script_data_double_escape_end ()
    | State.Before_attribute_name -> state_before_attribute_name ()
    | State.Attribute_name -> state_attribute_name ()
    | State.After_attribute_name -> state_after_attribute_name ()
    | State.Before_attribute_value -> state_before_attribute_value ()
    | State.Attribute_value_double_quoted -> state_attribute_value_double_quoted ()
    | State.Attribute_value_single_quoted -> state_attribute_value_single_quoted ()
    | State.Attribute_value_unquoted -> state_attribute_value_unquoted ()
    | State.After_attribute_value_quoted -> state_after_attribute_value_quoted ()
    | State.Self_closing_start_tag -> state_self_closing_start_tag ()
    | State.Bogus_comment -> state_bogus_comment ()
    | State.Markup_declaration_open -> state_markup_declaration_open ()
    | State.Comment_start -> state_comment_start ()
    | State.Comment_start_dash -> state_comment_start_dash ()
    | State.Comment -> state_comment ()
    | State.Comment_less_than_sign -> state_comment_less_than_sign ()
    | State.Comment_less_than_sign_bang -> state_comment_less_than_sign_bang ()
    | State.Comment_less_than_sign_bang_dash -> state_comment_less_than_sign_bang_dash ()
    | State.Comment_less_than_sign_bang_dash_dash -> state_comment_less_than_sign_bang_dash_dash ()
    | State.Comment_end_dash -> state_comment_end_dash ()
    | State.Comment_end -> state_comment_end ()
    | State.Comment_end_bang -> state_comment_end_bang ()
    | State.Doctype -> state_doctype ()
    | State.Before_doctype_name -> state_before_doctype_name ()
    | State.Doctype_name -> state_doctype_name ()
    | State.After_doctype_name -> state_after_doctype_name ()
    | State.After_doctype_public_keyword -> state_after_doctype_public_keyword ()
    | State.Before_doctype_public_identifier -> state_before_doctype_public_identifier ()
    | State.Doctype_public_identifier_double_quoted -> state_doctype_public_identifier_double_quoted ()
    | State.Doctype_public_identifier_single_quoted -> state_doctype_public_identifier_single_quoted ()
    | State.After_doctype_public_identifier -> state_after_doctype_public_identifier ()
    | State.Between_doctype_public_and_system_identifiers -> state_between_doctype_public_and_system_identifiers ()
    | State.After_doctype_system_keyword -> state_after_doctype_system_keyword ()
    | State.Before_doctype_system_identifier -> state_before_doctype_system_identifier ()
    | State.Doctype_system_identifier_double_quoted -> state_doctype_system_identifier_double_quoted ()
    | State.Doctype_system_identifier_single_quoted -> state_doctype_system_identifier_single_quoted ()
    | State.After_doctype_system_identifier -> state_after_doctype_system_identifier ()
    | State.Bogus_doctype -> state_bogus_doctype ()
    | State.Cdata_section -> state_cdata_section ()
    | State.Cdata_section_bracket -> state_cdata_section_bracket ()
    | State.Cdata_section_end -> state_cdata_section_end ()
    | State.Character_reference -> state_character_reference ()
    | State.Named_character_reference -> state_named_character_reference ()
    | State.Ambiguous_ampersand -> state_ambiguous_ampersand ()
    | State.Numeric_character_reference -> state_numeric_character_reference ()
    | State.Hexadecimal_character_reference_start -> state_hexadecimal_character_reference_start ()
    | State.Decimal_character_reference_start -> state_decimal_character_reference_start ()
    | State.Hexadecimal_character_reference -> state_hexadecimal_character_reference ()
    | State.Decimal_character_reference -> state_decimal_character_reference ()
    | State.Numeric_character_reference_end -> state_numeric_character_reference_end ()

  (* State implementations *)
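  (* Each state_* function below implements the WHATWG tokenization state of
     the same name (HTML Standard, section 13.2.5). [Stream.consume] takes the
     next input character outright; states that would "reconsume" a character
     use [Stream.peek] and only [Stream.advance] once they have decided. *)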
  and state_data () =
    match Stream.consume t.stream with
    | Some '&' ->
      t.return_state <- State.Data;
      t.state <- State.Character_reference
    | Some '<' ->
      t.state <- State.Tag_open
    | Some '\x00' ->
      (* Emit pending chars first, then emit null separately for proper tree builder handling *)
      emit_pending_chars ();
      error t "unexpected-null-character";
      ignore (S.process t.sink (Token.Character "\x00"))
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_rcdata () =
    match Stream.consume t.stream with
    | Some '&' ->
      t.return_state <- State.Rcdata;
      t.state <- State.Character_reference
    | Some '<' ->
      t.state <- State.Rcdata_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_rawtext () =
    match Stream.consume t.stream with
    | Some '<' ->
      t.state <- State.Rawtext_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_script_data () =
    match Stream.consume t.stream with
    | Some '<' ->
      t.state <- State.Script_data_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_plaintext () =
    match Stream.consume t.stream with
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_tag_open () =
    match Stream.peek t.stream with
    | Some '!' ->
      Stream.advance t.stream;
      t.state <- State.Markup_declaration_open
    | Some '/' ->
      Stream.advance t.stream;
      t.state <- State.End_tag_open
    | Some c when is_ascii_alpha c ->
      start_new_tag t Token.Start;
      t.state <- State.Tag_name
    | Some '?' ->
      error t "unexpected-question-mark-instead-of-tag-name";
      Buffer.clear t.current_comment;
      t.state <- State.Bogus_comment
    | None ->
      error t "eof-before-tag-name";
      emit_char t '<'
    | Some _ ->
      error t "invalid-first-character-of-tag-name";
      emit_char t '<';
      t.state <- State.Data

  and state_end_tag_open () =
    match Stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Token.End;
      t.state <- State.Tag_name
    | Some '>' ->
      Stream.advance t.stream;
      error t "missing-end-tag-name";
      t.state <- State.Data
    | None ->
      error t "eof-before-tag-name";
      emit_str t "</"
    | Some _ ->
      error t "invalid-first-character-of-tag-name";
      Buffer.clear t.current_comment;
      t.state <- State.Bogus_comment

  and state_tag_name () =
    match Stream.consume t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      t.state <- State.Before_attribute_name
    | Some '/' ->
      t.state <- State.Self_closing_start_tag
    | Some '>' ->
      t.state <- State.Data;
      emit_current_tag ()
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_tag_name (ascii_lower c)
    | None -> ()

  and state_rcdata_less_than_sign () =
    match Stream.peek t.stream with
    | Some '/' ->
      Stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- State.Rcdata_end_tag_open
    | _ ->
      emit_char t '<';
      t.state <- State.Rcdata

  and state_rcdata_end_tag_open () =
    match Stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Token.End;
      t.state <- State.Rcdata_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- State.Rcdata

  and state_rcdata_end_tag_name () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Stream.advance t.stream;
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- State.Rcdata

  and state_rawtext_less_than_sign () =
    match Stream.peek t.stream with
    | Some '/' ->
      Stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- State.Rawtext_end_tag_open
    | _ ->
      emit_char t '<';
      t.state <- State.Rawtext

  and state_rawtext_end_tag_open () =
    match Stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Token.End;
      t.state <- State.Rawtext_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- State.Rawtext

  and state_rawtext_end_tag_name () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Stream.advance t.stream;
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- State.Rawtext

  and state_script_data_less_than_sign () =
    match Stream.peek t.stream with
    | Some '/' ->
      Stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- State.Script_data_end_tag_open
    | Some '!' ->
      Stream.advance t.stream;
      t.state <- State.Script_data_escape_start;
      emit_str t "<!"
    | _ ->
      emit_char t '<';
      t.state <- State.Script_data

  and state_script_data_end_tag_open () =
    match Stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Token.End;
      t.state <- State.Script_data_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- State.Script_data

  and state_script_data_end_tag_name () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Stream.advance t.stream;
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- State.Script_data

  and state_script_data_escape_start () =
    match Stream.peek t.stream with
    | Some '-' ->
      Stream.advance t.stream;
      t.state <- State.Script_data_escape_start_dash;
      emit_char t '-'
    | _ ->
      t.state <- State.Script_data

  and state_script_data_escape_start_dash () =
    match Stream.peek t.stream with
    | Some '-' ->
      Stream.advance t.stream;
      t.state <- State.Script_data_escaped_dash_dash;
      emit_char t '-'
    | _ ->
      t.state <- State.Script_data

  and state_script_data_escaped () =
    match Stream.consume t.stream with
    | Some '-' ->
      t.state <- State.Script_data_escaped_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- State.Script_data_escaped_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_script_data_escaped_dash () =
    match Stream.consume t.stream with
    | Some '-' ->
      t.state <- State.Script_data_escaped_dash_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- State.Script_data_escaped_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- State.Script_data_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- State.Script_data_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_escaped_dash_dash () =
    match Stream.consume t.stream with
    | Some '-' ->
      emit_char t '-'
    | Some '<' ->
      t.state <- State.Script_data_escaped_less_than_sign
    | Some '>' ->
      t.state <- State.Script_data;
      emit_char t '>'
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- State.Script_data_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- State.Script_data_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_escaped_less_than_sign () =
    match Stream.peek t.stream with
    | Some '/' ->
      Stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- State.Script_data_escaped_end_tag_open
    | Some c when is_ascii_alpha c ->
      Buffer.clear t.temp_buffer;
      emit_char t '<';
      t.state <- State.Script_data_double_escape_start
    | _ ->
      emit_char t '<';
      t.state <- State.Script_data_escaped

  and state_script_data_escaped_end_tag_open () =
    match Stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Token.End;
      t.state <- State.Script_data_escaped_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- State.Script_data_escaped

  and state_script_data_escaped_end_tag_name () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Stream.advance t.stream;
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- State.Script_data_escaped

  and state_script_data_double_escape_start () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt ->
      Stream.advance t.stream;
      let c = Option.get c_opt in
      if Buffer.contents t.temp_buffer = "script" then
        t.state <- State.Script_data_double_escaped
      else
        t.state <- State.Script_data_escaped;
      emit_char t c
    | Some c when is_ascii_alpha c ->
      Stream.advance t.stream;
      Buffer.add_char t.temp_buffer (ascii_lower c);
      emit_char t c
    | _ ->
      t.state <- State.Script_data_escaped

  and state_script_data_double_escaped () =
    match Stream.consume t.stream with
    | Some '-' ->
      t.state <- State.Script_data_double_escaped_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- State.Script_data_double_escaped_less_than_sign;
      emit_char t '<'
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_script_data_double_escaped_dash () =
    match Stream.consume t.stream with
    | Some '-' ->
      t.state <- State.Script_data_double_escaped_dash_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- State.Script_data_double_escaped_less_than_sign;
      emit_char t '<'
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- State.Script_data_double_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- State.Script_data_double_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_double_escaped_dash_dash () =
    match Stream.consume t.stream with
    | Some '-' ->
      emit_char t '-'
    | Some '<' ->
      t.state <- State.Script_data_double_escaped_less_than_sign;
      emit_char t '<'
    | Some '>' ->
      t.state <- State.Script_data;
      emit_char t '>'
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- State.Script_data_double_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- State.Script_data_double_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_double_escaped_less_than_sign () =
    match Stream.peek t.stream with
    | Some '/' ->
      Stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- State.Script_data_double_escape_end;
      emit_char t '/'
    | _ ->
      t.state <- State.Script_data_double_escaped

  and state_script_data_double_escape_end () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt ->
      Stream.advance t.stream;
      let c = Option.get c_opt in
      if Buffer.contents t.temp_buffer = "script" then
        t.state <- State.Script_data_escaped
      else
        t.state <- State.Script_data_double_escaped;
      emit_char t c
    | Some c when is_ascii_alpha c ->
      Stream.advance t.stream;
      Buffer.add_char t.temp_buffer (ascii_lower c);
      emit_char t c
    | _ ->
      t.state <- State.Script_data_double_escaped

  and state_before_attribute_name () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream
    | Some '/' | Some '>' | None ->
      t.state <- State.After_attribute_name
    | Some '=' ->
      Stream.advance t.stream;
      error t "unexpected-equals-sign-before-attribute-name";
      start_new_attribute t;
      Buffer.add_char t.current_attr_name '=';
      t.state <- State.Attribute_name
    | Some _ ->
      start_new_attribute t;
      t.state <- State.Attribute_name

  and state_attribute_name () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream;
      t.state <- State.After_attribute_name
    | Some '/' | Some '>' | None ->
      t.state <- State.After_attribute_name
    | Some '=' ->
      Stream.advance t.stream;
      t.state <- State.Before_attribute_value
    | Some '\x00' ->
      Stream.advance t.stream;
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_name "\xEF\xBF\xBD"
    | Some ('"' | '\'' | '<') as c_opt ->
      Stream.advance t.stream;
      error t "unexpected-character-in-attribute-name";
      Buffer.add_char t.current_attr_name (Option.get c_opt)
    | Some c ->
      Stream.advance t.stream;
      check_control_char c;
      Buffer.add_char t.current_attr_name (ascii_lower c)

  and state_after_attribute_name () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream
    | Some '/' ->
      Stream.advance t.stream;
      t.state <- State.Self_closing_start_tag
    | Some '=' ->
      Stream.advance t.stream;
      t.state <- State.Before_attribute_value
    | Some '>' ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_tag ()
    | None -> ()
    | Some _ ->
      start_new_attribute t;
      t.state <- State.Attribute_name

  and state_before_attribute_value () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream
    | Some '"' ->
      Stream.advance t.stream;
      t.state <- State.Attribute_value_double_quoted
    | Some '\'' ->
      Stream.advance t.stream;
      t.state <- State.Attribute_value_single_quoted
    | Some '>' ->
      Stream.advance t.stream;
      error t "missing-attribute-value";
      t.state <- State.Data;
      emit_current_tag ()
    | _ ->
      t.state <- State.Attribute_value_unquoted

  and state_attribute_value_double_quoted () =
    match Stream.consume t.stream with
    | Some '"' ->
      t.state <- State.After_attribute_value_quoted
    | Some '&' ->
      t.return_state <- State.Attribute_value_double_quoted;
      t.state <- State.Character_reference
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_attr_value c
    | None -> ()

  and state_attribute_value_single_quoted () =
    match Stream.consume t.stream with
    | Some '\'' ->
      t.state <- State.After_attribute_value_quoted
    | Some '&' ->
      t.return_state <- State.Attribute_value_single_quoted;
      t.state <- State.Character_reference
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_attr_value c
    | None -> ()

  and state_attribute_value_unquoted () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream;
      t.state <- State.Before_attribute_name
    | Some '&' ->
      Stream.advance t.stream;
      t.return_state <- State.Attribute_value_unquoted;
      t.state <- State.Character_reference
    | Some '>' ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_tag ()
    | Some '\x00' ->
      Stream.advance t.stream;
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
    | Some ('"' | '\'' | '<' | '=' | '`') as c_opt ->
      Stream.advance t.stream;
      error t "unexpected-character-in-unquoted-attribute-value";
      Buffer.add_char t.current_attr_value (Option.get c_opt)
    | Some c ->
      Stream.advance t.stream;
      check_control_char c;
      Buffer.add_char t.current_attr_value c
    | None -> ()

  and state_after_attribute_value_quoted () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream;
      t.state <- State.Before_attribute_name
    | Some '/' ->
      Stream.advance t.stream;
      t.state <- State.Self_closing_start_tag
    | Some '>' ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_tag ()
    | None -> ()
    | Some _ ->
      error t "missing-whitespace-between-attributes";
      t.state <- State.Before_attribute_name

  and state_self_closing_start_tag () =
    match Stream.peek t.stream with
    | Some '>' ->
      Stream.advance t.stream;
      t.current_tag_self_closing <- true;
      t.state <- State.Data;
      emit_current_tag ()
    | None -> ()
    | Some _ ->
      error t "unexpected-solidus-in-tag";
      t.state <- State.Before_attribute_name

  and state_bogus_comment () =
    match Stream.consume t.stream with
    | Some '>' ->
      t.state <- State.Data;
      emit_current_comment ()
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_comment "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_comment c
    | None -> ()

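  (* After "<!": dispatch on case-insensitive lookahead. "--" opens a comment,
     "DOCTYPE" a doctype, and "[CDATA[" a CDATA section (accepted only in
     foreign content); anything else is an incorrectly opened comment that
     falls back to the bogus-comment state. *)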
  and state_markup_declaration_open () =
    if Stream.matches_ci t.stream "--" then begin
      ignore (Stream.consume_exact_ci t.stream "--");
      Buffer.clear t.current_comment;
      t.state <- State.Comment_start
    end else if Stream.matches_ci t.stream "DOCTYPE" then begin
      ignore (Stream.consume_exact_ci t.stream "DOCTYPE");
      t.state <- State.Doctype
    end else if Stream.matches_ci t.stream "[CDATA[" then begin
      ignore (Stream.consume_exact_ci t.stream "[CDATA[");
      (* CDATA only allowed in foreign content *)
      if S.adjusted_current_node_in_html_namespace t.sink then begin
        error t "cdata-in-html-content";
        Buffer.clear t.current_comment;
        Buffer.add_string t.current_comment "[CDATA[";
        t.state <- State.Bogus_comment
      end else
        t.state <- State.Cdata_section
    end else begin
      error t "incorrectly-opened-comment";
      Buffer.clear t.current_comment;
      t.state <- State.Bogus_comment
    end

  and state_comment_start () =
    match Stream.peek t.stream with
    | Some '-' ->
      Stream.advance t.stream;
      t.state <- State.Comment_start_dash
    | Some '>' ->
      Stream.advance t.stream;
      error t "abrupt-closing-of-empty-comment";
      t.state <- State.Data;
      emit_current_comment ()
    | _ ->
      t.state <- State.Comment

  and state_comment_start_dash () =
    match Stream.peek t.stream with
    | Some '-' ->
      Stream.advance t.stream;
      t.state <- State.Comment_end
    | Some '>' ->
      Stream.advance t.stream;
      error t "abrupt-closing-of-empty-comment";
      t.state <- State.Data;
      emit_current_comment ()
    | None -> ()
    | Some _ ->
      Buffer.add_char t.current_comment '-';
      t.state <- State.Comment

  and state_comment () =
    match Stream.consume t.stream with
    | Some '<' ->
      Buffer.add_char t.current_comment '<';
      t.state <- State.Comment_less_than_sign
    | Some '-' ->
      t.state <- State.Comment_end_dash
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_comment "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_comment c
    | None -> ()

  and state_comment_less_than_sign () =
    match Stream.peek t.stream with
    | Some '!' ->
      Stream.advance t.stream;
      Buffer.add_char t.current_comment '!';
      t.state <- State.Comment_less_than_sign_bang
    | Some '<' ->
      Stream.advance t.stream;
      Buffer.add_char t.current_comment '<'
    | _ ->
      t.state <- State.Comment

  and state_comment_less_than_sign_bang () =
    match Stream.peek t.stream with
    | Some '-' ->
      Stream.advance t.stream;
      t.state <- State.Comment_less_than_sign_bang_dash
    | _ ->
      t.state <- State.Comment

  and state_comment_less_than_sign_bang_dash () =
    match Stream.peek t.stream with
    | Some '-' ->
      Stream.advance t.stream;
      t.state <- State.Comment_less_than_sign_bang_dash_dash
    | _ ->
      t.state <- State.Comment_end_dash

  and state_comment_less_than_sign_bang_dash_dash () =
    match Stream.peek t.stream with
    | Some '>' | None ->
      t.state <- State.Comment_end
    | Some _ ->
      error t "nested-comment";
      t.state <- State.Comment_end

  and state_comment_end_dash () =
    match Stream.peek t.stream with
    | Some '-' ->
      Stream.advance t.stream;
      t.state <- State.Comment_end
    | None -> ()
    | Some _ ->
      Buffer.add_char t.current_comment '-';
      t.state <- State.Comment

  and state_comment_end () =
    match Stream.peek t.stream with
    | Some '>' ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_comment ()
    | Some '!' ->
      Stream.advance t.stream;
      t.state <- State.Comment_end_bang
    | Some '-' ->
      Stream.advance t.stream;
      Buffer.add_char t.current_comment '-'
    | None -> ()
    | Some _ ->
      Buffer.add_string t.current_comment "--";
      t.state <- State.Comment

  and state_comment_end_bang () =
    match Stream.peek t.stream with
    | Some '-' ->
      Stream.advance t.stream;
      Buffer.add_string t.current_comment "--!";
      t.state <- State.Comment_end_dash
    | Some '>' ->
      Stream.advance t.stream;
      error t "incorrectly-closed-comment";
      t.state <- State.Data;
      emit_current_comment ()
    | None -> ()
    | Some _ ->
      Buffer.add_string t.current_comment "--!";
      t.state <- State.Comment

  and state_doctype () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream;
      t.state <- State.Before_doctype_name
    | Some '>' ->
      t.state <- State.Before_doctype_name
    | None -> ()
    | Some _ ->
      error t "missing-whitespace-before-doctype-name";
      t.state <- State.Before_doctype_name

  and state_before_doctype_name () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream
    | Some '\x00' ->
      Stream.advance t.stream;
      error t "unexpected-null-character";
      start_new_doctype t;
      t.current_doctype_name <- Some (Buffer.create 8);
      Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD";
      t.state <- State.Doctype_name
    | Some '>' ->
      Stream.advance t.stream;
      error t "missing-doctype-name";
      start_new_doctype t;
      t.current_doctype_force_quirks <- true;
      t.state <- State.Data;
      emit_current_doctype ()
    | None -> ()
    | Some c ->
      Stream.advance t.stream;
      check_control_char c;
      start_new_doctype t;
      t.current_doctype_name <- Some (Buffer.create 8);
      Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c);
      t.state <- State.Doctype_name

  and state_doctype_name () =
    match Stream.consume t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      t.state <- State.After_doctype_name
    | Some '>' ->
      t.state <- State.Data;
      emit_current_doctype ()
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c)
    | None -> ()

  and state_after_doctype_name () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream
    | Some '>' ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      if Stream.matches_ci t.stream "PUBLIC" then begin
        ignore (Stream.consume_exact_ci t.stream "PUBLIC");
        t.state <- State.After_doctype_public_keyword
      end else if Stream.matches_ci t.stream "SYSTEM" then begin
        ignore (Stream.consume_exact_ci t.stream "SYSTEM");
        t.state <- State.After_doctype_system_keyword
      end else begin
        error t "invalid-character-sequence-after-doctype-name";
        t.current_doctype_force_quirks <- true;
        t.state <- State.Bogus_doctype
      end

  and state_after_doctype_public_keyword () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream;
      t.state <- State.Before_doctype_public_identifier
    | Some '"' ->
      Stream.advance t.stream;
      error t "missing-whitespace-after-doctype-public-keyword";
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- State.Doctype_public_identifier_double_quoted
    | Some '\'' ->
      Stream.advance t.stream;
      error t "missing-whitespace-after-doctype-public-keyword";
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- State.Doctype_public_identifier_single_quoted
    | Some '>' ->
      Stream.advance t.stream;
      error t "missing-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Bogus_doctype

  and state_before_doctype_public_identifier () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream
    | Some '"' ->
      Stream.advance t.stream;
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- State.Doctype_public_identifier_double_quoted
    | Some '\'' ->
      Stream.advance t.stream;
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- State.Doctype_public_identifier_single_quoted
    | Some '>' ->
      Stream.advance t.stream;
      error t "missing-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      error t "missing-quote-before-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Bogus_doctype

  and state_doctype_public_identifier_double_quoted () =
    match Stream.consume t.stream with
    | Some '"' ->
      t.state <- State.After_doctype_public_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_public) c
    | None -> ()

  and state_doctype_public_identifier_single_quoted () =
    match Stream.consume t.stream with
    | Some '\'' ->
      t.state <- State.After_doctype_public_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_public) c
    | None -> ()

  and state_after_doctype_public_identifier () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream;
      t.state <- State.Between_doctype_public_and_system_identifiers
    | Some '>' ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_doctype ()
    | Some '"' ->
      Stream.advance t.stream;
      error t "missing-whitespace-between-doctype-public-and-system-identifiers";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- State.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Stream.advance t.stream;
      error t "missing-whitespace-between-doctype-public-and-system-identifiers";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- State.Doctype_system_identifier_single_quoted
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Bogus_doctype

  and state_between_doctype_public_and_system_identifiers () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream
    | Some '>' ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_doctype ()
    | Some '"' ->
      Stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- State.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- State.Doctype_system_identifier_single_quoted
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Bogus_doctype

  and state_after_doctype_system_keyword () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream;
      t.state <- State.Before_doctype_system_identifier
    | Some '"' ->
      Stream.advance t.stream;
      error t "missing-whitespace-after-doctype-system-keyword";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- State.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Stream.advance t.stream;
      error t "missing-whitespace-after-doctype-system-keyword";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- State.Doctype_system_identifier_single_quoted
    | Some '>' ->
      Stream.advance t.stream;
      error t "missing-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Bogus_doctype

  and state_before_doctype_system_identifier () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream
    | Some '"' ->
      Stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- State.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- State.Doctype_system_identifier_single_quoted
    | Some '>' ->
      Stream.advance t.stream;
      error t "missing-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Bogus_doctype

  and state_doctype_system_identifier_double_quoted () =
    match Stream.consume t.stream with
    | Some '"' ->
      t.state <- State.After_doctype_system_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_system) c
    | None -> ()

  and state_doctype_system_identifier_single_quoted () =
    match Stream.consume t.stream with
    | Some '\'' ->
      t.state <- State.After_doctype_system_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- State.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_system) c
    | None -> ()

  and state_after_doctype_system_identifier () =
    match Stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Stream.advance t.stream
    | Some '>' ->
      Stream.advance t.stream;
      t.state <- State.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "unexpected-character-after-doctype-system-identifier";
      t.state <- State.Bogus_doctype

  and state_bogus_doctype () =
    match Stream.consume t.stream with
    | Some '>' ->
      t.state <- State.Data;
      emit_current_doctype ()
    | Some '\x00' ->
      error t "unexpected-null-character"
    | Some c ->
      check_control_char c (* Check all chars in bogus doctype *)
    | None -> ()

  and state_cdata_section () =
    match Stream.consume t.stream with
    | Some ']' ->
      t.state <- State.Cdata_section_bracket
    | Some c ->
      (* CDATA section emits all characters as-is, including NUL, but still check for control chars *)
      emit_char_checked c
    | None -> ()

  and state_cdata_section_bracket () =
    match Stream.peek t.stream with
    | Some ']' ->
      Stream.advance t.stream;
      t.state <- State.Cdata_section_end
    | _ ->
      emit_char t ']';
      t.state <- State.Cdata_section

  and state_cdata_section_end () =
    match Stream.peek t.stream with
    | Some ']' ->
      Stream.advance t.stream;
      emit_char t ']'
    | Some '>' ->
      Stream.advance t.stream;
      t.state <- State.Data
    | _ ->
      emit_str t "]]";
      t.state <- State.Cdata_section

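  (* Character-reference sub-machine. [t.temp_buffer] accumulates the raw
     "&..." characters as they are consumed. [flush_code_points_consumed_as_char_ref]
     performs the spec's "flush code points consumed as a character reference"
     operation: buffered characters go to the current attribute value when the
     reference was started inside one (per [t.return_state]), and to the
     character output otherwise. *)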
  and state_character_reference () =
    Buffer.clear t.temp_buffer;
    Buffer.add_char t.temp_buffer '&';
    match Stream.peek t.stream with
    | Some c when is_ascii_alnum c ->
      t.state <- State.Named_character_reference
    | Some '#' ->
      Stream.advance t.stream;
      Buffer.add_char t.temp_buffer '#';
      t.state <- State.Numeric_character_reference
    | _ ->
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

  and state_named_character_reference () =
    (* Collect alphanumeric characters *)
    let rec collect () =
      match Stream.peek t.stream with
      | Some c when is_ascii_alnum c ->
        Stream.advance t.stream;
        Buffer.add_char t.temp_buffer c;
        collect ()
      | _ -> ()
    in
    collect ();

    let has_semicolon =
      match Stream.peek t.stream with
      | Some ';' -> Stream.advance t.stream; Buffer.add_char t.temp_buffer ';'; true
      | _ -> false
    in

    (* Try to match entity - buffer contains "&name" or "&name;" *)
    let buf_contents = Buffer.contents t.temp_buffer in
    let name_start = 1 in (* Skip '&' *)
    let name_end = String.length buf_contents - (if has_semicolon then 1 else 0) in
    let entity_name = String.sub buf_contents name_start (name_end - name_start) in

    (* Try progressively shorter matches *)
    (* Only match if:
       1. Full match with semicolon, OR
       2. Legacy entity (can be used without semicolon) *)
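    (* Longest-match search over the collected name, trying shorter prefixes
       only when the longer ones fail: e.g. "&notin;" must decode as U+2209
       rather than as "&not" followed by "in;", while "&notx" falls back to
       the legacy "&not" followed by a literal "x". *)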
    let rec try_match len =
      if len <= 0 then None
      else
        let prefix = String.sub entity_name 0 len in
        let is_full = len = String.length entity_name in
        let would_have_semi = has_semicolon && is_full in
        (* Only use this match if it has semicolon or is a legacy entity *)
        if would_have_semi || Html5rw_entities.is_legacy prefix then
          match Html5rw_entities.lookup prefix with
          | Some decoded -> Some (decoded, len)
          | None -> try_match (len - 1)
        else
          try_match (len - 1)
    in

    match try_match (String.length entity_name) with
    | Some (decoded, matched_len) ->
      let full_match = matched_len = String.length entity_name in
      let ends_with_semi = has_semicolon && full_match in

      (* Check attribute context restrictions *)
      let in_attribute = match t.return_state with
        | State.Attribute_value_double_quoted
        | State.Attribute_value_single_quoted
        | State.Attribute_value_unquoted -> true
        | _ -> false
      in

      let next_char =
        if full_match && not has_semicolon then
          Stream.peek t.stream
        else if not full_match then
          Some entity_name.[matched_len]
        else None
      in

      let blocked = in_attribute && not ends_with_semi &&
        match next_char with
        | Some '=' -> true
        | Some c when is_ascii_alnum c -> true
        | _ -> false
      in

      if blocked then begin
        flush_code_points_consumed_as_char_ref t;
        t.state <- t.return_state
      end else begin
        if not ends_with_semi then
          error t "missing-semicolon-after-character-reference";
        Buffer.clear t.temp_buffer;
        Buffer.add_string t.temp_buffer decoded;
        flush_code_points_consumed_as_char_ref t;
        (* Emit unconsumed chars after partial match *)
        if not full_match then begin
          let unconsumed = String.sub entity_name matched_len (String.length entity_name - matched_len) in
          emit_str t unconsumed;
          (* If there was a semicolon in input but we didn't use the full match, emit the semicolon too *)
          if has_semicolon then
            emit_char t ';'
        end;
        t.state <- t.return_state
      end
    | None ->
      (* No match - check if we should report unknown-named-character-reference *)
      if String.length entity_name > 0 then begin
        (* If we have a semicolon, it's definitely an unknown named character reference *)
        if has_semicolon then
          error t "unknown-named-character-reference";
        (* Emit all the chars we consumed *)
        flush_code_points_consumed_as_char_ref t;
        t.state <- t.return_state
      end else begin
        flush_code_points_consumed_as_char_ref t;
        t.state <- t.return_state
      end

  and state_ambiguous_ampersand () =
    match Stream.peek t.stream with
    | Some c when is_ascii_alnum c ->
      Stream.advance t.stream;
      (match t.return_state with
       | State.Attribute_value_double_quoted
       | State.Attribute_value_single_quoted
       | State.Attribute_value_unquoted ->
         Buffer.add_char t.current_attr_value c
       | _ ->
         emit_char t c)
    | Some ';' ->
      error t "unknown-named-character-reference";
      t.state <- t.return_state
    | _ ->
      t.state <- t.return_state

and state_numeric_character_reference () =
1867
-
t.char_ref_code <- 0;
1868
-
match Stream.peek t.stream with
1869
-
| Some (('x' | 'X') as c) ->
1870
-
Stream.advance t.stream;
1871
-
Buffer.add_char t.temp_buffer c;
1872
-
t.state <- State.Hexadecimal_character_reference_start
1873
-
| _ ->
1874
-
t.state <- State.Decimal_character_reference_start
1875
-
1876
-
and state_hexadecimal_character_reference_start () =
1877
-
match Stream.peek t.stream with
1878
-
| Some c when is_ascii_hex c ->
1879
-
t.state <- State.Hexadecimal_character_reference
1880
-
| _ ->
1881
-
error t "absence-of-digits-in-numeric-character-reference";
1882
-
flush_code_points_consumed_as_char_ref t;
1883
-
t.state <- t.return_state
1884
-
1885
-
and state_decimal_character_reference_start () =
1886
-
match Stream.peek t.stream with
1887
-
| Some c when is_ascii_digit c ->
1888
-
t.state <- State.Decimal_character_reference
1889
-
| _ ->
1890
-
error t "absence-of-digits-in-numeric-character-reference";
1891
-
flush_code_points_consumed_as_char_ref t;
1892
-
t.state <- t.return_state
1893
-
1894
-
and state_hexadecimal_character_reference () =
1895
-
match Stream.peek t.stream with
1896
-
| Some c when is_ascii_digit c ->
1897
-
Stream.advance t.stream;
1898
-
t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code '0');
1899
-
if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
1900
-
| Some c when c >= 'A' && c <= 'F' ->
1901
-
Stream.advance t.stream;
1902
-
t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'A' + 10);
1903
-
if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
1904
-
| Some c when c >= 'a' && c <= 'f' ->
1905
-
Stream.advance t.stream;
1906
-
t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'a' + 10);
1907
-
if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
1908
-
| Some ';' ->
1909
-
Stream.advance t.stream;
1910
-
t.state <- State.Numeric_character_reference_end
1911
-
| _ ->
1912
-
error t "missing-semicolon-after-character-reference";
1913
-
t.state <- State.Numeric_character_reference_end
1914
-
1915
-
and state_decimal_character_reference () =
1916
-
match Stream.peek t.stream with
1917
-
| Some c when is_ascii_digit c ->
1918
-
Stream.advance t.stream;
1919
-
t.char_ref_code <- t.char_ref_code * 10 + (Char.code c - Char.code '0');
1920
-
if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
1921
-
| Some ';' ->
1922
-
Stream.advance t.stream;
1923
-
t.state <- State.Numeric_character_reference_end
1924
-
| _ ->
1925
-
error t "missing-semicolon-after-character-reference";
1926
-
t.state <- State.Numeric_character_reference_end
1927
-
1928
-
and state_numeric_character_reference_end () =
1929
-
let code = t.char_ref_code in
1930
-
let replacement_char = "\xEF\xBF\xBD" in
1931
-
1932
-
let result =
1933
-
if code = 0 then begin
1934
-
error t "null-character-reference";
1935
-
replacement_char
1936
-
end else if code > 0x10FFFF then begin
1937
-
error t "character-reference-outside-unicode-range";
1938
-
replacement_char
1939
-
end else if code >= 0xD800 && code <= 0xDFFF then begin
1940
-
error t "surrogate-character-reference";
1941
-
replacement_char
1942
-
end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
1943
-
List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
1944
-
0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF;
1945
-
0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF;
1946
-
0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
1947
-
0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
1948
-
0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin
1949
-
error t "noncharacter-character-reference";
1950
-
Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
1951
-
end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
1952
-
(code >= 0x0D && code <= 0x1F) ||
1953
-
(code >= 0x7F && code <= 0x9F) then begin
1954
-
error t "control-character-reference";
1955
-
(* Apply Windows-1252 replacement table for 0x80-0x9F *)
1956
-
match Html5rw_entities.Numeric_ref.find_replacement code with
1957
-
| Some replacement -> Html5rw_entities.Numeric_ref.codepoint_to_utf8 replacement
1958
-
| None -> Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
1959
-
end else
1960
-
Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
1961
-
in
1962
-
1963
-
Buffer.clear t.temp_buffer;
1964
-
Buffer.add_string t.temp_buffer result;
1965
-
flush_code_points_consumed_as_char_ref t;
1966
-
t.state <- t.return_state
1967
-
1968
-
in
1969
-
process_state ()
1970
-
1971
-
let get_errors t = List.rev t.errors
1972
-
1973
-
let set_state t state = t.state <- state
1974
-
1975
-
let set_last_start_tag t name = t.last_start_tag <- name
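
Review note: the deleted state_named_character_reference encodes the WHATWG rule that a named reference may only match without its terminating semicolon when it is one of the legacy entities. A minimal standalone sketch of that longest-prefix search, with a toy table standing in for Html5rw_entities (the table contents, including the made-up "ampere" entry, are illustrative only, not the library's data):

  let table = [ ("amp", "&"); ("ampere", "A") ]  (* toy data, not the WHATWG list *)
  let legacy = [ "amp" ]                         (* usable without ';' *)

  let lookup name = List.assoc_opt name table
  let is_legacy name = List.mem name legacy

  (* Longest-prefix match: a prefix only counts if it is the full consumed
     run terminated by ';', or a legacy entity. *)
  let try_match name ~has_semicolon =
    let rec go len =
      if len <= 0 then None
      else
        let prefix = String.sub name 0 len in
        let is_full = len = String.length name in
        if (has_semicolon && is_full) || is_legacy prefix then
          (match lookup prefix with
           | Some decoded -> Some (decoded, len)
           | None -> go (len - 1))
        else go (len - 1)
    in
    go (String.length name)

  let () =
    assert (try_match "amp" ~has_semicolon:true = Some ("&", 3));
    (* "&ampx" in text: "amp" still matches because it is a legacy entity... *)
    assert (try_match "ampx" ~has_semicolon:false = Some ("&", 3));
    (* ...but an unknown name without ';' matches nothing. *)
    assert (try_match "bogus" ~has_semicolon:false = None)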
+5 -5  test/dune
···
  (executable
   (name test_all)
   (modules test_all)
-  (libraries bytesrw html5rw.parser html5rw.dom html5rw.tokenizer html5rw.encoding jsont jsont.bytesrw test_report))
+  (libraries bytesrw html5rw jsont jsont.bytesrw test_report))

  (executable
   (name test_html5lib)
   (modules test_html5lib)
-  (libraries bytesrw html5rw.parser html5rw.dom test_report))
+  (libraries bytesrw html5rw test_report))

  (rule
   (alias runtest)
···
  (executable
   (name test_tokenizer)
   (modules test_tokenizer)
-  (libraries bytesrw html5rw.tokenizer jsont jsont.bytesrw test_report))
+  (libraries bytesrw html5rw jsont jsont.bytesrw test_report))

  (rule
   (alias runtest)
···
  (executable
   (name test_encoding)
   (modules test_encoding)
-  (libraries html5rw.encoding test_report))
+  (libraries html5rw test_report))

  (rule
   (alias runtest)
···
  (executable
   (name test_serializer)
   (modules test_serializer)
-  (libraries html5rw.dom jsont jsont.bytesrw test_report))
+  (libraries html5rw jsont jsont.bytesrw test_report))

  (rule
   (alias runtest)
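
Review note: every test executable now links the single html5rw library, so the four sub-libraries are reached as submodules of the umbrella module rather than as separate top-level units. The aliasing pattern the updated tests rely on (compiles only against html5rw itself; shown here for reference):

  module Parser = Html5rw.Parser        (* was Html5rw_parser *)
  module Dom = Html5rw.Dom              (* was Html5rw_dom *)
  module Tokenizer = Html5rw.Tokenizer  (* was Html5rw_tokenizer *)
  module Encoding = Html5rw.Encoding    (* was Html5rw_encoding *)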
+37 -37  test/test_all.ml
···
  (* ============================================================ *)

  module TreeConstruction = struct
-   module Parser = Html5rw_parser
-   module Dom = Html5rw_dom
+   module Parser = Html5rw.Parser
+   module Dom = Html5rw.Dom

    type test_case = {
      input : string;
···
        | [tag] -> (None, tag)
        | _ -> (None, ctx_str)
      in
-     let context = Parser.make_fragment_context ~tag_name ~namespace () in
+     let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in
      let reader = Bytes.Reader.of_string test.input in
-     Parser.parse ~collect_errors:true ~fragment_context:context reader
+     Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader
    | None ->
      let reader = Bytes.Reader.of_string test.input in
-     Parser.parse ~collect_errors:true reader
+     Html5rw.Parser.parse ~collect_errors:true reader
    in
-   let actual_tree = Dom.to_test_format (Parser.root result) in
+   let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in
    let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
    let actual = normalize_tree (strip_tree_prefix actual_tree) in
-   let error_count = List.length (Parser.errors result) in
+   let error_count = List.length (Html5rw.Parser.errors result) in
    let expected_error_count = List.length test.expected_errors in
    (expected = actual, expected, actual, error_count, expected_error_count)
  with e ->
···
  (* ============================================================ *)

  module Tokenizer_tests = struct
-   module Tokenizer = Html5rw_tokenizer
+   module Tokenizer = Html5rw.Tokenizer

    module TokenCollector = struct
-     type t = { mutable tokens : Tokenizer.Token.t list }
+     type t = { mutable tokens : Html5rw.Tokenizer.Token.t list }
      let create () = { tokens = [] }
      let process t token = t.tokens <- token :: t.tokens; `Continue
      let adjusted_current_node_in_html_namespace _ = true
···
        last_start_tag; double_escaped; xml_mode; raw_json }

    let state_of_string = function
-     | "Data state" -> Tokenizer.State.Data
-     | "PLAINTEXT state" -> Tokenizer.State.Plaintext
-     | "RCDATA state" -> Tokenizer.State.Rcdata
-     | "RAWTEXT state" -> Tokenizer.State.Rawtext
-     | "Script data state" -> Tokenizer.State.Script_data
-     | "CDATA section state" -> Tokenizer.State.Cdata_section
+     | "Data state" -> Html5rw.Tokenizer.State.Data
+     | "PLAINTEXT state" -> Html5rw.Tokenizer.State.Plaintext
+     | "RCDATA state" -> Html5rw.Tokenizer.State.Rcdata
+     | "RAWTEXT state" -> Html5rw.Tokenizer.State.Rawtext
+     | "Script data state" -> Html5rw.Tokenizer.State.Script_data
+     | "CDATA section state" -> Html5rw.Tokenizer.State.Cdata_section
      | s -> failwith ("Unknown state: " ^ s)

-   let token_to_test_json (tok : Tokenizer.Token.t) : Jsont.json list =
+   let token_to_test_json (tok : Html5rw.Tokenizer.Token.t) : Jsont.json list =
      let str s = Jsont.String (s, Jsont.Meta.none) in
      let arr l = Jsont.Array (l, Jsont.Meta.none) in
      match tok with
-     | Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } ->
+     | Html5rw.Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } ->
        let name_json = match name with Some n -> str n | None -> Jsont.Null ((), Jsont.Meta.none) in
        let public_json = match public_id with Some p -> str p | None -> Jsont.Null ((), Jsont.Meta.none) in
        let system_json = match system_id with Some s -> str s | None -> Jsont.Null ((), Jsont.Meta.none) in
        let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in
        [arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]]
-     | Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } ->
+     | Html5rw.Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } ->
        let attrs_obj = Jsont.Object (
          List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs),
          Jsont.Meta.none
        ) in
        if self_closing then [arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]]
        else [arr [str "StartTag"; str name; attrs_obj]]
-     | Tokenizer.Token.Tag { kind = End; name; _ } -> [arr [str "EndTag"; str name]]
-     | Tokenizer.Token.Comment data -> [arr [str "Comment"; str data]]
-     | Tokenizer.Token.Character data -> [arr [str "Character"; str data]]
-     | Tokenizer.Token.EOF -> []
+     | Html5rw.Tokenizer.Token.Tag { kind = End; name; _ } -> [arr [str "EndTag"; str name]]
+     | Html5rw.Tokenizer.Token.Comment data -> [arr [str "Comment"; str data]]
+     | Html5rw.Tokenizer.Token.Character data -> [arr [str "Character"; str data]]
+     | Html5rw.Tokenizer.Token.EOF -> []

    let rec json_equal a b =
      match a, b with
···
    let merge_character_tokens tokens =
      let rec loop acc = function
        | [] -> List.rev acc
-       | Tokenizer.Token.Character s1 :: Tokenizer.Token.Character s2 :: rest ->
-           loop acc (Tokenizer.Token.Character (s1 ^ s2) :: rest)
+       | Html5rw.Tokenizer.Token.Character s1 :: Html5rw.Tokenizer.Token.Character s2 :: rest ->
+           loop acc (Html5rw.Tokenizer.Token.Character (s1 ^ s2) :: rest)
        | tok :: rest -> loop (tok :: acc) rest
      in loop [] tokens

    let run_test test initial_state =
      let input = if test.double_escaped then unescape_double test.input else test.input in
      let collector = TokenCollector.create () in
-     let tokenizer = Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in
-     Tokenizer.set_state tokenizer initial_state;
-     (match test.last_start_tag with Some tag -> Tokenizer.set_last_start_tag tokenizer tag | None -> ());
+     let tokenizer = Html5rw.Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in
+     Html5rw.Tokenizer.set_state tokenizer initial_state;
+     (match test.last_start_tag with Some tag -> Html5rw.Tokenizer.set_last_start_tag tokenizer tag | None -> ());
      let reader = Bytes.Reader.of_string input in
-     Tokenizer.run tokenizer (module TokenCollector) reader;
+     Html5rw.Tokenizer.run tokenizer (module TokenCollector) reader;
      let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in
      let actual_tokens = List.concat_map token_to_test_json tokens in
      let expected_output = if test.double_escaped then
···
      List.length actual_tokens = List.length expected &&
      List.for_all2 json_equal actual_tokens expected
    in
-   let actual_error_count = List.length (Tokenizer.get_errors tokenizer) in
+   let actual_error_count = List.length (Html5rw.Tokenizer.get_errors tokenizer) in
    let errors_count_match = actual_error_count = test.expected_error_count in
    (tokens_match && errors_count_match, actual_tokens, expected, actual_error_count, test.expected_error_count)
···
  (* ============================================================ *)

  module Encoding_tests = struct
-   module Encoding = Html5rw_encoding
+   module Encoding = Html5rw.Encoding

    type test_case = {
      input : string;
···
    let normalize_encoding_name s = String.lowercase_ascii (String.trim s)

    let encoding_to_test_name = function
-     | Encoding.Utf8 -> "utf-8"
-     | Encoding.Utf16le -> "utf-16le"
-     | Encoding.Utf16be -> "utf-16be"
-     | Encoding.Windows_1252 -> "windows-1252"
-     | Encoding.Iso_8859_2 -> "iso-8859-2"
-     | Encoding.Euc_jp -> "euc-jp"
+     | Html5rw.Encoding.Utf8 -> "utf-8"
+     | Html5rw.Encoding.Utf16le -> "utf-16le"
+     | Html5rw.Encoding.Utf16be -> "utf-16be"
+     | Html5rw.Encoding.Windows_1252 -> "windows-1252"
+     | Html5rw.Encoding.Iso_8859_2 -> "iso-8859-2"
+     | Html5rw.Encoding.Euc_jp -> "euc-jp"

    let parse_test_case lines =
      let raw_lines = String.concat "\n" lines in
···

    let run_test test =
      try
-       let (_, detected_encoding) = Encoding.decode (Bytes.of_string test.input) () in
+       let (_, detected_encoding) = Html5rw.Encoding.decode (Bytes.of_string test.input) () in
        let detected_name = encoding_to_test_name detected_encoding in
        let expected_name = normalize_encoding_name test.expected_encoding in
        let match_encoding det exp =
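
Review note: merge_character_tokens (duplicated here and in test_tokenizer.ml) concatenates adjacent Character tokens before comparison, because html5lib's expected output stores each text run as a single token. The same fold, rendered self-contained over a local token type:

  type token = Character of string | EndTag of string

  let merge_character_tokens tokens =
    let rec loop acc = function
      | [] -> List.rev acc
      | Character s1 :: Character s2 :: rest ->
          (* Re-examine the merged token: it may merge again with the next one. *)
          loop acc (Character (s1 ^ s2) :: rest)
      | tok :: rest -> loop (tok :: acc) rest
    in
    loop [] tokens

  let () =
    assert (merge_character_tokens
              [Character "a"; Character "b"; EndTag "p"; Character "c"]
            = [Character "ab"; EndTag "p"; Character "c"])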
+8 -8  test/test_encoding.ml
···
  (* Test runner for html5lib-tests encoding tests *)

- module Encoding = Html5rw_encoding
+ module Encoding = Html5rw.Encoding
  module Report = Test_report

  type test_case = {
···

  (* Convert our encoding type to canonical test name *)
  let encoding_to_test_name = function
-   | Encoding.Utf8 -> "utf-8"
-   | Encoding.Utf16le -> "utf-16le"
-   | Encoding.Utf16be -> "utf-16be"
-   | Encoding.Windows_1252 -> "windows-1252"
-   | Encoding.Iso_8859_2 -> "iso-8859-2"
-   | Encoding.Euc_jp -> "euc-jp"
+   | Html5rw.Encoding.Utf8 -> "utf-8"
+   | Html5rw.Encoding.Utf16le -> "utf-16le"
+   | Html5rw.Encoding.Utf16be -> "utf-16be"
+   | Html5rw.Encoding.Windows_1252 -> "windows-1252"
+   | Html5rw.Encoding.Iso_8859_2 -> "iso-8859-2"
+   | Html5rw.Encoding.Euc_jp -> "euc-jp"

  (* Parse a single test case from lines *)
  let parse_test_case lines =
···
  let run_test test =
    try
      (* Detect encoding from the input bytes *)
-     let (_, detected_encoding) = Encoding.decode (Bytes.of_string test.input) () in
+     let (_, detected_encoding) = Html5rw.Encoding.decode (Bytes.of_string test.input) () in
      let detected_name = encoding_to_test_name detected_encoding in
      let expected_name = normalize_encoding_name test.expected_encoding in
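
Review note: the encoding tests only exercise the call shape Html5rw.Encoding.decode bytes (), which returns the decoded text paired with the encoding that detection settled on. A condensed sketch of detection built on that shape; it is not standalone (it links against html5rw), and the wrapper name detect is ours:

  let detect (raw : string) =
    let (_decoded, enc) = Html5rw.Encoding.decode (Bytes.of_string raw) () in
    match enc with
    | Html5rw.Encoding.Utf8 -> "utf-8"
    | Html5rw.Encoding.Utf16le -> "utf-16le"
    | Html5rw.Encoding.Utf16be -> "utf-16be"
    | Html5rw.Encoding.Windows_1252 -> "windows-1252"
    | Html5rw.Encoding.Iso_8859_2 -> "iso-8859-2"
    | Html5rw.Encoding.Euc_jp -> "euc-jp"

  let () =
    (* A UTF-8 BOM should be decisive, so this prints "utf-8". *)
    print_endline (detect "\xEF\xBB\xBF<!doctype html>")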
+7 -7  test/test_html5lib.ml
···

  open Bytesrw

- module Parser = Html5rw_parser
- module Dom = Html5rw_dom
+ module Parser = Html5rw.Parser
+ module Dom = Html5rw.Dom
  module Report = Test_report

  type test_case = {
···
        | [tag] -> (None, tag)
        | _ -> (None, ctx_str)
      in
-     let context = Parser.make_fragment_context ~tag_name ~namespace () in
+     let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace () in
      let reader = Bytes.Reader.of_string test.input in
-     Parser.parse ~collect_errors:true ~fragment_context:context reader
+     Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader
    | None ->
      let reader = Bytes.Reader.of_string test.input in
-     Parser.parse ~collect_errors:true reader
+     Html5rw.Parser.parse ~collect_errors:true reader
    in
-   let actual_tree = Dom.to_test_format (Parser.root result) in
+   let actual_tree = Html5rw.Dom.to_test_format (Html5rw.Parser.root result) in
    let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
    let actual = normalize_tree (strip_tree_prefix actual_tree) in
-   let error_count = List.length (Parser.errors result) in
+   let error_count = List.length (Html5rw.Parser.errors result) in
    let expected_error_count = List.length test.expected_errors in
    (expected = actual, expected, actual, error_count, expected_error_count)
  with e ->
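
Review note: fragment cases reach the parser through make_fragment_context. A condensed sketch of the call sequence the two tree-construction runners share; it is not standalone (requires bytesrw and html5rw) and assumes ~namespace accepts None, per the (None, tag) branch above:

  open Bytesrw

  let parse_fragment ~tag_name html =
    let context = Html5rw.Parser.make_fragment_context ~tag_name ~namespace:None () in
    let reader = Bytes.Reader.of_string html in
    let result = Html5rw.Parser.parse ~collect_errors:true ~fragment_context:context reader in
    (* Render the subtree in html5lib's test format for comparison. *)
    Html5rw.Dom.to_test_format (Html5rw.Parser.root result)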
+9 -9  test/test_serializer.ml
···
  (* Test runner for html5lib-tests serializer tests *)

- module Dom = Html5rw_dom
+ module Dom = Html5rw.Dom
  module Report = Test_report

  (* Extract values from JSON *)
···
    | EmptyTag of string * (string * string) list (* name, attrs *)
    | TextNode of string
    | CommentNode of string
-   | DoctypeNode of Dom.node
+   | DoctypeNode of Html5rw.Dom.node

  type token_info = {
    token : token_type option;
···

    | "Doctype", [name_json] ->
      let name = json_string name_json in
-     let node = Dom.create_doctype ~name () in
+     let node = Html5rw.Dom.create_doctype ~name () in
      { token = Some (DoctypeNode node) }

    | "Doctype", [name_json; public_json] ->
      let name = json_string name_json in
      let public_id = json_string_opt public_json in
      let node = match public_id with
-       | Some pub -> Dom.create_doctype ~name ~public_id:pub ()
-       | None -> Dom.create_doctype ~name ()
+       | Some pub -> Html5rw.Dom.create_doctype ~name ~public_id:pub ()
+       | None -> Html5rw.Dom.create_doctype ~name ()
      in
      { token = Some (DoctypeNode node) }
···
      let public_id = json_string_opt public_json in
      let system_id = json_string_opt system_json in
      let node = match public_id, system_id with
-       | Some pub, Some sys -> Dom.create_doctype ~name ~public_id:pub ~system_id:sys ()
-       | Some pub, None -> Dom.create_doctype ~name ~public_id:pub ()
-       | None, Some sys -> Dom.create_doctype ~name ~system_id:sys ()
-       | None, None -> Dom.create_doctype ~name ()
+       | Some pub, Some sys -> Html5rw.Dom.create_doctype ~name ~public_id:pub ~system_id:sys ()
+       | Some pub, None -> Html5rw.Dom.create_doctype ~name ~public_id:pub ()
+       | None, Some sys -> Html5rw.Dom.create_doctype ~name ~system_id:sys ()
+       | None, None -> Html5rw.Dom.create_doctype ~name ()
      in
      { token = Some (DoctypeNode node) }
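
Review note: the four-way match on public_id/system_id suggests create_doctype takes them as optional labelled arguments. If so, OCaml's ?label forwarding collapses the dispatch to a single call; a sketch, unverified against the actual signature:

  (* Hypothetical simplification; assumes create_doctype's public_id and
     system_id really are optional arguments. *)
  let doctype ?public_id ?system_id name =
    Html5rw.Dom.create_doctype ~name ?public_id ?system_id ()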
+22 -22  test/test_tokenizer.ml
···

  open Bytesrw

- module Tokenizer = Html5rw_tokenizer
+ module Tokenizer = Html5rw.Tokenizer
  module Report = Test_report

  (* Token collector sink - collects all tokens into a list *)
  module TokenCollector = struct
    type t = {
-     mutable tokens : Tokenizer.Token.t list;
+     mutable tokens : Html5rw.Tokenizer.Token.t list;
    }

    let create () = { tokens = [] }
···

  (* Convert state name to State.t *)
  let state_of_string = function
-   | "Data state" -> Tokenizer.State.Data
-   | "PLAINTEXT state" -> Tokenizer.State.Plaintext
-   | "RCDATA state" -> Tokenizer.State.Rcdata
-   | "RAWTEXT state" -> Tokenizer.State.Rawtext
-   | "Script data state" -> Tokenizer.State.Script_data
-   | "CDATA section state" -> Tokenizer.State.Cdata_section
+   | "Data state" -> Html5rw.Tokenizer.State.Data
+   | "PLAINTEXT state" -> Html5rw.Tokenizer.State.Plaintext
+   | "RCDATA state" -> Html5rw.Tokenizer.State.Rcdata
+   | "RAWTEXT state" -> Html5rw.Tokenizer.State.Rawtext
+   | "Script data state" -> Html5rw.Tokenizer.State.Script_data
+   | "CDATA section state" -> Html5rw.Tokenizer.State.Cdata_section
    | s -> failwith ("Unknown state: " ^ s)

  (* Convert our token to test format for comparison *)
- let token_to_test_json (tok : Tokenizer.Token.t) : Jsont.json list =
+ let token_to_test_json (tok : Html5rw.Tokenizer.Token.t) : Jsont.json list =
    let str s = Jsont.String (s, Jsont.Meta.none) in
    let arr l = Jsont.Array (l, Jsont.Meta.none) in
    match tok with
-   | Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } ->
+   | Html5rw.Tokenizer.Token.Doctype { name; public_id; system_id; force_quirks } ->
      let name_json = match name with
        | Some n -> str n
        | None -> Jsont.Null ((), Jsont.Meta.none)
···
      in
      let correctness = Jsont.Bool (not force_quirks, Jsont.Meta.none) in
      [arr [str "DOCTYPE"; name_json; public_json; system_json; correctness]]
-   | Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } ->
+   | Html5rw.Tokenizer.Token.Tag { kind = Start; name; attrs; self_closing } ->
      let attrs_obj = Jsont.Object (
        List.map (fun (n, v) -> ((n, Jsont.Meta.none), str v)) (List.rev attrs),
        Jsont.Meta.none
···
        [arr [str "StartTag"; str name; attrs_obj; Jsont.Bool (true, Jsont.Meta.none)]]
      else
        [arr [str "StartTag"; str name; attrs_obj]]
-   | Tokenizer.Token.Tag { kind = End; name; _ } ->
+   | Html5rw.Tokenizer.Token.Tag { kind = End; name; _ } ->
      [arr [str "EndTag"; str name]]
-   | Tokenizer.Token.Comment data ->
+   | Html5rw.Tokenizer.Token.Comment data ->
      [arr [str "Comment"; str data]]
-   | Tokenizer.Token.Character data ->
+   | Html5rw.Tokenizer.Token.Character data ->
      (* Split into individual characters for comparison - but actually
         the tests expect consecutive characters to be merged *)
      [arr [str "Character"; str data]]
-   | Tokenizer.Token.EOF -> []
+   | Html5rw.Tokenizer.Token.EOF -> []

  (* Compare JSON values for equality *)
  let rec json_equal a b =
···
  let merge_character_tokens tokens =
    let rec loop acc = function
      | [] -> List.rev acc
-     | Tokenizer.Token.Character s1 :: Tokenizer.Token.Character s2 :: rest ->
-         loop acc (Tokenizer.Token.Character (s1 ^ s2) :: rest)
+     | Html5rw.Tokenizer.Token.Character s1 :: Html5rw.Tokenizer.Token.Character s2 :: rest ->
+         loop acc (Html5rw.Tokenizer.Token.Character (s1 ^ s2) :: rest)
      | tok :: rest -> loop (tok :: acc) rest
    in
    loop [] tokens
···
    let input = if test.double_escaped then unescape_double test.input else test.input in

    let collector = TokenCollector.create () in
-   let tokenizer = Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in
+   let tokenizer = Html5rw.Tokenizer.create (module TokenCollector) collector ~collect_errors:true ~xml_mode:test.xml_mode () in

    (* Set initial state *)
-   Tokenizer.set_state tokenizer initial_state;
+   Html5rw.Tokenizer.set_state tokenizer initial_state;

    (* Set last start tag if specified *)
    (match test.last_start_tag with
-     | Some tag -> Tokenizer.set_last_start_tag tokenizer tag
+     | Some tag -> Html5rw.Tokenizer.set_last_start_tag tokenizer tag
      | None -> ());

    (* Run tokenizer *)
    let reader = Bytes.Reader.of_string input in
-   Tokenizer.run tokenizer (module TokenCollector) reader;
+   Html5rw.Tokenizer.run tokenizer (module TokenCollector) reader;

    (* Get results *)
    let tokens = merge_character_tokens (TokenCollector.get_tokens collector) in
···
    List.for_all2 json_equal actual_tokens expected
  in

- let actual_error_count = List.length (Tokenizer.get_errors tokenizer) in
+ let actual_error_count = List.length (Html5rw.Tokenizer.get_errors tokenizer) in
  let errors_count_match = actual_error_count = test.expected_error_count in

  (tokens_match && errors_count_match, actual_tokens, expected, actual_error_count, test.expected_error_count)
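
Review note: as the two tokenizer test files show, a sink is any module providing process (returning `Continue) and adjusted_current_node_in_html_namespace. A condensed sketch of the driver they share; not standalone, it requires bytesrw and html5rw, and the Sink and tokenize names are ours:

  open Bytesrw

  module Sink = struct
    type t = { mutable tokens : Html5rw.Tokenizer.Token.t list }
    let create () = { tokens = [] }
    let process t token = t.tokens <- token :: t.tokens; `Continue
    (* Consulted when deciding how to treat CDATA sections; the tests
       above always answer true. *)
    let adjusted_current_node_in_html_namespace _ = true
  end

  let tokenize input =
    let sink = Sink.create () in
    let tokenizer =
      Html5rw.Tokenizer.create (module Sink) sink ~collect_errors:true ~xml_mode:false ()
    in
    Html5rw.Tokenizer.set_state tokenizer Html5rw.Tokenizer.State.Data;
    Html5rw.Tokenizer.run tokenizer (module Sink) (Bytes.Reader.of_string input);
    (* Tokens were accumulated in reverse order. *)
    List.rev sink.tokens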