OCaml HTML5 parser/serialiser based on Python's JustHTML
(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *)

(* Character classification using Astring *)
let is_ascii_alpha = Astring.Char.Ascii.is_letter
let is_ascii_digit = Astring.Char.Ascii.is_digit
let is_ascii_hex = Astring.Char.Ascii.is_hex_digit
let is_ascii_alnum = Astring.Char.Ascii.is_alphanum
let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r'
let ascii_lower = Astring.Char.Ascii.lowercase

(* Token sink interface *)
module type SINK = sig
  type t

  val process :
    t -> Tokenizer_token.t -> line:int -> column:int ->
    [ `Continue | `SwitchTo of Tokenizer_state.t ]

  val adjusted_current_node_in_html_namespace : t -> bool
end
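
(* A minimal sketch of a sink satisfying [SINK]: it collects tokens into a
   list ref, ignores source positions, and never asks the tokenizer to switch
   states. The [List_sink] name is illustrative only, not part of this library.

   {[
     module List_sink = struct
       type t = Tokenizer_token.t list ref

       let process acc token ~line:_ ~column:_ =
         acc := token :: !acc;
         `Continue

       (* Always claiming the HTML namespace means "<![CDATA[" is treated
          as a bogus comment, as in plain HTML content. *)
       let adjusted_current_node_in_html_namespace _ = true
     end
   ]} *)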

type 'sink t = {
  mutable stream : Tokenizer_stream.t;
  sink : 'sink;
  mutable state : Tokenizer_state.t;
  mutable return_state : Tokenizer_state.t;
  mutable char_ref_code : int;
  mutable temp_buffer : Buffer.t;
  mutable last_start_tag : string;
  mutable current_tag_name : Buffer.t;
  mutable current_tag_kind : Tokenizer_token.tag_kind;
  mutable current_tag_self_closing : bool;
  mutable current_attr_name : Buffer.t;
  mutable current_attr_value : Buffer.t;
  mutable current_attrs : (string * string) list;
  mutable current_doctype_name : Buffer.t option;
  mutable current_doctype_public : Buffer.t option;
  mutable current_doctype_system : Buffer.t option;
  mutable current_doctype_force_quirks : bool;
  mutable current_comment : Buffer.t;
  mutable pending_chars : Buffer.t;
  mutable errors : Tokenizer_errors.t list;
  collect_errors : bool;
  xml_mode : bool;  (* XML violation mode: transform chars for XML compatibility *)
}

let create (type s) (module S : SINK with type t = s) sink
    ?(collect_errors = false) ?(xml_mode = false) () =
  {
    stream = Tokenizer_stream.create "";
    sink;
    state = Tokenizer_state.Data;
    return_state = Tokenizer_state.Data;
    char_ref_code = 0;
    temp_buffer = Buffer.create 64;
    last_start_tag = "";
    current_tag_name = Buffer.create 32;
    current_tag_kind = Tokenizer_token.Start;
    current_tag_self_closing = false;
    current_attr_name = Buffer.create 32;
    current_attr_value = Buffer.create 64;
    current_attrs = [];
    current_doctype_name = None;
    current_doctype_public = None;
    current_doctype_system = None;
    current_doctype_force_quirks = false;
    current_comment = Buffer.create 64;
    pending_chars = Buffer.create 256;
    errors = [];
    collect_errors;
    xml_mode;
  }
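
(* Usage sketch: build a tokenizer over the illustrative [List_sink] above and
   drive it with [run] (defined below). [Bytes.Reader.of_string] is assumed to
   be the bytesrw constructor for in-memory readers; check that library's API
   if the signature has moved.

   {[
     let tokens = ref [] in
     let tok = create (module List_sink) tokens () in
     run tok (module List_sink) (Bytesrw.Bytes.Reader.of_string "<p>Hi</p>");
     List.rev !tokens
   ]} *)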

let error t code =
  if t.collect_errors then begin
    let (line, column) = Tokenizer_stream.position t.stream in
    t.errors <- Tokenizer_errors.make ~code ~line ~column :: t.errors
  end

(* emit functions are defined locally inside run *)

(* XML mode character transformation: form feed → space *)
let emit_char t c =
  if t.xml_mode && c = '\x0C' then
    Buffer.add_char t.pending_chars ' '
  else
    Buffer.add_char t.pending_chars c

(* XML mode string transformation: U+FFFF → U+FFFD, form feed → space *)
let emit_str t s =
  if t.xml_mode then begin
    (* Transform: \xEF\xBF\xBF (U+FFFF) → \xEF\xBF\xBD (U+FFFD), \x0C → space *)
    let len = String.length s in
    let i = ref 0 in
    while !i < len do
      let c = s.[!i] in
      if c = '\x0C' then begin
        Buffer.add_char t.pending_chars ' ';
        incr i
      end else if c = '\xEF' && !i + 2 < len && s.[!i + 1] = '\xBF' && s.[!i + 2] = '\xBF' then begin
        (* U+FFFF → U+FFFD *)
        Buffer.add_string t.pending_chars "\xEF\xBF\xBD";
        i := !i + 3
      end else begin
        Buffer.add_char t.pending_chars c;
        incr i
      end
    done
  end else
    Buffer.add_string t.pending_chars s

let start_new_tag t kind =
  Buffer.clear t.current_tag_name;
  t.current_tag_kind <- kind;
  t.current_tag_self_closing <- false;
  t.current_attrs <- []

let start_new_attribute t =
  (* Save previous attribute if any *)
  let name = Buffer.contents t.current_attr_name in
  if String.length name > 0 then begin
    let value = Buffer.contents t.current_attr_value in
    (* Check for duplicates - only add if not already present *)
    if not (List.exists (fun (n, _) -> n = name) t.current_attrs) then
      t.current_attrs <- (name, value) :: t.current_attrs
    else
      error t "duplicate-attribute"
  end;
  Buffer.clear t.current_attr_name;
  Buffer.clear t.current_attr_value

let finish_attribute t = start_new_attribute t

let start_new_doctype t =
  t.current_doctype_name <- None;
  t.current_doctype_public <- None;
  t.current_doctype_system <- None;
  t.current_doctype_force_quirks <- false

(* emit_current_tag, emit_current_doctype, emit_current_comment are defined locally inside run *)

let is_appropriate_end_tag t =
  let name = Buffer.contents t.current_tag_name in
  String.length t.last_start_tag > 0 && name = t.last_start_tag

let flush_code_points_consumed_as_char_ref t =
  let s = Buffer.contents t.temp_buffer in
  match t.return_state with
  | Tokenizer_state.Attribute_value_double_quoted
  | Tokenizer_state.Attribute_value_single_quoted
  | Tokenizer_state.Attribute_value_unquoted ->
    Buffer.add_string t.current_attr_value s
  | _ ->
    emit_str t s

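(* Example: in [<a href="x&amp=1">] the "&amp" consumed while scanning the
   character reference is flushed into [current_attr_value] (the return state
   is an attribute-value state) rather than emitted as character tokens,
   matching the WHATWG "flush code points consumed as a character reference"
   step. Whether it is left unconverted is decided by the named-reference
   state, which is outside this section. *)
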
open Bytesrw

(* Main tokenization loop *)
let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) =
  t.stream <- Tokenizer_stream.create_from_reader reader;
  t.errors <- [];
  (* Set up error callback for surrogate/noncharacter detection in stream *)
  (* In XML mode, we don't report noncharacter errors - we transform them instead *)
  if not t.xml_mode then
    Tokenizer_stream.set_error_callback t.stream (fun code -> error t code);

  (* XML mode transformation for pending chars: U+FFFF → U+FFFD *)
  let transform_xml_chars data =
    let len = String.length data in
    let buf = Buffer.create len in
    let i = ref 0 in
    while !i < len do
      let c = data.[!i] in
      if c = '\xEF' && !i + 2 < len && data.[!i + 1] = '\xBF' && data.[!i + 2] = '\xBF' then begin
        (* U+FFFF → U+FFFD *)
        Buffer.add_string buf "\xEF\xBF\xBD";
        i := !i + 3
      end else begin
        Buffer.add_char buf c;
        incr i
      end
    done;
    Buffer.contents buf
  in

  (* Local emit functions with access to S *)
  let emit_pending_chars () =
    if Buffer.length t.pending_chars > 0 then begin
      let data = Buffer.contents t.pending_chars in
      Buffer.clear t.pending_chars;
      let data = if t.xml_mode then transform_xml_chars data else data in
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink (Tokenizer_token.Character data) ~line ~column)
    end
  in

  (* Flush pending characters, then hand EOF to the sink. Shared by every
     EOF-handling branch below. *)
  let emit_eof () =
    emit_pending_chars ();
    let line, column = Tokenizer_stream.position t.stream in
    ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
  in

  let emit token =
    emit_pending_chars ();
    let line, column = Tokenizer_stream.position t.stream in
    match S.process t.sink token ~line ~column with
    | `Continue -> ()
    | `SwitchTo new_state -> t.state <- new_state
  in

  let emit_current_tag () =
    finish_attribute t;
    let name = Buffer.contents t.current_tag_name in
    let attrs = List.rev t.current_attrs in
    (* Check for end tag with attributes or self-closing flag *)
    if t.current_tag_kind = Tokenizer_token.End then begin
      if attrs <> [] then
        error t "end-tag-with-attributes";
      if t.current_tag_self_closing then
        error t "end-tag-with-trailing-solidus"
    end;
    let tag = {
      Tokenizer_token.kind = t.current_tag_kind;
      name;
      attrs;
      self_closing = t.current_tag_self_closing;
    } in
    if t.current_tag_kind = Tokenizer_token.Start then
      t.last_start_tag <- name;
    emit (Tokenizer_token.Tag tag)
  in

  let emit_current_doctype () =
    let doctype = {
      Tokenizer_token.name = Option.map Buffer.contents t.current_doctype_name;
      public_id = Option.map Buffer.contents t.current_doctype_public;
      system_id = Option.map Buffer.contents t.current_doctype_system;
      force_quirks = t.current_doctype_force_quirks;
    } in
    emit (Tokenizer_token.Doctype doctype)
  in

  let emit_current_comment () =
    let content = Buffer.contents t.current_comment in
    let content =
      if t.xml_mode then begin
        (* XML mode: transform -- to - - in comments *)
        let buf = Buffer.create (String.length content + 10) in
        let len = String.length content in
        let i = ref 0 in
        while !i < len do
          if !i + 1 < len && content.[!i] = '-' && content.[!i + 1] = '-' then begin
            Buffer.add_string buf "- -";
            i := !i + 2
          end else begin
            Buffer.add_char buf content.[!i];
            incr i
          end
        done;
        Buffer.contents buf
      end else content
    in
    emit (Tokenizer_token.Comment content)
  in

  (* Check for control characters and emit error if needed.
     Only single-byte ASCII control chars are checked here; C1 controls
     (U+0080-U+009F) are 2-byte UTF-8 sequences starting with 0xC2, and
     multi-byte checks are TODO. *)
  let check_control_char c =
    let code = Char.code c in
    (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F *)
    (* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *)
    if (code >= 0x01 && code <= 0x08)
       || code = 0x0B
       || (code >= 0x0E && code <= 0x1F)
       || code = 0x7F then
      error t (Printf.sprintf "control-character-in-input-stream:%04x" code)
  in

  (* Emit char with control character check *)
  let emit_char_checked c =
    check_control_char c;
    emit_char t c
  in

  let rec process_state () =
    if Tokenizer_stream.is_eof t.stream then
      (* Handle EOF according to the current state *)
      handle_eof ()
    else begin
      step ();
      process_state ()
    end

  and handle_eof () =
    match t.state with
    | Tokenizer_state.Data ->
      emit_eof ()
    | Tokenizer_state.Tag_open ->
      error t "eof-before-tag-name";
      emit_char t '<';
      emit_eof ()
    | Tokenizer_state.End_tag_open ->
      error t "eof-before-tag-name";
      emit_str t "</";
      emit_eof ()
    | Tokenizer_state.Tag_name
    | Tokenizer_state.Before_attribute_name
    | Tokenizer_state.Attribute_name
    | Tokenizer_state.After_attribute_name
    | Tokenizer_state.Before_attribute_value
    | Tokenizer_state.Attribute_value_double_quoted
    | Tokenizer_state.Attribute_value_single_quoted
    | Tokenizer_state.Attribute_value_unquoted
    | Tokenizer_state.After_attribute_value_quoted
    | Tokenizer_state.Self_closing_start_tag ->
      error t "eof-in-tag";
      emit_eof ()
    | Tokenizer_state.Rawtext ->
      emit_eof ()
    | Tokenizer_state.Rawtext_less_than_sign ->
      emit_char t '<';
      emit_eof ()
    | Tokenizer_state.Rawtext_end_tag_open ->
      emit_str t "</";
      emit_eof ()
    | Tokenizer_state.Rawtext_end_tag_name ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      emit_eof ()
    | Tokenizer_state.Rcdata ->
      emit_eof ()
    | Tokenizer_state.Rcdata_less_than_sign ->
      emit_char t '<';
      emit_eof ()
    | Tokenizer_state.Rcdata_end_tag_open ->
      emit_str t "</";
      emit_eof ()
    | Tokenizer_state.Rcdata_end_tag_name ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      emit_eof ()
    | Tokenizer_state.Script_data ->
      emit_eof ()
    | Tokenizer_state.Script_data_less_than_sign ->
      emit_char t '<';
      emit_eof ()
    | Tokenizer_state.Script_data_end_tag_open ->
      emit_str t "</";
      emit_eof ()
    | Tokenizer_state.Script_data_end_tag_name ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      emit_eof ()
    | Tokenizer_state.Script_data_escape_start
    | Tokenizer_state.Script_data_escape_start_dash
    | Tokenizer_state.Script_data_escaped
    | Tokenizer_state.Script_data_escaped_dash
    | Tokenizer_state.Script_data_escaped_dash_dash ->
      error t "eof-in-script-html-comment-like-text";
      emit_eof ()
    | Tokenizer_state.Script_data_escaped_less_than_sign ->
      emit_char t '<';
      emit_eof ()
    | Tokenizer_state.Script_data_escaped_end_tag_open ->
      emit_str t "</";
      emit_eof ()
    | Tokenizer_state.Script_data_escaped_end_tag_name ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      emit_eof ()
    | Tokenizer_state.Script_data_double_escape_start
    | Tokenizer_state.Script_data_double_escaped
    | Tokenizer_state.Script_data_double_escaped_dash
    | Tokenizer_state.Script_data_double_escaped_dash_dash ->
      error t "eof-in-script-html-comment-like-text";
      emit_eof ()
    | Tokenizer_state.Script_data_double_escaped_less_than_sign ->
      (* '<' was already emitted when entering this state from Script_data_double_escaped *)
      emit_eof ()
    | Tokenizer_state.Script_data_double_escape_end ->
      emit_eof ()
    | Tokenizer_state.Plaintext ->
      emit_eof ()
    | Tokenizer_state.Comment_start
    | Tokenizer_state.Comment_start_dash
    | Tokenizer_state.Comment
    | Tokenizer_state.Comment_less_than_sign
    | Tokenizer_state.Comment_less_than_sign_bang
    | Tokenizer_state.Comment_less_than_sign_bang_dash
    | Tokenizer_state.Comment_less_than_sign_bang_dash_dash
    | Tokenizer_state.Comment_end_dash
    | Tokenizer_state.Comment_end
    | Tokenizer_state.Comment_end_bang ->
      error t "eof-in-comment";
      emit_current_comment ();
      emit_eof ()
    | Tokenizer_state.Bogus_comment ->
      emit_current_comment ();
      emit_eof ()
    | Tokenizer_state.Markup_declaration_open ->
      error t "incorrectly-opened-comment";
      Buffer.clear t.current_comment;
      emit_current_comment ();
      emit_eof ()
    | Tokenizer_state.Doctype
    | Tokenizer_state.Before_doctype_name ->
      error t "eof-in-doctype";
      start_new_doctype t;
      t.current_doctype_force_quirks <- true;
      emit_current_doctype ();
      emit_eof ()
    | Tokenizer_state.Doctype_name
    | Tokenizer_state.After_doctype_name
    | Tokenizer_state.After_doctype_public_keyword
    | Tokenizer_state.Before_doctype_public_identifier
    | Tokenizer_state.Doctype_public_identifier_double_quoted
    | Tokenizer_state.Doctype_public_identifier_single_quoted
    | Tokenizer_state.After_doctype_public_identifier
    | Tokenizer_state.Between_doctype_public_and_system_identifiers
    | Tokenizer_state.After_doctype_system_keyword
    | Tokenizer_state.Before_doctype_system_identifier
    | Tokenizer_state.Doctype_system_identifier_double_quoted
    | Tokenizer_state.Doctype_system_identifier_single_quoted
    | Tokenizer_state.After_doctype_system_identifier ->
      error t "eof-in-doctype";
      t.current_doctype_force_quirks <- true;
      emit_current_doctype ();
      emit_eof ()
    | Tokenizer_state.Bogus_doctype ->
      emit_current_doctype ();
      emit_eof ()
    | Tokenizer_state.Cdata_section ->
      error t "eof-in-cdata";
      emit_eof ()
    | Tokenizer_state.Cdata_section_bracket ->
      error t "eof-in-cdata";
      emit_char t ']';
      emit_eof ()
    | Tokenizer_state.Cdata_section_end ->
      error t "eof-in-cdata";
      emit_str t "]]";
      emit_eof ()
    | Tokenizer_state.Character_reference ->
      (* state_character_reference never ran, so initialize temp_buffer with & *)
      Buffer.clear t.temp_buffer;
      Buffer.add_char t.temp_buffer '&';
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state;
      handle_eof ()
    | Tokenizer_state.Named_character_reference ->
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state;
      handle_eof ()
    | Tokenizer_state.Numeric_character_reference ->
      (* At EOF with just "&#" - no digits follow *)
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state;
      handle_eof ()
    | Tokenizer_state.Hexadecimal_character_reference_start
    | Tokenizer_state.Decimal_character_reference_start ->
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state;
      handle_eof ()
    | Tokenizer_state.Numeric_character_reference_end ->
      (* We have collected digits, just need to finalize the character reference *)
      step ();
      handle_eof ()
    | Tokenizer_state.Ambiguous_ampersand ->
      (* Buffer was already flushed when entering this state, just transition *)
      t.state <- t.return_state;
      handle_eof ()
    | Tokenizer_state.Hexadecimal_character_reference
    | Tokenizer_state.Decimal_character_reference ->
      (* At EOF with collected digits - convert the numeric reference *)
      error t "missing-semicolon-after-character-reference";
      let code = t.char_ref_code in
      let replacement_char = "\xEF\xBF\xBD" in
      let result =
        if code = 0 then begin
          error t "null-character-reference";
          replacement_char
        end else if code > 0x10FFFF then begin
          error t "character-reference-outside-unicode-range";
          replacement_char
        end else if code >= 0xD800 && code <= 0xDFFF then begin
          error t "surrogate-character-reference";
          replacement_char
        end else
          Entities.Numeric_ref.codepoint_to_utf8 code
      in
      Buffer.clear t.temp_buffer;
      Buffer.add_string t.temp_buffer result;
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state;
      handle_eof ()
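  (* Example: input ending in "&#x41" reaches EOF in
     Hexadecimal_character_reference with [char_ref_code = 0x41]; the branch
     above reports missing-semicolon-after-character-reference but still
     flushes "A" to the return state's destination. *)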

  and step () =
    match t.state with
    | Tokenizer_state.Data -> state_data ()
    | Tokenizer_state.Rcdata -> state_rcdata ()
    | Tokenizer_state.Rawtext -> state_rawtext ()
    | Tokenizer_state.Script_data -> state_script_data ()
    | Tokenizer_state.Plaintext -> state_plaintext ()
    | Tokenizer_state.Tag_open -> state_tag_open ()
    | Tokenizer_state.End_tag_open -> state_end_tag_open ()
    | Tokenizer_state.Tag_name -> state_tag_name ()
    | Tokenizer_state.Rcdata_less_than_sign -> state_rcdata_less_than_sign ()
    | Tokenizer_state.Rcdata_end_tag_open -> state_rcdata_end_tag_open ()
    | Tokenizer_state.Rcdata_end_tag_name -> state_rcdata_end_tag_name ()
    | Tokenizer_state.Rawtext_less_than_sign -> state_rawtext_less_than_sign ()
    | Tokenizer_state.Rawtext_end_tag_open -> state_rawtext_end_tag_open ()
    | Tokenizer_state.Rawtext_end_tag_name -> state_rawtext_end_tag_name ()
    | Tokenizer_state.Script_data_less_than_sign -> state_script_data_less_than_sign ()
    | Tokenizer_state.Script_data_end_tag_open -> state_script_data_end_tag_open ()
    | Tokenizer_state.Script_data_end_tag_name -> state_script_data_end_tag_name ()
    | Tokenizer_state.Script_data_escape_start -> state_script_data_escape_start ()
    | Tokenizer_state.Script_data_escape_start_dash -> state_script_data_escape_start_dash ()
    | Tokenizer_state.Script_data_escaped -> state_script_data_escaped ()
    | Tokenizer_state.Script_data_escaped_dash -> state_script_data_escaped_dash ()
    | Tokenizer_state.Script_data_escaped_dash_dash -> state_script_data_escaped_dash_dash ()
    | Tokenizer_state.Script_data_escaped_less_than_sign -> state_script_data_escaped_less_than_sign ()
    | Tokenizer_state.Script_data_escaped_end_tag_open -> state_script_data_escaped_end_tag_open ()
    | Tokenizer_state.Script_data_escaped_end_tag_name -> state_script_data_escaped_end_tag_name ()
    | Tokenizer_state.Script_data_double_escape_start -> state_script_data_double_escape_start ()
    | Tokenizer_state.Script_data_double_escaped -> state_script_data_double_escaped ()
    | Tokenizer_state.Script_data_double_escaped_dash -> state_script_data_double_escaped_dash ()
    | Tokenizer_state.Script_data_double_escaped_dash_dash -> state_script_data_double_escaped_dash_dash ()
    | Tokenizer_state.Script_data_double_escaped_less_than_sign -> state_script_data_double_escaped_less_than_sign ()
    | Tokenizer_state.Script_data_double_escape_end -> state_script_data_double_escape_end ()
    | Tokenizer_state.Before_attribute_name -> state_before_attribute_name ()
    | Tokenizer_state.Attribute_name -> state_attribute_name ()
    | Tokenizer_state.After_attribute_name -> state_after_attribute_name ()
    | Tokenizer_state.Before_attribute_value -> state_before_attribute_value ()
    | Tokenizer_state.Attribute_value_double_quoted -> state_attribute_value_double_quoted ()
    | Tokenizer_state.Attribute_value_single_quoted -> state_attribute_value_single_quoted ()
    | Tokenizer_state.Attribute_value_unquoted -> state_attribute_value_unquoted ()
    | Tokenizer_state.After_attribute_value_quoted -> state_after_attribute_value_quoted ()
    | Tokenizer_state.Self_closing_start_tag -> state_self_closing_start_tag ()
    | Tokenizer_state.Bogus_comment -> state_bogus_comment ()
    | Tokenizer_state.Markup_declaration_open -> state_markup_declaration_open ()
    | Tokenizer_state.Comment_start -> state_comment_start ()
    | Tokenizer_state.Comment_start_dash -> state_comment_start_dash ()
    | Tokenizer_state.Comment -> state_comment ()
    | Tokenizer_state.Comment_less_than_sign -> state_comment_less_than_sign ()
    | Tokenizer_state.Comment_less_than_sign_bang -> state_comment_less_than_sign_bang ()
    | Tokenizer_state.Comment_less_than_sign_bang_dash -> state_comment_less_than_sign_bang_dash ()
    | Tokenizer_state.Comment_less_than_sign_bang_dash_dash -> state_comment_less_than_sign_bang_dash_dash ()
    | Tokenizer_state.Comment_end_dash -> state_comment_end_dash ()
    | Tokenizer_state.Comment_end -> state_comment_end ()
    | Tokenizer_state.Comment_end_bang -> state_comment_end_bang ()
    | Tokenizer_state.Doctype -> state_doctype ()
    | Tokenizer_state.Before_doctype_name -> state_before_doctype_name ()
    | Tokenizer_state.Doctype_name -> state_doctype_name ()
    | Tokenizer_state.After_doctype_name -> state_after_doctype_name ()
    | Tokenizer_state.After_doctype_public_keyword -> state_after_doctype_public_keyword ()
    | Tokenizer_state.Before_doctype_public_identifier -> state_before_doctype_public_identifier ()
    | Tokenizer_state.Doctype_public_identifier_double_quoted -> state_doctype_public_identifier_double_quoted ()
    | Tokenizer_state.Doctype_public_identifier_single_quoted -> state_doctype_public_identifier_single_quoted ()
    | Tokenizer_state.After_doctype_public_identifier -> state_after_doctype_public_identifier ()
    | Tokenizer_state.Between_doctype_public_and_system_identifiers -> state_between_doctype_public_and_system_identifiers ()
    | Tokenizer_state.After_doctype_system_keyword -> state_after_doctype_system_keyword ()
    | Tokenizer_state.Before_doctype_system_identifier -> state_before_doctype_system_identifier ()
    | Tokenizer_state.Doctype_system_identifier_double_quoted -> state_doctype_system_identifier_double_quoted ()
    | Tokenizer_state.Doctype_system_identifier_single_quoted -> state_doctype_system_identifier_single_quoted ()
    | Tokenizer_state.After_doctype_system_identifier -> state_after_doctype_system_identifier ()
    | Tokenizer_state.Bogus_doctype -> state_bogus_doctype ()
    | Tokenizer_state.Cdata_section -> state_cdata_section ()
    | Tokenizer_state.Cdata_section_bracket -> state_cdata_section_bracket ()
    | Tokenizer_state.Cdata_section_end -> state_cdata_section_end ()
    | Tokenizer_state.Character_reference -> state_character_reference ()
    | Tokenizer_state.Named_character_reference -> state_named_character_reference ()
    | Tokenizer_state.Ambiguous_ampersand -> state_ambiguous_ampersand ()
    | Tokenizer_state.Numeric_character_reference -> state_numeric_character_reference ()
    | Tokenizer_state.Hexadecimal_character_reference_start -> state_hexadecimal_character_reference_start ()
    | Tokenizer_state.Decimal_character_reference_start -> state_decimal_character_reference_start ()
    | Tokenizer_state.Hexadecimal_character_reference -> state_hexadecimal_character_reference ()
    | Tokenizer_state.Decimal_character_reference -> state_decimal_character_reference ()
    | Tokenizer_state.Numeric_character_reference_end -> state_numeric_character_reference_end ()

  (* State implementations *)
  and state_data () =
    match Tokenizer_stream.consume t.stream with
    | Some '&' ->
      t.return_state <- Tokenizer_state.Data;
      t.state <- Tokenizer_state.Character_reference
    | Some '<' ->
      t.state <- Tokenizer_state.Tag_open
    | Some '\x00' ->
      (* Emit pending chars first, then emit null separately for proper tree builder handling *)
      emit_pending_chars ();
      error t "unexpected-null-character";
      let line, column = Tokenizer_stream.position t.stream in
      ignore (S.process t.sink (Tokenizer_token.Character "\x00") ~line ~column)
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_rcdata () =
    match Tokenizer_stream.consume t.stream with
    | Some '&' ->
      t.return_state <- Tokenizer_state.Rcdata;
      t.state <- Tokenizer_state.Character_reference
    | Some '<' ->
      t.state <- Tokenizer_state.Rcdata_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_rawtext () =
    match Tokenizer_stream.consume t.stream with
    | Some '<' ->
      t.state <- Tokenizer_state.Rawtext_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_script_data () =
    match Tokenizer_stream.consume t.stream with
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_plaintext () =
    match Tokenizer_stream.consume t.stream with
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_tag_open () =
    match Tokenizer_stream.peek t.stream with
    | Some '!' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Markup_declaration_open
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.End_tag_open
    | Some c when is_ascii_alpha c ->
      start_new_tag t Tokenizer_token.Start;
      t.state <- Tokenizer_state.Tag_name
    | Some '?' ->
      error t "unexpected-question-mark-instead-of-tag-name";
      Buffer.clear t.current_comment;
      t.state <- Tokenizer_state.Bogus_comment
    | None ->
      error t "eof-before-tag-name";
      emit_char t '<'
    | Some _ ->
      error t "invalid-first-character-of-tag-name";
      emit_char t '<';
      t.state <- Tokenizer_state.Data

  and state_end_tag_open () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Tokenizer_token.End;
      t.state <- Tokenizer_state.Tag_name
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-end-tag-name";
      t.state <- Tokenizer_state.Data
    | None ->
      error t "eof-before-tag-name";
      emit_str t "</"
    | Some _ ->
      error t "invalid-first-character-of-tag-name";
      Buffer.clear t.current_comment;
      t.state <- Tokenizer_state.Bogus_comment

  and state_tag_name () =
    match Tokenizer_stream.consume t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' ->
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' ->
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_tag_name (ascii_lower c)
    | None -> ()

  and state_rcdata_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- Tokenizer_state.Rcdata_end_tag_open
    | _ ->
      emit_char t '<';
      t.state <- Tokenizer_state.Rcdata

  and state_rcdata_end_tag_open () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Tokenizer_token.End;
      t.state <- Tokenizer_state.Rcdata_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- Tokenizer_state.Rcdata

  and state_rcdata_end_tag_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- Tokenizer_state.Rcdata

  and state_rawtext_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- Tokenizer_state.Rawtext_end_tag_open
    | _ ->
      emit_char t '<';
      t.state <- Tokenizer_state.Rawtext

  and state_rawtext_end_tag_open () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Tokenizer_token.End;
      t.state <- Tokenizer_state.Rawtext_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- Tokenizer_state.Rawtext

  and state_rawtext_end_tag_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- Tokenizer_state.Rawtext

  and state_script_data_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- Tokenizer_state.Script_data_end_tag_open
    | Some '!' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Script_data_escape_start;
      emit_str t "<!"
    | _ ->
      emit_char t '<';
      t.state <- Tokenizer_state.Script_data

  and state_script_data_end_tag_open () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Tokenizer_token.End;
      t.state <- Tokenizer_state.Script_data_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- Tokenizer_state.Script_data

  and state_script_data_end_tag_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- Tokenizer_state.Script_data

  and state_script_data_escape_start () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Script_data_escape_start_dash;
      emit_char t '-'
    | _ ->
      t.state <- Tokenizer_state.Script_data

  and state_script_data_escape_start_dash () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Script_data_escaped_dash_dash;
      emit_char t '-'
    | _ ->
      t.state <- Tokenizer_state.Script_data

  and state_script_data_escaped () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      t.state <- Tokenizer_state.Script_data_escaped_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_script_data_escaped_dash () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      t.state <- Tokenizer_state.Script_data_escaped_dash_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_escaped_dash_dash () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
    | Some '>' ->
      t.state <- Tokenizer_state.Script_data;
      emit_char t '>'
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_escaped_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- Tokenizer_state.Script_data_escaped_end_tag_open
    | Some c when is_ascii_alpha c ->
      Buffer.clear t.temp_buffer;
      emit_char t '<';
      t.state <- Tokenizer_state.Script_data_double_escape_start
    | _ ->
      emit_char t '<';
      t.state <- Tokenizer_state.Script_data_escaped

  and state_script_data_escaped_end_tag_open () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Tokenizer_token.End;
      t.state <- Tokenizer_state.Script_data_escaped_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- Tokenizer_state.Script_data_escaped

  and state_script_data_escaped_end_tag_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- Tokenizer_state.Script_data_escaped

  and state_script_data_double_escape_start () =
    match Tokenizer_stream.peek t.stream with
    | Some (('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c) ->
      Tokenizer_stream.advance t.stream;
      if Buffer.contents t.temp_buffer = "script" then
        t.state <- Tokenizer_state.Script_data_double_escaped
      else
        t.state <- Tokenizer_state.Script_data_escaped;
      emit_char t c
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer (ascii_lower c);
      emit_char t c
    | _ ->
      t.state <- Tokenizer_state.Script_data_escaped

  and state_script_data_double_escaped () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      t.state <- Tokenizer_state.Script_data_double_escaped_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign;
      emit_char t '<'
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  and state_script_data_double_escaped_dash () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      t.state <- Tokenizer_state.Script_data_double_escaped_dash_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign;
      emit_char t '<'
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- Tokenizer_state.Script_data_double_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- Tokenizer_state.Script_data_double_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_double_escaped_dash_dash () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign;
      emit_char t '<'
    | Some '>' ->
      t.state <- Tokenizer_state.Script_data;
      emit_char t '>'
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- Tokenizer_state.Script_data_double_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- Tokenizer_state.Script_data_double_escaped;
      emit_char_checked c
    | None -> ()

  and state_script_data_double_escaped_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- Tokenizer_state.Script_data_double_escape_end;
      emit_char t '/'
    | _ ->
      t.state <- Tokenizer_state.Script_data_double_escaped

  and state_script_data_double_escape_end () =
    match Tokenizer_stream.peek t.stream with
    | Some (('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c) ->
      Tokenizer_stream.advance t.stream;
      if Buffer.contents t.temp_buffer = "script" then
        t.state <- Tokenizer_state.Script_data_escaped
      else
        t.state <- Tokenizer_state.Script_data_double_escaped;
      emit_char t c
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer (ascii_lower c);
      emit_char t c
    | _ ->
      t.state <- Tokenizer_state.Script_data_double_escaped

  and state_before_attribute_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '/' | Some '>' | None ->
      t.state <- Tokenizer_state.After_attribute_name
    | Some '=' ->
      Tokenizer_stream.advance t.stream;
      error t "unexpected-equals-sign-before-attribute-name";
      start_new_attribute t;
      Buffer.add_char t.current_attr_name '=';
      t.state <- Tokenizer_state.Attribute_name
    | Some _ ->
      start_new_attribute t;
      t.state <- Tokenizer_state.Attribute_name

  and state_attribute_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.After_attribute_name
    | Some '/' | Some '>' | None ->
      t.state <- Tokenizer_state.After_attribute_name
    | Some '=' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_value
    | Some '\x00' ->
      Tokenizer_stream.advance t.stream;
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_name "\xEF\xBF\xBD"
    | Some (('"' | '\'' | '<') as c) ->
      Tokenizer_stream.advance t.stream;
      error t "unexpected-character-in-attribute-name";
      Buffer.add_char t.current_attr_name c
    | Some c ->
      Tokenizer_stream.advance t.stream;
      check_control_char c;
      Buffer.add_char t.current_attr_name (ascii_lower c)

  and state_after_attribute_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '=' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_value
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | None -> ()
    | Some _ ->
      start_new_attribute t;
      t.state <- Tokenizer_state.Attribute_name

  and state_before_attribute_value () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Attribute_value_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Attribute_value_single_quoted
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-attribute-value";
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | _ ->
      t.state <- Tokenizer_state.Attribute_value_unquoted

  and state_attribute_value_double_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '"' ->
      t.state <- Tokenizer_state.After_attribute_value_quoted
    | Some '&' ->
      t.return_state <- Tokenizer_state.Attribute_value_double_quoted;
      t.state <- Tokenizer_state.Character_reference
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_attr_value c
    | None -> ()

  and state_attribute_value_single_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '\'' ->
      t.state <- Tokenizer_state.After_attribute_value_quoted
    | Some '&' ->
      t.return_state <- Tokenizer_state.Attribute_value_single_quoted;
      t.state <- Tokenizer_state.Character_reference
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_attr_value c
    | None -> ()

  and state_attribute_value_unquoted () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '&' ->
      Tokenizer_stream.advance t.stream;
      t.return_state <- Tokenizer_state.Attribute_value_unquoted;
      t.state <- Tokenizer_state.Character_reference
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some '\x00' ->
      Tokenizer_stream.advance t.stream;
      error t "unexpected-null-character";
      Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
    | Some (('"' | '\'' | '<' | '=' | '`') as c) ->
      Tokenizer_stream.advance t.stream;
      error t "unexpected-character-in-unquoted-attribute-value";
      Buffer.add_char t.current_attr_value c
    | Some c ->
      Tokenizer_stream.advance t.stream;
      check_control_char c;
      Buffer.add_char t.current_attr_value c
    | None -> ()

  and state_after_attribute_value_quoted () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | None -> ()
    | Some _ ->
      error t "missing-whitespace-between-attributes";
      t.state <- Tokenizer_state.Before_attribute_name

  and state_self_closing_start_tag () =
    match Tokenizer_stream.peek t.stream with
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.current_tag_self_closing <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | None -> ()
    | Some _ ->
      error t "unexpected-solidus-in-tag";
      t.state <- Tokenizer_state.Before_attribute_name

  and state_bogus_comment () =
    match Tokenizer_stream.consume t.stream with
    | Some '>' ->
      t.state <- Tokenizer_state.Data;
      emit_current_comment ()
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_comment "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_comment c
    | None -> ()

  and state_markup_declaration_open () =
    if Tokenizer_stream.matches_ci t.stream "--" then begin
      ignore (Tokenizer_stream.consume_exact_ci t.stream "--");
      Buffer.clear t.current_comment;
      t.state <- Tokenizer_state.Comment_start
    end else if Tokenizer_stream.matches_ci t.stream "DOCTYPE" then begin
      ignore (Tokenizer_stream.consume_exact_ci t.stream "DOCTYPE");
      t.state <- Tokenizer_state.Doctype
    end else if Tokenizer_stream.matches_ci t.stream "[CDATA[" then begin
      ignore (Tokenizer_stream.consume_exact_ci t.stream "[CDATA[");
      (* CDATA only allowed in foreign content *)
      if S.adjusted_current_node_in_html_namespace t.sink then begin
        error t "cdata-in-html-content";
        Buffer.clear t.current_comment;
        Buffer.add_string t.current_comment "[CDATA[";
        t.state <- Tokenizer_state.Bogus_comment
      end else
        t.state <- Tokenizer_state.Cdata_section
    end else begin
      error t "incorrectly-opened-comment";
      Buffer.clear t.current_comment;
      t.state <- Tokenizer_state.Bogus_comment
    end

  and state_comment_start () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_start_dash
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "abrupt-closing-of-empty-comment";
      t.state <- Tokenizer_state.Data;
      emit_current_comment ()
    | _ ->
      t.state <- Tokenizer_state.Comment

  and state_comment_start_dash () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_end
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "abrupt-closing-of-empty-comment";
      t.state <- Tokenizer_state.Data;
      emit_current_comment ()
    | None -> ()
    | Some _ ->
      Buffer.add_char t.current_comment '-';
      t.state <- Tokenizer_state.Comment

  and state_comment () =
    match Tokenizer_stream.consume t.stream with
    | Some '<' ->
      Buffer.add_char t.current_comment '<';
      t.state <- Tokenizer_state.Comment_less_than_sign
    | Some '-' ->
      t.state <- Tokenizer_state.Comment_end_dash
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string t.current_comment "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_comment c
    | None -> ()

  and state_comment_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '!' ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_comment '!';
      t.state <- Tokenizer_state.Comment_less_than_sign_bang
    | Some '<' ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_comment '<'
    | _ ->
      t.state <- Tokenizer_state.Comment

  and state_comment_less_than_sign_bang () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash
    | _ ->
      t.state <- Tokenizer_state.Comment

  and state_comment_less_than_sign_bang_dash () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash_dash
    | _ ->
      t.state <- Tokenizer_state.Comment_end_dash

  and state_comment_less_than_sign_bang_dash_dash () =
    match Tokenizer_stream.peek t.stream with
    | Some '>' | None ->
      t.state <- Tokenizer_state.Comment_end
    | Some _ ->
      error t "nested-comment";
      t.state <- Tokenizer_state.Comment_end

  and state_comment_end_dash () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_end
    | None -> ()
    | Some _ ->
      Buffer.add_char t.current_comment '-';
      t.state <- Tokenizer_state.Comment

  and state_comment_end () =
    match Tokenizer_stream.peek t.stream with
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_comment ()
    | Some '!' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Comment_end_bang
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_comment '-'
    | None -> ()
    | Some _ ->
      Buffer.add_string t.current_comment "--";
      t.state <- Tokenizer_state.Comment

  and state_comment_end_bang () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_string t.current_comment "--!";
      t.state <- Tokenizer_state.Comment_end_dash
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "incorrectly-closed-comment";
      t.state <- Tokenizer_state.Data;
      emit_current_comment ()
    | None -> ()
    | Some _ ->
      Buffer.add_string t.current_comment "--!";
      t.state <- Tokenizer_state.Comment

  and state_doctype () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_doctype_name
    | Some '>' ->
      t.state <- Tokenizer_state.Before_doctype_name
    | None -> ()
    | Some _ ->
      error t "missing-whitespace-before-doctype-name";
      t.state <- Tokenizer_state.Before_doctype_name

1392 and state_before_doctype_name () =
1393 match Tokenizer_stream.peek t.stream with
1394 | Some ('\t' | '\n' | '\x0C' | ' ') ->
1395 Tokenizer_stream.advance t.stream
1396 | Some '\x00' ->
1397 Tokenizer_stream.advance t.stream;
1398 error t "unexpected-null-character";
1399 start_new_doctype t;
1400 t.current_doctype_name <- Some (Buffer.create 8);
1401 Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD";
1402 t.state <- Tokenizer_state.Doctype_name
1403 | Some '>' ->
1404 Tokenizer_stream.advance t.stream;
1405 error t "missing-doctype-name";
1406 start_new_doctype t;
1407 t.current_doctype_force_quirks <- true;
1408 t.state <- Tokenizer_state.Data;
1409 emit_current_doctype ()
1410 | None -> ()
1411 | Some c ->
1412 Tokenizer_stream.advance t.stream;
1413 check_control_char c;
1414 start_new_doctype t;
1415 t.current_doctype_name <- Some (Buffer.create 8);
1416 Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c);
1417 t.state <- Tokenizer_state.Doctype_name
1418
1419 and state_doctype_name () =
1420 match Tokenizer_stream.consume t.stream with
1421 | Some ('\t' | '\n' | '\x0C' | ' ') ->
1422 t.state <- Tokenizer_state.After_doctype_name
1423 | Some '>' ->
1424 t.state <- Tokenizer_state.Data;
1425 emit_current_doctype ()
1426 | Some '\x00' ->
1427 error t "unexpected-null-character";
1428 Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"
1429 | Some c ->
1430 check_control_char c;
1431 Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c)
1432 | None -> ()
1433
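  (* After the name, only the case-insensitive keywords PUBLIC or SYSTEM
     may follow, e.g.
       <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
     matches_ci gives a non-committal six-character lookahead, so a
     failed keyword match leaves the stream untouched for Bogus_doctype
     to drain. *)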
  and state_after_doctype_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      if Tokenizer_stream.matches_ci t.stream "PUBLIC" then begin
        ignore (Tokenizer_stream.consume_exact_ci t.stream "PUBLIC");
        t.state <- Tokenizer_state.After_doctype_public_keyword
      end else if Tokenizer_stream.matches_ci t.stream "SYSTEM" then begin
        ignore (Tokenizer_stream.consume_exact_ci t.stream "SYSTEM");
        t.state <- Tokenizer_state.After_doctype_system_keyword
      end else begin
        error t "invalid-character-sequence-after-doctype-name";
        t.current_doctype_force_quirks <- true;
        t.state <- Tokenizer_state.Bogus_doctype
      end

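  (* Recovery note: a quote directly after the keyword, as in
     <!DOCTYPE html PUBLIC"id">, is reported but the identifier is still
     collected; only an unquoted identifier forces quirks and falls
     through to Bogus_doctype. *)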
  and state_after_doctype_public_keyword () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_doctype_public_identifier
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-after-doctype-public-keyword";
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-after-doctype-public-keyword";
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_before_doctype_public_identifier () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_public <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      error t "missing-quote-before-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_doctype_public_identifier_double_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '"' ->
      t.state <- Tokenizer_state.After_doctype_public_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_public) c
    | None -> ()

  and state_doctype_public_identifier_single_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '\'' ->
      t.state <- Tokenizer_state.After_doctype_public_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-public-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_public) c
    | None -> ()

  and state_after_doctype_public_identifier () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Between_doctype_public_and_system_identifiers
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-between-doctype-public-and-system-identifiers";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-between-doctype-public-and-system-identifiers";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_between_doctype_public_and_system_identifiers () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_after_doctype_system_keyword () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_doctype_system_identifier
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-after-doctype-system-keyword";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-whitespace-after-doctype-system-keyword";
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_before_doctype_system_identifier () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '"' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted
    | Some '\'' ->
      Tokenizer_stream.advance t.stream;
      t.current_doctype_system <- Some (Buffer.create 32);
      t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "missing-quote-before-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Bogus_doctype

  and state_doctype_system_identifier_double_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '"' ->
      t.state <- Tokenizer_state.After_doctype_system_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_system) c
    | None -> ()

  and state_doctype_system_identifier_single_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '\'' ->
      t.state <- Tokenizer_state.After_doctype_system_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_system) c
    | None -> ()

  and state_after_doctype_system_identifier () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "unexpected-character-after-doctype-system-identifier";
      t.state <- Tokenizer_state.Bogus_doctype

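  (* Bogus DOCTYPE: swallow everything up to '>' (or EOF) and emit the
     token with whatever force_quirks flag the entry state set. Note
     that arriving from After_doctype_system_identifier deliberately
     does not force quirks, matching the spec. *)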
  and state_bogus_doctype () =
    match Tokenizer_stream.consume t.stream with
    | Some '>' ->
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some '\x00' ->
      error t "unexpected-null-character"
    | Some c ->
      check_control_char c (* Check all chars in bogus doctype *)
    | None -> ()

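  (* CDATA sections ("<![CDATA[ ... ]]>") are only tokenized as such in
     foreign (SVG/MathML) content; the sink's
     adjusted_current_node_in_html_namespace decides that at markup
     declaration open. Content passes through verbatim until "]]>"; the
     bracket states re-emit any ']' that turn out not to close the
     section, so "a]]b" inside CDATA emits exactly "a]]b". *)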
  and state_cdata_section () =
    match Tokenizer_stream.consume t.stream with
    | Some ']' ->
      t.state <- Tokenizer_state.Cdata_section_bracket
    | Some c ->
      (* CDATA emits all characters as-is, including NUL, but still checks for control chars *)
      emit_char_checked c
    | None -> ()

  and state_cdata_section_bracket () =
    match Tokenizer_stream.peek t.stream with
    | Some ']' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Cdata_section_end
    | _ ->
      emit_char t ']';
      t.state <- Tokenizer_state.Cdata_section

  and state_cdata_section_end () =
    match Tokenizer_stream.peek t.stream with
    | Some ']' ->
      Tokenizer_stream.advance t.stream;
      emit_char t ']'
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data
    | _ ->
      emit_str t "]]";
      t.state <- Tokenizer_state.Cdata_section

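  (* Character references. temp_buffer accumulates the raw "&..." text
     so that, on any failure, flush_code_points_consumed_as_char_ref can
     replay it verbatim - into pending character data or into the
     current attribute value, depending on return_state. *)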
  and state_character_reference () =
    Buffer.clear t.temp_buffer;
    Buffer.add_char t.temp_buffer '&';
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alnum c ->
      t.state <- Tokenizer_state.Named_character_reference
    | Some '#' ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer '#';
      t.state <- Tokenizer_state.Numeric_character_reference
    | _ ->
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

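  (* Named references use longest-prefix matching with the WHATWG legacy
     rules. Worked examples (in character data):
       "&not;"   -> U+00AC, no error
       "&notit;" -> "¬it;" plus missing-semicolon-after-character-reference,
                    because "not" is a legacy entity usable without ';'
     In an attribute value "&notit;" stays literal: the character after
     the matched prefix is alphanumeric, which blocks the legacy form. *)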
  and state_named_character_reference () =
    (* Collect alphanumeric characters *)
    let rec collect () =
      match Tokenizer_stream.peek t.stream with
      | Some c when is_ascii_alnum c ->
        Tokenizer_stream.advance t.stream;
        Buffer.add_char t.temp_buffer c;
        collect ()
      | _ -> ()
    in
    collect ();

    let has_semicolon =
      match Tokenizer_stream.peek t.stream with
      | Some ';' -> Tokenizer_stream.advance t.stream; Buffer.add_char t.temp_buffer ';'; true
      | _ -> false
    in

    (* Try to match an entity - the buffer contains "&name" or "&name;" *)
    let buf_contents = Buffer.contents t.temp_buffer in
    let name_start = 1 in (* Skip '&' *)
    let name_end = String.length buf_contents - (if has_semicolon then 1 else 0) in
    let entity_name = String.sub buf_contents name_start (name_end - name_start) in

    (* Try progressively shorter prefixes. A prefix only counts if it is
       the full name terminated by ';', or a legacy entity that may be
       used without one. *)
    let rec try_match len =
      if len <= 0 then None
      else
        let prefix = String.sub entity_name 0 len in
        let is_full = len = String.length entity_name in
        let would_have_semi = has_semicolon && is_full in
        if would_have_semi || Entities.is_legacy prefix then
          match Entities.lookup prefix with
          | Some decoded -> Some (decoded, len)
          | None -> try_match (len - 1)
        else
          try_match (len - 1)
    in

    match try_match (String.length entity_name) with
    | Some (decoded, matched_len) ->
      let full_match = matched_len = String.length entity_name in
      let ends_with_semi = has_semicolon && full_match in

      (* Attribute-context restriction (the historical rule) *)
      let in_attribute = match t.return_state with
        | Tokenizer_state.Attribute_value_double_quoted
        | Tokenizer_state.Attribute_value_single_quoted
        | Tokenizer_state.Attribute_value_unquoted -> true
        | _ -> false
      in

      let next_char =
        if full_match && not has_semicolon then
          Tokenizer_stream.peek t.stream
        else if not full_match then
          Some entity_name.[matched_len]
        else None
      in

      let blocked = in_attribute && not ends_with_semi &&
        match next_char with
        | Some '=' -> true
        | Some c when is_ascii_alnum c -> true
        | _ -> false
      in

      if blocked then begin
        flush_code_points_consumed_as_char_ref t;
        t.state <- t.return_state
      end else begin
        if not ends_with_semi then
          error t "missing-semicolon-after-character-reference";
        Buffer.clear t.temp_buffer;
        Buffer.add_string t.temp_buffer decoded;
        flush_code_points_consumed_as_char_ref t;
        (* Replay anything consumed beyond the matched prefix *)
        if not full_match then begin
          let unconsumed = String.sub entity_name matched_len (String.length entity_name - matched_len) in
          emit_str t unconsumed;
          (* A trailing ';' was consumed but is not part of the match *)
          if has_semicolon then
            emit_char t ';'
        end;
        t.state <- t.return_state
      end
    | None ->
      (* No match at any length. Only a ';'-terminated name is definitely
         an unknown named character reference; either way, replay the
         consumed text verbatim. *)
      if String.length entity_name > 0 && has_semicolon then
        error t "unknown-named-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

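  (* Ambiguous ampersand state from the WHATWG algorithm: alphanumerics
     after an unresolved '&' run pass through one at a time (to the
     attribute buffer or to character data, per return_state); a ';'
     here signals an unknown named reference. *)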
  and state_ambiguous_ampersand () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alnum c ->
      Tokenizer_stream.advance t.stream;
      (match t.return_state with
       | Tokenizer_state.Attribute_value_double_quoted
       | Tokenizer_state.Attribute_value_single_quoted
       | Tokenizer_state.Attribute_value_unquoted ->
         Buffer.add_char t.current_attr_value c
       | _ ->
         emit_char t c)
    | Some ';' ->
      error t "unknown-named-character-reference";
      t.state <- t.return_state
    | _ ->
      t.state <- t.return_state

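  (* Numeric references: "&#x41;" and "&#65;" both decode to "A". The
     accumulator saturates just above 0x10FFFF, so arbitrarily long
     digit runs cannot overflow and are later reported as
     out-of-range. *)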
  and state_numeric_character_reference () =
    t.char_ref_code <- 0;
    match Tokenizer_stream.peek t.stream with
    | Some (('x' | 'X') as c) ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer c;
      t.state <- Tokenizer_state.Hexadecimal_character_reference_start
    | _ ->
      t.state <- Tokenizer_state.Decimal_character_reference_start

  and state_hexadecimal_character_reference_start () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_hex c ->
      t.state <- Tokenizer_state.Hexadecimal_character_reference
    | _ ->
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

  and state_decimal_character_reference_start () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_digit c ->
      t.state <- Tokenizer_state.Decimal_character_reference
    | _ ->
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

  and state_hexadecimal_character_reference () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_hex c ->
      Tokenizer_stream.advance t.stream;
      (* Digit value: '0'-'9' -> 0-9, 'a'/'A'-'f'/'F' -> 10-15 *)
      let v =
        if is_ascii_digit c then Char.code c - Char.code '0'
        else Char.code (ascii_lower c) - Char.code 'a' + 10
      in
      t.char_ref_code <- t.char_ref_code * 16 + v;
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some ';' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Numeric_character_reference_end
    | _ ->
      error t "missing-semicolon-after-character-reference";
      t.state <- Tokenizer_state.Numeric_character_reference_end

  and state_decimal_character_reference () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_digit c ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 10 + (Char.code c - Char.code '0');
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some ';' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Numeric_character_reference_end
    | _ ->
      error t "missing-semicolon-after-character-reference";
      t.state <- Tokenizer_state.Numeric_character_reference_end

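  (* Post-decode sanitisation per the spec: NUL, surrogates and
     out-of-range values all become U+FFFD; 0x80-0x9F go through the
     Windows-1252 table (e.g. "&#x80;" decodes to U+20AC EURO SIGN);
     noncharacters and other stray controls decode as themselves but are
     reported. *)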
  and state_numeric_character_reference_end () =
    let code = t.char_ref_code in
    let replacement_char = "\xEF\xBF\xBD" in

    let result =
      if code = 0 then begin
        error t "null-character-reference";
        replacement_char
      end else if code > 0x10FFFF then begin
        error t (Printf.sprintf "character-reference-outside-unicode-range:%x" code);
        replacement_char
      end else if code >= 0xD800 && code <= 0xDFFF then begin
        error t (Printf.sprintf "surrogate-character-reference:%04x" code);
        replacement_char
      end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
                  (* Noncharacters end in 0xFFFE or 0xFFFF in each plane (0-16).
                     O(1) bitwise check instead of O(n) list membership. *)
                  (let low16 = code land 0xFFFF in low16 = 0xFFFE || low16 = 0xFFFF) then begin
        error t (Printf.sprintf "noncharacter-character-reference:%05x" code);
        Entities.Numeric_ref.codepoint_to_utf8 code
      end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
                  (code >= 0x0D && code <= 0x1F) ||
                  (code >= 0x7F && code <= 0x9F) then begin
        error t (Printf.sprintf "control-character-reference:%04x" code);
        (* Apply the Windows-1252 replacement table for 0x80-0x9F *)
        match Entities.Numeric_ref.find_replacement code with
        | Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
        | None -> Entities.Numeric_ref.codepoint_to_utf8 code
      end else
        Entities.Numeric_ref.codepoint_to_utf8 code
    in

    Buffer.clear t.temp_buffer;
    Buffer.add_string t.temp_buffer result;
    flush_code_points_consumed_as_char_ref t;
    t.state <- t.return_state

  in
  process_state ()

(* Errors are accumulated most-recent-first; reverse to source order. *)
let get_errors t = List.rev t.errors

(* Used by the tree builder to switch the tokenizer, e.g. into Rcdata or
   Rawtext after seeing <title> or <script>. *)
let set_state t state = t.state <- state

(* Records the most recent start tag name for the "appropriate end tag"
   check in the RCDATA/RAWTEXT/script-data end-tag states. *)
let set_last_start_tag t name = t.last_start_tag <- name