(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *)

(* Character classification using Astring *)
let is_ascii_alpha = Astring.Char.Ascii.is_letter
let is_ascii_digit = Astring.Char.Ascii.is_digit
let is_ascii_hex = Astring.Char.Ascii.is_hex_digit
let is_ascii_alnum = Astring.Char.Ascii.is_alphanum

(* Space, tab, LF, FF and CR. *)
let is_whitespace = function
  | ' ' | '\t' | '\n' | '\x0C' | '\r' -> true
  | _ -> false

let ascii_lower = Astring.Char.Ascii.lowercase

(* Token sink interface.  [process] receives each token together with the
   current stream position and may request a tokenizer state switch (the
   tree builder does this e.g. for RAWTEXT/RCDATA elements). *)
module type SINK = sig
  type t
  val process : t -> Tokenizer_token.t -> line:int -> column:int -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
  val adjusted_current_node_in_html_namespace : t -> bool
end

(* Mutable tokenizer state.  Buffers accumulate the token currently being
   built; [pending_chars] coalesces character data until a non-character
   token forces a flush. *)
type 'sink t = {
  mutable stream : Tokenizer_stream.t;
  sink : 'sink;
  mutable state : Tokenizer_state.t;
  mutable return_state : Tokenizer_state.t;  (* state to resume after a character reference *)
  mutable char_ref_code : int;               (* accumulator for numeric character references *)
  mutable temp_buffer : Buffer.t;
  mutable last_start_tag : string;           (* used for "appropriate end tag" checks *)
  mutable current_tag_name : Buffer.t;
  mutable current_tag_kind : Tokenizer_token.tag_kind;
  mutable current_tag_self_closing : bool;
  mutable current_attr_name : Buffer.t;
  mutable current_attr_value : Buffer.t;
  mutable current_attrs : (string * string) list;  (* accumulated in reverse order *)
  mutable current_doctype_name : Buffer.t option;
  mutable current_doctype_public : Buffer.t option;
  mutable current_doctype_system : Buffer.t option;
  mutable current_doctype_force_quirks : bool;
  mutable current_comment : Buffer.t;
  mutable pending_chars : Buffer.t;
  mutable errors : Tokenizer_errors.t list;  (* newest first *)
  collect_errors : bool;
  xml_mode : bool; (* XML violation mode: transform chars for XML compatibility *)
}

(* [create (module S) sink ()] builds a fresh tokenizer bound to [sink].
   [collect_errors] enables parse-error recording; [xml_mode] enables the
   XML-compatibility character transforms.  The stream is a placeholder
   until [run] installs the real reader. *)
let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) ?(xml_mode=false) () = {
  stream = Tokenizer_stream.create "";
  sink;
  state = Tokenizer_state.Data;
  return_state = Tokenizer_state.Data;
  char_ref_code = 0;
  temp_buffer = Buffer.create 64;
  last_start_tag = "";
  current_tag_name = Buffer.create 32;
  current_tag_kind = Tokenizer_token.Start;
  current_tag_self_closing = false;
  current_attr_name = Buffer.create 32;
  current_attr_value = Buffer.create 64;
  current_attrs = [];
  current_doctype_name = None;
  current_doctype_public = None;
  current_doctype_system = None;
  current_doctype_force_quirks = false;
  current_comment = Buffer.create 64;
  pending_chars = Buffer.create 256;
  errors = [];
  collect_errors;
  xml_mode;
}

(* Record a parse error at the current stream position.  A no-op unless
   error collection was requested at [create] time. *)
let error t code =
  if t.collect_errors then
    let line, column = Tokenizer_stream.position t.stream in
    t.errors <- Tokenizer_errors.make ~code ~line ~column :: t.errors

(* emit functions are defined locally inside run *)

(* Queue one character of character data.
   XML violation mode maps form feed to space. *)
let emit_char t c =
  let out = if t.xml_mode && c = '\x0C' then ' ' else c in
  Buffer.add_char t.pending_chars out

(* Queue a string of character data.  XML violation mode rewrites the
   UTF-8 bytes of U+FFFF (\xEF\xBF\xBF) to U+FFFD (\xEF\xBF\xBD) and form
   feed to space; otherwise the string is appended untouched. *)
let emit_str t s =
  if not t.xml_mode then
    Buffer.add_string t.pending_chars s
  else begin
    let len = String.length s in
    let rec go i =
      if i < len then
        match s.[i] with
        | '\x0C' ->
          Buffer.add_char t.pending_chars ' ';
          go (i + 1)
        | '\xEF' when i + 2 < len && s.[i + 1] = '\xBF' && s.[i + 2] = '\xBF' ->
          (* U+FFFF -> U+FFFD *)
          Buffer.add_string t.pending_chars "\xEF\xBF\xBD";
          go (i + 3)
        | c ->
          Buffer.add_char t.pending_chars c;
          go (i + 1)
    in
    go 0
  end

(* Reset per-tag accumulators for a new start or end tag token. *)
let start_new_tag t kind =
  Buffer.clear t.current_tag_name;
  t.current_tag_kind <- kind;
  t.current_tag_self_closing <- false;
  t.current_attrs <- []

(* Commit the attribute accumulated so far (if any) and reset the
   name/value buffers.  A repeated attribute name is dropped and reported
   as "duplicate-attribute", per the spec. *)
let start_new_attribute t =
  let name = Buffer.contents t.current_attr_name in
  if name <> "" then begin
    if List.mem_assoc name t.current_attrs then
      error t "duplicate-attribute"
    else
      t.current_attrs <- (name, Buffer.contents t.current_attr_value) :: t.current_attrs
  end;
  Buffer.clear t.current_attr_name;
  Buffer.clear t.current_attr_value

(* Alias: committing the last attribute is the same operation as starting
   a new one. *)
let finish_attribute t =
  start_new_attribute t

(* Reset all DOCTYPE accumulators. *)
let start_new_doctype t =
  t.current_doctype_name <- None;
  t.current_doctype_public <- None;
  t.current_doctype_system <- None;
  t.current_doctype_force_quirks <- false

(* emit_current_tag, emit_current_doctype, emit_current_comment are defined locally inside run *)

(* An end tag is "appropriate" when it matches the last emitted start tag
   (used by RCDATA/RAWTEXT/script end-tag-name states). *)
let is_appropriate_end_tag t =
  t.last_start_tag <> ""
  && Buffer.contents t.current_tag_name = t.last_start_tag

(* Flush the temp buffer accumulated while scanning a character reference:
   into the current attribute value when the reference appeared inside an
   attribute, otherwise out as character data. *)
let flush_code_points_consumed_as_char_ref t =
  let s = Buffer.contents t.temp_buffer in
  let in_attr_value =
    match t.return_state with
    | Tokenizer_state.Attribute_value_double_quoted
    | Tokenizer_state.Attribute_value_single_quoted
    | Tokenizer_state.Attribute_value_unquoted -> true
    | _ -> false
  in
  if in_attr_value then
    Buffer.add_string t.current_attr_value s
  else
    emit_str t s

open Bytesrw

(* Main tokenization loop *)
let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) =
  t.stream <- Tokenizer_stream.create_from_reader reader;
  t.errors <- [];
  (* Set up error callback for surrogate/noncharacter detection in stream *)
  (* In XML mode, we don't report noncharacter errors - we transform them instead *)
  if not t.xml_mode then
    Tokenizer_stream.set_error_callback t.stream (fun code -> error t code);

  (* XML mode transformation for pending chars: U+FFFF -> U+FFFD *)
  let transform_xml_chars data =
    let len = String.length data in
    let buf = Buffer.create len in
    let i = ref 0 in
    while !i < len do
      let c = data.[!i] in
      if c = '\xEF' &&
!i + 2 < len && data.[!i+1] = '\xBF' && data.[!i+2] = '\xBF' then begin 170 (* U+FFFF → U+FFFD *) 171 Buffer.add_string buf "\xEF\xBF\xBD"; 172 i := !i + 3 173 end else begin 174 Buffer.add_char buf c; 175 incr i 176 end 177 done; 178 Buffer.contents buf 179 in 180 181 (* Local emit functions with access to S *) 182 let emit_pending_chars () = 183 if Buffer.length t.pending_chars > 0 then begin 184 let data = Buffer.contents t.pending_chars in 185 Buffer.clear t.pending_chars; 186 let data = if t.xml_mode then transform_xml_chars data else data in 187 let line, column = Tokenizer_stream.position t.stream in 188 ignore (S.process t.sink (Tokenizer_token.Character data) ~line ~column) 189 end 190 in 191 192 let emit token = 193 emit_pending_chars (); 194 let line, column = Tokenizer_stream.position t.stream in 195 match S.process t.sink token ~line ~column with 196 | `Continue -> () 197 | `SwitchTo new_state -> t.state <- new_state 198 in 199 200 let emit_current_tag () = 201 finish_attribute t; 202 let name = Buffer.contents t.current_tag_name in 203 let attrs = List.rev t.current_attrs in 204 (* Check for end tag with attributes or self-closing flag *) 205 if t.current_tag_kind = Tokenizer_token.End then begin 206 if attrs <> [] then 207 error t "end-tag-with-attributes"; 208 if t.current_tag_self_closing then 209 error t "end-tag-with-trailing-solidus" 210 end; 211 let tag = { 212 Tokenizer_token.kind = t.current_tag_kind; 213 name; 214 attrs; 215 self_closing = t.current_tag_self_closing; 216 } in 217 if t.current_tag_kind = Tokenizer_token.Start then 218 t.last_start_tag <- name; 219 emit (Tokenizer_token.Tag tag) 220 in 221 222 let emit_current_doctype () = 223 let doctype = { 224 Tokenizer_token.name = Option.map Buffer.contents t.current_doctype_name; 225 public_id = Option.map Buffer.contents t.current_doctype_public; 226 system_id = Option.map Buffer.contents t.current_doctype_system; 227 force_quirks = t.current_doctype_force_quirks; 228 } in 229 emit 
(Tokenizer_token.Doctype doctype) 230 in 231 232 let emit_current_comment () = 233 let content = Buffer.contents t.current_comment in 234 let content = 235 if t.xml_mode then begin 236 (* XML mode: transform -- to - - in comments *) 237 let buf = Buffer.create (String.length content + 10) in 238 let len = String.length content in 239 let i = ref 0 in 240 while !i < len do 241 if !i + 1 < len && content.[!i] = '-' && content.[!i+1] = '-' then begin 242 Buffer.add_string buf "- -"; 243 i := !i + 2 244 end else begin 245 Buffer.add_char buf content.[!i]; 246 incr i 247 end 248 done; 249 Buffer.contents buf 250 end else content 251 in 252 emit (Tokenizer_token.Comment content) 253 in 254 255 (* Check for control characters and emit error if needed *) 256 (* Only checks ASCII control chars; C1 controls (U+0080-U+009F) are 2-byte in UTF-8 *) 257 let check_control_char c = 258 let code = Char.code c in 259 (* Control chars: U+0001-U+0008, U+000B, U+000E-U+001F, U+007F *) 260 (* Allowed: U+0009 (tab), U+000A (LF), U+000C (FF), U+000D (CR) *) 261 (* Note: U+0080-U+009F (C1 controls) are 2-byte UTF-8 sequences starting with 0xC2 *) 262 (* Note: We only check single-byte control chars here; multi-byte checks are TODO *) 263 if (code >= 0x01 && code <= 0x08) || 264 code = 0x0B || 265 (code >= 0x0E && code <= 0x1F) || 266 code = 0x7F then 267 error t (Printf.sprintf "control-character-in-input-stream:%04x" code) 268 in 269 270 271 (* Emit char with control character check *) 272 let emit_char_checked c = 273 check_control_char c; 274 emit_char t c 275 in 276 277 let rec process_state () = 278 if Tokenizer_stream.is_eof t.stream && t.state <> Tokenizer_state.Data then begin 279 (* Handle EOF in various states *) 280 handle_eof () 281 end else if Tokenizer_stream.is_eof t.stream then begin 282 emit_pending_chars (); 283 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 284 end else begin 285 step (); 286 
process_state () 287 end 288 289 and handle_eof () = 290 match t.state with 291 | Tokenizer_state.Data -> 292 emit_pending_chars (); 293 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 294 | Tokenizer_state.Tag_open -> 295 error t "eof-before-tag-name"; 296 emit_char t '<'; 297 emit_pending_chars (); 298 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 299 | Tokenizer_state.End_tag_open -> 300 error t "eof-before-tag-name"; 301 emit_str t "</"; 302 emit_pending_chars (); 303 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 304 | Tokenizer_state.Tag_name 305 | Tokenizer_state.Before_attribute_name 306 | Tokenizer_state.Attribute_name 307 | Tokenizer_state.After_attribute_name 308 | Tokenizer_state.Before_attribute_value 309 | Tokenizer_state.Attribute_value_double_quoted 310 | Tokenizer_state.Attribute_value_single_quoted 311 | Tokenizer_state.Attribute_value_unquoted 312 | Tokenizer_state.After_attribute_value_quoted 313 | Tokenizer_state.Self_closing_start_tag -> 314 error t "eof-in-tag"; 315 emit_pending_chars (); 316 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 317 | Tokenizer_state.Rawtext -> 318 emit_pending_chars (); 319 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 320 | Tokenizer_state.Rawtext_less_than_sign -> 321 emit_char t '<'; 322 emit_pending_chars (); 323 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 324 | Tokenizer_state.Rawtext_end_tag_open -> 325 emit_str t "</"; 326 emit_pending_chars (); 327 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 328 | 
Tokenizer_state.Rawtext_end_tag_name -> 329 emit_str t "</"; 330 emit_str t (Buffer.contents t.temp_buffer); 331 emit_pending_chars (); 332 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 333 | Tokenizer_state.Rcdata -> 334 emit_pending_chars (); 335 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 336 | Tokenizer_state.Rcdata_less_than_sign -> 337 emit_char t '<'; 338 emit_pending_chars (); 339 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 340 | Tokenizer_state.Rcdata_end_tag_open -> 341 emit_str t "</"; 342 emit_pending_chars (); 343 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 344 | Tokenizer_state.Rcdata_end_tag_name -> 345 emit_str t "</"; 346 emit_str t (Buffer.contents t.temp_buffer); 347 emit_pending_chars (); 348 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 349 | Tokenizer_state.Script_data -> 350 emit_pending_chars (); 351 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 352 | Tokenizer_state.Script_data_less_than_sign -> 353 emit_char t '<'; 354 emit_pending_chars (); 355 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 356 | Tokenizer_state.Script_data_end_tag_open -> 357 emit_str t "</"; 358 emit_pending_chars (); 359 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 360 | Tokenizer_state.Script_data_end_tag_name -> 361 emit_str t "</"; 362 emit_str t (Buffer.contents t.temp_buffer); 363 emit_pending_chars (); 364 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink 
Tokenizer_token.EOF ~line ~column) 365 | Tokenizer_state.Script_data_escape_start 366 | Tokenizer_state.Script_data_escape_start_dash 367 | Tokenizer_state.Script_data_escaped 368 | Tokenizer_state.Script_data_escaped_dash 369 | Tokenizer_state.Script_data_escaped_dash_dash -> 370 error t "eof-in-script-html-comment-like-text"; 371 emit_pending_chars (); 372 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 373 | Tokenizer_state.Script_data_escaped_less_than_sign -> 374 emit_char t '<'; 375 emit_pending_chars (); 376 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 377 | Tokenizer_state.Script_data_escaped_end_tag_open -> 378 emit_str t "</"; 379 emit_pending_chars (); 380 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 381 | Tokenizer_state.Script_data_escaped_end_tag_name -> 382 emit_str t "</"; 383 emit_str t (Buffer.contents t.temp_buffer); 384 emit_pending_chars (); 385 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 386 | Tokenizer_state.Script_data_double_escape_start 387 | Tokenizer_state.Script_data_double_escaped 388 | Tokenizer_state.Script_data_double_escaped_dash 389 | Tokenizer_state.Script_data_double_escaped_dash_dash -> 390 error t "eof-in-script-html-comment-like-text"; 391 emit_pending_chars (); 392 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 393 | Tokenizer_state.Script_data_double_escaped_less_than_sign -> 394 (* '<' was already emitted when entering this state from Script_data_double_escaped *) 395 emit_pending_chars (); 396 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 397 | Tokenizer_state.Script_data_double_escape_end -> 398 
emit_pending_chars (); 399 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 400 | Tokenizer_state.Plaintext -> 401 emit_pending_chars (); 402 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 403 | Tokenizer_state.Comment_start 404 | Tokenizer_state.Comment_start_dash 405 | Tokenizer_state.Comment 406 | Tokenizer_state.Comment_less_than_sign 407 | Tokenizer_state.Comment_less_than_sign_bang 408 | Tokenizer_state.Comment_less_than_sign_bang_dash 409 | Tokenizer_state.Comment_less_than_sign_bang_dash_dash 410 | Tokenizer_state.Comment_end_dash 411 | Tokenizer_state.Comment_end 412 | Tokenizer_state.Comment_end_bang -> 413 error t "eof-in-comment"; 414 emit_current_comment (); 415 emit_pending_chars (); 416 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 417 | Tokenizer_state.Bogus_comment -> 418 emit_current_comment (); 419 emit_pending_chars (); 420 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 421 | Tokenizer_state.Markup_declaration_open -> 422 error t "incorrectly-opened-comment"; 423 Buffer.clear t.current_comment; 424 emit_current_comment (); 425 emit_pending_chars (); 426 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 427 | Tokenizer_state.Doctype 428 | Tokenizer_state.Before_doctype_name -> 429 error t "eof-in-doctype"; 430 start_new_doctype t; 431 t.current_doctype_force_quirks <- true; 432 emit_current_doctype (); 433 emit_pending_chars (); 434 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 435 | Tokenizer_state.Doctype_name 436 | Tokenizer_state.After_doctype_name 437 | Tokenizer_state.After_doctype_public_keyword 438 | 
Tokenizer_state.Before_doctype_public_identifier 439 | Tokenizer_state.Doctype_public_identifier_double_quoted 440 | Tokenizer_state.Doctype_public_identifier_single_quoted 441 | Tokenizer_state.After_doctype_public_identifier 442 | Tokenizer_state.Between_doctype_public_and_system_identifiers 443 | Tokenizer_state.After_doctype_system_keyword 444 | Tokenizer_state.Before_doctype_system_identifier 445 | Tokenizer_state.Doctype_system_identifier_double_quoted 446 | Tokenizer_state.Doctype_system_identifier_single_quoted 447 | Tokenizer_state.After_doctype_system_identifier -> 448 error t "eof-in-doctype"; 449 t.current_doctype_force_quirks <- true; 450 emit_current_doctype (); 451 emit_pending_chars (); 452 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 453 | Tokenizer_state.Bogus_doctype -> 454 emit_current_doctype (); 455 emit_pending_chars (); 456 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 457 | Tokenizer_state.Cdata_section -> 458 error t "eof-in-cdata"; 459 emit_pending_chars (); 460 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 461 | Tokenizer_state.Cdata_section_bracket -> 462 error t "eof-in-cdata"; 463 emit_char t ']'; 464 emit_pending_chars (); 465 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 466 | Tokenizer_state.Cdata_section_end -> 467 error t "eof-in-cdata"; 468 emit_str t "]]"; 469 emit_pending_chars (); 470 let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column) 471 | Tokenizer_state.Character_reference -> 472 (* state_character_reference never ran, so initialize temp_buffer with & *) 473 Buffer.clear t.temp_buffer; 474 Buffer.add_char t.temp_buffer '&'; 475 flush_code_points_consumed_as_char_ref t; 476 
t.state <- t.return_state; 477 handle_eof () 478 | Tokenizer_state.Named_character_reference -> 479 flush_code_points_consumed_as_char_ref t; 480 t.state <- t.return_state; 481 handle_eof () 482 | Tokenizer_state.Numeric_character_reference -> 483 (* At EOF with just "&#" - no digits follow *) 484 error t "absence-of-digits-in-numeric-character-reference"; 485 flush_code_points_consumed_as_char_ref t; 486 t.state <- t.return_state; 487 handle_eof () 488 | Tokenizer_state.Hexadecimal_character_reference_start 489 | Tokenizer_state.Decimal_character_reference_start -> 490 error t "absence-of-digits-in-numeric-character-reference"; 491 flush_code_points_consumed_as_char_ref t; 492 t.state <- t.return_state; 493 handle_eof () 494 | Tokenizer_state.Numeric_character_reference_end -> 495 (* We have collected digits, just need to finalize the character reference *) 496 step (); 497 handle_eof () 498 | Tokenizer_state.Ambiguous_ampersand -> 499 (* Buffer was already flushed when entering this state, just transition *) 500 t.state <- t.return_state; 501 handle_eof () 502 | Tokenizer_state.Hexadecimal_character_reference 503 | Tokenizer_state.Decimal_character_reference -> 504 (* At EOF with collected digits - convert the numeric reference *) 505 error t "missing-semicolon-after-character-reference"; 506 let code = t.char_ref_code in 507 let replacement_char = "\xEF\xBF\xBD" in 508 let result = 509 if code = 0 then begin 510 error t "null-character-reference"; 511 replacement_char 512 end else if code > 0x10FFFF then begin 513 error t "character-reference-outside-unicode-range"; 514 replacement_char 515 end else if code >= 0xD800 && code <= 0xDFFF then begin 516 error t "surrogate-character-reference"; 517 replacement_char 518 end else 519 Entities.Numeric_ref.codepoint_to_utf8 code 520 in 521 Buffer.clear t.temp_buffer; 522 Buffer.add_string t.temp_buffer result; 523 flush_code_points_consumed_as_char_ref t; 524 t.state <- t.return_state; 525 handle_eof () 526 527 and step 
() = 528 match t.state with 529 | Tokenizer_state.Data -> state_data () 530 | Tokenizer_state.Rcdata -> state_rcdata () 531 | Tokenizer_state.Rawtext -> state_rawtext () 532 | Tokenizer_state.Script_data -> state_script_data () 533 | Tokenizer_state.Plaintext -> state_plaintext () 534 | Tokenizer_state.Tag_open -> state_tag_open () 535 | Tokenizer_state.End_tag_open -> state_end_tag_open () 536 | Tokenizer_state.Tag_name -> state_tag_name () 537 | Tokenizer_state.Rcdata_less_than_sign -> state_rcdata_less_than_sign () 538 | Tokenizer_state.Rcdata_end_tag_open -> state_rcdata_end_tag_open () 539 | Tokenizer_state.Rcdata_end_tag_name -> state_rcdata_end_tag_name () 540 | Tokenizer_state.Rawtext_less_than_sign -> state_rawtext_less_than_sign () 541 | Tokenizer_state.Rawtext_end_tag_open -> state_rawtext_end_tag_open () 542 | Tokenizer_state.Rawtext_end_tag_name -> state_rawtext_end_tag_name () 543 | Tokenizer_state.Script_data_less_than_sign -> state_script_data_less_than_sign () 544 | Tokenizer_state.Script_data_end_tag_open -> state_script_data_end_tag_open () 545 | Tokenizer_state.Script_data_end_tag_name -> state_script_data_end_tag_name () 546 | Tokenizer_state.Script_data_escape_start -> state_script_data_escape_start () 547 | Tokenizer_state.Script_data_escape_start_dash -> state_script_data_escape_start_dash () 548 | Tokenizer_state.Script_data_escaped -> state_script_data_escaped () 549 | Tokenizer_state.Script_data_escaped_dash -> state_script_data_escaped_dash () 550 | Tokenizer_state.Script_data_escaped_dash_dash -> state_script_data_escaped_dash_dash () 551 | Tokenizer_state.Script_data_escaped_less_than_sign -> state_script_data_escaped_less_than_sign () 552 | Tokenizer_state.Script_data_escaped_end_tag_open -> state_script_data_escaped_end_tag_open () 553 | Tokenizer_state.Script_data_escaped_end_tag_name -> state_script_data_escaped_end_tag_name () 554 | Tokenizer_state.Script_data_double_escape_start -> state_script_data_double_escape_start () 555 | 
Tokenizer_state.Script_data_double_escaped -> state_script_data_double_escaped () 556 | Tokenizer_state.Script_data_double_escaped_dash -> state_script_data_double_escaped_dash () 557 | Tokenizer_state.Script_data_double_escaped_dash_dash -> state_script_data_double_escaped_dash_dash () 558 | Tokenizer_state.Script_data_double_escaped_less_than_sign -> state_script_data_double_escaped_less_than_sign () 559 | Tokenizer_state.Script_data_double_escape_end -> state_script_data_double_escape_end () 560 | Tokenizer_state.Before_attribute_name -> state_before_attribute_name () 561 | Tokenizer_state.Attribute_name -> state_attribute_name () 562 | Tokenizer_state.After_attribute_name -> state_after_attribute_name () 563 | Tokenizer_state.Before_attribute_value -> state_before_attribute_value () 564 | Tokenizer_state.Attribute_value_double_quoted -> state_attribute_value_double_quoted () 565 | Tokenizer_state.Attribute_value_single_quoted -> state_attribute_value_single_quoted () 566 | Tokenizer_state.Attribute_value_unquoted -> state_attribute_value_unquoted () 567 | Tokenizer_state.After_attribute_value_quoted -> state_after_attribute_value_quoted () 568 | Tokenizer_state.Self_closing_start_tag -> state_self_closing_start_tag () 569 | Tokenizer_state.Bogus_comment -> state_bogus_comment () 570 | Tokenizer_state.Markup_declaration_open -> state_markup_declaration_open () 571 | Tokenizer_state.Comment_start -> state_comment_start () 572 | Tokenizer_state.Comment_start_dash -> state_comment_start_dash () 573 | Tokenizer_state.Comment -> state_comment () 574 | Tokenizer_state.Comment_less_than_sign -> state_comment_less_than_sign () 575 | Tokenizer_state.Comment_less_than_sign_bang -> state_comment_less_than_sign_bang () 576 | Tokenizer_state.Comment_less_than_sign_bang_dash -> state_comment_less_than_sign_bang_dash () 577 | Tokenizer_state.Comment_less_than_sign_bang_dash_dash -> state_comment_less_than_sign_bang_dash_dash () 578 | Tokenizer_state.Comment_end_dash -> 
state_comment_end_dash () 579 | Tokenizer_state.Comment_end -> state_comment_end () 580 | Tokenizer_state.Comment_end_bang -> state_comment_end_bang () 581 | Tokenizer_state.Doctype -> state_doctype () 582 | Tokenizer_state.Before_doctype_name -> state_before_doctype_name () 583 | Tokenizer_state.Doctype_name -> state_doctype_name () 584 | Tokenizer_state.After_doctype_name -> state_after_doctype_name () 585 | Tokenizer_state.After_doctype_public_keyword -> state_after_doctype_public_keyword () 586 | Tokenizer_state.Before_doctype_public_identifier -> state_before_doctype_public_identifier () 587 | Tokenizer_state.Doctype_public_identifier_double_quoted -> state_doctype_public_identifier_double_quoted () 588 | Tokenizer_state.Doctype_public_identifier_single_quoted -> state_doctype_public_identifier_single_quoted () 589 | Tokenizer_state.After_doctype_public_identifier -> state_after_doctype_public_identifier () 590 | Tokenizer_state.Between_doctype_public_and_system_identifiers -> state_between_doctype_public_and_system_identifiers () 591 | Tokenizer_state.After_doctype_system_keyword -> state_after_doctype_system_keyword () 592 | Tokenizer_state.Before_doctype_system_identifier -> state_before_doctype_system_identifier () 593 | Tokenizer_state.Doctype_system_identifier_double_quoted -> state_doctype_system_identifier_double_quoted () 594 | Tokenizer_state.Doctype_system_identifier_single_quoted -> state_doctype_system_identifier_single_quoted () 595 | Tokenizer_state.After_doctype_system_identifier -> state_after_doctype_system_identifier () 596 | Tokenizer_state.Bogus_doctype -> state_bogus_doctype () 597 | Tokenizer_state.Cdata_section -> state_cdata_section () 598 | Tokenizer_state.Cdata_section_bracket -> state_cdata_section_bracket () 599 | Tokenizer_state.Cdata_section_end -> state_cdata_section_end () 600 | Tokenizer_state.Character_reference -> state_character_reference () 601 | Tokenizer_state.Named_character_reference -> state_named_character_reference 
() 602 | Tokenizer_state.Ambiguous_ampersand -> state_ambiguous_ampersand () 603 | Tokenizer_state.Numeric_character_reference -> state_numeric_character_reference () 604 | Tokenizer_state.Hexadecimal_character_reference_start -> state_hexadecimal_character_reference_start () 605 | Tokenizer_state.Decimal_character_reference_start -> state_decimal_character_reference_start () 606 | Tokenizer_state.Hexadecimal_character_reference -> state_hexadecimal_character_reference () 607 | Tokenizer_state.Decimal_character_reference -> state_decimal_character_reference () 608 | Tokenizer_state.Numeric_character_reference_end -> state_numeric_character_reference_end () 609 610 (* State implementations *) 611 and state_data () = 612 match Tokenizer_stream.consume t.stream with 613 | Some '&' -> 614 t.return_state <- Tokenizer_state.Data; 615 t.state <- Tokenizer_state.Character_reference 616 | Some '<' -> 617 t.state <- Tokenizer_state.Tag_open 618 | Some '\x00' -> 619 (* Emit pending chars first, then emit null separately for proper tree builder handling *) 620 emit_pending_chars (); 621 error t "unexpected-null-character"; 622 let line, column = Tokenizer_stream.position t.stream in 623 ignore (S.process t.sink (Tokenizer_token.Character "\x00") ~line ~column) 624 | Some c -> 625 emit_char_checked c 626 | None -> () 627 628 and state_rcdata () = 629 match Tokenizer_stream.consume t.stream with 630 | Some '&' -> 631 t.return_state <- Tokenizer_state.Rcdata; 632 t.state <- Tokenizer_state.Character_reference 633 | Some '<' -> 634 t.state <- Tokenizer_state.Rcdata_less_than_sign 635 | Some '\x00' -> 636 error t "unexpected-null-character"; 637 emit_str t "\xEF\xBF\xBD" 638 | Some c -> 639 emit_char_checked c 640 | None -> () 641 642 and state_rawtext () = 643 match Tokenizer_stream.consume t.stream with 644 | Some '<' -> 645 t.state <- Tokenizer_state.Rawtext_less_than_sign 646 | Some '\x00' -> 647 error t "unexpected-null-character"; 648 emit_str t "\xEF\xBF\xBD" 649 | Some c -> 
650 emit_char_checked c 651 | None -> () 652 653 and state_script_data () = 654 match Tokenizer_stream.consume t.stream with 655 | Some '<' -> 656 t.state <- Tokenizer_state.Script_data_less_than_sign 657 | Some '\x00' -> 658 error t "unexpected-null-character"; 659 emit_str t "\xEF\xBF\xBD" 660 | Some c -> 661 emit_char_checked c 662 | None -> () 663 664 and state_plaintext () = 665 match Tokenizer_stream.consume t.stream with 666 | Some '\x00' -> 667 error t "unexpected-null-character"; 668 emit_str t "\xEF\xBF\xBD" 669 | Some c -> 670 emit_char_checked c 671 | None -> () 672 673 and state_tag_open () = 674 match Tokenizer_stream.peek t.stream with 675 | Some '!' -> 676 Tokenizer_stream.advance t.stream; 677 t.state <- Tokenizer_state.Markup_declaration_open 678 | Some '/' -> 679 Tokenizer_stream.advance t.stream; 680 t.state <- Tokenizer_state.End_tag_open 681 | Some c when is_ascii_alpha c -> 682 start_new_tag t Tokenizer_token.Start; 683 t.state <- Tokenizer_state.Tag_name 684 | Some '?' 
->
    (* "<?" seen: parse error; the '?' and what follows become a bogus comment. *)
    error t "unexpected-question-mark-instead-of-tag-name";
    Buffer.clear t.current_comment;
    t.state <- Tokenizer_state.Bogus_comment
  | None ->
    error t "eof-before-tag-name";
    emit_char t '<'
  | Some _ ->
    error t "invalid-first-character-of-tag-name";
    emit_char t '<';
    t.state <- Tokenizer_state.Data

  (* WHATWG "End tag open state": "</" has been consumed. *)
  and state_end_tag_open () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Tokenizer_token.End;
      t.state <- Tokenizer_state.Tag_name
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      error t "missing-end-tag-name";
      t.state <- Tokenizer_state.Data
    | None ->
      error t "eof-before-tag-name";
      emit_str t "</"
    | Some _ ->
      error t "invalid-first-character-of-tag-name";
      Buffer.clear t.current_comment;
      t.state <- Tokenizer_state.Bogus_comment

  (* WHATWG "Tag name state": accumulate the (ASCII-lowercased) tag name. *)
  and state_tag_name () =
    match Tokenizer_stream.consume t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' ->
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' ->
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some '\x00' ->
      (* NUL is replaced by U+FFFD (UTF-8 "\xEF\xBF\xBD"). *)
      error t "unexpected-null-character";
      Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
    | Some c ->
      check_control_char c;
      Buffer.add_char t.current_tag_name (ascii_lower c)
    | None -> ()

  (* WHATWG "RCDATA less-than sign state". *)
  and state_rcdata_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- Tokenizer_state.Rcdata_end_tag_open
    | _ ->
      emit_char t '<';
      t.state <- Tokenizer_state.Rcdata

  (* WHATWG "RCDATA end tag open state". *)
  and state_rcdata_end_tag_open () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Tokenizer_token.End;
      t.state <- Tokenizer_state.Rcdata_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- Tokenizer_state.Rcdata

  (* WHATWG "RCDATA end tag name state".  Only an "appropriate" end tag
     (matching the last start tag) closes RCDATA; otherwise the buffered
     text is re-emitted as character data. *)
  and state_rcdata_end_tag_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      (* Lowercased into the tag name, original casing into the temp
         buffer in case we must replay it as text. *)
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- Tokenizer_state.Rcdata

  (* WHATWG "RAWTEXT less-than sign state". *)
  and state_rawtext_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- Tokenizer_state.Rawtext_end_tag_open
    | _ ->
      emit_char t '<';
      t.state <- Tokenizer_state.Rawtext

  (* WHATWG "RAWTEXT end tag open state". *)
  and state_rawtext_end_tag_open () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Tokenizer_token.End;
      t.state <- Tokenizer_state.Rawtext_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- Tokenizer_state.Rawtext

  (* WHATWG "RAWTEXT end tag name state" (same shape as the RCDATA one). *)
  and state_rawtext_end_tag_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- Tokenizer_state.Rawtext

  (* WHATWG "Script data less-than sign state": "<!" starts escape mode. *)
  and state_script_data_less_than_sign () =
    match Tokenizer_stream.peek t.stream with
    | Some '/' ->
      Tokenizer_stream.advance t.stream;
      Buffer.clear t.temp_buffer;
      t.state <- Tokenizer_state.Script_data_end_tag_open
    | Some '!' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Script_data_escape_start;
      emit_str t "<!"
    | _ ->
      emit_char t '<';
      t.state <- Tokenizer_state.Script_data

  (* WHATWG "Script data end tag open state". *)
  and state_script_data_end_tag_open () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alpha c ->
      start_new_tag t Tokenizer_token.End;
      t.state <- Tokenizer_state.Script_data_end_tag_name
    | _ ->
      emit_str t "</";
      t.state <- Tokenizer_state.Script_data

  (* WHATWG "Script data end tag name state". *)
  and state_script_data_end_tag_name () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Before_attribute_name
    | Some '/' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Self_closing_start_tag
    | Some '>' when is_appropriate_end_tag t ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_tag ()
    | Some c when is_ascii_alpha c ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.current_tag_name (ascii_lower c);
      Buffer.add_char t.temp_buffer c
    | _ ->
      emit_str t "</";
      emit_str t (Buffer.contents t.temp_buffer);
      t.state <- Tokenizer_state.Script_data

  (* WHATWG "Script data escape start state": after "<!", a '-' begins
     the "<!--" escape sequence. *)
  and state_script_data_escape_start () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Script_data_escape_start_dash;
      emit_char t '-'
    | _ ->
      t.state <- Tokenizer_state.Script_data

  (* WHATWG "Script data escape start dash state". *)
  and state_script_data_escape_start_dash () =
    match Tokenizer_stream.peek t.stream with
    | Some '-' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Script_data_escaped_dash_dash;
      emit_char t '-'
    | _ ->
      t.state <- Tokenizer_state.Script_data

  (* WHATWG "Script data escaped state": inside "<!-- ... -->" in script. *)
  and state_script_data_escaped () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      t.state <- Tokenizer_state.Script_data_escaped_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      emit_char_checked c
    | None -> ()

  (* WHATWG "Script data escaped dash state". *)
  and state_script_data_escaped_dash () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      t.state <- Tokenizer_state.Script_data_escaped_dash_dash;
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_char_checked c
    | None -> ()

  (* WHATWG "Script data escaped dash dash state": "-->" ends the escape. *)
  and state_script_data_escaped_dash_dash () =
    match Tokenizer_stream.consume t.stream with
    | Some '-' ->
      emit_char t '-'
    | Some '<' ->
      t.state <- Tokenizer_state.Script_data_escaped_less_than_sign
    | Some '>' ->
      t.state <- Tokenizer_state.Script_data;
      emit_char t '>'
    | Some '\x00' ->
      error t "unexpected-null-character";
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_str t "\xEF\xBF\xBD"
    | Some c ->
      t.state <- Tokenizer_state.Script_data_escaped;
      emit_char_checked c
    | None -> ()
920 and state_script_data_escaped_less_than_sign () = 921 match Tokenizer_stream.peek t.stream with 922 | Some '/' -> 923 Tokenizer_stream.advance t.stream; 924 Buffer.clear t.temp_buffer; 925 t.state <- Tokenizer_state.Script_data_escaped_end_tag_open 926 | Some c when is_ascii_alpha c -> 927 Buffer.clear t.temp_buffer; 928 emit_char t '<'; 929 t.state <- Tokenizer_state.Script_data_double_escape_start 930 | _ -> 931 emit_char t '<'; 932 t.state <- Tokenizer_state.Script_data_escaped 933 934 and state_script_data_escaped_end_tag_open () = 935 match Tokenizer_stream.peek t.stream with 936 | Some c when is_ascii_alpha c -> 937 start_new_tag t Tokenizer_token.End; 938 t.state <- Tokenizer_state.Script_data_escaped_end_tag_name 939 | _ -> 940 emit_str t "</"; 941 t.state <- Tokenizer_state.Script_data_escaped 942 943 and state_script_data_escaped_end_tag_name () = 944 match Tokenizer_stream.peek t.stream with 945 | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 946 Tokenizer_stream.advance t.stream; 947 t.state <- Tokenizer_state.Before_attribute_name 948 | Some '/' when is_appropriate_end_tag t -> 949 Tokenizer_stream.advance t.stream; 950 t.state <- Tokenizer_state.Self_closing_start_tag 951 | Some '>' when is_appropriate_end_tag t -> 952 Tokenizer_stream.advance t.stream; 953 t.state <- Tokenizer_state.Data; 954 emit_current_tag () 955 | Some c when is_ascii_alpha c -> 956 Tokenizer_stream.advance t.stream; 957 Buffer.add_char t.current_tag_name (ascii_lower c); 958 Buffer.add_char t.temp_buffer c 959 | _ -> 960 emit_str t "</"; 961 emit_str t (Buffer.contents t.temp_buffer); 962 t.state <- Tokenizer_state.Script_data_escaped 963 964 and state_script_data_double_escape_start () = 965 match Tokenizer_stream.peek t.stream with 966 | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt -> 967 Tokenizer_stream.advance t.stream; 968 let c = Option.get c_opt in 969 if Buffer.contents t.temp_buffer = "script" then 970 t.state <- 
Tokenizer_state.Script_data_double_escaped 971 else 972 t.state <- Tokenizer_state.Script_data_escaped; 973 emit_char t c 974 | Some c when is_ascii_alpha c -> 975 Tokenizer_stream.advance t.stream; 976 Buffer.add_char t.temp_buffer (ascii_lower c); 977 emit_char t c 978 | _ -> 979 t.state <- Tokenizer_state.Script_data_escaped 980 981 and state_script_data_double_escaped () = 982 match Tokenizer_stream.consume t.stream with 983 | Some '-' -> 984 t.state <- Tokenizer_state.Script_data_double_escaped_dash; 985 emit_char t '-' 986 | Some '<' -> 987 t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign; 988 emit_char t '<' 989 | Some '\x00' -> 990 error t "unexpected-null-character"; 991 emit_str t "\xEF\xBF\xBD" 992 | Some c -> 993 emit_char_checked c 994 | None -> () 995 996 and state_script_data_double_escaped_dash () = 997 match Tokenizer_stream.consume t.stream with 998 | Some '-' -> 999 t.state <- Tokenizer_state.Script_data_double_escaped_dash_dash; 1000 emit_char t '-' 1001 | Some '<' -> 1002 t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign; 1003 emit_char t '<' 1004 | Some '\x00' -> 1005 error t "unexpected-null-character"; 1006 t.state <- Tokenizer_state.Script_data_double_escaped; 1007 emit_str t "\xEF\xBF\xBD" 1008 | Some c -> 1009 t.state <- Tokenizer_state.Script_data_double_escaped; 1010 emit_char_checked c 1011 | None -> () 1012 1013 and state_script_data_double_escaped_dash_dash () = 1014 match Tokenizer_stream.consume t.stream with 1015 | Some '-' -> 1016 emit_char t '-' 1017 | Some '<' -> 1018 t.state <- Tokenizer_state.Script_data_double_escaped_less_than_sign; 1019 emit_char t '<' 1020 | Some '>' -> 1021 t.state <- Tokenizer_state.Script_data; 1022 emit_char t '>' 1023 | Some '\x00' -> 1024 error t "unexpected-null-character"; 1025 t.state <- Tokenizer_state.Script_data_double_escaped; 1026 emit_str t "\xEF\xBF\xBD" 1027 | Some c -> 1028 t.state <- Tokenizer_state.Script_data_double_escaped; 1029 
emit_char_checked c 1030 | None -> () 1031 1032 and state_script_data_double_escaped_less_than_sign () = 1033 match Tokenizer_stream.peek t.stream with 1034 | Some '/' -> 1035 Tokenizer_stream.advance t.stream; 1036 Buffer.clear t.temp_buffer; 1037 t.state <- Tokenizer_state.Script_data_double_escape_end; 1038 emit_char t '/' 1039 | _ -> 1040 t.state <- Tokenizer_state.Script_data_double_escaped 1041 1042 and state_script_data_double_escape_end () = 1043 match Tokenizer_stream.peek t.stream with 1044 | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt -> 1045 Tokenizer_stream.advance t.stream; 1046 let c = Option.get c_opt in 1047 if Buffer.contents t.temp_buffer = "script" then 1048 t.state <- Tokenizer_state.Script_data_escaped 1049 else 1050 t.state <- Tokenizer_state.Script_data_double_escaped; 1051 emit_char t c 1052 | Some c when is_ascii_alpha c -> 1053 Tokenizer_stream.advance t.stream; 1054 Buffer.add_char t.temp_buffer (ascii_lower c); 1055 emit_char t c 1056 | _ -> 1057 t.state <- Tokenizer_state.Script_data_double_escaped 1058 1059 and state_before_attribute_name () = 1060 match Tokenizer_stream.peek t.stream with 1061 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1062 Tokenizer_stream.advance t.stream 1063 | Some '/' | Some '>' | None -> 1064 t.state <- Tokenizer_state.After_attribute_name 1065 | Some '=' -> 1066 Tokenizer_stream.advance t.stream; 1067 error t "unexpected-equals-sign-before-attribute-name"; 1068 start_new_attribute t; 1069 Buffer.add_char t.current_attr_name '='; 1070 t.state <- Tokenizer_state.Attribute_name 1071 | Some _ -> 1072 start_new_attribute t; 1073 t.state <- Tokenizer_state.Attribute_name 1074 1075 and state_attribute_name () = 1076 match Tokenizer_stream.peek t.stream with 1077 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1078 Tokenizer_stream.advance t.stream; 1079 t.state <- Tokenizer_state.After_attribute_name 1080 | Some '/' | Some '>' | None -> 1081 t.state <- Tokenizer_state.After_attribute_name 1082 | Some '=' -> 1083 
Tokenizer_stream.advance t.stream; 1084 t.state <- Tokenizer_state.Before_attribute_value 1085 | Some '\x00' -> 1086 Tokenizer_stream.advance t.stream; 1087 error t "unexpected-null-character"; 1088 Buffer.add_string t.current_attr_name "\xEF\xBF\xBD" 1089 | Some ('"' | '\'' | '<') as c_opt -> 1090 Tokenizer_stream.advance t.stream; 1091 error t "unexpected-character-in-attribute-name"; 1092 Buffer.add_char t.current_attr_name (Option.get c_opt) 1093 | Some c -> 1094 Tokenizer_stream.advance t.stream; 1095 check_control_char c; 1096 Buffer.add_char t.current_attr_name (ascii_lower c) 1097 1098 and state_after_attribute_name () = 1099 match Tokenizer_stream.peek t.stream with 1100 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1101 Tokenizer_stream.advance t.stream 1102 | Some '/' -> 1103 Tokenizer_stream.advance t.stream; 1104 t.state <- Tokenizer_state.Self_closing_start_tag 1105 | Some '=' -> 1106 Tokenizer_stream.advance t.stream; 1107 t.state <- Tokenizer_state.Before_attribute_value 1108 | Some '>' -> 1109 Tokenizer_stream.advance t.stream; 1110 t.state <- Tokenizer_state.Data; 1111 emit_current_tag () 1112 | None -> () 1113 | Some _ -> 1114 start_new_attribute t; 1115 t.state <- Tokenizer_state.Attribute_name 1116 1117 and state_before_attribute_value () = 1118 match Tokenizer_stream.peek t.stream with 1119 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1120 Tokenizer_stream.advance t.stream 1121 | Some '"' -> 1122 Tokenizer_stream.advance t.stream; 1123 t.state <- Tokenizer_state.Attribute_value_double_quoted 1124 | Some '\'' -> 1125 Tokenizer_stream.advance t.stream; 1126 t.state <- Tokenizer_state.Attribute_value_single_quoted 1127 | Some '>' -> 1128 Tokenizer_stream.advance t.stream; 1129 error t "missing-attribute-value"; 1130 t.state <- Tokenizer_state.Data; 1131 emit_current_tag () 1132 | _ -> 1133 t.state <- Tokenizer_state.Attribute_value_unquoted 1134 1135 and state_attribute_value_double_quoted () = 1136 match Tokenizer_stream.consume t.stream with 1137 | Some '"' 
-> 1138 t.state <- Tokenizer_state.After_attribute_value_quoted 1139 | Some '&' -> 1140 t.return_state <- Tokenizer_state.Attribute_value_double_quoted; 1141 t.state <- Tokenizer_state.Character_reference 1142 | Some '\x00' -> 1143 error t "unexpected-null-character"; 1144 Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1145 | Some c -> 1146 check_control_char c; 1147 Buffer.add_char t.current_attr_value c 1148 | None -> () 1149 1150 and state_attribute_value_single_quoted () = 1151 match Tokenizer_stream.consume t.stream with 1152 | Some '\'' -> 1153 t.state <- Tokenizer_state.After_attribute_value_quoted 1154 | Some '&' -> 1155 t.return_state <- Tokenizer_state.Attribute_value_single_quoted; 1156 t.state <- Tokenizer_state.Character_reference 1157 | Some '\x00' -> 1158 error t "unexpected-null-character"; 1159 Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1160 | Some c -> 1161 check_control_char c; 1162 Buffer.add_char t.current_attr_value c 1163 | None -> () 1164 1165 and state_attribute_value_unquoted () = 1166 match Tokenizer_stream.peek t.stream with 1167 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1168 Tokenizer_stream.advance t.stream; 1169 t.state <- Tokenizer_state.Before_attribute_name 1170 | Some '&' -> 1171 Tokenizer_stream.advance t.stream; 1172 t.return_state <- Tokenizer_state.Attribute_value_unquoted; 1173 t.state <- Tokenizer_state.Character_reference 1174 | Some '>' -> 1175 Tokenizer_stream.advance t.stream; 1176 t.state <- Tokenizer_state.Data; 1177 emit_current_tag () 1178 | Some '\x00' -> 1179 Tokenizer_stream.advance t.stream; 1180 error t "unexpected-null-character"; 1181 Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1182 | Some ('"' | '\'' | '<' | '=' | '`') as c_opt -> 1183 Tokenizer_stream.advance t.stream; 1184 error t "unexpected-character-in-unquoted-attribute-value"; 1185 Buffer.add_char t.current_attr_value (Option.get c_opt) 1186 | Some c -> 1187 Tokenizer_stream.advance t.stream; 1188 check_control_char c; 1189 
Buffer.add_char t.current_attr_value c 1190 | None -> () 1191 1192 and state_after_attribute_value_quoted () = 1193 match Tokenizer_stream.peek t.stream with 1194 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1195 Tokenizer_stream.advance t.stream; 1196 t.state <- Tokenizer_state.Before_attribute_name 1197 | Some '/' -> 1198 Tokenizer_stream.advance t.stream; 1199 t.state <- Tokenizer_state.Self_closing_start_tag 1200 | Some '>' -> 1201 Tokenizer_stream.advance t.stream; 1202 t.state <- Tokenizer_state.Data; 1203 emit_current_tag () 1204 | None -> () 1205 | Some _ -> 1206 error t "missing-whitespace-between-attributes"; 1207 t.state <- Tokenizer_state.Before_attribute_name 1208 1209 and state_self_closing_start_tag () = 1210 match Tokenizer_stream.peek t.stream with 1211 | Some '>' -> 1212 Tokenizer_stream.advance t.stream; 1213 t.current_tag_self_closing <- true; 1214 t.state <- Tokenizer_state.Data; 1215 emit_current_tag () 1216 | None -> () 1217 | Some _ -> 1218 error t "unexpected-solidus-in-tag"; 1219 t.state <- Tokenizer_state.Before_attribute_name 1220 1221 and state_bogus_comment () = 1222 match Tokenizer_stream.consume t.stream with 1223 | Some '>' -> 1224 t.state <- Tokenizer_state.Data; 1225 emit_current_comment () 1226 | Some '\x00' -> 1227 error t "unexpected-null-character"; 1228 Buffer.add_string t.current_comment "\xEF\xBF\xBD" 1229 | Some c -> 1230 check_control_char c; 1231 Buffer.add_char t.current_comment c 1232 | None -> () 1233 1234 and state_markup_declaration_open () = 1235 if Tokenizer_stream.matches_ci t.stream "--" then begin 1236 ignore (Tokenizer_stream.consume_exact_ci t.stream "--"); 1237 Buffer.clear t.current_comment; 1238 t.state <- Tokenizer_state.Comment_start 1239 end else if Tokenizer_stream.matches_ci t.stream "DOCTYPE" then begin 1240 ignore (Tokenizer_stream.consume_exact_ci t.stream "DOCTYPE"); 1241 t.state <- Tokenizer_state.Doctype 1242 end else if Tokenizer_stream.matches_ci t.stream "[CDATA[" then begin 1243 ignore 
(Tokenizer_stream.consume_exact_ci t.stream "[CDATA["); 1244 (* CDATA only allowed in foreign content *) 1245 if S.adjusted_current_node_in_html_namespace t.sink then begin 1246 error t "cdata-in-html-content"; 1247 Buffer.clear t.current_comment; 1248 Buffer.add_string t.current_comment "[CDATA["; 1249 t.state <- Tokenizer_state.Bogus_comment 1250 end else 1251 t.state <- Tokenizer_state.Cdata_section 1252 end else begin 1253 error t "incorrectly-opened-comment"; 1254 Buffer.clear t.current_comment; 1255 t.state <- Tokenizer_state.Bogus_comment 1256 end 1257 1258 and state_comment_start () = 1259 match Tokenizer_stream.peek t.stream with 1260 | Some '-' -> 1261 Tokenizer_stream.advance t.stream; 1262 t.state <- Tokenizer_state.Comment_start_dash 1263 | Some '>' -> 1264 Tokenizer_stream.advance t.stream; 1265 error t "abrupt-closing-of-empty-comment"; 1266 t.state <- Tokenizer_state.Data; 1267 emit_current_comment () 1268 | _ -> 1269 t.state <- Tokenizer_state.Comment 1270 1271 and state_comment_start_dash () = 1272 match Tokenizer_stream.peek t.stream with 1273 | Some '-' -> 1274 Tokenizer_stream.advance t.stream; 1275 t.state <- Tokenizer_state.Comment_end 1276 | Some '>' -> 1277 Tokenizer_stream.advance t.stream; 1278 error t "abrupt-closing-of-empty-comment"; 1279 t.state <- Tokenizer_state.Data; 1280 emit_current_comment () 1281 | None -> () 1282 | Some _ -> 1283 Buffer.add_char t.current_comment '-'; 1284 t.state <- Tokenizer_state.Comment 1285 1286 and state_comment () = 1287 match Tokenizer_stream.consume t.stream with 1288 | Some '<' -> 1289 Buffer.add_char t.current_comment '<'; 1290 t.state <- Tokenizer_state.Comment_less_than_sign 1291 | Some '-' -> 1292 t.state <- Tokenizer_state.Comment_end_dash 1293 | Some '\x00' -> 1294 error t "unexpected-null-character"; 1295 Buffer.add_string t.current_comment "\xEF\xBF\xBD" 1296 | Some c -> 1297 check_control_char c; 1298 Buffer.add_char t.current_comment c 1299 | None -> () 1300 1301 and 
state_comment_less_than_sign () = 1302 match Tokenizer_stream.peek t.stream with 1303 | Some '!' -> 1304 Tokenizer_stream.advance t.stream; 1305 Buffer.add_char t.current_comment '!'; 1306 t.state <- Tokenizer_state.Comment_less_than_sign_bang 1307 | Some '<' -> 1308 Tokenizer_stream.advance t.stream; 1309 Buffer.add_char t.current_comment '<' 1310 | _ -> 1311 t.state <- Tokenizer_state.Comment 1312 1313 and state_comment_less_than_sign_bang () = 1314 match Tokenizer_stream.peek t.stream with 1315 | Some '-' -> 1316 Tokenizer_stream.advance t.stream; 1317 t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash 1318 | _ -> 1319 t.state <- Tokenizer_state.Comment 1320 1321 and state_comment_less_than_sign_bang_dash () = 1322 match Tokenizer_stream.peek t.stream with 1323 | Some '-' -> 1324 Tokenizer_stream.advance t.stream; 1325 t.state <- Tokenizer_state.Comment_less_than_sign_bang_dash_dash 1326 | _ -> 1327 t.state <- Tokenizer_state.Comment_end_dash 1328 1329 and state_comment_less_than_sign_bang_dash_dash () = 1330 match Tokenizer_stream.peek t.stream with 1331 | Some '>' | None -> 1332 t.state <- Tokenizer_state.Comment_end 1333 | Some _ -> 1334 error t "nested-comment"; 1335 t.state <- Tokenizer_state.Comment_end 1336 1337 and state_comment_end_dash () = 1338 match Tokenizer_stream.peek t.stream with 1339 | Some '-' -> 1340 Tokenizer_stream.advance t.stream; 1341 t.state <- Tokenizer_state.Comment_end 1342 | None -> () 1343 | Some _ -> 1344 Buffer.add_char t.current_comment '-'; 1345 t.state <- Tokenizer_state.Comment 1346 1347 and state_comment_end () = 1348 match Tokenizer_stream.peek t.stream with 1349 | Some '>' -> 1350 Tokenizer_stream.advance t.stream; 1351 t.state <- Tokenizer_state.Data; 1352 emit_current_comment () 1353 | Some '!' 
-> 1354 Tokenizer_stream.advance t.stream; 1355 t.state <- Tokenizer_state.Comment_end_bang 1356 | Some '-' -> 1357 Tokenizer_stream.advance t.stream; 1358 Buffer.add_char t.current_comment '-' 1359 | None -> () 1360 | Some _ -> 1361 Buffer.add_string t.current_comment "--"; 1362 t.state <- Tokenizer_state.Comment 1363 1364 and state_comment_end_bang () = 1365 match Tokenizer_stream.peek t.stream with 1366 | Some '-' -> 1367 Tokenizer_stream.advance t.stream; 1368 Buffer.add_string t.current_comment "--!"; 1369 t.state <- Tokenizer_state.Comment_end_dash 1370 | Some '>' -> 1371 Tokenizer_stream.advance t.stream; 1372 error t "incorrectly-closed-comment"; 1373 t.state <- Tokenizer_state.Data; 1374 emit_current_comment () 1375 | None -> () 1376 | Some _ -> 1377 Buffer.add_string t.current_comment "--!"; 1378 t.state <- Tokenizer_state.Comment 1379 1380 and state_doctype () = 1381 match Tokenizer_stream.peek t.stream with 1382 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1383 Tokenizer_stream.advance t.stream; 1384 t.state <- Tokenizer_state.Before_doctype_name 1385 | Some '>' -> 1386 t.state <- Tokenizer_state.Before_doctype_name 1387 | None -> () 1388 | Some _ -> 1389 error t "missing-whitespace-before-doctype-name"; 1390 t.state <- Tokenizer_state.Before_doctype_name 1391 1392 and state_before_doctype_name () = 1393 match Tokenizer_stream.peek t.stream with 1394 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1395 Tokenizer_stream.advance t.stream 1396 | Some '\x00' -> 1397 Tokenizer_stream.advance t.stream; 1398 error t "unexpected-null-character"; 1399 start_new_doctype t; 1400 t.current_doctype_name <- Some (Buffer.create 8); 1401 Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"; 1402 t.state <- Tokenizer_state.Doctype_name 1403 | Some '>' -> 1404 Tokenizer_stream.advance t.stream; 1405 error t "missing-doctype-name"; 1406 start_new_doctype t; 1407 t.current_doctype_force_quirks <- true; 1408 t.state <- Tokenizer_state.Data; 1409 emit_current_doctype () 
1410 | None -> () 1411 | Some c -> 1412 Tokenizer_stream.advance t.stream; 1413 check_control_char c; 1414 start_new_doctype t; 1415 t.current_doctype_name <- Some (Buffer.create 8); 1416 Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c); 1417 t.state <- Tokenizer_state.Doctype_name 1418 1419 and state_doctype_name () = 1420 match Tokenizer_stream.consume t.stream with 1421 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1422 t.state <- Tokenizer_state.After_doctype_name 1423 | Some '>' -> 1424 t.state <- Tokenizer_state.Data; 1425 emit_current_doctype () 1426 | Some '\x00' -> 1427 error t "unexpected-null-character"; 1428 Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD" 1429 | Some c -> 1430 check_control_char c; 1431 Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c) 1432 | None -> () 1433 1434 and state_after_doctype_name () = 1435 match Tokenizer_stream.peek t.stream with 1436 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1437 Tokenizer_stream.advance t.stream 1438 | Some '>' -> 1439 Tokenizer_stream.advance t.stream; 1440 t.state <- Tokenizer_state.Data; 1441 emit_current_doctype () 1442 | None -> () 1443 | Some _ -> 1444 (* Don't check control char here - bogus_doctype will check when it consumes *) 1445 if Tokenizer_stream.matches_ci t.stream "PUBLIC" then begin 1446 ignore (Tokenizer_stream.consume_exact_ci t.stream "PUBLIC"); 1447 t.state <- Tokenizer_state.After_doctype_public_keyword 1448 end else if Tokenizer_stream.matches_ci t.stream "SYSTEM" then begin 1449 ignore (Tokenizer_stream.consume_exact_ci t.stream "SYSTEM"); 1450 t.state <- Tokenizer_state.After_doctype_system_keyword 1451 end else begin 1452 error t "invalid-character-sequence-after-doctype-name"; 1453 t.current_doctype_force_quirks <- true; 1454 t.state <- Tokenizer_state.Bogus_doctype 1455 end 1456 1457 and state_after_doctype_public_keyword () = 1458 match Tokenizer_stream.peek t.stream with 1459 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1460 
Tokenizer_stream.advance t.stream; 1461 t.state <- Tokenizer_state.Before_doctype_public_identifier 1462 | Some '"' -> 1463 Tokenizer_stream.advance t.stream; 1464 error t "missing-whitespace-after-doctype-public-keyword"; 1465 t.current_doctype_public <- Some (Buffer.create 32); 1466 t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted 1467 | Some '\'' -> 1468 Tokenizer_stream.advance t.stream; 1469 error t "missing-whitespace-after-doctype-public-keyword"; 1470 t.current_doctype_public <- Some (Buffer.create 32); 1471 t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted 1472 | Some '>' -> 1473 Tokenizer_stream.advance t.stream; 1474 error t "missing-doctype-public-identifier"; 1475 t.current_doctype_force_quirks <- true; 1476 t.state <- Tokenizer_state.Data; 1477 emit_current_doctype () 1478 | None -> () 1479 | Some _ -> 1480 (* Don't check control char here - bogus_doctype will check when it consumes *) 1481 error t "missing-quote-before-doctype-public-identifier"; 1482 t.current_doctype_force_quirks <- true; 1483 t.state <- Tokenizer_state.Bogus_doctype 1484 1485 and state_before_doctype_public_identifier () = 1486 match Tokenizer_stream.peek t.stream with 1487 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1488 Tokenizer_stream.advance t.stream 1489 | Some '"' -> 1490 Tokenizer_stream.advance t.stream; 1491 t.current_doctype_public <- Some (Buffer.create 32); 1492 t.state <- Tokenizer_state.Doctype_public_identifier_double_quoted 1493 | Some '\'' -> 1494 Tokenizer_stream.advance t.stream; 1495 t.current_doctype_public <- Some (Buffer.create 32); 1496 t.state <- Tokenizer_state.Doctype_public_identifier_single_quoted 1497 | Some '>' -> 1498 Tokenizer_stream.advance t.stream; 1499 error t "missing-doctype-public-identifier"; 1500 t.current_doctype_force_quirks <- true; 1501 t.state <- Tokenizer_state.Data; 1502 emit_current_doctype () 1503 | None -> () 1504 | Some _ -> 1505 error t "missing-quote-before-doctype-public-identifier"; 1506 
t.current_doctype_force_quirks <- true; 1507 t.state <- Tokenizer_state.Bogus_doctype 1508 1509 and state_doctype_public_identifier_double_quoted () = 1510 match Tokenizer_stream.consume t.stream with 1511 | Some '"' -> 1512 t.state <- Tokenizer_state.After_doctype_public_identifier 1513 | Some '\x00' -> 1514 error t "unexpected-null-character"; 1515 Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD" 1516 | Some '>' -> 1517 error t "abrupt-doctype-public-identifier"; 1518 t.current_doctype_force_quirks <- true; 1519 t.state <- Tokenizer_state.Data; 1520 emit_current_doctype () 1521 | Some c -> 1522 check_control_char c; 1523 Buffer.add_char (Option.get t.current_doctype_public) c 1524 | None -> () 1525 1526 and state_doctype_public_identifier_single_quoted () = 1527 match Tokenizer_stream.consume t.stream with 1528 | Some '\'' -> 1529 t.state <- Tokenizer_state.After_doctype_public_identifier 1530 | Some '\x00' -> 1531 error t "unexpected-null-character"; 1532 Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD" 1533 | Some '>' -> 1534 error t "abrupt-doctype-public-identifier"; 1535 t.current_doctype_force_quirks <- true; 1536 t.state <- Tokenizer_state.Data; 1537 emit_current_doctype () 1538 | Some c -> 1539 check_control_char c; 1540 Buffer.add_char (Option.get t.current_doctype_public) c 1541 | None -> () 1542 1543 and state_after_doctype_public_identifier () = 1544 match Tokenizer_stream.peek t.stream with 1545 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1546 Tokenizer_stream.advance t.stream; 1547 t.state <- Tokenizer_state.Between_doctype_public_and_system_identifiers 1548 | Some '>' -> 1549 Tokenizer_stream.advance t.stream; 1550 t.state <- Tokenizer_state.Data; 1551 emit_current_doctype () 1552 | Some '"' -> 1553 Tokenizer_stream.advance t.stream; 1554 error t "missing-whitespace-between-doctype-public-and-system-identifiers"; 1555 t.current_doctype_system <- Some (Buffer.create 32); 1556 t.state <- 
Tokenizer_state.Doctype_system_identifier_double_quoted 1557 | Some '\'' -> 1558 Tokenizer_stream.advance t.stream; 1559 error t "missing-whitespace-between-doctype-public-and-system-identifiers"; 1560 t.current_doctype_system <- Some (Buffer.create 32); 1561 t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted 1562 | None -> () 1563 | Some _ -> 1564 (* Don't check control char here - bogus_doctype will check when it consumes *) 1565 error t "missing-quote-before-doctype-system-identifier"; 1566 t.current_doctype_force_quirks <- true; 1567 t.state <- Tokenizer_state.Bogus_doctype 1568 1569 and state_between_doctype_public_and_system_identifiers () = 1570 match Tokenizer_stream.peek t.stream with 1571 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1572 Tokenizer_stream.advance t.stream 1573 | Some '>' -> 1574 Tokenizer_stream.advance t.stream; 1575 t.state <- Tokenizer_state.Data; 1576 emit_current_doctype () 1577 | Some '"' -> 1578 Tokenizer_stream.advance t.stream; 1579 t.current_doctype_system <- Some (Buffer.create 32); 1580 t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted 1581 | Some '\'' -> 1582 Tokenizer_stream.advance t.stream; 1583 t.current_doctype_system <- Some (Buffer.create 32); 1584 t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted 1585 | None -> () 1586 | Some _ -> 1587 (* Don't check control char here - bogus_doctype will check when it consumes *) 1588 error t "missing-quote-before-doctype-system-identifier"; 1589 t.current_doctype_force_quirks <- true; 1590 t.state <- Tokenizer_state.Bogus_doctype 1591 1592 and state_after_doctype_system_keyword () = 1593 match Tokenizer_stream.peek t.stream with 1594 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1595 Tokenizer_stream.advance t.stream; 1596 t.state <- Tokenizer_state.Before_doctype_system_identifier 1597 | Some '"' -> 1598 Tokenizer_stream.advance t.stream; 1599 error t "missing-whitespace-after-doctype-system-keyword"; 1600 t.current_doctype_system <- Some 
(Buffer.create 32); 1601 t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted 1602 | Some '\'' -> 1603 Tokenizer_stream.advance t.stream; 1604 error t "missing-whitespace-after-doctype-system-keyword"; 1605 t.current_doctype_system <- Some (Buffer.create 32); 1606 t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted 1607 | Some '>' -> 1608 Tokenizer_stream.advance t.stream; 1609 error t "missing-doctype-system-identifier"; 1610 t.current_doctype_force_quirks <- true; 1611 t.state <- Tokenizer_state.Data; 1612 emit_current_doctype () 1613 | None -> () 1614 | Some _ -> 1615 (* Don't check control char here - bogus_doctype will check when it consumes *) 1616 error t "missing-quote-before-doctype-system-identifier"; 1617 t.current_doctype_force_quirks <- true; 1618 t.state <- Tokenizer_state.Bogus_doctype 1619 1620 and state_before_doctype_system_identifier () = 1621 match Tokenizer_stream.peek t.stream with 1622 | Some ('\t' | '\n' | '\x0C' | ' ') -> 1623 Tokenizer_stream.advance t.stream 1624 | Some '"' -> 1625 Tokenizer_stream.advance t.stream; 1626 t.current_doctype_system <- Some (Buffer.create 32); 1627 t.state <- Tokenizer_state.Doctype_system_identifier_double_quoted 1628 | Some '\'' -> 1629 Tokenizer_stream.advance t.stream; 1630 t.current_doctype_system <- Some (Buffer.create 32); 1631 t.state <- Tokenizer_state.Doctype_system_identifier_single_quoted 1632 | Some '>' -> 1633 Tokenizer_stream.advance t.stream; 1634 error t "missing-doctype-system-identifier"; 1635 t.current_doctype_force_quirks <- true; 1636 t.state <- Tokenizer_state.Data; 1637 emit_current_doctype () 1638 | None -> () 1639 | Some _ -> 1640 (* Don't check control char here - bogus_doctype will check when it consumes *) 1641 error t "missing-quote-before-doctype-system-identifier"; 1642 t.current_doctype_force_quirks <- true; 1643 t.state <- Tokenizer_state.Bogus_doctype 1644 1645 and state_doctype_system_identifier_double_quoted () = 1646 match 
Tokenizer_stream.consume t.stream with
    | Some '"' ->
      t.state <- Tokenizer_state.After_doctype_system_identifier
    | Some '\x00' ->
      (* NUL is replaced by U+FFFD REPLACEMENT CHARACTER (UTF-8: EF BF BD). *)
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
    | Some '>' ->
      (* Identifier cut short by '>': abrupt end of DOCTYPE, quirks mode. *)
      error t "abrupt-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      (* Option.get: current_doctype_system is set to Some by the states
         that transition here (see the before/after-system states above). *)
      Buffer.add_char (Option.get t.current_doctype_system) c
    | None -> ()

  (* Same as the double-quoted variant but terminated by '\''. *)
  and state_doctype_system_identifier_single_quoted () =
    match Tokenizer_stream.consume t.stream with
    | Some '\'' ->
      t.state <- Tokenizer_state.After_doctype_system_identifier
    | Some '\x00' ->
      error t "unexpected-null-character";
      Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
    | Some '>' ->
      error t "abrupt-doctype-system-identifier";
      t.current_doctype_force_quirks <- true;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some c ->
      check_control_char c;
      Buffer.add_char (Option.get t.current_doctype_system) c
    | None -> ()

  (* WHATWG "after DOCTYPE system identifier" state: only whitespace or '>'
     may follow; anything else is an error (note: no force-quirks here, per
     spec) and switches to bogus DOCTYPE. *)
  and state_after_doctype_system_identifier () =
    match Tokenizer_stream.peek t.stream with
    | Some ('\t' | '\n' | '\x0C' | ' ') ->
      Tokenizer_stream.advance t.stream
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | None -> ()
    | Some _ ->
      (* Don't check control char here - bogus_doctype will check when it consumes *)
      error t "unexpected-character-after-doctype-system-identifier";
      t.state <- Tokenizer_state.Bogus_doctype

  (* WHATWG "bogus DOCTYPE" state: silently discard everything up to '>'
     (NUL still reported as an error). *)
  and state_bogus_doctype () =
    match Tokenizer_stream.consume t.stream with
    | Some '>' ->
      t.state <- Tokenizer_state.Data;
      emit_current_doctype ()
    | Some '\x00' ->
      error t "unexpected-null-character"
    |
Some c ->
      check_control_char c (* Check all chars in bogus doctype *)
    | None -> ()

  (* WHATWG "CDATA section" state: characters pass straight through until a
     ']' starts a potential "]]>" terminator. *)
  and state_cdata_section () =
    match Tokenizer_stream.consume t.stream with
    | None -> ()
    | Some ']' ->
      t.state <- Tokenizer_state.Cdata_section_bracket
    | Some ch ->
      (* CDATA emits every character as-is (NUL included), but control
         characters are still flagged. *)
      emit_char_checked ch

  (* One ']' seen: a second ']' moves towards "]]>", anything else means the
     bracket was literal text. *)
  and state_cdata_section_bracket () =
    match Tokenizer_stream.peek t.stream with
    | Some ']' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Cdata_section_end
    | Some _ | None ->
      emit_char t ']';
      t.state <- Tokenizer_state.Cdata_section

  (* "]]" seen: '>' ends the CDATA section; an extra ']' emits one literal
     ']' and stays here; anything else emits "]]" and resumes the section. *)
  and state_cdata_section_end () =
    match Tokenizer_stream.peek t.stream with
    | Some '>' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Data
    | Some ']' ->
      Tokenizer_stream.advance t.stream;
      emit_char t ']'
    | Some _ | None ->
      emit_str t "]]";
      t.state <- Tokenizer_state.Cdata_section

  (* WHATWG "character reference" state: start buffering at '&', then
     dispatch to the numeric or named sub-machine. *)
  and state_character_reference () =
    Buffer.clear t.temp_buffer;
    Buffer.add_char t.temp_buffer '&';
    match Tokenizer_stream.peek t.stream with
    | Some '#' ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer '#';
      t.state <- Tokenizer_state.Numeric_character_reference
    | Some ch when is_ascii_alnum ch ->
      (* Not consumed here: the named state re-reads via peek. *)
      t.state <- Tokenizer_state.Named_character_reference
    | _ ->
      (* Bare '&': flush it as-is and resume where we left off. *)
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

  (* WHATWG "named character reference" state. *)
  and state_named_character_reference () =
    (* Greedily gather the longest run of ASCII alphanumerics after "&". *)
    let rec consume_alnum () =
      match Tokenizer_stream.peek t.stream with
      | Some ch when is_ascii_alnum ch ->
        Tokenizer_stream.advance t.stream;
        Buffer.add_char t.temp_buffer ch;
        consume_alnum ()
      | _ -> ()
    in
    consume_alnum ();

    let has_semicolon =
      match Tokenizer_stream.peek t.stream with
      | Some ';' ->
Tokenizer_stream.advance t.stream; Buffer.add_char t.temp_buffer ';'; true
      | _ -> false
    in

    (* Try to match entity - buffer contains "&name" or "&name;" *)
    let buf_contents = Buffer.contents t.temp_buffer in
    let name_start = 1 in (* Skip '&' *)
    let name_end = String.length buf_contents - (if has_semicolon then 1 else 0) in
    let entity_name = String.sub buf_contents name_start (name_end - name_start) in

    (* Try progressively shorter matches *)
    (* Only match if:
       1. Full match with semicolon, OR
       2. Legacy entity (can be used without semicolon) *)
    let rec try_match len =
      if len <= 0 then None
      else
        let prefix = String.sub entity_name 0 len in
        let is_full = len = String.length entity_name in
        let would_have_semi = has_semicolon && is_full in
        (* Only use this match if it has semicolon or is a legacy entity *)
        if would_have_semi || Entities.is_legacy prefix then
          match Entities.lookup prefix with
          | Some decoded -> Some (decoded, len)
          | None -> try_match (len - 1)
        else
          try_match (len - 1)
    in

    match try_match (String.length entity_name) with
    | Some (decoded, matched_len) ->
      let full_match = matched_len = String.length entity_name in
      let ends_with_semi = has_semicolon && full_match in

      (* Check attribute context restrictions *)
      let in_attribute = match t.return_state with
        | Tokenizer_state.Attribute_value_double_quoted
        | Tokenizer_state.Attribute_value_single_quoted
        | Tokenizer_state.Attribute_value_unquoted -> true
        | _ -> false
      in

      (* The character immediately following the matched entity text:
         unread input for a full match without ';', or the first unmatched
         char inside the collected name for a partial match. *)
      let next_char =
        if full_match && not has_semicolon then
          Tokenizer_stream.peek t.stream
        else if not full_match then
          Some entity_name.[matched_len]
        else None
      in

      (* Historical-attribute rule: inside an attribute value, a match
         without a ';' followed by '=' or an alphanumeric is NOT decoded. *)
      let blocked = in_attribute && not ends_with_semi &&
        match next_char with
        | Some '=' -> true
        | Some c when is_ascii_alnum c -> true
        |
_ -> false
      in

      if blocked then begin
        (* Blocked by the attribute rule: flush the raw "&name" unchanged. *)
        flush_code_points_consumed_as_char_ref t;
        t.state <- t.return_state
      end else begin
        if not ends_with_semi then
          error t "missing-semicolon-after-character-reference";
        (* Replace the collected "&name[;]" with its decoded expansion and
           flush it (to the attribute value or as character tokens). *)
        Buffer.clear t.temp_buffer;
        Buffer.add_string t.temp_buffer decoded;
        flush_code_points_consumed_as_char_ref t;
        (* Emit unconsumed chars after partial match *)
        if not full_match then begin
          let unconsumed = String.sub entity_name matched_len (String.length entity_name - matched_len) in
          emit_str t unconsumed;
          (* If there was a semicolon in input but we didn't use the full match, emit the semicolon too *)
          if has_semicolon then
            emit_char t ';'
        end;
        t.state <- t.return_state
      end
    | None ->
      (* No match - check if we should report unknown-named-character-reference *)
      if String.length entity_name > 0 then begin
        (* If we have a semicolon, it's definitely an unknown named character reference *)
        if has_semicolon then
          error t "unknown-named-character-reference";
        (* Emit all the chars we consumed *)
        flush_code_points_consumed_as_char_ref t;
        t.state <- t.return_state
      end else begin
        flush_code_points_consumed_as_char_ref t;
        t.state <- t.return_state
      end

  (* WHATWG "ambiguous ampersand" state: after an undecoded "&", route any
     further alphanumerics to the attribute value or character output; a ';'
     here is the unknown-named-character-reference error (not consumed,
     reprocessed in the return state). *)
  and state_ambiguous_ampersand () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_alnum c ->
      Tokenizer_stream.advance t.stream;
      (match t.return_state with
       | Tokenizer_state.Attribute_value_double_quoted
       | Tokenizer_state.Attribute_value_single_quoted
       | Tokenizer_state.Attribute_value_unquoted ->
         Buffer.add_char t.current_attr_value c
       | _ ->
         emit_char t c)
    | Some ';' ->
      error t "unknown-named-character-reference";
      t.state <- t.return_state
    | _ ->
      t.state <- t.return_state

  (* WHATWG "numeric character reference" state: reset the accumulator and
     branch on hex ('x'/'X') vs decimal. *)
  and state_numeric_character_reference () =
    t.char_ref_code <- 0;
    match
Tokenizer_stream.peek t.stream with
    | Some (('x' | 'X') as c) ->
      Tokenizer_stream.advance t.stream;
      Buffer.add_char t.temp_buffer c;
      t.state <- Tokenizer_state.Hexadecimal_character_reference_start
    | _ ->
      t.state <- Tokenizer_state.Decimal_character_reference_start

  (* Require at least one hex digit after "&#x"; otherwise error and flush
     the raw text back to the return state. *)
  and state_hexadecimal_character_reference_start () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_hex c ->
      t.state <- Tokenizer_state.Hexadecimal_character_reference
    | _ ->
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

  (* Require at least one decimal digit after "&#". *)
  and state_decimal_character_reference_start () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_digit c ->
      t.state <- Tokenizer_state.Decimal_character_reference
    | _ ->
      error t "absence-of-digits-in-numeric-character-reference";
      flush_code_points_consumed_as_char_ref t;
      t.state <- t.return_state

  (* Accumulate hex digits into char_ref_code.  The accumulator is clamped
     to 0x10FFFF + 1 as soon as it exceeds 0x10FFFF, so further digits can
     never overflow a native int; the end state later maps any value above
     0x10FFFF to U+FFFD. *)
  and state_hexadecimal_character_reference () =
    match Tokenizer_stream.peek t.stream with
    | Some c when is_ascii_digit c ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code '0');
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some c when c >= 'A' && c <= 'F' ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'A' + 10);
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some c when c >= 'a' && c <= 'f' ->
      Tokenizer_stream.advance t.stream;
      t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'a' + 10);
      if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
    | Some ';' ->
      Tokenizer_stream.advance t.stream;
      t.state <- Tokenizer_state.Numeric_character_reference_end
    | _ ->
      (* Non-digit terminator: error, then resolve what we have so far. *)
      error t
"missing-semicolon-after-character-reference"; 1916 t.state <- Tokenizer_state.Numeric_character_reference_end 1917 1918 and state_decimal_character_reference () = 1919 match Tokenizer_stream.peek t.stream with 1920 | Some c when is_ascii_digit c -> 1921 Tokenizer_stream.advance t.stream; 1922 t.char_ref_code <- t.char_ref_code * 10 + (Char.code c - Char.code '0'); 1923 if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1924 | Some ';' -> 1925 Tokenizer_stream.advance t.stream; 1926 t.state <- Tokenizer_state.Numeric_character_reference_end 1927 | _ -> 1928 error t "missing-semicolon-after-character-reference"; 1929 t.state <- Tokenizer_state.Numeric_character_reference_end 1930 1931 and state_numeric_character_reference_end () = 1932 let code = t.char_ref_code in 1933 let replacement_char = "\xEF\xBF\xBD" in 1934 1935 let result = 1936 if code = 0 then begin 1937 error t "null-character-reference"; 1938 replacement_char 1939 end else if code > 0x10FFFF then begin 1940 error t (Printf.sprintf "character-reference-outside-unicode-range:%x" code); 1941 replacement_char 1942 end else if code >= 0xD800 && code <= 0xDFFF then begin 1943 error t (Printf.sprintf "surrogate-character-reference:%04x" code); 1944 replacement_char 1945 end else if (code >= 0xFDD0 && code <= 0xFDEF) || 1946 (* Noncharacters end in 0xFFFE or 0xFFFF in each plane (0-16). 1947 O(1) bitwise check instead of O(n) list membership. 
*) 1948 (let low16 = code land 0xFFFF in low16 = 0xFFFE || low16 = 0xFFFF) then begin 1949 error t (Printf.sprintf "noncharacter-character-reference:%05x" code); 1950 Entities.Numeric_ref.codepoint_to_utf8 code 1951 end else if (code >= 0x01 && code <= 0x08) || code = 0x0B || 1952 (code >= 0x0D && code <= 0x1F) || 1953 (code >= 0x7F && code <= 0x9F) then begin 1954 error t (Printf.sprintf "control-character-reference:%04x" code); 1955 (* Apply Windows-1252 replacement table for 0x80-0x9F *) 1956 match Entities.Numeric_ref.find_replacement code with 1957 | Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement 1958 | None -> Entities.Numeric_ref.codepoint_to_utf8 code 1959 end else 1960 Entities.Numeric_ref.codepoint_to_utf8 code 1961 in 1962 1963 Buffer.clear t.temp_buffer; 1964 Buffer.add_string t.temp_buffer result; 1965 flush_code_points_consumed_as_char_ref t; 1966 t.state <- t.return_state 1967 1968 in 1969 process_state () 1970 1971let get_errors t = List.rev t.errors 1972 1973let set_state t state = t.state <- state 1974 1975let set_last_start_tag t name = t.last_start_tag <- name