OCaml HTML5 parser/serialiser based on Python's JustHTML
at validator 6.1 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6let q = Error_code.q 7 8(** Generate human-readable message for a parse error code *) 9let message_of_parse_error code = 10 let code_str = Html5rw.Parse_error_code.to_string code in 11 match code with 12 | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus -> 13 "Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag." 14 | Html5rw.Parse_error_code.Null_character_reference -> 15 "Character reference expands to zero." 16 | Html5rw.Parse_error_code.Tree_construction_error s -> 17 (* Check for control-character/noncharacter/surrogate with codepoint info *) 18 (try 19 if String.starts_with ~prefix:"control-character-in-input-s" s then 20 let colon_pos = String.index s ':' in 21 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 22 let cp = int_of_string ("0x" ^ cp_str) in 23 Printf.sprintf "Forbidden code point U+%04x." cp 24 else if String.starts_with ~prefix:"noncharacter-in-input-str" s then 25 let colon_pos = String.index s ':' in 26 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 27 let cp = int_of_string ("0x" ^ cp_str) in 28 Printf.sprintf "Forbidden code point U+%04x." cp 29 else if String.starts_with ~prefix:"surrogate-in-input-str" s then 30 let colon_pos = String.index s ':' in 31 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 32 let cp = int_of_string ("0x" ^ cp_str) in 33 Printf.sprintf "Forbidden code point U+%04x." cp 34 (* Character reference errors *) 35 else if String.starts_with ~prefix:"control-character-reference:" s then 36 let cp_str = String.sub s 28 (String.length s - 28) in 37 let cp = int_of_string ("0x" ^ cp_str) in 38 if cp = 0x0D then 39 "A numeric character reference expanded to carriage return." 40 else 41 Printf.sprintf "Character reference expands to a control character (U+%04x)." cp 42 else if String.starts_with ~prefix:"noncharacter-character-referenc" s then 43 let colon_pos = String.index s ':' in 44 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 45 let cp = int_of_string ("0x" ^ cp_str) in 46 (* U+FDD0-U+FDEF are "permanently unassigned" *) 47 if cp >= 0xFDD0 && cp <= 0xFDEF then 48 "Character reference expands to a permanently unassigned code point." 49 (* Astral noncharacters (planes 1-16) *) 50 else if cp >= 0x10000 then 51 Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp 52 else 53 Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp 54 else if String.starts_with ~prefix:"character-reference-outside-unicode-" s then 55 "Character reference outside the permissible Unicode range." 56 else if String.starts_with ~prefix:"surrogate-character-referen" s then 57 let colon_pos = String.index s ':' in 58 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 59 let cp = int_of_string ("0x" ^ cp_str) in 60 Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp 61 else if s = "no-p-element-in-scope" then 62 Printf.sprintf "No %s element in scope but a %s end tag seen." (q "p") (q "p") 63 else if s = "end-tag-p-implied-but-open-elements" then 64 Printf.sprintf "End tag %s implied, but there were open elements." (q "p") 65 else if s = "end-tag-br" then 66 Printf.sprintf "End tag %s." (q "br") 67 else if s = "expected-closing-tag-but-got-eof" then 68 "End of file seen and there were open elements." 69 else if String.starts_with ~prefix:"bad-start-tag-in-head-noscri" s then 70 let colon_pos = String.index s ':' in 71 let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 72 Printf.sprintf "Bad start tag in %s in %s in %s." (q element) (q "noscript") (q "head") 73 else if String.starts_with ~prefix:"unexpected-end-tag:" s then 74 let element = String.sub s 19 (String.length s - 19) in 75 Printf.sprintf "Stray end tag %s." (q element) 76 else if String.starts_with ~prefix:"start-tag-in-table:" s then 77 let tag = String.sub s 19 (String.length s - 19) in 78 Printf.sprintf "Start tag %s seen in %s." (q tag) (q "table") 79 else 80 Printf.sprintf "Parse error: %s" s 81 with _ -> Printf.sprintf "Parse error: %s" s) 82 | _ -> Printf.sprintf "Parse error: %s" code_str 83 84let of_parse_error ?system_id err = 85 let code = Html5rw.error_code err in 86 let line = Html5rw.error_line err in 87 let column = Html5rw.error_column err in 88 let location = Message.make_location ~line ~column ?system_id () in 89 let message = message_of_parse_error code in 90 Message.of_parse_error ~location ~message code 91 92let collect_parse_errors ?system_id result = 93 let errors = Html5rw.errors result in 94 let is_xhtml = match system_id with 95 | Some s -> String.length s > 6 && String.sub s (String.length s - 6) 6 = ".xhtml" 96 | None -> false 97 in 98 let filtered_errors = 99 if is_xhtml then 100 (* XHTML has different requirements than HTML: 101 - No DOCTYPE required 102 - Self-closing syntax is valid for all elements *) 103 List.filter (fun err -> 104 match Html5rw.error_code err with 105 | Html5rw.Parse_error_code.Tree_construction_error "expected-doctype-but-got-other" -> false 106 | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus -> false 107 | _ -> true 108 ) errors 109 else errors 110 in 111 List.map (of_parse_error ?system_id) filtered_errors