OCaml HTML5 parser/serialiser based on Python's JustHTML

more

Changed files
+113
lib
html5_checker
+113
lib/html5_checker/specialized/unknown_element_checker.ml
···
··· 1 + (** Unknown HTML element checker. 2 + 3 + Detects elements that are not in the HTML5 specification and produces 4 + appropriate error messages. Custom elements (with hyphens) are allowed. *) 5 + 6 + (** Set of all known HTML5 element names. *) 7 + let known_elements = 8 + let elements = [ 9 + (* Document metadata *) 10 + "html"; "head"; "title"; "base"; "link"; "meta"; "style"; 11 + 12 + (* Sections *) 13 + "body"; "article"; "section"; "nav"; "aside"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; 14 + "hgroup"; "header"; "footer"; "address"; "main"; 15 + 16 + (* Grouping content *) 17 + "p"; "hr"; "pre"; "blockquote"; "ol"; "ul"; "menu"; "li"; "dl"; "dt"; "dd"; 18 + "figure"; "figcaption"; "div"; 19 + 20 + (* Text-level semantics *) 21 + "a"; "em"; "strong"; "small"; "s"; "cite"; "q"; "dfn"; "abbr"; "ruby"; "rt"; "rp"; 22 + "data"; "time"; "code"; "var"; "samp"; "kbd"; "sub"; "sup"; "i"; "b"; "u"; "mark"; 23 + "bdi"; "bdo"; "span"; "br"; "wbr"; "search"; 24 + 25 + (* Edits *) 26 + "ins"; "del"; 27 + 28 + (* Embedded content *) 29 + "picture"; "source"; "img"; "iframe"; "embed"; "object"; "video"; "audio"; 30 + "track"; "map"; "area"; "math"; "svg"; 31 + 32 + (* Tables *) 33 + "table"; "caption"; "colgroup"; "col"; "tbody"; "thead"; "tfoot"; "tr"; "td"; "th"; 34 + 35 + (* Forms *) 36 + "form"; "label"; "input"; "button"; "select"; "datalist"; "optgroup"; "option"; 37 + "textarea"; "output"; "progress"; "meter"; "fieldset"; "legend"; 38 + 39 + (* Interactive *) 40 + "details"; "summary"; "dialog"; 41 + 42 + (* Scripting *) 43 + "script"; "noscript"; "template"; "slot"; "canvas"; 44 + 45 + (* Deprecated but still recognized *) 46 + "param"; 47 + ] in 48 + let tbl = Hashtbl.create (List.length elements) in 49 + List.iter (fun el -> Hashtbl.add tbl el ()) elements; 50 + tbl 51 + 52 + (** Check if an element name is a custom element (contains hyphen). *) 53 + let is_custom_element name = 54 + String.contains name '-' 55 + 56 + (** Check if an element name is known. *) 57 + let is_known_element name = 58 + let name_lower = String.lowercase_ascii name in 59 + Hashtbl.mem known_elements name_lower || is_custom_element name_lower 60 + 61 + type state = { 62 + mutable stack : string list; (* Parent element stack *) 63 + } 64 + 65 + let create () = { stack = [] } 66 + 67 + let reset state = 68 + state.stack <- [] 69 + 70 + let start_element state ~name ~namespace ~attrs:_ collector = 71 + (* Only check HTML namespace elements *) 72 + match namespace with 73 + | Some _ -> () (* Skip SVG, MathML, etc. *) 74 + | None -> 75 + let name_lower = String.lowercase_ascii name in 76 + 77 + (* Check if element is unknown *) 78 + if not (is_known_element name_lower) then begin 79 + (* Get the parent element name *) 80 + let parent = match state.stack with 81 + | p :: _ -> p 82 + | [] -> "document" 83 + in 84 + (* Produce error: unknown element not allowed as child *) 85 + Message_collector.add_typed collector 86 + (Error_code.Element_not_allowed_as_child { child = name; parent }) 87 + end; 88 + 89 + (* Always push to stack for tracking *) 90 + state.stack <- name_lower :: state.stack 91 + 92 + let end_element state ~name:_ ~namespace _ = 93 + match namespace with 94 + | Some _ -> () 95 + | None -> 96 + match state.stack with 97 + | _ :: rest -> state.stack <- rest 98 + | [] -> () (* Stack underflow - shouldn't happen *) 99 + 100 + let characters _state _text _collector = () 101 + 102 + let end_document _state _collector = () 103 + 104 + let checker = 105 + (module struct 106 + type nonrec state = state 107 + let create = create 108 + let reset = reset 109 + let start_element = start_element 110 + let end_element = end_element 111 + let characters = characters 112 + let end_document = end_document 113 + end : Checker.S)