OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+29
lib/htmlrw_check/checker.ml
··· 38 end 39 40 let noop () = (module Noop : S)
··· 38 end 39 40 let noop () = (module Noop : S) 41 + 42 + (** Input signature for Make functor *) 43 + module type Input = sig 44 + type state 45 + val create : unit -> state 46 + val reset : state -> unit 47 + val start_element : state -> element:Element.t -> Message_collector.t -> unit 48 + val end_element : state -> tag:Tag.element_tag -> Message_collector.t -> unit 49 + val characters : (state -> string -> Message_collector.t -> unit) option 50 + val end_document : (state -> Message_collector.t -> unit) option 51 + end 52 + 53 + (** Functor to create a checker with default implementations for optional callbacks *) 54 + module Make (I : Input) : S with type state = I.state = struct 55 + type state = I.state 56 + 57 + let create = I.create 58 + let reset = I.reset 59 + let start_element = I.start_element 60 + let end_element = I.end_element 61 + 62 + let characters = match I.characters with 63 + | Some f -> f 64 + | None -> fun _ _ _ -> () 65 + 66 + let end_document = match I.end_document with 67 + | Some f -> f 68 + | None -> fun _ _ -> () 69 + end
+42
lib/htmlrw_check/checker.mli
··· 172 (* Does nothing when walked over a DOM tree *) 173 ]} 174 *)
··· 172 (* Does nothing when walked over a DOM tree *) 173 ]} 174 *) 175 + 176 + (** {1 Checker Construction Helpers} *) 177 + 178 + (** Input signature for {!Make} functor. 179 + 180 + Only the required callbacks need to be provided. Optional callbacks 181 + (characters, end_document) default to no-op implementations. *) 182 + module type Input = sig 183 + type state 184 + val create : unit -> state 185 + val reset : state -> unit 186 + val start_element : state -> element:Element.t -> Message_collector.t -> unit 187 + val end_element : state -> tag:Tag.element_tag -> Message_collector.t -> unit 188 + 189 + (** Optional: called for text content. Default: no-op. *) 190 + val characters : (state -> string -> Message_collector.t -> unit) option 191 + 192 + (** Optional: called at document end. Default: no-op. *) 193 + val end_document : (state -> Message_collector.t -> unit) option 194 + end 195 + 196 + (** Functor to create a checker from an {!Input} module. 197 + 198 + This reduces boilerplate when creating checkers that don't need 199 + to handle all events. The characters and end_document callbacks 200 + default to no-ops if not provided. 201 + 202 + {b Example:} 203 + {[ 204 + let checker = Checker.Make(struct 205 + type state = { mutable count : int } 206 + let create () = { count = 0 } 207 + let reset s = s.count <- 0 208 + let start_element s ~element collector = 209 + s.count <- s.count + 1 210 + let end_element _ ~tag:_ _ = () 211 + let characters = None (* Use default no-op *) 212 + let end_document = None (* Use default no-op *) 213 + end) 214 + ]} 215 + *) 216 + module Make : functor (I : Input) -> S with type state = I.state
+141
lib/htmlrw_check/context_tracker.ml
···
··· 1 + (** Reusable context/ancestor tracking for checkers. 2 + 3 + Many checkers need to track element ancestors, depth, or maintain 4 + context stacks during DOM traversal. This module provides common 5 + utilities to reduce duplication. *) 6 + 7 + (** Generic stack-based context tracker. *) 8 + module Stack : sig 9 + type 'a t 10 + 11 + (** Create an empty context stack. *) 12 + val create : unit -> 'a t 13 + 14 + (** Reset the stack to empty. *) 15 + val reset : 'a t -> unit 16 + 17 + (** Push a context onto the stack. *) 18 + val push : 'a t -> 'a -> unit 19 + 20 + (** Pop a context from the stack. Returns None if empty. *) 21 + val pop : 'a t -> 'a option 22 + 23 + (** Get the current (top) context without removing it. *) 24 + val current : 'a t -> 'a option 25 + 26 + (** Get current depth (number of items on stack). *) 27 + val depth : 'a t -> int 28 + 29 + (** Check if stack is empty. *) 30 + val is_empty : 'a t -> bool 31 + 32 + (** Get all ancestors (bottom to top). *) 33 + val to_list : 'a t -> 'a list 34 + 35 + (** Check if any ancestor satisfies predicate. *) 36 + val exists : 'a t -> ('a -> bool) -> bool 37 + 38 + (** Find first ancestor satisfying predicate (top to bottom). *) 39 + val find : 'a t -> ('a -> bool) -> 'a option 40 + 41 + (** Iterate over all contexts (top to bottom). *) 42 + val iter : 'a t -> ('a -> unit) -> unit 43 + end = struct 44 + type 'a t = { mutable stack : 'a list } 45 + 46 + let create () = { stack = [] } 47 + let reset t = t.stack <- [] 48 + let push t x = t.stack <- x :: t.stack 49 + let pop t = match t.stack with 50 + | [] -> None 51 + | x :: rest -> t.stack <- rest; Some x 52 + let current t = match t.stack with 53 + | [] -> None 54 + | x :: _ -> Some x 55 + let depth t = List.length t.stack 56 + let is_empty t = t.stack = [] 57 + let to_list t = List.rev t.stack 58 + let exists t f = List.exists f t.stack 59 + let find t f = List.find_opt f t.stack 60 + let iter t f = List.iter f t.stack 61 + end 62 + 63 + (** Simple depth counter for tracking nesting level. *) 64 + module Depth : sig 65 + type t 66 + 67 + (** Create a depth counter starting at 0. *) 68 + val create : unit -> t 69 + 70 + (** Reset depth to 0. *) 71 + val reset : t -> unit 72 + 73 + (** Increment depth (entering element). *) 74 + val enter : t -> unit 75 + 76 + (** Decrement depth (leaving element). Returns false if was already 0. *) 77 + val leave : t -> bool 78 + 79 + (** Get current depth. *) 80 + val get : t -> int 81 + 82 + (** Check if inside (depth > 0). *) 83 + val is_inside : t -> bool 84 + end = struct 85 + type t = { mutable depth : int } 86 + 87 + let create () = { depth = 0 } 88 + let reset t = t.depth <- 0 89 + let enter t = t.depth <- t.depth + 1 90 + let leave t = 91 + if t.depth > 0 then begin 92 + t.depth <- t.depth - 1; 93 + true 94 + end else false 95 + let get t = t.depth 96 + let is_inside t = t.depth > 0 97 + end 98 + 99 + (** Element name stack for tracking ancestors by name. *) 100 + module Ancestors : sig 101 + type t 102 + 103 + (** Create an empty ancestor tracker. *) 104 + val create : unit -> t 105 + 106 + (** Reset to empty. *) 107 + val reset : t -> unit 108 + 109 + (** Push an element name onto the ancestor stack. *) 110 + val push : t -> string -> unit 111 + 112 + (** Pop an element from the ancestor stack. *) 113 + val pop : t -> unit 114 + 115 + (** Get the immediate parent element name. *) 116 + val parent : t -> string option 117 + 118 + (** Check if an element name is an ancestor. *) 119 + val has_ancestor : t -> string -> bool 120 + 121 + (** Get depth (number of ancestors). *) 122 + val depth : t -> int 123 + 124 + (** Get all ancestor names (outermost first). *) 125 + val to_list : t -> string list 126 + end = struct 127 + type t = { mutable stack : string list } 128 + 129 + let create () = { stack = [] } 130 + let reset t = t.stack <- [] 131 + let push t name = t.stack <- name :: t.stack 132 + let pop t = match t.stack with 133 + | _ :: rest -> t.stack <- rest 134 + | [] -> () 135 + let parent t = match t.stack with 136 + | x :: _ -> Some x 137 + | [] -> None 138 + let has_ancestor t name = List.mem name t.stack 139 + let depth t = List.length t.stack 140 + let to_list t = List.rev t.stack 141 + end
+116
lib/htmlrw_check/context_tracker.mli
···
··· 1 + (** Reusable context/ancestor tracking for checkers. 2 + 3 + Many checkers need to track element ancestors, depth, or maintain 4 + context stacks during DOM traversal. This module provides common 5 + utilities to reduce duplication across checkers. 6 + 7 + {2 Available Trackers} 8 + 9 + - {!Stack}: Generic stack for any context type 10 + - {!Depth}: Simple integer depth counter 11 + - {!Ancestors}: String-based element ancestor tracking *) 12 + 13 + (** {2 Generic Stack} *) 14 + 15 + (** Generic stack-based context tracker. 16 + 17 + Use this when you need to track complex state at each nesting level. 18 + For example, tracking whether each ancestor has certain attributes. *) 19 + module Stack : sig 20 + type 'a t 21 + 22 + (** Create an empty context stack. *) 23 + val create : unit -> 'a t 24 + 25 + (** Reset the stack to empty. *) 26 + val reset : 'a t -> unit 27 + 28 + (** Push a context onto the stack. *) 29 + val push : 'a t -> 'a -> unit 30 + 31 + (** Pop a context from the stack. Returns None if empty. *) 32 + val pop : 'a t -> 'a option 33 + 34 + (** Get the current (top) context without removing it. *) 35 + val current : 'a t -> 'a option 36 + 37 + (** Get current depth (number of items on stack). *) 38 + val depth : 'a t -> int 39 + 40 + (** Check if stack is empty. *) 41 + val is_empty : 'a t -> bool 42 + 43 + (** Get all ancestors (bottom to top). *) 44 + val to_list : 'a t -> 'a list 45 + 46 + (** Check if any ancestor satisfies predicate. *) 47 + val exists : 'a t -> ('a -> bool) -> bool 48 + 49 + (** Find first ancestor satisfying predicate (top to bottom). *) 50 + val find : 'a t -> ('a -> bool) -> 'a option 51 + 52 + (** Iterate over all contexts (top to bottom). *) 53 + val iter : 'a t -> ('a -> unit) -> unit 54 + end 55 + 56 + (** {2 Depth Counter} *) 57 + 58 + (** Simple depth counter for tracking nesting level. 59 + 60 + Use this when you only need to know if you're inside a certain 61 + element type, not the full context at each level. *) 62 + module Depth : sig 63 + type t 64 + 65 + (** Create a depth counter starting at 0. *) 66 + val create : unit -> t 67 + 68 + (** Reset depth to 0. *) 69 + val reset : t -> unit 70 + 71 + (** Increment depth (entering element). *) 72 + val enter : t -> unit 73 + 74 + (** Decrement depth (leaving element). Returns false if was already 0. *) 75 + val leave : t -> bool 76 + 77 + (** Get current depth. *) 78 + val get : t -> int 79 + 80 + (** Check if inside (depth > 0). *) 81 + val is_inside : t -> bool 82 + end 83 + 84 + (** {2 Ancestor Tracker} *) 85 + 86 + (** Element name stack for tracking ancestors by name. 87 + 88 + Use this when you need to check if certain elements are ancestors, 89 + but don't need complex context at each level. *) 90 + module Ancestors : sig 91 + type t 92 + 93 + (** Create an empty ancestor tracker. *) 94 + val create : unit -> t 95 + 96 + (** Reset to empty. *) 97 + val reset : t -> unit 98 + 99 + (** Push an element name onto the ancestor stack. *) 100 + val push : t -> string -> unit 101 + 102 + (** Pop an element from the ancestor stack. *) 103 + val pop : t -> unit 104 + 105 + (** Get the immediate parent element name. *) 106 + val parent : t -> string option 107 + 108 + (** Check if an element name is an ancestor. *) 109 + val has_ancestor : t -> string -> bool 110 + 111 + (** Get depth (number of ancestors). *) 112 + val depth : t -> int 113 + 114 + (** Get all ancestor names (outermost first). *) 115 + val to_list : t -> string list 116 + end
+15
lib/htmlrw_check/datatype/datatype.ml
··· 40 let end_pos = find_end (len - 1) in 41 if start > end_pos then "" 42 else String.sub s start (end_pos - start + 1)
··· 40 let end_pos = find_end (len - 1) in 41 if start > end_pos then "" 42 else String.sub s start (end_pos - start + 1) 43 + 44 + (** Factory for creating enum-based validators. 45 + Many HTML attributes accept a fixed set of keyword values. *) 46 + let make_enum ~name ~values ?(allow_empty = true) () : t = 47 + let values_set = List.map String.lowercase_ascii values in 48 + let values_str = String.concat ", " (List.map (Printf.sprintf "'%s'") values) in 49 + (module struct 50 + let name = name 51 + let validate s = 52 + let s_lower = string_to_ascii_lowercase s in 53 + if (allow_empty && s = "") || List.mem s_lower values_set then Ok () 54 + else Error (Printf.sprintf "The value '%s' is not a valid %s value. Expected %s%s." 55 + s name (if allow_empty then "empty string, " else "") values_str) 56 + let is_valid s = Result.is_ok (validate s) 57 + end : S)
+8
lib/htmlrw_check/datatype/datatype.mli
··· 43 44 (** Trim HTML5 whitespace from both ends of a string. *) 45 val trim_html_spaces : string -> string
··· 43 44 (** Trim HTML5 whitespace from both ends of a string. *) 45 val trim_html_spaces : string -> string 46 + 47 + (** {2 Datatype Factories} *) 48 + 49 + (** Create an enum-based validator for attributes with fixed keyword values. 50 + @param name The datatype name (e.g., "loading", "crossorigin") 51 + @param values List of valid keyword values (case-insensitive) 52 + @param allow_empty Whether empty string is valid (default: true) *) 53 + val make_enum : name:string -> values:string list -> ?allow_empty:bool -> unit -> t
+26
lib/htmlrw_check/error_code.ml
··· 726 727 (* Generic *) 728 | `Generic message -> message
··· 726 727 (* Generic *) 728 | `Generic message -> message 729 + 730 + (** {2 Error Construction Helpers} *) 731 + 732 + (** Create a bad attribute value error with element, attribute, value, and reason. *) 733 + let bad_value ~element ~attr ~value ~reason : t = 734 + `Attr (`Bad_value (`Elem element, `Attr attr, `Value value, `Reason reason)) 735 + 736 + (** Create a bad attribute value error with just a message. *) 737 + let bad_value_msg msg : t = 738 + `Attr (`Bad_value_generic (`Message msg)) 739 + 740 + (** Create a missing required attribute error. *) 741 + let missing_attr ~element ~attr : t = 742 + `Attr (`Missing (`Elem element, `Attr attr)) 743 + 744 + (** Create an attribute not allowed error. *) 745 + let attr_not_allowed ~element ~attr : t = 746 + `Attr (`Not_allowed (`Attr attr, `Elem element)) 747 + 748 + (** Create an element not allowed as child error. *) 749 + let not_allowed_as_child ~child ~parent : t = 750 + `Element (`Not_allowed_as_child (`Child child, `Parent parent)) 751 + 752 + (** Create a must not be empty error. *) 753 + let must_not_be_empty ~element : t = 754 + `Element (`Must_not_be_empty (`Elem element))
+28
lib/htmlrw_check/error_code.mli
··· 748 (** Format a string with Unicode curly quotes. 749 Wraps the string in U+201C and U+201D ("..."). *) 750 val q : string -> string
··· 748 (** Format a string with Unicode curly quotes. 749 Wraps the string in U+201C and U+201D ("..."). *) 750 val q : string -> string 751 + 752 + (** {1 Error Construction Helpers} 753 + 754 + These functions simplify creating common error types. *) 755 + 756 + (** Create a bad attribute value error. 757 + Example: [bad_value ~element:"img" ~attr:"src" ~value:"" ~reason:"URL cannot be empty"] *) 758 + val bad_value : element:string -> attr:string -> value:string -> reason:string -> t 759 + 760 + (** Create a bad attribute value error with just a message. 761 + Example: [bad_value_msg "The value must be a valid URL"] *) 762 + val bad_value_msg : string -> t 763 + 764 + (** Create a missing required attribute error. 765 + Example: [missing_attr ~element:"img" ~attr:"alt"] *) 766 + val missing_attr : element:string -> attr:string -> t 767 + 768 + (** Create an attribute not allowed error. 769 + Example: [attr_not_allowed ~element:"span" ~attr:"href"] *) 770 + val attr_not_allowed : element:string -> attr:string -> t 771 + 772 + (** Create an element not allowed as child error. 773 + Example: [not_allowed_as_child ~child:"div" ~parent:"p"] *) 774 + val not_allowed_as_child : child:string -> parent:string -> t 775 + 776 + (** Create a must not be empty error. 777 + Example: [must_not_be_empty ~element:"title"] *) 778 + val must_not_be_empty : element:string -> t
+8 -27
lib/htmlrw_check/specialized/h1_checker.mli
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 - SPDX-License-Identifier: MIT 4 - ---------------------------------------------------------------------------*) 5 6 - (** H1 element counter and validator. 7 8 - This checker warns about multiple [<h1>] elements in a document. 9 - While HTML5 technically allows multiple [<h1>] elements when using 10 - the document outline algorithm, this algorithm was never implemented 11 - by browsers and has been removed from the specification. 12 13 - {2 Best Practice} 14 - 15 - Documents should have exactly one [<h1>] element that represents the 16 - main heading of the page. Multiple [<h1>] elements can confuse users 17 - and assistive technologies about the document's structure. 18 - 19 - {2 Special Cases} 20 - 21 - - [<h1>] elements inside [<svg>] content (e.g., in [<foreignObject>]) 22 - are not counted, as they may represent different content contexts 23 - - The checker reports a warning after the second [<h1>] is encountered 24 25 {2 Error Messages} 26 27 - Reports [Multiple_h1] when more than one [<h1>] element is found 28 - in the document. 29 - 30 - @see <https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements> 31 - HTML Standard: The h1-h6 elements 32 - *) 33 34 val checker : Checker.t 35 - (** The h1 element counter/validator instance. *)
··· 1 + (** H1 element counter checker. 2 3 + This checker validates that documents don't have multiple h1 elements, 4 + which can confuse document structure and accessibility tools. 5 6 + {2 Validation Rules} 7 8 + - Documents should have at most one [<h1>] element 9 + - [<h1>] elements inside SVG content (foreignObject, desc) are not counted 10 11 {2 Error Messages} 12 13 + - [Multiple_h1]: Document contains more than one h1 element *) 14 15 val checker : Checker.t 16 + (** The H1 checker instance. *)
+14 -2
lib/htmlrw_check/specialized/heading_checker.ml
··· 14 mutable first_heading_checked : bool; 15 mutable in_heading : Tag.html_tag option; 16 mutable heading_has_text : bool; 17 } 18 19 let create () = ··· 24 first_heading_checked = false; 25 in_heading = None; 26 heading_has_text = false; 27 } 28 29 let reset state = ··· 32 state.has_any_heading <- false; 33 state.first_heading_checked <- false; 34 state.in_heading <- None; 35 - state.heading_has_text <- false 36 37 (** Check if text is effectively empty (only whitespace). *) 38 let is_empty_text text = ··· 48 49 let start_element state ~element collector = 50 match element.Element.tag with 51 - | Tag.Html (#Tag.heading_tag as h) -> 52 let level = match Tag.heading_level h with Some l -> l | None -> 0 in 53 let name = Tag.html_tag_to_string h in 54 state.has_any_heading <- true; ··· 89 | _ -> () 90 91 let end_element state ~tag collector = 92 match state.in_heading, tag with 93 | Some h, Tag.Html h2 when h = h2 -> 94 if not state.heading_has_text then
··· 14 mutable first_heading_checked : bool; 15 mutable in_heading : Tag.html_tag option; 16 mutable heading_has_text : bool; 17 + mutable svg_depth : int; (* Track depth inside SVG - headings in SVG don't count *) 18 } 19 20 let create () = ··· 25 first_heading_checked = false; 26 in_heading = None; 27 heading_has_text = false; 28 + svg_depth = 0; 29 } 30 31 let reset state = ··· 34 state.has_any_heading <- false; 35 state.first_heading_checked <- false; 36 state.in_heading <- None; 37 + state.heading_has_text <- false; 38 + state.svg_depth <- 0 39 40 (** Check if text is effectively empty (only whitespace). *) 41 let is_empty_text text = ··· 51 52 let start_element state ~element collector = 53 match element.Element.tag with 54 + | Tag.Svg _ -> 55 + (* Track SVG depth - headings inside SVG (foreignObject, desc) don't count *) 56 + state.svg_depth <- state.svg_depth + 1 57 + | Tag.Html (#Tag.heading_tag as h) when state.svg_depth = 0 -> 58 let level = match Tag.heading_level h with Some l -> l | None -> 0 in 59 let name = Tag.html_tag_to_string h in 60 state.has_any_heading <- true; ··· 95 | _ -> () 96 97 let end_element state ~tag collector = 98 + (* Track SVG depth *) 99 + (match tag with 100 + | Tag.Svg _ when state.svg_depth > 0 -> 101 + state.svg_depth <- state.svg_depth - 1 102 + | _ -> ()); 103 + (* Check for empty headings *) 104 match state.in_heading, tag with 105 | Some h, Tag.Html h2 when h = h2 -> 106 if not state.heading_has_text then