OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 7.1 kB view raw
(** DOM tree traversal for HTML5 conformance checking.

    This module provides functions to traverse DOM trees and apply checkers
    to validate HTML5 documents. It implements a depth-first traversal in
    document order that visits every node in the tree and notifies checkers
    of traversal events.

    {2 Traversal Model}

    The walker follows a SAX-like event model, emitting events as it
    encounters different node types during traversal:

    {v
    Document
    └── html (start_element "html")
        ├── head (start_element "head")
        │   └── title (start_element "title")
        │       ├── #text "Page Title" (characters)
        │       └── (end_element "title")
        └── body (start_element "body")
            └── p (start_element "p")
                ├── #text "Hello " (characters)
                ├── b (start_element "b")
                │   ├── #text "world" (characters)
                │   └── (end_element "b")
                ├── #text "!" (characters)
                └── (end_element "p")
    end_document
    v}

    {2 Event Sequence}

    For each element node:
    1. {!Checker.S.start_element} is called when entering the element
    2. Children are recursively traversed
    3. {!Checker.S.end_element} is called when exiting the element

    For text and comment nodes:
    - {!Checker.S.characters} is called with the text content

    After the entire tree is traversed:
    - {!Checker.S.end_document} is called on all checkers

    {2 Checker Coordination}

    When multiple checkers are used:
    - All checkers receive the same event sequence
    - Events are delivered to checkers in the order they appear in the list
    - Each checker maintains independent state
    - Messages from all checkers are collected together

    This allows composing orthogonal validation rules without interference.

    {2 Usage Examples}

    {b Single checker:}
    {[
      let checker = Checker.noop () in
      let collector = Message_collector.create () in
      walk checker collector dom;
      let messages = Message_collector.messages collector in
      List.iter Message.pp messages
    ]}

    {b Multiple checkers:}
    {[
      let checkers = [checker1; checker2; checker3] in
      let collector = Message_collector.create () in
      walk_all checkers collector dom;
      (* Analyze messages from all checkers *)
    ]}

    {b Registry of checkers:}
    {[
      let registry = Checker_registry.default () in
      let collector = Message_collector.create () in
      walk_registry registry collector dom;
      (* All registered checkers have validated the DOM *)
    ]} *)

(** {1 Single Checker Traversal} *)

val walk : Checker.t -> Message_collector.t -> Html5rw.Dom.node -> unit
(** [walk checker collector node] traverses a DOM tree with a single checker.

    @param checker The checker to apply during traversal
    @param collector The message collector for validation messages
    @param node The root node to start traversal from

    The traversal is depth-first, in document order: for each element, the
    checker receives a {!Checker.S.start_element} event, then children
    are recursively traversed, then an {!Checker.S.end_element} event
    is emitted.

    After the entire tree is traversed, {!Checker.S.end_document} is
    called to allow the checker to emit any final validation messages.

    {b Example:}
    {[
      (* Validate a parsed HTML document *)
      let checker = Checker.noop () in
      let collector = Message_collector.create () in
      walk checker collector document_node;

      (* Check for errors *)
      let messages = Message_collector.messages collector in
      let errors = List.filter
        (fun msg -> msg.Message.severity = Message.Error)
        messages in
      if errors <> [] then
        Printf.printf "Found %d errors\n" (List.length errors)
    ]}

    {b Notes:}
    - Only element nodes trigger start/end events
    - Text and comment nodes trigger character events
    - Document and doctype nodes are silently skipped
    - The traversal follows document order (parent before children,
      earlier siblings before later ones) *)

(** {1 Multiple Checker Traversal} *)

val walk_all :
  Checker.t list -> Message_collector.t -> Html5rw.Dom.node -> unit
(** [walk_all checkers collector node] traverses a DOM tree with multiple
    checkers.

    @param checkers List of checkers to apply during traversal
    @param collector The message collector for validation messages
    @param node The root node to start traversal from

    This performs a single tree traversal, delivering each event to all
    checkers in sequence. This is more efficient than calling {!walk}
    multiple times.

    All checkers receive events in the order they appear in the list.
    Each checker maintains independent state, so validation rules can
    be composed without interference.

    {b Example:}
    {[
      (* Run multiple validation passes in one traversal *)
      let structure_checker = (module StructureChecker : Checker.S) in
      let attribute_checker = (module AttributeChecker : Checker.S) in
      let obsolete_checker = (module ObsoleteChecker : Checker.S) in

      let checkers = [structure_checker; attribute_checker; obsolete_checker] in
      let collector = Message_collector.create () in

      walk_all checkers collector document_node;

      (* All three checkers have validated the document *)
      let messages = Message_collector.messages collector in
      Message_format.print_messages messages
    ]}

    {b Empty list behavior:}
    If the checkers list is empty, the tree is traversed but no validation
    is performed. This is equivalent to calling [walk (Checker.noop ()) ...]. *)

(** {1 Registry-Based Traversal} *)

val walk_registry :
  Checker_registry.t -> Message_collector.t -> Html5rw.Dom.node -> unit
(** [walk_registry registry collector node] traverses a DOM tree with all
    checkers from a registry.

    @param registry The registry containing checkers to apply
    @param collector The message collector for validation messages
    @param node The root node to start traversal from

    This is equivalent to:
    {[
      let checkers = Checker_registry.all registry in
      walk_all checkers collector node
    ]}

    Use this when you want to run a pre-configured set of checkers
    without manually extracting them from the registry.

    {b Example:}
    {[
      (* Set up registry with desired checkers *)
      let registry = Checker_registry.default () in
      Checker_registry.register registry "custom" my_checker;

      (* Validate multiple documents with same checker set *)
      List.iter (fun doc ->
        let collector = Message_collector.create () in
        walk_registry registry collector doc;
        report_results collector
      ) documents
    ]}

    {b Empty registry behavior:}
    If the registry is empty, the tree is traversed but no validation
    is performed. *)