OCaml HTML5 parser/serialiser based on Python's JustHTML

fix javascript overflow

Changed files
+137 -158
lib
check
+137 -158
lib/check/semantic/nesting_checker.ml
··· 1 - (** Interactive element nesting checker implementation. *) 1 + (** Interactive element nesting checker implementation. 2 2 3 - (** Special ancestors that need tracking for nesting validation. 3 + Uses bool arrays instead of bitmasks for JavaScript compatibility 4 + (JS bitwise ops are limited to 32 bits). *) 4 5 5 - This array defines the elements whose presence in the ancestor chain 6 - affects validation of descendant elements. The order is significant 7 - as it determines bit positions in the ancestor bitmask. *) 6 + (** Special ancestors that need tracking for nesting validation. *) 8 7 let special_ancestors = 9 8 [| "a"; "address"; "body"; "button"; "caption"; "dfn"; "dt"; "figcaption"; 10 9 "figure"; "footer"; "form"; "header"; "label"; "map"; "noscript"; "th"; ··· 13 12 "s"; "small"; "mark"; "abbr"; "cite"; "code"; "q"; "sub"; "sup"; "samp"; 14 13 "kbd"; "var" |] 15 14 16 - (** Hashtable for O(1) lookup of special ancestor bit positions *) 15 + let num_ancestors = Array.length special_ancestors 16 + 17 + (** Hashtable for O(1) lookup of special ancestor indices *) 17 18 let special_ancestor_table : (string, int) Hashtbl.t = 18 19 let tbl = Hashtbl.create 64 in 19 20 Array.iteri (fun i name -> Hashtbl.add tbl name i) special_ancestors; 20 21 tbl 21 22 22 - (** Get the bit position for a special ancestor element. 23 - Returns [-1] if the element is not a special ancestor. O(1) lookup. *) 24 - let special_ancestor_number name = 23 + (** Get the index for a special ancestor element. 24 + Returns [-1] if the element is not a special ancestor. *) 25 + let special_ancestor_index name = 25 26 match Hashtbl.find_opt special_ancestor_table name with 26 27 | Some i -> i 27 28 | None -> -1 ··· 31 32 [| "a"; "button"; "details"; "embed"; "iframe"; "label"; "select"; 32 33 "textarea" |] 33 34 34 - (** Map from descendant element name to bitmask of prohibited ancestors. *) 35 - let ancestor_mask_by_descendant : (string, int) Hashtbl.t = 35 + (** Create an empty bool array for ancestor tracking *) 36 + let empty_flags () = Array.make num_ancestors false 37 + 38 + (** Copy a bool array *) 39 + let copy_flags flags = Array.copy flags 40 + 41 + (** Map from descendant element name to prohibited ancestor flags. *) 42 + let prohibited_ancestors_by_descendant : (string, bool array) Hashtbl.t = 36 43 Hashtbl.create 64 37 44 38 - (** Map from descendant element name to bitmask of ancestors that cause content model violations. 39 - (These use different error messages than nesting violations.) *) 40 - let content_model_violation_mask : (string, int) Hashtbl.t = 45 + (** Map from descendant element name to content model violation flags. *) 46 + let content_model_violations : (string, bool array) Hashtbl.t = 41 47 Hashtbl.create 64 42 48 49 + (** Get or create prohibited ancestors array for a descendant *) 50 + let get_prohibited descendant = 51 + match Hashtbl.find_opt prohibited_ancestors_by_descendant descendant with 52 + | Some arr -> arr 53 + | None -> 54 + let arr = empty_flags () in 55 + Hashtbl.replace prohibited_ancestors_by_descendant descendant arr; 56 + arr 57 + 58 + (** Get or create content model violations array for a descendant *) 59 + let get_content_model_violations descendant = 60 + match Hashtbl.find_opt content_model_violations descendant with 61 + | Some arr -> arr 62 + | None -> 63 + let arr = empty_flags () in 64 + Hashtbl.replace content_model_violations descendant arr; 65 + arr 66 + 43 67 (** Register that [ancestor] is prohibited for [descendant]. *) 44 68 let register_prohibited_ancestor ancestor descendant = 45 - let number = special_ancestor_number ancestor in 46 - if number = -1 then 69 + let idx = special_ancestor_index ancestor in 70 + if idx = -1 then 47 71 failwith ("Ancestor not found in array: " ^ ancestor); 48 - let mask = 49 - match Hashtbl.find_opt ancestor_mask_by_descendant descendant with 50 - | None -> 0 51 - | Some m -> m 52 - in 53 - let new_mask = mask lor (1 lsl number) in 54 - Hashtbl.replace ancestor_mask_by_descendant descendant new_mask 72 + let arr = get_prohibited descendant in 73 + arr.(idx) <- true 55 74 56 75 (** Register a content model violation (phrasing-only element containing flow content). *) 57 76 let register_content_model_violation ancestor descendant = 58 77 register_prohibited_ancestor ancestor descendant; 59 - let number = special_ancestor_number ancestor in 60 - let mask = 61 - match Hashtbl.find_opt content_model_violation_mask descendant with 62 - | None -> 0 63 - | Some m -> m 64 - in 65 - let new_mask = mask lor (1 lsl number) in 66 - Hashtbl.replace content_model_violation_mask descendant new_mask 78 + let idx = special_ancestor_index ancestor in 79 + let arr = get_content_model_violations descendant in 80 + arr.(idx) <- true 67 81 68 82 (** Initialize the prohibited ancestor map. *) 69 83 let () = ··· 133 147 ) interactive_elements; 134 148 135 149 (* Phrasing-only elements: cannot contain flow content like p, div, h1-h6, etc. *) 136 - (* These are content model violations, not nesting violations. *) 137 150 let phrasing_only = ["span"; "strong"; "em"; "b"; "i"; "u"; "s"; "small"; "mark"; 138 151 "abbr"; "cite"; "code"; "q"; "sub"; "sup"; "samp"; "kbd"; "var"] in 139 152 let flow_content = ["p"; "div"; "article"; "section"; "nav"; "aside"; "header"; "footer"; ··· 145 158 ) flow_content 146 159 ) phrasing_only 147 160 148 - (** Bitmask constants for common checks. *) 149 - let a_button_mask = 150 - let a_num = special_ancestor_number "a" in 151 - let button_num = special_ancestor_number "button" in 152 - (1 lsl a_num) lor (1 lsl button_num) 153 - 154 - let map_mask = 155 - let map_num = special_ancestor_number "map" in 156 - 1 lsl map_num 161 + (** Indices for common checks *) 162 + let a_index = special_ancestor_index "a" 163 + let button_index = special_ancestor_index "button" 164 + let map_index = special_ancestor_index "map" 157 165 158 - (** Transparent elements - inherit content model from parent. O(1) hashtable lookup. *) 166 + (** Transparent elements - inherit content model from parent. *) 159 167 let transparent_elements_tbl = 160 168 Attr_utils.hashtbl_of_list ["a"; "canvas"; "video"; "audio"; "object"; "ins"; "del"; "map"] 161 169 ··· 163 171 164 172 (** Stack node representing an element's context. *) 165 173 type stack_node = { 166 - ancestor_mask : int; 174 + ancestor_flags : bool array; 167 175 name : string; 168 176 is_transparent : bool; 169 177 } ··· 171 179 (** Checker state. *) 172 180 type state = { 173 181 mutable stack : stack_node list; 174 - mutable ancestor_mask : int; 182 + mutable ancestor_flags : bool array; 175 183 } 176 184 177 185 let create () = 178 - { stack = []; ancestor_mask = 0 } 186 + { stack = []; ancestor_flags = empty_flags () } 179 187 180 188 let reset state = 181 189 state.stack <- []; 182 - state.ancestor_mask <- 0 190 + state.ancestor_flags <- empty_flags () 183 191 184 192 (** Get attribute value by name from attribute list. *) 185 193 let get_attr attrs name = ··· 192 200 (** Check if element is interactive based on its attributes. *) 193 201 let is_interactive_element name attrs = 194 202 match name with 195 - | "a" -> 196 - has_attr attrs "href" 197 - | "audio" | "video" -> 198 - has_attr attrs "controls" 199 - | "img" | "object" -> 200 - has_attr attrs "usemap" 203 + | "a" -> has_attr attrs "href" 204 + | "audio" | "video" -> has_attr attrs "controls" 205 + | "img" | "object" -> has_attr attrs "usemap" 201 206 | "input" -> 202 - begin match get_attr attrs "type" with 203 - | Some "hidden" -> false 204 - | _ -> true 205 - end 207 + (match get_attr attrs "type" with 208 + | Some "hidden" -> false 209 + | _ -> true) 206 210 | "button" | "details" | "embed" | "iframe" | "label" | "select" 207 - | "textarea" -> 208 - true 209 - | _ -> 210 - false 211 + | "textarea" -> true 212 + | _ -> false 211 213 212 - (** Find the nearest transparent element in the ancestor stack, if any. 213 - Returns the immediate parent's name if it's transparent, otherwise None. *) 214 + (** Find the nearest transparent element in the ancestor stack. *) 214 215 let find_nearest_transparent_parent state = 215 216 match state.stack with 216 217 | parent :: _ when parent.is_transparent -> Some parent.name ··· 218 219 219 220 (** Report nesting violations. *) 220 221 let check_nesting state name attrs collector = 221 - (* Compute the prohibited ancestor mask for this element *) 222 - let base_mask = 223 - match Hashtbl.find_opt ancestor_mask_by_descendant name with 224 - | Some m -> m 225 - | None -> 0 222 + (* Get prohibited ancestors for this element *) 223 + let prohibited = 224 + match Hashtbl.find_opt prohibited_ancestors_by_descendant name with 225 + | Some arr -> arr 226 + | None -> empty_flags () 226 227 in 227 228 228 - (* Get content model violation mask for this element *) 229 - let content_model_mask = 230 - match Hashtbl.find_opt content_model_violation_mask name with 231 - | Some m -> m 232 - | None -> 0 229 + (* Get content model violations for this element *) 230 + let content_violations = 231 + match Hashtbl.find_opt content_model_violations name with 232 + | Some arr -> arr 233 + | None -> empty_flags () 233 234 in 234 235 235 - (* Add interactive element restrictions if applicable *) 236 - let mask = 237 - if is_interactive_element name attrs then 238 - base_mask lor a_button_mask 239 - else 240 - base_mask 236 + (* Check if element is interactive (adds a/button restrictions) *) 237 + let is_interactive = is_interactive_element name attrs in 238 + 239 + (* Determine attribute to mention in error messages *) 240 + let attr = 241 + match name with 242 + | "a" when has_attr attrs "href" -> Some "href" 243 + | "audio" when has_attr attrs "controls" -> Some "controls" 244 + | "video" when has_attr attrs "controls" -> Some "controls" 245 + | "img" when has_attr attrs "usemap" -> Some "usemap" 246 + | "object" when has_attr attrs "usemap" -> Some "usemap" 247 + | _ -> None 241 248 in 242 249 243 - (* Check for violations *) 244 - if mask <> 0 then begin 245 - let mask_hit = state.ancestor_mask land mask in 246 - if mask_hit <> 0 then begin 247 - (* Determine if element has a special attribute to mention *) 248 - let attr = 249 - match name with 250 - | "a" when has_attr attrs "href" -> Some "href" 251 - | "audio" when has_attr attrs "controls" -> Some "controls" 252 - | "video" when has_attr attrs "controls" -> Some "controls" 253 - | "img" when has_attr attrs "usemap" -> Some "usemap" 254 - | "object" when has_attr attrs "usemap" -> Some "usemap" 255 - | _ -> None 250 + (* Find transparent parent if any *) 251 + let transparent_parent = find_nearest_transparent_parent state in 252 + 253 + (* Check each special ancestor *) 254 + Array.iteri (fun i ancestor -> 255 + (* Is this ancestor in our current ancestor chain? *) 256 + if state.ancestor_flags.(i) then begin 257 + (* Is this ancestor prohibited for this element? *) 258 + let is_prohibited = 259 + prohibited.(i) || 260 + (is_interactive && (i = a_index || i = button_index)) 256 261 in 257 - (* Find the transparent parent (like canvas) if any *) 258 - let transparent_parent = find_nearest_transparent_parent state in 259 - (* Find which ancestors are violated *) 260 - Array.iteri (fun i ancestor -> 261 - let bit = 1 lsl i in 262 - if (mask_hit land bit) <> 0 then begin 263 - (* Check if this is a content model violation or a nesting violation *) 264 - if (content_model_mask land bit) <> 0 then begin 265 - (* Content model violation: use "not allowed as child" format *) 266 - (* If there's a transparent parent, use that instead of the ancestor *) 267 - let parent = match transparent_parent with 268 - | Some p -> p 269 - | None -> ancestor 270 - in 271 - Message_collector.add_typed collector 272 - (`Element (`Not_allowed_as_child (`Child name, `Parent parent))) 273 - end else 274 - (* Nesting violation: use "must not be descendant" format *) 275 - Message_collector.add_typed collector 276 - (`Element (`Must_not_descend (`Elem name, `Attr attr, `Ancestor ancestor))) 277 - end 278 - ) special_ancestors 262 + if is_prohibited then begin 263 + (* Is this a content model violation or a nesting violation? *) 264 + if content_violations.(i) then begin 265 + (* Content model violation: use "not allowed as child" format *) 266 + let parent = match transparent_parent with 267 + | Some p -> p 268 + | None -> ancestor 269 + in 270 + Message_collector.add_typed collector 271 + (`Element (`Not_allowed_as_child (`Child name, `Parent parent))) 272 + end else 273 + (* Nesting violation: use "must not be descendant" format *) 274 + Message_collector.add_typed collector 275 + (`Element (`Must_not_descend (`Elem name, `Attr attr, `Ancestor ancestor))) 276 + end 279 277 end 280 - end 278 + ) special_ancestors 281 279 282 280 (** Check for required ancestors. *) 283 281 let check_required_ancestors state name collector = 284 282 match name with 285 283 | "area" -> 286 - if (state.ancestor_mask land map_mask) = 0 then 284 + if not state.ancestor_flags.(map_index) then 287 285 Message_collector.add_typed collector 288 286 (`Generic (Printf.sprintf "The %s element must have a %s ancestor." 289 287 (Error_code.q "area") (Error_code.q "map"))) 290 288 | _ -> () 291 289 292 - (** Check for metadata-only elements appearing outside valid contexts. 293 - style element is only valid in head or in noscript (in head). *) 290 + (** Check for metadata-only elements appearing outside valid contexts. *) 294 291 let check_metadata_element_context state name collector = 295 292 match name with 296 293 | "style" -> 297 - (* style is only valid inside head or noscript *) 298 - begin match state.stack with 299 - | parent :: _ when parent.name = "head" -> () (* valid *) 300 - | parent :: _ when parent.name = "noscript" -> () (* valid in noscript in head *) 301 - | parent :: _ -> 302 - (* style inside any other element is not allowed *) 303 - Message_collector.add_typed collector 304 - (`Element (`Not_allowed_as_child (`Child "style", `Parent parent.name))) 305 - | [] -> () (* at root level, would be caught elsewhere *) 306 - end 294 + (match state.stack with 295 + | parent :: _ when parent.name = "head" -> () 296 + | parent :: _ when parent.name = "noscript" -> () 297 + | parent :: _ -> 298 + Message_collector.add_typed collector 299 + (`Element (`Not_allowed_as_child (`Child "style", `Parent parent.name))) 300 + | [] -> ()) 307 301 | _ -> () 308 302 309 303 let start_element state ~element collector = 310 - (* Only check HTML elements, not SVG or MathML *) 311 304 match element.Element.tag with 312 305 | Tag.Html _ -> 313 306 let name = Tag.tag_to_string element.tag in 314 307 let attrs = element.raw_attrs in 308 + 315 309 (* Check for nesting violations *) 316 310 check_nesting state name attrs collector; 317 311 check_required_ancestors state name collector; 318 312 check_metadata_element_context state name collector; 319 313 320 - (* Update ancestor mask if this is a special ancestor *) 321 - let new_mask = state.ancestor_mask in 322 - let number = special_ancestor_number name in 323 - let new_mask = 324 - if number >= 0 then 325 - new_mask lor (1 lsl number) 326 - else 327 - new_mask 328 - in 314 + (* Create new flags, copying current state *) 315 + let new_flags = copy_flags state.ancestor_flags in 329 316 330 - (* Add href tracking for <a> elements *) 331 - let new_mask = 332 - if name = "a" && has_attr attrs "href" then 333 - let a_num = special_ancestor_number "a" in 334 - new_mask lor (1 lsl a_num) 335 - else 336 - new_mask 337 - in 317 + (* Set flag if this is a special ancestor *) 318 + let idx = special_ancestor_index name in 319 + if idx >= 0 then 320 + new_flags.(idx) <- true; 338 321 339 - (* Push onto stack *) 322 + (* Push onto stack (save old flags) *) 340 323 let is_transparent = is_transparent_element name in 341 - let node = { ancestor_mask = state.ancestor_mask; name; is_transparent } in 324 + let node = { ancestor_flags = state.ancestor_flags; name; is_transparent } in 342 325 state.stack <- node :: state.stack; 343 - state.ancestor_mask <- new_mask 344 - | _ -> () (* SVG, MathML, Custom, Unknown *) 326 + state.ancestor_flags <- new_flags 327 + | _ -> () 345 328 346 329 let end_element state ~tag _collector = 347 - (* Only track HTML elements *) 348 330 match tag with 349 331 | Tag.Html _ -> 350 - (* Pop from stack and restore ancestor mask *) 351 - begin match state.stack with 352 - | [] -> () (* Should not happen in well-formed documents *) 353 - | node :: rest -> 354 - state.stack <- rest; 355 - state.ancestor_mask <- node.ancestor_mask 356 - end 332 + (match state.stack with 333 + | [] -> () 334 + | node :: rest -> 335 + state.stack <- rest; 336 + state.ancestor_flags <- node.ancestor_flags) 357 337 | _ -> () 358 338 359 - (** Create the checker as a first-class module. *) 360 339 let checker = Checker.make ~create ~reset ~start_element ~end_element ()