OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 22 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6open Brr 7open Htmlrw_js_types 8 9let ensure_doctype html = 10 let lower = String.lowercase_ascii html in 11 if String.length lower >= 9 && String.sub lower 0 9 = "<!doctype" then 12 html 13 else 14 "<!DOCTYPE html>" ^ html 15 16let validate_string raw_html = 17 let html = ensure_doctype raw_html in 18 try 19 let core_result = Htmlrw_check.check_string html in 20 let messages = List.map (fun msg -> 21 { message = msg; element_ref = None } 22 ) (Htmlrw_check.messages core_result) in 23 { messages; core_result; source_element = None } 24 with exn -> 25 (* Return empty result with error message on parse failure *) 26 let error_msg = { 27 Htmlrw_check.severity = Htmlrw_check.Error; 28 text = Printf.sprintf "Parse error: %s" (Printexc.to_string exn); 29 error_code = Htmlrw_check.Conformance (`Misc `Multiple_h1); 30 location = None; 31 element = None; 32 attribute = None; 33 extract = None; 34 } in 35 let core_result = Htmlrw_check.check_string "" in 36 { messages = [{ message = error_msg; element_ref = None }]; 37 core_result; 38 source_element = None } 39 40let validate_element el = 41 try 42 let el_map, html = Htmlrw_js_dom.create el in 43 let core_result = Htmlrw_check.check_string html in 44 let messages = List.map (fun msg -> 45 let element_ref = 46 match Htmlrw_js_dom.find_for_message el_map msg with 47 | Some browser_el -> 48 Some { 49 element = Some browser_el; 50 selector = Htmlrw_js_dom.selector_path browser_el; 51 } 52 | None -> 53 (* No direct mapping found - try to find by element name *) 54 match msg.Htmlrw_check.element with 55 | Some tag -> 56 let matches = Htmlrw_js_dom.filter_elements (fun e -> 57 String.lowercase_ascii (Jstr.to_string (El.tag_name e)) = 58 String.lowercase_ascii tag 59 ) el in 60 (match matches with 61 | browser_el :: _ -> 62 Some { 63 element = Some browser_el; 64 selector = Htmlrw_js_dom.selector_path browser_el; 65 } 66 | [] -> None) 67 | None -> None 68 in 69 { message = msg; element_ref } 70 ) (Htmlrw_check.messages core_result) in 71 { messages; core_result; source_element = Some el } 72 with exn -> 73 (* Return error result on parse failure *) 74 let error_msg = { 75 Htmlrw_check.severity = Htmlrw_check.Error; 76 text = Printf.sprintf "Parse error: %s" (Printexc.to_string exn); 77 error_code = Htmlrw_check.Conformance (`Misc `Multiple_h1); 78 location = None; 79 element = None; 80 attribute = None; 81 extract = None; 82 } in 83 let core_result = Htmlrw_check.check_string "" in 84 { messages = [{ message = error_msg; element_ref = None }]; 85 core_result; 86 source_element = Some el } 87 88let validate_and_annotate ?(config = default_annotation_config) el = 89 let result = validate_element el in 90 (* Inject styles if not already present *) 91 let doc = El.document el in 92 let existing = El.find_first_by_selector (Jstr.v "[data-html5rw-styles]") 93 ~root:(Document.head doc) in 94 if Option.is_none existing then 95 ignore (Htmlrw_js_annotate.inject_default_styles ~theme:`Auto); 96 (* Annotate elements *) 97 Htmlrw_js_annotate.annotate ~config ~root:el result.messages; 98 result 99 100let validate_and_show_panel 101 ?(annotation_config = default_annotation_config) 102 ?(panel_config = default_panel_config) 103 el = 104 let result = validate_and_annotate ~config:annotation_config el in 105 (* Inject panel styles if not already present *) 106 let doc = El.document el in 107 let existing = El.find_first_by_selector (Jstr.v "[data-html5rw-panel-styles]") 108 ~root:(Document.head doc) in 109 if Option.is_none existing then 110 ignore (Htmlrw_js_ui.inject_default_styles ~theme:panel_config.theme); 111 (* Create and show panel *) 112 ignore (Htmlrw_js_ui.create ~config:panel_config result); 113 result 114 115let errors result = 116 List.filter (fun bm -> 117 bm.message.Htmlrw_check.severity = Htmlrw_check.Error 118 ) result.messages 119 120let warnings_only result = 121 List.filter (fun bm -> 122 bm.message.Htmlrw_check.severity = Htmlrw_check.Warning 123 ) result.messages 124 125let infos result = 126 List.filter (fun bm -> 127 bm.message.Htmlrw_check.severity = Htmlrw_check.Info 128 ) result.messages 129 130let has_errors result = 131 Htmlrw_check.has_errors result.core_result 132 133let has_issues result = 134 Htmlrw_check.has_errors result.core_result || 135 Htmlrw_check.has_warnings result.core_result 136 137let message_count result = 138 List.length result.messages 139 140let element_map result = 141 match result.source_element with 142 | Some el -> Some (fst (Htmlrw_js_dom.create el)) 143 | None -> None 144 145(* JavaScript API registration *) 146 147let register_api_on obj = 148 (* validateString(html) -> result *) 149 Jv.set obj "validateString" (Jv.callback ~arity:1 (fun html -> 150 let html_str = Jv.to_string html in 151 let result = validate_string html_str in 152 result_to_jv result 153 )); 154 155 (* validateElement(el) -> result *) 156 Jv.set obj "validateElement" (Jv.callback ~arity:1 (fun el_jv -> 157 let el = El.of_jv el_jv in 158 let result = validate_element el in 159 result_to_jv result 160 )); 161 162 (* validateAndAnnotate(el, config?) -> result *) 163 Jv.set obj "validateAndAnnotate" (Jv.callback ~arity:2 (fun el_jv config_jv -> 164 let el = El.of_jv el_jv in 165 let config = 166 if Jv.is_none config_jv then 167 default_annotation_config 168 else 169 { 170 add_data_attrs = Jv.to_bool (Jv.get config_jv "addDataAttrs"); 171 add_classes = Jv.to_bool (Jv.get config_jv "addClasses"); 172 show_tooltips = Jv.to_bool (Jv.get config_jv "showTooltips"); 173 tooltip_position = `Auto; 174 highlight_on_hover = Jv.to_bool (Jv.get config_jv "highlightOnHover"); 175 } 176 in 177 let result = validate_and_annotate ~config el in 178 result_to_jv result 179 )); 180 181 (* validateAndShowPanel(el, config?) -> result *) 182 Jv.set obj "validateAndShowPanel" (Jv.callback ~arity:2 (fun el_jv config_jv -> 183 let el = El.of_jv el_jv in 184 let annotation_config, panel_config = 185 if Jv.is_none config_jv then 186 default_annotation_config, default_panel_config 187 else 188 let ann_jv = Jv.get config_jv "annotation" in 189 let panel_jv = Jv.get config_jv "panel" in 190 let ann_config = 191 if Jv.is_none ann_jv then default_annotation_config 192 else { 193 add_data_attrs = 194 (let v = Jv.get ann_jv "addDataAttrs" in 195 if Jv.is_none v then true else Jv.to_bool v); 196 add_classes = 197 (let v = Jv.get ann_jv "addClasses" in 198 if Jv.is_none v then true else Jv.to_bool v); 199 show_tooltips = 200 (let v = Jv.get ann_jv "showTooltips" in 201 if Jv.is_none v then true else Jv.to_bool v); 202 tooltip_position = `Auto; 203 highlight_on_hover = 204 (let v = Jv.get ann_jv "highlightOnHover" in 205 if Jv.is_none v then true else Jv.to_bool v); 206 } 207 in 208 let panel_config = 209 if Jv.is_none panel_jv then default_panel_config 210 else { 211 initial_position = 212 (let v = Jv.get panel_jv "initialPosition" in 213 if Jv.is_none v then `TopRight 214 else match Jv.to_string v with 215 | "topRight" -> `TopRight 216 | "topLeft" -> `TopLeft 217 | "bottomRight" -> `BottomRight 218 | "bottomLeft" -> `BottomLeft 219 | _ -> `TopRight); 220 draggable = 221 (let v = Jv.get panel_jv "draggable" in 222 if Jv.is_none v then true else Jv.to_bool v); 223 resizable = 224 (let v = Jv.get panel_jv "resizable" in 225 if Jv.is_none v then true else Jv.to_bool v); 226 collapsible = 227 (let v = Jv.get panel_jv "collapsible" in 228 if Jv.is_none v then true else Jv.to_bool v); 229 start_collapsed = 230 (let v = Jv.get panel_jv "startCollapsed" in 231 if Jv.is_none v then false else Jv.to_bool v); 232 max_height = 233 (let v = Jv.get panel_jv "maxHeight" in 234 if Jv.is_none v then Some 400 else Some (Jv.to_int v)); 235 group_by_severity = 236 (let v = Jv.get panel_jv "groupBySeverity" in 237 if Jv.is_none v then true else Jv.to_bool v); 238 click_to_highlight = 239 (let v = Jv.get panel_jv "clickToHighlight" in 240 if Jv.is_none v then true else Jv.to_bool v); 241 show_selector_path = 242 (let v = Jv.get panel_jv "showSelectorPath" in 243 if Jv.is_none v then true else Jv.to_bool v); 244 theme = 245 (let v = Jv.get panel_jv "theme" in 246 if Jv.is_none v then `Auto 247 else match Jv.to_string v with 248 | "light" -> `Light 249 | "dark" -> `Dark 250 | _ -> `Auto); 251 } 252 in 253 ann_config, panel_config 254 in 255 let result = validate_and_show_panel ~annotation_config ~panel_config el in 256 result_to_jv result 257 )); 258 259 (* clearAnnotations(el) *) 260 Jv.set obj "clearAnnotations" (Jv.callback ~arity:1 (fun el_jv -> 261 let el = El.of_jv el_jv in 262 Htmlrw_js_annotate.clear el; 263 Jv.undefined 264 )); 265 266 (* hidePanel() *) 267 Jv.set obj "hidePanel" (Jv.callback ~arity:0 (fun () -> 268 Htmlrw_js_ui.hide_current (); 269 Jv.undefined 270 )); 271 272 (* showPanel(result, config?) *) 273 Jv.set obj "showPanel" (Jv.callback ~arity:2 (fun result_jv config_jv -> 274 (* This expects a previously returned result object *) 275 (* For now, just create a panel with the warnings from the result *) 276 let warnings_jv = Jv.get result_jv "warnings" in 277 let warnings = Jv.to_list (fun w_jv -> 278 let msg = { 279 Htmlrw_check.severity = 280 (match Jv.to_string (Jv.get w_jv "severity") with 281 | "error" -> Htmlrw_check.Error 282 | "warning" -> Htmlrw_check.Warning 283 | _ -> Htmlrw_check.Info); 284 text = Jv.to_string (Jv.get w_jv "message"); 285 error_code = Htmlrw_check.Conformance (`Misc `Multiple_h1); 286 location = None; 287 element = None; 288 attribute = None; 289 extract = None; 290 } in 291 let element_ref = 292 let sel_jv = Jv.get w_jv "selector" in 293 let el_jv = Jv.get w_jv "element" in 294 if Jv.is_none sel_jv then None 295 else Some { 296 selector = Jv.to_string sel_jv; 297 element = if Jv.is_none el_jv then None else Some (El.of_jv el_jv); 298 } 299 in 300 { message = msg; element_ref } 301 ) warnings_jv in 302 let result = { 303 messages = warnings; 304 core_result = Htmlrw_check.check_string ""; 305 source_element = None; 306 } in 307 let config = 308 if Jv.is_none config_jv then default_panel_config 309 else default_panel_config (* TODO: parse config *) 310 in 311 ignore (Htmlrw_js_ui.create ~config result); 312 Jv.undefined 313 )) 314 315(* Async/Worker support *) 316 317let console_log msg = 318 ignore (Jv.call (Jv.get Jv.global "console") "log" [| Jv.of_string msg |]) 319 320let console_log_result prefix result = 321 let error_count = List.length (List.filter (fun bm -> 322 bm.message.Htmlrw_check.severity = Htmlrw_check.Error 323 ) result.messages) in 324 let warning_count = List.length (List.filter (fun bm -> 325 bm.message.Htmlrw_check.severity = Htmlrw_check.Warning 326 ) result.messages) in 327 let msg = Printf.sprintf "[html5rw] %s: %d errors, %d warnings, %d total issues" 328 prefix error_count warning_count (List.length result.messages) in 329 console_log msg 330 331let _worker : Jv.t option ref = ref None 332let _pending_callbacks : (int, Jv.t -> unit) Hashtbl.t = Hashtbl.create 16 333let _next_id = ref 0 334 335let init_worker worker_url = 336 console_log (Printf.sprintf "[html5rw] Initializing web worker from %s" worker_url); 337 let worker = Jv.new' (Jv.get Jv.global "Worker") [| Jv.of_string worker_url |] in 338 339 (* Error handler for worker-level errors *) 340 let error_handler = Jv.callback ~arity:1 (fun ev -> 341 let msg = Jv.get ev "message" in 342 let filename = Jv.get ev "filename" in 343 let lineno = Jv.get ev "lineno" in 344 console_log (Printf.sprintf "[html5rw] Worker error: %s at %s:%d" 345 (if Jv.is_undefined msg then "unknown" else Jv.to_string msg) 346 (if Jv.is_undefined filename then "unknown" else Jv.to_string filename) 347 (if Jv.is_undefined lineno then 0 else Jv.to_int lineno)) 348 ) in 349 ignore (Jv.call worker "addEventListener" [| Jv.of_string "error"; error_handler |]); 350 351 let handler = Jv.callback ~arity:1 (fun ev -> 352 let data = Jv.get ev "data" in 353 let id = Jv.get data "id" |> Jv.to_int in 354 let error_count = Jv.get data "errorCount" |> Jv.to_int in 355 let warning_count = Jv.get data "warningCount" |> Jv.to_int in 356 let total = Jv.get data "warnings" |> Jv.to_list (fun _ -> ()) |> List.length in 357 console_log (Printf.sprintf "[html5rw] Worker validation complete: %d errors, %d warnings, %d total issues" 358 error_count warning_count total); 359 match Hashtbl.find_opt _pending_callbacks id with 360 | Some callback -> 361 Hashtbl.remove _pending_callbacks id; 362 callback data 363 | None -> () 364 ) in 365 ignore (Jv.call worker "addEventListener" [| Jv.of_string "message"; handler |]); 366 _worker := Some worker; 367 console_log "[html5rw] Web worker ready"; 368 worker 369 370let validate_string_async ~callback html = 371 match !_worker with 372 | None -> failwith "Worker not initialized. Call html5rw.initWorker(url) first." 373 | Some worker -> 374 console_log (Printf.sprintf "[html5rw] Sending %d bytes to worker for validation..." (String.length html)); 375 let id = !_next_id in 376 incr _next_id; 377 Hashtbl.add _pending_callbacks id callback; 378 let msg = Jv.obj [| 379 "id", Jv.of_int id; 380 "html", Jv.of_string html 381 |] in 382 ignore (Jv.call worker "postMessage" [| msg |]) 383 384let _validate_element_async ~callback el = 385 let html = Htmlrw_js_dom.outer_html el in 386 validate_string_async ~callback html 387 388let validate_after_load callback el = 389 (* Use requestIdleCallback if available, otherwise setTimeout *) 390 console_log "[html5rw] Waiting for page load..."; 391 let run () = 392 console_log "[html5rw] Starting validation..."; 393 let result = validate_element el in 394 console_log_result "Validation complete" result; 395 callback result 396 in 397 let request_idle = Jv.get Jv.global "requestIdleCallback" in 398 if not (Jv.is_undefined request_idle) then 399 ignore (Jv.apply request_idle [| Jv.callback ~arity:1 (fun _ -> run ()) |]) 400 else 401 ignore (Jv.call Jv.global "setTimeout" [| 402 Jv.callback ~arity:0 run; 403 Jv.of_int 0 404 |]) 405 406let validate_on_idle ?(timeout=5000) callback el = 407 (* Wait for page load, then use requestIdleCallback with timeout *) 408 console_log "[html5rw] Scheduling validation for idle time..."; 409 let run_when_ready () = 410 let request_idle = Jv.get Jv.global "requestIdleCallback" in 411 if not (Jv.is_undefined request_idle) then begin 412 let opts = Jv.obj [| "timeout", Jv.of_int timeout |] in 413 ignore (Jv.call Jv.global "requestIdleCallback" [| 414 Jv.callback ~arity:1 (fun _ -> 415 console_log "[html5rw] Browser idle, starting validation..."; 416 let result = validate_element el in 417 console_log_result "Validation complete" result; 418 callback result 419 ); 420 opts 421 |]) 422 end else begin 423 ignore (Jv.call Jv.global "setTimeout" [| 424 Jv.callback ~arity:0 (fun () -> 425 console_log "[html5rw] Starting validation..."; 426 let result = validate_element el in 427 console_log_result "Validation complete" result; 428 callback result 429 ); 430 Jv.of_int 100 431 |]) 432 end 433 in 434 let ready_state = Jv.get (Jv.get Jv.global "document") "readyState" |> Jv.to_string in 435 if ready_state = "complete" then 436 run_when_ready () 437 else 438 ignore (Jv.call Jv.global "addEventListener" [| 439 Jv.of_string "load"; 440 Jv.callback ~arity:1 (fun _ -> run_when_ready ()) 441 |]) 442 443let register_global_api () = 444 let api = Jv.obj [||] in 445 register_api_on api; 446 447 (* Add async functions *) 448 449 (* initWorker(url) - initialize web worker *) 450 Jv.set api "initWorker" (Jv.callback ~arity:1 (fun url_jv -> 451 let url = Jv.to_string url_jv in 452 init_worker url 453 )); 454 455 (* validateStringAsync(html, callback) - validate in worker *) 456 Jv.set api "validateStringAsync" (Jv.callback ~arity:2 (fun html_jv callback_jv -> 457 let html = Jv.to_string html_jv in 458 let callback result = ignore (Jv.apply callback_jv [| result |]) in 459 validate_string_async ~callback html; 460 Jv.undefined 461 )); 462 463 (* validateElementAsync(el, callback) - validate element in worker *) 464 Jv.set api "validateElementAsync" (Jv.callback ~arity:2 (fun el_jv callback_jv -> 465 let el = El.of_jv el_jv in 466 let html = Htmlrw_js_dom.outer_html el in 467 let callback result = ignore (Jv.apply callback_jv [| result |]) in 468 validate_string_async ~callback html; 469 Jv.undefined 470 )); 471 472 (* validateAfterLoad(el, callback) - validate after page load *) 473 Jv.set api "validateAfterLoad" (Jv.callback ~arity:2 (fun el_jv callback_jv -> 474 let el = El.of_jv el_jv in 475 let callback result = ignore (Jv.apply callback_jv [| result_to_jv result |]) in 476 validate_after_load callback el; 477 Jv.undefined 478 )); 479 480 (* validateOnIdle(el, callback, timeout?) - validate when browser is idle *) 481 Jv.set api "validateOnIdle" (Jv.callback ~arity:3 (fun el_jv callback_jv timeout_jv -> 482 let el = El.of_jv el_jv in 483 let timeout = if Jv.is_undefined timeout_jv then 5000 else Jv.to_int timeout_jv in 484 let callback result = ignore (Jv.apply callback_jv [| result_to_jv result |]) in 485 validate_on_idle ~timeout callback el; 486 Jv.undefined 487 )); 488 489 (* validateAndShowPanelAsync(el, config?) - non-blocking panel display *) 490 Jv.set api "validateAndShowPanelAsync" (Jv.callback ~arity:2 (fun el_jv config_jv -> 491 let el = El.of_jv el_jv in 492 validate_on_idle ~timeout:3000 (fun result -> 493 let annotation_config, panel_config = 494 if Jv.is_none config_jv then 495 default_annotation_config, default_panel_config 496 else 497 (* Parse config same as validateAndShowPanel *) 498 default_annotation_config, default_panel_config 499 in 500 (* Inject styles if needed *) 501 let doc = El.document el in 502 let existing = El.find_first_by_selector (Jstr.v "[data-html5rw-styles]") 503 ~root:(Document.head doc) in 504 if Option.is_none existing then 505 ignore (Htmlrw_js_annotate.inject_default_styles ~theme:`Auto); 506 let existing_panel = El.find_first_by_selector (Jstr.v "[data-html5rw-panel-styles]") 507 ~root:(Document.head doc) in 508 if Option.is_none existing_panel then 509 ignore (Htmlrw_js_ui.inject_default_styles ~theme:panel_config.theme); 510 (* Annotate and show panel *) 511 Htmlrw_js_annotate.annotate ~config:annotation_config ~root:el result.messages; 512 ignore (Htmlrw_js_ui.create ~config:panel_config result) 513 ) el; 514 Jv.undefined 515 )); 516 517 (* showPanelFromWorkerResult(result) - show panel from worker validation result *) 518 Jv.set api "showPanelFromWorkerResult" (Jv.callback ~arity:1 (fun result_jv -> 519 console_log "[html5rw] Showing panel from worker result"; 520 (* Convert worker result format to internal format *) 521 let warnings_jv = Jv.get result_jv "warnings" in 522 let messages = Jv.to_list (fun w_jv -> 523 let severity_str = Jv.to_string (Jv.get w_jv "severity") in 524 let msg = { 525 Htmlrw_check.severity = 526 (match severity_str with 527 | "error" -> Htmlrw_check.Error 528 | "warning" -> Htmlrw_check.Warning 529 | _ -> Htmlrw_check.Info); 530 text = Jv.to_string (Jv.get w_jv "message"); 531 error_code = Htmlrw_check.Conformance (`Misc `Multiple_h1); 532 location = ( 533 let line_jv = Jv.get w_jv "line" in 534 let col_jv = Jv.get w_jv "column" in 535 if Jv.is_undefined line_jv then None 536 else Some { 537 Htmlrw_check.line = Jv.to_int line_jv; 538 column = (if Jv.is_undefined col_jv then 1 else Jv.to_int col_jv); 539 end_line = None; 540 end_column = None; 541 system_id = None; 542 } 543 ); 544 element = ( 545 let el_jv = Jv.get w_jv "elementName" in 546 if Jv.is_undefined el_jv then None else Some (Jv.to_string el_jv) 547 ); 548 attribute = ( 549 let attr_jv = Jv.get w_jv "attribute" in 550 if Jv.is_undefined attr_jv then None else Some (Jv.to_string attr_jv) 551 ); 552 extract = None; 553 } in 554 { message = msg; element_ref = None } 555 ) warnings_jv in 556 557 let result = { 558 messages; 559 core_result = Htmlrw_check.check_string ""; 560 source_element = None; 561 } in 562 563 (* Inject panel styles *) 564 let doc = Document.of_jv (Jv.get Jv.global "document") in 565 let existing_panel = El.find_first_by_selector (Jstr.v "[data-html5rw-panel-styles]") 566 ~root:(Document.head doc) in 567 if Option.is_none existing_panel then 568 ignore (Htmlrw_js_ui.inject_default_styles ~theme:`Auto); 569 570 (* Create and show panel *) 571 console_log (Printf.sprintf "[html5rw] Creating panel with %d messages" (List.length messages)); 572 ignore (Htmlrw_js_ui.create ~config:default_panel_config result); 573 Jv.undefined 574 )); 575 576 Jv.set Jv.global "html5rw" api; 577 578 (* Dispatch 'html5rwReady' event for async loaders (WASM) *) 579 let document = Jv.get Jv.global "document" in 580 let event_class = Jv.get Jv.global "CustomEvent" in 581 let event = Jv.new' event_class [| Jv.of_string "html5rwReady" |] in 582 ignore (Jv.call document "dispatchEvent" [| event |]); 583 console_log "[html5rw] API ready"