OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6open Brr
7open Htmlrw_js_types
8
9let ensure_doctype html =
10 let lower = String.lowercase_ascii html in
11 if String.length lower >= 9 && String.sub lower 0 9 = "<!doctype" then
12 html
13 else
14 "<!DOCTYPE html>" ^ html
15
16let validate_string raw_html =
17 let html = ensure_doctype raw_html in
18 try
19 let core_result = Htmlrw_check.check_string html in
20 let messages = List.map (fun msg ->
21 { message = msg; element_ref = None }
22 ) (Htmlrw_check.messages core_result) in
23 { messages; core_result; source_element = None }
24 with exn ->
25 (* Return empty result with error message on parse failure *)
26 let error_msg = {
27 Htmlrw_check.severity = Htmlrw_check.Error;
28 text = Printf.sprintf "Parse error: %s" (Printexc.to_string exn);
29 error_code = Htmlrw_check.Conformance (`Misc `Multiple_h1);
30 location = None;
31 element = None;
32 attribute = None;
33 extract = None;
34 } in
35 let core_result = Htmlrw_check.check_string "" in
36 { messages = [{ message = error_msg; element_ref = None }];
37 core_result;
38 source_element = None }
39
40let validate_element el =
41 try
42 let el_map, html = Htmlrw_js_dom.create el in
43 let core_result = Htmlrw_check.check_string html in
44 let messages = List.map (fun msg ->
45 let element_ref =
46 match Htmlrw_js_dom.find_for_message el_map msg with
47 | Some browser_el ->
48 Some {
49 element = Some browser_el;
50 selector = Htmlrw_js_dom.selector_path browser_el;
51 }
52 | None ->
53 (* No direct mapping found - try to find by element name *)
54 match msg.Htmlrw_check.element with
55 | Some tag ->
56 let matches = Htmlrw_js_dom.filter_elements (fun e ->
57 String.lowercase_ascii (Jstr.to_string (El.tag_name e)) =
58 String.lowercase_ascii tag
59 ) el in
60 (match matches with
61 | browser_el :: _ ->
62 Some {
63 element = Some browser_el;
64 selector = Htmlrw_js_dom.selector_path browser_el;
65 }
66 | [] -> None)
67 | None -> None
68 in
69 { message = msg; element_ref }
70 ) (Htmlrw_check.messages core_result) in
71 { messages; core_result; source_element = Some el }
72 with exn ->
73 (* Return error result on parse failure *)
74 let error_msg = {
75 Htmlrw_check.severity = Htmlrw_check.Error;
76 text = Printf.sprintf "Parse error: %s" (Printexc.to_string exn);
77 error_code = Htmlrw_check.Conformance (`Misc `Multiple_h1);
78 location = None;
79 element = None;
80 attribute = None;
81 extract = None;
82 } in
83 let core_result = Htmlrw_check.check_string "" in
84 { messages = [{ message = error_msg; element_ref = None }];
85 core_result;
86 source_element = Some el }
87
88let validate_and_annotate ?(config = default_annotation_config) el =
89 let result = validate_element el in
90 (* Inject styles if not already present *)
91 let doc = El.document el in
92 let existing = El.find_first_by_selector (Jstr.v "[data-html5rw-styles]")
93 ~root:(Document.head doc) in
94 if Option.is_none existing then
95 ignore (Htmlrw_js_annotate.inject_default_styles ~theme:`Auto);
96 (* Annotate elements *)
97 Htmlrw_js_annotate.annotate ~config ~root:el result.messages;
98 result
99
100let validate_and_show_panel
101 ?(annotation_config = default_annotation_config)
102 ?(panel_config = default_panel_config)
103 el =
104 let result = validate_and_annotate ~config:annotation_config el in
105 (* Inject panel styles if not already present *)
106 let doc = El.document el in
107 let existing = El.find_first_by_selector (Jstr.v "[data-html5rw-panel-styles]")
108 ~root:(Document.head doc) in
109 if Option.is_none existing then
110 ignore (Htmlrw_js_ui.inject_default_styles ~theme:panel_config.theme);
111 (* Create and show panel *)
112 ignore (Htmlrw_js_ui.create ~config:panel_config result);
113 result
114
115let errors result =
116 List.filter (fun bm ->
117 bm.message.Htmlrw_check.severity = Htmlrw_check.Error
118 ) result.messages
119
120let warnings_only result =
121 List.filter (fun bm ->
122 bm.message.Htmlrw_check.severity = Htmlrw_check.Warning
123 ) result.messages
124
125let infos result =
126 List.filter (fun bm ->
127 bm.message.Htmlrw_check.severity = Htmlrw_check.Info
128 ) result.messages
129
130let has_errors result =
131 Htmlrw_check.has_errors result.core_result
132
133let has_issues result =
134 Htmlrw_check.has_errors result.core_result ||
135 Htmlrw_check.has_warnings result.core_result
136
137let message_count result =
138 List.length result.messages
139
140let element_map result =
141 match result.source_element with
142 | Some el -> Some (fst (Htmlrw_js_dom.create el))
143 | None -> None
144
145(* JavaScript API registration *)
146
147let register_api_on obj =
148 (* validateString(html) -> result *)
149 Jv.set obj "validateString" (Jv.callback ~arity:1 (fun html ->
150 let html_str = Jv.to_string html in
151 let result = validate_string html_str in
152 result_to_jv result
153 ));
154
155 (* validateElement(el) -> result *)
156 Jv.set obj "validateElement" (Jv.callback ~arity:1 (fun el_jv ->
157 let el = El.of_jv el_jv in
158 let result = validate_element el in
159 result_to_jv result
160 ));
161
162 (* validateAndAnnotate(el, config?) -> result *)
163 Jv.set obj "validateAndAnnotate" (Jv.callback ~arity:2 (fun el_jv config_jv ->
164 let el = El.of_jv el_jv in
165 let config =
166 if Jv.is_none config_jv then
167 default_annotation_config
168 else
169 {
170 add_data_attrs = Jv.to_bool (Jv.get config_jv "addDataAttrs");
171 add_classes = Jv.to_bool (Jv.get config_jv "addClasses");
172 show_tooltips = Jv.to_bool (Jv.get config_jv "showTooltips");
173 tooltip_position = `Auto;
174 highlight_on_hover = Jv.to_bool (Jv.get config_jv "highlightOnHover");
175 }
176 in
177 let result = validate_and_annotate ~config el in
178 result_to_jv result
179 ));
180
181 (* validateAndShowPanel(el, config?) -> result *)
182 Jv.set obj "validateAndShowPanel" (Jv.callback ~arity:2 (fun el_jv config_jv ->
183 let el = El.of_jv el_jv in
184 let annotation_config, panel_config =
185 if Jv.is_none config_jv then
186 default_annotation_config, default_panel_config
187 else
188 let ann_jv = Jv.get config_jv "annotation" in
189 let panel_jv = Jv.get config_jv "panel" in
190 let ann_config =
191 if Jv.is_none ann_jv then default_annotation_config
192 else {
193 add_data_attrs =
194 (let v = Jv.get ann_jv "addDataAttrs" in
195 if Jv.is_none v then true else Jv.to_bool v);
196 add_classes =
197 (let v = Jv.get ann_jv "addClasses" in
198 if Jv.is_none v then true else Jv.to_bool v);
199 show_tooltips =
200 (let v = Jv.get ann_jv "showTooltips" in
201 if Jv.is_none v then true else Jv.to_bool v);
202 tooltip_position = `Auto;
203 highlight_on_hover =
204 (let v = Jv.get ann_jv "highlightOnHover" in
205 if Jv.is_none v then true else Jv.to_bool v);
206 }
207 in
208 let panel_config =
209 if Jv.is_none panel_jv then default_panel_config
210 else {
211 initial_position =
212 (let v = Jv.get panel_jv "initialPosition" in
213 if Jv.is_none v then `TopRight
214 else match Jv.to_string v with
215 | "topRight" -> `TopRight
216 | "topLeft" -> `TopLeft
217 | "bottomRight" -> `BottomRight
218 | "bottomLeft" -> `BottomLeft
219 | _ -> `TopRight);
220 draggable =
221 (let v = Jv.get panel_jv "draggable" in
222 if Jv.is_none v then true else Jv.to_bool v);
223 resizable =
224 (let v = Jv.get panel_jv "resizable" in
225 if Jv.is_none v then true else Jv.to_bool v);
226 collapsible =
227 (let v = Jv.get panel_jv "collapsible" in
228 if Jv.is_none v then true else Jv.to_bool v);
229 start_collapsed =
230 (let v = Jv.get panel_jv "startCollapsed" in
231 if Jv.is_none v then false else Jv.to_bool v);
232 max_height =
233 (let v = Jv.get panel_jv "maxHeight" in
234 if Jv.is_none v then Some 400 else Some (Jv.to_int v));
235 group_by_severity =
236 (let v = Jv.get panel_jv "groupBySeverity" in
237 if Jv.is_none v then true else Jv.to_bool v);
238 click_to_highlight =
239 (let v = Jv.get panel_jv "clickToHighlight" in
240 if Jv.is_none v then true else Jv.to_bool v);
241 show_selector_path =
242 (let v = Jv.get panel_jv "showSelectorPath" in
243 if Jv.is_none v then true else Jv.to_bool v);
244 theme =
245 (let v = Jv.get panel_jv "theme" in
246 if Jv.is_none v then `Auto
247 else match Jv.to_string v with
248 | "light" -> `Light
249 | "dark" -> `Dark
250 | _ -> `Auto);
251 }
252 in
253 ann_config, panel_config
254 in
255 let result = validate_and_show_panel ~annotation_config ~panel_config el in
256 result_to_jv result
257 ));
258
259 (* clearAnnotations(el) *)
260 Jv.set obj "clearAnnotations" (Jv.callback ~arity:1 (fun el_jv ->
261 let el = El.of_jv el_jv in
262 Htmlrw_js_annotate.clear el;
263 Jv.undefined
264 ));
265
266 (* hidePanel() *)
267 Jv.set obj "hidePanel" (Jv.callback ~arity:0 (fun () ->
268 Htmlrw_js_ui.hide_current ();
269 Jv.undefined
270 ));
271
272 (* showPanel(result, config?) *)
273 Jv.set obj "showPanel" (Jv.callback ~arity:2 (fun result_jv config_jv ->
274 (* This expects a previously returned result object *)
275 (* For now, just create a panel with the warnings from the result *)
276 let warnings_jv = Jv.get result_jv "warnings" in
277 let warnings = Jv.to_list (fun w_jv ->
278 let msg = {
279 Htmlrw_check.severity =
280 (match Jv.to_string (Jv.get w_jv "severity") with
281 | "error" -> Htmlrw_check.Error
282 | "warning" -> Htmlrw_check.Warning
283 | _ -> Htmlrw_check.Info);
284 text = Jv.to_string (Jv.get w_jv "message");
285 error_code = Htmlrw_check.Conformance (`Misc `Multiple_h1);
286 location = None;
287 element = None;
288 attribute = None;
289 extract = None;
290 } in
291 let element_ref =
292 let sel_jv = Jv.get w_jv "selector" in
293 let el_jv = Jv.get w_jv "element" in
294 if Jv.is_none sel_jv then None
295 else Some {
296 selector = Jv.to_string sel_jv;
297 element = if Jv.is_none el_jv then None else Some (El.of_jv el_jv);
298 }
299 in
300 { message = msg; element_ref }
301 ) warnings_jv in
302 let result = {
303 messages = warnings;
304 core_result = Htmlrw_check.check_string "";
305 source_element = None;
306 } in
307 let config =
308 if Jv.is_none config_jv then default_panel_config
309 else default_panel_config (* TODO: parse config *)
310 in
311 ignore (Htmlrw_js_ui.create ~config result);
312 Jv.undefined
313 ))
314
315(* Async/Worker support *)
316
317let console_log msg =
318 ignore (Jv.call (Jv.get Jv.global "console") "log" [| Jv.of_string msg |])
319
320let console_log_result prefix result =
321 let error_count = List.length (List.filter (fun bm ->
322 bm.message.Htmlrw_check.severity = Htmlrw_check.Error
323 ) result.messages) in
324 let warning_count = List.length (List.filter (fun bm ->
325 bm.message.Htmlrw_check.severity = Htmlrw_check.Warning
326 ) result.messages) in
327 let msg = Printf.sprintf "[html5rw] %s: %d errors, %d warnings, %d total issues"
328 prefix error_count warning_count (List.length result.messages) in
329 console_log msg
330
331let _worker : Jv.t option ref = ref None
332let _pending_callbacks : (int, Jv.t -> unit) Hashtbl.t = Hashtbl.create 16
333let _next_id = ref 0
334
335let init_worker worker_url =
336 console_log (Printf.sprintf "[html5rw] Initializing web worker from %s" worker_url);
337 let worker = Jv.new' (Jv.get Jv.global "Worker") [| Jv.of_string worker_url |] in
338
339 (* Error handler for worker-level errors *)
340 let error_handler = Jv.callback ~arity:1 (fun ev ->
341 let msg = Jv.get ev "message" in
342 let filename = Jv.get ev "filename" in
343 let lineno = Jv.get ev "lineno" in
344 console_log (Printf.sprintf "[html5rw] Worker error: %s at %s:%d"
345 (if Jv.is_undefined msg then "unknown" else Jv.to_string msg)
346 (if Jv.is_undefined filename then "unknown" else Jv.to_string filename)
347 (if Jv.is_undefined lineno then 0 else Jv.to_int lineno))
348 ) in
349 ignore (Jv.call worker "addEventListener" [| Jv.of_string "error"; error_handler |]);
350
351 let handler = Jv.callback ~arity:1 (fun ev ->
352 let data = Jv.get ev "data" in
353 let id = Jv.get data "id" |> Jv.to_int in
354 let error_count = Jv.get data "errorCount" |> Jv.to_int in
355 let warning_count = Jv.get data "warningCount" |> Jv.to_int in
356 let total = Jv.get data "warnings" |> Jv.to_list (fun _ -> ()) |> List.length in
357 console_log (Printf.sprintf "[html5rw] Worker validation complete: %d errors, %d warnings, %d total issues"
358 error_count warning_count total);
359 match Hashtbl.find_opt _pending_callbacks id with
360 | Some callback ->
361 Hashtbl.remove _pending_callbacks id;
362 callback data
363 | None -> ()
364 ) in
365 ignore (Jv.call worker "addEventListener" [| Jv.of_string "message"; handler |]);
366 _worker := Some worker;
367 console_log "[html5rw] Web worker ready";
368 worker
369
370let validate_string_async ~callback html =
371 match !_worker with
372 | None -> failwith "Worker not initialized. Call html5rw.initWorker(url) first."
373 | Some worker ->
374 console_log (Printf.sprintf "[html5rw] Sending %d bytes to worker for validation..." (String.length html));
375 let id = !_next_id in
376 incr _next_id;
377 Hashtbl.add _pending_callbacks id callback;
378 let msg = Jv.obj [|
379 "id", Jv.of_int id;
380 "html", Jv.of_string html
381 |] in
382 ignore (Jv.call worker "postMessage" [| msg |])
383
384let _validate_element_async ~callback el =
385 let html = Htmlrw_js_dom.outer_html el in
386 validate_string_async ~callback html
387
388let validate_after_load callback el =
389 (* Use requestIdleCallback if available, otherwise setTimeout *)
390 console_log "[html5rw] Waiting for page load...";
391 let run () =
392 console_log "[html5rw] Starting validation...";
393 let result = validate_element el in
394 console_log_result "Validation complete" result;
395 callback result
396 in
397 let request_idle = Jv.get Jv.global "requestIdleCallback" in
398 if not (Jv.is_undefined request_idle) then
399 ignore (Jv.apply request_idle [| Jv.callback ~arity:1 (fun _ -> run ()) |])
400 else
401 ignore (Jv.call Jv.global "setTimeout" [|
402 Jv.callback ~arity:0 run;
403 Jv.of_int 0
404 |])
405
406let validate_on_idle ?(timeout=5000) callback el =
407 (* Wait for page load, then use requestIdleCallback with timeout *)
408 console_log "[html5rw] Scheduling validation for idle time...";
409 let run_when_ready () =
410 let request_idle = Jv.get Jv.global "requestIdleCallback" in
411 if not (Jv.is_undefined request_idle) then begin
412 let opts = Jv.obj [| "timeout", Jv.of_int timeout |] in
413 ignore (Jv.call Jv.global "requestIdleCallback" [|
414 Jv.callback ~arity:1 (fun _ ->
415 console_log "[html5rw] Browser idle, starting validation...";
416 let result = validate_element el in
417 console_log_result "Validation complete" result;
418 callback result
419 );
420 opts
421 |])
422 end else begin
423 ignore (Jv.call Jv.global "setTimeout" [|
424 Jv.callback ~arity:0 (fun () ->
425 console_log "[html5rw] Starting validation...";
426 let result = validate_element el in
427 console_log_result "Validation complete" result;
428 callback result
429 );
430 Jv.of_int 100
431 |])
432 end
433 in
434 let ready_state = Jv.get (Jv.get Jv.global "document") "readyState" |> Jv.to_string in
435 if ready_state = "complete" then
436 run_when_ready ()
437 else
438 ignore (Jv.call Jv.global "addEventListener" [|
439 Jv.of_string "load";
440 Jv.callback ~arity:1 (fun _ -> run_when_ready ())
441 |])
442
443let register_global_api () =
444 let api = Jv.obj [||] in
445 register_api_on api;
446
447 (* Add async functions *)
448
449 (* initWorker(url) - initialize web worker *)
450 Jv.set api "initWorker" (Jv.callback ~arity:1 (fun url_jv ->
451 let url = Jv.to_string url_jv in
452 init_worker url
453 ));
454
455 (* validateStringAsync(html, callback) - validate in worker *)
456 Jv.set api "validateStringAsync" (Jv.callback ~arity:2 (fun html_jv callback_jv ->
457 let html = Jv.to_string html_jv in
458 let callback result = ignore (Jv.apply callback_jv [| result |]) in
459 validate_string_async ~callback html;
460 Jv.undefined
461 ));
462
463 (* validateElementAsync(el, callback) - validate element in worker *)
464 Jv.set api "validateElementAsync" (Jv.callback ~arity:2 (fun el_jv callback_jv ->
465 let el = El.of_jv el_jv in
466 let html = Htmlrw_js_dom.outer_html el in
467 let callback result = ignore (Jv.apply callback_jv [| result |]) in
468 validate_string_async ~callback html;
469 Jv.undefined
470 ));
471
472 (* validateAfterLoad(el, callback) - validate after page load *)
473 Jv.set api "validateAfterLoad" (Jv.callback ~arity:2 (fun el_jv callback_jv ->
474 let el = El.of_jv el_jv in
475 let callback result = ignore (Jv.apply callback_jv [| result_to_jv result |]) in
476 validate_after_load callback el;
477 Jv.undefined
478 ));
479
480 (* validateOnIdle(el, callback, timeout?) - validate when browser is idle *)
481 Jv.set api "validateOnIdle" (Jv.callback ~arity:3 (fun el_jv callback_jv timeout_jv ->
482 let el = El.of_jv el_jv in
483 let timeout = if Jv.is_undefined timeout_jv then 5000 else Jv.to_int timeout_jv in
484 let callback result = ignore (Jv.apply callback_jv [| result_to_jv result |]) in
485 validate_on_idle ~timeout callback el;
486 Jv.undefined
487 ));
488
489 (* validateAndShowPanelAsync(el, config?) - non-blocking panel display *)
490 Jv.set api "validateAndShowPanelAsync" (Jv.callback ~arity:2 (fun el_jv config_jv ->
491 let el = El.of_jv el_jv in
492 validate_on_idle ~timeout:3000 (fun result ->
493 let annotation_config, panel_config =
494 if Jv.is_none config_jv then
495 default_annotation_config, default_panel_config
496 else
497 (* Parse config same as validateAndShowPanel *)
498 default_annotation_config, default_panel_config
499 in
500 (* Inject styles if needed *)
501 let doc = El.document el in
502 let existing = El.find_first_by_selector (Jstr.v "[data-html5rw-styles]")
503 ~root:(Document.head doc) in
504 if Option.is_none existing then
505 ignore (Htmlrw_js_annotate.inject_default_styles ~theme:`Auto);
506 let existing_panel = El.find_first_by_selector (Jstr.v "[data-html5rw-panel-styles]")
507 ~root:(Document.head doc) in
508 if Option.is_none existing_panel then
509 ignore (Htmlrw_js_ui.inject_default_styles ~theme:panel_config.theme);
510 (* Annotate and show panel *)
511 Htmlrw_js_annotate.annotate ~config:annotation_config ~root:el result.messages;
512 ignore (Htmlrw_js_ui.create ~config:panel_config result)
513 ) el;
514 Jv.undefined
515 ));
516
517 (* showPanelFromWorkerResult(result) - show panel from worker validation result *)
518 Jv.set api "showPanelFromWorkerResult" (Jv.callback ~arity:1 (fun result_jv ->
519 console_log "[html5rw] Showing panel from worker result";
520 (* Convert worker result format to internal format *)
521 let warnings_jv = Jv.get result_jv "warnings" in
522 let messages = Jv.to_list (fun w_jv ->
523 let severity_str = Jv.to_string (Jv.get w_jv "severity") in
524 let msg = {
525 Htmlrw_check.severity =
526 (match severity_str with
527 | "error" -> Htmlrw_check.Error
528 | "warning" -> Htmlrw_check.Warning
529 | _ -> Htmlrw_check.Info);
530 text = Jv.to_string (Jv.get w_jv "message");
531 error_code = Htmlrw_check.Conformance (`Misc `Multiple_h1);
532 location = (
533 let line_jv = Jv.get w_jv "line" in
534 let col_jv = Jv.get w_jv "column" in
535 if Jv.is_undefined line_jv then None
536 else Some {
537 Htmlrw_check.line = Jv.to_int line_jv;
538 column = (if Jv.is_undefined col_jv then 1 else Jv.to_int col_jv);
539 end_line = None;
540 end_column = None;
541 system_id = None;
542 }
543 );
544 element = (
545 let el_jv = Jv.get w_jv "elementName" in
546 if Jv.is_undefined el_jv then None else Some (Jv.to_string el_jv)
547 );
548 attribute = (
549 let attr_jv = Jv.get w_jv "attribute" in
550 if Jv.is_undefined attr_jv then None else Some (Jv.to_string attr_jv)
551 );
552 extract = None;
553 } in
554 { message = msg; element_ref = None }
555 ) warnings_jv in
556
557 let result = {
558 messages;
559 core_result = Htmlrw_check.check_string "";
560 source_element = None;
561 } in
562
563 (* Inject panel styles *)
564 let doc = Document.of_jv (Jv.get Jv.global "document") in
565 let existing_panel = El.find_first_by_selector (Jstr.v "[data-html5rw-panel-styles]")
566 ~root:(Document.head doc) in
567 if Option.is_none existing_panel then
568 ignore (Htmlrw_js_ui.inject_default_styles ~theme:`Auto);
569
570 (* Create and show panel *)
571 console_log (Printf.sprintf "[html5rw] Creating panel with %d messages" (List.length messages));
572 ignore (Htmlrw_js_ui.create ~config:default_panel_config result);
573 Jv.undefined
574 ));
575
576 Jv.set Jv.global "html5rw" api;
577
578 (* Dispatch 'html5rwReady' event for async loaders (WASM) *)
579 let document = Jv.get Jv.global "document" in
580 let event_class = Jv.get Jv.global "CustomEvent" in
581 let event = Jv.new' event_class [| Jv.of_string "html5rwReady" |] in
582 ignore (Jv.call document "dispatchEvent" [| event |]);
583 console_log "[html5rw] API ready"