OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(* Web Worker entry point for background HTML validation.
7
8 This runs in a separate thread and communicates via postMessage.
9 It only does string-based validation since workers can't access the DOM.
10*)
11
12[@@@warning "-33"] (* Suppress unused open - we only need Jv from Brr *)
13open Brr
14
15let console_log msg =
16 ignore (Jv.call (Jv.get Jv.global "console") "log" [| Jv.of_string msg |])
17
18let console_error msg =
19 ignore (Jv.call (Jv.get Jv.global "console") "error" [| Jv.of_string msg |])
20
21let ensure_doctype html =
22 let lower = String.lowercase_ascii html in
23 if String.length lower >= 9 && String.sub lower 0 9 = "<!doctype" then
24 html
25 else
26 "<!DOCTYPE html>" ^ html
27
28(* Debug: dump tree structure to see what parser built *)
29let dump_tree_structure html =
30 let doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string html) in
31 let root = Html5rw.root doc in
32 let buf = Buffer.create 1024 in
33 let rec dump indent node =
34 let prefix = String.make (indent * 2) ' ' in
35 let name = node.Html5rw.Dom.name in
36 if name = "#text" then begin
37 let text = String.trim node.Html5rw.Dom.data in
38 if String.length text > 0 then
39 Buffer.add_string buf (Printf.sprintf "%s#text: \"%s\"\n" prefix
40 (if String.length text > 30 then String.sub text 0 30 ^ "..." else text))
41 end else if name = "#comment" then
42 ()
43 else begin
44 Buffer.add_string buf (Printf.sprintf "%s<%s>\n" prefix name);
45 if indent < 5 then (* only show first 5 levels *)
46 List.iter (dump (indent + 1)) node.Html5rw.Dom.children
47 end
48 in
49 dump 0 root;
50 Buffer.contents buf
51
52let handle_message msg_data =
53 console_log "[html5rw worker] Message received";
54 let response = Jv.obj [||] in
55 try
56 let id = Jv.get msg_data "id" |> Jv.to_int in
57 let raw_html = Jv.get msg_data "html" |> Jv.to_string in
58 let html = ensure_doctype raw_html in
59 console_log (Printf.sprintf "[html5rw worker] Validating %d bytes (id=%d)" (String.length html) id);
60 (* Log first 500 chars of HTML for debugging *)
61 let preview = if String.length html > 500 then String.sub html 0 500 ^ "..." else html in
62 console_log (Printf.sprintf "[html5rw worker] HTML preview:\n%s" preview);
63
64 Jv.set response "id" (Jv.of_int id);
65
66 (try
67 (* Run validation *)
68 let core_result = Htmlrw_check.check_string html in
69 let messages = Htmlrw_check.messages core_result in
70
71 (* Convert messages to JS-friendly format *)
72 let warnings = Jv.of_list (fun msg ->
73 let obj = Jv.obj [||] in
74 Jv.set obj "severity" (Jv.of_string (Htmlrw_check.severity_to_string msg.Htmlrw_check.severity));
75 Jv.set obj "message" (Jv.of_string msg.Htmlrw_check.text);
76 Jv.set obj "errorCode" (Jv.of_string (Htmlrw_check.error_code_to_string msg.Htmlrw_check.error_code));
77 (match msg.Htmlrw_check.element with
78 | Some el -> Jv.set obj "elementName" (Jv.of_string el)
79 | None -> ());
80 (match msg.Htmlrw_check.attribute with
81 | Some attr -> Jv.set obj "attribute" (Jv.of_string attr)
82 | None -> ());
83 (match msg.Htmlrw_check.location with
84 | Some loc ->
85 Jv.set obj "line" (Jv.of_int loc.line);
86 Jv.set obj "column" (Jv.of_int loc.column)
87 | None -> ());
88 obj
89 ) messages in
90
91 let error_count = List.length (List.filter (fun m ->
92 m.Htmlrw_check.severity = Htmlrw_check.Error) messages) in
93 let warning_count = List.length (List.filter (fun m ->
94 m.Htmlrw_check.severity = Htmlrw_check.Warning) messages) in
95 let info_count = List.length (List.filter (fun m ->
96 m.Htmlrw_check.severity = Htmlrw_check.Info) messages) in
97
98 Jv.set response "warnings" warnings;
99 Jv.set response "errorCount" (Jv.of_int error_count);
100 Jv.set response "warningCount" (Jv.of_int warning_count);
101 Jv.set response "infoCount" (Jv.of_int info_count);
102 Jv.set response "hasErrors" (Jv.of_bool (error_count > 0));
103 (* Add tree structure for debugging *)
104 let tree_dump = dump_tree_structure html in
105 Jv.set response "treeStructure" (Jv.of_string tree_dump);
106 Jv.set response "htmlPreview" (Jv.of_string preview);
107 console_log (Printf.sprintf "[html5rw worker] Tree structure:\n%s" tree_dump)
108 with exn ->
109 (* Return error on parse failure *)
110 let error_obj = Jv.obj [||] in
111 Jv.set error_obj "severity" (Jv.of_string "error");
112 Jv.set error_obj "message" (Jv.of_string (Printf.sprintf "Parse error: %s" (Printexc.to_string exn)));
113 Jv.set error_obj "errorCode" (Jv.of_string "parse-error");
114 Jv.set response "warnings" (Jv.of_list Fun.id [error_obj]);
115 Jv.set response "errorCount" (Jv.of_int 1);
116 Jv.set response "warningCount" (Jv.of_int 0);
117 Jv.set response "infoCount" (Jv.of_int 0);
118 Jv.set response "hasErrors" (Jv.of_bool true);
119 Jv.set response "parseError" (Jv.of_string (Printexc.to_string exn)));
120
121 console_log "[html5rw worker] Validation complete, posting response";
122 (* Post result back to main thread *)
123 let self = Jv.get Jv.global "self" in
124 ignore (Jv.call self "postMessage" [| response |])
125 with exn ->
126 (* Outer error handler - catches message parsing errors *)
127 console_error (Printf.sprintf "[html5rw worker] Fatal error: %s" (Printexc.to_string exn));
128 let error_obj = Jv.obj [||] in
129 Jv.set error_obj "severity" (Jv.of_string "error");
130 Jv.set error_obj "message" (Jv.of_string (Printf.sprintf "Worker error: %s" (Printexc.to_string exn)));
131 Jv.set error_obj "errorCode" (Jv.of_string "worker-error");
132 Jv.set response "id" (Jv.of_int (-1));
133 Jv.set response "warnings" (Jv.of_list Fun.id [error_obj]);
134 Jv.set response "errorCount" (Jv.of_int 1);
135 Jv.set response "warningCount" (Jv.of_int 0);
136 Jv.set response "infoCount" (Jv.of_int 0);
137 Jv.set response "hasErrors" (Jv.of_bool true);
138 Jv.set response "fatalError" (Jv.of_string (Printexc.to_string exn));
139 let self = Jv.get Jv.global "self" in
140 ignore (Jv.call self "postMessage" [| response |])
141
142let () =
143 console_log "[html5rw worker] Worker script starting...";
144 (* Set up message handler *)
145 let self = Jv.get Jv.global "self" in
146 let handler = Jv.callback ~arity:1 (fun ev ->
147 let data = Jv.get ev "data" in
148 handle_message data
149 ) in
150 ignore (Jv.call self "addEventListener" [| Jv.of_string "message"; handler |]);
151 console_log "[html5rw worker] Message handler registered, ready for messages"