OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 6.7 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6(* Web Worker entry point for background HTML validation. 7 8 This runs in a separate thread and communicates via postMessage. 9 It only does string-based validation since workers can't access the DOM. 10*) 11 12[@@@warning "-33"] (* Suppress unused open - we only need Jv from Brr *) 13open Brr 14 15let console_log msg = 16 ignore (Jv.call (Jv.get Jv.global "console") "log" [| Jv.of_string msg |]) 17 18let console_error msg = 19 ignore (Jv.call (Jv.get Jv.global "console") "error" [| Jv.of_string msg |]) 20 21let ensure_doctype html = 22 let lower = String.lowercase_ascii html in 23 if String.length lower >= 9 && String.sub lower 0 9 = "<!doctype" then 24 html 25 else 26 "<!DOCTYPE html>" ^ html 27 28(* Debug: dump tree structure to see what parser built *) 29let dump_tree_structure html = 30 let doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string html) in 31 let root = Html5rw.root doc in 32 let buf = Buffer.create 1024 in 33 let rec dump indent node = 34 let prefix = String.make (indent * 2) ' ' in 35 let name = node.Html5rw.Dom.name in 36 if name = "#text" then begin 37 let text = String.trim node.Html5rw.Dom.data in 38 if String.length text > 0 then 39 Buffer.add_string buf (Printf.sprintf "%s#text: \"%s\"\n" prefix 40 (if String.length text > 30 then String.sub text 0 30 ^ "..." else text)) 41 end else if name = "#comment" then 42 () 43 else begin 44 Buffer.add_string buf (Printf.sprintf "%s<%s>\n" prefix name); 45 if indent < 5 then (* only show first 5 levels *) 46 List.iter (dump (indent + 1)) node.Html5rw.Dom.children 47 end 48 in 49 dump 0 root; 50 Buffer.contents buf 51 52let handle_message msg_data = 53 console_log "[html5rw worker] Message received"; 54 let response = Jv.obj [||] in 55 try 56 let id = Jv.get msg_data "id" |> Jv.to_int in 57 let raw_html = Jv.get msg_data "html" |> Jv.to_string in 58 let html = ensure_doctype raw_html in 59 console_log (Printf.sprintf "[html5rw worker] Validating %d bytes (id=%d)" (String.length html) id); 60 (* Log first 500 chars of HTML for debugging *) 61 let preview = if String.length html > 500 then String.sub html 0 500 ^ "..." else html in 62 console_log (Printf.sprintf "[html5rw worker] HTML preview:\n%s" preview); 63 64 Jv.set response "id" (Jv.of_int id); 65 66 (try 67 (* Run validation *) 68 let core_result = Htmlrw_check.check_string html in 69 let messages = Htmlrw_check.messages core_result in 70 71 (* Convert messages to JS-friendly format *) 72 let warnings = Jv.of_list (fun msg -> 73 let obj = Jv.obj [||] in 74 Jv.set obj "severity" (Jv.of_string (Htmlrw_check.severity_to_string msg.Htmlrw_check.severity)); 75 Jv.set obj "message" (Jv.of_string msg.Htmlrw_check.text); 76 Jv.set obj "errorCode" (Jv.of_string (Htmlrw_check.error_code_to_string msg.Htmlrw_check.error_code)); 77 (match msg.Htmlrw_check.element with 78 | Some el -> Jv.set obj "elementName" (Jv.of_string el) 79 | None -> ()); 80 (match msg.Htmlrw_check.attribute with 81 | Some attr -> Jv.set obj "attribute" (Jv.of_string attr) 82 | None -> ()); 83 (match msg.Htmlrw_check.location with 84 | Some loc -> 85 Jv.set obj "line" (Jv.of_int loc.line); 86 Jv.set obj "column" (Jv.of_int loc.column) 87 | None -> ()); 88 obj 89 ) messages in 90 91 let error_count = List.length (List.filter (fun m -> 92 m.Htmlrw_check.severity = Htmlrw_check.Error) messages) in 93 let warning_count = List.length (List.filter (fun m -> 94 m.Htmlrw_check.severity = Htmlrw_check.Warning) messages) in 95 let info_count = List.length (List.filter (fun m -> 96 m.Htmlrw_check.severity = Htmlrw_check.Info) messages) in 97 98 Jv.set response "warnings" warnings; 99 Jv.set response "errorCount" (Jv.of_int error_count); 100 Jv.set response "warningCount" (Jv.of_int warning_count); 101 Jv.set response "infoCount" (Jv.of_int info_count); 102 Jv.set response "hasErrors" (Jv.of_bool (error_count > 0)); 103 (* Add tree structure for debugging *) 104 let tree_dump = dump_tree_structure html in 105 Jv.set response "treeStructure" (Jv.of_string tree_dump); 106 Jv.set response "htmlPreview" (Jv.of_string preview); 107 console_log (Printf.sprintf "[html5rw worker] Tree structure:\n%s" tree_dump) 108 with exn -> 109 (* Return error on parse failure *) 110 let error_obj = Jv.obj [||] in 111 Jv.set error_obj "severity" (Jv.of_string "error"); 112 Jv.set error_obj "message" (Jv.of_string (Printf.sprintf "Parse error: %s" (Printexc.to_string exn))); 113 Jv.set error_obj "errorCode" (Jv.of_string "parse-error"); 114 Jv.set response "warnings" (Jv.of_list Fun.id [error_obj]); 115 Jv.set response "errorCount" (Jv.of_int 1); 116 Jv.set response "warningCount" (Jv.of_int 0); 117 Jv.set response "infoCount" (Jv.of_int 0); 118 Jv.set response "hasErrors" (Jv.of_bool true); 119 Jv.set response "parseError" (Jv.of_string (Printexc.to_string exn))); 120 121 console_log "[html5rw worker] Validation complete, posting response"; 122 (* Post result back to main thread *) 123 let self = Jv.get Jv.global "self" in 124 ignore (Jv.call self "postMessage" [| response |]) 125 with exn -> 126 (* Outer error handler - catches message parsing errors *) 127 console_error (Printf.sprintf "[html5rw worker] Fatal error: %s" (Printexc.to_string exn)); 128 let error_obj = Jv.obj [||] in 129 Jv.set error_obj "severity" (Jv.of_string "error"); 130 Jv.set error_obj "message" (Jv.of_string (Printf.sprintf "Worker error: %s" (Printexc.to_string exn))); 131 Jv.set error_obj "errorCode" (Jv.of_string "worker-error"); 132 Jv.set response "id" (Jv.of_int (-1)); 133 Jv.set response "warnings" (Jv.of_list Fun.id [error_obj]); 134 Jv.set response "errorCount" (Jv.of_int 1); 135 Jv.set response "warningCount" (Jv.of_int 0); 136 Jv.set response "infoCount" (Jv.of_int 0); 137 Jv.set response "hasErrors" (Jv.of_bool true); 138 Jv.set response "fatalError" (Jv.of_string (Printexc.to_string exn)); 139 let self = Jv.get Jv.global "self" in 140 ignore (Jv.call self "postMessage" [| response |]) 141 142let () = 143 console_log "[html5rw worker] Worker script starting..."; 144 (* Set up message handler *) 145 let self = Jv.get Jv.global "self" in 146 let handler = Jv.callback ~arity:1 (fun ev -> 147 let data = Jv.get ev "data" in 148 handle_message data 149 ) in 150 ignore (Jv.call self "addEventListener" [| Jv.of_string "message"; handler |]); 151 console_log "[html5rw worker] Message handler registered, ready for messages"