OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** XHTML parser using xmlm for proper XML parsing. 2 3 This module provides XML parsing for XHTML files, which the HTML5 parser 4 cannot handle correctly (especially self-closing tags on non-void elements). *) 5 6(** Parse XHTML content using xmlm and return a DOM tree. *) 7let parse_xhtml content = 8 let input = Xmlm.make_input (`String (0, content)) in 9 10 (* Stack of nodes during parsing *) 11 let stack = ref [] in 12 let root = Html5rw.Dom.create_document () in 13 stack := [root]; 14 15 (* Helper to get namespace shorthand *) 16 let ns_shorthand ns = 17 if ns = "http://www.w3.org/2000/svg" then Some "svg" 18 else if ns = "http://www.w3.org/1998/Math/MathML" then Some "mathml" 19 else if ns = "http://www.w3.org/1999/xhtml" then None (* HTML namespace *) 20 else if ns = "" then None (* No namespace = HTML *) 21 else Some ns (* Keep other namespaces as-is *) 22 in 23 24 (* Process xmlm signals *) 25 let rec process () = 26 if Xmlm.eoi input then () 27 else begin 28 match Xmlm.input input with 29 | `Dtd _ -> 30 (* Skip DTD for now *) 31 process () 32 | `El_start ((ns, local), attrs) -> 33 (* Create element node *) 34 let namespace = ns_shorthand ns in 35 let attr_list = List.map (fun ((_, aname), aval) -> (aname, aval)) attrs in 36 let node = Html5rw.Dom.create_element local ~namespace ~attrs:attr_list () in 37 (* Append to current parent *) 38 (match !stack with 39 | parent :: _ -> Html5rw.Dom.append_child parent node 40 | [] -> ()); 41 (* Push onto stack *) 42 stack := node :: !stack; 43 process () 44 | `El_end -> 45 (* Pop from stack *) 46 (match !stack with 47 | _ :: rest -> stack := rest 48 | [] -> ()); 49 process () 50 | `Data text -> 51 (* Create text node and append to current parent *) 52 let trimmed = String.trim text in 53 if trimmed <> "" || String.length text > 0 then begin 54 let text_node = Html5rw.Dom.create_text text in 55 (match !stack with 56 | parent :: _ -> Html5rw.Dom.append_child parent text_node 57 | [] -> ()) 58 end; 59 process () 60 end 61 in 62 63 try 64 process (); 65 Ok root 66 with 67 | Xmlm.Error ((line, col), err) -> 68 Error (Printf.sprintf "XML parse error at %d:%d: %s" line col (Xmlm.error_message err)) 69 70(** Check if a system_id indicates an XHTML file. *) 71let is_xhtml_file system_id = 72 match system_id with 73 | Some path -> 74 String.length path > 6 && 75 String.sub path (String.length path - 6) 6 = ".xhtml" 76 | None -> false 77 78(** Wrap DOM in an Html5rw.t-compatible structure for the checker. *) 79type xhtml_doc = { 80 root : Html5rw.Dom.node; 81 errors : Html5rw.Error.t list; 82} 83 84let xhtml_root doc = doc.root 85let xhtml_errors _doc = [] (* XML parser handles errors differently *)