OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(* Main parser entry point - bytesrw-only API *)
7
8open Bytesrw
9
10module Dom = Dom
11module Tokenizer = Tokenizer
12module Encoding = Encoding
13
14type parse_error = Parser_tree_builder.parse_error
15
16type fragment_context = Parser_tree_builder.fragment_context
17
18type t = {
19 root : Dom.node;
20 errors : parse_error list;
21 encoding : Encoding.encoding option;
22}
23
24(* Token sink that feeds tokens to tree builder *)
25module TreeBuilderSink = struct
26 type t = Parser_tree_builder.t
27
28 let process tb token ~line ~column =
29 Parser_tree_builder.set_position tb ~line ~column;
30 Parser_tree_builder.process_token tb token;
31 (* Check if we need to switch tokenizer state based on current element *)
32 (* Only switch for HTML namespace elements - SVG/MathML use different rules *)
33 match Parser_tree_builder.current_node tb with
34 | Some node when node.Dom.namespace = None || node.Dom.namespace = Some "html" ->
35 let name = node.Dom.name in
36 if List.mem name ["textarea"; "title"] then
37 `SwitchTo Tokenizer_state.Rcdata
38 else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then
39 `SwitchTo Tokenizer_state.Rawtext
40 else if name = "script" then
41 `SwitchTo Tokenizer_state.Script_data
42 else if name = "plaintext" then
43 `SwitchTo Tokenizer_state.Plaintext
44 else
45 `Continue
46 | _ -> `Continue
47
48 let adjusted_current_node_in_html_namespace tb =
49 Parser_tree_builder.adjusted_current_node_in_html_namespace tb
50end
51
52(* Core parsing function that takes a Bytes.Reader.t *)
53let parse ?(collect_errors=false) ?fragment_context (reader : Bytes.Reader.t) =
54 let tb = Parser_tree_builder.create ~collect_errors ?fragment_context () in
55 let tokenizer = Tokenizer.create (module TreeBuilderSink) tb ~collect_errors () in
56
57 (* Set tokenizer state for fragment parsing *)
58 (* Note: We do NOT set last_start_tag because in fragment parsing, no start tag has been
59 emitted. This means end tags won't match as "appropriate end tags" and will be treated
60 as raw text in RCDATA/RAWTEXT/Script modes. *)
61 (* Only change tokenizer state for HTML namespace contexts - foreign contexts use Data state *)
62 (match fragment_context with
63 | Some ctx when ctx.namespace = None || ctx.namespace = Some "html" ->
64 let name = String.lowercase_ascii ctx.tag_name in
65 if List.mem name ["title"; "textarea"] then
66 Tokenizer.set_state tokenizer Tokenizer_state.Rcdata
67 else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then
68 Tokenizer.set_state tokenizer Tokenizer_state.Rawtext
69 else if name = "script" then
70 Tokenizer.set_state tokenizer Tokenizer_state.Script_data
71 else if name = "plaintext" then
72 Tokenizer.set_state tokenizer Tokenizer_state.Plaintext
73 | _ -> ());
74
75 Tokenizer.run tokenizer (module TreeBuilderSink) reader;
76
77 let root = Parser_tree_builder.finish tb in
78 let tokenizer_errors = Tokenizer.get_errors tokenizer in
79 let tree_errors = Parser_tree_builder.get_errors tb in
80 let all_errors = List.map (fun e ->
81 { Parser_tree_builder.code = e.Tokenizer.Errors.code;
82 line = e.Tokenizer.Errors.line;
83 column = e.Tokenizer.Errors.column }
84 ) tokenizer_errors @ tree_errors in
85
86 { root; errors = all_errors; encoding = None }
87
88(* Parse raw bytes with automatic encoding detection *)
89let parse_bytes ?(collect_errors=false) ?transport_encoding ?fragment_context data =
90 let (html, enc) = Encoding.decode data ?transport_encoding () in
91 let reader = Bytes.Reader.of_string html in
92 let result = parse ~collect_errors ?fragment_context reader in
93 { result with encoding = Some enc }
94
95let query t selector =
96 Selector.query t.root selector
97
98(* Serialize to a Bytes.Writer.t *)
99let to_writer ?(pretty=true) ?(indent_size=2) t (writer : Bytes.Writer.t) =
100 let html = Dom.to_html ~pretty ~indent_size t.root in
101 Bytes.Writer.write_string writer html
102
103(* Serialize to string (convenience for when result fits in memory) *)
104let to_string ?(pretty=true) ?(indent_size=2) t =
105 Dom.to_html ~pretty ~indent_size t.root
106
107(* Extract text content *)
108let to_text ?(separator=" ") ?(strip=true) t =
109 Dom.to_text ~separator ~strip t.root
110
111(* For testing *)
112let to_test_format t =
113 Dom.to_test_format t.root