OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 3549498fc6a104ca0f4e4921dd84f58e25e5826d 113 lines 4.5 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6(* Main parser entry point - bytesrw-only API *) 7 8open Bytesrw 9 10module Dom = Dom 11module Tokenizer = Tokenizer 12module Encoding = Encoding 13 14type parse_error = Parser_tree_builder.parse_error 15 16type fragment_context = Parser_tree_builder.fragment_context 17 18type t = { 19 root : Dom.node; 20 errors : parse_error list; 21 encoding : Encoding.encoding option; 22} 23 24(* Token sink that feeds tokens to tree builder *) 25module TreeBuilderSink = struct 26 type t = Parser_tree_builder.t 27 28 let process tb token ~line ~column = 29 Parser_tree_builder.set_position tb ~line ~column; 30 Parser_tree_builder.process_token tb token; 31 (* Check if we need to switch tokenizer state based on current element *) 32 (* Only switch for HTML namespace elements - SVG/MathML use different rules *) 33 match Parser_tree_builder.current_node tb with 34 | Some node when node.Dom.namespace = None || node.Dom.namespace = Some "html" -> 35 let name = node.Dom.name in 36 if List.mem name ["textarea"; "title"] then 37 `SwitchTo Tokenizer_state.Rcdata 38 else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then 39 `SwitchTo Tokenizer_state.Rawtext 40 else if name = "script" then 41 `SwitchTo Tokenizer_state.Script_data 42 else if name = "plaintext" then 43 `SwitchTo Tokenizer_state.Plaintext 44 else 45 `Continue 46 | _ -> `Continue 47 48 let adjusted_current_node_in_html_namespace tb = 49 Parser_tree_builder.adjusted_current_node_in_html_namespace tb 50end 51 52(* Core parsing function that takes a Bytes.Reader.t *) 53let parse ?(collect_errors=false) ?fragment_context (reader : Bytes.Reader.t) = 54 let tb = Parser_tree_builder.create ~collect_errors ?fragment_context () in 55 let tokenizer = Tokenizer.create (module TreeBuilderSink) tb ~collect_errors () in 56 57 (* Set tokenizer state for fragment parsing *) 58 (* Note: We do NOT set last_start_tag because in fragment parsing, no start tag has been 59 emitted. This means end tags won't match as "appropriate end tags" and will be treated 60 as raw text in RCDATA/RAWTEXT/Script modes. *) 61 (* Only change tokenizer state for HTML namespace contexts - foreign contexts use Data state *) 62 (match fragment_context with 63 | Some ctx when ctx.namespace = None || ctx.namespace = Some "html" -> 64 let name = String.lowercase_ascii ctx.tag_name in 65 if List.mem name ["title"; "textarea"] then 66 Tokenizer.set_state tokenizer Tokenizer_state.Rcdata 67 else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then 68 Tokenizer.set_state tokenizer Tokenizer_state.Rawtext 69 else if name = "script" then 70 Tokenizer.set_state tokenizer Tokenizer_state.Script_data 71 else if name = "plaintext" then 72 Tokenizer.set_state tokenizer Tokenizer_state.Plaintext 73 | _ -> ()); 74 75 Tokenizer.run tokenizer (module TreeBuilderSink) reader; 76 77 let root = Parser_tree_builder.finish tb in 78 let tokenizer_errors = Tokenizer.get_errors tokenizer in 79 let tree_errors = Parser_tree_builder.get_errors tb in 80 let all_errors = List.map (fun e -> 81 { Parser_tree_builder.code = e.Tokenizer.Errors.code; 82 line = e.Tokenizer.Errors.line; 83 column = e.Tokenizer.Errors.column } 84 ) tokenizer_errors @ tree_errors in 85 86 { root; errors = all_errors; encoding = None } 87 88(* Parse raw bytes with automatic encoding detection *) 89let parse_bytes ?(collect_errors=false) ?transport_encoding ?fragment_context data = 90 let (html, enc) = Encoding.decode data ?transport_encoding () in 91 let reader = Bytes.Reader.of_string html in 92 let result = parse ~collect_errors ?fragment_context reader in 93 { result with encoding = Some enc } 94 95let query t selector = 96 Selector.query t.root selector 97 98(* Serialize to a Bytes.Writer.t *) 99let to_writer ?(pretty=true) ?(indent_size=2) t (writer : Bytes.Writer.t) = 100 let html = Dom.to_html ~pretty ~indent_size t.root in 101 Bytes.Writer.write_string writer html 102 103(* Serialize to string (convenience for when result fits in memory) *) 104let to_string ?(pretty=true) ?(indent_size=2) t = 105 Dom.to_html ~pretty ~indent_size t.root 106 107(* Extract text content *) 108let to_text ?(separator=" ") ?(strip=true) t = 109 Dom.to_text ~separator ~strip t.root 110 111(* For testing *) 112let to_test_format t = 113 Dom.to_test_format t.root