(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Named Character Reference Decoding

    This module provides functions for decoding HTML5 named character
    references (entities) and numeric character references. It includes
    the complete table of 2,231 named character references defined in
    the WHATWG HTML5 specification.

    {2 Character Reference Types}

    HTML5 supports three types of character references:

    {3 Named References}
    - Standard form: [&amp;], [&lt;], [&gt;], [&nbsp;]
    - Some entities have multiple codepoint outputs: [&NotNestedLessLess;]

    {3 Decimal Numeric References}
    - Form: [&#123;] (decimal codepoint)

    {3 Hexadecimal Numeric References}
    - Form: [&#x7B;] or [&#X7B;] (hexadecimal codepoint)

    {2 Legacy Entity Handling}

    Some named entities are "legacy" - they were supported without a
    trailing semicolon in older browsers (e.g., [&amp] instead of [&amp;]).
    The parser handles these according to the WHATWG specification.

    @see <https://html.spec.whatwg.org/multipage/named-characters.html>
    The complete list of named character references
*)

(** {1 Decoding Functions} *)

(** Decode all character references in a text string.

    Processes the string and replaces all valid character references
    (named and numeric) with their decoded UTF-8 equivalents.

    This is an alias for [Entities_decode.decode_entities_in_text]; it is
    bound point-free so the underlying signature is re-exported unchanged.

    {[
      decode "Hello &amp; goodbye"
      (* Returns: "Hello & goodbye" *)

      decode "&lt;script&gt;"
      (* Returns: "<script>" *)
    ]}
*)
let decode = Entities_decode.decode_entities_in_text

(** Decode a numeric character reference.

    This is an alias for [Entities_numeric_ref.decode].

    @param codepoint The Unicode codepoint to decode
    @return The UTF-8 string representation

    Note: Some codepoints are replaced according to the HTML5
    specification (e.g., control characters in the 0x80-0x9F range
    are mapped to Windows-1252 equivalents).
*)
let decode_numeric = Entities_numeric_ref.decode

(** Look up a named character reference.

    This is an alias for [Entities_entity_table.lookup].

    @param name The entity name without [&] and [;] (e.g., ["amp"])
    @return [Some codepoints] if the entity exists, [None] otherwise

    {[
      lookup "amp"   (* Some [0x26] *)
      lookup "nbsp"  (* Some [0xA0] *)
      lookup "bogus" (* None *)
    ]}
*)
let lookup = Entities_entity_table.lookup

(** Check if an entity is a legacy entity.

    Legacy entities are those that were historically recognized without
    a trailing semicolon. The parser handles these specially to maintain
    browser compatibility.

    This is an alias for [Entities_entity_table.is_legacy].

    {[
      is_legacy "amp"    (* true - &amp works without ; *)
      is_legacy "nbsp"   (* true *)
      is_legacy "Aacute" (* false - requires semicolon *)
    ]}

    NOTE(review): the WHATWG named character reference table also lists
    [Aacute] without a trailing semicolon, so the last example may not
    match what [Entities_entity_table.is_legacy] returns. Confirm against
    the table and, if needed, swap in an entity that only exists in the
    semicolon form.
*)
let is_legacy = Entities_entity_table.is_legacy

(** Convert a Unicode codepoint to its UTF-8 encoding.

    This is an alias for [Entities_numeric_ref.codepoint_to_utf8].

    @param codepoint The Unicode codepoint (0 to 0x10FFFF)
    @return The UTF-8 encoded string
*)
let codepoint_to_utf8 = Entities_numeric_ref.codepoint_to_utf8

(** {1 Sub-modules} *)

(** Numeric character reference handling.

    Alias for [Entities_numeric_ref], the module that also supplies the
    implementations behind {!decode_numeric} and {!codepoint_to_utf8}. *)
module Numeric_ref = Entities_numeric_ref