OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 3.3 kB view raw
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Named Character Reference Decoding

    This module provides functions for decoding HTML5 named character
    references (entities) and numeric character references. It includes
    the complete table of 2,231 named character references defined in
    the WHATWG HTML5 specification.

    Every value defined here is a re-export of a value from one of the
    implementation modules [Entities_decode], [Entities_numeric_ref] and
    [Entities_entity_table].

    {2 Character Reference Types}

    HTML5 supports three types of character references:

    {3 Named References}
    - Standard form: [&amp;], [&lt;], [&gt;], [&nbsp;]
    - Some entities have multiple codepoint outputs: [&NotNestedLessLess;]

    {3 Decimal Numeric References}
    - Form: [&#123;] (decimal codepoint)

    {3 Hexadecimal Numeric References}
    - Form: [&#x7B;] or [&#X7B;] (hexadecimal codepoint)

    {2 Legacy Entity Handling}

    Some named entities are "legacy" - they were supported without a
    trailing semicolon in older browsers (e.g., [&amp] instead of [&amp;]).
    The parser handles these according to the WHATWG specification.

    @see <https://html.spec.whatwg.org/multipage/named-characters.html>
      The complete list of named character references
*)

(** {1 Decoding Functions} *)

(* NOTE(review): the signature of the aliased function is not visible from
   this file; confirm that a single string argument is all it takes, as the
   examples in the doc comment below assume. *)
(** Decode all character references in a text string.

    Processes the string and replaces all valid character references
    (named and numeric) with their decoded UTF-8 equivalents.

    This is an alias for [Entities_decode.decode_entities_in_text].

    {[
      decode "Hello &amp; goodbye"
      (* Returns: "Hello & goodbye" *)

      decode "&#60;script&#62;"
      (* Returns: "<script>" *)
    ]}
*)
let decode = Entities_decode.decode_entities_in_text

(** Decode a numeric character reference.

    This is an alias for [Entities_numeric_ref.decode].

    Note: Some codepoints are replaced according to the HTML5
    specification (e.g., control characters in the 0x80-0x9F range
    are mapped to Windows-1252 equivalents).

    @param codepoint The Unicode codepoint to decode
    @return The UTF-8 string representation
*)
let decode_numeric = Entities_numeric_ref.decode

(** Look up a named character reference.

    This is an alias for [Entities_entity_table.lookup].

    {[
      lookup "amp"    (* Some [0x26] *)
      lookup "nbsp"   (* Some [0xA0] *)
      lookup "bogus"  (* None *)
    ]}

    @param name The entity name without [&] and [;] (e.g., ["amp"])
    @return [Some codepoints] if the entity exists, [None] otherwise
*)
let lookup = Entities_entity_table.lookup

(* NOTE(review): the WHATWG named character reference table also lists
   Aacute without a trailing semicolon, so the false result shown for it in
   the example below may not match the table - confirm against
   Entities_entity_table.is_legacy. *)
(** Check if an entity is a legacy entity.

    Legacy entities are those that were historically recognized without
    a trailing semicolon. The parser handles these specially to maintain
    browser compatibility.

    This is an alias for [Entities_entity_table.is_legacy].

    {[
      is_legacy "amp"     (* true - &amp works without ; *)
      is_legacy "nbsp"    (* true *)
      is_legacy "Aacute"  (* false - requires semicolon *)
    ]}
*)
let is_legacy = Entities_entity_table.is_legacy

(** Convert a Unicode codepoint to its UTF-8 encoding.

    This is an alias for [Entities_numeric_ref.codepoint_to_utf8].

    @param codepoint The Unicode codepoint (0 to 0x10FFFF)
    @return The UTF-8 encoded string
*)
let codepoint_to_utf8 = Entities_numeric_ref.codepoint_to_utf8

(** {1 Sub-modules} *)

(** Numeric character reference handling.

    Alias of [Entities_numeric_ref], the module from which
    [decode_numeric] and [codepoint_to_utf8] are re-exported. *)
module Numeric_ref = Entities_numeric_ref