OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 84a2203f96dd85d03a6a6da73df91e62c08db43a 103 lines 3.6 kB view raw
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Encoding Detection and Decoding

    This module implements the WHATWG encoding sniffing and decoding
    algorithms for HTML5 documents. It handles automatic character
    encoding detection from byte order marks (BOM), meta charset
    declarations, and transport layer hints.

    {2 Encoding Detection Algorithm}

    The encoding detection follows the WHATWG specification:
    1. Check for a BOM (UTF-8, UTF-16LE, UTF-16BE)
    2. Prescan for [<meta charset>] or [<meta http-equiv="content-type">]
    3. Use transport layer encoding hint if provided
    4. Fall back to UTF-8 as the default

    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
      WHATWG encoding sniffing algorithm
*)

(** {1 Types} *)

(** Character encodings supported by the parser.

    The HTML5 specification requires support for a large number of
    encodings, but this implementation focuses on the most common ones.
    Other encodings are mapped to their closest equivalent.

    The type is re-exported from [Encoding_types.t] with its constructors,
    so values of the two types are interchangeable.
*)
type encoding = Encoding_types.t =
  | Utf8  (** UTF-8 encoding (default) *)
  | Utf16le  (** UTF-16 little-endian *)
  | Utf16be  (** UTF-16 big-endian *)
  | Windows_1252  (** Windows-1252 (Latin-1 superset) *)
  | Iso_8859_2  (** ISO-8859-2 (Central European) *)
  | Euc_jp  (** EUC-JP (Japanese) *)

(** {1 Encoding Utilities} *)

(** Convert an encoding to its canonical label string.

    Returns the WHATWG canonical name, e.g., ["utf-8"], ["utf-16le"].
    This is an alias for [Encoding_types.to_string].
*)
let encoding_to_string = Encoding_types.to_string

(** Detect encoding from a byte order mark.

    Examines the first bytes of the input for a BOM and returns the
    detected encoding with the number of bytes to skip.
    This is an alias for [Encoding_bom.sniff].

    @return [(Some (encoding, skip_bytes))] if a BOM is found,
            [None] otherwise.
*)
let sniff_bom = Encoding_bom.sniff

(** Normalize an encoding label to its canonical form.

    Maps encoding labels (case-insensitive, with optional whitespace)
    to the supported encoding types.
    This is an alias for [Encoding_labels.normalize_label].

    @return [Some encoding] if the label is recognized, [None] otherwise.

    {[
      normalize_label "UTF-8"   (* Some Utf8 *)
      normalize_label "utf8"    (* Some Utf8 *)
      normalize_label "latin1"  (* Some Windows_1252 *)
    ]}
*)
let normalize_label = Encoding_labels.normalize_label

(** Prescan bytes to find a meta charset declaration.

    Implements the WHATWG prescan algorithm that looks for encoding
    declarations in the first 1024 bytes of an HTML document.
    This is an alias for [Encoding_prescan.prescan_for_meta_charset].

    @return [Some encoding] if a meta charset is found, [None] otherwise.
*)
let prescan_for_meta_charset = Encoding_prescan.prescan_for_meta_charset

(** {1 Decoding} *)

(* NOTE(review): the WHATWG algorithm gives a transport-layer encoding
   precedence over the meta prescan, whereas the order documented below
   prescans first. [Encoding_decode.decode] is not visible from this file;
   confirm which order it implements and align the documentation. *)

(** Decode raw bytes to a UTF-8 string with automatic encoding detection.

    This function implements the full encoding sniffing algorithm:
    1. Check for BOM
    2. Prescan for meta charset
    3. Use transport encoding hint if provided
    4. Fall back to UTF-8

    This is an alias for [Encoding_decode.decode].

    @param transport_encoding Encoding hint from HTTP Content-Type header
    @return [(decoded_string, detected_encoding)]

    {[
      let (html, enc) = decode raw_bytes ()
      (* html is now a UTF-8 string, enc is the detected encoding *)
    ]}
*)
let decode = Encoding_decode.decode

(** [pp fmt enc] prints the canonical label of [enc], as returned by
    {!encoding_to_string}, on the formatter [fmt]. Its argument order makes
    it usable with the [%a] conversion of [Format]. *)
let pp fmt enc = Format.pp_print_string fmt (encoding_to_string enc)