(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(*---------------------------------------------------------------------------
  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
  SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Encoding Detection and Decoding

    This module implements the WHATWG encoding sniffing and decoding
    algorithms for HTML5 documents. It handles automatic character
    encoding detection from byte order marks (BOM), meta charset
    declarations, and transport layer hints.

    {2 Encoding Detection Algorithm}

    The encoding detection follows the WHATWG specification:
    1. Check for a BOM (UTF-8, UTF-16LE, UTF-16BE)
    2. Prescan for [<meta charset>] or [<meta http-equiv="content-type">]
    3. Use transport layer encoding hint if provided
    4. Fall back to UTF-8 as the default

    @see <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>
    WHATWG encoding sniffing algorithm
*)

(** {1 Types} *)

(** Character encodings supported by the parser.

    The HTML5 specification requires support for a large number of
    encodings, but this implementation focuses on the most common ones.
    Other encodings are mapped to their closest equivalent.

    The type is re-exported from {!Encoding_types.t} (with its constructors
    repeated here), so values of the two types are interchangeable.
*)
type encoding = Encoding_types.t =
  | Utf8  (** UTF-8 encoding (default) *)
  | Utf16le  (** UTF-16 little-endian *)
  | Utf16be  (** UTF-16 big-endian *)
  | Windows_1252  (** Windows-1252 (Latin-1 superset) *)
  | Iso_8859_2  (** ISO-8859-2 (Central European) *)
  | Euc_jp  (** EUC-JP (Japanese) *)

(** {1 Encoding Utilities} *)

(** Convert an encoding to its canonical label string.

    Returns the WHATWG canonical name, e.g., ["utf-8"], ["utf-16le"].

    This is an alias of [Encoding_types.to_string].
*)
let encoding_to_string = Encoding_types.to_string

(** Detect encoding from a byte order mark.

    Examines the first bytes of the input for a BOM and returns the
    detected encoding with the number of bytes to skip.

    This is an alias of [Encoding_bom.sniff].

    @return [Some (encoding, skip_bytes)] if a BOM is found,
    [None] otherwise.
*)
let sniff_bom = Encoding_bom.sniff

(** Normalize an encoding label to its canonical form.

    Maps encoding labels (case-insensitive, with optional whitespace)
    to the supported encoding types.

    This is an alias of [Encoding_labels.normalize_label].

    @return [Some encoding] if the label is recognized, [None] otherwise.

    {[
      normalize_label "UTF-8"   (* Some Utf8 *)
      normalize_label "utf8"    (* Some Utf8 *)
      normalize_label "latin1"  (* Some Windows_1252 *)
    ]}
*)
let normalize_label = Encoding_labels.normalize_label

(** Prescan bytes to find a meta charset declaration.

    Implements the WHATWG prescan algorithm that looks for encoding
    declarations in the first 1024 bytes of an HTML document.

    This is an alias of [Encoding_prescan.prescan_for_meta_charset].

    @return [Some encoding] if a meta charset is found, [None] otherwise.
*)
let prescan_for_meta_charset = Encoding_prescan.prescan_for_meta_charset

(** {1 Decoding} *)

(** Decode raw bytes to a UTF-8 string with automatic encoding detection.

    This function implements the full encoding sniffing algorithm:
    1. Check for BOM
    2. Prescan for meta charset
    3. Use transport encoding hint if provided
    4. Fall back to UTF-8

    This is an alias of [Encoding_decode.decode]; it is bound directly
    (not eta-expanded) so the full signature of that function, including
    any optional arguments, is preserved.

    @param transport_encoding Encoding hint from HTTP Content-Type header
    @return [(decoded_string, detected_encoding)]

    {[
      let (html, enc) = decode raw_bytes ()
      (* html is now a UTF-8 string, enc is the detected encoding *)
    ]}
*)
let decode = Encoding_decode.decode

(** [pp fmt enc] prints [enc] on the formatter [fmt] as the string produced
    by [encoding_to_string enc]. Its argument order makes it usable with the
    [%a] conversion of [Format]. *)
let pp fmt enc = Format.pp_print_string fmt (encoding_to_string enc)