(* Forked from anil.recoil.org/ocaml-punycode.
   Punycode (RFC 3492) in OCaml. *)
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

(** RFC 3492 Punycode: A Bootstring encoding of Unicode for IDNA.

    This module implements the Punycode algorithm as specified in
    {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}, providing
    encoding and decoding of Unicode strings to/from an ASCII-compatible
    encoding suitable for use in internationalized domain names.

    Punycode is an instance of Bootstring that uses particular parameter values
    appropriate for IDNA. See
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
     5} for the specific parameter values.

    {2 References}
    - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A
      Bootstring encoding of Unicode for IDNA
    - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol
*)

(** {1 Position Tracking} *)

type position
(** Abstract type representing a position in the input, used for error
    reporting. A position tracks both the byte offset and the Unicode
    character index. *)

val position_byte_offset : position -> int
(** [position_byte_offset pos] returns the byte offset of [pos] in the input. *)

val position_char_index : position -> int
(** [position_char_index pos] returns the Unicode character index (0-based) of
    [pos]. *)

val pp_position : Format.formatter -> position -> unit
(** [pp_position fmt pos] pretty-prints a position as "byte N, char M". *)

(** {1 Error Types} *)

(** Reasons a Punycode operation can fail. Values of this type are the payload
    of the {!Error} exception. *)
type error_reason =
  | Overflow of position
      (** Arithmetic overflow during encode/decode. This can occur with very
          long strings or extreme Unicode code point values. See
          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4}RFC 3492
           Section 6.4} for overflow handling requirements. *)
  | Invalid_character of position * Uchar.t
      (** A non-basic code point appeared where only basic code points (ASCII <
          128) are allowed. Per
          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}RFC 3492
           Section 3.1}, basic code points must be segregated at the beginning
          of the encoded string. *)
  | Invalid_digit of position * char
      (** An invalid Punycode digit was encountered during decoding. Valid
          digits are a-z, A-Z (values 0-25) and 0-9 (values 26-35). See
          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492
           Section 5} for digit-value mappings. *)
  | Unexpected_end of position
      (** The input ended prematurely during decoding of a delta value. See
          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
           Section 6.2} decoding procedure. *)
  | Invalid_utf8 of position  (** Malformed UTF-8 sequence in input string. *)
  | Label_too_long of int
      (** Encoded label exceeds 63 bytes (DNS limit per
          {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}). The int
          is the actual length. *)
  | Empty_label  (** Empty label is not valid for encoding. *)

exception Error of error_reason
(** Exception raised for all Punycode encoding/decoding errors. The payload
    describes the cause of the failure. *)

val pp_error_reason : Format.formatter -> error_reason -> unit
(** [pp_error_reason fmt e] pretty-prints the error [e] on [fmt], with position
    information for the reasons that carry a position. *)

val error_reason_to_string : error_reason -> string
(** [error_reason_to_string e] converts the error [e] to a human-readable
    string. *)

(** {1 Constants}

    Punycode parameters as specified in
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
     5}. *)

val ace_prefix : string
(** [ace_prefix] is the ACE prefix ["xn--"] used for Punycode-encoded domain
    labels. See
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
     5}. *)

val max_label_length : int
(** [max_label_length] is the maximum length of a domain label in bytes (63),
    per {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)

(** {1 Case Flags for Mixed-Case Annotation}

    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
     Appendix A} describes an optional mechanism for preserving case information
    through the encoding/decoding round-trip. This is useful when the original
    string's case should be recoverable.

    Note: Mixed-case annotation is not used by the ToASCII and ToUnicode
    operations of IDNA. *)

(** Case annotation for a character. *)
type case_flag =
  | Uppercase  (** The character is flagged as uppercase. *)
  | Lowercase  (** The character is flagged as lowercase. *)

(** {1 Core Punycode Operations}

    These functions implement the Bootstring algorithms from
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section
     6}. They operate on arrays of Unicode code points ([Uchar.t array]). The
    encoded output is a plain ASCII string without the ACE prefix. *)

val encode : Uchar.t array -> string
(** [encode codepoints] encodes an array of Unicode code points to a Punycode
    ASCII string.

    Implements the encoding procedure from
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492
     Section 6.3}:

    {ol
     {- Basic code points (ASCII < 128) are copied literally to the beginning
        of the output per
        {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}Section
         3.1 (Basic code point segregation)}.}
     {- A delimiter ('-') is appended if there are any basic code points.}
     {- Non-basic code points are encoded as deltas using the generalized
        variable-length integer representation from
        {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section
         3.3}.}}

    Example:
    {[
      encode [| Uchar.of_int 0x4ED6; Uchar.of_int 0x4EEC; ... |]
      (* = "ihqwcrb4cv8a8dqg056pqjye" *)
    ]}

    @raise Error on encoding failure (overflow, etc.) *)

val decode : string -> Uchar.t array
(** [decode punycode] decodes a Punycode ASCII string to an array of Unicode
    code points.

    Implements the decoding procedure from
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
     Section 6.2}.

    The input should be the Punycode portion only, without the ACE prefix. The
    decoder is case-insensitive for the encoded portion, as required by
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
     5}: "A decoder MUST recognize the letters in both uppercase and lowercase
    forms".

    Example:
    {[
      decode "ihqwcrb4cv8a8dqg056pqjye"
      (* = [| U+4ED6; U+4EEC; U+4E3A; ... |] (Chinese simplified) *)
    ]}

    @raise Error on decoding failure (invalid digit, unexpected end, etc.) *)

(** {1 Mixed-Case Annotation}

    These functions support round-trip case preservation as described in
    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
     Appendix A}. *)

val encode_with_case : Uchar.t array -> case_flag array -> string
(** [encode_with_case codepoints case_flags] encodes with case annotation.

    Per
    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
     Appendix A}:
    - For basic (ASCII) letters, the output preserves the case flag directly
    - For non-ASCII characters, the case of the final digit in each delta
      encoding indicates the flag (uppercase = suggested uppercase)

    The [case_flags] array must have the same length as [codepoints].

    @raise Invalid_argument if array lengths don't match.
    @raise Error on encoding failure. *)

val decode_with_case : string -> Uchar.t array * case_flag array
(** [decode_with_case punycode] decodes and extracts case annotations.

    Per
    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
     Appendix A}, returns both the decoded code points and an array of case
    flags indicating the suggested case for each character based on the
    uppercase/lowercase form of the encoding digits.

    @raise Error on decoding failure. *)

(** {1 UTF-8 String Operations}

    Convenience functions that work directly with UTF-8 encoded OCaml strings.
    These combine UTF-8 decoding/encoding with the core Punycode operations. *)

val encode_utf8 : string -> string
(** [encode_utf8 s] encodes a UTF-8 string to Punycode (no ACE prefix).

    This is equivalent to decoding [s] from UTF-8 to code points, then calling
    {!encode}.

    Example:
    {[
      encode_utf8 "münchen"
      (* = "mnchen-3ya" *)
    ]}

    @raise Error on encoding failure. *)

val decode_utf8 : string -> string
(** [decode_utf8 punycode] decodes Punycode to a UTF-8 string (no ACE prefix).

    This is equivalent to calling {!decode} then encoding the result as UTF-8.

    Example:
    {[
      decode_utf8 "mnchen-3ya"
      (* = "münchen" *)
    ]}

    @raise Error on decoding failure. *)

(** {1 Domain Label Operations}

    These functions handle the ACE prefix automatically and enforce DNS label
    length limits per
    {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)

val encode_label : string -> string
(** [encode_label label] encodes a domain label for use in DNS.

    If the label contains only ASCII characters, it is returned unchanged.
    Otherwise, it is Punycode-encoded with the ACE prefix ("xn--") prepended, as
    specified in
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
     5}.

    Example:
    {[
      encode_label "münchen"
      (* = "xn--mnchen-3ya" *)
      encode_label "example"
      (* = "example" *)
    ]}

    @raise Error with {!Label_too_long} if the result exceeds 63 bytes.
    @raise Error with {!Empty_label} if the label is empty. *)

val decode_label : string -> string
(** [decode_label label] decodes a domain label.

    If the label starts with the ACE prefix ("xn--", case-insensitive), it is
    Punycode-decoded. Otherwise, it is returned unchanged.

    Example:
    {[
      decode_label "xn--mnchen-3ya"
      (* = "münchen" *)
      decode_label "example"
      (* = "example" *)
    ]}

    @raise Error on decoding failure. *)

(** {1 Validation}

    Predicate functions for checking code point and string properties. *)

val is_basic : Uchar.t -> bool
(** [is_basic u] is [true] if [u] is a basic code point (ASCII, < 128).

    Per
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
     5}, basic code points for Punycode are the ASCII code points (0..7F). *)

val is_ascii_string : string -> bool
(** [is_ascii_string s] is [true] if [s] contains only ASCII characters (all
    bytes < 128). *)

val has_ace_prefix : string -> bool
(** [has_ace_prefix s] is [true] if [s] starts with the ACE prefix "xn--"
    (case-insensitive comparison). *)