lib/punycode.mli at main · anil.recoil.org/ocaml-punycode

Punycode (RFC3492) in OCaml
ocaml-punycode / lib / punycode.mli
at main 283 lines 11 kB view raw
  1(*---------------------------------------------------------------------------
  2  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
  3  SPDX-License-Identifier: ISC
  4 ---------------------------------------------------------------------------*)
  5
  6(** RFC 3492 Punycode: A Bootstring encoding of Unicode for IDNA.
  7
  8    This module implements the Punycode algorithm as specified in
  9    {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}, providing
 10    encoding and decoding of Unicode strings to/from ASCII-compatible encoding
 11    suitable for use in internationalized domain names.
 12
 13    Punycode is an instance of Bootstring that uses particular parameter values
 14    appropriate for IDNA. See
 15    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
 16     5} for the specific parameter values.
 17
 18    {2 References}
 19    - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A
 20      Bootstring encoding of Unicode for IDNA
 21    - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol
 22*)
 23
 24(** {1 Position Tracking} *)
 25
 26type position
 27(** Abstract type representing a position in the input, used for error
 28    reporting. A position records both a byte offset and a Unicode character
       index, readable with {!position_byte_offset} and
       {!position_char_index}. Positions are carried by several constructors
       of {!error_reason}. *)
 29
 30val position_byte_offset : position -> int
 31(** [position_byte_offset pos] returns the byte offset into the input that
       [pos] refers to. See {!position_char_index} for the corresponding
       character index. *)
 32
 33val position_char_index : position -> int
 34(** [position_char_index pos] returns the 0-based index of [pos] counted in
       Unicode characters. See {!position_byte_offset} for the corresponding
       byte offset. *)
 35
 36val pp_position : Format.formatter -> position -> unit
 37(** [pp_position fmt pos] pretty-prints [pos] on [fmt] in the form
       ["byte N, char M"]. Its type makes it usable with the [%a] conversion
       of the [Format] printing functions. *)
 38
 39(** {1 Error Types} *)
 40
 41type error_reason =
 42  | Overflow of position
 43      (** Arithmetic overflow during encode/decode. This can occur with very
 44          long strings or extreme Unicode code point values. See
 45          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4}RFC 3492
 46           Section 6.4} for overflow handling requirements. The payload is the
             input position associated with the error. *)
 47  | Invalid_character of position * Uchar.t
 48      (** A non-basic code point appeared where only basic code points (ASCII <
 49          128) are allowed. Per
 50          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}RFC 3492
 51           Section 3.1}, basic code points must be segregated at the beginning
 52          of the encoded string. The payload is the input position and the
             code point concerned. *)
 53  | Invalid_digit of position * char
 54      (** An invalid Punycode digit was encountered during decoding. Valid
 55          digits are a-z, A-Z (values 0-25) and 0-9 (values 26-35). See
 56          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492
 57           Section 5} for digit-value mappings. The payload is the input
             position and the character concerned. *)
 58  | Unexpected_end of position
 59      (** The input ended prematurely during decoding of a delta value. See the
 60          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
 61           Section 6.2} decoding procedure. The payload is the input position
             associated with the error. *)
 62  | Invalid_utf8 of position
         (** Malformed UTF-8 sequence in the input string. The payload is the
             input position associated with the error. *)
 63  | Label_too_long of int
 64      (** Encoded label exceeds 63 bytes (DNS limit per
 65          {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}). The int
 66          is the actual length. Reported by {!encode_label}. *)
 67  | Empty_label
         (** An empty label is not valid for encoding. Reported by
             {!encode_label}. *)
 68
 69exception Error of error_reason
 70(** Exception raised for all Punycode encoding/decoding errors. The payload
       describes the failure; render it with {!pp_error_reason} or
       {!error_reason_to_string}. Note that {!encode_with_case} additionally
       raises [Invalid_argument] when its array lengths do not match. *)
 71
 72val pp_error_reason : Format.formatter -> error_reason -> unit
 73(** [pp_error_reason fmt e] pretty-prints the error reason [e] on [fmt], with
       position information. See {!error_reason_to_string} for a
       string-returning counterpart. *)
 74
 75val error_reason_to_string : error_reason -> string
 76(** [error_reason_to_string e] converts the error reason [e] to a
       human-readable string. See {!pp_error_reason} for a [Format]-based
       counterpart. *)
 77
 78(** {1 Constants}
 79
 80    Punycode parameters as specified in
 81    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
 82     5}. *)
 83
 84val ace_prefix : string
 85(** The ACE prefix ["xn--"] used for Punycode-encoded domain labels. See
 86    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
 87     5} which notes that IDNA prepends this prefix. {!encode} and {!decode}
       work on strings without the prefix, whereas {!encode_label} and
       {!decode_label} handle the prefix themselves. *)
 88
 89val max_label_length : int
 90(** Maximum length of a domain label in bytes (63), per
 91    {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}.
       {!encode_label} fails with the [Label_too_long] reason when its result
       exceeds 63 bytes. *)
 92
 93(** {1 Case Flags for Mixed-Case Annotation}
 94
 95    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
 96     Appendix A} describes an optional mechanism for preserving case information
 97    through the encoding/decoding round-trip. This is useful when the original
 98    string's case should be recoverable.
 99
100    Note: Mixed-case annotation is not used by the ToASCII and ToUnicode
101    operations of IDNA. *)
102
103type case_flag =
104  | Uppercase  (** Case annotation: the character is suggested uppercase. *)
105  | Lowercase  (** Case annotation: the character is suggested lowercase. *)
106
107(** {1 Core Punycode Operations}
108
109    These functions implement the Bootstring algorithms from
110    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section
111     6}. They operate on arrays of Unicode code points ([Uchar.t array]). The
112    encoded output is a plain ASCII string without the ACE prefix. *)
113
114val encode : Uchar.t array -> string
115(** [encode codepoints] encodes an array of Unicode code points to a Punycode
116    ASCII string. The result does not include the ACE prefix; see
       {!encode_label} for DNS labels.
117
118    Implements the encoding procedure from
119    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492
120     Section 6.3}:
121
122    + Basic code points (ASCII < 128) are copied literally to the beginning
123      of the output per
124      {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}Section 3.1
125       (Basic code point segregation)}.
126    + A delimiter ('-') is appended if there are any basic code points.
127    + Non-basic code points are encoded as deltas using the generalized
128      variable-length integer representation from
         {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3}.
129
130    @raise Error on encoding failure (overflow, etc.)
131
132    Example (the [...] elides the remaining code points of the input):
133    {[
134      encode [| Uchar.of_int 0x4ED6; Uchar.of_int 0x4EEC; ... |]
135      (* = "ihqwcrb4cv8a8dqg056pqjye" *)
136    ]} *)
137
138val decode : string -> Uchar.t array
139(** [decode punycode] decodes a Punycode ASCII string to an array of Unicode
140    code points.
141
142    Implements the decoding procedure from
143    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
144     Section 6.2}.
145
146    The input should be the Punycode portion only, without the ACE prefix; see
       {!decode_label} for labels that carry the prefix. The
147    decoder is case-insensitive for the encoded portion, as required by
148    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
149     5}: "A decoder MUST recognize the letters in both uppercase and lowercase
150    forms".
151
152    @raise Error on decoding failure (invalid digit, unexpected end, etc.)
153
154    Example (the result is shown in U+ notation, with the tail elided):
155    {[
156      decode "ihqwcrb4cv8a8dqg056pqjye"
157      (* = [| U+4ED6; U+4EEC; U+4E3A; ... |] (Chinese simplified) *)
158    ]} *)
159
160(** {1 Mixed-Case Annotation}
161
162    These functions support round-trip case preservation as described in
163    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
164     Appendix A}. *)
165
166val encode_with_case : Uchar.t array -> case_flag array -> string
167(** [encode_with_case codepoints case_flags] encodes with case annotation.
168
169    Per
170    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
171     Appendix A}:
172    - For basic (ASCII) letters, the output preserves the case flag directly
173    - For non-ASCII characters, the case of the final digit in each delta
174      encoding indicates the flag (uppercase = suggested uppercase)
175
176    The [case_flags] array must have the same length as [codepoints]. See
       {!decode_with_case} for extracting the annotations when decoding.
177
178    @raise Invalid_argument if array lengths don't match.
179    @raise Error on encoding failure. *)
180
181val decode_with_case : string -> Uchar.t array * case_flag array
182(** [decode_with_case punycode] decodes and extracts case annotations.
183
184    Per
185    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
186     Appendix A}, returns both the decoded code points and an array of case
187    flags indicating the suggested case for each character based on the
188    uppercase/lowercase form of the encoding digits. See {!encode_with_case}
       for the encoding direction.
189
190    @raise Error on decoding failure. *)
191
192(** {1 UTF-8 String Operations}
193
194    Convenience functions that work directly with UTF-8 encoded OCaml strings.
195    These combine UTF-8 decoding/encoding with the core Punycode operations. *)
196
197val encode_utf8 : string -> string
198(** [encode_utf8 s] encodes a UTF-8 string to Punycode (no ACE prefix).
199
200    This is equivalent to decoding [s] from UTF-8 to code points, then calling
201    {!encode}.
202
203    @raise Error on encoding failure. NOTE(review): malformed UTF-8 in [s] is
       presumably reported with the [Invalid_utf8] reason; confirm against the
       implementation.
204
205    Example:
206    {[
207      encode_utf8 "münchen"
208      (* = "mnchen-3ya" *)
209    ]} *)
210
211val decode_utf8 : string -> string
212(** [decode_utf8 punycode] decodes a Punycode string, given without the ACE
       prefix, to a UTF-8 string.
213
214    This is equivalent to calling {!decode} then encoding the result as UTF-8.
215
216    @raise Error on decoding failure.
217
218    Example:
219    {[
220      decode_utf8 "mnchen-3ya"
221      (* = "münchen" *)
222    ]} *)
223
224(** {1 Domain Label Operations}
225
226    These functions handle the ACE prefix automatically and enforce DNS label
227    length limits per
228    {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
229
230val encode_label : string -> string
231(** [encode_label label] encodes a domain label for use in DNS.
       NOTE(review): [label] is presumably UTF-8 encoded, as for
       {!encode_utf8}; confirm against the implementation.
232
233    If the label contains only ASCII characters, it is returned unchanged.
234    Otherwise, it is Punycode-encoded with the ACE prefix ("xn--") prepended, as
235    specified in
236    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
237     5}.
238
239    @raise Error with {!Label_too_long} if the result exceeds 63 bytes.
240    @raise Error with {!Empty_label} if the label is empty.
241
242    Example:
243    {[
244      encode_label "münchen";;
245      (* = "xn--mnchen-3ya" *)
246      encode_label "example";;
247      (* = "example" *)
248    ]} *)
249
250val decode_label : string -> string
251(** [decode_label label] decodes a domain label.
252
253    If the label starts with the ACE prefix ("xn--", case-insensitive), it is
254    Punycode-decoded. Otherwise, it is returned unchanged. See
       {!has_ace_prefix} for the prefix test on its own.
255
256    @raise Error on decoding failure.
257
258    Example:
259    {[
260      decode_label "xn--mnchen-3ya";;
261      (* = "münchen" *)
262      decode_label "example";;
263      (* = "example" *)
264    ]} *)
265
266(** {1 Validation}
267
268    Predicate functions for checking code point and string properties. *)
269
270val is_basic : Uchar.t -> bool
271(** [is_basic u] is [true] if [u] is a basic code point (ASCII, < 128).
272
273    Per
274    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
275     5}, basic code points for Punycode are the ASCII code points
       (U+0000..U+007F). See {!is_ascii_string} for the whole-string test. *)
276
277val is_ascii_string : string -> bool
278(** [is_ascii_string s] is [true] if [s] contains only ASCII characters (all
279    bytes < 128). Per its documentation, {!encode_label} returns ASCII-only
       labels unchanged. *)
280
281val has_ace_prefix : string -> bool
282(** [has_ace_prefix s] is [true] if [s] starts with the ACE prefix "xn--"
283    (case-insensitive comparison); see {!ace_prefix}. {!decode_label} documents
       the same condition for deciding whether to Punycode-decode a label. *)