lib/punycode.mli at main · gazagnaire.org/ocaml-punycode

gazagnaire.org / ocaml-punycode
forked from anil.recoil.org/ocaml-punycode
fork atom
Punycode (RFC3492) in OCaml
fork atom
ocaml-punycode / lib / punycode.mli
at main 284 lines 11 kB view raw
wrap content
  1(*---------------------------------------------------------------------------
  2  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
  3  SPDX-License-Identifier: ISC
  4 ---------------------------------------------------------------------------*)
  5
  6(** RFC 3492 Punycode: A Bootstring encoding of Unicode for IDNA.
  7
  8    This module implements the Punycode algorithm as specified in
  9    {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}, providing
 10    encoding and decoding of Unicode strings to/from ASCII-compatible encoding
 11    suitable for use in internationalized domain names.
 12
 13    Punycode is an instance of Bootstring that uses particular parameter values
 14    appropriate for IDNA. See
 15    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
 16     5} for the specific parameter values.
 17
 18    {2 References}
 19    - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A
 20      Bootstring encoding of Unicode for IDNA
 21    - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol
 22*)
 23
 24(** {1 Position Tracking} *)
 25
 26type position
 27(** Abstract type representing a position in the input, for error reporting.
 28    A position tracks both a byte offset (see {!position_byte_offset}) and a
       Unicode character index (see {!position_char_index}); {!pp_position}
       prints one. *)
 29
 30val position_byte_offset : position -> int
 31(** [position_byte_offset pos] returns the byte offset in the input that [pos]
       refers to. This is the byte component of the ["byte N, char M"] rendering
       produced by {!pp_position}. *)
 32
 33val position_char_index : position -> int
 34(** [position_char_index pos] returns the Unicode character index (0-based)
       that [pos] refers to. This is the character component of the
       ["byte N, char M"] rendering produced by {!pp_position}. *)
 35
 36val pp_position : Format.formatter -> position -> unit
 37(** [pp_position fmt pos] pretty-prints [pos] on [fmt] as ["byte N, char M"].
       Its signature makes it usable with the [%a] conversion of [Format]. *)
 38
 39(** {1 Error Types} *)
 40
 41type error_reason =
 42  | Overflow of position
 43      (** Arithmetic overflow during encode/decode. This can occur with very
 44          long strings or extreme Unicode code point values. See
 45          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4} RFC 3492
 46           Section 6.4} for overflow handling requirements. *)
 47  | Invalid_character of position * Uchar.t
 48      (** A non-basic code point appeared where only basic code points (ASCII <
 49          128) are allowed. Per
 50          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1} RFC 3492
 51           Section 3.1}, basic code points must be segregated at the beginning
 52          of the encoded string. *)
 53  | Invalid_digit of position * char
 54      (** An invalid Punycode digit was encountered during decoding. Valid
 55          digits are a-z, A-Z (values 0-25) and 0-9 (values 26-35). See
 56          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492
 57           Section 5} for digit-value mappings. *)
 58  | Unexpected_end of position
 59      (** The input ended prematurely during decoding of a delta value. See
 60          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2} RFC 3492
 61           Section 6.2} for the decoding procedure. *)
 62  | Invalid_utf8 of position  (** Malformed UTF-8 sequence in input string. *)
 63  | Label_too_long of int
 64      (** Encoded label exceeds 63 bytes (see {!max_label_length}; DNS limit per
 65          {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}). The int
 66          is the actual length. *)
 67  | Empty_label  (** Empty label is not valid for encoding. *)
   (** The reason a Punycode operation failed. Values of this type are carried by
       the {!Error} exception and can be rendered with {!pp_error_reason} or
       {!error_reason_to_string}. Constructors that relate to a specific place
       in the input carry a {!position}. *)
 68
 69exception Error of error_reason
 70(** Exception raised for all Punycode encoding/decoding errors. The
       {!error_reason} payload says what went wrong; it can be rendered with
       {!pp_error_reason} or {!error_reason_to_string}. *)
 71
 72val pp_error_reason : Format.formatter -> error_reason -> unit
 73(** [pp_error_reason fmt e] pretty-prints the error [e] on [fmt], with position
       information. See {!error_reason_to_string} for a variant that returns a
       string instead of printing. *)
 74
 75val error_reason_to_string : error_reason -> string
 76(** [error_reason_to_string e] converts the error [e] to a human-readable
       string. See {!pp_error_reason} for the [Format]-based printer. *)
 77
 78(** {1 Constants}
 79
 80    Punycode parameters as specified in
 81    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
 82     5}. *)
 83
 84val ace_prefix : string
 85(** [ace_prefix] is the ACE prefix ["xn--"] used for Punycode-encoded domain
 86    labels. See
 87    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section
 88     5}, which notes that IDNA prepends a prefix to the Punycode output; the
       prefix value itself is defined by IDNA (see
       {{:https://datatracker.ietf.org/doc/html/rfc5890#section-2.3.2.1}RFC 5890
        Section 2.3.2.1}).

       {!encode_label} prepends this prefix to the labels it encodes, and
       {!has_ace_prefix} tests for it. *)
 89
 90val max_label_length : int
 91(** Maximum length of a domain label in bytes (63), per
 92    {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. {!encode_label}
       raises {!Error} with {!Label_too_long} when its result exceeds this
       limit. *)
 93
 94(** {1 Case Flags for Mixed-Case Annotation}
 95
 96    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
 97     Appendix A} describes an optional mechanism for preserving case information
 98    through the encoding/decoding round-trip. This is useful when the original
 99    string's case should be recoverable.
100
101    Note: Mixed-case annotation is not used by the ToASCII and ToUnicode
102    operations of IDNA. *)
103
104type case_flag =
105  | Uppercase  (** The character is flagged as (suggested) uppercase. *)
106  | Lowercase  (** The character is flagged as (suggested) lowercase. *)
   (** Case annotation for a character, as consumed by {!encode_with_case} and
       produced by {!decode_with_case}. *)
107
108(** {1 Core Punycode Operations}
109
110    These functions implement the Bootstring algorithms from
111    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section
112     6}. They operate on arrays of Unicode code points ([Uchar.t array]). The
113    encoded output is a plain ASCII string without the ACE prefix. *)
114
115val encode : Uchar.t array -> string
116(** [encode codepoints] encodes an array of Unicode code points to a Punycode
117    ASCII string. The result carries no ACE prefix; see {!encode_label} for the
       prefixed, DNS-ready form.
118
119    Implements the encoding procedure from
120    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492
121     Section 6.3}:
122
123    {ol
124     {- Basic code points (ASCII < 128) are copied literally to the beginning
125        of the output per
126        {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1} Section
127         3.1 (Basic code point segregation)}.}
128     {- A delimiter ('-') is appended if there are any basic code points.}
129     {- Non-basic code points are encoded as deltas using the generalized
           variable-length integer representation from
           {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section
            3.3}.}}
130
131    @raise Error on encoding failure (overflow, etc.)
132
133    Example:
134    {[
135      encode [| Uchar.of_int 0x4ED6; Uchar.of_int 0x4EEC; ... |]
136      (* = "ihqwcrb4cv8a8dqg056pqjye" *)
137    ]} *)
138
139val decode : string -> Uchar.t array
140(** [decode punycode] decodes a Punycode ASCII string to an array of Unicode
141    code points.
142
143    Implements the decoding procedure from
144    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
145     Section 6.2}.
146
147    The input should be the Punycode portion only, without the ACE prefix (use
       {!decode_label} for a label that still carries the prefix). The
148    decoder is case-insensitive for the encoded portion, as required by
149    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
150     5}: "A decoder MUST recognize the letters in both uppercase and lowercase
151    forms".
152
153    @raise Error on decoding failure (invalid digit, unexpected end, etc.)
154
155    Example:
156    {[
157      decode "ihqwcrb4cv8a8dqg056pqjye"
158      (* = [| U+4ED6; U+4EEC; U+4E3A; ... |] (Chinese simplified) *)
159    ]} *)
160
161(** {1 Mixed-Case Annotation}
162
163    These functions support round-trip case preservation as described in
164    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
165     Appendix A}. *)
166
167val encode_with_case : Uchar.t array -> case_flag array -> string
168(** [encode_with_case codepoints case_flags] encodes [codepoints] to Punycode
       while recording the case annotations given in [case_flags].
169
170    Per
171    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
172     Appendix A}:
173    - For basic (ASCII) letters, the output preserves the case flag directly
174    - For non-ASCII characters, the case of the final digit in each delta
175      encoding indicates the flag (uppercase = suggested uppercase)
176
177    The [case_flags] array must have the same length as [codepoints], i.e. it
       supplies one {!case_flag} per code point.
178
179    @raise Invalid_argument if array lengths don't match.
180    @raise Error on encoding failure. *)
181
182val decode_with_case : string -> Uchar.t array * case_flag array
183(** [decode_with_case punycode] decodes [punycode] and extracts its case
       annotations. It is the decoding counterpart of {!encode_with_case}.
184
185    Per
186    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
187     Appendix A}, returns both the decoded code points and an array of case
188    flags indicating the suggested case for each character based on the
189    uppercase/lowercase form of the encoding digits.
190
191    @raise Error on decoding failure. *)
192
193(** {1 UTF-8 String Operations}
194
195    Convenience functions that work directly with UTF-8 encoded OCaml strings.
196    These combine UTF-8 decoding/encoding with the core Punycode operations. *)
197
198val encode_utf8 : string -> string
199(** [encode_utf8 s] encodes a UTF-8 string to Punycode (no ACE prefix).
200
201    This is equivalent to decoding [s] from UTF-8 to code points, then calling
202    {!encode}. Use {!encode_label} instead when the ACE prefix and the DNS
       label length check are wanted.
203
204    @raise Error on encoding failure.
205
206    Example:
207    {[
208      encode_utf8 "münchen"
209      (* = "mnchen-3ya" *)
210    ]} *)
211
212val decode_utf8 : string -> string
213(** [decode_utf8 punycode] decodes Punycode to a UTF-8 string. The input must
       not carry the ACE prefix; use {!decode_label} for prefixed labels.
214
215    This is equivalent to calling {!decode} then encoding the result as UTF-8.
216
217    @raise Error on decoding failure.
218
219    Example:
220    {[
221      decode_utf8 "mnchen-3ya"
222      (* = "münchen" *)
223    ]} *)
224
225(** {1 Domain Label Operations}
226
227    These functions handle the ACE prefix automatically and enforce DNS label
228    length limits per
229    {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
230
231val encode_label : string -> string
232(** [encode_label label] encodes a domain label for use in DNS.
233
234    If the label contains only ASCII characters, it is returned unchanged.
235    Otherwise, it is Punycode-encoded with the ACE prefix ("xn--") prepended, as
236    specified in
237    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section
238     5}.
239
240    @raise Error with {!Label_too_long} if the result exceeds 63 bytes.
241    @raise Error with {!Empty_label} if the label is empty.
242
243    Examples:
244    {[
245      encode_label "münchen";;
246      (* = "xn--mnchen-3ya" *)
247      encode_label "example"
248      (* = "example" *)
249    ]} *)
250
251val decode_label : string -> string
252(** [decode_label label] decodes a domain label.
253
254    If the label starts with the ACE prefix ("xn--", case-insensitive), it is
255    Punycode-decoded. Otherwise, it is returned unchanged.
256
257    @raise Error on decoding failure.
258
259    Examples:
260    {[
261      decode_label "xn--mnchen-3ya";;
262      (* = "münchen" *)
263      decode_label "example"
264      (* = "example" *)
265    ]} *)
266
267(** {1 Validation}
268
269    Predicate functions for checking code point and string properties. *)
270
271val is_basic : Uchar.t -> bool
272(** [is_basic u] is [true] if [u] is a basic code point (ASCII, < 128).
273
274    Per
275    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
276     5}, basic code points for Punycode are the ASCII code points (0..7F).

       For example, [is_basic (Uchar.of_int 0x61)] is [true] and
       [is_basic (Uchar.of_int 0xFC)] is [false]. *)
277
278val is_ascii_string : string -> bool
279(** [is_ascii_string s] is [true] if [s] contains only ASCII characters (all
280    bytes < 128). {!encode_label} is documented to return such labels
       unchanged. *)
281
282val has_ace_prefix : string -> bool
283(** [has_ace_prefix s] is [true] if [s] starts with the ACE prefix "xn--"
284    (case-insensitive comparison). See {!ace_prefix} for the prefix value; this
       is the same condition under which {!decode_label} is documented to
       perform Punycode decoding. *)