Serenity Operating System
1/*
2 * Copyright (c) 2022, mat
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include <AK/Find.h>
8#include <AK/QuickSort.h>
9#include <AK/Utf8View.h>
10#include <AK/Vector.h>
11#include <LibUnicode/CharacterTypes.h>
12#include <LibUnicode/Normalize.h>
13
14#if ENABLE_UNICODE_DATA
15# include <LibUnicode/UnicodeData.h>
16#else
17struct Unicode::CodePointDecomposition { };
18#endif
19
20namespace Unicode {
21
22Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition(u32) { return {}; }
23Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition_by_index(size_t) { return {}; }
24
25NormalizationForm normalization_form_from_string(StringView form)
26{
27 if (form == "NFD"sv)
28 return NormalizationForm::NFD;
29 if (form == "NFC"sv)
30 return NormalizationForm::NFC;
31 if (form == "NFKD"sv)
32 return NormalizationForm::NFKD;
33 if (form == "NFKC"sv)
34 return NormalizationForm::NFKC;
35 VERIFY_NOT_REACHED();
36}
37
38StringView normalization_form_to_string(NormalizationForm form)
39{
40 switch (form) {
41 case NormalizationForm::NFD:
42 return "NFD"sv;
43 case NormalizationForm::NFC:
44 return "NFC"sv;
45 case NormalizationForm::NFKD:
46 return "NFKD"sv;
47 case NormalizationForm::NFKC:
48 return "NFKC"sv;
49 }
50 VERIFY_NOT_REACHED();
51}
52
53ALWAYS_INLINE static bool is_starter(u32 code_point)
54{
55 return Unicode::canonical_combining_class(code_point) == 0;
56}
57
58// From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
59static constexpr u32 HANGUL_SYLLABLE_BASE = 0xAC00;
60static constexpr u32 HANGUL_LEADING_BASE = 0x1100;
61static constexpr u32 HANGUL_VOWEL_BASE = 0x1161;
62static constexpr u32 HANGUL_TRAILING_BASE = 0x11A7;
63static constexpr u32 HANGUL_LEADING_COUNT = 19;
64static constexpr u32 HANGUL_VOWEL_COUNT = 21;
65static constexpr u32 HANGUL_TRAILING_COUNT = 28;
66// NCount in the standard.
67static constexpr u32 HANGUL_BLOCK_COUNT = HANGUL_VOWEL_COUNT * HANGUL_TRAILING_COUNT;
68static constexpr u32 HANGUL_SYLLABLE_COUNT = HANGUL_LEADING_COUNT * HANGUL_BLOCK_COUNT;
69
70ALWAYS_INLINE static bool is_hangul_code_point(u32 code_point)
71{
72 return code_point >= HANGUL_SYLLABLE_BASE && code_point < HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT;
73}
74
75ALWAYS_INLINE static bool is_hangul_leading(u32 code_point)
76{
77 return code_point >= HANGUL_LEADING_BASE && code_point < HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT;
78}
79
80ALWAYS_INLINE static bool is_hangul_vowel(u32 code_point)
81{
82 return code_point >= HANGUL_VOWEL_BASE && code_point < HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT;
83}
84
85ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point)
86{
87 return code_point >= HANGUL_TRAILING_BASE && code_point < HANGUL_TRAILING_BASE + HANGUL_TRAILING_COUNT;
88}
89
90// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
91static ErrorOr<void> decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output)
92{
93 auto const index = code_point - HANGUL_SYLLABLE_BASE;
94
95 auto const leading_index = index / HANGUL_BLOCK_COUNT;
96 auto const vowel_index = (index % HANGUL_BLOCK_COUNT) / HANGUL_TRAILING_COUNT;
97 auto const trailing_index = index % HANGUL_TRAILING_COUNT;
98
99 auto const leading_part = HANGUL_LEADING_BASE + leading_index;
100 auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index;
101 auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index;
102
103 TRY(code_points_output.try_append(leading_part));
104 TRY(code_points_output.try_append(vowel_part));
105 if (trailing_index != 0)
106 TRY(code_points_output.try_append(trailing_part));
107
108 return {};
109}
110
111// L, V and LV, T Hangul Syllable Composition
112// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G59688
113static u32 combine_hangul_code_points(u32 a, u32 b)
114{
115 if (is_hangul_leading(a) && is_hangul_vowel(b)) {
116 auto const leading_index = a - HANGUL_LEADING_BASE;
117 auto const vowel_index = b - HANGUL_VOWEL_BASE;
118 auto const leading_vowel_index = leading_index * HANGUL_BLOCK_COUNT + vowel_index * HANGUL_TRAILING_COUNT;
119 return HANGUL_SYLLABLE_BASE + leading_vowel_index;
120 }
121 // LV characters are the first in each "T block", so use this check to avoid combining LVT with T.
122 if (is_hangul_code_point(a) && (a - HANGUL_SYLLABLE_BASE) % HANGUL_TRAILING_COUNT == 0 && is_hangul_trailing(b)) {
123 return a + b - HANGUL_TRAILING_BASE;
124 }
125 return 0;
126}
127
128static u32 combine_code_points([[maybe_unused]] u32 a, [[maybe_unused]] u32 b)
129{
130#if ENABLE_UNICODE_DATA
131 Array<u32, 2> const points { a, b };
132
133 // FIXME: Do something better than linear search to find reverse mappings.
134 for (size_t index = 0;; ++index) {
135 auto mapping_maybe = Unicode::code_point_decomposition_by_index(index);
136 if (!mapping_maybe.has_value())
137 break;
138 auto& mapping = mapping_maybe.value();
139 if (mapping.tag == CompatibilityFormattingTag::Canonical && mapping.decomposition == points) {
140 if (code_point_has_property(mapping.code_point, Property::Full_Composition_Exclusion))
141 continue;
142 return mapping.code_point;
143 }
144 }
145#endif
146
147 return 0;
148}
149
150enum class UseCompatibility {
151 Yes,
152 No
153};
154
155static ErrorOr<void> decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility)
156{
157 if (is_hangul_code_point(code_point))
158 return decompose_hangul_code_point(code_point, code_points_output);
159
160#if ENABLE_UNICODE_DATA
161 auto const mapping = Unicode::code_point_decomposition(code_point);
162 if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes)) {
163 for (auto code_point : mapping->decomposition) {
164 TRY(decompose_code_point(code_point, code_points_output, use_compatibility));
165 }
166 } else {
167 TRY(code_points_output.try_append(code_point));
168 }
169#endif
170
171 return {};
172}
173
174// This can be any sorting algorithm that maintains order (like std::stable_sort),
175// however bubble sort is easier to implement, so go with it (for now).
176template<typename T, typename LessThan>
177void bubble_sort(Span<T> span, LessThan less_than)
178{
179 for (size_t i = 0; i < span.size() - 1; ++i) {
180 for (size_t j = 0; j < span.size() - 1 - i; ++j) {
181 if (!less_than(span[j], span[j + 1]))
182 swap(span[j], span[j + 1]);
183 }
184 }
185}
186
187// The Canonical Ordering Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
188// See Section 3.11, D109; and UAX #15 https://unicode.org/reports/tr15
189// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G49591
190static void canonical_ordering_algorithm(Span<u32> code_points)
191{
192 for (size_t i = 0; i < code_points.size(); ++i) {
193 if (!is_starter(code_points[i])) {
194 auto starter = find_if(code_points.begin() + i, code_points.end(), is_starter);
195 auto const span_size = static_cast<size_t>(starter - (code_points.begin() + i));
196 // Nothing to reorder, so continue.
197 if (span_size <= 1)
198 continue;
199 Span<u32> const span { code_points.data() + i, span_size };
200
201 bubble_sort(span, [](u32 a, u32 b) {
202 // Use <= to keep ordering.
203 return Unicode::canonical_combining_class(a) <= Unicode::canonical_combining_class(b);
204 });
205
206 // Skip over span we just sorted.
207 i += span_size - 1;
208 }
209 }
210}
211
212// See Section 3.11, D115 of Version 15.0.0 of the Unicode Standard.
213static bool is_blocked(Span<u32> code_points, size_t a, size_t c)
214{
215 if (!is_starter(code_points[a]) || a == c - 1)
216 return false;
217 auto const c_combining_class = Unicode::canonical_combining_class(code_points[c]);
218 auto const b_combining_class = Unicode::canonical_combining_class(code_points[c - 1]);
219 return b_combining_class == 0 || b_combining_class >= c_combining_class;
220}
221
222// The Canonical Composition Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
223// See Section 3.11, D117; and UAX #15 https://unicode.org/reports/tr15
224// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50628
225static void canonical_composition_algorithm(Vector<u32>& code_points)
226{
227 for (size_t i = 1; i < code_points.size(); ++i) {
228 auto const current_character = code_points[i];
229 // R1. Seek back (left) to find the last Starter L preceding C in the character sequence
230 for (ssize_t j = i - 1; j >= 0; --j) {
231 if (!is_starter(code_points[j]))
232 continue;
233 // R2. If there is such an L, and C is not blocked from L,
234 // and there exists a Primary Composite P which is canonically equivalent to <L, C>,
235 // then replace L by P in the sequence and delete C from the sequence.
236 if (is_blocked(code_points.span(), j, i))
237 continue;
238
239 auto composite = combine_hangul_code_points(code_points[j], current_character);
240
241 if (composite == 0)
242 composite = combine_code_points(code_points[j], current_character);
243
244 if (composite != 0) {
245 code_points[j] = composite;
246 code_points.remove(i);
247 --i;
248 break;
249 }
250 }
251 }
252}
253
254static ErrorOr<Vector<u32>> normalize_nfd(Utf8View string)
255{
256 Vector<u32> result;
257 for (auto const code_point : string)
258 TRY(decompose_code_point(code_point, result, UseCompatibility::No));
259
260 canonical_ordering_algorithm(result);
261 return result;
262}
263
264static ErrorOr<Vector<u32>> normalize_nfc(Utf8View string)
265{
266 auto result = TRY(normalize_nfd(string));
267 canonical_composition_algorithm(result);
268
269 return result;
270}
271
272static ErrorOr<Vector<u32>> normalize_nfkd(Utf8View string)
273{
274 Vector<u32> result;
275 for (auto const code_point : string)
276 TRY(decompose_code_point(code_point, result, UseCompatibility::Yes));
277
278 canonical_ordering_algorithm(result);
279 return result;
280}
281
282static ErrorOr<Vector<u32>> normalize_nfkc(Utf8View string)
283{
284 auto result = TRY(normalize_nfkd(string));
285 canonical_composition_algorithm(result);
286
287 return result;
288}
289
290static ErrorOr<Vector<u32>> normalize_implementation(Utf8View string, NormalizationForm form)
291{
292 switch (form) {
293 case NormalizationForm::NFD:
294 return normalize_nfd(string);
295 case NormalizationForm::NFC:
296 return normalize_nfc(string);
297 case NormalizationForm::NFKD:
298 return normalize_nfkd(string);
299 case NormalizationForm::NFKC:
300 return normalize_nfkc(string);
301 }
302 VERIFY_NOT_REACHED();
303}
304
305ErrorOr<String> normalize(StringView string, NormalizationForm form)
306{
307 auto const code_points = TRY(normalize_implementation(Utf8View { string }, form));
308
309 StringBuilder builder;
310 for (auto code_point : code_points)
311 TRY(builder.try_append_code_point(code_point));
312
313 return builder.to_string();
314}
315
316}