Serenity Operating System
at master 316 lines 11 kB view raw
1/* 2 * Copyright (c) 2022, mat 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <AK/Find.h> 8#include <AK/QuickSort.h> 9#include <AK/Utf8View.h> 10#include <AK/Vector.h> 11#include <LibUnicode/CharacterTypes.h> 12#include <LibUnicode/Normalize.h> 13 14#if ENABLE_UNICODE_DATA 15# include <LibUnicode/UnicodeData.h> 16#else 17struct Unicode::CodePointDecomposition { }; 18#endif 19 20namespace Unicode { 21 22Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition(u32) { return {}; } 23Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition_by_index(size_t) { return {}; } 24 25NormalizationForm normalization_form_from_string(StringView form) 26{ 27 if (form == "NFD"sv) 28 return NormalizationForm::NFD; 29 if (form == "NFC"sv) 30 return NormalizationForm::NFC; 31 if (form == "NFKD"sv) 32 return NormalizationForm::NFKD; 33 if (form == "NFKC"sv) 34 return NormalizationForm::NFKC; 35 VERIFY_NOT_REACHED(); 36} 37 38StringView normalization_form_to_string(NormalizationForm form) 39{ 40 switch (form) { 41 case NormalizationForm::NFD: 42 return "NFD"sv; 43 case NormalizationForm::NFC: 44 return "NFC"sv; 45 case NormalizationForm::NFKD: 46 return "NFKD"sv; 47 case NormalizationForm::NFKC: 48 return "NFKC"sv; 49 } 50 VERIFY_NOT_REACHED(); 51} 52 53ALWAYS_INLINE static bool is_starter(u32 code_point) 54{ 55 return Unicode::canonical_combining_class(code_point) == 0; 56} 57 58// From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669 59static constexpr u32 HANGUL_SYLLABLE_BASE = 0xAC00; 60static constexpr u32 HANGUL_LEADING_BASE = 0x1100; 61static constexpr u32 HANGUL_VOWEL_BASE = 0x1161; 62static constexpr u32 HANGUL_TRAILING_BASE = 0x11A7; 63static constexpr u32 HANGUL_LEADING_COUNT = 19; 64static constexpr u32 HANGUL_VOWEL_COUNT = 21; 65static constexpr u32 HANGUL_TRAILING_COUNT = 28; 66// NCount in the standard. 67static constexpr u32 HANGUL_BLOCK_COUNT = HANGUL_VOWEL_COUNT * HANGUL_TRAILING_COUNT; 68static constexpr u32 HANGUL_SYLLABLE_COUNT = HANGUL_LEADING_COUNT * HANGUL_BLOCK_COUNT; 69 70ALWAYS_INLINE static bool is_hangul_code_point(u32 code_point) 71{ 72 return code_point >= HANGUL_SYLLABLE_BASE && code_point < HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT; 73} 74 75ALWAYS_INLINE static bool is_hangul_leading(u32 code_point) 76{ 77 return code_point >= HANGUL_LEADING_BASE && code_point < HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT; 78} 79 80ALWAYS_INLINE static bool is_hangul_vowel(u32 code_point) 81{ 82 return code_point >= HANGUL_VOWEL_BASE && code_point < HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT; 83} 84 85ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point) 86{ 87 return code_point >= HANGUL_TRAILING_BASE && code_point < HANGUL_TRAILING_BASE + HANGUL_TRAILING_COUNT; 88} 89 90// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669 91static ErrorOr<void> decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output) 92{ 93 auto const index = code_point - HANGUL_SYLLABLE_BASE; 94 95 auto const leading_index = index / HANGUL_BLOCK_COUNT; 96 auto const vowel_index = (index % HANGUL_BLOCK_COUNT) / HANGUL_TRAILING_COUNT; 97 auto const trailing_index = index % HANGUL_TRAILING_COUNT; 98 99 auto const leading_part = HANGUL_LEADING_BASE + leading_index; 100 auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index; 101 auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index; 102 103 TRY(code_points_output.try_append(leading_part)); 104 TRY(code_points_output.try_append(vowel_part)); 105 if (trailing_index != 0) 106 TRY(code_points_output.try_append(trailing_part)); 107 108 return {}; 109} 110 111// L, V and LV, T Hangul Syllable Composition 112// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G59688 113static u32 combine_hangul_code_points(u32 a, u32 b) 114{ 115 if (is_hangul_leading(a) && is_hangul_vowel(b)) { 116 auto const leading_index = a - HANGUL_LEADING_BASE; 117 auto const vowel_index = b - HANGUL_VOWEL_BASE; 118 auto const leading_vowel_index = leading_index * HANGUL_BLOCK_COUNT + vowel_index * HANGUL_TRAILING_COUNT; 119 return HANGUL_SYLLABLE_BASE + leading_vowel_index; 120 } 121 // LV characters are the first in each "T block", so use this check to avoid combining LVT with T. 122 if (is_hangul_code_point(a) && (a - HANGUL_SYLLABLE_BASE) % HANGUL_TRAILING_COUNT == 0 && is_hangul_trailing(b)) { 123 return a + b - HANGUL_TRAILING_BASE; 124 } 125 return 0; 126} 127 128static u32 combine_code_points([[maybe_unused]] u32 a, [[maybe_unused]] u32 b) 129{ 130#if ENABLE_UNICODE_DATA 131 Array<u32, 2> const points { a, b }; 132 133 // FIXME: Do something better than linear search to find reverse mappings. 134 for (size_t index = 0;; ++index) { 135 auto mapping_maybe = Unicode::code_point_decomposition_by_index(index); 136 if (!mapping_maybe.has_value()) 137 break; 138 auto& mapping = mapping_maybe.value(); 139 if (mapping.tag == CompatibilityFormattingTag::Canonical && mapping.decomposition == points) { 140 if (code_point_has_property(mapping.code_point, Property::Full_Composition_Exclusion)) 141 continue; 142 return mapping.code_point; 143 } 144 } 145#endif 146 147 return 0; 148} 149 150enum class UseCompatibility { 151 Yes, 152 No 153}; 154 155static ErrorOr<void> decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility) 156{ 157 if (is_hangul_code_point(code_point)) 158 return decompose_hangul_code_point(code_point, code_points_output); 159 160#if ENABLE_UNICODE_DATA 161 auto const mapping = Unicode::code_point_decomposition(code_point); 162 if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes)) { 163 for (auto code_point : mapping->decomposition) { 164 TRY(decompose_code_point(code_point, code_points_output, use_compatibility)); 165 } 166 } else { 167 TRY(code_points_output.try_append(code_point)); 168 } 169#endif 170 171 return {}; 172} 173 174// This can be any sorting algorithm that maintains order (like std::stable_sort), 175// however bubble sort is easier to implement, so go with it (for now). 176template<typename T, typename LessThan> 177void bubble_sort(Span<T> span, LessThan less_than) 178{ 179 for (size_t i = 0; i < span.size() - 1; ++i) { 180 for (size_t j = 0; j < span.size() - 1 - i; ++j) { 181 if (!less_than(span[j], span[j + 1])) 182 swap(span[j], span[j + 1]); 183 } 184 } 185} 186 187// The Canonical Ordering Algorithm, as specified in Version 15.0.0 of the Unicode Standard. 188// See Section 3.11, D109; and UAX #15 https://unicode.org/reports/tr15 189// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G49591 190static void canonical_ordering_algorithm(Span<u32> code_points) 191{ 192 for (size_t i = 0; i < code_points.size(); ++i) { 193 if (!is_starter(code_points[i])) { 194 auto starter = find_if(code_points.begin() + i, code_points.end(), is_starter); 195 auto const span_size = static_cast<size_t>(starter - (code_points.begin() + i)); 196 // Nothing to reorder, so continue. 197 if (span_size <= 1) 198 continue; 199 Span<u32> const span { code_points.data() + i, span_size }; 200 201 bubble_sort(span, [](u32 a, u32 b) { 202 // Use <= to keep ordering. 203 return Unicode::canonical_combining_class(a) <= Unicode::canonical_combining_class(b); 204 }); 205 206 // Skip over span we just sorted. 207 i += span_size - 1; 208 } 209 } 210} 211 212// See Section 3.11, D115 of Version 15.0.0 of the Unicode Standard. 213static bool is_blocked(Span<u32> code_points, size_t a, size_t c) 214{ 215 if (!is_starter(code_points[a]) || a == c - 1) 216 return false; 217 auto const c_combining_class = Unicode::canonical_combining_class(code_points[c]); 218 auto const b_combining_class = Unicode::canonical_combining_class(code_points[c - 1]); 219 return b_combining_class == 0 || b_combining_class >= c_combining_class; 220} 221 222// The Canonical Composition Algorithm, as specified in Version 15.0.0 of the Unicode Standard. 223// See Section 3.11, D117; and UAX #15 https://unicode.org/reports/tr15 224// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50628 225static void canonical_composition_algorithm(Vector<u32>& code_points) 226{ 227 for (size_t i = 1; i < code_points.size(); ++i) { 228 auto const current_character = code_points[i]; 229 // R1. Seek back (left) to find the last Starter L preceding C in the character sequence 230 for (ssize_t j = i - 1; j >= 0; --j) { 231 if (!is_starter(code_points[j])) 232 continue; 233 // R2. If there is such an L, and C is not blocked from L, 234 // and there exists a Primary Composite P which is canonically equivalent to <L, C>, 235 // then replace L by P in the sequence and delete C from the sequence. 236 if (is_blocked(code_points.span(), j, i)) 237 continue; 238 239 auto composite = combine_hangul_code_points(code_points[j], current_character); 240 241 if (composite == 0) 242 composite = combine_code_points(code_points[j], current_character); 243 244 if (composite != 0) { 245 code_points[j] = composite; 246 code_points.remove(i); 247 --i; 248 break; 249 } 250 } 251 } 252} 253 254static ErrorOr<Vector<u32>> normalize_nfd(Utf8View string) 255{ 256 Vector<u32> result; 257 for (auto const code_point : string) 258 TRY(decompose_code_point(code_point, result, UseCompatibility::No)); 259 260 canonical_ordering_algorithm(result); 261 return result; 262} 263 264static ErrorOr<Vector<u32>> normalize_nfc(Utf8View string) 265{ 266 auto result = TRY(normalize_nfd(string)); 267 canonical_composition_algorithm(result); 268 269 return result; 270} 271 272static ErrorOr<Vector<u32>> normalize_nfkd(Utf8View string) 273{ 274 Vector<u32> result; 275 for (auto const code_point : string) 276 TRY(decompose_code_point(code_point, result, UseCompatibility::Yes)); 277 278 canonical_ordering_algorithm(result); 279 return result; 280} 281 282static ErrorOr<Vector<u32>> normalize_nfkc(Utf8View string) 283{ 284 auto result = TRY(normalize_nfkd(string)); 285 canonical_composition_algorithm(result); 286 287 return result; 288} 289 290static ErrorOr<Vector<u32>> normalize_implementation(Utf8View string, NormalizationForm form) 291{ 292 switch (form) { 293 case NormalizationForm::NFD: 294 return normalize_nfd(string); 295 case NormalizationForm::NFC: 296 return normalize_nfc(string); 297 case NormalizationForm::NFKD: 298 return normalize_nfkd(string); 299 case NormalizationForm::NFKC: 300 return normalize_nfkc(string); 301 } 302 VERIFY_NOT_REACHED(); 303} 304 305ErrorOr<String> normalize(StringView string, NormalizationForm form) 306{ 307 auto const code_points = TRY(normalize_implementation(Utf8View { string }, form)); 308 309 StringBuilder builder; 310 for (auto code_point : code_points) 311 TRY(builder.try_append_code_point(code_point)); 312 313 return builder.to_string(); 314} 315 316}