Serenity Operating System
at master 1140 lines 46 kB view raw
1/* 2 * Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include "../LibUnicode/GeneratorUtil.h" // FIXME: Move this somewhere common. 8#include <AK/AllOf.h> 9#include <AK/Array.h> 10#include <AK/CharacterTypes.h> 11#include <AK/DeprecatedString.h> 12#include <AK/Find.h> 13#include <AK/Format.h> 14#include <AK/HashFunctions.h> 15#include <AK/HashMap.h> 16#include <AK/JsonObject.h> 17#include <AK/JsonParser.h> 18#include <AK/JsonValue.h> 19#include <AK/LexicalPath.h> 20#include <AK/QuickSort.h> 21#include <AK/SourceGenerator.h> 22#include <AK/StringBuilder.h> 23#include <AK/Traits.h> 24#include <AK/Utf8View.h> 25#include <LibCore/ArgsParser.h> 26#include <LibCore/DeprecatedFile.h> 27#include <LibCore/DirIterator.h> 28#include <LibJS/Runtime/Intl/SingleUnitIdentifiers.h> 29#include <LibLocale/Locale.h> 30#include <LibLocale/NumberFormat.h> 31#include <LibLocale/PluralRules.h> 32#include <math.h> 33 34enum class NumberFormatType { 35 Standard, 36 Compact, 37}; 38 39struct NumberFormat : public Locale::NumberFormat { 40 using Base = Locale::NumberFormat; 41 42 unsigned hash() const 43 { 44 auto hash = pair_int_hash(magnitude, exponent); 45 hash = pair_int_hash(hash, to_underlying(plurality)); 46 hash = pair_int_hash(hash, zero_format_index); 47 hash = pair_int_hash(hash, positive_format_index); 48 hash = pair_int_hash(hash, negative_format_index); 49 50 for (auto index : identifier_indices) 51 hash = pair_int_hash(hash, index); 52 53 return hash; 54 } 55 56 bool operator==(NumberFormat const& other) const 57 { 58 return (magnitude == other.magnitude) 59 && (exponent == other.exponent) 60 && (plurality == other.plurality) 61 && (zero_format_index == other.zero_format_index) 62 && (positive_format_index == other.positive_format_index) 63 && (negative_format_index == other.negative_format_index) 64 && (identifier_indices == other.identifier_indices); 65 } 66 67 size_t zero_format_index { 0 }; 68 size_t positive_format_index { 0 }; 69 size_t negative_format_index { 0 }; 70 Vector<size_t> identifier_indices {}; 71}; 72 73template<> 74struct AK::Formatter<NumberFormat> : Formatter<FormatString> { 75 ErrorOr<void> format(FormatBuilder& builder, NumberFormat const& format) 76 { 77 StringBuilder identifier_indices; 78 identifier_indices.join(", "sv, format.identifier_indices); 79 80 return Formatter<FormatString>::format(builder, 81 "{{ {}, {}, {}, {}, {}, {}, {{ {} }} }}"sv, 82 format.magnitude, 83 format.exponent, 84 to_underlying(format.plurality), 85 format.zero_format_index, 86 format.positive_format_index, 87 format.negative_format_index, 88 identifier_indices.to_deprecated_string()); 89 } 90}; 91 92template<> 93struct AK::Traits<NumberFormat> : public GenericTraits<NumberFormat> { 94 static unsigned hash(NumberFormat const& f) { return f.hash(); } 95}; 96 97using NumberFormatList = Vector<size_t>; 98using NumericSymbolList = Vector<size_t>; 99 100struct NumberSystem { 101 unsigned hash() const 102 { 103 auto hash = int_hash(symbols); 104 hash = pair_int_hash(hash, primary_grouping_size); 105 hash = pair_int_hash(hash, secondary_grouping_size); 106 hash = pair_int_hash(hash, decimal_format); 107 hash = pair_int_hash(hash, decimal_long_formats); 108 hash = pair_int_hash(hash, decimal_short_formats); 109 hash = pair_int_hash(hash, currency_format); 110 hash = pair_int_hash(hash, accounting_format); 111 hash = pair_int_hash(hash, currency_unit_formats); 112 hash = pair_int_hash(hash, currency_short_formats); 113 hash = pair_int_hash(hash, percent_format); 114 hash = pair_int_hash(hash, scientific_format); 115 return hash; 116 } 117 118 bool operator==(NumberSystem const& other) const 119 { 120 return (symbols == other.symbols) 121 && (primary_grouping_size == other.primary_grouping_size) 122 && (secondary_grouping_size == other.secondary_grouping_size) 123 && (decimal_format == other.decimal_format) 124 && (decimal_long_formats == other.decimal_long_formats) 125 && (decimal_short_formats == other.decimal_short_formats) 126 && (currency_format == other.currency_format) 127 && (accounting_format == other.accounting_format) 128 && (currency_unit_formats == other.currency_unit_formats) 129 && (currency_short_formats == other.currency_short_formats) 130 && (percent_format == other.percent_format) 131 && (scientific_format == other.scientific_format); 132 } 133 134 size_t symbols { 0 }; 135 136 u8 primary_grouping_size { 0 }; 137 u8 secondary_grouping_size { 0 }; 138 139 size_t decimal_format { 0 }; 140 size_t decimal_long_formats { 0 }; 141 size_t decimal_short_formats { 0 }; 142 143 size_t currency_format { 0 }; 144 size_t accounting_format { 0 }; 145 size_t currency_unit_formats { 0 }; 146 size_t currency_short_formats { 0 }; 147 148 size_t percent_format { 0 }; 149 size_t scientific_format { 0 }; 150}; 151 152template<> 153struct AK::Formatter<NumberSystem> : Formatter<FormatString> { 154 ErrorOr<void> format(FormatBuilder& builder, NumberSystem const& system) 155 { 156 return Formatter<FormatString>::format(builder, 157 "{{ {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {} }}"sv, 158 system.symbols, 159 system.primary_grouping_size, 160 system.secondary_grouping_size, 161 system.decimal_format, 162 system.decimal_long_formats, 163 system.decimal_short_formats, 164 system.currency_format, 165 system.accounting_format, 166 system.currency_unit_formats, 167 system.currency_short_formats, 168 system.percent_format, 169 system.scientific_format); 170 } 171}; 172 173template<> 174struct AK::Traits<NumberSystem> : public GenericTraits<NumberSystem> { 175 static unsigned hash(NumberSystem const& s) { return s.hash(); } 176}; 177 178struct Unit { 179 unsigned hash() const 180 { 181 auto hash = int_hash(unit); 182 hash = pair_int_hash(hash, long_formats); 183 hash = pair_int_hash(hash, short_formats); 184 hash = pair_int_hash(hash, narrow_formats); 185 return hash; 186 } 187 188 bool operator==(Unit const& other) const 189 { 190 return (unit == other.unit) 191 && (long_formats == other.long_formats) 192 && (short_formats == other.short_formats) 193 && (narrow_formats == other.narrow_formats); 194 } 195 196 size_t unit { 0 }; 197 size_t long_formats { 0 }; 198 size_t short_formats { 0 }; 199 size_t narrow_formats { 0 }; 200}; 201 202template<> 203struct AK::Formatter<Unit> : Formatter<FormatString> { 204 ErrorOr<void> format(FormatBuilder& builder, Unit const& system) 205 { 206 return Formatter<FormatString>::format(builder, 207 "{{ {}, {}, {}, {} }}"sv, 208 system.unit, 209 system.long_formats, 210 system.short_formats, 211 system.narrow_formats); 212 } 213}; 214 215template<> 216struct AK::Traits<Unit> : public GenericTraits<Unit> { 217 static unsigned hash(Unit const& u) { return u.hash(); } 218}; 219 220struct LocaleData { 221 Vector<size_t> number_systems; 222 HashMap<DeprecatedString, size_t> units {}; 223 u8 minimum_grouping_digits { 0 }; 224}; 225 226struct CLDR { 227 UniqueStringStorage unique_strings; 228 UniqueStorage<NumberFormat> unique_formats; 229 UniqueStorage<NumberFormatList> unique_format_lists; 230 UniqueStorage<NumericSymbolList> unique_symbols; 231 UniqueStorage<NumberSystem> unique_systems; 232 UniqueStorage<Unit> unique_units; 233 234 HashMap<DeprecatedString, Array<u32, 10>> number_system_digits; 235 Vector<DeprecatedString> number_systems; 236 237 HashMap<DeprecatedString, LocaleData> locales; 238 size_t max_identifier_count { 0 }; 239}; 240 241static ErrorOr<void> parse_number_system_digits(DeprecatedString core_supplemental_path, CLDR& cldr) 242{ 243 LexicalPath number_systems_path(move(core_supplemental_path)); 244 number_systems_path = number_systems_path.append("numberingSystems.json"sv); 245 246 auto number_systems = TRY(read_json_file(number_systems_path.string())); 247 auto const& supplemental_object = number_systems.as_object().get_object("supplemental"sv).value(); 248 auto const& number_systems_object = supplemental_object.get_object("numberingSystems"sv).value(); 249 250 number_systems_object.for_each_member([&](auto const& number_system, auto const& digits_object) { 251 auto type = digits_object.as_object().get_deprecated_string("_type"sv).value(); 252 if (type != "numeric"sv) 253 return; 254 255 auto digits = digits_object.as_object().get_deprecated_string("_digits"sv).value(); 256 257 Utf8View utf8_digits { digits }; 258 VERIFY(utf8_digits.length() == 10); 259 260 auto& number_system_digits = cldr.number_system_digits.ensure(number_system); 261 size_t index = 0; 262 263 for (u32 digit : utf8_digits) 264 number_system_digits[index++] = digit; 265 266 if (!cldr.number_systems.contains_slow(number_system)) 267 cldr.number_systems.append(number_system); 268 }); 269 270 return {}; 271} 272 273static DeprecatedString parse_identifiers(DeprecatedString pattern, StringView replacement, CLDR& cldr, NumberFormat& format) 274{ 275 static constexpr Utf8View whitespace { "\u0020\u00a0\u200f"sv }; 276 277 while (true) { 278 Utf8View utf8_pattern { pattern }; 279 Optional<size_t> start_index; 280 Optional<size_t> end_index; 281 bool inside_replacement = false; 282 283 for (auto it = utf8_pattern.begin(); it != utf8_pattern.end(); ++it) { 284 if (*it == '{') { 285 if (start_index.has_value()) { 286 end_index = utf8_pattern.byte_offset_of(it); 287 break; 288 } 289 290 inside_replacement = true; 291 } else if (*it == '}') { 292 inside_replacement = false; 293 } else if (!inside_replacement && !start_index.has_value() && !whitespace.contains(*it)) { 294 start_index = utf8_pattern.byte_offset_of(it); 295 } 296 } 297 298 if (!start_index.has_value()) 299 return pattern; 300 301 end_index = end_index.value_or(pattern.length()); 302 303 utf8_pattern = utf8_pattern.substring_view(*start_index, *end_index - *start_index); 304 utf8_pattern = utf8_pattern.trim(whitespace); 305 306 auto identifier = utf8_pattern.as_string().replace("'.'"sv, "."sv, ReplaceMode::FirstOnly); 307 auto identifier_index = cldr.unique_strings.ensure(move(identifier)); 308 size_t replacement_index = 0; 309 310 if (auto index = format.identifier_indices.find_first_index(identifier_index); index.has_value()) { 311 replacement_index = *index; 312 } else { 313 replacement_index = format.identifier_indices.size(); 314 format.identifier_indices.append(identifier_index); 315 316 cldr.max_identifier_count = max(cldr.max_identifier_count, format.identifier_indices.size()); 317 } 318 319 pattern = DeprecatedString::formatted("{}{{{}:{}}}{}", 320 *start_index > 0 ? pattern.substring_view(0, *start_index) : ""sv, 321 replacement, 322 replacement_index, 323 pattern.substring_view(*start_index + utf8_pattern.byte_length())); 324 } 325} 326 327static void parse_number_pattern(Vector<DeprecatedString> patterns, CLDR& cldr, NumberFormatType type, NumberFormat& format, NumberSystem* number_system_for_groupings = nullptr) 328{ 329 // https://unicode.org/reports/tr35/tr35-numbers.html#Number_Format_Patterns 330 // https://cldr.unicode.org/translation/number-currency-formats/number-and-currency-patterns 331 VERIFY((patterns.size() == 1) || (patterns.size() == 2)); 332 333 auto replace_patterns = [&](DeprecatedString pattern) { 334 static HashMap<StringView, StringView> replacements = { 335 { "{0}"sv, "{number}"sv }, 336 { "{1}"sv, "{currency}"sv }, 337 { "%"sv, "{percentSign}"sv }, 338 { "+"sv, "{plusSign}"sv }, 339 { "-"sv, "{minusSign}"sv }, 340 { "¤"sv, "{currency}"sv }, // U+00A4 Currency Sign 341 { "E"sv, "{scientificSeparator}"sv }, 342 }; 343 344 for (auto const& replacement : replacements) 345 pattern = pattern.replace(replacement.key, replacement.value, ReplaceMode::All); 346 347 if (auto start_number_index = pattern.find_any_of("#0"sv, DeprecatedString::SearchDirection::Forward); start_number_index.has_value()) { 348 auto end_number_index = *start_number_index + 1; 349 350 for (; end_number_index < pattern.length(); ++end_number_index) { 351 auto ch = pattern[end_number_index]; 352 if ((ch != '#') && (ch != '0') && (ch != ',') && (ch != '.')) 353 break; 354 } 355 356 if (number_system_for_groupings) { 357 auto number_pattern = pattern.substring_view(*start_number_index, end_number_index - *start_number_index); 358 359 auto group_separators = number_pattern.find_all(","sv); 360 VERIFY((group_separators.size() == 1) || (group_separators.size() == 2)); 361 362 auto decimal = number_pattern.find('.'); 363 VERIFY(decimal.has_value()); 364 365 if (group_separators.size() == 1) { 366 number_system_for_groupings->primary_grouping_size = *decimal - group_separators[0] - 1; 367 number_system_for_groupings->secondary_grouping_size = number_system_for_groupings->primary_grouping_size; 368 } else { 369 number_system_for_groupings->primary_grouping_size = *decimal - group_separators[1] - 1; 370 number_system_for_groupings->secondary_grouping_size = group_separators[1] - group_separators[0] - 1; 371 } 372 } 373 374 pattern = DeprecatedString::formatted("{}{{number}}{}", 375 *start_number_index > 0 ? pattern.substring_view(0, *start_number_index) : ""sv, 376 pattern.substring_view(end_number_index)); 377 378 // This is specifically handled here rather than in the replacements HashMap above so 379 // that we do not errantly replace zeroes in number patterns. 380 if (pattern.contains(*replacements.get("E"sv))) 381 pattern = pattern.replace("0"sv, "{scientificExponent}"sv, ReplaceMode::FirstOnly); 382 } 383 384 if (type == NumberFormatType::Compact) 385 return parse_identifiers(move(pattern), "compactIdentifier"sv, cldr, format); 386 387 return pattern; 388 }; 389 390 auto zero_format = replace_patterns(move(patterns[0])); 391 format.positive_format_index = cldr.unique_strings.ensure(DeprecatedString::formatted("{{plusSign}}{}", zero_format)); 392 393 if (patterns.size() == 2) { 394 auto negative_format = replace_patterns(move(patterns[1])); 395 format.negative_format_index = cldr.unique_strings.ensure(move(negative_format)); 396 } else { 397 format.negative_format_index = cldr.unique_strings.ensure(DeprecatedString::formatted("{{minusSign}}{}", zero_format)); 398 } 399 400 format.zero_format_index = cldr.unique_strings.ensure(move(zero_format)); 401} 402 403static void parse_number_pattern(Vector<DeprecatedString> patterns, CLDR& cldr, NumberFormatType type, size_t& format_index, NumberSystem* number_system_for_groupings = nullptr) 404{ 405 NumberFormat format {}; 406 parse_number_pattern(move(patterns), cldr, type, format, number_system_for_groupings); 407 408 format_index = cldr.unique_formats.ensure(move(format)); 409} 410 411static ErrorOr<void> parse_number_systems(DeprecatedString locale_numbers_path, CLDR& cldr, LocaleData& locale) 412{ 413 LexicalPath numbers_path(move(locale_numbers_path)); 414 numbers_path = numbers_path.append("numbers.json"sv); 415 416 auto numbers = TRY(read_json_file(numbers_path.string())); 417 auto const& main_object = numbers.as_object().get_object("main"sv).value(); 418 auto const& locale_object = main_object.get_object(numbers_path.parent().basename()).value(); 419 auto const& locale_numbers_object = locale_object.get_object("numbers"sv).value(); 420 auto const& minimum_grouping_digits = locale_numbers_object.get_deprecated_string("minimumGroupingDigits"sv).value(); 421 422 Vector<Optional<NumberSystem>> number_systems; 423 number_systems.resize(cldr.number_systems.size()); 424 425 auto ensure_number_system = [&](auto const& system) -> NumberSystem& { 426 auto system_index = cldr.number_systems.find_first_index(system).value(); 427 VERIFY(system_index < number_systems.size()); 428 429 auto& number_system = number_systems.at(system_index); 430 if (!number_system.has_value()) 431 number_system = NumberSystem {}; 432 433 return number_system.value(); 434 }; 435 436 auto parse_number_format = [&](auto const& format_object) { 437 Vector<size_t> result; 438 result.ensure_capacity(format_object.size()); 439 440 format_object.for_each_member([&](auto const& key, JsonValue const& value) { 441 auto split_key = key.split_view('-'); 442 if (split_key.size() != 3) 443 return; 444 445 auto patterns = value.as_string().split(';'); 446 NumberFormat format {}; 447 448 if (auto type = split_key[0].template to_uint<u64>(); type.has_value()) { 449 VERIFY(*type % 10 == 0); 450 format.magnitude = static_cast<u8>(log10(*type)); 451 452 if (patterns[0] != "0"sv) { 453 auto number_of_zeroes_in_pattern = patterns[0].count("0"sv); 454 VERIFY(format.magnitude >= number_of_zeroes_in_pattern); 455 456 format.exponent = format.magnitude + 1 - number_of_zeroes_in_pattern; 457 } 458 } else { 459 VERIFY(split_key[0] == "unitPattern"sv); 460 } 461 462 format.plurality = Locale::plural_category_from_string(split_key[2]); 463 parse_number_pattern(move(patterns), cldr, NumberFormatType::Compact, format); 464 465 auto format_index = cldr.unique_formats.ensure(move(format)); 466 result.append(format_index); 467 }); 468 469 return cldr.unique_format_lists.ensure(move(result)); 470 }; 471 472 auto numeric_symbol_from_string = [&](StringView numeric_symbol) -> Optional<Locale::NumericSymbol> { 473 if (numeric_symbol == "approximatelySign"sv) 474 return Locale::NumericSymbol::ApproximatelySign; 475 if (numeric_symbol == "decimal"sv) 476 return Locale::NumericSymbol::Decimal; 477 if (numeric_symbol == "exponential"sv) 478 return Locale::NumericSymbol::Exponential; 479 if (numeric_symbol == "group"sv) 480 return Locale::NumericSymbol::Group; 481 if (numeric_symbol == "infinity"sv) 482 return Locale::NumericSymbol::Infinity; 483 if (numeric_symbol == "minusSign"sv) 484 return Locale::NumericSymbol::MinusSign; 485 if (numeric_symbol == "nan"sv) 486 return Locale::NumericSymbol::NaN; 487 if (numeric_symbol == "percentSign"sv) 488 return Locale::NumericSymbol::PercentSign; 489 if (numeric_symbol == "plusSign"sv) 490 return Locale::NumericSymbol::PlusSign; 491 if (numeric_symbol == "timeSeparator"sv) 492 return Locale::NumericSymbol::TimeSeparator; 493 return {}; 494 }; 495 496 locale_numbers_object.for_each_member([&](auto const& key, JsonValue const& value) { 497 constexpr auto symbols_prefix = "symbols-numberSystem-"sv; 498 constexpr auto decimal_formats_prefix = "decimalFormats-numberSystem-"sv; 499 constexpr auto currency_formats_prefix = "currencyFormats-numberSystem-"sv; 500 constexpr auto percent_formats_prefix = "percentFormats-numberSystem-"sv; 501 constexpr auto scientific_formats_prefix = "scientificFormats-numberSystem-"sv; 502 constexpr auto misc_patterns_prefix = "miscPatterns-numberSystem-"sv; 503 504 if (key.starts_with(symbols_prefix)) { 505 auto system = key.substring(symbols_prefix.length()); 506 auto& number_system = ensure_number_system(system); 507 508 NumericSymbolList symbols; 509 510 value.as_object().for_each_member([&](auto const& symbol, JsonValue const& localization) { 511 auto numeric_symbol = numeric_symbol_from_string(symbol); 512 if (!numeric_symbol.has_value()) 513 return; 514 515 if (to_underlying(*numeric_symbol) >= symbols.size()) 516 symbols.resize(to_underlying(*numeric_symbol) + 1); 517 518 auto symbol_index = cldr.unique_strings.ensure(localization.as_string()); 519 symbols[to_underlying(*numeric_symbol)] = symbol_index; 520 }); 521 522 // The range separator does not appear in the symbols list, we have to extract it from 523 // the range pattern. 524 auto misc_patterns_key = DeprecatedString::formatted("{}{}", misc_patterns_prefix, system); 525 auto misc_patterns = locale_numbers_object.get_object(misc_patterns_key).value(); 526 auto range_separator = misc_patterns.get_deprecated_string("range"sv).value(); 527 528 auto begin_index = range_separator.find("{0}"sv).value() + "{0}"sv.length(); 529 auto end_index = range_separator.find("{1}"sv).value(); 530 range_separator = range_separator.substring(begin_index, end_index - begin_index); 531 532 if (to_underlying(Locale::NumericSymbol::RangeSeparator) >= symbols.size()) 533 symbols.resize(to_underlying(Locale::NumericSymbol::RangeSeparator) + 1); 534 535 auto symbol_index = cldr.unique_strings.ensure(move(range_separator)); 536 symbols[to_underlying(Locale::NumericSymbol::RangeSeparator)] = symbol_index; 537 538 number_system.symbols = cldr.unique_symbols.ensure(move(symbols)); 539 } else if (key.starts_with(decimal_formats_prefix)) { 540 auto system = key.substring(decimal_formats_prefix.length()); 541 auto& number_system = ensure_number_system(system); 542 543 auto format_object = value.as_object().get_deprecated_string("standard"sv).value(); 544 parse_number_pattern(format_object.split(';'), cldr, NumberFormatType::Standard, number_system.decimal_format, &number_system); 545 546 auto const& long_format = value.as_object().get_object("long"sv)->get_object("decimalFormat"sv).value(); 547 number_system.decimal_long_formats = parse_number_format(long_format); 548 549 auto const& short_format = value.as_object().get_object("short"sv)->get_object("decimalFormat"sv).value(); 550 number_system.decimal_short_formats = parse_number_format(short_format); 551 } else if (key.starts_with(currency_formats_prefix)) { 552 auto system = key.substring(currency_formats_prefix.length()); 553 auto& number_system = ensure_number_system(system); 554 555 auto format_object = value.as_object().get_deprecated_string("standard"sv).value(); 556 parse_number_pattern(format_object.split(';'), cldr, NumberFormatType::Standard, number_system.currency_format); 557 558 format_object = value.as_object().get_deprecated_string("accounting"sv).value(); 559 parse_number_pattern(format_object.split(';'), cldr, NumberFormatType::Standard, number_system.accounting_format); 560 561 number_system.currency_unit_formats = parse_number_format(value.as_object()); 562 563 if (value.as_object().has_object("short"sv)) { 564 auto const& short_format = value.as_object().get_object("short"sv)->get_object("standard"sv).value(); 565 number_system.currency_short_formats = parse_number_format(short_format); 566 } 567 } else if (key.starts_with(percent_formats_prefix)) { 568 auto system = key.substring(percent_formats_prefix.length()); 569 auto& number_system = ensure_number_system(system); 570 571 auto format_object = value.as_object().get_deprecated_string("standard"sv).value(); 572 parse_number_pattern(format_object.split(';'), cldr, NumberFormatType::Standard, number_system.percent_format); 573 } else if (key.starts_with(scientific_formats_prefix)) { 574 auto system = key.substring(scientific_formats_prefix.length()); 575 auto& number_system = ensure_number_system(system); 576 577 auto format_object = value.as_object().get_deprecated_string("standard"sv).value(); 578 parse_number_pattern(format_object.split(';'), cldr, NumberFormatType::Standard, number_system.scientific_format); 579 } 580 }); 581 582 locale.number_systems.ensure_capacity(number_systems.size()); 583 584 for (auto& number_system : number_systems) { 585 size_t system_index = 0; 586 if (number_system.has_value()) 587 system_index = cldr.unique_systems.ensure(number_system.release_value()); 588 589 locale.number_systems.append(system_index); 590 } 591 592 locale.minimum_grouping_digits = minimum_grouping_digits.template to_uint<u8>().value(); 593 return {}; 594} 595 596static ErrorOr<void> parse_units(DeprecatedString locale_units_path, CLDR& cldr, LocaleData& locale) 597{ 598 LexicalPath units_path(move(locale_units_path)); 599 units_path = units_path.append("units.json"sv); 600 601 auto locale_units = TRY(read_json_file(units_path.string())); 602 auto const& main_object = locale_units.as_object().get_object("main"sv).value(); 603 auto const& locale_object = main_object.get_object(units_path.parent().basename()).value(); 604 auto const& locale_units_object = locale_object.get_object("units"sv).value(); 605 auto const& long_object = locale_units_object.get_object("long"sv).value(); 606 auto const& short_object = locale_units_object.get_object("short"sv).value(); 607 auto const& narrow_object = locale_units_object.get_object("narrow"sv).value(); 608 609 HashMap<DeprecatedString, Unit> units; 610 611 auto ensure_unit = [&](auto const& unit) -> Unit& { 612 return units.ensure(unit, [&]() { 613 auto unit_index = cldr.unique_strings.ensure(unit); 614 return Unit { .unit = unit_index }; 615 }); 616 }; 617 618 auto is_sanctioned_unit = [](StringView unit_name) { 619 // LibUnicode generally tries to avoid being directly dependent on ECMA-402, but this rather significantly reduces the amount 620 // of data generated here, and ECMA-402 is currently the only consumer of this data. 621 constexpr auto sanctioned_units = JS::Intl::sanctioned_single_unit_identifiers(); 622 return find(sanctioned_units.begin(), sanctioned_units.end(), unit_name) != sanctioned_units.end(); 623 }; 624 625 auto parse_units_object = [&](auto const& units_object, Locale::Style style) { 626 constexpr auto unit_pattern_prefix = "unitPattern-count-"sv; 627 constexpr auto combined_unit_separator = "-per-"sv; 628 629 units_object.for_each_member([&](auto const& key, JsonValue const& value) { 630 auto end_of_category = key.find('-'); 631 if (!end_of_category.has_value()) 632 return; 633 634 auto unit_name = key.substring(*end_of_category + 1); 635 636 if (!is_sanctioned_unit(unit_name)) { 637 auto indices = unit_name.find_all(combined_unit_separator); 638 if (indices.size() != 1) 639 return; 640 641 auto numerator = unit_name.substring_view(0, indices[0]); 642 auto denominator = unit_name.substring_view(indices[0] + combined_unit_separator.length()); 643 if (!is_sanctioned_unit(numerator) || !is_sanctioned_unit(denominator)) 644 return; 645 } 646 647 auto& unit = ensure_unit(unit_name); 648 NumberFormatList formats; 649 650 value.as_object().for_each_member([&](auto const& unit_key, JsonValue const& pattern_value) { 651 if (!unit_key.starts_with(unit_pattern_prefix)) 652 return; 653 654 NumberFormat format {}; 655 656 auto plurality = unit_key.substring_view(unit_pattern_prefix.length()); 657 format.plurality = Locale::plural_category_from_string(plurality); 658 659 auto zero_format = pattern_value.as_string().replace("{0}"sv, "{number}"sv, ReplaceMode::FirstOnly); 660 zero_format = parse_identifiers(zero_format, "unitIdentifier"sv, cldr, format); 661 662 format.positive_format_index = cldr.unique_strings.ensure(zero_format.replace("{number}"sv, "{plusSign}{number}"sv, ReplaceMode::FirstOnly)); 663 format.negative_format_index = cldr.unique_strings.ensure(zero_format.replace("{number}"sv, "{minusSign}{number}"sv, ReplaceMode::FirstOnly)); 664 format.zero_format_index = cldr.unique_strings.ensure(move(zero_format)); 665 666 formats.append(cldr.unique_formats.ensure(move(format))); 667 }); 668 669 auto number_format_list_index = cldr.unique_format_lists.ensure(move(formats)); 670 671 switch (style) { 672 case Locale::Style::Long: 673 unit.long_formats = number_format_list_index; 674 break; 675 case Locale::Style::Short: 676 unit.short_formats = number_format_list_index; 677 break; 678 case Locale::Style::Narrow: 679 unit.narrow_formats = number_format_list_index; 680 break; 681 default: 682 VERIFY_NOT_REACHED(); 683 } 684 }); 685 }; 686 687 parse_units_object(long_object, Locale::Style::Long); 688 parse_units_object(short_object, Locale::Style::Short); 689 parse_units_object(narrow_object, Locale::Style::Narrow); 690 691 for (auto& unit : units) { 692 auto unit_index = cldr.unique_units.ensure(move(unit.value)); 693 locale.units.set(unit.key, unit_index); 694 } 695 696 return {}; 697} 698 699static ErrorOr<void> parse_all_locales(DeprecatedString core_path, DeprecatedString numbers_path, DeprecatedString units_path, CLDR& cldr) 700{ 701 auto numbers_iterator = TRY(path_to_dir_iterator(move(numbers_path))); 702 auto units_iterator = TRY(path_to_dir_iterator(move(units_path))); 703 704 LexicalPath core_supplemental_path(move(core_path)); 705 core_supplemental_path = core_supplemental_path.append("supplemental"sv); 706 VERIFY(Core::DeprecatedFile::is_directory(core_supplemental_path.string())); 707 708 TRY(parse_number_system_digits(core_supplemental_path.string(), cldr)); 709 710 auto remove_variants_from_path = [&](DeprecatedString path) -> ErrorOr<DeprecatedString> { 711 auto parsed_locale = TRY(CanonicalLanguageID::parse(cldr.unique_strings, LexicalPath::basename(path))); 712 713 StringBuilder builder; 714 builder.append(cldr.unique_strings.get(parsed_locale.language)); 715 if (auto script = cldr.unique_strings.get(parsed_locale.script); !script.is_empty()) 716 builder.appendff("-{}", script); 717 if (auto region = cldr.unique_strings.get(parsed_locale.region); !region.is_empty()) 718 builder.appendff("-{}", region); 719 720 return builder.to_deprecated_string(); 721 }; 722 723 while (numbers_iterator.has_next()) { 724 auto numbers_path = TRY(next_path_from_dir_iterator(numbers_iterator)); 725 auto language = TRY(remove_variants_from_path(numbers_path)); 726 727 auto& locale = cldr.locales.ensure(language); 728 TRY(parse_number_systems(numbers_path, cldr, locale)); 729 } 730 731 while (units_iterator.has_next()) { 732 auto units_path = TRY(next_path_from_dir_iterator(units_iterator)); 733 auto language = TRY(remove_variants_from_path(units_path)); 734 735 auto& locale = cldr.locales.ensure(language); 736 TRY(parse_units(units_path, cldr, locale)); 737 } 738 739 return {}; 740} 741 742static DeprecatedString format_identifier(StringView, DeprecatedString identifier) 743{ 744 return identifier.to_titlecase(); 745} 746 747static ErrorOr<void> generate_unicode_locale_header(Core::BufferedFile& file, CLDR& cldr) 748{ 749 StringBuilder builder; 750 SourceGenerator generator { builder }; 751 752 generator.append(R"~~~( 753#include <AK/Types.h> 754 755#pragma once 756 757namespace Locale { 758)~~~"); 759 760 generate_enum(generator, format_identifier, "NumberSystem"sv, {}, cldr.number_systems); 761 762 generator.append(R"~~~( 763} 764)~~~"); 765 766 TRY(file.write_until_depleted(generator.as_string_view().bytes())); 767 return {}; 768} 769 770static ErrorOr<void> generate_unicode_locale_implementation(Core::BufferedFile& file, CLDR& cldr) 771{ 772 StringBuilder builder; 773 SourceGenerator generator { builder }; 774 generator.set("string_index_type"sv, cldr.unique_strings.type_that_fits()); 775 generator.set("number_format_index_type"sv, cldr.unique_formats.type_that_fits()); 776 generator.set("number_format_list_index_type"sv, cldr.unique_format_lists.type_that_fits()); 777 generator.set("numeric_symbol_list_index_type"sv, cldr.unique_symbols.type_that_fits()); 778 generator.set("identifier_count", DeprecatedString::number(cldr.max_identifier_count)); 779 780 generator.append(R"~~~( 781#include <AK/Array.h> 782#include <AK/BinarySearch.h> 783#include <AK/Optional.h> 784#include <AK/Span.h> 785#include <AK/StringView.h> 786#include <AK/Vector.h> 787#include <LibLocale/Locale.h> 788#include <LibLocale/LocaleData.h> 789#include <LibLocale/NumberFormat.h> 790#include <LibLocale/NumberFormatData.h> 791#include <LibLocale/PluralRules.h> 792 793namespace Locale { 794)~~~"); 795 796 cldr.unique_strings.generate(generator); 797 798 generator.append(R"~~~( 799struct NumberFormatImpl { 800 ErrorOr<NumberFormat> to_unicode_number_format() const { 801 NumberFormat number_format {}; 802 803 number_format.magnitude = magnitude; 804 number_format.exponent = exponent; 805 number_format.plurality = static_cast<PluralCategory>(plurality); 806 number_format.zero_format = decode_string(zero_format); 807 number_format.positive_format = decode_string(positive_format); 808 number_format.negative_format = decode_string(negative_format); 809 810 TRY(number_format.identifiers.try_ensure_capacity(identifiers.size())); 811 for (@string_index_type@ identifier : identifiers) 812 number_format.identifiers.unchecked_append(decode_string(identifier)); 813 814 return number_format; 815 } 816 817 u8 magnitude { 0 }; 818 u8 exponent { 0 }; 819 u8 plurality { 0 }; 820 @string_index_type@ zero_format { 0 }; 821 @string_index_type@ positive_format { 0 }; 822 @string_index_type@ negative_format { 0 }; 823 Array<@string_index_type@, @identifier_count@> identifiers {}; 824}; 825 826struct NumberSystemData { 827 @numeric_symbol_list_index_type@ symbols { 0 }; 828 829 u8 primary_grouping_size { 0 }; 830 u8 secondary_grouping_size { 0 }; 831 832 @number_format_index_type@ decimal_format { 0 }; 833 @number_format_list_index_type@ decimal_long_formats { 0 }; 834 @number_format_list_index_type@ decimal_short_formats { 0 }; 835 836 @number_format_index_type@ currency_format { 0 }; 837 @number_format_index_type@ accounting_format { 0 }; 838 @number_format_list_index_type@ currency_unit_formats { 0 }; 839 @number_format_list_index_type@ currency_short_formats { 0 }; 840 841 @number_format_index_type@ percent_format { 0 }; 842 @number_format_index_type@ scientific_format { 0 }; 843}; 844 845struct Unit { 846 @string_index_type@ unit { 0 }; 847 @number_format_list_index_type@ long_formats { 0 }; 848 @number_format_list_index_type@ short_formats { 0 }; 849 @number_format_list_index_type@ narrow_formats { 0 }; 850}; 851)~~~"); 852 853 cldr.unique_formats.generate(generator, "NumberFormatImpl"sv, "s_number_formats"sv, 10); 854 cldr.unique_format_lists.generate(generator, cldr.unique_formats.type_that_fits(), "s_number_format_lists"sv); 855 cldr.unique_symbols.generate(generator, cldr.unique_strings.type_that_fits(), "s_numeric_symbol_lists"sv); 856 cldr.unique_systems.generate(generator, "NumberSystemData"sv, "s_number_systems"sv, 10); 857 cldr.unique_units.generate(generator, "Unit"sv, "s_units"sv, 10); 858 859 auto locales = cldr.locales.keys(); 860 quick_sort(locales); 861 862 generator.set("size", DeprecatedString::number(locales.size())); 863 generator.append(R"~~~( 864static constexpr Array<u8, @size@> s_minimum_grouping_digits { { )~~~"); 865 866 bool first = true; 867 for (auto const& locale : locales) { 868 generator.append(first ? " "sv : ", "sv); 869 generator.append(DeprecatedString::number(cldr.locales.find(locale)->value.minimum_grouping_digits)); 870 first = false; 871 } 872 generator.append(" } };\n"); 873 874 auto append_map = [&](DeprecatedString name, auto type, auto const& map) { 875 generator.set("name", name); 876 generator.set("type", type); 877 generator.set("size", DeprecatedString::number(map.size())); 878 879 generator.append(R"~~~( 880static constexpr Array<@type@, @size@> @name@ { {)~~~"); 881 882 bool first = true; 883 for (auto const& item : map) { 884 generator.append(first ? " "sv : ", "sv); 885 if constexpr (requires { item.value; }) 886 generator.append(DeprecatedString::number(item.value)); 887 else 888 generator.append(DeprecatedString::number(item)); 889 first = false; 890 } 891 892 generator.append(" } };"); 893 }; 894 895 generate_mapping(generator, cldr.number_system_digits, "u32"sv, "s_number_systems_digits"sv, "s_number_systems_digits_{}"sv, nullptr, [&](auto const& name, auto const& value) { append_map(name, "u32"sv, value); }); 896 generate_mapping(generator, cldr.locales, cldr.unique_systems.type_that_fits(), "s_locale_number_systems"sv, "s_number_systems_{}"sv, nullptr, [&](auto const& name, auto const& value) { append_map(name, cldr.unique_systems.type_that_fits(), value.number_systems); }); 897 generate_mapping(generator, cldr.locales, cldr.unique_units.type_that_fits(), "s_locale_units"sv, "s_units_{}"sv, nullptr, [&](auto const& name, auto const& value) { append_map(name, cldr.unique_units.type_that_fits(), value.units); }); 898 899 generator.append(R"~~~( 900static Optional<NumberSystem> keyword_to_number_system(KeywordNumbers keyword) 901{ 902 switch (keyword) {)~~~"); 903 904 for (auto const& number_system : cldr.number_systems) { 905 generator.set("name"sv, format_identifier({}, number_system)); 906 generator.append(R"~~~( 907 case KeywordNumbers::@name@: 908 return NumberSystem::@name@;)~~~"); 909 } 910 911 generator.append(R"~~~( 912 default: 913 return {}; 914 } 915} 916 917Optional<ReadonlySpan<u32>> get_digits_for_number_system(StringView system) 918{ 919 auto number_system_keyword = keyword_nu_from_string(system); 920 if (!number_system_keyword.has_value()) 921 return {}; 922 923 auto number_system_value = keyword_to_number_system(*number_system_keyword); 924 if (!number_system_value.has_value()) 925 return {}; 926 927 auto number_system_index = to_underlying(*number_system_value); 928 return s_number_systems_digits[number_system_index]; 929} 930 931static ErrorOr<NumberSystemData const*> find_number_system(StringView locale, StringView system) 932{ 933 auto locale_value = locale_from_string(locale); 934 if (!locale_value.has_value()) 935 return nullptr; 936 937 auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None. 938 auto const& number_systems = s_locale_number_systems.at(locale_index); 939 940 auto lookup_number_system = [&](auto number_system) -> NumberSystemData const* { 941 auto number_system_keyword = keyword_nu_from_string(number_system); 942 if (!number_system_keyword.has_value()) 943 return nullptr; 944 945 auto number_system_value = keyword_to_number_system(*number_system_keyword); 946 if (!number_system_value.has_value()) 947 return nullptr; 948 949 auto number_system_index = to_underlying(*number_system_value); 950 number_system_index = number_systems.at(number_system_index); 951 952 if (number_system_index == 0) 953 return nullptr; 954 955 return &s_number_systems.at(number_system_index); 956 }; 957 958 if (auto const* number_system = lookup_number_system(system)) 959 return number_system; 960 961 auto default_number_system = TRY(get_preferred_keyword_value_for_locale(locale, "nu"sv)); 962 if (!default_number_system.has_value()) 963 return nullptr; 964 965 return lookup_number_system(*default_number_system); 966} 967 968ErrorOr<Optional<StringView>> get_number_system_symbol(StringView locale, StringView system, NumericSymbol symbol) 969{ 970 if (auto const* number_system = TRY(find_number_system(locale, system)); number_system != nullptr) { 971 auto symbols = s_numeric_symbol_lists.at(number_system->symbols); 972 973 auto symbol_index = to_underlying(symbol); 974 if (symbol_index >= symbols.size()) 975 return OptionalNone {}; 976 977 return Optional<StringView> { decode_string(symbols[symbol_index]) }; 978 } 979 980 return OptionalNone {}; 981} 982 983ErrorOr<Optional<NumberGroupings>> get_number_system_groupings(StringView locale, StringView system) 984{ 985 auto locale_value = locale_from_string(locale); 986 if (!locale_value.has_value()) 987 return OptionalNone {}; 988 989 u8 minimum_grouping_digits = s_minimum_grouping_digits[to_underlying(*locale_value) - 1]; 990 991 if (auto const* number_system = TRY(find_number_system(locale, system)); number_system != nullptr) 992 return NumberGroupings { minimum_grouping_digits, number_system->primary_grouping_size, number_system->secondary_grouping_size }; 993 return OptionalNone {}; 994} 995 996ErrorOr<Optional<NumberFormat>> get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type) 997{ 998 if (auto const* number_system = TRY(find_number_system(locale, system)); number_system != nullptr) { 999 @number_format_index_type@ format_index = 0; 1000 1001 switch (type) { 1002 case StandardNumberFormatType::Decimal: 1003 format_index = number_system->decimal_format; 1004 break; 1005 case StandardNumberFormatType::Currency: 1006 format_index = number_system->currency_format; 1007 break; 1008 case StandardNumberFormatType::Accounting: 1009 format_index = number_system->accounting_format; 1010 break; 1011 case StandardNumberFormatType::Percent: 1012 format_index = number_system->percent_format; 1013 break; 1014 case StandardNumberFormatType::Scientific: 1015 format_index = number_system->scientific_format; 1016 break; 1017 } 1018 1019 return TRY(s_number_formats[format_index].to_unicode_number_format()); 1020 } 1021 1022 return OptionalNone {}; 1023} 1024 1025ErrorOr<Vector<NumberFormat>> get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type) 1026{ 1027 Vector<NumberFormat> formats; 1028 1029 if (auto const* number_system = TRY(find_number_system(locale, system)); number_system != nullptr) { 1030 @number_format_list_index_type@ number_format_list_index { 0 }; 1031 1032 switch (type) { 1033 case CompactNumberFormatType::DecimalLong: 1034 number_format_list_index = number_system->decimal_long_formats; 1035 break; 1036 case CompactNumberFormatType::DecimalShort: 1037 number_format_list_index = number_system->decimal_short_formats; 1038 break; 1039 case CompactNumberFormatType::CurrencyUnit: 1040 number_format_list_index = number_system->currency_unit_formats; 1041 break; 1042 case CompactNumberFormatType::CurrencyShort: 1043 number_format_list_index = number_system->currency_short_formats; 1044 break; 1045 } 1046 1047 auto number_formats = s_number_format_lists.at(number_format_list_index); 1048 TRY(formats.try_ensure_capacity(number_formats.size())); 1049 1050 for (auto number_format : number_formats) 1051 formats.unchecked_append(TRY(s_number_formats[number_format].to_unicode_number_format())); 1052 } 1053 1054 return formats; 1055} 1056 1057static Unit const* find_units(StringView locale, StringView unit) 1058{ 1059 auto locale_value = locale_from_string(locale); 1060 if (!locale_value.has_value()) 1061 return nullptr; 1062 1063 auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None. 1064 auto const& locale_units = s_locale_units.at(locale_index); 1065 1066 for (auto unit_index : locale_units) { 1067 auto const& units = s_units.at(unit_index); 1068 1069 if (unit == decode_string(units.unit)) 1070 return &units; 1071 }; 1072 1073 return nullptr; 1074} 1075 1076ErrorOr<Vector<NumberFormat>> get_unit_formats(StringView locale, StringView unit, Style style) 1077{ 1078 Vector<NumberFormat> formats; 1079 1080 if (auto const* units = find_units(locale, unit); units != nullptr) { 1081 @number_format_list_index_type@ number_format_list_index { 0 }; 1082 1083 switch (style) { 1084 case Style::Long: 1085 number_format_list_index = units->long_formats; 1086 break; 1087 case Style::Short: 1088 number_format_list_index = units->short_formats; 1089 break; 1090 case Style::Narrow: 1091 number_format_list_index = units->narrow_formats; 1092 break; 1093 default: 1094 VERIFY_NOT_REACHED(); 1095 } 1096 1097 auto number_formats = s_number_format_lists.at(number_format_list_index); 1098 TRY(formats.try_ensure_capacity(number_formats.size())); 1099 1100 for (auto number_format : number_formats) 1101 formats.unchecked_append(TRY(s_number_formats[number_format].to_unicode_number_format())); 1102 } 1103 1104 return formats; 1105} 1106 1107} 1108)~~~"); 1109 1110 TRY(file.write_until_depleted(generator.as_string_view().bytes())); 1111 return {}; 1112} 1113 1114ErrorOr<int> serenity_main(Main::Arguments arguments) 1115{ 1116 StringView generated_header_path; 1117 StringView generated_implementation_path; 1118 StringView core_path; 1119 StringView numbers_path; 1120 StringView units_path; 1121 1122 Core::ArgsParser args_parser; 1123 args_parser.add_option(generated_header_path, "Path to the Unicode locale header file to generate", "generated-header-path", 'h', "generated-header-path"); 1124 args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); 1125 args_parser.add_option(core_path, "Path to cldr-core directory", "core-path", 'r', "core-path"); 1126 args_parser.add_option(numbers_path, "Path to cldr-numbers directory", "numbers-path", 'n', "numbers-path"); 1127 args_parser.add_option(units_path, "Path to cldr-units directory", "units-path", 'u', "units-path"); 1128 args_parser.parse(arguments); 1129 1130 auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write)); 1131 auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write)); 1132 1133 CLDR cldr; 1134 TRY(parse_all_locales(core_path, numbers_path, units_path, cldr)); 1135 1136 TRY(generate_unicode_locale_header(*generated_header_file, cldr)); 1137 TRY(generate_unicode_locale_implementation(*generated_implementation_file, cldr)); 1138 1139 return 0; 1140}