Serenity Operating System
1/*
2 * Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include "../LibUnicode/GeneratorUtil.h" // FIXME: Move this somewhere common.
8#include <AK/AllOf.h>
9#include <AK/Array.h>
10#include <AK/CharacterTypes.h>
11#include <AK/DeprecatedString.h>
12#include <AK/Find.h>
13#include <AK/Format.h>
14#include <AK/HashFunctions.h>
15#include <AK/HashMap.h>
16#include <AK/JsonObject.h>
17#include <AK/JsonParser.h>
18#include <AK/JsonValue.h>
19#include <AK/LexicalPath.h>
20#include <AK/QuickSort.h>
21#include <AK/SourceGenerator.h>
22#include <AK/StringBuilder.h>
23#include <AK/Traits.h>
24#include <AK/Utf8View.h>
25#include <LibCore/ArgsParser.h>
26#include <LibCore/DeprecatedFile.h>
27#include <LibCore/DirIterator.h>
28#include <LibJS/Runtime/Intl/SingleUnitIdentifiers.h>
29#include <LibLocale/Locale.h>
30#include <LibLocale/NumberFormat.h>
31#include <LibLocale/PluralRules.h>
32#include <math.h>
33
// Selects how a CLDR number pattern is post-processed by parse_number_pattern():
// Compact patterns additionally have their literal text extracted via parse_identifiers(),
// Standard patterns are returned as-is after placeholder substitution.
enum class NumberFormatType {
    Standard,
    Compact,
};
38
39struct NumberFormat : public Locale::NumberFormat {
40 using Base = Locale::NumberFormat;
41
42 unsigned hash() const
43 {
44 auto hash = pair_int_hash(magnitude, exponent);
45 hash = pair_int_hash(hash, to_underlying(plurality));
46 hash = pair_int_hash(hash, zero_format_index);
47 hash = pair_int_hash(hash, positive_format_index);
48 hash = pair_int_hash(hash, negative_format_index);
49
50 for (auto index : identifier_indices)
51 hash = pair_int_hash(hash, index);
52
53 return hash;
54 }
55
56 bool operator==(NumberFormat const& other) const
57 {
58 return (magnitude == other.magnitude)
59 && (exponent == other.exponent)
60 && (plurality == other.plurality)
61 && (zero_format_index == other.zero_format_index)
62 && (positive_format_index == other.positive_format_index)
63 && (negative_format_index == other.negative_format_index)
64 && (identifier_indices == other.identifier_indices);
65 }
66
67 size_t zero_format_index { 0 };
68 size_t positive_format_index { 0 };
69 size_t negative_format_index { 0 };
70 Vector<size_t> identifier_indices {};
71};
72
73template<>
74struct AK::Formatter<NumberFormat> : Formatter<FormatString> {
75 ErrorOr<void> format(FormatBuilder& builder, NumberFormat const& format)
76 {
77 StringBuilder identifier_indices;
78 identifier_indices.join(", "sv, format.identifier_indices);
79
80 return Formatter<FormatString>::format(builder,
81 "{{ {}, {}, {}, {}, {}, {}, {{ {} }} }}"sv,
82 format.magnitude,
83 format.exponent,
84 to_underlying(format.plurality),
85 format.zero_format_index,
86 format.positive_format_index,
87 format.negative_format_index,
88 identifier_indices.to_deprecated_string());
89 }
90};
91
92template<>
93struct AK::Traits<NumberFormat> : public GenericTraits<NumberFormat> {
94 static unsigned hash(NumberFormat const& f) { return f.hash(); }
95};
96
97using NumberFormatList = Vector<size_t>;
98using NumericSymbolList = Vector<size_t>;
99
100struct NumberSystem {
101 unsigned hash() const
102 {
103 auto hash = int_hash(symbols);
104 hash = pair_int_hash(hash, primary_grouping_size);
105 hash = pair_int_hash(hash, secondary_grouping_size);
106 hash = pair_int_hash(hash, decimal_format);
107 hash = pair_int_hash(hash, decimal_long_formats);
108 hash = pair_int_hash(hash, decimal_short_formats);
109 hash = pair_int_hash(hash, currency_format);
110 hash = pair_int_hash(hash, accounting_format);
111 hash = pair_int_hash(hash, currency_unit_formats);
112 hash = pair_int_hash(hash, currency_short_formats);
113 hash = pair_int_hash(hash, percent_format);
114 hash = pair_int_hash(hash, scientific_format);
115 return hash;
116 }
117
118 bool operator==(NumberSystem const& other) const
119 {
120 return (symbols == other.symbols)
121 && (primary_grouping_size == other.primary_grouping_size)
122 && (secondary_grouping_size == other.secondary_grouping_size)
123 && (decimal_format == other.decimal_format)
124 && (decimal_long_formats == other.decimal_long_formats)
125 && (decimal_short_formats == other.decimal_short_formats)
126 && (currency_format == other.currency_format)
127 && (accounting_format == other.accounting_format)
128 && (currency_unit_formats == other.currency_unit_formats)
129 && (currency_short_formats == other.currency_short_formats)
130 && (percent_format == other.percent_format)
131 && (scientific_format == other.scientific_format);
132 }
133
134 size_t symbols { 0 };
135
136 u8 primary_grouping_size { 0 };
137 u8 secondary_grouping_size { 0 };
138
139 size_t decimal_format { 0 };
140 size_t decimal_long_formats { 0 };
141 size_t decimal_short_formats { 0 };
142
143 size_t currency_format { 0 };
144 size_t accounting_format { 0 };
145 size_t currency_unit_formats { 0 };
146 size_t currency_short_formats { 0 };
147
148 size_t percent_format { 0 };
149 size_t scientific_format { 0 };
150};
151
152template<>
153struct AK::Formatter<NumberSystem> : Formatter<FormatString> {
154 ErrorOr<void> format(FormatBuilder& builder, NumberSystem const& system)
155 {
156 return Formatter<FormatString>::format(builder,
157 "{{ {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {} }}"sv,
158 system.symbols,
159 system.primary_grouping_size,
160 system.secondary_grouping_size,
161 system.decimal_format,
162 system.decimal_long_formats,
163 system.decimal_short_formats,
164 system.currency_format,
165 system.accounting_format,
166 system.currency_unit_formats,
167 system.currency_short_formats,
168 system.percent_format,
169 system.scientific_format);
170 }
171};
172
173template<>
174struct AK::Traits<NumberSystem> : public GenericTraits<NumberSystem> {
175 static unsigned hash(NumberSystem const& s) { return s.hash(); }
176};
177
178struct Unit {
179 unsigned hash() const
180 {
181 auto hash = int_hash(unit);
182 hash = pair_int_hash(hash, long_formats);
183 hash = pair_int_hash(hash, short_formats);
184 hash = pair_int_hash(hash, narrow_formats);
185 return hash;
186 }
187
188 bool operator==(Unit const& other) const
189 {
190 return (unit == other.unit)
191 && (long_formats == other.long_formats)
192 && (short_formats == other.short_formats)
193 && (narrow_formats == other.narrow_formats);
194 }
195
196 size_t unit { 0 };
197 size_t long_formats { 0 };
198 size_t short_formats { 0 };
199 size_t narrow_formats { 0 };
200};
201
202template<>
203struct AK::Formatter<Unit> : Formatter<FormatString> {
204 ErrorOr<void> format(FormatBuilder& builder, Unit const& system)
205 {
206 return Formatter<FormatString>::format(builder,
207 "{{ {}, {}, {}, {} }}"sv,
208 system.unit,
209 system.long_formats,
210 system.short_formats,
211 system.narrow_formats);
212 }
213};
214
215template<>
216struct AK::Traits<Unit> : public GenericTraits<Unit> {
217 static unsigned hash(Unit const& u) { return u.hash(); }
218};
219
220struct LocaleData {
221 Vector<size_t> number_systems;
222 HashMap<DeprecatedString, size_t> units {};
223 u8 minimum_grouping_digits { 0 };
224};
225
226struct CLDR {
227 UniqueStringStorage unique_strings;
228 UniqueStorage<NumberFormat> unique_formats;
229 UniqueStorage<NumberFormatList> unique_format_lists;
230 UniqueStorage<NumericSymbolList> unique_symbols;
231 UniqueStorage<NumberSystem> unique_systems;
232 UniqueStorage<Unit> unique_units;
233
234 HashMap<DeprecatedString, Array<u32, 10>> number_system_digits;
235 Vector<DeprecatedString> number_systems;
236
237 HashMap<DeprecatedString, LocaleData> locales;
238 size_t max_identifier_count { 0 };
239};
240
241static ErrorOr<void> parse_number_system_digits(DeprecatedString core_supplemental_path, CLDR& cldr)
242{
243 LexicalPath number_systems_path(move(core_supplemental_path));
244 number_systems_path = number_systems_path.append("numberingSystems.json"sv);
245
246 auto number_systems = TRY(read_json_file(number_systems_path.string()));
247 auto const& supplemental_object = number_systems.as_object().get_object("supplemental"sv).value();
248 auto const& number_systems_object = supplemental_object.get_object("numberingSystems"sv).value();
249
250 number_systems_object.for_each_member([&](auto const& number_system, auto const& digits_object) {
251 auto type = digits_object.as_object().get_deprecated_string("_type"sv).value();
252 if (type != "numeric"sv)
253 return;
254
255 auto digits = digits_object.as_object().get_deprecated_string("_digits"sv).value();
256
257 Utf8View utf8_digits { digits };
258 VERIFY(utf8_digits.length() == 10);
259
260 auto& number_system_digits = cldr.number_system_digits.ensure(number_system);
261 size_t index = 0;
262
263 for (u32 digit : utf8_digits)
264 number_system_digits[index++] = digit;
265
266 if (!cldr.number_systems.contains_slow(number_system))
267 cldr.number_systems.append(number_system);
268 });
269
270 return {};
271}
272
273static DeprecatedString parse_identifiers(DeprecatedString pattern, StringView replacement, CLDR& cldr, NumberFormat& format)
274{
275 static constexpr Utf8View whitespace { "\u0020\u00a0\u200f"sv };
276
277 while (true) {
278 Utf8View utf8_pattern { pattern };
279 Optional<size_t> start_index;
280 Optional<size_t> end_index;
281 bool inside_replacement = false;
282
283 for (auto it = utf8_pattern.begin(); it != utf8_pattern.end(); ++it) {
284 if (*it == '{') {
285 if (start_index.has_value()) {
286 end_index = utf8_pattern.byte_offset_of(it);
287 break;
288 }
289
290 inside_replacement = true;
291 } else if (*it == '}') {
292 inside_replacement = false;
293 } else if (!inside_replacement && !start_index.has_value() && !whitespace.contains(*it)) {
294 start_index = utf8_pattern.byte_offset_of(it);
295 }
296 }
297
298 if (!start_index.has_value())
299 return pattern;
300
301 end_index = end_index.value_or(pattern.length());
302
303 utf8_pattern = utf8_pattern.substring_view(*start_index, *end_index - *start_index);
304 utf8_pattern = utf8_pattern.trim(whitespace);
305
306 auto identifier = utf8_pattern.as_string().replace("'.'"sv, "."sv, ReplaceMode::FirstOnly);
307 auto identifier_index = cldr.unique_strings.ensure(move(identifier));
308 size_t replacement_index = 0;
309
310 if (auto index = format.identifier_indices.find_first_index(identifier_index); index.has_value()) {
311 replacement_index = *index;
312 } else {
313 replacement_index = format.identifier_indices.size();
314 format.identifier_indices.append(identifier_index);
315
316 cldr.max_identifier_count = max(cldr.max_identifier_count, format.identifier_indices.size());
317 }
318
319 pattern = DeprecatedString::formatted("{}{{{}:{}}}{}",
320 *start_index > 0 ? pattern.substring_view(0, *start_index) : ""sv,
321 replacement,
322 replacement_index,
323 pattern.substring_view(*start_index + utf8_pattern.byte_length()));
324 }
325}
326
327static void parse_number_pattern(Vector<DeprecatedString> patterns, CLDR& cldr, NumberFormatType type, NumberFormat& format, NumberSystem* number_system_for_groupings = nullptr)
328{
329 // https://unicode.org/reports/tr35/tr35-numbers.html#Number_Format_Patterns
330 // https://cldr.unicode.org/translation/number-currency-formats/number-and-currency-patterns
331 VERIFY((patterns.size() == 1) || (patterns.size() == 2));
332
333 auto replace_patterns = [&](DeprecatedString pattern) {
334 static HashMap<StringView, StringView> replacements = {
335 { "{0}"sv, "{number}"sv },
336 { "{1}"sv, "{currency}"sv },
337 { "%"sv, "{percentSign}"sv },
338 { "+"sv, "{plusSign}"sv },
339 { "-"sv, "{minusSign}"sv },
340 { "¤"sv, "{currency}"sv }, // U+00A4 Currency Sign
341 { "E"sv, "{scientificSeparator}"sv },
342 };
343
344 for (auto const& replacement : replacements)
345 pattern = pattern.replace(replacement.key, replacement.value, ReplaceMode::All);
346
347 if (auto start_number_index = pattern.find_any_of("#0"sv, DeprecatedString::SearchDirection::Forward); start_number_index.has_value()) {
348 auto end_number_index = *start_number_index + 1;
349
350 for (; end_number_index < pattern.length(); ++end_number_index) {
351 auto ch = pattern[end_number_index];
352 if ((ch != '#') && (ch != '0') && (ch != ',') && (ch != '.'))
353 break;
354 }
355
356 if (number_system_for_groupings) {
357 auto number_pattern = pattern.substring_view(*start_number_index, end_number_index - *start_number_index);
358
359 auto group_separators = number_pattern.find_all(","sv);
360 VERIFY((group_separators.size() == 1) || (group_separators.size() == 2));
361
362 auto decimal = number_pattern.find('.');
363 VERIFY(decimal.has_value());
364
365 if (group_separators.size() == 1) {
366 number_system_for_groupings->primary_grouping_size = *decimal - group_separators[0] - 1;
367 number_system_for_groupings->secondary_grouping_size = number_system_for_groupings->primary_grouping_size;
368 } else {
369 number_system_for_groupings->primary_grouping_size = *decimal - group_separators[1] - 1;
370 number_system_for_groupings->secondary_grouping_size = group_separators[1] - group_separators[0] - 1;
371 }
372 }
373
374 pattern = DeprecatedString::formatted("{}{{number}}{}",
375 *start_number_index > 0 ? pattern.substring_view(0, *start_number_index) : ""sv,
376 pattern.substring_view(end_number_index));
377
378 // This is specifically handled here rather than in the replacements HashMap above so
379 // that we do not errantly replace zeroes in number patterns.
380 if (pattern.contains(*replacements.get("E"sv)))
381 pattern = pattern.replace("0"sv, "{scientificExponent}"sv, ReplaceMode::FirstOnly);
382 }
383
384 if (type == NumberFormatType::Compact)
385 return parse_identifiers(move(pattern), "compactIdentifier"sv, cldr, format);
386
387 return pattern;
388 };
389
390 auto zero_format = replace_patterns(move(patterns[0]));
391 format.positive_format_index = cldr.unique_strings.ensure(DeprecatedString::formatted("{{plusSign}}{}", zero_format));
392
393 if (patterns.size() == 2) {
394 auto negative_format = replace_patterns(move(patterns[1]));
395 format.negative_format_index = cldr.unique_strings.ensure(move(negative_format));
396 } else {
397 format.negative_format_index = cldr.unique_strings.ensure(DeprecatedString::formatted("{{minusSign}}{}", zero_format));
398 }
399
400 format.zero_format_index = cldr.unique_strings.ensure(move(zero_format));
401}
402
403static void parse_number_pattern(Vector<DeprecatedString> patterns, CLDR& cldr, NumberFormatType type, size_t& format_index, NumberSystem* number_system_for_groupings = nullptr)
404{
405 NumberFormat format {};
406 parse_number_pattern(move(patterns), cldr, type, format, number_system_for_groupings);
407
408 format_index = cldr.unique_formats.ensure(move(format));
409}
410
411static ErrorOr<void> parse_number_systems(DeprecatedString locale_numbers_path, CLDR& cldr, LocaleData& locale)
412{
413 LexicalPath numbers_path(move(locale_numbers_path));
414 numbers_path = numbers_path.append("numbers.json"sv);
415
416 auto numbers = TRY(read_json_file(numbers_path.string()));
417 auto const& main_object = numbers.as_object().get_object("main"sv).value();
418 auto const& locale_object = main_object.get_object(numbers_path.parent().basename()).value();
419 auto const& locale_numbers_object = locale_object.get_object("numbers"sv).value();
420 auto const& minimum_grouping_digits = locale_numbers_object.get_deprecated_string("minimumGroupingDigits"sv).value();
421
422 Vector<Optional<NumberSystem>> number_systems;
423 number_systems.resize(cldr.number_systems.size());
424
425 auto ensure_number_system = [&](auto const& system) -> NumberSystem& {
426 auto system_index = cldr.number_systems.find_first_index(system).value();
427 VERIFY(system_index < number_systems.size());
428
429 auto& number_system = number_systems.at(system_index);
430 if (!number_system.has_value())
431 number_system = NumberSystem {};
432
433 return number_system.value();
434 };
435
436 auto parse_number_format = [&](auto const& format_object) {
437 Vector<size_t> result;
438 result.ensure_capacity(format_object.size());
439
440 format_object.for_each_member([&](auto const& key, JsonValue const& value) {
441 auto split_key = key.split_view('-');
442 if (split_key.size() != 3)
443 return;
444
445 auto patterns = value.as_string().split(';');
446 NumberFormat format {};
447
448 if (auto type = split_key[0].template to_uint<u64>(); type.has_value()) {
449 VERIFY(*type % 10 == 0);
450 format.magnitude = static_cast<u8>(log10(*type));
451
452 if (patterns[0] != "0"sv) {
453 auto number_of_zeroes_in_pattern = patterns[0].count("0"sv);
454 VERIFY(format.magnitude >= number_of_zeroes_in_pattern);
455
456 format.exponent = format.magnitude + 1 - number_of_zeroes_in_pattern;
457 }
458 } else {
459 VERIFY(split_key[0] == "unitPattern"sv);
460 }
461
462 format.plurality = Locale::plural_category_from_string(split_key[2]);
463 parse_number_pattern(move(patterns), cldr, NumberFormatType::Compact, format);
464
465 auto format_index = cldr.unique_formats.ensure(move(format));
466 result.append(format_index);
467 });
468
469 return cldr.unique_format_lists.ensure(move(result));
470 };
471
472 auto numeric_symbol_from_string = [&](StringView numeric_symbol) -> Optional<Locale::NumericSymbol> {
473 if (numeric_symbol == "approximatelySign"sv)
474 return Locale::NumericSymbol::ApproximatelySign;
475 if (numeric_symbol == "decimal"sv)
476 return Locale::NumericSymbol::Decimal;
477 if (numeric_symbol == "exponential"sv)
478 return Locale::NumericSymbol::Exponential;
479 if (numeric_symbol == "group"sv)
480 return Locale::NumericSymbol::Group;
481 if (numeric_symbol == "infinity"sv)
482 return Locale::NumericSymbol::Infinity;
483 if (numeric_symbol == "minusSign"sv)
484 return Locale::NumericSymbol::MinusSign;
485 if (numeric_symbol == "nan"sv)
486 return Locale::NumericSymbol::NaN;
487 if (numeric_symbol == "percentSign"sv)
488 return Locale::NumericSymbol::PercentSign;
489 if (numeric_symbol == "plusSign"sv)
490 return Locale::NumericSymbol::PlusSign;
491 if (numeric_symbol == "timeSeparator"sv)
492 return Locale::NumericSymbol::TimeSeparator;
493 return {};
494 };
495
496 locale_numbers_object.for_each_member([&](auto const& key, JsonValue const& value) {
497 constexpr auto symbols_prefix = "symbols-numberSystem-"sv;
498 constexpr auto decimal_formats_prefix = "decimalFormats-numberSystem-"sv;
499 constexpr auto currency_formats_prefix = "currencyFormats-numberSystem-"sv;
500 constexpr auto percent_formats_prefix = "percentFormats-numberSystem-"sv;
501 constexpr auto scientific_formats_prefix = "scientificFormats-numberSystem-"sv;
502 constexpr auto misc_patterns_prefix = "miscPatterns-numberSystem-"sv;
503
504 if (key.starts_with(symbols_prefix)) {
505 auto system = key.substring(symbols_prefix.length());
506 auto& number_system = ensure_number_system(system);
507
508 NumericSymbolList symbols;
509
510 value.as_object().for_each_member([&](auto const& symbol, JsonValue const& localization) {
511 auto numeric_symbol = numeric_symbol_from_string(symbol);
512 if (!numeric_symbol.has_value())
513 return;
514
515 if (to_underlying(*numeric_symbol) >= symbols.size())
516 symbols.resize(to_underlying(*numeric_symbol) + 1);
517
518 auto symbol_index = cldr.unique_strings.ensure(localization.as_string());
519 symbols[to_underlying(*numeric_symbol)] = symbol_index;
520 });
521
522 // The range separator does not appear in the symbols list, we have to extract it from
523 // the range pattern.
524 auto misc_patterns_key = DeprecatedString::formatted("{}{}", misc_patterns_prefix, system);
525 auto misc_patterns = locale_numbers_object.get_object(misc_patterns_key).value();
526 auto range_separator = misc_patterns.get_deprecated_string("range"sv).value();
527
528 auto begin_index = range_separator.find("{0}"sv).value() + "{0}"sv.length();
529 auto end_index = range_separator.find("{1}"sv).value();
530 range_separator = range_separator.substring(begin_index, end_index - begin_index);
531
532 if (to_underlying(Locale::NumericSymbol::RangeSeparator) >= symbols.size())
533 symbols.resize(to_underlying(Locale::NumericSymbol::RangeSeparator) + 1);
534
535 auto symbol_index = cldr.unique_strings.ensure(move(range_separator));
536 symbols[to_underlying(Locale::NumericSymbol::RangeSeparator)] = symbol_index;
537
538 number_system.symbols = cldr.unique_symbols.ensure(move(symbols));
539 } else if (key.starts_with(decimal_formats_prefix)) {
540 auto system = key.substring(decimal_formats_prefix.length());
541 auto& number_system = ensure_number_system(system);
542
543 auto format_object = value.as_object().get_deprecated_string("standard"sv).value();
544 parse_number_pattern(format_object.split(';'), cldr, NumberFormatType::Standard, number_system.decimal_format, &number_system);
545
546 auto const& long_format = value.as_object().get_object("long"sv)->get_object("decimalFormat"sv).value();
547 number_system.decimal_long_formats = parse_number_format(long_format);
548
549 auto const& short_format = value.as_object().get_object("short"sv)->get_object("decimalFormat"sv).value();
550 number_system.decimal_short_formats = parse_number_format(short_format);
551 } else if (key.starts_with(currency_formats_prefix)) {
552 auto system = key.substring(currency_formats_prefix.length());
553 auto& number_system = ensure_number_system(system);
554
555 auto format_object = value.as_object().get_deprecated_string("standard"sv).value();
556 parse_number_pattern(format_object.split(';'), cldr, NumberFormatType::Standard, number_system.currency_format);
557
558 format_object = value.as_object().get_deprecated_string("accounting"sv).value();
559 parse_number_pattern(format_object.split(';'), cldr, NumberFormatType::Standard, number_system.accounting_format);
560
561 number_system.currency_unit_formats = parse_number_format(value.as_object());
562
563 if (value.as_object().has_object("short"sv)) {
564 auto const& short_format = value.as_object().get_object("short"sv)->get_object("standard"sv).value();
565 number_system.currency_short_formats = parse_number_format(short_format);
566 }
567 } else if (key.starts_with(percent_formats_prefix)) {
568 auto system = key.substring(percent_formats_prefix.length());
569 auto& number_system = ensure_number_system(system);
570
571 auto format_object = value.as_object().get_deprecated_string("standard"sv).value();
572 parse_number_pattern(format_object.split(';'), cldr, NumberFormatType::Standard, number_system.percent_format);
573 } else if (key.starts_with(scientific_formats_prefix)) {
574 auto system = key.substring(scientific_formats_prefix.length());
575 auto& number_system = ensure_number_system(system);
576
577 auto format_object = value.as_object().get_deprecated_string("standard"sv).value();
578 parse_number_pattern(format_object.split(';'), cldr, NumberFormatType::Standard, number_system.scientific_format);
579 }
580 });
581
582 locale.number_systems.ensure_capacity(number_systems.size());
583
584 for (auto& number_system : number_systems) {
585 size_t system_index = 0;
586 if (number_system.has_value())
587 system_index = cldr.unique_systems.ensure(number_system.release_value());
588
589 locale.number_systems.append(system_index);
590 }
591
592 locale.minimum_grouping_digits = minimum_grouping_digits.template to_uint<u8>().value();
593 return {};
594}
595
596static ErrorOr<void> parse_units(DeprecatedString locale_units_path, CLDR& cldr, LocaleData& locale)
597{
598 LexicalPath units_path(move(locale_units_path));
599 units_path = units_path.append("units.json"sv);
600
601 auto locale_units = TRY(read_json_file(units_path.string()));
602 auto const& main_object = locale_units.as_object().get_object("main"sv).value();
603 auto const& locale_object = main_object.get_object(units_path.parent().basename()).value();
604 auto const& locale_units_object = locale_object.get_object("units"sv).value();
605 auto const& long_object = locale_units_object.get_object("long"sv).value();
606 auto const& short_object = locale_units_object.get_object("short"sv).value();
607 auto const& narrow_object = locale_units_object.get_object("narrow"sv).value();
608
609 HashMap<DeprecatedString, Unit> units;
610
611 auto ensure_unit = [&](auto const& unit) -> Unit& {
612 return units.ensure(unit, [&]() {
613 auto unit_index = cldr.unique_strings.ensure(unit);
614 return Unit { .unit = unit_index };
615 });
616 };
617
618 auto is_sanctioned_unit = [](StringView unit_name) {
619 // LibUnicode generally tries to avoid being directly dependent on ECMA-402, but this rather significantly reduces the amount
620 // of data generated here, and ECMA-402 is currently the only consumer of this data.
621 constexpr auto sanctioned_units = JS::Intl::sanctioned_single_unit_identifiers();
622 return find(sanctioned_units.begin(), sanctioned_units.end(), unit_name) != sanctioned_units.end();
623 };
624
625 auto parse_units_object = [&](auto const& units_object, Locale::Style style) {
626 constexpr auto unit_pattern_prefix = "unitPattern-count-"sv;
627 constexpr auto combined_unit_separator = "-per-"sv;
628
629 units_object.for_each_member([&](auto const& key, JsonValue const& value) {
630 auto end_of_category = key.find('-');
631 if (!end_of_category.has_value())
632 return;
633
634 auto unit_name = key.substring(*end_of_category + 1);
635
636 if (!is_sanctioned_unit(unit_name)) {
637 auto indices = unit_name.find_all(combined_unit_separator);
638 if (indices.size() != 1)
639 return;
640
641 auto numerator = unit_name.substring_view(0, indices[0]);
642 auto denominator = unit_name.substring_view(indices[0] + combined_unit_separator.length());
643 if (!is_sanctioned_unit(numerator) || !is_sanctioned_unit(denominator))
644 return;
645 }
646
647 auto& unit = ensure_unit(unit_name);
648 NumberFormatList formats;
649
650 value.as_object().for_each_member([&](auto const& unit_key, JsonValue const& pattern_value) {
651 if (!unit_key.starts_with(unit_pattern_prefix))
652 return;
653
654 NumberFormat format {};
655
656 auto plurality = unit_key.substring_view(unit_pattern_prefix.length());
657 format.plurality = Locale::plural_category_from_string(plurality);
658
659 auto zero_format = pattern_value.as_string().replace("{0}"sv, "{number}"sv, ReplaceMode::FirstOnly);
660 zero_format = parse_identifiers(zero_format, "unitIdentifier"sv, cldr, format);
661
662 format.positive_format_index = cldr.unique_strings.ensure(zero_format.replace("{number}"sv, "{plusSign}{number}"sv, ReplaceMode::FirstOnly));
663 format.negative_format_index = cldr.unique_strings.ensure(zero_format.replace("{number}"sv, "{minusSign}{number}"sv, ReplaceMode::FirstOnly));
664 format.zero_format_index = cldr.unique_strings.ensure(move(zero_format));
665
666 formats.append(cldr.unique_formats.ensure(move(format)));
667 });
668
669 auto number_format_list_index = cldr.unique_format_lists.ensure(move(formats));
670
671 switch (style) {
672 case Locale::Style::Long:
673 unit.long_formats = number_format_list_index;
674 break;
675 case Locale::Style::Short:
676 unit.short_formats = number_format_list_index;
677 break;
678 case Locale::Style::Narrow:
679 unit.narrow_formats = number_format_list_index;
680 break;
681 default:
682 VERIFY_NOT_REACHED();
683 }
684 });
685 };
686
687 parse_units_object(long_object, Locale::Style::Long);
688 parse_units_object(short_object, Locale::Style::Short);
689 parse_units_object(narrow_object, Locale::Style::Narrow);
690
691 for (auto& unit : units) {
692 auto unit_index = cldr.unique_units.ensure(move(unit.value));
693 locale.units.set(unit.key, unit_index);
694 }
695
696 return {};
697}
698
699static ErrorOr<void> parse_all_locales(DeprecatedString core_path, DeprecatedString numbers_path, DeprecatedString units_path, CLDR& cldr)
700{
701 auto numbers_iterator = TRY(path_to_dir_iterator(move(numbers_path)));
702 auto units_iterator = TRY(path_to_dir_iterator(move(units_path)));
703
704 LexicalPath core_supplemental_path(move(core_path));
705 core_supplemental_path = core_supplemental_path.append("supplemental"sv);
706 VERIFY(Core::DeprecatedFile::is_directory(core_supplemental_path.string()));
707
708 TRY(parse_number_system_digits(core_supplemental_path.string(), cldr));
709
710 auto remove_variants_from_path = [&](DeprecatedString path) -> ErrorOr<DeprecatedString> {
711 auto parsed_locale = TRY(CanonicalLanguageID::parse(cldr.unique_strings, LexicalPath::basename(path)));
712
713 StringBuilder builder;
714 builder.append(cldr.unique_strings.get(parsed_locale.language));
715 if (auto script = cldr.unique_strings.get(parsed_locale.script); !script.is_empty())
716 builder.appendff("-{}", script);
717 if (auto region = cldr.unique_strings.get(parsed_locale.region); !region.is_empty())
718 builder.appendff("-{}", region);
719
720 return builder.to_deprecated_string();
721 };
722
723 while (numbers_iterator.has_next()) {
724 auto numbers_path = TRY(next_path_from_dir_iterator(numbers_iterator));
725 auto language = TRY(remove_variants_from_path(numbers_path));
726
727 auto& locale = cldr.locales.ensure(language);
728 TRY(parse_number_systems(numbers_path, cldr, locale));
729 }
730
731 while (units_iterator.has_next()) {
732 auto units_path = TRY(next_path_from_dir_iterator(units_iterator));
733 auto language = TRY(remove_variants_from_path(units_path));
734
735 auto& locale = cldr.locales.ensure(language);
736 TRY(parse_units(units_path, cldr, locale));
737 }
738
739 return {};
740}
741
742static DeprecatedString format_identifier(StringView, DeprecatedString identifier)
743{
744 return identifier.to_titlecase();
745}
746
747static ErrorOr<void> generate_unicode_locale_header(Core::BufferedFile& file, CLDR& cldr)
748{
749 StringBuilder builder;
750 SourceGenerator generator { builder };
751
752 generator.append(R"~~~(
753#include <AK/Types.h>
754
755#pragma once
756
757namespace Locale {
758)~~~");
759
760 generate_enum(generator, format_identifier, "NumberSystem"sv, {}, cldr.number_systems);
761
762 generator.append(R"~~~(
763}
764)~~~");
765
766 TRY(file.write_until_depleted(generator.as_string_view().bytes()));
767 return {};
768}
769
770static ErrorOr<void> generate_unicode_locale_implementation(Core::BufferedFile& file, CLDR& cldr)
771{
772 StringBuilder builder;
773 SourceGenerator generator { builder };
774 generator.set("string_index_type"sv, cldr.unique_strings.type_that_fits());
775 generator.set("number_format_index_type"sv, cldr.unique_formats.type_that_fits());
776 generator.set("number_format_list_index_type"sv, cldr.unique_format_lists.type_that_fits());
777 generator.set("numeric_symbol_list_index_type"sv, cldr.unique_symbols.type_that_fits());
778 generator.set("identifier_count", DeprecatedString::number(cldr.max_identifier_count));
779
780 generator.append(R"~~~(
781#include <AK/Array.h>
782#include <AK/BinarySearch.h>
783#include <AK/Optional.h>
784#include <AK/Span.h>
785#include <AK/StringView.h>
786#include <AK/Vector.h>
787#include <LibLocale/Locale.h>
788#include <LibLocale/LocaleData.h>
789#include <LibLocale/NumberFormat.h>
790#include <LibLocale/NumberFormatData.h>
791#include <LibLocale/PluralRules.h>
792
793namespace Locale {
794)~~~");
795
796 cldr.unique_strings.generate(generator);
797
798 generator.append(R"~~~(
799struct NumberFormatImpl {
800 ErrorOr<NumberFormat> to_unicode_number_format() const {
801 NumberFormat number_format {};
802
803 number_format.magnitude = magnitude;
804 number_format.exponent = exponent;
805 number_format.plurality = static_cast<PluralCategory>(plurality);
806 number_format.zero_format = decode_string(zero_format);
807 number_format.positive_format = decode_string(positive_format);
808 number_format.negative_format = decode_string(negative_format);
809
810 TRY(number_format.identifiers.try_ensure_capacity(identifiers.size()));
811 for (@string_index_type@ identifier : identifiers)
812 number_format.identifiers.unchecked_append(decode_string(identifier));
813
814 return number_format;
815 }
816
817 u8 magnitude { 0 };
818 u8 exponent { 0 };
819 u8 plurality { 0 };
820 @string_index_type@ zero_format { 0 };
821 @string_index_type@ positive_format { 0 };
822 @string_index_type@ negative_format { 0 };
823 Array<@string_index_type@, @identifier_count@> identifiers {};
824};
825
826struct NumberSystemData {
827 @numeric_symbol_list_index_type@ symbols { 0 };
828
829 u8 primary_grouping_size { 0 };
830 u8 secondary_grouping_size { 0 };
831
832 @number_format_index_type@ decimal_format { 0 };
833 @number_format_list_index_type@ decimal_long_formats { 0 };
834 @number_format_list_index_type@ decimal_short_formats { 0 };
835
836 @number_format_index_type@ currency_format { 0 };
837 @number_format_index_type@ accounting_format { 0 };
838 @number_format_list_index_type@ currency_unit_formats { 0 };
839 @number_format_list_index_type@ currency_short_formats { 0 };
840
841 @number_format_index_type@ percent_format { 0 };
842 @number_format_index_type@ scientific_format { 0 };
843};
844
845struct Unit {
846 @string_index_type@ unit { 0 };
847 @number_format_list_index_type@ long_formats { 0 };
848 @number_format_list_index_type@ short_formats { 0 };
849 @number_format_list_index_type@ narrow_formats { 0 };
850};
851)~~~");
852
853 cldr.unique_formats.generate(generator, "NumberFormatImpl"sv, "s_number_formats"sv, 10);
854 cldr.unique_format_lists.generate(generator, cldr.unique_formats.type_that_fits(), "s_number_format_lists"sv);
855 cldr.unique_symbols.generate(generator, cldr.unique_strings.type_that_fits(), "s_numeric_symbol_lists"sv);
856 cldr.unique_systems.generate(generator, "NumberSystemData"sv, "s_number_systems"sv, 10);
857 cldr.unique_units.generate(generator, "Unit"sv, "s_units"sv, 10);
858
859 auto locales = cldr.locales.keys();
860 quick_sort(locales);
861
862 generator.set("size", DeprecatedString::number(locales.size()));
863 generator.append(R"~~~(
864static constexpr Array<u8, @size@> s_minimum_grouping_digits { { )~~~");
865
866 bool first = true;
867 for (auto const& locale : locales) {
868 generator.append(first ? " "sv : ", "sv);
869 generator.append(DeprecatedString::number(cldr.locales.find(locale)->value.minimum_grouping_digits));
870 first = false;
871 }
872 generator.append(" } };\n");
873
874 auto append_map = [&](DeprecatedString name, auto type, auto const& map) {
875 generator.set("name", name);
876 generator.set("type", type);
877 generator.set("size", DeprecatedString::number(map.size()));
878
879 generator.append(R"~~~(
880static constexpr Array<@type@, @size@> @name@ { {)~~~");
881
882 bool first = true;
883 for (auto const& item : map) {
884 generator.append(first ? " "sv : ", "sv);
885 if constexpr (requires { item.value; })
886 generator.append(DeprecatedString::number(item.value));
887 else
888 generator.append(DeprecatedString::number(item));
889 first = false;
890 }
891
892 generator.append(" } };");
893 };
894
895 generate_mapping(generator, cldr.number_system_digits, "u32"sv, "s_number_systems_digits"sv, "s_number_systems_digits_{}"sv, nullptr, [&](auto const& name, auto const& value) { append_map(name, "u32"sv, value); });
896 generate_mapping(generator, cldr.locales, cldr.unique_systems.type_that_fits(), "s_locale_number_systems"sv, "s_number_systems_{}"sv, nullptr, [&](auto const& name, auto const& value) { append_map(name, cldr.unique_systems.type_that_fits(), value.number_systems); });
897 generate_mapping(generator, cldr.locales, cldr.unique_units.type_that_fits(), "s_locale_units"sv, "s_units_{}"sv, nullptr, [&](auto const& name, auto const& value) { append_map(name, cldr.unique_units.type_that_fits(), value.units); });
898
899 generator.append(R"~~~(
900static Optional<NumberSystem> keyword_to_number_system(KeywordNumbers keyword)
901{
902 switch (keyword) {)~~~");
903
904 for (auto const& number_system : cldr.number_systems) {
905 generator.set("name"sv, format_identifier({}, number_system));
906 generator.append(R"~~~(
907 case KeywordNumbers::@name@:
908 return NumberSystem::@name@;)~~~");
909 }
910
911 generator.append(R"~~~(
912 default:
913 return {};
914 }
915}
916
917Optional<ReadonlySpan<u32>> get_digits_for_number_system(StringView system)
918{
919 auto number_system_keyword = keyword_nu_from_string(system);
920 if (!number_system_keyword.has_value())
921 return {};
922
923 auto number_system_value = keyword_to_number_system(*number_system_keyword);
924 if (!number_system_value.has_value())
925 return {};
926
927 auto number_system_index = to_underlying(*number_system_value);
928 return s_number_systems_digits[number_system_index];
929}
930
931static ErrorOr<NumberSystemData const*> find_number_system(StringView locale, StringView system)
932{
933 auto locale_value = locale_from_string(locale);
934 if (!locale_value.has_value())
935 return nullptr;
936
937 auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
938 auto const& number_systems = s_locale_number_systems.at(locale_index);
939
940 auto lookup_number_system = [&](auto number_system) -> NumberSystemData const* {
941 auto number_system_keyword = keyword_nu_from_string(number_system);
942 if (!number_system_keyword.has_value())
943 return nullptr;
944
945 auto number_system_value = keyword_to_number_system(*number_system_keyword);
946 if (!number_system_value.has_value())
947 return nullptr;
948
949 auto number_system_index = to_underlying(*number_system_value);
950 number_system_index = number_systems.at(number_system_index);
951
952 if (number_system_index == 0)
953 return nullptr;
954
955 return &s_number_systems.at(number_system_index);
956 };
957
958 if (auto const* number_system = lookup_number_system(system))
959 return number_system;
960
961 auto default_number_system = TRY(get_preferred_keyword_value_for_locale(locale, "nu"sv));
962 if (!default_number_system.has_value())
963 return nullptr;
964
965 return lookup_number_system(*default_number_system);
966}
967
968ErrorOr<Optional<StringView>> get_number_system_symbol(StringView locale, StringView system, NumericSymbol symbol)
969{
970 if (auto const* number_system = TRY(find_number_system(locale, system)); number_system != nullptr) {
971 auto symbols = s_numeric_symbol_lists.at(number_system->symbols);
972
973 auto symbol_index = to_underlying(symbol);
974 if (symbol_index >= symbols.size())
975 return OptionalNone {};
976
977 return Optional<StringView> { decode_string(symbols[symbol_index]) };
978 }
979
980 return OptionalNone {};
981}
982
983ErrorOr<Optional<NumberGroupings>> get_number_system_groupings(StringView locale, StringView system)
984{
985 auto locale_value = locale_from_string(locale);
986 if (!locale_value.has_value())
987 return OptionalNone {};
988
989 u8 minimum_grouping_digits = s_minimum_grouping_digits[to_underlying(*locale_value) - 1];
990
991 if (auto const* number_system = TRY(find_number_system(locale, system)); number_system != nullptr)
992 return NumberGroupings { minimum_grouping_digits, number_system->primary_grouping_size, number_system->secondary_grouping_size };
993 return OptionalNone {};
994}
995
996ErrorOr<Optional<NumberFormat>> get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type)
997{
998 if (auto const* number_system = TRY(find_number_system(locale, system)); number_system != nullptr) {
999 @number_format_index_type@ format_index = 0;
1000
1001 switch (type) {
1002 case StandardNumberFormatType::Decimal:
1003 format_index = number_system->decimal_format;
1004 break;
1005 case StandardNumberFormatType::Currency:
1006 format_index = number_system->currency_format;
1007 break;
1008 case StandardNumberFormatType::Accounting:
1009 format_index = number_system->accounting_format;
1010 break;
1011 case StandardNumberFormatType::Percent:
1012 format_index = number_system->percent_format;
1013 break;
1014 case StandardNumberFormatType::Scientific:
1015 format_index = number_system->scientific_format;
1016 break;
1017 }
1018
1019 return TRY(s_number_formats[format_index].to_unicode_number_format());
1020 }
1021
1022 return OptionalNone {};
1023}
1024
1025ErrorOr<Vector<NumberFormat>> get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type)
1026{
1027 Vector<NumberFormat> formats;
1028
1029 if (auto const* number_system = TRY(find_number_system(locale, system)); number_system != nullptr) {
1030 @number_format_list_index_type@ number_format_list_index { 0 };
1031
1032 switch (type) {
1033 case CompactNumberFormatType::DecimalLong:
1034 number_format_list_index = number_system->decimal_long_formats;
1035 break;
1036 case CompactNumberFormatType::DecimalShort:
1037 number_format_list_index = number_system->decimal_short_formats;
1038 break;
1039 case CompactNumberFormatType::CurrencyUnit:
1040 number_format_list_index = number_system->currency_unit_formats;
1041 break;
1042 case CompactNumberFormatType::CurrencyShort:
1043 number_format_list_index = number_system->currency_short_formats;
1044 break;
1045 }
1046
1047 auto number_formats = s_number_format_lists.at(number_format_list_index);
1048 TRY(formats.try_ensure_capacity(number_formats.size()));
1049
1050 for (auto number_format : number_formats)
1051 formats.unchecked_append(TRY(s_number_formats[number_format].to_unicode_number_format()));
1052 }
1053
1054 return formats;
1055}
1056
1057static Unit const* find_units(StringView locale, StringView unit)
1058{
1059 auto locale_value = locale_from_string(locale);
1060 if (!locale_value.has_value())
1061 return nullptr;
1062
1063 auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
1064 auto const& locale_units = s_locale_units.at(locale_index);
1065
1066 for (auto unit_index : locale_units) {
1067 auto const& units = s_units.at(unit_index);
1068
1069 if (unit == decode_string(units.unit))
1070 return &units;
1071 };
1072
1073 return nullptr;
1074}
1075
1076ErrorOr<Vector<NumberFormat>> get_unit_formats(StringView locale, StringView unit, Style style)
1077{
1078 Vector<NumberFormat> formats;
1079
1080 if (auto const* units = find_units(locale, unit); units != nullptr) {
1081 @number_format_list_index_type@ number_format_list_index { 0 };
1082
1083 switch (style) {
1084 case Style::Long:
1085 number_format_list_index = units->long_formats;
1086 break;
1087 case Style::Short:
1088 number_format_list_index = units->short_formats;
1089 break;
1090 case Style::Narrow:
1091 number_format_list_index = units->narrow_formats;
1092 break;
1093 default:
1094 VERIFY_NOT_REACHED();
1095 }
1096
1097 auto number_formats = s_number_format_lists.at(number_format_list_index);
1098 TRY(formats.try_ensure_capacity(number_formats.size()));
1099
1100 for (auto number_format : number_formats)
1101 formats.unchecked_append(TRY(s_number_formats[number_format].to_unicode_number_format()));
1102 }
1103
1104 return formats;
1105}
1106
1107}
1108)~~~");
1109
1110 TRY(file.write_until_depleted(generator.as_string_view().bytes()));
1111 return {};
1112}
1113
1114ErrorOr<int> serenity_main(Main::Arguments arguments)
1115{
1116 StringView generated_header_path;
1117 StringView generated_implementation_path;
1118 StringView core_path;
1119 StringView numbers_path;
1120 StringView units_path;
1121
1122 Core::ArgsParser args_parser;
1123 args_parser.add_option(generated_header_path, "Path to the Unicode locale header file to generate", "generated-header-path", 'h', "generated-header-path");
1124 args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
1125 args_parser.add_option(core_path, "Path to cldr-core directory", "core-path", 'r', "core-path");
1126 args_parser.add_option(numbers_path, "Path to cldr-numbers directory", "numbers-path", 'n', "numbers-path");
1127 args_parser.add_option(units_path, "Path to cldr-units directory", "units-path", 'u', "units-path");
1128 args_parser.parse(arguments);
1129
1130 auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write));
1131 auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write));
1132
1133 CLDR cldr;
1134 TRY(parse_all_locales(core_path, numbers_path, units_path, cldr));
1135
1136 TRY(generate_unicode_locale_header(*generated_header_file, cldr));
1137 TRY(generate_unicode_locale_implementation(*generated_implementation_file, cldr));
1138
1139 return 0;
1140}