Serenity Operating System
1/*
2 * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include "GeneratorUtil.h"
8#include <AK/AllOf.h>
9#include <AK/Array.h>
10#include <AK/CharacterTypes.h>
11#include <AK/DeprecatedString.h>
12#include <AK/Error.h>
13#include <AK/Find.h>
14#include <AK/HashMap.h>
15#include <AK/Optional.h>
16#include <AK/QuickSort.h>
17#include <AK/SourceGenerator.h>
18#include <AK/StringUtils.h>
19#include <AK/Types.h>
20#include <AK/Vector.h>
21#include <LibCore/ArgsParser.h>
22
// Some code points are excluded from UnicodeData.txt, and instead are part of a "range" of code
// points, as indicated by the "name" field. For example:
//     3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
//     4DBF;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
//
// Both end points are inclusive; a single code point is stored with first == last.
struct CodePointRange {
    u32 first; // First code point of the range.
    u32 last;  // Last code point of the range (inclusive).
};
31
32// https://www.unicode.org/reports/tr44/#SpecialCasing.txt
33struct SpecialCasing {
34 u32 index { 0 };
35 u32 code_point { 0 };
36 Vector<u32> lowercase_mapping;
37 Vector<u32> uppercase_mapping;
38 Vector<u32> titlecase_mapping;
39 DeprecatedString locale;
40 DeprecatedString condition;
41};
42
43// https://www.unicode.org/reports/tr44/#CaseFolding.txt
44struct CaseFolding {
45 u32 code_point { 0 };
46 StringView status { "Common"sv };
47 Vector<u32> mapping { 0 };
48};
49
50// https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
51struct CodePointDecomposition {
52 // `tag` is a string since it's used for codegen as an enum value.
53 DeprecatedString tag { "Canonical"sv };
54 size_t decomposition_index { 0 };
55 size_t decomposition_size { 0 };
56};
57
// https://www.unicode.org/reports/tr44/#PropList.txt
// Maps a property (or property value) name to every code point range that carries it.
using PropList = HashMap<DeprecatedString, Vector<CodePointRange>>;
60
// https://www.unicode.org/reports/tr44/#DerivedNormalizationProps.txt
// Quick-check value of a normalization property entry.
enum class QuickCheck {
    Yes,   // Used when an entry carries neither "N" nor "M".
    No,    // Parsed from "N".
    Maybe, // Parsed from "M".
};
67
68struct Normalization {
69 CodePointRange code_point_range;
70 Vector<u32> value;
71 QuickCheck quick_check { QuickCheck::Yes };
72};
73
// Maps a normalization property name to all entries parsed for that property.
using NormalizationProps = HashMap<DeprecatedString, Vector<Normalization>>;
75
76struct CodePointName {
77 CodePointRange code_point_range;
78 size_t name { 0 };
79};
80
81// https://www.unicode.org/reports/tr44/#UnicodeData.txt
82struct CodePointData {
83 u32 code_point { 0 };
84 DeprecatedString name;
85 Optional<size_t> abbreviation;
86 u8 canonical_combining_class { 0 };
87 DeprecatedString bidi_class;
88 Optional<CodePointDecomposition> decomposition_mapping;
89 Optional<i8> numeric_value_decimal;
90 Optional<i8> numeric_value_digit;
91 Optional<i8> numeric_value_numeric;
92 bool bidi_mirrored { false };
93 DeprecatedString unicode_1_name;
94 DeprecatedString iso_comment;
95 Optional<u32> simple_uppercase_mapping;
96 Optional<u32> simple_lowercase_mapping;
97 Optional<u32> simple_titlecase_mapping;
98 Vector<u32> special_casing_indices;
99 Vector<u32> case_folding_indices;
100};
101
102struct BlockName {
103 CodePointRange code_point_range;
104 size_t name { 0 };
105};
106
107struct UnicodeData {
108 UniqueStringStorage unique_strings;
109
110 u32 code_points_with_non_zero_combining_class { 0 };
111
112 u32 code_points_with_decomposition_mapping { 0 };
113 Vector<u32> decomposition_mappings;
114 Vector<DeprecatedString> compatibility_tags;
115
116 u32 simple_uppercase_mapping_size { 0 };
117 u32 simple_lowercase_mapping_size { 0 };
118 u32 simple_titlecase_mapping_size { 0 };
119
120 Vector<SpecialCasing> special_casing;
121 u32 code_points_with_special_casing { 0 };
122 u32 largest_special_casing_mapping_size { 0 };
123 u32 largest_special_casing_size { 0 };
124 Vector<DeprecatedString> conditions;
125 Vector<DeprecatedString> locales;
126
127 Vector<CaseFolding> case_folding;
128 u32 code_points_with_case_folding { 0 };
129 u32 largest_case_folding_mapping_size { 0 };
130 u32 largest_case_folding_size { 0 };
131 Vector<StringView> statuses;
132
133 Vector<CodePointData> code_point_data;
134
135 HashMap<u32, size_t> code_point_abbreviations;
136 HashMap<u32, size_t> code_point_display_name_aliases;
137 Vector<CodePointName> code_point_display_names;
138
139 // https://www.unicode.org/reports/tr44/#General_Category_Values
140 PropList general_categories;
141 Vector<Alias> general_category_aliases;
142
143 // The Unicode standard defines additional properties (Any, Assigned, ASCII) which are not in
144 // any UCD file. Assigned code point ranges are derived as this generator is executed.
145 // https://unicode.org/reports/tr18/#General_Category_Property
146 PropList prop_list {
147 { "Any"sv, { { 0, 0x10ffff } } },
148 { "Assigned"sv, {} },
149 { "ASCII"sv, { { 0, 0x7f } } },
150 };
151 Vector<Alias> prop_aliases;
152
153 PropList script_list {
154 { "Unknown"sv, {} },
155 };
156 Vector<Alias> script_aliases;
157 PropList script_extensions;
158
159 PropList block_list {
160 { "No_Block"sv, {} },
161 };
162 Vector<Alias> block_aliases;
163 Vector<BlockName> block_display_names;
164
165 // FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize.
166 NormalizationProps normalization_props;
167
168 PropList grapheme_break_props;
169 PropList word_break_props;
170 PropList sentence_break_props;
171};
172
173static DeprecatedString sanitize_entry(DeprecatedString const& entry)
174{
175 auto sanitized = entry.replace("-"sv, "_"sv, ReplaceMode::All);
176 sanitized = sanitized.replace(" "sv, "_"sv, ReplaceMode::All);
177
178 StringBuilder builder;
179 bool next_is_upper = true;
180 for (auto ch : sanitized) {
181 if (next_is_upper)
182 builder.append_code_point(to_ascii_uppercase(ch));
183 else
184 builder.append_code_point(ch);
185 next_is_upper = ch == '_';
186 }
187
188 return builder.to_deprecated_string();
189}
190
191static Vector<u32> parse_code_point_list(StringView list)
192{
193 Vector<u32> code_points;
194
195 auto segments = list.split_view(' ');
196 for (auto const& code_point : segments)
197 code_points.append(AK::StringUtils::convert_to_uint_from_hex<u32>(code_point).value());
198
199 return code_points;
200}
201
202static CodePointRange parse_code_point_range(StringView list)
203{
204 CodePointRange code_point_range {};
205
206 if (list.contains(".."sv)) {
207 auto segments = list.split_view(".."sv);
208 VERIFY(segments.size() == 2);
209
210 auto begin = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
211 auto end = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[1]).value();
212 code_point_range = { begin, end };
213 } else {
214 auto code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(list).value();
215 code_point_range = { code_point, code_point };
216 }
217
218 return code_point_range;
219}
220
221static ErrorOr<void> parse_special_casing(Core::BufferedFile& file, UnicodeData& unicode_data)
222{
223 Array<u8, 1024> buffer;
224
225 while (TRY(file.can_read_line())) {
226 auto line = TRY(file.read_line(buffer));
227
228 if (line.is_empty() || line.starts_with('#'))
229 continue;
230
231 if (auto index = line.find('#'); index.has_value())
232 line = line.substring_view(0, *index);
233
234 auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
235 VERIFY(segments.size() == 5 || segments.size() == 6);
236
237 SpecialCasing casing {};
238 casing.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
239 casing.lowercase_mapping = parse_code_point_list(segments[1]);
240 casing.titlecase_mapping = parse_code_point_list(segments[2]);
241 casing.uppercase_mapping = parse_code_point_list(segments[3]);
242
243 if (auto condition = segments[4].trim_whitespace(); !condition.is_empty()) {
244 auto conditions = condition.split_view(' ', SplitBehavior::KeepEmpty);
245 VERIFY(conditions.size() == 1 || conditions.size() == 2);
246
247 if (conditions.size() == 2) {
248 casing.locale = conditions[0];
249 casing.condition = conditions[1];
250 } else if (all_of(conditions[0], is_ascii_lower_alpha)) {
251 casing.locale = conditions[0];
252 } else {
253 casing.condition = conditions[0];
254 }
255
256 if (!casing.locale.is_empty()) {
257 casing.locale = DeprecatedString::formatted("{:c}{}", to_ascii_uppercase(casing.locale[0]), casing.locale.substring_view(1));
258
259 if (!unicode_data.locales.contains_slow(casing.locale))
260 unicode_data.locales.append(casing.locale);
261 }
262
263 casing.condition = casing.condition.replace("_"sv, ""sv, ReplaceMode::All);
264
265 if (!casing.condition.is_empty() && !unicode_data.conditions.contains_slow(casing.condition))
266 unicode_data.conditions.append(casing.condition);
267 }
268
269 unicode_data.largest_special_casing_mapping_size = max(unicode_data.largest_special_casing_mapping_size, casing.lowercase_mapping.size());
270 unicode_data.largest_special_casing_mapping_size = max(unicode_data.largest_special_casing_mapping_size, casing.titlecase_mapping.size());
271 unicode_data.largest_special_casing_mapping_size = max(unicode_data.largest_special_casing_mapping_size, casing.uppercase_mapping.size());
272
273 unicode_data.special_casing.append(move(casing));
274 }
275
276 quick_sort(unicode_data.special_casing, [](auto const& lhs, auto const& rhs) {
277 if (lhs.code_point != rhs.code_point)
278 return lhs.code_point < rhs.code_point;
279 if (lhs.locale.is_empty() && !rhs.locale.is_empty())
280 return false;
281 if (!lhs.locale.is_empty() && rhs.locale.is_empty())
282 return true;
283 return lhs.locale < rhs.locale;
284 });
285
286 for (u32 i = 0; i < unicode_data.special_casing.size(); ++i)
287 unicode_data.special_casing[i].index = i;
288
289 return {};
290}
291
292static ErrorOr<void> parse_case_folding(Core::BufferedFile& file, UnicodeData& unicode_data)
293{
294 Array<u8, 1024> buffer;
295
296 while (TRY(file.can_read_line())) {
297 auto line = TRY(file.read_line(buffer));
298 if (line.is_empty() || line.starts_with('#'))
299 continue;
300
301 auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
302 VERIFY(segments.size() == 4);
303
304 CaseFolding folding {};
305 folding.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
306 folding.mapping = parse_code_point_list(segments[2]);
307
308 switch (segments[1].trim_whitespace()[0]) {
309 case 'C':
310 folding.status = "Common"sv;
311 break;
312 case 'F':
313 folding.status = "Full"sv;
314 break;
315 case 'S':
316 folding.status = "Simple"sv;
317 break;
318 case 'T':
319 folding.status = "Special"sv;
320 break;
321 }
322
323 unicode_data.largest_case_folding_mapping_size = max(unicode_data.largest_case_folding_mapping_size, folding.mapping.size());
324
325 if (!unicode_data.statuses.contains_slow(folding.status))
326 unicode_data.statuses.append(folding.status);
327
328 unicode_data.case_folding.append(move(folding));
329 }
330
331 quick_sort(unicode_data.case_folding, [](auto const& lhs, auto const& rhs) {
332 if (lhs.code_point != rhs.code_point)
333 return lhs.code_point < rhs.code_point;
334 return lhs.status < rhs.status;
335 });
336
337 return {};
338}
339
340static ErrorOr<void> parse_prop_list(Core::BufferedFile& file, PropList& prop_list, bool multi_value_property = false, bool sanitize_property = false)
341{
342 Array<u8, 1024> buffer;
343
344 while (TRY(file.can_read_line())) {
345 auto line = TRY(file.read_line(buffer));
346
347 if (line.is_empty() || line.starts_with('#'))
348 continue;
349
350 if (auto index = line.find('#'); index.has_value())
351 line = line.substring_view(0, *index);
352
353 auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
354 VERIFY(segments.size() == 2);
355
356 auto code_point_range = parse_code_point_range(segments[0].trim_whitespace());
357 Vector<StringView> properties;
358
359 if (multi_value_property)
360 properties = segments[1].trim_whitespace().split_view(' ');
361 else
362 properties = { segments[1].trim_whitespace() };
363
364 for (auto& property : properties) {
365 auto& code_points = prop_list.ensure(sanitize_property ? sanitize_entry(property).trim_whitespace().view() : property.trim_whitespace());
366 code_points.append(code_point_range);
367 }
368 }
369
370 return {};
371}
372
373static ErrorOr<void> parse_alias_list(Core::BufferedFile& file, PropList const& prop_list, Vector<Alias>& prop_aliases)
374{
375 DeprecatedString current_property;
376 Array<u8, 1024> buffer;
377
378 auto append_alias = [&](auto alias, auto property) {
379 // Note: The alias files contain lines such as "Hyphen = Hyphen", which we should just skip.
380 if (alias == property)
381 return;
382
383 // FIXME: We will, eventually, need to find where missing properties are located and parse them.
384 if (!prop_list.contains(property))
385 return;
386
387 prop_aliases.append({ property, alias });
388 };
389
390 while (TRY(file.can_read_line())) {
391 auto line = TRY(file.read_line(buffer));
392
393 if (line.is_empty() || line.starts_with('#')) {
394 if (line.ends_with("Properties"sv))
395 current_property = line.substring_view(2);
396 continue;
397 }
398
399 // Note: For now, we only care about Binary Property aliases for Unicode property escapes.
400 if (current_property != "Binary Properties"sv)
401 continue;
402
403 auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
404 VERIFY((segments.size() == 2) || (segments.size() == 3));
405
406 auto alias = segments[0].trim_whitespace();
407 auto property = segments[1].trim_whitespace();
408 append_alias(alias, property);
409
410 if (segments.size() == 3) {
411 alias = segments[2].trim_whitespace();
412 append_alias(alias, property);
413 }
414 }
415
416 return {};
417}
418
419static ErrorOr<void> parse_name_aliases(Core::BufferedFile& file, UnicodeData& unicode_data)
420{
421 Array<u8, 1024> buffer;
422
423 while (TRY(file.can_read_line())) {
424 auto line = TRY(file.read_line(buffer));
425
426 if (line.is_empty() || line.starts_with('#'))
427 continue;
428
429 auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
430 VERIFY(segments.size() == 3);
431
432 auto code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0].trim_whitespace());
433 auto alias = segments[1].trim_whitespace();
434 auto reason = segments[2].trim_whitespace();
435
436 if (reason == "abbreviation"sv) {
437 auto index = unicode_data.unique_strings.ensure(alias);
438 unicode_data.code_point_abbreviations.set(*code_point, index);
439 } else if (reason.is_one_of("correction"sv, "control"sv)) {
440 if (!unicode_data.code_point_display_name_aliases.contains(*code_point)) {
441 auto index = unicode_data.unique_strings.ensure(alias);
442 unicode_data.code_point_display_name_aliases.set(*code_point, index);
443 }
444 }
445 }
446
447 return {};
448}
449
450static ErrorOr<void> parse_value_alias_list(Core::BufferedFile& file, StringView desired_category, Vector<DeprecatedString> const& value_list, Vector<Alias>& prop_aliases, bool primary_value_is_first = true, bool sanitize_alias = false)
451{
452 TRY(file.seek(0, SeekMode::SetPosition));
453 Array<u8, 1024> buffer;
454
455 auto append_alias = [&](auto alias, auto value) {
456 // Note: The value alias file contains lines such as "Ahom = Ahom", which we should just skip.
457 if (alias == value)
458 return;
459
460 // FIXME: We will, eventually, need to find where missing properties are located and parse them.
461 if (!value_list.contains_slow(value))
462 return;
463
464 prop_aliases.append({ value, alias });
465 };
466
467 while (TRY(file.can_read_line())) {
468 auto line = TRY(file.read_line(buffer));
469
470 if (line.is_empty() || line.starts_with('#'))
471 continue;
472
473 if (auto index = line.find('#'); index.has_value())
474 line = line.substring_view(0, *index);
475
476 auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
477 auto category = segments[0].trim_whitespace();
478
479 if (category != desired_category)
480 continue;
481
482 VERIFY((segments.size() == 3) || (segments.size() == 4));
483 auto value = primary_value_is_first ? segments[1].trim_whitespace() : segments[2].trim_whitespace();
484 auto alias = primary_value_is_first ? segments[2].trim_whitespace() : segments[1].trim_whitespace();
485 append_alias(sanitize_alias ? sanitize_entry(alias).view() : alias, value);
486
487 if (segments.size() == 4) {
488 alias = segments[3].trim_whitespace();
489 append_alias(sanitize_alias ? sanitize_entry(alias).view() : alias, value);
490 }
491 }
492
493 return {};
494}
495
496static ErrorOr<void> parse_normalization_props(Core::BufferedFile& file, UnicodeData& unicode_data)
497{
498 Array<u8, 1024> buffer;
499
500 while (TRY(file.can_read_line())) {
501 auto line = TRY(file.read_line(buffer));
502
503 if (line.is_empty() || line.starts_with('#'))
504 continue;
505
506 if (auto index = line.find('#'); index.has_value())
507 line = line.substring_view(0, *index);
508
509 auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
510 VERIFY((segments.size() == 2) || (segments.size() == 3));
511
512 auto code_point_range = parse_code_point_range(segments[0].trim_whitespace());
513 auto property = segments[1].trim_whitespace().to_deprecated_string();
514
515 Vector<u32> value;
516 QuickCheck quick_check = QuickCheck::Yes;
517
518 if (segments.size() == 3) {
519 auto value_or_quick_check = segments[2].trim_whitespace();
520
521 if ((value_or_quick_check == "N"sv))
522 quick_check = QuickCheck::No;
523 else if ((value_or_quick_check == "M"sv))
524 quick_check = QuickCheck::Maybe;
525 else
526 value = parse_code_point_list(value_or_quick_check);
527 }
528
529 auto& normalizations = unicode_data.normalization_props.ensure(property);
530 normalizations.append({ code_point_range, move(value), quick_check });
531
532 auto& prop_list = unicode_data.prop_list.ensure(property);
533 prop_list.append(move(code_point_range));
534 }
535
536 return {};
537}
538
539static void add_canonical_code_point_name(CodePointRange range, StringView name, UnicodeData& unicode_data)
540{
541 // https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G142981
542 // FIXME: Implement the NR1 rules for Hangul syllables.
543
544 struct CodePointNameFormat {
545 CodePointRange code_point_range;
546 StringView name;
547 };
548
549 // These code point ranges are the NR2 set of name replacements defined by Table 4-8.
550 constexpr Array<CodePointNameFormat, 16> s_ideographic_replacements { {
551 { { 0x3400, 0x4DBF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
552 { { 0x4E00, 0x9FFF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
553 { { 0xF900, 0xFA6D }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv },
554 { { 0xFA70, 0xFAD9 }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv },
555 { { 0x17000, 0x187F7 }, "TANGUT IDEOGRAPH-{:X}"sv },
556 { { 0x18B00, 0x18CD5 }, "KHITAN SMALL SCRIPT CHARACTER-{:X}"sv },
557 { { 0x18D00, 0x18D08 }, "TANGUT IDEOGRAPH-{:X}"sv },
558 { { 0x1B170, 0x1B2FB }, "NUSHU CHARACTER-{:X}"sv },
559 { { 0x20000, 0x2A6DF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
560 { { 0x2A700, 0x2B739 }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
561 { { 0x2B740, 0x2B81D }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
562 { { 0x2B820, 0x2CEA1 }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
563 { { 0x2CEB0, 0x2EBE0 }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
564 { { 0x2F800, 0x2FA1D }, "CJK COMPATIBILITY IDEOGRAPH-{:X}"sv },
565 { { 0x30000, 0x3134A }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
566 { { 0x31350, 0x323AF }, "CJK UNIFIED IDEOGRAPH-{:X}"sv },
567 } };
568
569 auto it = find_if(s_ideographic_replacements.begin(), s_ideographic_replacements.end(),
570 [&](auto const& replacement) {
571 return replacement.code_point_range.first == range.first;
572 });
573
574 if (it != s_ideographic_replacements.end()) {
575 auto index = unicode_data.unique_strings.ensure(it->name);
576 unicode_data.code_point_display_names.append({ it->code_point_range, index });
577 return;
578 }
579
580 it = find_if(s_ideographic_replacements.begin(), s_ideographic_replacements.end(),
581 [&](auto const& replacement) {
582 return (replacement.code_point_range.first <= range.first) && (range.first <= replacement.code_point_range.last);
583 });
584
585 if (it != s_ideographic_replacements.end()) {
586 // Drop code points that will have been captured by a range defined by the ideographic replacements.
587 return;
588 }
589
590 if (auto alias = unicode_data.code_point_display_name_aliases.get(range.first); alias.has_value()) {
591 // NR4 states that control code points have a null string as their name. Our implementation
592 // uses the control code's alias as its display name.
593 unicode_data.code_point_display_names.append({ range, *alias });
594 return;
595 }
596
597 auto index = unicode_data.unique_strings.ensure(name);
598 unicode_data.code_point_display_names.append({ range, index });
599}
600
601static Optional<CodePointDecomposition> parse_decomposition_mapping(StringView string, UnicodeData& unicode_data)
602{
603 if (string.is_empty())
604 return {};
605
606 CodePointDecomposition mapping;
607
608 auto parts = string.split_view(' ');
609
610 VERIFY(parts.size() > 0);
611
612 if (parts.first().starts_with('<')) {
613 auto const tag = parts.take_first().trim("<>"sv);
614
615 mapping.tag = DeprecatedString::formatted("{:c}{}", to_ascii_uppercase(tag[0]), tag.substring_view(1));
616
617 if (!unicode_data.compatibility_tags.contains_slow(mapping.tag))
618 unicode_data.compatibility_tags.append(mapping.tag);
619 }
620
621 mapping.decomposition_index = unicode_data.decomposition_mappings.size();
622 mapping.decomposition_size = parts.size();
623 for (auto part : parts) {
624 unicode_data.decomposition_mappings.append(AK::StringUtils::convert_to_uint_from_hex<u32>(part).value());
625 }
626
627 return mapping;
628}
629
630static ErrorOr<void> parse_block_display_names(Core::BufferedFile& file, UnicodeData& unicode_data)
631{
632 Array<u8, 1024> buffer;
633 while (TRY(file.can_read_line())) {
634 auto line = TRY(file.read_line(buffer));
635 if (line.is_empty() || line.starts_with('#'))
636 continue;
637
638 auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
639 VERIFY(segments.size() == 2);
640
641 auto code_point_range = parse_code_point_range(segments[0].trim_whitespace());
642 auto display_name = segments[1].trim_whitespace();
643
644 auto index = unicode_data.unique_strings.ensure(display_name);
645 unicode_data.block_display_names.append({ code_point_range, index });
646 }
647
648 TRY(file.seek(0, SeekMode::SetPosition));
649
650 return {};
651}
652
653static ErrorOr<void> parse_unicode_data(Core::BufferedFile& file, UnicodeData& unicode_data)
654{
655 Optional<u32> code_point_range_start;
656
657 auto& assigned_code_points = unicode_data.prop_list.find("Assigned"sv)->value;
658 Optional<u32> assigned_code_point_range_start = 0;
659 u32 previous_code_point = 0;
660
661 Array<u8, 1024> buffer;
662
663 while (TRY(file.can_read_line())) {
664 auto line = TRY(file.read_line(buffer));
665
666 if (line.is_empty())
667 continue;
668
669 auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
670 VERIFY(segments.size() == 15);
671
672 CodePointData data {};
673 data.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
674 data.name = segments[1];
675 data.canonical_combining_class = AK::StringUtils::convert_to_uint<u8>(segments[3]).value();
676 data.bidi_class = segments[4];
677 data.decomposition_mapping = parse_decomposition_mapping(segments[5], unicode_data);
678 data.numeric_value_decimal = AK::StringUtils::convert_to_int<i8>(segments[6]);
679 data.numeric_value_digit = AK::StringUtils::convert_to_int<i8>(segments[7]);
680 data.numeric_value_numeric = AK::StringUtils::convert_to_int<i8>(segments[8]);
681 data.bidi_mirrored = segments[9] == "Y"sv;
682 data.unicode_1_name = segments[10];
683 data.iso_comment = segments[11];
684 data.simple_uppercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[12]);
685 data.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]);
686 data.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]);
687
688 if (auto abbreviation = unicode_data.code_point_abbreviations.get(data.code_point); abbreviation.has_value())
689 data.abbreviation = *abbreviation;
690
691 if (!assigned_code_point_range_start.has_value())
692 assigned_code_point_range_start = data.code_point;
693
694 if (data.name.starts_with("<"sv) && data.name.ends_with(", First>"sv)) {
695 VERIFY(!code_point_range_start.has_value() && assigned_code_point_range_start.has_value());
696 code_point_range_start = data.code_point;
697
698 data.name = data.name.substring(1, data.name.length() - 9);
699
700 assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point });
701 assigned_code_point_range_start.clear();
702 } else if (data.name.starts_with("<"sv) && data.name.ends_with(", Last>"sv)) {
703 VERIFY(code_point_range_start.has_value());
704
705 CodePointRange code_point_range { *code_point_range_start, data.code_point };
706 assigned_code_points.append(code_point_range);
707
708 data.name = data.name.substring(1, data.name.length() - 8);
709 code_point_range_start.clear();
710
711 add_canonical_code_point_name(code_point_range, data.name, unicode_data);
712 } else {
713 add_canonical_code_point_name({ data.code_point, data.code_point }, data.name, unicode_data);
714
715 if ((data.code_point > 0) && (data.code_point - previous_code_point) != 1) {
716 VERIFY(assigned_code_point_range_start.has_value());
717
718 assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point });
719 assigned_code_point_range_start = data.code_point;
720 }
721 }
722
723 bool has_special_casing { false };
724 for (auto const& casing : unicode_data.special_casing) {
725 if (casing.code_point == data.code_point) {
726 data.special_casing_indices.append(casing.index);
727 has_special_casing = true;
728 }
729 }
730
731 bool has_case_folding { false };
732 for (size_t i = 0; i < unicode_data.case_folding.size(); ++i) {
733 if (auto const& folding = unicode_data.case_folding[i]; folding.code_point == data.code_point) {
734 data.case_folding_indices.append(i);
735 has_case_folding = true;
736 }
737 }
738
739 unicode_data.code_points_with_non_zero_combining_class += data.canonical_combining_class != 0;
740 unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value();
741 unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value();
742 unicode_data.simple_titlecase_mapping_size += data.simple_titlecase_mapping.has_value();
743 unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value();
744
745 unicode_data.code_points_with_special_casing += has_special_casing;
746 unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
747
748 unicode_data.code_points_with_case_folding += has_case_folding;
749 unicode_data.largest_case_folding_size = max(unicode_data.largest_case_folding_size, data.case_folding_indices.size());
750
751 previous_code_point = data.code_point;
752 unicode_data.code_point_data.append(move(data));
753 }
754
755 return {};
756}
757
// Generates the UnicodeData header text and writes it to `file`.
//
// The header consists of one enum per collected value list (locales, conditions, case folding
// statuses, general categories, properties, scripts, blocks, break properties, compatibility
// tags), followed by the struct and function declarations in the template further below.
//
// Note that unicode_data.conditions and unicode_data.statuses are moved from by this function.
static ErrorOr<void> generate_unicode_data_header(Core::BufferedFile& file, UnicodeData& unicode_data)
{
    StringBuilder builder;
    SourceGenerator generator { builder };
    // These two sizes become the fixed array lengths of the generated mapping members.
    generator.set("special_casing_mapping_size", DeprecatedString::number(unicode_data.largest_special_casing_mapping_size));
    generator.set("case_folding_mapping_size", DeprecatedString::number(unicode_data.largest_case_folding_mapping_size));

    // Emits `enum class <name>` with an explicit underlying type alias. Enumerators are, in
    // order: the optional `default_` value, the sorted `values`, then the `aliases` (sorted by
    // alias name), each emitted as `alias = value`.
    auto generate_enum = [&](StringView name, StringView default_, auto values, Vector<Alias> aliases = {}) {
        quick_sort(values);
        quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; });

        generator.set("name", name);
        generator.set("underlying", DeprecatedString::formatted("{}UnderlyingType", name));
        // u8 is used when the values (plus the default value, if any) number fewer than 256.
        generator.set("type", ((values.size() + !default_.is_empty()) < 256) ? "u8"sv : "u16"sv);

        generator.append(R"~~~(
using @underlying@ = @type@;

enum class @name@ : @underlying@ {)~~~");

        if (!default_.is_empty()) {
            generator.set("default", default_);
            generator.append(R"~~~(
    @default@,)~~~");
        }

        for (auto const& value : values) {
            generator.set("value", value);
            generator.append(R"~~~(
    @value@,)~~~");
        }

        for (auto const& alias : aliases) {
            generator.set("alias", alias.alias);
            generator.set("value", alias.name);
            generator.append(R"~~~(
    @alias@ = @value@,)~~~");
        }

        generator.append(R"~~~(
};
)~~~");
    };

    generator.append(R"~~~(
#pragma once

#include <AK/Types.h>
#include <LibUnicode/Forward.h>

namespace Unicode {
)~~~");

    generate_enum("Locale"sv, "None"sv, unicode_data.locales);
    generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
    generate_enum("CaseFoldingStatus"sv, {}, move(unicode_data.statuses));
    generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
    generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
    generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
    generate_enum("Block"sv, {}, unicode_data.block_list.keys(), unicode_data.block_aliases);
    generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys());
    generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys());
    generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys());
    generate_enum("CompatibilityFormattingTag"sv, "Canonical"sv, unicode_data.compatibility_tags);

    generator.append(R"~~~(
struct SpecialCasing {
    u32 code_point { 0 };

    u32 lowercase_mapping[@special_casing_mapping_size@];
    u32 lowercase_mapping_size { 0 };

    u32 uppercase_mapping[@special_casing_mapping_size@];
    u32 uppercase_mapping_size { 0 };

    u32 titlecase_mapping[@special_casing_mapping_size@];
    u32 titlecase_mapping_size { 0 };

    Locale locale { Locale::None };
    Condition condition { Condition::None };
};

struct CaseFolding {
    u32 code_point { 0 };
    CaseFoldingStatus status { CaseFoldingStatus::Common };

    u32 mapping[@case_folding_mapping_size@];
    u32 mapping_size { 0 };
};

struct CodePointDecompositionRaw {
    u32 code_point { 0 };
    CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
    size_t decomposition_index { 0 };
    size_t decomposition_count { 0 };
};

struct CodePointDecomposition {
    u32 code_point { 0 };
    CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
    ReadonlySpan<u32> decomposition;
};

Optional<Locale> locale_from_string(StringView locale);

ReadonlySpan<SpecialCasing const*> special_case_mapping(u32 code_point);
ReadonlySpan<CaseFolding const*> case_folding_mapping(u32 code_point);

}
)~~~");

    TRY(file.write_until_depleted(generator.as_string_view().bytes()));
    return {};
}
872
// Writes the generated Unicode data implementation source to `file`.
//
// The output is assembled with a SourceGenerator: first the static lookup tables built from
// `unicode_data` (special casing, case folding, per-code-point mappings, property range lists,
// display names), then the lookup functions that search those tables. Returns an error if a
// fallible step (building the from-string hash maps, writing the file) fails.
static ErrorOr<void> generate_unicode_data_implementation(Core::BufferedFile& file, UnicodeData const& unicode_data)
{
    StringBuilder builder;
    SourceGenerator generator { builder };

    // Placeholders that are referenced by the templates appended below.
    generator.set("string_index_type"sv, unicode_data.unique_strings.type_that_fits());
    generator.set("largest_special_casing_size", DeprecatedString::number(unicode_data.largest_special_casing_size));
    generator.set("special_casing_size", DeprecatedString::number(unicode_data.special_casing.size()));
    generator.set("largest_case_folding_size", DeprecatedString::number(unicode_data.largest_case_folding_size));
    generator.set("case_folding_size", DeprecatedString::number(unicode_data.case_folding.size()));

    generator.append(R"~~~(
#include <AK/Array.h>
#include <AK/BinarySearch.h>
#include <AK/CharacterTypes.h>
#include <AK/Optional.h>
#include <AK/Span.h>
#include <AK/DeprecatedString.h>
#include <AK/StringView.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/UnicodeData.h>
#include <LibUnicode/Normalize.h>

namespace Unicode {
)~~~");

    unicode_data.unique_strings.generate(generator);

    // Appends `, { a, b, ... }, N` to the initializer currently being written, where each item
    // is rendered with `format` and N is the list size. An empty list is written as `, {}, 0`.
    auto append_list_and_size = [&](auto const& list, StringView format) {
        if (list.is_empty()) {
            generator.append(", {}, 0");
            return;
        }

        bool first = true;
        generator.append(", {");
        for (auto const& item : list) {
            generator.append(first ? " "sv : ", "sv);
            generator.append(DeprecatedString::formatted(format, item));
            first = false;
        }
        // `}}` is an escaped literal closing brace in the format string.
        generator.append(DeprecatedString::formatted(" }}, {}", list.size()));
    };

    // Table of every SpecialCasing entry; the per-code-point table emitted further below stores
    // pointers into this array.
    generator.append(R"~~~(
static constexpr Array<SpecialCasing, @special_casing_size@> s_special_case { {)~~~");

    for (auto const& casing : unicode_data.special_casing) {
        generator.set("code_point", DeprecatedString::formatted("{:#x}", casing.code_point));
        generator.append(R"~~~(
 { @code_point@)~~~");

        constexpr auto format = "0x{:x}"sv;
        append_list_and_size(casing.lowercase_mapping, format);
        append_list_and_size(casing.uppercase_mapping, format);
        append_list_and_size(casing.titlecase_mapping, format);

        // Entries without a locale / condition are emitted with the `None` enumerator.
        generator.set("locale", casing.locale.is_empty() ? "None" : casing.locale);
        generator.append(", Locale::@locale@");

        generator.set("condition", casing.condition.is_empty() ? "None" : casing.condition);
        generator.append(", Condition::@condition@");

        generator.append(" },");
    }

    // Close s_special_case and open the table of every CaseFolding entry.
    generator.append(R"~~~(
} };

static constexpr Array<CaseFolding, @case_folding_size@> s_case_folding { {)~~~");

    for (auto const& folding : unicode_data.case_folding) {
        generator.set("code_point", DeprecatedString::formatted("{:#x}", folding.code_point));
        generator.set("status", folding.status);
        generator.append(R"~~~(
 { @code_point@, CaseFoldingStatus::@status@)~~~");

        append_list_and_size(folding.mapping, "0x{:x}"sv);
        generator.append(" },");
    }

    // Close s_case_folding, then emit the element types of the tables generated below together
    // with the comparators that the generated lookup functions hand to binary_search().
    generator.append(R"~~~(
} };

struct CodePointMapping {
 u32 code_point { 0 };
 u32 mapping { 0 };
};

struct SpecialCaseMapping {
 u32 code_point { 0 };
 Array<SpecialCasing const*, @largest_special_casing_size@> special_casing {};
 u32 special_casing_size { 0 };
};

struct CaseFoldingMapping {
 u32 code_point { 0 };
 Array<CaseFolding const*, @largest_case_folding_size@> case_folding {};
 u32 case_folding_size { 0 };
};

struct CodePointAbbreviation {
 u32 code_point { 0 };
 @string_index_type@ abbreviation { 0 };
};

template<typename MappingType>
struct CodePointComparator {
 constexpr int operator()(u32 code_point, MappingType const& mapping)
 {
 return code_point - mapping.code_point;
 }
};

struct CodePointRangeComparator {
 constexpr int operator()(u32 code_point, CodePointRange const& range)
 {
 return (code_point > range.last) - (code_point < range.first);
 }
};

struct BlockNameData {
 CodePointRange code_point_range {};
 @string_index_type@ display_name { 0 };
};

struct BlockNameComparator : public CodePointRangeComparator {
 constexpr int operator()(u32 code_point, BlockNameData const& name)
 {
 return CodePointRangeComparator::operator()(code_point, name.code_point_range);
 }
};

struct CodePointName {
 CodePointRange code_point_range {};
 @string_index_type@ display_name { 0 };
};

struct CodePointNameComparator : public CodePointRangeComparator {
 constexpr int operator()(u32 code_point, CodePointName const& name)
 {
 return CodePointRangeComparator::operator()(code_point, name.code_point_range);
 }
};
)~~~");

    // Flat array holding all decomposition code points; the decomposition table emitted below
    // refers to it by (start index, count).
    generator.set("decomposition_mappings_size", DeprecatedString::number(unicode_data.decomposition_mappings.size()));
    generator.append("\nstatic constexpr Array<u32, @decomposition_mappings_size@> s_decomposition_mappings_data { ");
    generator.append(DeprecatedString::join(", "sv, unicode_data.decomposition_mappings, "{:#x}"sv));
    generator.append(" };\n");

    // Emits `s_<name>_mappings`, an array of `mapping_type` with `size` elements. One element is
    // written per entry of unicode_data.code_point_data for which `mapping_getter` yields a
    // value (an engaged Optional, or a non-empty list); other entries are skipped.
    auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) {
        generator.set("name", name);
        generator.set("mapping_type", mapping_type);
        generator.set("size", DeprecatedString::number(size));

        generator.append(R"~~~(
static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
 )~~~");

        constexpr size_t max_mappings_per_row = 20;
        size_t mappings_in_current_row = 0;

        for (auto const& data : unicode_data.code_point_data) {
            auto mapping = mapping_getter(data);

            // The getter returns either an Optional or a list; pick the matching emptiness check.
            if constexpr (requires { mapping.has_value(); }) {
                if (!mapping.has_value())
                    continue;
            } else {
                if (mapping.is_empty())
                    continue;
            }

            if (mappings_in_current_row++ > 0)
                generator.append(" ");

            generator.set("code_point", DeprecatedString::formatted("{:#x}", data.code_point));
            generator.append("{ @code_point@");

            if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<size_t>>) {
                // Single numeric mapping.
                generator.set("mapping", DeprecatedString::formatted("{:#x}", *mapping));
                generator.append(", @mapping@ },");
            } else if constexpr (IsSame<decltype(mapping), Optional<CodePointDecomposition>>) {
                // Decomposition: tag plus a (start, size) window into s_decomposition_mappings_data.
                // Note that this overwrites the "size" key set above; the array header that used
                // it has already been appended at this point.
                generator.set("tag", mapping->tag);
                generator.set("start", DeprecatedString::number(mapping->decomposition_index));
                generator.set("size", DeprecatedString::number(mapping->decomposition_size));
                generator.append(", CompatibilityFormattingTag::@tag@, @start@, @size@ },");
            } else {
                // List of indices, emitted as pointers into the s_<name> table. The formatted
                // text still contains @name@, which relies on generator.append() substituting it.
                append_list_and_size(mapping, "&s_@name@[{}]"sv);
                generator.append(" },");
            }

            if (mappings_in_current_row == max_mappings_per_row) {
                mappings_in_current_row = 0;
                generator.append("\n ");
            }
        }
        generator.append(R"~~~(
} };
)~~~");
    };

    append_code_point_mappings("combining_class"sv, "CodePointMapping"sv, unicode_data.code_points_with_non_zero_combining_class,
        [](auto const& data) -> Optional<u32> {
            // A combining class of 0 is not stored; the generated lookup falls back to 0.
            if (data.canonical_combining_class == 0)
                return {};
            return data.canonical_combining_class;
        });
    append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; });
    append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; });
    append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.simple_titlecase_mapping; });
    append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
    append_code_point_mappings("case_folding"sv, "CaseFoldingMapping"sv, unicode_data.code_points_with_case_folding, [](auto const& data) { return data.case_folding_indices; });
    append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });

    append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping,
        [](auto const& data) {
            return data.decomposition_mapping;
        });

    // Emits a single array of code point ranges under the given variable name.
    auto append_code_point_range_list = [&](DeprecatedString name, Vector<CodePointRange> const& ranges) {
        generator.set("name", name);
        generator.set("size", DeprecatedString::number(ranges.size()));
        generator.append(R"~~~(
static constexpr Array<CodePointRange, @size@> @name@ { {
 )~~~");

        constexpr size_t max_ranges_per_row = 20;
        size_t ranges_in_current_row = 0;

        for (auto const& range : ranges) {
            if (ranges_in_current_row++ > 0)
                generator.append(" ");

            generator.set("first", DeprecatedString::formatted("{:#x}", range.first));
            generator.set("last", DeprecatedString::formatted("{:#x}", range.last));
            generator.append("{ @first@, @last@ },");

            if (ranges_in_current_row == max_ranges_per_row) {
                ranges_in_current_row = 0;
                generator.append("\n ");
            }
        }

        generator.append(R"~~~(
} };
)~~~");
    };

    // Emits one range array per property (named via `property_format`), followed by a
    // collection array of spans over those arrays, ordered by sorted property name.
    auto append_prop_list = [&](StringView collection_name, StringView property_format, PropList const& property_list) {
        for (auto const& property : property_list) {
            auto name = DeprecatedString::formatted(property_format, property.key);
            append_code_point_range_list(move(name), property.value);
        }

        auto property_names = property_list.keys();
        quick_sort(property_names);

        generator.set("name", collection_name);
        generator.set("size", DeprecatedString::number(property_names.size()));
        generator.append(R"~~~(
static constexpr Array<ReadonlySpan<CodePointRange>, @size@> @name@ { {)~~~");

        for (auto const& property_name : property_names) {
            generator.set("name", DeprecatedString::formatted(property_format, property_name));
            generator.append(R"~~~(
 @name@.span(),)~~~");
        }

        generator.append(R"~~~(
} };
)~~~");
    };

    append_prop_list("s_general_categories"sv, "s_general_category_{}"sv, unicode_data.general_categories);
    append_prop_list("s_properties"sv, "s_property_{}"sv, unicode_data.prop_list);
    append_prop_list("s_scripts"sv, "s_script_{}"sv, unicode_data.script_list);
    append_prop_list("s_script_extensions"sv, "s_script_extension_{}"sv, unicode_data.script_extensions);
    append_prop_list("s_blocks"sv, "s_block_{}"sv, unicode_data.block_list);
    append_prop_list("s_grapheme_break_properties"sv, "s_grapheme_break_property_{}"sv, unicode_data.grapheme_break_props);
    append_prop_list("s_word_break_properties"sv, "s_word_break_property_{}"sv, unicode_data.word_break_props);
    append_prop_list("s_sentence_break_properties"sv, "s_sentence_break_property_{}"sv, unicode_data.sentence_break_props);

    // Emits an array of `{ { first, last }, name }` entries, where `name` is the numeric value
    // stored in each display name entry.
    auto append_code_point_display_names = [&](StringView type, StringView name, auto const& display_names) {
        constexpr size_t max_values_per_row = 30;
        size_t values_in_current_row = 0;

        generator.set("type", type);
        generator.set("name", name);
        generator.set("size", DeprecatedString::number(display_names.size()));

        generator.append(R"~~~(
static constexpr Array<@type@, @size@> @name@ { {
 )~~~");
        for (auto const& display_name : display_names) {
            if (values_in_current_row++ > 0)
                generator.append(", ");

            generator.set("first", DeprecatedString::formatted("{:#x}", display_name.code_point_range.first));
            generator.set("last", DeprecatedString::formatted("{:#x}", display_name.code_point_range.last));
            // Reuses the "name" key for the entry value; the array header using the variable
            // name has already been appended.
            generator.set("name", DeprecatedString::number(display_name.name));
            generator.append("{ { @first@, @last@ }, @name@ }");

            if (values_in_current_row == max_values_per_row) {
                values_in_current_row = 0;
                generator.append(",\n ");
            }
        }
        generator.append(R"~~~(
} };
)~~~");
    };

    append_code_point_display_names("BlockNameData"sv, "s_block_display_names"sv, unicode_data.block_display_names);
    append_code_point_display_names("CodePointName"sv, "s_code_point_display_names"sv, unicode_data.code_point_display_names);

    // Generated lookup functions for block and code point display names.
    generator.append(R"~~~(
Optional<StringView> code_point_block_display_name(u32 code_point)
{
 if (auto const* entry = binary_search(s_block_display_names, code_point, nullptr, BlockNameComparator {}))
 return decode_string(entry->display_name);

 return {};
}

ReadonlySpan<BlockName> block_display_names()
{
 static auto display_names = []() {
 Array<BlockName, s_block_display_names.size()> display_names;

 for (size_t i = 0; i < s_block_display_names.size(); ++i) {
 auto const& display_name = s_block_display_names[i];
 display_names[i] = { display_name.code_point_range, decode_string(display_name.display_name) };
 }

 return display_names;
 }();

 return display_names.span();
}

Optional<DeprecatedString> code_point_display_name(u32 code_point)
{
 if (auto const* entry = binary_search(s_code_point_display_names, code_point, nullptr, CodePointNameComparator {})) {
 auto display_name = decode_string(entry->display_name);

 if (display_name.ends_with("{:X}"sv))
 return DeprecatedString::formatted(display_name, code_point);

 return display_name;
 }

 return {};
}
)~~~");

    // Emits `u32 <method>(u32 code_point)`, which binary-searches `mappings` and returns the
    // expression `fallback` when the code point has no entry.
    auto append_code_point_mapping_search = [&](StringView method, StringView mappings, StringView fallback) {
        generator.set("method", method);
        generator.set("mappings", mappings);
        generator.set("fallback", fallback);
        generator.append(R"~~~(
u32 @method@(u32 code_point)
{
 auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator<CodePointMapping> {});
 return mapping ? mapping->mapping : @fallback@;
}
)~~~");
    };

    append_code_point_mapping_search("canonical_combining_class"sv, "s_combining_class_mappings"sv, "0"sv);
    append_code_point_mapping_search("to_unicode_uppercase"sv, "s_uppercase_mappings"sv, "code_point"sv);
    append_code_point_mapping_search("to_unicode_lowercase"sv, "s_lowercase_mappings"sv, "code_point"sv);
    append_code_point_mapping_search("to_unicode_titlecase"sv, "s_titlecase_mappings"sv, "code_point"sv);

    // Generated lookups for special casing, case folding, abbreviations and decompositions.
    generator.append(R"~~~(
ReadonlySpan<SpecialCasing const*> special_case_mapping(u32 code_point)
{
 auto const* mapping = binary_search(s_special_case_mappings, code_point, nullptr, CodePointComparator<SpecialCaseMapping> {});
 if (mapping == nullptr)
 return {};

 return mapping->special_casing.span().slice(0, mapping->special_casing_size);
}

ReadonlySpan<CaseFolding const*> case_folding_mapping(u32 code_point)
{
 auto const* mapping = binary_search(s_case_folding_mappings, code_point, nullptr, CodePointComparator<CaseFoldingMapping> {});
 if (mapping == nullptr)
 return {};

 return mapping->case_folding.span().slice(0, mapping->case_folding_size);
}

Optional<StringView> code_point_abbreviation(u32 code_point)
{
 auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator<CodePointAbbreviation> {});
 if (mapping == nullptr)
 return {};
 if (mapping->abbreviation == 0)
 return {};

 return decode_string(mapping->abbreviation);
}

Optional<CodePointDecomposition const> code_point_decomposition(u32 code_point)
{
 auto const* mapping = binary_search(s_decomposition_mappings, code_point, nullptr, CodePointComparator<CodePointDecompositionRaw> {});
 if (mapping == nullptr)
 return {};
 return CodePointDecomposition { mapping->code_point, mapping->tag, ReadonlySpan<u32> { s_decomposition_mappings_data.data() + mapping->decomposition_index, mapping->decomposition_count } };
}

Optional<CodePointDecomposition const> code_point_decomposition_by_index(size_t index)
{
 if (index >= s_decomposition_mappings.size())
 return {};
 auto const& mapping = s_decomposition_mappings[index];
 return CodePointDecomposition { mapping.code_point, mapping.tag, ReadonlySpan<u32> { s_decomposition_mappings_data.data() + mapping.decomposition_index, mapping.decomposition_count } };
}
)~~~");

    // Emits `bool code_point_has_<enum_snake>(u32, <enum_title>)`, which uses the enum value as
    // an index into `collection_name` and binary-searches the selected range list.
    auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) {
        generator.set("enum_title", enum_title);
        generator.set("enum_snake", enum_snake);
        generator.set("collection_name", collection_name);
        generator.append(R"~~~(
bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@)
{
 auto index = static_cast<@enum_title@UnderlyingType>(@enum_snake@);
 auto const& ranges = @collection_name@.at(index);

 auto const* range = binary_search(ranges, code_point, nullptr, CodePointRangeComparator {});
 return range != nullptr;
}
)~~~");
    };

    // Emits a `<enum_snake>_from_string` function via generate_value_from_string(), covering
    // both the property names and their aliases.
    auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& prop_list, Vector<Alias> const& aliases) -> ErrorOr<void> {
        HashValueMap<StringView> hashes;
        TRY(hashes.try_ensure_capacity(prop_list.size() + aliases.size()));

        ValueFromStringOptions options {};

        for (auto const& prop : prop_list) {
            // A plain list of strings is hashed case-insensitively; map-style property lists are
            // hashed by their key as-is.
            if constexpr (IsSame<RemoveCVReference<decltype(prop)>, DeprecatedString>) {
                hashes.set(CaseInsensitiveStringViewTraits::hash(prop), prop);
                options.sensitivity = CaseSensitivity::CaseInsensitive;
            } else {
                hashes.set(prop.key.hash(), prop.key);
            }
        }

        for (auto const& alias : aliases)
            hashes.set(alias.alias.hash(), alias.alias);

        generate_value_from_string(generator, "{}_from_string"sv, enum_title, enum_snake, move(hashes), options);

        return {};
    };

    TRY(append_from_string("Locale"sv, "locale"sv, unicode_data.locales, {}));

    append_prop_search("GeneralCategory"sv, "general_category"sv, "s_general_categories"sv);
    TRY(append_from_string("GeneralCategory"sv, "general_category"sv, unicode_data.general_categories, unicode_data.general_category_aliases));

    append_prop_search("Property"sv, "property"sv, "s_properties"sv);
    TRY(append_from_string("Property"sv, "property"sv, unicode_data.prop_list, unicode_data.prop_aliases));

    // Script and script extension lookups share the Script enum, so only one from-string
    // function is generated for them.
    append_prop_search("Script"sv, "script"sv, "s_scripts"sv);
    append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv);
    TRY(append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases));

    append_prop_search("Block"sv, "block"sv, "s_blocks"sv);
    TRY(append_from_string("Block"sv, "block"sv, unicode_data.block_list, unicode_data.block_aliases));

    append_prop_search("GraphemeBreakProperty"sv, "grapheme_break_property"sv, "s_grapheme_break_properties"sv);
    append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv);
    append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv);

    // Close `namespace Unicode`.
    generator.append(R"~~~(
}
)~~~");

    TRY(file.write_until_depleted(generator.as_string_view().bytes()));
    return {};
}
1360
1361static Vector<u32> flatten_code_point_ranges(Vector<CodePointRange> const& code_points)
1362{
1363 Vector<u32> flattened;
1364
1365 for (auto const& range : code_points) {
1366 flattened.grow_capacity(range.last - range.first);
1367 for (u32 code_point = range.first; code_point <= range.last; ++code_point)
1368 flattened.append(code_point);
1369 }
1370
1371 return flattened;
1372}
1373
1374static Vector<CodePointRange> form_code_point_ranges(Vector<u32> code_points)
1375{
1376 Vector<CodePointRange> ranges;
1377
1378 u32 range_start = code_points[0];
1379 u32 range_end = range_start;
1380
1381 for (size_t i = 1; i < code_points.size(); ++i) {
1382 u32 code_point = code_points[i];
1383
1384 if ((code_point - range_end) == 1) {
1385 range_end = code_point;
1386 } else {
1387 ranges.append({ range_start, range_end });
1388 range_start = code_point;
1389 range_end = code_point;
1390 }
1391 }
1392
1393 ranges.append({ range_start, range_end });
1394 return ranges;
1395}
1396
1397static void sort_and_merge_code_point_ranges(Vector<CodePointRange>& code_points)
1398{
1399 quick_sort(code_points, [](auto const& range1, auto const& range2) {
1400 return range1.first < range2.first;
1401 });
1402
1403 for (size_t i = 0; i < code_points.size() - 1;) {
1404 if (code_points[i].last >= code_points[i + 1].first) {
1405 code_points[i].last = max(code_points[i].last, code_points[i + 1].last);
1406 code_points.remove(i + 1);
1407 } else {
1408 ++i;
1409 }
1410 }
1411
1412 auto all_code_points = flatten_code_point_ranges(code_points);
1413 code_points = form_code_point_ranges(all_code_points);
1414}
1415
1416static void populate_general_category_unions(PropList& general_categories)
1417{
1418 // The Unicode standard defines General Category values which are not in any UCD file. These
1419 // values are simply unions of other values.
1420 // https://www.unicode.org/reports/tr44/#GC_Values_Table
1421 auto populate_union = [&](auto alias, auto categories) {
1422 auto& code_points = general_categories.ensure(alias);
1423 for (auto const& category : categories)
1424 code_points.extend(general_categories.find(category)->value);
1425
1426 sort_and_merge_code_point_ranges(code_points);
1427 };
1428
1429 populate_union("LC"sv, Array { "Ll"sv, "Lu"sv, "Lt"sv });
1430 populate_union("L"sv, Array { "Lu"sv, "Ll"sv, "Lt"sv, "Lm"sv, "Lo"sv });
1431 populate_union("M"sv, Array { "Mn"sv, "Mc"sv, "Me"sv });
1432 populate_union("N"sv, Array { "Nd"sv, "Nl"sv, "No"sv });
1433 populate_union("P"sv, Array { "Pc"sv, "Pd"sv, "Ps"sv, "Pe"sv, "Pi"sv, "Pf"sv, "Po"sv });
1434 populate_union("S"sv, Array { "Sm"sv, "Sc"sv, "Sk"sv, "So"sv });
1435 populate_union("Z"sv, Array { "Zs"sv, "Zl"sv, "Zp"sv });
1436 populate_union("C"sv, Array { "Cc"sv, "Cf"sv, "Cs"sv, "Co"sv, "Cn"sv });
1437}
1438
1439static void normalize_script_extensions(PropList& script_extensions, PropList const& script_list, Vector<Alias> const& script_aliases)
1440{
1441 // The ScriptExtensions UCD file lays out its code point ranges rather uniquely compared to
1442 // other files. The Script listed on each line may either be a full Script string or an aliased
1443 // abbreviation. Further, the extensions may or may not include the base Script list. Normalize
1444 // the extensions here to be keyed by the full Script name and always include the base list.
1445 auto extensions = move(script_extensions);
1446 script_extensions = script_list;
1447
1448 for (auto const& extension : extensions) {
1449 auto it = find_if(script_aliases.begin(), script_aliases.end(), [&](auto const& alias) { return extension.key == alias.alias; });
1450 auto const& key = (it == script_aliases.end()) ? extension.key : it->name;
1451
1452 auto& code_points = script_extensions.find(key)->value;
1453 code_points.extend(extension.value);
1454
1455 sort_and_merge_code_point_ranges(code_points);
1456 }
1457
1458 // Lastly, the Common and Inherited script extensions are special. They must not contain any
1459 // code points which appear in other script extensions. The ScriptExtensions UCD file does not
1460 // list these extensions, therefore this peculiarity must be handled programmatically.
1461 // https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
1462 auto code_point_has_other_extension = [&](StringView key, u32 code_point) {
1463 for (auto const& extension : extensions) {
1464 if (extension.key == key)
1465 continue;
1466 if (any_of(extension.value, [&](auto const& r) { return (r.first <= code_point) && (code_point <= r.last); }))
1467 return true;
1468 }
1469
1470 return false;
1471 };
1472
1473 auto get_code_points_without_other_extensions = [&](StringView key) {
1474 auto code_points = flatten_code_point_ranges(script_list.find(key)->value);
1475 code_points.remove_all_matching([&](u32 c) { return code_point_has_other_extension(key, c); });
1476 return code_points;
1477 };
1478
1479 auto common_code_points = get_code_points_without_other_extensions("Common"sv);
1480 script_extensions.set("Common"sv, form_code_point_ranges(common_code_points));
1481
1482 auto inherited_code_points = get_code_points_without_other_extensions("Inherited"sv);
1483 script_extensions.set("Inherited"sv, form_code_point_ranges(inherited_code_points));
1484}
1485
// Entry point of the generator: reads the paths of the two files to generate and of every UCD
// input file from the command line, parses the inputs into a UnicodeData instance, and writes
// the generated header and implementation. Any failure to open, parse or write a file is
// propagated to the caller as an error.
ErrorOr<int> serenity_main(Main::Arguments arguments)
{
    StringView generated_header_path;
    StringView generated_implementation_path;
    StringView unicode_data_path;
    StringView special_casing_path;
    StringView case_folding_path;
    StringView derived_general_category_path;
    StringView prop_list_path;
    StringView derived_core_prop_path;
    StringView derived_binary_prop_path;
    StringView prop_alias_path;
    StringView prop_value_alias_path;
    StringView name_alias_path;
    StringView scripts_path;
    StringView script_extensions_path;
    StringView blocks_path;
    StringView emoji_data_path;
    StringView normalization_path;
    StringView grapheme_break_path;
    StringView word_break_path;
    StringView sentence_break_path;

    // One option per output / input file.
    Core::ArgsParser args_parser;
    args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
    args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
    args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
    args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
    args_parser.add_option(case_folding_path, "Path to CaseFolding.txt file", "case-folding-path", 'o', "case-folding-path");
    args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path");
    args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
    args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
    args_parser.add_option(derived_binary_prop_path, "Path to DerivedBinaryProperties.txt file", "derived-binary-prop-path", 'b', "derived-binary-prop-path");
    args_parser.add_option(prop_alias_path, "Path to PropertyAliases.txt file", "prop-alias-path", 'a', "prop-alias-path");
    args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
    args_parser.add_option(name_alias_path, "Path to NameAliases.txt file", "name-alias-path", 'm', "name-alias-path");
    args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
    args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
    args_parser.add_option(blocks_path, "Path to Blocks.txt file", "blocks-path", 'k', "blocks-path");
    args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path");
    args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path");
    args_parser.add_option(grapheme_break_path, "Path to GraphemeBreakProperty.txt file", "grapheme-break-path", 'f', "grapheme-break-path");
    args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
    args_parser.add_option(sentence_break_path, "Path to SentenceBreakProperty.txt file", "sentence-break-path", 'i', "sentence-break-path");
    args_parser.parse(arguments);

    // The two generated files are opened for writing, all UCD inputs for reading.
    auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write));
    auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write));
    auto unicode_data_file = TRY(open_file(unicode_data_path, Core::File::OpenMode::Read));
    auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::File::OpenMode::Read));
    auto special_casing_file = TRY(open_file(special_casing_path, Core::File::OpenMode::Read));
    auto case_folding_file = TRY(open_file(case_folding_path, Core::File::OpenMode::Read));
    auto prop_list_file = TRY(open_file(prop_list_path, Core::File::OpenMode::Read));
    auto derived_core_prop_file = TRY(open_file(derived_core_prop_path, Core::File::OpenMode::Read));
    auto derived_binary_prop_file = TRY(open_file(derived_binary_prop_path, Core::File::OpenMode::Read));
    auto prop_alias_file = TRY(open_file(prop_alias_path, Core::File::OpenMode::Read));
    auto prop_value_alias_file = TRY(open_file(prop_value_alias_path, Core::File::OpenMode::Read));
    auto name_alias_file = TRY(open_file(name_alias_path, Core::File::OpenMode::Read));
    auto scripts_file = TRY(open_file(scripts_path, Core::File::OpenMode::Read));
    auto script_extensions_file = TRY(open_file(script_extensions_path, Core::File::OpenMode::Read));
    auto blocks_file = TRY(open_file(blocks_path, Core::File::OpenMode::Read));
    auto emoji_data_file = TRY(open_file(emoji_data_path, Core::File::OpenMode::Read));
    auto normalization_file = TRY(open_file(normalization_path, Core::File::OpenMode::Read));
    auto grapheme_break_file = TRY(open_file(grapheme_break_path, Core::File::OpenMode::Read));
    auto word_break_file = TRY(open_file(word_break_path, Core::File::OpenMode::Read));
    auto sentence_break_file = TRY(open_file(sentence_break_path, Core::File::OpenMode::Read));

    UnicodeData unicode_data {};
    TRY(parse_special_casing(*special_casing_file, unicode_data));
    TRY(parse_case_folding(*case_folding_file, unicode_data));
    TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories));
    // PropList, DerivedCoreProperties, DerivedBinaryProperties and emoji-data are all
    // accumulated into the same property list.
    TRY(parse_prop_list(*prop_list_file, unicode_data.prop_list));
    TRY(parse_prop_list(*derived_core_prop_file, unicode_data.prop_list));
    TRY(parse_prop_list(*derived_binary_prop_file, unicode_data.prop_list));
    TRY(parse_prop_list(*emoji_data_file, unicode_data.prop_list));
    TRY(parse_normalization_props(*normalization_file, unicode_data));
    TRY(parse_alias_list(*prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases));
    TRY(parse_prop_list(*scripts_file, unicode_data.script_list));
    TRY(parse_prop_list(*script_extensions_file, unicode_data.script_extensions, true));
    // Blocks.txt is consumed twice: once for the block display names, once as a property list.
    // NOTE(review): the second pass presumably relies on the first leaving the file positioned
    // at its start; confirm in parse_block_display_names().
    TRY(parse_block_display_names(*blocks_file, unicode_data));
    TRY(parse_prop_list(*blocks_file, unicode_data.block_list, false, true));
    TRY(parse_name_aliases(*name_alias_file, unicode_data));
    TRY(parse_prop_list(*grapheme_break_file, unicode_data.grapheme_break_props));
    TRY(parse_prop_list(*word_break_file, unicode_data.word_break_props));
    TRY(parse_prop_list(*sentence_break_file, unicode_data.sentence_break_props));

    // The union categories are added before UnicodeData.txt is parsed and before the "gc"
    // aliases are resolved against the category keys.
    populate_general_category_unions(unicode_data.general_categories);
    TRY(parse_unicode_data(*unicode_data_file, unicode_data));
    // PropertyValueAliases.txt is read once per property: general category, script and block.
    TRY(parse_value_alias_list(*prop_value_alias_file, "gc"sv, unicode_data.general_categories.keys(), unicode_data.general_category_aliases));
    TRY(parse_value_alias_list(*prop_value_alias_file, "sc"sv, unicode_data.script_list.keys(), unicode_data.script_aliases, false));
    TRY(parse_value_alias_list(*prop_value_alias_file, "blk"sv, unicode_data.block_list.keys(), unicode_data.block_aliases, false, true));
    // Needs the script aliases parsed just above to resolve abbreviated script names.
    normalize_script_extensions(unicode_data.script_extensions, unicode_data.script_list, unicode_data.script_aliases);

    TRY(generate_unicode_data_header(*generated_header_file, unicode_data));
    TRY(generate_unicode_data_implementation(*generated_implementation_file, unicode_data));

    return 0;
}