Serenity Operating System
1/*
2 * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
3 * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
4 *
5 * SPDX-License-Identifier: BSD-2-Clause
6 */
7
8#include <AK/Utf16View.h>
9#include <AK/Utf32View.h>
10#include <AK/Utf8View.h>
11#include <LibUnicode/CharacterTypes.h>
12#include <LibUnicode/Segmentation.h>
13
14#if ENABLE_UNICODE_DATA
15# include <LibUnicode/UnicodeData.h>
16#endif
17
18namespace Unicode {
19
20template<typename ViewType>
21static size_t code_unit_length(ViewType const& view)
22{
23 if constexpr (IsSame<ViewType, Utf8View>)
24 return view.byte_length();
25 else if constexpr (IsSame<ViewType, Utf16View>)
26 return view.length_in_code_units();
27 else if constexpr (IsSame<ViewType, Utf32View>)
28 return view.length();
29 else
30 static_assert(DependentFalse<ViewType>);
31}
32
33template<typename ViewType, typename CodeUnitIterator>
34static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it)
35{
36 if constexpr (IsSame<ViewType, Utf8View>)
37 return view.byte_offset_of(it);
38 else if constexpr (IsSame<ViewType, Utf16View>)
39 return view.code_unit_offset_of(it);
40 else if constexpr (IsSame<ViewType, Utf32View>)
41 return view.iterator_offset(it);
42 else
43 static_assert(DependentFalse<ViewType>);
44}
45
46template<typename ViewType>
47static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
48{
49#if ENABLE_UNICODE_DATA
50 using GBP = GraphemeBreakProperty;
51
52 // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
53 if (view.is_empty())
54 return;
55
56 auto has_any_gbp = [](u32 code_point, auto&&... properties) {
57 return (code_point_has_grapheme_break_property(code_point, properties) || ...);
58 };
59
60 // GB1
61 if (callback(0) == IterationDecision::Break)
62 return;
63
64 if (code_unit_length(view) > 1) {
65 auto it = view.begin();
66 auto code_point = *it;
67 u32 next_code_point = 0;
68 auto current_ri_chain = 0;
69
70 for (++it; it != view.end(); ++it, code_point = next_code_point) {
71 next_code_point = *it;
72
73 // GB11
74 if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) {
75 auto it_copy = it;
76
77 while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend))
78 ++it_copy;
79
80 if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) {
81 ++it_copy;
82
83 if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) {
84 next_code_point = *it_copy;
85 it = it_copy;
86 continue;
87 }
88 }
89 }
90
91 auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
92 auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
93
94 // GB3
95 if (code_point_is_cr && next_code_point_is_lf)
96 continue;
97 // GB4, GB5
98 if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
99 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
100 return;
101 continue;
102 }
103
104 auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
105 auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
106
107 // GB6
108 if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
109 continue;
110 // GB7
111 if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
112 continue;
113 // GB8
114 if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
115 continue;
116
117 // GB9
118 if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
119 continue;
120 // GB9a
121 if (has_any_gbp(next_code_point, GBP::SpacingMark))
122 continue;
123 // GB9b
124 if (has_any_gbp(code_point, GBP::Prepend))
125 continue;
126
127 auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
128 current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
129
130 // GB12, GB13
131 if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
132 continue;
133
134 // GB999
135 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
136 return;
137 }
138 }
139
140 // GB2
141 callback(code_unit_length(view));
142#endif
143}
144
145void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
146{
147 for_each_grapheme_segmentation_boundary_impl(view, move(callback));
148}
149
150void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
151{
152 for_each_grapheme_segmentation_boundary_impl(view, move(callback));
153}
154
155void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
156{
157 for_each_grapheme_segmentation_boundary_impl(view, move(callback));
158}
159
160template<typename ViewType>
161static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
162{
163#if ENABLE_UNICODE_DATA
164 using WBP = WordBreakProperty;
165
166 // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
167 if (view.is_empty())
168 return;
169
170 auto has_any_wbp = [](u32 code_point, auto&&... properties) {
171 return (code_point_has_word_break_property(code_point, properties) || ...);
172 };
173
174 // WB1
175 if (callback(0) == IterationDecision::Break)
176 return;
177
178 if (code_unit_length(view) > 1) {
179 auto it = view.begin();
180 auto code_point = *it;
181 u32 next_code_point;
182 Optional<u32> previous_code_point;
183 auto current_ri_chain = 0;
184
185 for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
186 next_code_point = *it;
187
188 auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
189 auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
190
191 // WB3
192 if (code_point_is_cr && next_code_point_is_lf)
193 continue;
194 // WB3a, WB3b
195 if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
196 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
197 return;
198 continue;
199 }
200 // WB3c
201 if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
202 continue;
203 // WB3d
204 if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
205 continue;
206
207 // WB4
208 if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
209 continue;
210
211 auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
212 auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
213 auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
214 auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
215
216 // WB5
217 if (code_point_is_ah_letter && next_code_point_is_ah_letter)
218 continue;
219
220 Optional<u32> next_next_code_point;
221 if (it != view.end()) {
222 auto it_copy = it;
223 ++it_copy;
224 if (it_copy != view.end())
225 next_next_code_point = *it_copy;
226 }
227 bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
228 bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
229
230 auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
231
232 // WB6
233 if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
234 continue;
235
236 auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
237 auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
238 auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
239
240 // WB7
241 if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
242 continue;
243 // WB7a
244 if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
245 continue;
246 // WB7b
247 if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
248 continue;
249 // WB7c
250 if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
251 continue;
252
253 auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
254 auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
255
256 // WB8
257 if (code_point_is_numeric && next_code_point_is_numeric)
258 continue;
259 // WB9
260 if (code_point_is_ah_letter && next_code_point_is_numeric)
261 continue;
262 // WB10
263 if (code_point_is_numeric && next_code_point_is_ah_letter)
264 continue;
265
266 auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric);
267
268 // WB11
269 if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
270 continue;
271
272 bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
273
274 // WB12
275 if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
276 continue;
277
278 auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
279 auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
280
281 // WB13
282 if (code_point_is_katakana && next_code_point_is_katakana)
283 continue;
284
285 auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
286
287 // WB13a
288 if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
289 continue;
290 // WB13b
291 if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
292 continue;
293
294 auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
295 current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
296
297 // WB15, WB16
298 if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
299 continue;
300
301 // WB999
302 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
303 return;
304 }
305 }
306
307 // WB2
308 callback(code_unit_length(view));
309#endif
310}
311
312void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
313{
314 for_each_word_segmentation_boundary_impl(view, move(callback));
315}
316
317void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
318{
319 for_each_word_segmentation_boundary_impl(view, move(callback));
320}
321
322void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
323{
324 for_each_word_segmentation_boundary_impl(view, move(callback));
325}
326
327template<typename ViewType>
328static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
329{
330#if ENABLE_UNICODE_DATA
331 using SBP = SentenceBreakProperty;
332
333 // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
334 if (view.is_empty())
335 return;
336
337 auto has_any_sbp = [](u32 code_point, auto&&... properties) {
338 return (code_point_has_sentence_break_property(code_point, properties) || ...);
339 };
340
341 // SB1
342 if (callback(0) == IterationDecision::Break)
343 return;
344
345 if (code_unit_length(view) > 1) {
346 auto it = view.begin();
347 auto code_point = *it;
348 u32 next_code_point;
349 Optional<u32> previous_code_point;
350 enum class TerminatorSequenceState {
351 None,
352 Term,
353 Close,
354 Sp
355 } terminator_sequence_state { TerminatorSequenceState::None };
356 auto term_was_a_term = false;
357
358 for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
359 next_code_point = *it;
360
361 auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
362 auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
363
364 // SB3
365 if (code_point_is_cr && next_code_point_is_lf)
366 continue;
367
368 auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
369
370 // SB4
371 if (code_point_is_para_sep) {
372 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
373 return;
374 continue;
375 }
376
377 // SB5
378 if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
379 continue;
380
381 auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
382
383 // SB6
384 if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
385 continue;
386 // SB7
387 if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
388 continue;
389
390 if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
391 terminator_sequence_state = TerminatorSequenceState::Term;
392 term_was_a_term = code_point_is_a_term;
393 } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
394 terminator_sequence_state = TerminatorSequenceState::Close;
395 } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
396 terminator_sequence_state = TerminatorSequenceState::Sp;
397 } else {
398 terminator_sequence_state = TerminatorSequenceState::None;
399 }
400
401 // SB8
402 if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
403 auto it_copy = it;
404 bool illegal_sequence = false;
405 for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
406 if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
407 continue;
408 illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
409 }
410 if (illegal_sequence)
411 continue;
412 }
413
414 // SB8a
415 if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
416 continue;
417
418 auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
419 auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
420
421 // SB9
422 if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
423 continue;
424
425 // SB10
426 if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
427 continue;
428
429 // SB11
430 if (terminator_sequence_state >= TerminatorSequenceState::Term)
431 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
432 return;
433
434 // SB998
435 }
436 }
437
438 // SB2
439 callback(code_unit_length(view));
440#endif
441}
442
443void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
444{
445 for_each_sentence_segmentation_boundary_impl(view, move(callback));
446}
447
448void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
449{
450 for_each_sentence_segmentation_boundary_impl(view, move(callback));
451}
452
453void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
454{
455 for_each_sentence_segmentation_boundary_impl(view, move(callback));
456}
457
458}