Serenity Operating System
at master 458 lines 18 kB view raw
1/* 2 * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org> 3 * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org> 4 * 5 * SPDX-License-Identifier: BSD-2-Clause 6 */ 7 8#include <AK/Utf16View.h> 9#include <AK/Utf32View.h> 10#include <AK/Utf8View.h> 11#include <LibUnicode/CharacterTypes.h> 12#include <LibUnicode/Segmentation.h> 13 14#if ENABLE_UNICODE_DATA 15# include <LibUnicode/UnicodeData.h> 16#endif 17 18namespace Unicode { 19 20template<typename ViewType> 21static size_t code_unit_length(ViewType const& view) 22{ 23 if constexpr (IsSame<ViewType, Utf8View>) 24 return view.byte_length(); 25 else if constexpr (IsSame<ViewType, Utf16View>) 26 return view.length_in_code_units(); 27 else if constexpr (IsSame<ViewType, Utf32View>) 28 return view.length(); 29 else 30 static_assert(DependentFalse<ViewType>); 31} 32 33template<typename ViewType, typename CodeUnitIterator> 34static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it) 35{ 36 if constexpr (IsSame<ViewType, Utf8View>) 37 return view.byte_offset_of(it); 38 else if constexpr (IsSame<ViewType, Utf16View>) 39 return view.code_unit_offset_of(it); 40 else if constexpr (IsSame<ViewType, Utf32View>) 41 return view.iterator_offset(it); 42 else 43 static_assert(DependentFalse<ViewType>); 44} 45 46template<typename ViewType> 47static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback) 48{ 49#if ENABLE_UNICODE_DATA 50 using GBP = GraphemeBreakProperty; 51 52 // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 53 if (view.is_empty()) 54 return; 55 56 auto has_any_gbp = [](u32 code_point, auto&&... properties) { 57 return (code_point_has_grapheme_break_property(code_point, properties) || ...); 58 }; 59 60 // GB1 61 if (callback(0) == IterationDecision::Break) 62 return; 63 64 if (code_unit_length(view) > 1) { 65 auto it = view.begin(); 66 auto code_point = *it; 67 u32 next_code_point = 0; 68 auto current_ri_chain = 0; 69 70 for (++it; it != view.end(); ++it, code_point = next_code_point) { 71 next_code_point = *it; 72 73 // GB11 74 if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) { 75 auto it_copy = it; 76 77 while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend)) 78 ++it_copy; 79 80 if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) { 81 ++it_copy; 82 83 if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) { 84 next_code_point = *it_copy; 85 it = it_copy; 86 continue; 87 } 88 } 89 } 90 91 auto code_point_is_cr = has_any_gbp(code_point, GBP::CR); 92 auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF); 93 94 // GB3 95 if (code_point_is_cr && next_code_point_is_lf) 96 continue; 97 // GB4, GB5 98 if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) { 99 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) 100 return; 101 continue; 102 } 103 104 auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V); 105 auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T); 106 107 // GB6 108 if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT))) 109 continue; 110 // GB7 111 if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V)) 112 continue; 113 // GB8 114 if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T)) 115 continue; 116 117 // GB9 118 if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) 119 continue; 120 // GB9a 121 if (has_any_gbp(next_code_point, GBP::SpacingMark)) 122 continue; 123 // GB9b 124 if (has_any_gbp(code_point, GBP::Prepend)) 125 continue; 126 127 auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator); 128 current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0; 129 130 // GB12, GB13 131 if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1) 132 continue; 133 134 // GB999 135 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) 136 return; 137 } 138 } 139 140 // GB2 141 callback(code_unit_length(view)); 142#endif 143} 144 145void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback) 146{ 147 for_each_grapheme_segmentation_boundary_impl(view, move(callback)); 148} 149 150void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback) 151{ 152 for_each_grapheme_segmentation_boundary_impl(view, move(callback)); 153} 154 155void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback) 156{ 157 for_each_grapheme_segmentation_boundary_impl(view, move(callback)); 158} 159 160template<typename ViewType> 161static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback) 162{ 163#if ENABLE_UNICODE_DATA 164 using WBP = WordBreakProperty; 165 166 // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules 167 if (view.is_empty()) 168 return; 169 170 auto has_any_wbp = [](u32 code_point, auto&&... properties) { 171 return (code_point_has_word_break_property(code_point, properties) || ...); 172 }; 173 174 // WB1 175 if (callback(0) == IterationDecision::Break) 176 return; 177 178 if (code_unit_length(view) > 1) { 179 auto it = view.begin(); 180 auto code_point = *it; 181 u32 next_code_point; 182 Optional<u32> previous_code_point; 183 auto current_ri_chain = 0; 184 185 for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) { 186 next_code_point = *it; 187 188 auto code_point_is_cr = has_any_wbp(code_point, WBP::CR); 189 auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF); 190 191 // WB3 192 if (code_point_is_cr && next_code_point_is_lf) 193 continue; 194 // WB3a, WB3b 195 if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) { 196 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) 197 return; 198 continue; 199 } 200 // WB3c 201 if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic)) 202 continue; 203 // WB3d 204 if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace)) 205 continue; 206 207 // WB4 208 if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ)) 209 continue; 210 211 auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter); 212 auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter); 213 auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter); 214 auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter); 215 216 // WB5 217 if (code_point_is_ah_letter && next_code_point_is_ah_letter) 218 continue; 219 220 Optional<u32> next_next_code_point; 221 if (it != view.end()) { 222 auto it_copy = it; 223 ++it_copy; 224 if (it_copy != view.end()) 225 next_next_code_point = *it_copy; 226 } 227 bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter); 228 bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter)); 229 230 auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote); 231 232 // WB6 233 if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter))) 234 continue; 235 236 auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote); 237 auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter); 238 auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter)); 239 240 // WB7 241 if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter))) 242 continue; 243 // WB7a 244 if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote)) 245 continue; 246 // WB7b 247 if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote)) 248 continue; 249 // WB7c 250 if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote)) 251 continue; 252 253 auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric); 254 auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric); 255 256 // WB8 257 if (code_point_is_numeric && next_code_point_is_numeric) 258 continue; 259 // WB9 260 if (code_point_is_ah_letter && next_code_point_is_numeric) 261 continue; 262 // WB10 263 if (code_point_is_numeric && next_code_point_is_ah_letter) 264 continue; 265 266 auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric); 267 268 // WB11 269 if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum))) 270 continue; 271 272 bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric); 273 274 // WB12 275 if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum))) 276 continue; 277 278 auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana); 279 auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana); 280 281 // WB13 282 if (code_point_is_katakana && next_code_point_is_katakana) 283 continue; 284 285 auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet); 286 287 // WB13a 288 if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet)) 289 continue; 290 // WB13b 291 if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana)) 292 continue; 293 294 auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator); 295 current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0; 296 297 // WB15, WB16 298 if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1) 299 continue; 300 301 // WB999 302 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) 303 return; 304 } 305 } 306 307 // WB2 308 callback(code_unit_length(view)); 309#endif 310} 311 312void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback) 313{ 314 for_each_word_segmentation_boundary_impl(view, move(callback)); 315} 316 317void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback) 318{ 319 for_each_word_segmentation_boundary_impl(view, move(callback)); 320} 321 322void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback) 323{ 324 for_each_word_segmentation_boundary_impl(view, move(callback)); 325} 326 327template<typename ViewType> 328static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback) 329{ 330#if ENABLE_UNICODE_DATA 331 using SBP = SentenceBreakProperty; 332 333 // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 334 if (view.is_empty()) 335 return; 336 337 auto has_any_sbp = [](u32 code_point, auto&&... properties) { 338 return (code_point_has_sentence_break_property(code_point, properties) || ...); 339 }; 340 341 // SB1 342 if (callback(0) == IterationDecision::Break) 343 return; 344 345 if (code_unit_length(view) > 1) { 346 auto it = view.begin(); 347 auto code_point = *it; 348 u32 next_code_point; 349 Optional<u32> previous_code_point; 350 enum class TerminatorSequenceState { 351 None, 352 Term, 353 Close, 354 Sp 355 } terminator_sequence_state { TerminatorSequenceState::None }; 356 auto term_was_a_term = false; 357 358 for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) { 359 next_code_point = *it; 360 361 auto code_point_is_cr = has_any_sbp(code_point, SBP::CR); 362 auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF); 363 364 // SB3 365 if (code_point_is_cr && next_code_point_is_lf) 366 continue; 367 368 auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep); 369 370 // SB4 371 if (code_point_is_para_sep) { 372 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) 373 return; 374 continue; 375 } 376 377 // SB5 378 if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend)) 379 continue; 380 381 auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm); 382 383 // SB6 384 if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric)) 385 continue; 386 // SB7 387 if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper)) 388 continue; 389 390 if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) { 391 terminator_sequence_state = TerminatorSequenceState::Term; 392 term_was_a_term = code_point_is_a_term; 393 } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) { 394 terminator_sequence_state = TerminatorSequenceState::Close; 395 } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) { 396 terminator_sequence_state = TerminatorSequenceState::Sp; 397 } else { 398 terminator_sequence_state = TerminatorSequenceState::None; 399 } 400 401 // SB8 402 if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) { 403 auto it_copy = it; 404 bool illegal_sequence = false; 405 for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) { 406 if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend)) 407 continue; 408 illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower); 409 } 410 if (illegal_sequence) 411 continue; 412 } 413 414 // SB8a 415 if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm))) 416 continue; 417 418 auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp); 419 auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF); 420 421 // SB9 422 if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close))) 423 continue; 424 425 // SB10 426 if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep)) 427 continue; 428 429 // SB11 430 if (terminator_sequence_state >= TerminatorSequenceState::Term) 431 if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) 432 return; 433 434 // SB998 435 } 436 } 437 438 // SB2 439 callback(code_unit_length(view)); 440#endif 441} 442 443void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback) 444{ 445 for_each_sentence_segmentation_boundary_impl(view, move(callback)); 446} 447 448void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback) 449{ 450 for_each_sentence_segmentation_boundary_impl(view, move(callback)); 451} 452 453void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback) 454{ 455 for_each_sentence_segmentation_boundary_impl(view, move(callback)); 456} 457 458}