Serenity Operating System
at master 101 lines 3.9 kB view raw
1/* 2 * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <AK/Array.h> 8#include <AK/StringView.h> 9#include <AK/Utf8View.h> 10#include <AK/Vector.h> 11#include <LibTest/TestCase.h> 12#include <LibUnicode/Segmentation.h> 13 14template<size_t N> 15static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N]) 16{ 17 Vector<size_t> boundaries; 18 Utf8View view { string }; 19 20 Unicode::for_each_grapheme_segmentation_boundary(view, [&](auto boundary) { 21 boundaries.append(boundary); 22 return IterationDecision::Continue; 23 }); 24 25 EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries }); 26} 27 28TEST_CASE(grapheme_segmentation) 29{ 30 Unicode::for_each_grapheme_segmentation_boundary(Utf8View {}, [&](auto) { 31 VERIFY_NOT_REACHED(); 32 return IterationDecision::Break; 33 }); 34 35 test_grapheme_segmentation("a"sv, { 0u, 1u }); 36 test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u }); 37 test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u }); 38 39 test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u }); 40 test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u }); 41 test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u }); 42 43 test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u }); 44 test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u }); 45 test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u }); 46 test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u }); 47 test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u }); 48 49 test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u }); 50 test_grapheme_segmentation("a👨‍👩‍👧‍👦b"sv, { 0u, 1u, 26u, 27u }); 51 test_grapheme_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u }); 52} 53 54template<size_t N> 55static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N]) 56{ 57 Vector<size_t> boundaries; 58 Utf8View view { string }; 59 60 Unicode::for_each_word_segmentation_boundary(view, [&](auto boundary) { 61 boundaries.append(boundary); 62 return IterationDecision::Continue; 63 }); 64 65 EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries }); 66} 67 68TEST_CASE(word_segmentation) 69{ 70 Unicode::for_each_word_segmentation_boundary(Utf8View {}, [&](auto) { 71 VERIFY_NOT_REACHED(); 72 return IterationDecision::Break; 73 }); 74 75 test_word_segmentation("a"sv, { 0u, 1u }); 76 test_word_segmentation("ab"sv, { 0u, 2u }); 77 test_word_segmentation("abc"sv, { 0u, 3u }); 78 79 test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u }); 80 test_word_segmentation("ab cd"sv, { 0u, 2u, 4u, 6u }); 81 test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u }); 82 test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u }); 83 test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u }); 84 test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u }); 85 86 test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u }); 87 test_word_segmentation("a👨‍👩‍👧‍👦b"sv, { 0u, 1u, 26u, 27u }); 88 test_word_segmentation("a👩🏼‍❤️‍👨🏻b"sv, { 0u, 1u, 29u, 30u }); 89 90 test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u }); 91 test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u }); 92 test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u }); 93 test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u }); 94 95 test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u }); 96 test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u }); 97 98 test_word_segmentation( 99 "The quick (“brown”) fox can’t jump 32.3 feet, right?"sv, 100 { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u }); 101}