Serenity Operating System
1/*
2 * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include <AK/Array.h>
8#include <AK/StringView.h>
9#include <AK/Utf8View.h>
10#include <AK/Vector.h>
11#include <LibTest/TestCase.h>
12#include <LibUnicode/Segmentation.h>
13
14template<size_t N>
15static void test_grapheme_segmentation(StringView string, size_t const (&expected_boundaries)[N])
16{
17 Vector<size_t> boundaries;
18 Utf8View view { string };
19
20 Unicode::for_each_grapheme_segmentation_boundary(view, [&](auto boundary) {
21 boundaries.append(boundary);
22 return IterationDecision::Continue;
23 });
24
25 EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
26}
27
28TEST_CASE(grapheme_segmentation)
29{
30 Unicode::for_each_grapheme_segmentation_boundary(Utf8View {}, [&](auto) {
31 VERIFY_NOT_REACHED();
32 return IterationDecision::Break;
33 });
34
35 test_grapheme_segmentation("a"sv, { 0u, 1u });
36 test_grapheme_segmentation("ab"sv, { 0u, 1u, 2u });
37 test_grapheme_segmentation("abc"sv, { 0u, 1u, 2u, 3u });
38
39 test_grapheme_segmentation("a\nb"sv, { 0u, 1u, 2u, 3u });
40 test_grapheme_segmentation("a\n\rb"sv, { 0u, 1u, 2u, 3u, 4u });
41 test_grapheme_segmentation("a\r\nb"sv, { 0u, 1u, 3u, 4u });
42
43 test_grapheme_segmentation("aᄀb"sv, { 0u, 1u, 4u, 5u });
44 test_grapheme_segmentation("aᄀᄀb"sv, { 0u, 1u, 7u, 8u });
45 test_grapheme_segmentation("aᄀᆢb"sv, { 0u, 1u, 7u, 8u });
46 test_grapheme_segmentation("aᄀ가b"sv, { 0u, 1u, 7u, 8u });
47 test_grapheme_segmentation("aᄀ각b"sv, { 0u, 1u, 7u, 8u });
48
49 test_grapheme_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
50 test_grapheme_segmentation("a👨👩👧👦b"sv, { 0u, 1u, 26u, 27u });
51 test_grapheme_segmentation("a👩🏼❤️👨🏻b"sv, { 0u, 1u, 29u, 30u });
52}
53
54template<size_t N>
55static void test_word_segmentation(StringView string, size_t const (&expected_boundaries)[N])
56{
57 Vector<size_t> boundaries;
58 Utf8View view { string };
59
60 Unicode::for_each_word_segmentation_boundary(view, [&](auto boundary) {
61 boundaries.append(boundary);
62 return IterationDecision::Continue;
63 });
64
65 EXPECT_EQ(boundaries, ReadonlySpan<size_t> { expected_boundaries });
66}
67
68TEST_CASE(word_segmentation)
69{
70 Unicode::for_each_word_segmentation_boundary(Utf8View {}, [&](auto) {
71 VERIFY_NOT_REACHED();
72 return IterationDecision::Break;
73 });
74
75 test_word_segmentation("a"sv, { 0u, 1u });
76 test_word_segmentation("ab"sv, { 0u, 2u });
77 test_word_segmentation("abc"sv, { 0u, 3u });
78
79 test_word_segmentation("ab cd"sv, { 0u, 2u, 3u, 5u });
80 test_word_segmentation("ab cd"sv, { 0u, 2u, 4u, 6u });
81 test_word_segmentation("ab\tcd"sv, { 0u, 2u, 3u, 5u });
82 test_word_segmentation("ab\ncd"sv, { 0u, 2u, 3u, 5u });
83 test_word_segmentation("ab\n\rcd"sv, { 0u, 2u, 3u, 4u, 6u });
84 test_word_segmentation("ab\r\ncd"sv, { 0u, 2u, 4u, 6u });
85
86 test_word_segmentation("a😀b"sv, { 0u, 1u, 5u, 6u });
87 test_word_segmentation("a👨👩👧👦b"sv, { 0u, 1u, 26u, 27u });
88 test_word_segmentation("a👩🏼❤️👨🏻b"sv, { 0u, 1u, 29u, 30u });
89
90 test_word_segmentation("ab 12 cd"sv, { 0u, 2u, 3u, 5u, 6u, 8u });
91 test_word_segmentation("ab 1.2 cd"sv, { 0u, 2u, 3u, 6u, 7u, 9u });
92 test_word_segmentation("ab 12.34 cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
93 test_word_segmentation("ab example.com cd"sv, { 0u, 2u, 3u, 14u, 15u, 17u });
94
95 test_word_segmentation("ab can't cd"sv, { 0u, 2u, 3u, 8u, 9u, 11u });
96 test_word_segmentation("ab \"can't\" cd"sv, { 0u, 2u, 3u, 4u, 9u, 10u, 11u, 13u });
97
98 test_word_segmentation(
99 "The quick (“brown”) fox can’t jump 32.3 feet, right?"sv,
100 { 0u, 3u, 4u, 9u, 10u, 11u, 14u, 19u, 22u, 23u, 24u, 27u, 28u, 35u, 36u, 40u, 41u, 45u, 46u, 50u, 51u, 52u, 57u, 58u });
101}