Serenity Operating System
1/*
2 * Copyright (c) 2018-2022, Andreas Kling <kling@serenityos.org>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include <AK/Array.h>
8#include <AK/Checked.h>
9#include <AK/FlyString.h>
10#include <AK/Format.h>
11#include <AK/MemMem.h>
12#include <AK/Stream.h>
13#include <AK/String.h>
14#include <AK/Vector.h>
15#include <stdlib.h>
16
17namespace AK {
18
19namespace Detail {
20
21class StringData final : public RefCounted<StringData> {
22public:
23 static ErrorOr<NonnullRefPtr<StringData>> create_uninitialized(size_t, u8*& buffer);
24 static ErrorOr<NonnullRefPtr<StringData>> create_substring(StringData const& superstring, size_t start, size_t byte_count);
25 static ErrorOr<NonnullRefPtr<StringData>> from_utf8(char const* utf8_bytes, size_t);
26 static ErrorOr<NonnullRefPtr<StringData>> from_stream(Stream&, size_t byte_count);
27
28 struct SubstringData {
29 StringData const* superstring { nullptr };
30 u32 start_offset { 0 };
31 };
32
33 void operator delete(void* ptr);
34
35 ~StringData();
36
37 SubstringData const& substring_data() const
38 {
39 return *reinterpret_cast<SubstringData const*>(m_bytes_or_substring_data);
40 }
41
42 // NOTE: There is no guarantee about null-termination.
43 ReadonlyBytes bytes() const
44 {
45 if (m_substring) {
46 auto const& data = substring_data();
47 return data.superstring->bytes().slice(data.start_offset, m_byte_count);
48 }
49 return { &m_bytes_or_substring_data[0], m_byte_count };
50 }
51
52 StringView bytes_as_string_view() const { return { bytes() }; }
53
54 bool operator==(StringData const& other) const
55 {
56 return bytes_as_string_view() == other.bytes_as_string_view();
57 }
58
59 unsigned hash() const
60 {
61 if (!m_has_hash)
62 compute_hash();
63 return m_hash;
64 }
65
66 bool is_fly_string() const { return m_is_fly_string; }
67 void set_fly_string(bool is_fly_string) const { m_is_fly_string = is_fly_string; }
68
69private:
70 explicit StringData(size_t byte_count);
71 StringData(StringData const& superstring, size_t start, size_t byte_count);
72
73 void compute_hash() const;
74
75 u32 m_byte_count { 0 };
76 mutable unsigned m_hash { 0 };
77 mutable bool m_has_hash { false };
78 bool m_substring { false };
79 mutable bool m_is_fly_string { false };
80
81 alignas(SubstringData) u8 m_bytes_or_substring_data[0];
82};
83
84void StringData::operator delete(void* ptr)
85{
86 free(ptr);
87}
88
89StringData::StringData(size_t byte_count)
90 : m_byte_count(byte_count)
91{
92}
93
94StringData::StringData(StringData const& superstring, size_t start, size_t byte_count)
95 : m_byte_count(byte_count)
96 , m_substring(true)
97{
98 auto& data = const_cast<SubstringData&>(substring_data());
99 data.start_offset = start;
100 data.superstring = &superstring;
101 superstring.ref();
102}
103
104StringData::~StringData()
105{
106 if (m_is_fly_string)
107 FlyString::did_destroy_fly_string_data({}, bytes_as_string_view());
108 if (m_substring)
109 substring_data().superstring->unref();
110}
111
112constexpr size_t allocation_size_for_string_data(size_t length)
113{
114 return sizeof(StringData) + (sizeof(char) * length);
115}
116
117ErrorOr<NonnullRefPtr<StringData>> StringData::create_uninitialized(size_t byte_count, u8*& buffer)
118{
119 VERIFY(byte_count);
120 void* slot = malloc(allocation_size_for_string_data(byte_count));
121 if (!slot) {
122 return Error::from_errno(ENOMEM);
123 }
124 auto new_string_data = adopt_ref(*new (slot) StringData(byte_count));
125 buffer = const_cast<u8*>(new_string_data->bytes().data());
126 return new_string_data;
127}
128
129ErrorOr<NonnullRefPtr<StringData>> StringData::from_utf8(char const* utf8_data, size_t byte_count)
130{
131 // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
132 VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
133
134 VERIFY(utf8_data);
135 u8* buffer = nullptr;
136 auto new_string_data = TRY(create_uninitialized(byte_count, buffer));
137 memcpy(buffer, utf8_data, byte_count * sizeof(char));
138 return new_string_data;
139}
140
141static ErrorOr<void> read_stream_into_buffer(Stream& stream, Bytes buffer)
142{
143 TRY(stream.read_until_filled(buffer));
144
145 if (!Utf8View { StringView { buffer } }.validate())
146 return Error::from_string_literal("String::from_stream: Input was not valid UTF-8");
147
148 return {};
149}
150
151ErrorOr<NonnullRefPtr<StringData>> StringData::from_stream(Stream& stream, size_t byte_count)
152{
153 // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
154 VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
155
156 u8* buffer = nullptr;
157 auto new_string_data = TRY(create_uninitialized(byte_count, buffer));
158 TRY(read_stream_into_buffer(stream, { buffer, byte_count }));
159
160 return new_string_data;
161}
162
163ErrorOr<NonnullRefPtr<StringData>> StringData::create_substring(StringData const& superstring, size_t start, size_t byte_count)
164{
165 // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
166 VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
167
168 void* slot = malloc(sizeof(StringData) + sizeof(StringData::SubstringData));
169 if (!slot) {
170 return Error::from_errno(ENOMEM);
171 }
172 return adopt_ref(*new (slot) StringData(superstring, start, byte_count));
173}
174
175void StringData::compute_hash() const
176{
177 auto bytes = this->bytes();
178 if (bytes.size() == 0)
179 m_hash = 0;
180 else
181 m_hash = string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
182 m_has_hash = true;
183}
184
185}
186
187String::String(NonnullRefPtr<Detail::StringData const> data)
188 : m_data(&data.leak_ref())
189{
190}
191
192String::String(String const& other)
193 : m_data(other.m_data)
194{
195 if (!is_short_string())
196 m_data->ref();
197}
198
199String::String(String&& other)
200 : m_data(exchange(other.m_data, nullptr))
201{
202 other.m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG;
203}
204
205String& String::operator=(String&& other)
206{
207 if (!is_short_string())
208 m_data->unref();
209
210 m_data = exchange(other.m_data, nullptr);
211 other.m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG;
212 return *this;
213}
214
215String& String::operator=(String const& other)
216{
217 if (&other != this) {
218 m_data = other.m_data;
219 if (!is_short_string())
220 m_data->ref();
221 }
222 return *this;
223}
224
225void String::destroy_string()
226{
227 if (!is_short_string())
228 m_data->unref();
229}
230
231ErrorOr<String> String::from_utf8(StringView view)
232{
233 if (!Utf8View { view }.validate())
234 return Error::from_string_literal("String::from_utf8: Input was not valid UTF-8");
235
236 if (view.length() <= MAX_SHORT_STRING_BYTE_COUNT) {
237 ShortString short_string;
238 if (!view.is_empty())
239 memcpy(short_string.storage, view.characters_without_null_termination(), view.length());
240 short_string.byte_count_and_short_string_flag = (view.length() << 1) | SHORT_STRING_FLAG;
241 return String { short_string };
242 }
243 auto data = TRY(Detail::StringData::from_utf8(view.characters_without_null_termination(), view.length()));
244 return String { move(data) };
245}
246
247ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count)
248{
249 if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT) {
250 ShortString short_string;
251 if (byte_count > 0)
252 TRY(Detail::read_stream_into_buffer(stream, { short_string.storage, byte_count }));
253 short_string.byte_count_and_short_string_flag = (byte_count << 1) | SHORT_STRING_FLAG;
254 return String { short_string };
255 }
256 auto data = TRY(Detail::StringData::from_stream(stream, byte_count));
257 return String { move(data) };
258}
259
260ErrorOr<String> String::repeated(u32 code_point, size_t count)
261{
262 VERIFY(is_unicode(code_point));
263
264 Array<u8, 4> code_point_as_utf8;
265 size_t i = 0;
266
267 size_t code_point_byte_length = UnicodeUtils::code_point_to_utf8(code_point, [&](auto byte) {
268 code_point_as_utf8[i++] = static_cast<u8>(byte);
269 });
270
271 auto copy_to_buffer = [&](u8* buffer) {
272 if (code_point_byte_length == 1) {
273 memset(buffer, code_point_as_utf8[0], count);
274 return;
275 }
276
277 for (i = 0; i < count; ++i)
278 memcpy(buffer + (i * code_point_byte_length), code_point_as_utf8.data(), code_point_byte_length);
279 };
280
281 auto total_byte_count = code_point_byte_length * count;
282
283 if (total_byte_count <= MAX_SHORT_STRING_BYTE_COUNT) {
284 ShortString short_string;
285 copy_to_buffer(short_string.storage);
286 short_string.byte_count_and_short_string_flag = (total_byte_count << 1) | SHORT_STRING_FLAG;
287
288 return String { short_string };
289 }
290
291 u8* buffer = nullptr;
292 auto new_string_data = TRY(Detail::StringData::create_uninitialized(total_byte_count, buffer));
293 copy_to_buffer(buffer);
294
295 return String { move(new_string_data) };
296}
297
298StringView String::bytes_as_string_view() const
299{
300 return StringView(bytes());
301}
302
303ReadonlyBytes String::bytes() const
304{
305 if (is_short_string())
306 return m_short_string.bytes();
307 return m_data->bytes();
308}
309
310bool String::is_empty() const
311{
312 return bytes().size() == 0;
313}
314
315ErrorOr<String> String::vformatted(StringView fmtstr, TypeErasedFormatParams& params)
316{
317 StringBuilder builder;
318 TRY(vformat(builder, fmtstr, params));
319 return builder.to_string();
320}
321
322ErrorOr<Vector<String>> String::split(u32 separator, SplitBehavior split_behavior) const
323{
324 return split_limit(separator, 0, split_behavior);
325}
326
327ErrorOr<Vector<String>> String::split_limit(u32 separator, size_t limit, SplitBehavior split_behavior) const
328{
329 Vector<String> result;
330
331 if (is_empty())
332 return result;
333
334 bool keep_empty = has_flag(split_behavior, SplitBehavior::KeepEmpty);
335
336 size_t substring_start = 0;
337 for (auto it = code_points().begin(); it != code_points().end() && (result.size() + 1) != limit; ++it) {
338 u32 code_point = *it;
339 if (code_point == separator) {
340 size_t substring_length = code_points().iterator_offset(it) - substring_start;
341 if (substring_length != 0 || keep_empty)
342 TRY(result.try_append(TRY(substring_from_byte_offset_with_shared_superstring(substring_start, substring_length))));
343 substring_start = code_points().iterator_offset(it) + it.underlying_code_point_length_in_bytes();
344 }
345 }
346 size_t tail_length = code_points().byte_length() - substring_start;
347 if (tail_length != 0 || keep_empty)
348 TRY(result.try_append(TRY(substring_from_byte_offset_with_shared_superstring(substring_start, tail_length))));
349 return result;
350}
351
352Optional<size_t> String::find_byte_offset(u32 code_point, size_t from_byte_offset) const
353{
354 auto code_points = this->code_points();
355 if (from_byte_offset >= code_points.byte_length())
356 return {};
357
358 for (auto it = code_points.iterator_at_byte_offset(from_byte_offset); it != code_points.end(); ++it) {
359 if (*it == code_point)
360 return code_points.byte_offset_of(it);
361 }
362
363 return {};
364}
365
366Optional<size_t> String::find_byte_offset(StringView substring, size_t from_byte_offset) const
367{
368 auto view = bytes_as_string_view();
369 if (from_byte_offset >= view.length())
370 return {};
371
372 auto index = memmem_optional(
373 view.characters_without_null_termination() + from_byte_offset, view.length() - from_byte_offset,
374 substring.characters_without_null_termination(), substring.length());
375
376 if (index.has_value())
377 return *index + from_byte_offset;
378 return {};
379}
380
381bool String::operator==(String const& other) const
382{
383 if (is_short_string())
384 return m_data == other.m_data;
385 return bytes_as_string_view() == other.bytes_as_string_view();
386}
387
388bool String::operator==(FlyString const& other) const
389{
390 if (reinterpret_cast<uintptr_t>(m_data) == other.data({}))
391 return true;
392 return bytes_as_string_view() == other.bytes_as_string_view();
393}
394
395bool String::operator==(StringView other) const
396{
397 return bytes_as_string_view() == other;
398}
399
400ErrorOr<String> String::substring_from_byte_offset(size_t start, size_t byte_count) const
401{
402 if (!byte_count)
403 return String {};
404 return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count));
405}
406
407ErrorOr<String> String::substring_from_byte_offset(size_t start) const
408{
409 VERIFY(start <= bytes_as_string_view().length());
410 return substring_from_byte_offset(start, bytes_as_string_view().length() - start);
411}
412
413ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const
414{
415 if (!byte_count)
416 return String {};
417 if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT)
418 return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count));
419 return String { TRY(Detail::StringData::create_substring(*m_data, start, byte_count)) };
420}
421
422ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start) const
423{
424 VERIFY(start <= bytes_as_string_view().length());
425 return substring_from_byte_offset_with_shared_superstring(start, bytes_as_string_view().length() - start);
426}
427
428bool String::operator==(char const* c_string) const
429{
430 return bytes_as_string_view() == c_string;
431}
432
433u32 String::hash() const
434{
435 if (is_short_string()) {
436 auto bytes = this->bytes();
437 return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
438 }
439 return m_data->hash();
440}
441
442Utf8View String::code_points() const
443{
444 return Utf8View(bytes_as_string_view());
445}
446
447ErrorOr<void> Formatter<String>::format(FormatBuilder& builder, String const& utf8_string)
448{
449 return Formatter<StringView>::format(builder, utf8_string.bytes_as_string_view());
450}
451
452ErrorOr<String> String::replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const
453{
454 return StringUtils::replace(*this, needle, replacement, replace_mode);
455}
456
457ErrorOr<String> String::reverse() const
458{
459 // FIXME: This handles multi-byte code points, but not e.g. grapheme clusters.
460 // FIXME: We could avoid allocating a temporary vector if Utf8View supports reverse iteration.
461 auto code_point_length = code_points().length();
462
463 Vector<u32> code_points;
464 TRY(code_points.try_ensure_capacity(code_point_length));
465
466 for (auto code_point : this->code_points())
467 code_points.unchecked_append(code_point);
468
469 auto builder = TRY(StringBuilder::create(code_point_length * sizeof(u32)));
470 while (!code_points.is_empty())
471 TRY(builder.try_append_code_point(code_points.take_last()));
472
473 return builder.to_string();
474}
475
476ErrorOr<String> String::trim(Utf8View const& code_points_to_trim, TrimMode mode) const
477{
478 auto trimmed = code_points().trim(code_points_to_trim, mode);
479 return String::from_utf8(trimmed.as_string());
480}
481
482ErrorOr<String> String::trim(StringView code_points_to_trim, TrimMode mode) const
483{
484 return trim(Utf8View { code_points_to_trim }, mode);
485}
486
487bool String::contains(StringView needle, CaseSensitivity case_sensitivity) const
488{
489 return StringUtils::contains(bytes_as_string_view(), needle, case_sensitivity);
490}
491
492bool String::contains(u32 needle, CaseSensitivity case_sensitivity) const
493{
494 auto needle_as_string = String::from_code_point(needle);
495 return contains(needle_as_string.bytes_as_string_view(), case_sensitivity);
496}
497
498bool String::starts_with(u32 code_point) const
499{
500 if (is_empty())
501 return false;
502
503 return *code_points().begin() == code_point;
504}
505
506bool String::starts_with_bytes(StringView bytes) const
507{
508 return bytes_as_string_view().starts_with(bytes);
509}
510
511bool String::ends_with(u32 code_point) const
512{
513 if (is_empty())
514 return false;
515
516 u32 last_code_point = 0;
517 for (auto it = code_points().begin(); it != code_points().end(); ++it)
518 last_code_point = *it;
519
520 return last_code_point == code_point;
521}
522
523bool String::ends_with_bytes(StringView bytes) const
524{
525 return bytes_as_string_view().ends_with(bytes);
526}
527
528bool String::is_short_string() const
529{
530 return has_short_string_bit(reinterpret_cast<uintptr_t>(m_data));
531}
532
533ReadonlyBytes String::ShortString::bytes() const
534{
535 return { storage, byte_count() };
536}
537
538size_t String::ShortString::byte_count() const
539{
540 return byte_count_and_short_string_flag >> 1;
541}
542
543unsigned Traits<String>::hash(String const& string)
544{
545 return string.hash();
546}
547
548String String::fly_string_data_to_string(Badge<FlyString>, uintptr_t const& data)
549{
550 if (has_short_string_bit(data))
551 return String { *reinterpret_cast<ShortString const*>(&data) };
552
553 auto const* string_data = reinterpret_cast<Detail::StringData const*>(data);
554 return String { NonnullRefPtr<Detail::StringData const>(*string_data) };
555}
556
557StringView String::fly_string_data_to_string_view(Badge<FlyString>, uintptr_t const& data)
558{
559 if (has_short_string_bit(data)) {
560 auto const* short_string = reinterpret_cast<ShortString const*>(&data);
561 return short_string->bytes();
562 }
563
564 auto const* string_data = reinterpret_cast<Detail::StringData const*>(data);
565 return string_data->bytes_as_string_view();
566}
567
568u32 String::fly_string_data_to_hash(Badge<FlyString>, uintptr_t const& data)
569{
570 if (has_short_string_bit(data)) {
571 auto const* short_string = reinterpret_cast<ShortString const*>(&data);
572 auto bytes = short_string->bytes();
573 return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
574 }
575
576 auto const* string_data = reinterpret_cast<Detail::StringData const*>(data);
577 return string_data->hash();
578}
579
580uintptr_t String::to_fly_string_data(Badge<FlyString>) const
581{
582 return reinterpret_cast<uintptr_t>(m_data);
583}
584
585void String::ref_fly_string_data(Badge<FlyString>, uintptr_t data)
586{
587 if (has_short_string_bit(data))
588 return;
589
590 auto const* string_data = reinterpret_cast<Detail::StringData const*>(data);
591 string_data->ref();
592}
593
594void String::unref_fly_string_data(Badge<FlyString>, uintptr_t data)
595{
596 if (has_short_string_bit(data))
597 return;
598
599 auto const* string_data = reinterpret_cast<Detail::StringData const*>(data);
600 string_data->unref();
601}
602
603void String::did_create_fly_string(Badge<FlyString>) const
604{
605 VERIFY(!is_short_string());
606 m_data->set_fly_string(true);
607}
608
609DeprecatedString String::to_deprecated_string() const
610{
611 return DeprecatedString(bytes_as_string_view());
612}
613
614ErrorOr<String> String::from_deprecated_string(DeprecatedString const& deprecated_string)
615{
616 return String::from_utf8(deprecated_string.view());
617}
618
619}