Serenity Operating System
at master 619 lines 19 kB view raw
1/* 2 * Copyright (c) 2018-2022, Andreas Kling <kling@serenityos.org> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <AK/Array.h> 8#include <AK/Checked.h> 9#include <AK/FlyString.h> 10#include <AK/Format.h> 11#include <AK/MemMem.h> 12#include <AK/Stream.h> 13#include <AK/String.h> 14#include <AK/Vector.h> 15#include <stdlib.h> 16 17namespace AK { 18 19namespace Detail { 20 21class StringData final : public RefCounted<StringData> { 22public: 23 static ErrorOr<NonnullRefPtr<StringData>> create_uninitialized(size_t, u8*& buffer); 24 static ErrorOr<NonnullRefPtr<StringData>> create_substring(StringData const& superstring, size_t start, size_t byte_count); 25 static ErrorOr<NonnullRefPtr<StringData>> from_utf8(char const* utf8_bytes, size_t); 26 static ErrorOr<NonnullRefPtr<StringData>> from_stream(Stream&, size_t byte_count); 27 28 struct SubstringData { 29 StringData const* superstring { nullptr }; 30 u32 start_offset { 0 }; 31 }; 32 33 void operator delete(void* ptr); 34 35 ~StringData(); 36 37 SubstringData const& substring_data() const 38 { 39 return *reinterpret_cast<SubstringData const*>(m_bytes_or_substring_data); 40 } 41 42 // NOTE: There is no guarantee about null-termination. 43 ReadonlyBytes bytes() const 44 { 45 if (m_substring) { 46 auto const& data = substring_data(); 47 return data.superstring->bytes().slice(data.start_offset, m_byte_count); 48 } 49 return { &m_bytes_or_substring_data[0], m_byte_count }; 50 } 51 52 StringView bytes_as_string_view() const { return { bytes() }; } 53 54 bool operator==(StringData const& other) const 55 { 56 return bytes_as_string_view() == other.bytes_as_string_view(); 57 } 58 59 unsigned hash() const 60 { 61 if (!m_has_hash) 62 compute_hash(); 63 return m_hash; 64 } 65 66 bool is_fly_string() const { return m_is_fly_string; } 67 void set_fly_string(bool is_fly_string) const { m_is_fly_string = is_fly_string; } 68 69private: 70 explicit StringData(size_t byte_count); 71 StringData(StringData const& superstring, size_t start, size_t byte_count); 72 73 void compute_hash() const; 74 75 u32 m_byte_count { 0 }; 76 mutable unsigned m_hash { 0 }; 77 mutable bool m_has_hash { false }; 78 bool m_substring { false }; 79 mutable bool m_is_fly_string { false }; 80 81 alignas(SubstringData) u8 m_bytes_or_substring_data[0]; 82}; 83 84void StringData::operator delete(void* ptr) 85{ 86 free(ptr); 87} 88 89StringData::StringData(size_t byte_count) 90 : m_byte_count(byte_count) 91{ 92} 93 94StringData::StringData(StringData const& superstring, size_t start, size_t byte_count) 95 : m_byte_count(byte_count) 96 , m_substring(true) 97{ 98 auto& data = const_cast<SubstringData&>(substring_data()); 99 data.start_offset = start; 100 data.superstring = &superstring; 101 superstring.ref(); 102} 103 104StringData::~StringData() 105{ 106 if (m_is_fly_string) 107 FlyString::did_destroy_fly_string_data({}, bytes_as_string_view()); 108 if (m_substring) 109 substring_data().superstring->unref(); 110} 111 112constexpr size_t allocation_size_for_string_data(size_t length) 113{ 114 return sizeof(StringData) + (sizeof(char) * length); 115} 116 117ErrorOr<NonnullRefPtr<StringData>> StringData::create_uninitialized(size_t byte_count, u8*& buffer) 118{ 119 VERIFY(byte_count); 120 void* slot = malloc(allocation_size_for_string_data(byte_count)); 121 if (!slot) { 122 return Error::from_errno(ENOMEM); 123 } 124 auto new_string_data = adopt_ref(*new (slot) StringData(byte_count)); 125 buffer = const_cast<u8*>(new_string_data->bytes().data()); 126 return new_string_data; 127} 128 129ErrorOr<NonnullRefPtr<StringData>> StringData::from_utf8(char const* utf8_data, size_t byte_count) 130{ 131 // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization. 132 VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT); 133 134 VERIFY(utf8_data); 135 u8* buffer = nullptr; 136 auto new_string_data = TRY(create_uninitialized(byte_count, buffer)); 137 memcpy(buffer, utf8_data, byte_count * sizeof(char)); 138 return new_string_data; 139} 140 141static ErrorOr<void> read_stream_into_buffer(Stream& stream, Bytes buffer) 142{ 143 TRY(stream.read_until_filled(buffer)); 144 145 if (!Utf8View { StringView { buffer } }.validate()) 146 return Error::from_string_literal("String::from_stream: Input was not valid UTF-8"); 147 148 return {}; 149} 150 151ErrorOr<NonnullRefPtr<StringData>> StringData::from_stream(Stream& stream, size_t byte_count) 152{ 153 // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization. 154 VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT); 155 156 u8* buffer = nullptr; 157 auto new_string_data = TRY(create_uninitialized(byte_count, buffer)); 158 TRY(read_stream_into_buffer(stream, { buffer, byte_count })); 159 160 return new_string_data; 161} 162 163ErrorOr<NonnullRefPtr<StringData>> StringData::create_substring(StringData const& superstring, size_t start, size_t byte_count) 164{ 165 // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization. 166 VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT); 167 168 void* slot = malloc(sizeof(StringData) + sizeof(StringData::SubstringData)); 169 if (!slot) { 170 return Error::from_errno(ENOMEM); 171 } 172 return adopt_ref(*new (slot) StringData(superstring, start, byte_count)); 173} 174 175void StringData::compute_hash() const 176{ 177 auto bytes = this->bytes(); 178 if (bytes.size() == 0) 179 m_hash = 0; 180 else 181 m_hash = string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size()); 182 m_has_hash = true; 183} 184 185} 186 187String::String(NonnullRefPtr<Detail::StringData const> data) 188 : m_data(&data.leak_ref()) 189{ 190} 191 192String::String(String const& other) 193 : m_data(other.m_data) 194{ 195 if (!is_short_string()) 196 m_data->ref(); 197} 198 199String::String(String&& other) 200 : m_data(exchange(other.m_data, nullptr)) 201{ 202 other.m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG; 203} 204 205String& String::operator=(String&& other) 206{ 207 if (!is_short_string()) 208 m_data->unref(); 209 210 m_data = exchange(other.m_data, nullptr); 211 other.m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG; 212 return *this; 213} 214 215String& String::operator=(String const& other) 216{ 217 if (&other != this) { 218 m_data = other.m_data; 219 if (!is_short_string()) 220 m_data->ref(); 221 } 222 return *this; 223} 224 225void String::destroy_string() 226{ 227 if (!is_short_string()) 228 m_data->unref(); 229} 230 231ErrorOr<String> String::from_utf8(StringView view) 232{ 233 if (!Utf8View { view }.validate()) 234 return Error::from_string_literal("String::from_utf8: Input was not valid UTF-8"); 235 236 if (view.length() <= MAX_SHORT_STRING_BYTE_COUNT) { 237 ShortString short_string; 238 if (!view.is_empty()) 239 memcpy(short_string.storage, view.characters_without_null_termination(), view.length()); 240 short_string.byte_count_and_short_string_flag = (view.length() << 1) | SHORT_STRING_FLAG; 241 return String { short_string }; 242 } 243 auto data = TRY(Detail::StringData::from_utf8(view.characters_without_null_termination(), view.length())); 244 return String { move(data) }; 245} 246 247ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count) 248{ 249 if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT) { 250 ShortString short_string; 251 if (byte_count > 0) 252 TRY(Detail::read_stream_into_buffer(stream, { short_string.storage, byte_count })); 253 short_string.byte_count_and_short_string_flag = (byte_count << 1) | SHORT_STRING_FLAG; 254 return String { short_string }; 255 } 256 auto data = TRY(Detail::StringData::from_stream(stream, byte_count)); 257 return String { move(data) }; 258} 259 260ErrorOr<String> String::repeated(u32 code_point, size_t count) 261{ 262 VERIFY(is_unicode(code_point)); 263 264 Array<u8, 4> code_point_as_utf8; 265 size_t i = 0; 266 267 size_t code_point_byte_length = UnicodeUtils::code_point_to_utf8(code_point, [&](auto byte) { 268 code_point_as_utf8[i++] = static_cast<u8>(byte); 269 }); 270 271 auto copy_to_buffer = [&](u8* buffer) { 272 if (code_point_byte_length == 1) { 273 memset(buffer, code_point_as_utf8[0], count); 274 return; 275 } 276 277 for (i = 0; i < count; ++i) 278 memcpy(buffer + (i * code_point_byte_length), code_point_as_utf8.data(), code_point_byte_length); 279 }; 280 281 auto total_byte_count = code_point_byte_length * count; 282 283 if (total_byte_count <= MAX_SHORT_STRING_BYTE_COUNT) { 284 ShortString short_string; 285 copy_to_buffer(short_string.storage); 286 short_string.byte_count_and_short_string_flag = (total_byte_count << 1) | SHORT_STRING_FLAG; 287 288 return String { short_string }; 289 } 290 291 u8* buffer = nullptr; 292 auto new_string_data = TRY(Detail::StringData::create_uninitialized(total_byte_count, buffer)); 293 copy_to_buffer(buffer); 294 295 return String { move(new_string_data) }; 296} 297 298StringView String::bytes_as_string_view() const 299{ 300 return StringView(bytes()); 301} 302 303ReadonlyBytes String::bytes() const 304{ 305 if (is_short_string()) 306 return m_short_string.bytes(); 307 return m_data->bytes(); 308} 309 310bool String::is_empty() const 311{ 312 return bytes().size() == 0; 313} 314 315ErrorOr<String> String::vformatted(StringView fmtstr, TypeErasedFormatParams& params) 316{ 317 StringBuilder builder; 318 TRY(vformat(builder, fmtstr, params)); 319 return builder.to_string(); 320} 321 322ErrorOr<Vector<String>> String::split(u32 separator, SplitBehavior split_behavior) const 323{ 324 return split_limit(separator, 0, split_behavior); 325} 326 327ErrorOr<Vector<String>> String::split_limit(u32 separator, size_t limit, SplitBehavior split_behavior) const 328{ 329 Vector<String> result; 330 331 if (is_empty()) 332 return result; 333 334 bool keep_empty = has_flag(split_behavior, SplitBehavior::KeepEmpty); 335 336 size_t substring_start = 0; 337 for (auto it = code_points().begin(); it != code_points().end() && (result.size() + 1) != limit; ++it) { 338 u32 code_point = *it; 339 if (code_point == separator) { 340 size_t substring_length = code_points().iterator_offset(it) - substring_start; 341 if (substring_length != 0 || keep_empty) 342 TRY(result.try_append(TRY(substring_from_byte_offset_with_shared_superstring(substring_start, substring_length)))); 343 substring_start = code_points().iterator_offset(it) + it.underlying_code_point_length_in_bytes(); 344 } 345 } 346 size_t tail_length = code_points().byte_length() - substring_start; 347 if (tail_length != 0 || keep_empty) 348 TRY(result.try_append(TRY(substring_from_byte_offset_with_shared_superstring(substring_start, tail_length)))); 349 return result; 350} 351 352Optional<size_t> String::find_byte_offset(u32 code_point, size_t from_byte_offset) const 353{ 354 auto code_points = this->code_points(); 355 if (from_byte_offset >= code_points.byte_length()) 356 return {}; 357 358 for (auto it = code_points.iterator_at_byte_offset(from_byte_offset); it != code_points.end(); ++it) { 359 if (*it == code_point) 360 return code_points.byte_offset_of(it); 361 } 362 363 return {}; 364} 365 366Optional<size_t> String::find_byte_offset(StringView substring, size_t from_byte_offset) const 367{ 368 auto view = bytes_as_string_view(); 369 if (from_byte_offset >= view.length()) 370 return {}; 371 372 auto index = memmem_optional( 373 view.characters_without_null_termination() + from_byte_offset, view.length() - from_byte_offset, 374 substring.characters_without_null_termination(), substring.length()); 375 376 if (index.has_value()) 377 return *index + from_byte_offset; 378 return {}; 379} 380 381bool String::operator==(String const& other) const 382{ 383 if (is_short_string()) 384 return m_data == other.m_data; 385 return bytes_as_string_view() == other.bytes_as_string_view(); 386} 387 388bool String::operator==(FlyString const& other) const 389{ 390 if (reinterpret_cast<uintptr_t>(m_data) == other.data({})) 391 return true; 392 return bytes_as_string_view() == other.bytes_as_string_view(); 393} 394 395bool String::operator==(StringView other) const 396{ 397 return bytes_as_string_view() == other; 398} 399 400ErrorOr<String> String::substring_from_byte_offset(size_t start, size_t byte_count) const 401{ 402 if (!byte_count) 403 return String {}; 404 return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count)); 405} 406 407ErrorOr<String> String::substring_from_byte_offset(size_t start) const 408{ 409 VERIFY(start <= bytes_as_string_view().length()); 410 return substring_from_byte_offset(start, bytes_as_string_view().length() - start); 411} 412 413ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const 414{ 415 if (!byte_count) 416 return String {}; 417 if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT) 418 return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count)); 419 return String { TRY(Detail::StringData::create_substring(*m_data, start, byte_count)) }; 420} 421 422ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start) const 423{ 424 VERIFY(start <= bytes_as_string_view().length()); 425 return substring_from_byte_offset_with_shared_superstring(start, bytes_as_string_view().length() - start); 426} 427 428bool String::operator==(char const* c_string) const 429{ 430 return bytes_as_string_view() == c_string; 431} 432 433u32 String::hash() const 434{ 435 if (is_short_string()) { 436 auto bytes = this->bytes(); 437 return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size()); 438 } 439 return m_data->hash(); 440} 441 442Utf8View String::code_points() const 443{ 444 return Utf8View(bytes_as_string_view()); 445} 446 447ErrorOr<void> Formatter<String>::format(FormatBuilder& builder, String const& utf8_string) 448{ 449 return Formatter<StringView>::format(builder, utf8_string.bytes_as_string_view()); 450} 451 452ErrorOr<String> String::replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const 453{ 454 return StringUtils::replace(*this, needle, replacement, replace_mode); 455} 456 457ErrorOr<String> String::reverse() const 458{ 459 // FIXME: This handles multi-byte code points, but not e.g. grapheme clusters. 460 // FIXME: We could avoid allocating a temporary vector if Utf8View supports reverse iteration. 461 auto code_point_length = code_points().length(); 462 463 Vector<u32> code_points; 464 TRY(code_points.try_ensure_capacity(code_point_length)); 465 466 for (auto code_point : this->code_points()) 467 code_points.unchecked_append(code_point); 468 469 auto builder = TRY(StringBuilder::create(code_point_length * sizeof(u32))); 470 while (!code_points.is_empty()) 471 TRY(builder.try_append_code_point(code_points.take_last())); 472 473 return builder.to_string(); 474} 475 476ErrorOr<String> String::trim(Utf8View const& code_points_to_trim, TrimMode mode) const 477{ 478 auto trimmed = code_points().trim(code_points_to_trim, mode); 479 return String::from_utf8(trimmed.as_string()); 480} 481 482ErrorOr<String> String::trim(StringView code_points_to_trim, TrimMode mode) const 483{ 484 return trim(Utf8View { code_points_to_trim }, mode); 485} 486 487bool String::contains(StringView needle, CaseSensitivity case_sensitivity) const 488{ 489 return StringUtils::contains(bytes_as_string_view(), needle, case_sensitivity); 490} 491 492bool String::contains(u32 needle, CaseSensitivity case_sensitivity) const 493{ 494 auto needle_as_string = String::from_code_point(needle); 495 return contains(needle_as_string.bytes_as_string_view(), case_sensitivity); 496} 497 498bool String::starts_with(u32 code_point) const 499{ 500 if (is_empty()) 501 return false; 502 503 return *code_points().begin() == code_point; 504} 505 506bool String::starts_with_bytes(StringView bytes) const 507{ 508 return bytes_as_string_view().starts_with(bytes); 509} 510 511bool String::ends_with(u32 code_point) const 512{ 513 if (is_empty()) 514 return false; 515 516 u32 last_code_point = 0; 517 for (auto it = code_points().begin(); it != code_points().end(); ++it) 518 last_code_point = *it; 519 520 return last_code_point == code_point; 521} 522 523bool String::ends_with_bytes(StringView bytes) const 524{ 525 return bytes_as_string_view().ends_with(bytes); 526} 527 528bool String::is_short_string() const 529{ 530 return has_short_string_bit(reinterpret_cast<uintptr_t>(m_data)); 531} 532 533ReadonlyBytes String::ShortString::bytes() const 534{ 535 return { storage, byte_count() }; 536} 537 538size_t String::ShortString::byte_count() const 539{ 540 return byte_count_and_short_string_flag >> 1; 541} 542 543unsigned Traits<String>::hash(String const& string) 544{ 545 return string.hash(); 546} 547 548String String::fly_string_data_to_string(Badge<FlyString>, uintptr_t const& data) 549{ 550 if (has_short_string_bit(data)) 551 return String { *reinterpret_cast<ShortString const*>(&data) }; 552 553 auto const* string_data = reinterpret_cast<Detail::StringData const*>(data); 554 return String { NonnullRefPtr<Detail::StringData const>(*string_data) }; 555} 556 557StringView String::fly_string_data_to_string_view(Badge<FlyString>, uintptr_t const& data) 558{ 559 if (has_short_string_bit(data)) { 560 auto const* short_string = reinterpret_cast<ShortString const*>(&data); 561 return short_string->bytes(); 562 } 563 564 auto const* string_data = reinterpret_cast<Detail::StringData const*>(data); 565 return string_data->bytes_as_string_view(); 566} 567 568u32 String::fly_string_data_to_hash(Badge<FlyString>, uintptr_t const& data) 569{ 570 if (has_short_string_bit(data)) { 571 auto const* short_string = reinterpret_cast<ShortString const*>(&data); 572 auto bytes = short_string->bytes(); 573 return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size()); 574 } 575 576 auto const* string_data = reinterpret_cast<Detail::StringData const*>(data); 577 return string_data->hash(); 578} 579 580uintptr_t String::to_fly_string_data(Badge<FlyString>) const 581{ 582 return reinterpret_cast<uintptr_t>(m_data); 583} 584 585void String::ref_fly_string_data(Badge<FlyString>, uintptr_t data) 586{ 587 if (has_short_string_bit(data)) 588 return; 589 590 auto const* string_data = reinterpret_cast<Detail::StringData const*>(data); 591 string_data->ref(); 592} 593 594void String::unref_fly_string_data(Badge<FlyString>, uintptr_t data) 595{ 596 if (has_short_string_bit(data)) 597 return; 598 599 auto const* string_data = reinterpret_cast<Detail::StringData const*>(data); 600 string_data->unref(); 601} 602 603void String::did_create_fly_string(Badge<FlyString>) const 604{ 605 VERIFY(!is_short_string()); 606 m_data->set_fly_string(true); 607} 608 609DeprecatedString String::to_deprecated_string() const 610{ 611 return DeprecatedString(bytes_as_string_view()); 612} 613 614ErrorOr<String> String::from_deprecated_string(DeprecatedString const& deprecated_string) 615{ 616 return String::from_utf8(deprecated_string.view()); 617} 618 619}