// SPDX-FileCopyrightText: Copyright (C) 2024 Roland Csaszar // SPDX-License-Identifier: MPL-2.0 // // Project: token-string // File: string.rs // Date: 22.Nov.2024 // ============================================================================= //! The string type [`TokenString`]. extern crate alloc; use alloc::string::ToString as _; use alloc::vec; use core::{borrow, cmp, fmt, hash, mem, ops, panic, slice, str}; use crate::{StringPtr, TkStrError}; /// The length of the prefix of the string, that is, the first bytes stored /// in the field `prefix` for comparisons. pub const PREFIX_LENGTH: usize = mem::size_of::() - mem::size_of::(); /// Helper constant for matching intervals. const PREFIX_LENGTH_ADD1: usize = PREFIX_LENGTH + 1; /// The length of the non-prefix part of a "small string", 8 bytes. The content /// of the field `_d.small`. pub const SMALL_DATA_LENGTH: usize = mem::size_of::(); /// The maximum length in bytes, not Unicode scalar values, of a "small" string /// that is saved in the struct [`TokenString`] itself and not on the heap. pub const MAX_LENGTH_SMALL: usize = PREFIX_LENGTH + SMALL_DATA_LENGTH; /// Helper constant for matching intervals. pub const MAX_LENGTH_SMALL_ADD1: usize = MAX_LENGTH_SMALL + 1; /// The maximum length in bytes, not Unicode scalar values, of a /// [`TokenString`]. pub const MAX_LENGTH: usize = u16::MAX as usize; /// A string which can hold at most [`MAX_LENGTH`] bytes (not Unicode scalar /// values). /// /// This holds valid UTF-8 encoded strings only. /// Strings that are short enough, which need at most [`MAX_LENGTH_SMALL`] /// bytes, are stored in the struct itself, bigger ones use the heap. /// /// # Invariant /// /// - [`TokenString`] must be a UTF-8 string (like &[`prim@str`] and /// [`alloc::string::String`]). /// - The length of a [`TokenString`] is at most [`MAX_LENGTH`] and at least 0 - /// the empty string. #[repr(C)] pub struct TokenString { /// The length of the string. /// /// Maximum: [`MAX_LENGTH`]. 
pub(crate) len: u16, /// The first [`PREFIX_LENGTH`] bytes of the string. pub(crate) prefix: [u8; PREFIX_LENGTH], /// The data (see [`Data`]). /// /// If the string is at most [`MAX_LENGTH_SMALL`] bytes, this holds the /// other bytes of the string, else this is a pointer to the heap. pub(crate) u: Data, } // Invariants: [`TokenString`] must be aligned to 64 bits and its size must be // 128 bits. That means that `sizeof len + prefix == 64 bit` and // `sizeof u == 64 bit`. So there is no padding. const _: () = assert!( mem::align_of::() == mem::size_of::(), "struct TokenString is not aligned to 64 bits!" ); const _: () = assert!( mem::size_of::() == 2 * mem::size_of::(), "struct TokenString has size != 128 bits" ); const _: () = assert!( mem::align_of::() == mem::size_of::(), "struct Data is not aligned to 64 bits!" ); const _: () = assert!( mem::size_of::() == mem::size_of::(), "union Data has size != 64 bits" ); // ============================================================================= // Inner types of `TokenString`. /// This is either a pointer to the string, if the string is bigger than /// [`SMALL_DATA_LENGTH`] bytes, or a pointer to a string as an array of bytes. /// /// See [`StringPtr`] #[repr(C)] pub union Data { /// If the string is small enough (at most [`MAX_LENGTH_SMALL`]), its data /// after the prefix is here. pub(crate) small: [u8; SMALL_DATA_LENGTH], /// For bigger strings as [`MAX_LENGTH_SMALL`], this points to the memory /// holding the whole string. pub(crate) ptr: mem::ManuallyDrop, } // ============================================================================= // `TokenString` itself /// The empty string. /// /// Has a length of zero. 
pub const EMPTY: TokenString = TokenString { len: 0, prefix: [0_u8; PREFIX_LENGTH], u: Data { small: [0_u8; SMALL_DATA_LENGTH], }, }; // ============================================================================= // Traits impl TryFrom<&str> for TokenString { type Error = TkStrError; /// Create a [`TokenString`] from a &[`prim@str`]. /// /// Return [`TkStrError::TooBig`] if the argument is greater than /// [`MAX_LENGTH`]. /// /// Memory: /// /// Allocates if and only if the length of `value` is bigger than /// [`MAX_LENGTH_SMALL`]. fn try_from(value: &str) -> Result { let bytes = value.as_bytes(); match value.len() { | 0 => Ok(Self { len: 0, prefix: [0_u8; PREFIX_LENGTH], u: Data { small: [0_u8; SMALL_DATA_LENGTH], }, }), | 1 ..= PREFIX_LENGTH => { let s = value.len(); let mut prefix = [0_u8; PREFIX_LENGTH]; prefix[.. s].copy_from_slice(&bytes[.. s]); Ok(Self { #[expect( clippy::cast_possible_truncation, reason = "Length has been checked above" )] len: s as u16, prefix, u: Data { small: [0_u8; SMALL_DATA_LENGTH], }, }) } | PREFIX_LENGTH_ADD1 ..= MAX_LENGTH_SMALL => { let s = value.len(); let mut prefix = [0_u8; PREFIX_LENGTH]; prefix.copy_from_slice(&bytes[.. PREFIX_LENGTH]); let mut small = [0_u8; SMALL_DATA_LENGTH]; small[.. s - PREFIX_LENGTH] .copy_from_slice(&bytes[PREFIX_LENGTH .. s]); Ok(Self { #[expect( clippy::cast_possible_truncation, reason = "Length has been checked above" )] len: s as u16, prefix, u: Data { small }, }) } | MAX_LENGTH_SMALL_ADD1 ..= MAX_LENGTH => { let ptr = StringPtr::from(bytes); let u = Data { ptr: mem::ManuallyDrop::new(ptr), }; let mut prefix = [0_u8; PREFIX_LENGTH]; prefix.copy_from_slice(&bytes[.. PREFIX_LENGTH]); Ok(Self { #[expect( clippy::cast_possible_truncation, reason = "Length has been checked above" )] len: value.len() as u16, prefix, u, }) } | _ => Err(TkStrError::TooBig(value.len())), } } } impl TryFrom<&[u8]> for TokenString { type Error = TkStrError; /// Try to create a [`TokenString`] from the given slice. 
/// /// Return [`TkStrError::TooBig`] if the given slice is too big, greater /// than [`MAX_LENGTH`]. /// Return [`TkStrError::UnicodeError`] /// /// Memory: /// /// Allocates if and only if the length of `value` is bigger than /// [`MAX_LENGTH_SMALL`]. #[inline] fn try_from(value: &[u8]) -> Result { match str::from_utf8(value) { | Ok(str) => Self::try_from(str), | Err(utf_err) => Err(TkStrError::UnicodeError(utf_err)), } } } impl TryFrom<&[char]> for TokenString { type Error = TkStrError; /// Try to create a [`TokenString`] from the given slice. /// /// Return [`TkStrError::TooBig`] if the given slice is too big, greater /// than [`MAX_LENGTH`]. /// /// Memory /// /// Allocates and deallocates a temporary [`alloc::string::String`] /// collecting the converted bytes. #[inline] fn try_from(value: &[char]) -> Result { let i = value.iter(); Self::try_from(i.collect::()) } } impl TryFrom<&alloc::string::String> for TokenString { type Error = TkStrError; /// Create a `TokenString` from a &[`alloc::string::String`]. /// /// Return [`TkStrError::TooBig`] if the argument is greater than /// [`MAX_LENGTH`]. /// /// Memory: /// /// Allocates if and only if the length of `value` is bigger than /// [`MAX_LENGTH_SMALL`]. #[inline] fn try_from(value: &alloc::string::String) -> Result { let str = value.as_str(); Self::try_from(str) } } impl TryFrom for TokenString { type Error = TkStrError; /// Create a [`TokenString`] from a [`alloc::string::String`]. /// /// Return [`TkStrError::TooBig`] if the argument is greater than /// [`MAX_LENGTH`]. /// /// Memory: /// /// Allocates if and only if the length of `value` is bigger than /// [`MAX_LENGTH_SMALL`]. #[inline] fn try_from(value: alloc::string::String) -> Result { // Sadly we can't use the string's data directly, as a [`String`] has a // capacity which is to be known when deallocating the data. // See [`String::into_raw_parts`]. 
let str = value.as_str(); Self::try_from(str) } } impl Drop for TokenString { #[cfg_attr(test, mutants::skip)] #[inline] fn drop(&mut self) { if usize::from(self.len) > MAX_LENGTH_SMALL { // SAFETY: // We know that there is a pointer saved in the union. // The whole string is being dropped, so taking a mutable // reference of the pointer is legal. let mut m_ptr = unsafe { mem::ManuallyDrop::take(&mut self.u.ptr) }; m_ptr.drop_manually(self.len.into()); } } } impl Clone for TokenString { /// Return a clone of the [`TokenString`]. /// /// Memory: /// /// Allocates if and only if the length of `value` is bigger than /// [`MAX_LENGTH_SMALL`]. #[inline] fn clone(&self) -> Self { let u = if self.len as usize > MAX_LENGTH_SMALL { Data { // SAFETY: // We check, that there is an allocated pointer saved in the // union. ptr: mem::ManuallyDrop::new(unsafe { self.u.ptr.clone_manually(self.len.into()) }), } } else { Data { // SAFETY: // We check, that there is a small string in the union. small: unsafe { self.u.small }, } }; Self { len: self.len, prefix: self.prefix, u, } } } impl Default for TokenString { /// Return the empty string. #[inline] fn default() -> Self { EMPTY } } impl Eq for TokenString {} impl PartialEq for TokenString { #[inline] fn eq(&self, other: &Self) -> bool { if self.len != other.len || self.prefix != other.prefix { return false; } if self.len as usize <= MAX_LENGTH_SMALL { // SAFETY: // We know we have two small strings to compare. unsafe { self.u.small == other.u.small } } else { // SAFETY: // We know we have two string pointers to compare. unsafe { self.u.ptr.eq_manually(&other.u.ptr, self.len.into()) } } } } impl PartialEq<[u8]> for TokenString { fn eq(&self, other: &[u8]) -> bool { if self.len as usize != other.len() { return false; } let len = self.len as usize; match len { | 0 => true, | 1 ..= PREFIX_LENGTH => self.prefix[.. len] == other[.. 
len], | PREFIX_LENGTH_ADD1 ..= MAX_LENGTH_SMALL => { // SAFETY: // Use the whole memory region of self.`prefix` and // `self.u.small` as a single array. This is not UB, as the // whole memory `TokenString` has been allocated at once and // is guaranteed to be continuous in memory. If Miri // complains about this, use the flag `MIRIFLAGS=" // -Zmiri-tree-borrows"` to use "tree borrows" instead of // "stacked borrows". let bytes = unsafe { slice::from_raw_parts(self.prefix.as_ptr(), len) }; bytes == other } // SAFETY: // We know that the pointer actually points to allocated memory. | MAX_LENGTH_SMALL_ADD1 ..= MAX_LENGTH => unsafe { self.u.ptr.as_slice_manually(len) == other }, | _ => panic!("The TokenString is bigger than MAX_LENGTH!"), } } } impl PartialEq for TokenString { #[inline] fn eq(&self, other: &str) -> bool { self == other.as_bytes() } } impl PartialEq for TokenString { #[inline] fn eq(&self, other: &alloc::string::String) -> bool { self == other.as_bytes() } } impl Ord for TokenString { /// Compare two [`TokenString`]s byte-wise. /// /// This is not a sensible alphabetical comparison for anything that isn't /// ASCII. #[inline] fn cmp(&self, other: &Self) -> cmp::Ordering { let pref_ord = self.prefix.cmp(&other.prefix); if pref_ord != cmp::Ordering::Equal { return pref_ord; } self.suffix().cmp(other.suffix()) } } impl PartialOrd for TokenString { /// Compare two [`TokenString`]s byte-wise. /// /// This is not a sensible alphabetical comparison for anything that isn't /// ASCII. #[inline] fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl fmt::Display for TokenString { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.as_str()) } } impl fmt::Debug for TokenString { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if self.len as usize > MAX_LENGTH_SMALL { let string = // SAFETY: // We know that the pointer points to a string. 
unsafe { self.u.ptr.as_string_manually(self.len.into()) }; // SAFETY: // We know that the pointer points to a string. let ptr = unsafe { &self.u.ptr }; f.debug_struct("TokenString") .field("len", &self.len) .field("prefix", &self.prefix_str()) .field("ptr", ptr) .field("string", &string) .finish() } else { // SAFETY: // We've checked that this is a small string. unsafe { f.debug_struct("TokenString") .field("len", &self.len) .field("prefix", &self.prefix_str()) .field("small", &self.small_str()) .field("string", &self.as_str()) .finish() } } } } impl ops::Index for TokenString where Idx: slice::SliceIndex, { type Output = Idx::Output; #[inline] fn index(&self, index: Idx) -> &Self::Output { self.as_str().index(index) } } impl borrow::Borrow for TokenString { #[inline] fn borrow(&self) -> &str { self.as_str() } } impl AsRef for TokenString { #[inline] fn as_ref(&self) -> &str { self.as_str() } } impl hash::Hash for TokenString { #[inline] fn hash(&self, state: &mut H) { self.as_str().hash(state); } } // SAFETY: // There can be no shared references of a `TokenString`. unsafe impl Send for TokenString {} // SAFETY: // `TokenString` is immutable. unsafe impl Sync for TokenString {} // ============================================================================= // Non trait methods impl TokenString { /// Return the prefix as a `&[u8]`. fn prefix_str(&self) -> &[u8] { let l = cmp::min(self.len as usize, PREFIX_LENGTH); &self.prefix[.. l] } /// Return the suffix of a small string as a `&[u8]`. /// /// # Safety /// /// Must be called with a small string only! unsafe fn small_str(&self) -> &[u8] { let l = (self.len as usize).saturating_sub(PREFIX_LENGTH); // SAFETY: // We know that the union contains a small string. unsafe { &self.u.small[.. l] } } /// Return the length of the string in bytes. /// /// This is the length of the string in bytes, not Unicode scalar values and /// not grapheme clusters. 
#[must_use] #[inline] pub const fn len(&self) -> usize { self.len as usize } /// Return `true` if the string is a "small string", that is, it is saved in /// the [`TokenString`] struct itself. /// /// If this returns `false`, the string is allocated on the heap. #[must_use] #[inline] pub const fn is_small(&self) -> bool { self.len as usize <= MAX_LENGTH_SMALL } /// Return `true`, if this is the empty string. /// /// Returns `false` else. #[must_use] #[inline] pub const fn is_empty(&self) -> bool { self.len == 0 } /// Convert to a [`TokenString`]. /// /// `bytes` must be valid UTF-8, use [`TokenString::try_from`] if you are /// not sure that it is valid. If the given byte slice is bigger than /// [`MAX_LENGTH`], this panics. /// /// Memory: /// /// Allocates if and only if the length of `bytes` is bigger than /// [`MAX_LENGTH_SMALL`]. /// /// # Panics /// /// Panics if `bytes` is bigger than [`MAX_LENGTH`]. /// /// # Safety /// /// `bytes` must be valid UTF-8, if not, all bets are off - UB! #[must_use] pub unsafe fn from_bytes_unchecked(bytes: &[u8]) -> Self { match bytes.len() { | 0 => Self { len: 0, prefix: [0_u8; PREFIX_LENGTH], u: Data { small: [0_u8; SMALL_DATA_LENGTH], }, }, | 1 ..= PREFIX_LENGTH => { let s = bytes.len(); let mut prefix = [0_u8; PREFIX_LENGTH]; prefix[.. s].copy_from_slice(&bytes[.. s]); Self { #[expect( clippy::cast_possible_truncation, reason = "Length has been checked above" )] len: s as u16, prefix, u: Data { small: [0_u8; SMALL_DATA_LENGTH], }, } } | PREFIX_LENGTH_ADD1 ..= MAX_LENGTH_SMALL => { let s = bytes.len(); let mut prefix = [0_u8; PREFIX_LENGTH]; prefix.copy_from_slice(&bytes[.. PREFIX_LENGTH]); let mut small = [0_u8; SMALL_DATA_LENGTH]; small[.. s - PREFIX_LENGTH] .copy_from_slice(&bytes[PREFIX_LENGTH .. 
s]); Self { #[expect( clippy::cast_possible_truncation, reason = "Length has been checked above" )] len: s as u16, prefix, u: Data { small }, } } | MAX_LENGTH_SMALL_ADD1 ..= MAX_LENGTH => { let ptr = StringPtr::from(bytes); let u = Data { ptr: mem::ManuallyDrop::new(ptr), }; let mut prefix = [0_u8; PREFIX_LENGTH]; prefix.copy_from_slice(&bytes[.. PREFIX_LENGTH]); Self { #[expect( clippy::cast_possible_truncation, reason = "Length has been checked above" )] len: bytes.len() as u16, prefix, u, } } | _ => panic!( "This byte slice is too big for a TokenString, {} > \ {MAX_LENGTH}", bytes.len() ), } } /// Convert to a [`TokenString`]. /// /// If the given string `s` is bigger than [`MAX_LENGTH`], this panics. Use /// [`TokenString::try_from`] for a function that does not panic. The string /// `s` must be valid UTF-8 too, but it has already been UB if it isn't. /// /// Memory: /// /// Allocates if and only if the length of `s` is bigger than /// [`MAX_LENGTH_SMALL`]. /// /// # Panics /// /// Panics if `s` is bigger than [`MAX_LENGTH`]. #[must_use] #[inline] pub fn from_str_unchecked(s: &str) -> Self { // SAFETY: // The unsafe part of `from_bytes_unchecked` is the possibility of the // byte slice not being valid UTF-8. We are processing an UTF-8 string // here. unsafe { Self::from_bytes_unchecked(s.as_bytes()) } } /// Convert to a [`TokenString`]. /// /// If the given string `s` is bigger than [`MAX_LENGTH`], this panics. Use /// [`TokenString::try_from`] for a function that does not panic. The string /// `s` must be valid UTF-8 too, but it has already been UB if it isn't. /// /// Memory: /// /// Allocates if and only if the length of `s` is bigger than /// [`MAX_LENGTH_SMALL`]. /// /// # Panics /// /// Panics if `s` is bigger than [`MAX_LENGTH`]. #[must_use] #[inline] pub fn from_string_unchecked(s: &alloc::string::String) -> Self { // SAFETY: // The unsafe part of `from_bytes_unchecked` is the possibility of the // byte slice not being valid UTF-8. 
We are processing an UTF-8 string // here. unsafe { Self::from_bytes_unchecked(s.as_bytes()) } } /// Return the string as a &[`prim@str`]. #[must_use] #[inline] pub fn as_str(&self) -> &str { if self.len == 0 { "" } else if self.len as usize > MAX_LENGTH_SMALL { // SAFETY: // We know, that in the union must be a valid pointer. unsafe { self.u.ptr.as_string_manually(self.len.into()) } } else { // SAFETY: // Use the whole memory region of self.`prefix` and `self.u.small` // as a single array. This is not UB, as the whole memory // `TokenString` has been allocated at once and is guaranteed to be // continuous in memory. If Miri complains about this, use the // flag `MIRIFLAGS="-Zmiri-tree-borrows"` to use "tree borrows" // instead of "stacked borrows". let bytes = unsafe { slice::from_raw_parts(self.prefix.as_ptr(), self.len.into()) }; // SAFETY: // The precondition of `TokenString` is that the string is a valid // UTF-8 byte sequence. unsafe { str::from_utf8_unchecked(bytes) } } } /// Return the string as a byte slice. #[must_use] #[inline] pub fn as_bytes(&self) -> &[u8] { if self.len == 0 { Default::default() } else if self.len as usize > MAX_LENGTH_SMALL { // SAFETY: // We know, that in the union must be a valid pointer. unsafe { self.u.ptr.as_slice_manually(self.len.into()) } } else { // SAFETY: // Use the whole memory region of self.`prefix` and `self.u.small` // as a single array. This is not UB, as the whole memory // `TokenString` has been allocated at once and is guaranteed to be // continuous in memory. If Miri complains about this, use the // flag `MIRIFLAGS="-Zmiri-tree-borrows"` to use "tree borrows" // instead of "stacked borrows". unsafe { slice::from_raw_parts(self.prefix.as_ptr(), self.len.into()) } } } /// Return the string as a new [`alloc::string::String`]. /// /// Memory: /// /// Allocates a new [`alloc::string::String`]. 
#[must_use]
    #[inline]
    pub fn as_string(&self) -> alloc::string::String {
        self.as_str().to_string()
    }

    /// Return the string as a new vector of [`char`]s.
    ///
    /// Memory:
    ///
    /// Allocates a new [`vec::Vec`].
    #[must_use]
    #[inline]
    pub fn as_chars(&self) -> vec::Vec<char> {
        self.as_str().chars().collect()
    }

    /// Return the part of the string which is not stored in `self.prefix`.
    ///
    /// If the string is <= [`PREFIX_LENGTH`], the empty slice is returned.
    /// For a small string only the bytes that are part of the string are
    /// returned, without the zero padding of `u.small`.
    fn suffix(&self) -> &[u8] {
        let len = usize::from(self.len);
        match len {
            | 0 ..= PREFIX_LENGTH => &[],
            | PREFIX_LENGTH_ADD1 ..= MAX_LENGTH_SMALL =>
            // SAFETY:
            // We checked and know that this is a small string.
                unsafe { &self.u.small[.. len - PREFIX_LENGTH] },
            | MAX_LENGTH_SMALL_ADD1 ..= MAX_LENGTH =>
            // SAFETY:
            // We checked and know that this string is allocated on the heap.
                unsafe {
                    &self.u.ptr.as_slice_manually(self.len.into())
                        [PREFIX_LENGTH ..]
                },
            // Cannot happen, `len` is a `u16`. The arm is needed because the
            // match is on a `usize`.
            | _ => panic!(
                "Error: this TokenString is bigger than \
                 TokenString::MAX_LENGTH!"
            ),
        }
    }

    /// Return the byte at index `idx`, check bounds.
    ///
    /// Returns [`TkStrError::OutOfBounds`] if the index is greater than or
    /// equal to the string's length.
    ///
    /// # Errors
    /// [`TkStrError::OutOfBounds`] if `idx` is not smaller than the string's
    /// length.
    #[inline]
    pub fn get(&self, idx: u16) -> Result<u8, TkStrError> {
        let pos = usize::from(idx);
        self.as_bytes()
            .get(pos)
            .copied()
            .ok_or(TkStrError::OutOfBounds(pos))
    }

    /// Return the byte at index `idx`, panic if the index is out of bounds.
    ///
    /// Despite its name, this does check the bounds. "Unchecked" means that no
    /// `Result` is returned, use [`TokenString::get`] for a function that does
    /// not panic.
    ///
    /// # Panics
    ///
    /// if `idx` is greater than or equal to the string's length.
    #[must_use]
    #[inline]
    pub fn get_unchecked(&self, idx: u16) -> u8 {
        assert!((idx < self.len), "index {idx} out of bounds");
        self.as_bytes()[usize::from(idx)]
    }

    /// Return an iterator over the `[char]`s of a string.
///
    /// That is, an iterator over the Unicode scalar values of the
    /// `TokenString`.
    #[inline]
    pub fn chars(&'_ self) -> str::Chars<'_> {
        let text = self.as_str();
        text.chars()
    }

    /// Return an iterator over the bytes of the string, borrowing it.
    #[must_use]
    #[inline]
    pub fn iter(&self) -> TokenStringIter<'_> {
        self.into_iter()
    }

    /// Return `true`, if the first byte of the string is one of `A` to `Z`.
    ///
    /// Only the first byte of the prefix is looked at.
    #[must_use]
    #[inline]
    pub const fn starts_ascii_uppercase(&self) -> bool {
        matches!(self.prefix[0], b'A' ..= b'Z')
    }

    /// Return `true`, if the first byte of the string is one of `a` to `z`.
    ///
    /// Only the first byte of the prefix is looked at.
    #[must_use]
    #[inline]
    pub const fn starts_ascii_lowercase(&self) -> bool {
        matches!(self.prefix[0], b'a' ..= b'z')
    }

    /// Return `true`, if every byte of the string is an ASCII character.
    #[must_use]
    #[inline]
    pub fn is_ascii(&self) -> bool {
        let bytes = self.as_bytes();
        bytes.is_ascii()
    }

    /// Return `true`, if `needle` is a prefix of the string.
    ///
    /// A string is a prefix of itself, so this is `true` if both are equal.
    #[must_use]
    #[inline]
    pub fn starts_with(&self, needle: &Self) -> bool {
        let wanted = needle.as_bytes();
        self.as_bytes().starts_with(wanted)
    }

    /// Return `true`, if the bytes `needle` are a prefix of the string.
    ///
    /// This is `true` too, if `needle` is the whole string.
    #[must_use]
    #[inline]
    pub fn starts_with_bytes(&self, needle: &[u8]) -> bool {
        let bytes = self.as_bytes();
        bytes.starts_with(needle)
    }

    /// Return `true`, if `needle` is a prefix of the string.
    ///
    /// This is `true` too, if `needle` is the whole string.
    #[must_use]
    #[inline]
    pub fn starts_with_str(&self, needle: &str) -> bool {
        let text = self.as_str();
        text.starts_with(needle)
    }

    /// Return `true`, if `needle` is a suffix of the string.
    ///
    /// A string is a suffix of itself, so this is `true` if both are equal.
    #[must_use]
    #[inline]
    pub fn ends_with(&self, needle: &Self) -> bool {
        let wanted = needle.as_bytes();
        self.as_bytes().ends_with(wanted)
    }

    /// Return `true`, if the bytes `needle` are a suffix of the string.
    ///
    /// This is `true` too, if `needle` is the whole string.
    #[must_use]
    #[inline]
    pub fn ends_with_bytes(&self, needle: &[u8]) -> bool {
        let bytes = self.as_bytes();
        bytes.ends_with(needle)
    }

    /// Return `true`, if the string ends with `needle`.
///
    /// Returns `true` too if the string is `needle`.
    #[must_use]
    #[inline]
    pub fn ends_with_str(&self, needle: &str) -> bool {
        self.as_str().ends_with(needle)
    }

    /// Map the given function `f` over the bytes of the string, mutating it.
    ///
    /// `f` must keep the bytes valid UTF-8, that is not checked here.
    fn map_bytes_mut(&mut self, f: fn(&mut [u8])) {
        let len = usize::from(self.len);
        if len > MAX_LENGTH_SMALL {
            // SAFETY:
            // The length says that the union holds a valid pointer.
            unsafe {
                f((*self.u.ptr).as_slice_manually_mut(len));
            }
        } else {
            // The pointer is derived from `self` and not from `self.prefix`,
            // so that it is allowed to access `u.small` too.
            let base = core::ptr::from_mut(self).cast::<u8>();
            // SAFETY:
            // `TokenString` is `#[repr(C)]` without padding, so the arrays
            // `prefix` and `u.small` are contiguous in memory. `len` is at
            // most `MAX_LENGTH_SMALL`, so the slice ends inside of `self`.
            unsafe {
                f(slice::from_raw_parts_mut(
                    base.add(mem::offset_of!(Self, prefix)),
                    len,
                ));
            }
        }
    }

    /// Return a new string with all uppercase ASCII characters changed to
    /// lowercase.
    #[must_use]
    #[inline]
    pub fn to_ascii_lowercase(&self) -> Self {
        let mut ret_val = self.clone();
        ret_val.map_bytes_mut(<[u8]>::make_ascii_lowercase);
        ret_val
    }

    /// Return a new string with all lowercase ASCII characters changed to
    /// uppercase.
    #[must_use]
    #[inline]
    pub fn to_ascii_uppercase(&self) -> Self {
        let mut ret_val = self.clone();
        ret_val.map_bytes_mut(<[u8]>::make_ascii_uppercase);
        ret_val
    }

    /// Return a new string with all ASCII whitespace removed from the start and
    /// end.
    #[must_use]
    #[inline]
    pub fn trim_ascii(&self) -> Self {
        // SAFETY:
        // We copy the current string, so the invariants should hold for the
        // copy too:
        // - The string does not get longer, so cannot be greater than
        //   `MAX_LENGTH`.
        // - if the string is valid UTF-8, removing ASCII characters does not
        //   change that.
        unsafe { Self::from_bytes_unchecked(self.as_bytes().trim_ascii()) }
    }

    /// Return a new string with all ASCII whitespace removed from the start.
    #[must_use]
    #[inline]
    pub fn trim_ascii_start(&self) -> Self {
        // SAFETY:
        // We copy the current string, so the invariants should hold for the
        // copy too:
        // - The string does not get longer, so cannot be greater than
        //   `MAX_LENGTH`.
        // - if the string is valid UTF-8, removing ASCII characters does not
        //   change that.
        unsafe {
            Self::from_bytes_unchecked(self.as_bytes().trim_ascii_start())
        }
    }

    /// Return a new string with all ASCII whitespace removed from the end.
    #[must_use]
    #[inline]
    pub fn trim_ascii_end(&self) -> Self {
        // SAFETY:
        // We copy the current string, so the invariants should hold for the
        // copy too:
        // - The string does not get longer, so cannot be greater than
        //   `MAX_LENGTH`.
        // - if the string is valid UTF-8, removing ASCII characters does not
        //   change that.
        unsafe { Self::from_bytes_unchecked(self.as_bytes().trim_ascii_end()) }
    }

    // NOTE(review): the generic parameters of the three `pattern` methods
    // below were missing in the reviewed copy. They are reconstructed from the
    // `where` clause of `strip_suffix` - confirm against upstream.

    /// Return a new string with `prefix` removed from the start.
    ///
    /// Returns `None` if the string does not start with `prefix`.
    ///
    /// Only available if the crate feature `pattern` is enabled.
    #[cfg(feature = "pattern")]
    #[doc(cfg(pattern))]
    #[inline]
    pub fn strip_prefix<P: str::pattern::Pattern>(
        &self,
        prefix: P,
    ) -> Option<Self> {
        self.as_str()
            .strip_prefix(prefix)
            // stripping a prefix should not make the string invalid UTF-8, and
            // does shorten it.
            .map(Self::from_str_unchecked)
    }

    /// Return a new string with `suffix` removed from the end.
    ///
    /// Returns `None` if the string does not end with `suffix`.
    ///
    /// Only available if the crate feature `pattern` is enabled.
    #[cfg(feature = "pattern")]
    #[doc(cfg(pattern))]
    #[inline]
    pub fn strip_suffix<P>(&self, suffix: P) -> Option<Self>
    where
        P: str::pattern::Pattern,
        for<'a> P::Searcher<'a>: str::pattern::ReverseSearcher<'a>,
    {
        self.as_str()
            .strip_suffix(suffix)
            // stripping a suffix should not make the string invalid UTF-8, and
            // does shorten it.
            .map(Self::from_str_unchecked)
    }

    /// Return `true` if the string contains the pattern `pat`.
    ///
    /// Returns `false` else.
    ///
    /// Only available if the crate feature `pattern` is enabled.
    #[cfg(feature = "pattern")]
    #[doc(cfg(pattern))]
    #[inline]
    pub fn contains<P: str::pattern::Pattern>(&self, pat: P) -> bool {
        self.as_str().contains(pat)
    }
}

//==============================================================================
// Iterating by reference

/// Iterator struct for a `&TokenString`.
///
/// Iterator items are single bytes, `u8`.
pub struct TokenStringIter<'a> {
    /// The [`TokenString`] to iterate over.
    string: &'a TokenString,
    /// The current index in the string.
    idx: usize,
}

impl<'a> TokenStringIter<'a> {
    /// Generate a reference iterator for the given [`TokenString`].
    #[must_use]
    #[inline]
    pub const fn new(s: &'a TokenString) -> Self {
        TokenStringIter { string: s, idx: 0 }
    }
}

impl Iterator for TokenStringIter<'_> {
    type Item = u8;

    /// Return either the next byte, [`u8`], or [`None`] if we are at the end of
    /// the string.
    fn next(&mut self) -> Option<Self::Item> {
        debug_assert!(
            self.idx <= self.string.len.into(),
            "The iterator index '{0}' is greater than the string length '{1}'!",
            self.idx,
            self.string.len
        );
        // `as_bytes` handles small and heap strings, `get` returns `None` at
        // the end of the string.
        let byte = self.string.as_bytes().get(self.idx).copied()?;
        self.idx += 1;
        Some(byte)
    }

    /// Return the exact number of bytes left to iterate over.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = usize::from(self.string.len).saturating_sub(self.idx);
        (remaining, Some(remaining))
    }
}

impl<'a> IntoIterator for &'a TokenString {
    type IntoIter = TokenStringIter<'a>;
    type Item = u8;

    #[inline]
    fn into_iter(self) -> Self::IntoIter {
        Self::IntoIter::new(self)
    }
}

//==============================================================================
// Iterating an owned `TokenString`.

/// Iterator struct for an owned [`TokenString`].
///
/// Iterator items are single bytes, [`u8`].
pub struct TokenStringIterOwn {
    /// The [`TokenString`] to iterate over.
    string: TokenString,
    /// The current index in the string.
    idx: usize,
}

impl TokenStringIterOwn {
    /// Generate an owned iterator for the given [`TokenString`].
    #[must_use]
    #[inline]
    pub const fn new(s: TokenString) -> Self {
        Self { string: s, idx: 0 }
    }
}

impl Iterator for TokenStringIterOwn {
    type Item = u8;

    /// Return either the next byte, [`u8`], or [`None`] if we are at the end of
    /// the string.
    fn next(&mut self) -> Option<Self::Item> {
        debug_assert!(
            self.idx <= self.string.len.into(),
            "The iterator index '{0}' is greater than the string length '{1}'!",
            self.idx,
            self.string.len
        );
        // `as_bytes` handles small and heap strings, `get` returns `None` at
        // the end of the string.
        let byte = self.string.as_bytes().get(self.idx).copied()?;
        self.idx += 1;
        Some(byte)
    }

    /// Return the exact number of bytes left to iterate over.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = usize::from(self.string.len).saturating_sub(self.idx);
        (remaining, Some(remaining))
    }
}

impl IntoIterator for TokenString {
    type IntoIter = TokenStringIterOwn;
    type Item = u8;

    #[inline]
    fn into_iter(self) -> Self::IntoIter {
        Self::IntoIter::new(self)
    }
}

// =============================================================================
// Tests
// =============================================================================

/// Tests with strings that fit into the prefix.
#[cfg(test)]
mod prefix {
    extern crate std;
    use assert2::{check, let_assert};

    use crate::TokenString;

    #[test]
    fn empty_is_empty() {
        let_assert!(Ok(res) = TokenString::try_from(""));
        check!(res.prefix[0] == 0);
        check!(res.len == 0);
        check!(res.is_small() == true);
    }

    #[test]
    fn clone_empty() {
        let_assert!(Ok(s1) = TokenString::try_from(""));
        let res = s1.clone();
        check!(res.prefix[0] == s1.prefix[0]);
        check!(res.len == s1.len);
        check!(res.is_small() == true);
    }

    #[test]
    fn try_from_str() {
        let_assert!(Ok(res) = TokenString::try_from("123456"));
        check!(&res.prefix[0 .. 6] == b"123456");
        check!(res.len == 6);
        check!(res.is_small() == true);
    }

    #[test]
    fn clone() {
        let_assert!(Ok(s1) = TokenString::try_from("123456"));
        let res = s1.clone();
        check!(&res.prefix[0 .. 6] == &s1.prefix[0 .. 6]);
        check!(res.len == s1.len);
        check!(res.is_small() == true);
    }

    #[test]
    fn try_from_bytes() {
        let s1: &[u8] = b"123456";
        let_assert!(Ok(res) = TokenString::try_from(s1));
        check!(&res.prefix[0 .. 6] == b"123456");
        check!(res.len == 6);
        check!(res.is_small() == true);
    }

    #[test]
    fn try_from_chars() {
        #[expect(
            clippy::std_instead_of_alloc,
            reason = "We are testing, this needs std"
        )]
        let s1: std::vec::Vec<char> = "123456".chars().collect();
        let_assert!(Ok(res) = TokenString::try_from(s1.as_slice()));
        check!(&res.prefix[0 .. 6] == b"123456");
        check!(res.len == 6);
        check!(res.is_small() == true);
    }

    #[test]
    fn try_from_string() {
        #[expect(
            clippy::std_instead_of_alloc,
            reason = "We are testing, this needs std"
        )]
        let s1: std::string::String = "123456".into();
        let_assert!(Ok(res) = TokenString::try_from(s1));
        check!(&res.prefix[0 .. 6] == b"123456");
        check!(res.len == 6);
        check!(res.is_small() == true);
    }

    #[test]
    fn try_from_stringref() {
        #[expect(
            clippy::std_instead_of_alloc,
            reason = "We are testing, this needs std"
        )]
        let s1: std::string::String = "123456".into();
        let_assert!(Ok(res) = TokenString::try_from(&s1));
        check!(&res.prefix[0 .. 6] == b"123456");
        check!(res.len == 6);
        check!(res.is_small() == true);
    }

    #[test]
    fn from_str_unchecked() {
        let res = TokenString::from_str_unchecked("123456");
        check!(&res.prefix[0 .. 6] == b"123456");
        check!(res.len == 6);
    }

    #[test]
    fn from_bytes_unchecked() {
        let s1: &[u8] = b"123456";
        // SAFETY:
        // We know that the string is valid UTF-8.
        let res = unsafe { TokenString::from_bytes_unchecked(s1) };
        check!(&res.prefix[0 .. 6] == b"123456");
        check!(res.len == 6);
        check!(res.is_small() == true);
    }

    #[test]
    fn from_stringref_unchecked() {
        #[expect(
            clippy::std_instead_of_alloc,
            reason = "We are testing, this needs std"
        )]
        let s1: std::string::String = "123456".into();
        let res = TokenString::from_string_unchecked(&s1);
        check!(&res.prefix[0 .. 6] == b"123456");
        check!(res.len == 6);
        check!(res.is_small() == true);
    }
}

/// Tests with small strings that need the field `u.small` too.
#[cfg(test)]
mod small {
    extern crate std;
    use assert2::{check, let_assert};

    use crate::TokenString;

    #[test]
    fn try_from_str() {
        let_assert!(Ok(res) = TokenString::try_from("1234567"));
        check!(&res.prefix[0 .. 6] == b"123456");
        // SAFETY:
        // We know there is a small string in the union.
        check!(unsafe { res.u.small[0] } == b'7');
        check!(res.len == 7);
        check!(res.is_small() == true);
    }

    #[test]
    fn clone() {
        let_assert!(Ok(s1) = TokenString::try_from("1234567"));
        let res = s1.clone();
        check!(&res.prefix[0 .. 6] == &s1.prefix[0 .. 6]);
        // SAFETY:
        // We know there is a small string in the union.
        check!(unsafe { res.u.small[0] == s1.u.small[0] });
        check!(res.len == s1.len);
        check!(res.is_small() == true);
    }

    #[test]
    fn try_from_bytes() {
        let s1: &[u8] = b"1234567";
        let_assert!(Ok(res) = TokenString::try_from(s1));
        check!(&res.prefix[0 .. 6] == b"123456");
        // SAFETY:
        // We know there is a small string in the union.
        check!(unsafe { res.u.small[0] } == b'7');
        check!(res.len == 7);
        check!(res.is_small() == true);
    }

    #[test]
    fn try_from_chars() {
        #[expect(
            clippy::std_instead_of_alloc,
            reason = "We are testing, this needs std"
        )]
        let s1: std::vec::Vec<char> = "1234567".chars().collect();
        let_assert!(Ok(res) = TokenString::try_from(s1.as_slice()));
        check!(&res.prefix[0 .. 6] == b"123456");
        // SAFETY:
        // We know there is a small string in the union.
        check!(unsafe { res.u.small[0] } == b'7');
        check!(res.len == 7);
        check!(res.is_small() == true);
    }

    #[test]
    fn try_from_string() {
        #[expect(
            clippy::std_instead_of_alloc,
            reason = "We are testing, this needs std"
        )]
        let s1: std::string::String = "1234567".into();
        let_assert!(Ok(res) = TokenString::try_from(s1));
        check!(&res.prefix[0 .. 6] == b"123456");
        // SAFETY:
        // We know there is a small string in the union.
        check!(unsafe { res.u.small[0] } == b'7');
        check!(res.len == 7);
        check!(res.is_small() == true);
    }

    #[test]
    fn try_from_stringref() {
        #[expect(
            clippy::std_instead_of_alloc,
            reason = "We are testing, this needs std"
        )]
        let s1: std::string::String = "1234567".into();
        let_assert!(Ok(res) = TokenString::try_from(&s1));
        check!(&res.prefix[0 .. 6] == b"123456");
        // SAFETY:
        // We know there is a small string in the union.
        check!(unsafe { res.u.small[0] } == b'7');
        check!(res.len == 7);
        check!(res.is_small() == true);
    }

    #[test]
    fn from_str_unchecked() {
        let res = TokenString::from_str_unchecked("1234567");
        check!(&res.prefix[0 .. 6] == b"123456");
        // SAFETY:
        // We know there is a small string in the union.
        check!(unsafe { res.u.small[0] } == b'7');
        check!(res.len == 7);
        check!(res.is_small() == true);
    }

    #[test]
    fn from_bytes_unchecked() {
        let s1: &[u8] = b"1234567";
        // SAFETY:
        // We know that the string is valid UTF-8.
        let res = unsafe { TokenString::from_bytes_unchecked(s1) };
        check!(&res.prefix[0 .. 6] == b"123456");
        // SAFETY:
        // We know there is a small string in the union.
        check!(unsafe { res.u.small[0] } == b'7');
        check!(res.len == 7);
        check!(res.is_small() == true);
    }

    #[test]
    fn from_stringref_unchecked() {
        #[expect(
            clippy::std_instead_of_alloc,
            reason = "We are testing, this needs std"
        )]
        let s1: std::string::String = "1234567".into();
        let res = TokenString::from_string_unchecked(&s1);
        check!(&res.prefix[0 .. 6] == b"123456");
        // SAFETY:
        // We know there is a small string in the union.
        check!(unsafe { res.u.small[0] } == b'7');
        check!(res.len == 7);
        check!(res.is_small() == true);
    }
}

/// Tests with strings that are allocated on the heap.
#[cfg(test)]
mod heap {
    extern crate std;
    use assert2::{check, let_assert};

    use crate::TokenString;

    #[test]
    fn try_from_str() {
        let_assert!(Ok(res) = TokenString::try_from("1234567890ABCDE"));
        check!(&res.prefix[0 .. 6] == b"123456");
        check!(
            // SAFETY:
            // We know there is a large string in the union.
            unsafe { &res.u.ptr.as_slice_manually(res.len as usize)[.. 15] }
                == b"1234567890ABCDE"
        );
        check!(res.len == 15);
        check!(res.is_small() == false);
    }

    #[test]
    fn clone() {
        let_assert!(Ok(s1) = TokenString::try_from("1234567890ABCDE"));
        let res = s1.clone();
        check!(&res.prefix[0 .. 6] == &s1.prefix[0 .. 6]);
        check!(
            // SAFETY:
            // We know there is a large string in the union.
            unsafe {
                res.u.ptr.as_slice_manually(res.len as usize)[.. 15]
                    == s1.u.ptr.as_slice_manually(res.len as usize)[.. 15]
            }
        );
        check!(res.len == s1.len);
        check!(res.is_small() == false);
    }

    #[test]
    fn try_from_bytes() {
        let s1: &[u8] = b"1234567890ABCDE";
        let_assert!(Ok(res) = TokenString::try_from(s1));
        check!(&res.prefix[0 .. 6] == b"123456");
        check!(
            // SAFETY:
            // We know there is a large string in the union.
unsafe { &res.u.ptr.as_slice_manually(res.len as usize)[.. 15] } == b"1234567890ABCDE" ); check!(res.len == 15); check!(res.is_small() == false); } #[test] fn try_from_chars() { #[expect( clippy::std_instead_of_alloc, reason = "We are testing, this needs std" )] let s1: std::vec::Vec = "1234567890ABCDE".chars().collect(); let_assert!(Ok(res) = TokenString::try_from(s1.as_slice())); check!(&res.prefix[0 .. 6] == b"123456"); check!( // SAFETY: // We know there is a large string in the union. unsafe { &res.u.ptr.as_slice_manually(res.len as usize)[.. 15] } == b"1234567890ABCDE" ); check!(res.len == 15); check!(res.is_small() == false); } #[test] fn try_from_string() { #[expect( clippy::std_instead_of_alloc, reason = "We are testing, this needs std" )] let s1: std::string::String = "1234567890ABCDE".into(); let_assert!(Ok(res) = TokenString::try_from(s1)); check!(&res.prefix[0 .. 6] == b"123456"); check!( // SAFETY: // We know there is a large string in the union. unsafe { &res.u.ptr.as_slice_manually(res.len as usize)[.. 15] } == b"1234567890ABCDE" ); check!(res.len == 15); check!(res.is_small() == false); } #[test] fn try_from_stringref() { #[expect( clippy::std_instead_of_alloc, reason = "We are testing, this needs std" )] let s1: std::string::String = "1234567890ABCDE".into(); let_assert!(Ok(res) = TokenString::try_from(&s1)); check!(&res.prefix[0 .. 6] == b"123456"); check!( // SAFETY: // We know there is a large string in the union. unsafe { &res.u.ptr.as_slice_manually(res.len as usize)[.. 15] } == b"1234567890ABCDE" ); check!(res.len == 15); check!(res.is_small() == false); } #[test] fn from_str_unchecked() { let res = TokenString::from_str_unchecked("1234567890ABCDE"); check!(&res.prefix[0 .. 6] == b"123456"); check!( // SAFETY: // We know there is a large string in the union. unsafe { &res.u.ptr.as_slice_manually(res.len as usize)[.. 
15] } == b"1234567890ABCDE" ); check!(res.len == 15); check!(res.is_small() == false); } #[test] fn from_bytes_unchecked() { let s1: &[u8] = b"1234567890ABCDE"; // SAFETY: // We know that the string is valid UTF-8. let res = unsafe { TokenString::from_bytes_unchecked(s1) }; check!(&res.prefix[0 .. 6] == b"123456"); check!( // SAFETY: // We know there is a large string in the union. unsafe { &res.u.ptr.as_slice_manually(res.len as usize)[.. 15] } == b"1234567890ABCDE" ); check!(res.len == 15); check!(res.is_small() == false); } #[test] fn from_stringref_unchecked() { #[expect( clippy::std_instead_of_alloc, reason = "We are testing, this needs std" )] let s1: std::string::String = "1234567890ABCDE".into(); let res = TokenString::from_string_unchecked(&s1); check!(&res.prefix[0 .. 6] == b"123456"); check!( // SAFETY: // We know there is a large string in the union. unsafe { &res.u.ptr.as_slice_manually(res.len as usize)[.. 15] } == b"1234567890ABCDE" ); check!(res.len == 15); check!(res.is_small() == false); } }