From dbbdfc98aac34cebf73fefd394571ee10a85e617 Mon Sep 17 00:00:00 2001 From: Sachymetsu Date: Sat, 13 Dec 2025 16:16:29 +0100 Subject: [PATCH] feat: Estrisation of Token Change-Id: uzkpmtywylpltnkwsqzxpkokortkspzk --- .tangled/workflows/miri.yml | 2 +- Cargo.lock | 78 +++++++++- crates/nailgen/Cargo.toml | 2 +- crates/nailgen/src/html_gen.rs | 25 +-- crates/nailgen/src/lib.rs | 26 +--- crates/nailkov/Cargo.toml | 4 +- crates/nailkov/src/distribution.rs | 13 +- crates/nailkov/src/interner.rs | 237 ----------------------------- crates/nailkov/src/lib.rs | 57 ++----- crates/nailkov/src/token.rs | 59 ++----- crates/nailroutes/src/lib.rs | 2 - crates/nailstate/src/lib.rs | 26 +--- src/inputs.rs | 15 +- src/main.rs | 4 +- 14 files changed, 133 insertions(+), 417 deletions(-) delete mode 100644 crates/nailkov/src/interner.rs diff --git a/.tangled/workflows/miri.yml b/.tangled/workflows/miri.yml index 72e19b3..104e474 100644 --- a/.tangled/workflows/miri.yml +++ b/.tangled/workflows/miri.yml @@ -16,6 +16,6 @@ steps: rustup override set nightly cargo miri setup - name: Miri Test - command: cargo miri test --locked -p nailkov -p nailbox + command: cargo miri test --locked -p nailbox -p nailgen environment: RUSTFLAGS: -Zrandomize-layout diff --git a/Cargo.lock b/Cargo.lock index 86b0b58..3c2cabc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "anyhow" version = "1.0.100" @@ -168,6 +174,12 @@ version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.11.0" @@ -238,6 +250,18 @@ dependencies = [ "winapi", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crossfig" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40a998414a3656e7a11ca59d55598ce7df58daafd742e783844e80bbd8d500dd" + [[package]] name = "diatomic-waker" version = "0.2.3" @@ -256,6 +280,21 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "estr" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2e77c7dfb1a984132a140c98a805b5f12f8e8707420dda9a6ad698bc50fc041" +dependencies = [ + "byteorder", + "crossfig", + "hashbrown 0.16.1", + "libabort", + "lock_api", + "rapidhash", + "spin", +] + [[package]] name = "eyre" version = "0.6.12" @@ -290,6 +329,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "futures" version = "0.3.31" @@ -491,6 +536,11 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "hermit-abi" @@ -657,6 +707,15 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "libabort" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cec5d1db7977801dd3e593f88a15906cac3c4eb7a69e38b6cc162cb020b22d7d" +dependencies = [ + "rustversion-detect", +] + [[package]] name = "libc" version = "0.2.178" @@ -826,14 +885,16 @@ dependencies = [ name = "nailkov" version = "0.1.0" dependencies = [ + "crossbeam-utils", + "estr", "hashbrown 0.15.5", "indexmap", "itertools", "nailrng", + "parking_lot", "rand", "rand_distr", "rapidhash", - "rustc-hash", "tracing", "unicode-segmentation", ] @@ -1390,18 +1451,18 @@ version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" -[[package]] -name = "rustc-hash" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" - [[package]] name = "rustversion" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "rustversion-detect" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cfa9e87e97427c3a1b472eace073b2bc577ad0e1444c128d938b3d5bcdacb17" + [[package]] name = "ryu" version = "1.0.20" @@ -1545,6 +1606,9 @@ name = "spin" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" +dependencies = [ + "lock_api", +] [[package]] name = "syn" diff --git a/crates/nailgen/Cargo.toml b/crates/nailgen/Cargo.toml index 451ca7e..9dbf5ea 100644 --- a/crates/nailgen/Cargo.toml +++ b/crates/nailgen/Cargo.toml @@ -21,7 +21,7 @@ nailbox = { path = "../nailbox" } color-eyre.workspace = true pin-project-lite.workspace = true futures-lite.workspace = true -axum.workspace = true +axum = { workspace = true, features = ["matched-path"] } rand.workspace = true bytes.workspace = true tracing.workspace = true diff --git a/crates/nailgen/src/html_gen.rs b/crates/nailgen/src/html_gen.rs index 509d8f8..b184867 100644 --- a/crates/nailgen/src/html_gen.rs +++ b/crates/nailgen/src/html_gen.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use axum::extract::MatchedPath; use bytes::{Bytes, BytesMut}; use nailconfig::NailConfig; -use nailkov::{NailKov, interner::Interner}; +use nailkov::NailKov; use nailrng::FastRng; use rand::{Rng, RngCore, distr::Alphanumeric, seq::IndexedRandom}; @@ -24,7 +24,6 @@ fn get_desired_size(config: &NailConfig, rng: &mut impl RngCore) -> usize { /// interned text from the interner. #[inline] pub fn text_generator<'a>( - interner: &'a Interner, chain: &'a NailKov, size: usize, rng: &'a mut impl RngCore, @@ -33,7 +32,7 @@ pub fn text_generator<'a>( .generate_tokens(rng) .take(size) // SAFETY: The id comes from the same interner that allocated it - .flat_map(|token| unsafe { interner.lookup(token).as_bytes() }) + .flat_map(|token| token.as_bytes()) .skip_while(|&text| !text.is_ascii_alphabetic()) } @@ -64,7 +63,6 @@ pub fn static_content<'a>(text: &'a str) -> impl Iterator + 'a { pub async fn initial_content( buf_mut: BytesMut, - interner: Arc, chain: Arc, config: Arc, mut rng: FastRng, @@ -75,7 +73,6 @@ pub async fn initial_content( (0..max_paras) .fold(buf_mut, |mut acc, _| { acc.extend(paragraph( - &interner, &chain, get_desired_size(&config, &mut rng), &mut rng, @@ -88,7 +85,6 @@ pub async fn initial_content( pub async fn main_content( mut buffer: BytesMut, - interner: Arc, chain: Arc, config: Arc, mut rng: FastRng, @@ -96,19 +92,13 @@ pub async fn main_content( buffer.reserve(config.generator.chunk_size * 2); loop { - buffer.extend(header( - &interner, - &chain, - config.generator.header_size, - &mut rng, - )); + buffer.extend(header(&chain, config.generator.header_size, &mut rng)); // Randomise how many paragraphs we want per section let paragraphs = rng.random_range(1..=4); (0..paragraphs).for_each(|_| { buffer.extend(paragraph( - &interner, &chain, get_desired_size(&config, &mut rng), &mut rng, @@ -146,7 +136,6 @@ pub fn extra(buf_mut: &mut BytesMut, config: &NailConfig, rng: &mut FastRng) -> pub async fn footer( mut buf_mut: BytesMut, - interner: Arc, chain: Arc, path: MatchedPath, config: Arc, @@ -166,7 +155,7 @@ pub async fn footer( buf_mut.extend( b"\">" .iter() - .chain(text_generator(&interner, &chain, 8, &mut rng)) + .chain(text_generator(&chain, 8, &mut rng)) .chain(b"\n"), ); } @@ -178,26 +167,24 @@ pub async fn footer( #[inline] fn paragraph<'a>( - interner: &'a Interner, chain: &'a NailKov, size: usize, rng: &'a mut impl RngCore, ) -> impl Iterator + 'a { b"

" .iter() - .chain(text_generator(interner, chain, size, rng)) + .chain(text_generator(chain, size, rng)) .chain(b"

\n") } #[inline] fn header<'a>( - interner: &'a Interner, chain: &'a NailKov, size: usize, rng: &'a mut impl RngCore, ) -> impl Iterator + 'a { b"\n

" .iter() - .chain(text_generator(interner, chain, size, rng)) + .chain(text_generator(chain, size, rng)) .chain(b"

\n") } diff --git a/crates/nailgen/src/lib.rs b/crates/nailgen/src/lib.rs index aa5f40c..4a2d5a1 100644 --- a/crates/nailgen/src/lib.rs +++ b/crates/nailgen/src/lib.rs @@ -16,7 +16,7 @@ use color_eyre::Result; use futures_lite::Stream; use nailbox::{boxed_future_within, try_arc_within}; use nailconfig::NailConfig; -use nailkov::{NailKov, interner::Interner}; +use nailkov::NailKov; use nailrng::FastRng; use pin_project_lite::pin_project; use tokio::time::Sleep; @@ -59,7 +59,6 @@ pin_project! { pub struct MarkovStream { path: MatchedPath, config: Arc, - interner: Arc, markov: MarkovGen, start_time: Instant, total_bytes: usize, @@ -77,14 +76,12 @@ impl MarkovStream { markov: MarkovGen, path: MatchedPath, config: Arc, - interner: Arc, template: Template, rng: FastRng, ) -> Self { Self { path, config, - interner, markov, total_bytes: 0, start_time: Instant::now(), @@ -121,14 +118,9 @@ impl Stream for MarkovStream { let title = this.page_title.get_or_insert_with(|| { this.template.get_static_content().map_or_else( || { - text_generator( - this.interner, - &this.markov.chain, - 24, - this.rng, - ) - .copied() - .collect() + text_generator(&this.markov.chain, 24, this.rng) + .copied() + .collect() }, |title| static_title(title).copied().collect(), ) @@ -144,7 +136,6 @@ impl Stream for MarkovStream { let handle = boxed_future_within(|| { initial_content( buffer, - this.interner.clone(), this.markov.chain.clone(), this.config.clone(), this.rng.fork(), @@ -188,7 +179,6 @@ impl Stream for MarkovStream { let handle = boxed_future_within(|| { footer( buffer, - this.interner.clone(), this.markov.chain.clone(), this.path.clone(), this.config.clone(), @@ -237,7 +227,6 @@ impl Stream for MarkovStream { let handle = boxed_future_within(|| { main_content( buffer, - this.interner.clone(), this.markov.chain.clone(), this.config.clone(), this.rng.fork(), @@ -287,10 +276,10 @@ impl Stream for MarkovStream { } impl MarkovGen { - pub fn new(input: impl AsRef, interner: &mut Interner) -> Result { + pub fn new(input: impl AsRef) -> Result { let file = std::fs::read_to_string(input.as_ref())?; - let chain = try_arc_within(|| NailKov::from_input(interner, &file))?; + let chain = try_arc_within(|| NailKov::from_input(&file))?; Ok(Self { chain }) } @@ -300,10 +289,9 @@ impl MarkovGen { self, path: MatchedPath, config: Arc, - interner: Arc, template: Template, rng: FastRng, ) -> MarkovStream { - MarkovStream::new(self, path, config, interner, template, rng) + MarkovStream::new(self, path, config, template, rng) } } diff --git a/crates/nailkov/Cargo.toml b/crates/nailkov/Cargo.toml index cf86f6c..816ae34 100644 --- a/crates/nailkov/Cargo.toml +++ b/crates/nailkov/Cargo.toml @@ -13,7 +13,6 @@ workspace = true [dependencies] nailrng = { path = "../nailrng" } rapidhash = { workspace = true, features = ["rand", "unsafe"] } -rustc-hash = { version = "2.1.1", default-features = false } hashbrown.workspace = true rand.workspace = true rand_distr.workspace = true @@ -21,3 +20,6 @@ itertools.workspace = true unicode-segmentation.workspace = true tracing.workspace = true indexmap.workspace = true +parking_lot.workspace = true +estr = "1.2.0" +crossbeam-utils = "0.8.21" diff --git a/crates/nailkov/src/distribution.rs b/crates/nailkov/src/distribution.rs index 11f2edf..0c1f031 100644 --- a/crates/nailkov/src/distribution.rs +++ b/crates/nailkov/src/distribution.rs @@ -1,11 +1,14 @@ //! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with //! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov). +use core::hash::BuildHasherDefault; + +use estr::IdentityHasher; use indexmap::IndexMap; use rand::Rng; use rand_distr::{Distribution, weighted::WeightedAliasIndex}; -use crate::{RandomState, error::NailError, token::Token}; +use crate::{error::NailError, token::Token}; /// A distribution of choices and their likelihood. #[derive(Clone, Debug)] @@ -29,13 +32,13 @@ impl Distribution for TokenWeights { #[derive(Clone, Debug)] pub struct TokenWeightsBuilder { /// Counts how many times a token is likely to appear. - occurrences: IndexMap, + occurrences: IndexMap>, } impl TokenWeightsBuilder { - pub fn new(hasher: RandomState) -> Self { + pub fn new() -> Self { Self { - occurrences: IndexMap::with_hasher(hasher), + occurrences: IndexMap::with_hasher(Default::default()), } } @@ -64,6 +67,6 @@ impl TokenWeightsBuilder { impl Default for TokenWeightsBuilder { fn default() -> Self { - Self::new(RandomState::new()) + Self::new() } } diff --git a/crates/nailkov/src/interner.rs b/crates/nailkov/src/interner.rs deleted file mode 100644 index 33cddd3..0000000 --- a/crates/nailkov/src/interner.rs +++ /dev/null @@ -1,237 +0,0 @@ -use hashbrown::{Equivalent, HashMap}; -use rapidhash::fast::RandomState; - -use crate::token::Token; - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -#[repr(transparent)] -struct StringPtr(*const str); - -impl StringPtr { - #[inline(always)] - const fn cast(&self) -> &str { - // SAFETY: The pointer is stable as it points to memory that is never - // moved/invalidated while this struct lives, therefore can be safely - // dereferenced back to a string slice. We own the String instance this - // references, and all StringPtrs are used within the same scope as the - // String instances, so when String drops, these will be dropped too. - unsafe { &*self.0 } - } -} - -impl core::hash::Hash for StringPtr { - #[inline] - fn hash(&self, state: &mut H) { - self.cast().hash(state); - } -} -// SAFETY: StringPtr contains a ptr to the heap, that is never moved or invalidated -// while Interner lives, and all instances of StringPtr live as long as Interner. -// Since the String type is `Send`, so is StringPtr -unsafe impl Send for StringPtr {} -// SAFETY: StringPtr contains a ptr to the heap, that is never moved or invalidated -// while Interner lives, and all instances of StringPtr live as long as Interner. -// Since the String type is `Sync`, so is StringPtr -unsafe impl Sync for StringPtr {} - -#[derive(Debug, Clone)] -pub struct Interner { - collected: HashMap, - index: Vec, - buffer: String, - stored: Vec, -} - -impl Default for Interner { - fn default() -> Self { - Self::with_capacity(256) - } -} - -impl Interner { - /// # Safety - /// The caller must ensure that the [`Token`] being passed in was allocated - /// from the same [`Interner`] instance. - #[inline(always)] - pub unsafe fn lookup(&self, id: Token) -> &str { - // SAFETY: Safety is upheld by the caller ensuring the id was allocated - // from the same interner. - unsafe { self.index.get_unchecked(id.index()).cast() } - } - - pub fn with_capacity(cap: usize) -> Interner { - // This will get us just under 64KiB of interned storage before we - // need to allocate more space for buffer storage. - let stored = Vec::with_capacity(8); - - Interner { - collected: HashMap::with_hasher(RandomState::new()), - index: Vec::new(), - stored, - buffer: String::with_capacity(cap.next_power_of_two()), - } - } - - pub fn intern(&mut self, text: &str) -> Token { - if let Some(&id) = self.collected.get(text) { - return id; - } - - // SAFETY: `alloc`` is never called elsewhere, nor the properties it controls - // are modified outside of the method. Here we get a new StringPtr for `text` that - // hasn't been stored before. - let name = unsafe { self.alloc(text) }; - let id = Token::new(self.index.len() as u32); - self.collected.insert(name, id); - self.index.push(name); - - // SAFETY: We are using the id allocated within the same function scope, - // so it is always from the same source. - unsafe { - debug_assert!(self.lookup(id).equivalent(&name)); - } - debug_assert!(self.intern(name.cast()) == id); - - id - } - - /// Allocates a new [`StringPtr`] for the given string input. If there is no more room - /// in the current buffer, it allocates a new buffer and creates the StringPtr to reference - /// the stored string in the new buffer, storing the old one. - /// - /// # Safety - /// - /// The caller must ensure that `self.buffers` and `self.active` are never modified elsewhere, - /// and that this is called only for new instances of `text`. - unsafe fn alloc(&mut self, text: &str) -> StringPtr { - let capacity = self.buffer.capacity(); - - if capacity < self.buffer.len() + text.len() { - // If we ran out of capacity in our storage, allocate a new buffer with - // larger capacity. - let new_cap = (capacity.max(text.len()) + 1).next_power_of_two(); - let old_buf = core::mem::replace(&mut self.buffer, String::with_capacity(new_cap)); - - self.stored.push(old_buf); - } - - // Construct raw str slice to eliminate lifetime tracking as we manage its - // lifetime within the Interner instance. - let interned = { - let start = self.buffer.len(); - self.buffer.push_str(text); - - &raw const self.buffer[start..] - }; - - StringPtr(interned) - } -} - -impl Equivalent for str { - #[inline(always)] - fn equivalent(&self, key: &StringPtr) -> bool { - key.cast().eq(self) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn string_ptr_comparisons() { - let one = "one"; - let two = "two"; - - let one_ptr = StringPtr(one); - let two_ptr = StringPtr(two); - - assert_ne!(one_ptr, two_ptr); - - assert!(one.equivalent(&one_ptr)); - } - - #[test] - fn is_able_to_intern_one_string() { - let mut interner = Interner::default(); - - assert!(interner.buffer.is_empty()); - - let text = "Lorem ipsum"; - - let id = interner.intern(text); - - // SAFETY: It comes from the same source - unsafe { - assert_eq!(text, interner.lookup(id)); - } - assert_eq!(interner.buffer.len(), 11); - - let again = interner.intern(text); - - assert_eq!(id, again); - assert_eq!(interner.buffer.len(), 11); - } - - #[test] - fn is_able_to_intern_many_strings() { - let mut interner = Interner::with_capacity(32); - - let texts = [ - "Lorem ipsum", - "dolor sit amet", - "duplicated", - "Other text", - "Elevenses", - "duplicated", - "Gibberish", - ]; - - let interned: Vec = texts.iter().map(|&text| interner.intern(text)).collect(); - - assert_eq!( - interned.as_slice(), - &[ - Token::new(0), - Token::new(1), - Token::new(2), - Token::new(3), - Token::new(4), - Token::new(2), - Token::new(5) - ] - ); - assert_eq!(interner.buffer.capacity(), 64); - assert_eq!(interner.stored.len(), 1); - assert_eq!(interner.stored[0].capacity(), 32); - } - - #[test] - fn is_thread_safe() { - let mut interner = Interner::with_capacity(32); - - let texts = [ - "Lorem ipsum", - "dolor sit amet", - "duplicated", - "Other text", - "Elevenses", - "duplicated", - "Gibberish", - ]; - - let interned: Vec = texts.iter().map(|&text| interner.intern(text)).collect(); - - std::thread::scope(|s| { - s.spawn(move || { - for (id, expected) in interned.into_iter().zip(texts) { - // SAFETY: It comes from the same source - unsafe { - assert_eq!(expected, interner.lookup(id)); - } - } - }); - }); - } -} diff --git a/crates/nailkov/src/lib.rs b/crates/nailkov/src/lib.rs index ed0548d..0c634cb 100644 --- a/crates/nailkov/src/lib.rs +++ b/crates/nailkov/src/lib.rs @@ -4,54 +4,23 @@ mod distribution; mod error; -pub mod interner; mod token; +use crossbeam_utils::CachePadded; use error::NailError; use indexmap::IndexMap; -use interner::Interner; use itertools::Itertools; -use nailrng::FastRng; use rand::{RngCore, seq::IteratorRandom}; use rand_distr::Distribution; use distribution::{TokenWeights, TokenWeightsBuilder}; -use rustc_hash::FxHasher; +use rapidhash::fast::RandomState; use token::{Token, TokenPair}; use unicode_segmentation::UnicodeSegmentation; -#[derive(Clone)] -pub struct RandomState { - seed: usize, -} - -impl RandomState { - fn new() -> Self { - let mut rng = FastRng::default(); - - Self { - seed: rng.next_u64() as usize, - } - } -} - -impl Default for RandomState { - fn default() -> Self { - Self::new() - } -} - -impl core::hash::BuildHasher for RandomState { - type Hasher = FxHasher; - - fn build_hasher(&self) -> Self::Hasher { - FxHasher::with_seed(self.seed) - } -} - #[derive(Clone, Debug)] pub struct NailKov { - chain: IndexMap, + chain: CachePadded>, } pub struct NailKovIter<'a, R: RngCore> { @@ -89,8 +58,8 @@ impl NailKov { } impl NailKov { - pub fn from_input(interner: &mut Interner, input: &str) -> Result { - NailBuilder::new(RandomState::new()).with_input(interner, input) + pub fn from_input(input: &str) -> Result { + NailBuilder::new(RandomState::new()).with_input(input) } } @@ -105,8 +74,8 @@ impl NailBuilder { } } - fn with_input(self, interned: &mut Interner, input: &str) -> Result { - self.feed_str(interned, input)?.build() + fn with_input(self, input: &str) -> Result { + self.feed_str(input)?.build() } fn build(self) -> Result { @@ -128,7 +97,7 @@ impl NailBuilder { return Err(NailError::EmptyInput); } - Ok(NailKov { chain }) + Ok(NailKov { chain: CachePadded::new(chain) }) } /// Add the occurrence of `next` following `prev`. @@ -138,19 +107,15 @@ impl NailBuilder { builder.add(next); } None => { - let mut builder = TokenWeightsBuilder::new(self.chain.hasher().clone()); + let mut builder = TokenWeightsBuilder::new(); builder.add(next); self.chain.insert(prev, builder); } } } - fn feed_str(self, interner: &mut Interner, content: &str) -> Result { - self.feed_tokens( - content - .split_word_bounds() - .map(|text| interner.intern(text)), - ) + fn feed_str(self, content: &str) -> Result { + self.feed_tokens(content.split_word_bounds().map(Token::from)) } fn feed_tokens(mut self, tokens: impl Iterator) -> Result { diff --git a/crates/nailkov/src/token.rs b/crates/nailkov/src/token.rs index d71b880..b2d36e4 100644 --- a/crates/nailkov/src/token.rs +++ b/crates/nailkov/src/token.rs @@ -1,33 +1,30 @@ -use std::ops::Deref; +use estr::Estr; /// Representation of a string segment. #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] #[repr(transparent)] -pub struct Token(u32); +pub struct Token(Estr); impl Token { #[inline(always)] - pub const fn new(ptr: u32) -> Self { - Self(ptr) + pub fn new(str: &str) -> Self { + Self(Estr::from(str)) } - #[inline(always)] - pub(crate) const fn index(&self) -> usize { - self.0 as usize + #[inline] + pub fn as_str(&self) -> &'static str { + self.0.as_str() } - #[inline(always)] - const fn to_bits(self) -> u32 { - self.0 + #[inline] + pub fn as_bytes(&self) -> &'static [u8] { + self.0.as_str().as_bytes() } } -impl Deref for Token { - type Target = u32; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.0 +impl From<&str> for Token { + fn from(value: &str) -> Self { + Self::new(value) } } @@ -37,7 +34,7 @@ impl Deref for Token { // optimized codegen for `to_bits`, `PartialEq` // Prior art taken from my contribution to Bevy: // https://github.com/bevyengine/bevy/blob/main/crates/bevy_ecs/src/entity/mod.rs#L309 -#[repr(C, align(8))] +#[repr(C, align(16))] pub struct TokenPair { // Do not reorder the fields here. The ordering is explicitly used by repr(C) // to make this struct equivalent to a u64. @@ -76,8 +73,8 @@ impl TokenPair { } #[inline(always)] - const fn to_bits(self) -> u64 { - self.left.to_bits() as u64 | ((self.right.to_bits() as u64) << 32) + fn to_bits(self) -> u128 { + (self.left.0.digest().hash() as u128) | ((self.right.0.digest().hash() as u128) << 64) } } @@ -87,27 +84,3 @@ impl AsRef for TokenPair { self } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn token_smoke_testing() { - let left = Token(0x2); - let right = Token(0x2b); - - let pair = TokenPair::new(left, right); - - assert_eq!(pair.to_bits(), 0x2b00000002); - assert_eq!(pair.left, left); - assert_eq!(pair.right, right); - - let other_right = Token(0x2c); - - let other_pair = TokenPair::new(left, other_right); - - assert_eq!(other_pair.to_bits(), 0x2c00000002); - assert_ne!(pair, other_pair); - } -} diff --git a/crates/nailroutes/src/lib.rs b/crates/nailroutes/src/lib.rs index 9a5248e..626e3b5 100644 --- a/crates/nailroutes/src/lib.rs +++ b/crates/nailroutes/src/lib.rs @@ -24,7 +24,6 @@ async fn warning( .into_stream( matched, config.clone_inner(), - inputs.get_interner(), inputs.get_warning_template(), rng, ) @@ -46,7 +45,6 @@ async fn generated( .into_stream( matched, config.clone_inner(), - inputs.get_interner(), inputs.get_generated_template(), rng, ) diff --git a/crates/nailstate/src/lib.rs b/crates/nailstate/src/lib.rs index 4afe31c..7fb0413 100644 --- a/crates/nailstate/src/lib.rs +++ b/crates/nailstate/src/lib.rs @@ -3,7 +3,6 @@ use std::{convert::Infallible, ops::Deref, sync::Arc}; use axum::extract::{FromRef, FromRequestParts}; use nailconfig::NailConfig; use nailgen::{GeneratedTemplate, MarkovGen, Template, WarningTemplate}; -use nailkov::interner::Interner; use nailrng::FastRng; use nailspicy::SpicyPayloads; use rand::seq::IndexedRandom; @@ -34,21 +33,12 @@ impl NailPayloads { #[derive(Clone)] pub struct NailInputs { chains: Arc<[MarkovGen]>, - interner: Arc, templates: Arc, } impl NailInputs { - pub fn new( - chains: Arc<[MarkovGen]>, - interner: Arc, - templates: Arc, - ) -> Self { - Self { - chains, - interner, - templates, - } + pub fn new(chains: Arc<[MarkovGen]>, templates: Arc) -> Self { + Self { chains, templates } } /// Pulls a random markov chain from the available list. Returns a cloned @@ -64,11 +54,6 @@ impl NailInputs { } } - #[inline] - pub fn get_interner(&self) -> Arc { - self.interner.clone() - } - #[inline] pub fn get_warning_template(&self) -> Template { Template::from(self.templates.warning.clone()) @@ -121,7 +106,6 @@ impl ServerState { pub fn new( config: impl Into, chains: Arc<[MarkovGen]>, - interner: Arc, templates: Arc, spicy_payloads: Option>, ) -> Self { @@ -129,11 +113,7 @@ impl ServerState { Self { config, - inputs: NailInputs { - chains, - interner, - templates, - }, + inputs: NailInputs { chains, templates }, spicy_payloads: NailPayloads { spicy_payloads }, } } diff --git a/src/inputs.rs b/src/inputs.rs index 2c4aa58..0c85d78 100644 --- a/src/inputs.rs +++ b/src/inputs.rs @@ -2,28 +2,21 @@ use std::{fs::read_to_string, sync::Arc}; use color_eyre::eyre::Context; use glob::glob; -use nailbox::{arc_within, try_arc_within}; +use nailbox::try_arc_within; use nailconfig::NailConfig; use nailgen::{GeneratedTemplate, MarkovGen, WarningTemplate}; -use nailkov::interner::Interner; use nailstate::Templates; /// Takes a glob for finding all input files and returns a read-only list of /// all markov chains that can be generated. -pub fn get_input_files( - config: &NailConfig, -) -> color_eyre::Result<(Arc<[MarkovGen]>, Arc)> { - let mut interner = arc_within(|| Interner::with_capacity(512)); - - let interned_mut = Arc::get_mut(&mut interner).unwrap(); - +pub fn get_input_files(config: &NailConfig) -> color_eyre::Result> { let inputs = glob(&config.generator.input_files)? .filter_map(|path| { path.inspect_err(|err| tracing::error!("IO Error: {err}")) .ok() }) .filter_map(|input| { - MarkovGen::new(input, interned_mut) + MarkovGen::new(input) .inspect_err(|err| tracing::error!("Markov Error: {err}")) .ok() }) @@ -33,7 +26,7 @@ pub fn get_input_files( color_eyre::eyre::bail!("No input files found! Exiting..."); } - Ok((inputs, interner)) + Ok(inputs) } pub fn get_template_files(config: &NailConfig) -> color_eyre::Result> { diff --git a/src/main.rs b/src/main.rs index 52f03a3..625a5d2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -34,14 +34,14 @@ fn main() -> Result<()> { let config = nailconfig::get_configuration()?; - let (inputs, interner) = nailpit::inputs::get_input_files(config.as_ref())?; + let inputs = nailpit::inputs::get_input_files(config.as_ref())?; let templates = nailpit::inputs::get_template_files(config.as_ref())?; let spicy = nailspicy::get_spicy_payload(config.as_ref()); nailrt::start( - nailstate::ServerState::new(config, inputs, interner, templates, spicy), + nailstate::ServerState::new(config, inputs, templates, spicy), spawn_axum_worker, )?; -- 2.52.0