feat: Estrisation of Token #4

merged
opened by sachy.dev targeting main from token-refactor
Changed files
+133 -417
.tangled
workflows
crates
nailgen
nailkov
nailroutes
src
nailstate
src
src
+1 -1
.tangled/workflows/miri.yml
··· 16 16 rustup override set nightly 17 17 cargo miri setup 18 18 - name: Miri Test 19 - command: cargo miri test --locked -p nailkov -p nailbox 19 + command: cargo miri test --locked -p nailbox -p nailgen 20 20 environment: 21 21 RUSTFLAGS: -Zrandomize-layout
+71 -7
Cargo.lock
··· 26 26 "memchr", 27 27 ] 28 28 29 + [[package]] 30 + name = "allocator-api2" 31 + version = "0.2.21" 32 + source = "registry+https://github.com/rust-lang/crates.io-index" 33 + checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" 34 + 29 35 [[package]] 30 36 name = "anyhow" 31 37 version = "1.0.100" ··· 168 174 source = "registry+https://github.com/rust-lang/crates.io-index" 169 175 checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" 170 176 177 + [[package]] 178 + name = "byteorder" 179 + version = "1.5.0" 180 + source = "registry+https://github.com/rust-lang/crates.io-index" 181 + checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 182 + 171 183 [[package]] 172 184 name = "bytes" 173 185 version = "1.11.0" ··· 238 250 "winapi", 239 251 ] 240 252 253 + [[package]] 254 + name = "crossbeam-utils" 255 + version = "0.8.21" 256 + source = "registry+https://github.com/rust-lang/crates.io-index" 257 + checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" 258 + 259 + [[package]] 260 + name = "crossfig" 261 + version = "0.1.3" 262 + source = "registry+https://github.com/rust-lang/crates.io-index" 263 + checksum = "40a998414a3656e7a11ca59d55598ce7df58daafd742e783844e80bbd8d500dd" 264 + 241 265 [[package]] 242 266 name = "diatomic-waker" 243 267 version = "0.2.3" ··· 256 280 source = "registry+https://github.com/rust-lang/crates.io-index" 257 281 checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" 258 282 283 + [[package]] 284 + name = "estr" 285 + version = "1.2.0" 286 + source = "registry+https://github.com/rust-lang/crates.io-index" 287 + checksum = "e2e77c7dfb1a984132a140c98a805b5f12f8e8707420dda9a6ad698bc50fc041" 288 + dependencies = [ 289 + "byteorder", 290 + "crossfig", 291 + "hashbrown 0.16.1", 292 + "libabort", 293 + "lock_api", 294 + "rapidhash", 295 + "spin", 296 + ] 297 + 259 298 [[package]] 260 299 name = "eyre" 261 300 version = 
"0.6.12" ··· 290 329 source = "registry+https://github.com/rust-lang/crates.io-index" 291 330 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" 292 331 332 + [[package]] 333 + name = "foldhash" 334 + version = "0.2.0" 335 + source = "registry+https://github.com/rust-lang/crates.io-index" 336 + checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" 337 + 293 338 [[package]] 294 339 name = "futures" 295 340 version = "0.3.31" ··· 491 536 version = "0.16.1" 492 537 source = "registry+https://github.com/rust-lang/crates.io-index" 493 538 checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" 539 + dependencies = [ 540 + "allocator-api2", 541 + "equivalent", 542 + "foldhash", 543 + ] 494 544 495 545 [[package]] 496 546 name = "hermit-abi" ··· 657 707 source = "registry+https://github.com/rust-lang/crates.io-index" 658 708 checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 659 709 710 + [[package]] 711 + name = "libabort" 712 + version = "0.1.9" 713 + source = "registry+https://github.com/rust-lang/crates.io-index" 714 + checksum = "cec5d1db7977801dd3e593f88a15906cac3c4eb7a69e38b6cc162cb020b22d7d" 715 + dependencies = [ 716 + "rustversion-detect", 717 + ] 718 + 660 719 [[package]] 661 720 name = "libc" 662 721 version = "0.2.178" ··· 826 885 name = "nailkov" 827 886 version = "0.1.0" 828 887 dependencies = [ 888 + "crossbeam-utils", 889 + "estr", 829 890 "hashbrown 0.15.5", 830 891 "indexmap", 831 892 "itertools", 832 893 "nailrng", 894 + "parking_lot", 833 895 "rand", 834 896 "rand_distr", 835 897 "rapidhash", 836 - "rustc-hash", 837 898 "tracing", 838 899 "unicode-segmentation", 839 900 ] ··· 1390 1451 source = "registry+https://github.com/rust-lang/crates.io-index" 1391 1452 checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" 1392 1453 1393 - [[package]] 1394 - name = "rustc-hash" 1395 - version = "2.1.1" 1396 - source = 
"registry+https://github.com/rust-lang/crates.io-index" 1397 - checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" 1398 - 1399 1454 [[package]] 1400 1455 name = "rustversion" 1401 1456 version = "1.0.22" 1402 1457 source = "registry+https://github.com/rust-lang/crates.io-index" 1403 1458 checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" 1404 1459 1460 + [[package]] 1461 + name = "rustversion-detect" 1462 + version = "0.1.3" 1463 + source = "registry+https://github.com/rust-lang/crates.io-index" 1464 + checksum = "4cfa9e87e97427c3a1b472eace073b2bc577ad0e1444c128d938b3d5bcdacb17" 1465 + 1405 1466 [[package]] 1406 1467 name = "ryu" 1407 1468 version = "1.0.20" ··· 1545 1606 version = "0.10.0" 1546 1607 source = "registry+https://github.com/rust-lang/crates.io-index" 1547 1608 checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" 1609 + dependencies = [ 1610 + "lock_api", 1611 + ] 1548 1612 1549 1613 [[package]] 1550 1614 name = "syn"
+1 -1
crates/nailgen/Cargo.toml
··· 21 21 color-eyre.workspace = true 22 22 pin-project-lite.workspace = true 23 23 futures-lite.workspace = true 24 - axum.workspace = true 24 + axum = { workspace = true, features = ["matched-path"] } 25 25 rand.workspace = true 26 26 bytes.workspace = true 27 27 tracing.workspace = true
+6 -19
crates/nailgen/src/html_gen.rs
··· 3 3 use axum::extract::MatchedPath; 4 4 use bytes::{Bytes, BytesMut}; 5 5 use nailconfig::NailConfig; 6 - use nailkov::{NailKov, interner::Interner}; 6 + use nailkov::NailKov; 7 7 use nailrng::FastRng; 8 8 use rand::{Rng, RngCore, distr::Alphanumeric, seq::IndexedRandom}; 9 9 ··· 24 24 /// interned text from the interner. 25 25 #[inline] 26 26 pub fn text_generator<'a>( 27 - interner: &'a Interner, 28 27 chain: &'a NailKov, 29 28 size: usize, 30 29 rng: &'a mut impl RngCore, ··· 33 32 .generate_tokens(rng) 34 33 .take(size) 35 34 // Each token resolves to its own interned text, so no interner lookup is needed 36 - .flat_map(|token| unsafe { interner.lookup(token).as_bytes() }) 35 + .flat_map(|token| token.as_bytes()) 37 36 .skip_while(|&text| !text.is_ascii_alphabetic()) 38 37 } 39 38 ··· 64 63 65 64 pub async fn initial_content( 66 65 buf_mut: BytesMut, 67 - interner: Arc<Interner>, 68 66 chain: Arc<NailKov>, 69 67 config: Arc<NailConfig>, 70 68 mut rng: FastRng, ··· 75 73 (0..max_paras) 76 74 .fold(buf_mut, |mut acc, _| { 77 75 acc.extend(paragraph( 78 - &interner, 79 76 &chain, 80 77 get_desired_size(&config, &mut rng), 81 78 &mut rng, ··· 88 85 89 86 pub async fn main_content( 90 87 mut buffer: BytesMut, 91 - interner: Arc<Interner>, 92 88 chain: Arc<NailKov>, 93 89 config: Arc<NailConfig>, 94 90 mut rng: FastRng, ··· 96 92 buffer.reserve(config.generator.chunk_size * 2); 97 93 98 94 loop { 99 - buffer.extend(header( 100 - &interner, 101 - &chain, 102 - config.generator.header_size, 103 - &mut rng, 104 - )); 95 + buffer.extend(header(&chain, config.generator.header_size, &mut rng)); 105 96 106 97 // Randomise how many paragraphs we want per section 107 98 let paragraphs = rng.random_range(1..=4); 108 99 109 100 (0..paragraphs).for_each(|_| { 110 101 buffer.extend(paragraph( 111 - &interner, 112 102 &chain, 113 103 get_desired_size(&config, &mut rng), 114 104 &mut rng, ··· 146 136 147 137 pub async fn footer( 148 138 mut buf_mut: BytesMut, 149 - interner: Arc<Interner>, 150 
139 chain: Arc<NailKov>, 151 140 path: MatchedPath, 152 141 config: Arc<NailConfig>, ··· 166 155 buf_mut.extend( 167 156 b"\">" 168 157 .iter() 169 - .chain(text_generator(&interner, &chain, 8, &mut rng)) 158 + .chain(text_generator(&chain, 8, &mut rng)) 170 159 .chain(b"</a></li>\n"), 171 160 ); 172 161 } ··· 178 167 179 168 #[inline] 180 169 fn paragraph<'a>( 181 - interner: &'a Interner, 182 170 chain: &'a NailKov, 183 171 size: usize, 184 172 rng: &'a mut impl RngCore, 185 173 ) -> impl Iterator<Item = &'a u8> + 'a { 186 174 b"<p>" 187 175 .iter() 188 - .chain(text_generator(interner, chain, size, rng)) 176 + .chain(text_generator(chain, size, rng)) 189 177 .chain(b"</p>\n") 190 178 } 191 179 192 180 #[inline] 193 181 fn header<'a>( 194 - interner: &'a Interner, 195 182 chain: &'a NailKov, 196 183 size: usize, 197 184 rng: &'a mut impl RngCore, 198 185 ) -> impl Iterator<Item = &'a u8> + 'a { 199 186 b"\n<h2>" 200 187 .iter() 201 - .chain(text_generator(interner, chain, size, rng)) 188 + .chain(text_generator(chain, size, rng)) 202 189 .chain(b"</h2>\n") 203 190 }
+7 -19
crates/nailgen/src/lib.rs
··· 16 16 use futures_lite::Stream; 17 17 use nailbox::{boxed_future_within, try_arc_within}; 18 18 use nailconfig::NailConfig; 19 - use nailkov::{NailKov, interner::Interner}; 19 + use nailkov::NailKov; 20 20 use nailrng::FastRng; 21 21 use pin_project_lite::pin_project; 22 22 use tokio::time::Sleep; ··· 59 59 pub struct MarkovStream { 60 60 path: MatchedPath, 61 61 config: Arc<NailConfig>, 62 - interner: Arc<Interner>, 63 62 markov: MarkovGen, 64 63 start_time: Instant, 65 64 total_bytes: usize, ··· 77 76 markov: MarkovGen, 78 77 path: MatchedPath, 79 78 config: Arc<NailConfig>, 80 - interner: Arc<Interner>, 81 79 template: Template, 82 80 rng: FastRng, 83 81 ) -> Self { 84 82 Self { 85 83 path, 86 84 config, 87 - interner, 88 85 markov, 89 86 total_bytes: 0, 90 87 start_time: Instant::now(), ··· 121 118 let title = this.page_title.get_or_insert_with(|| { 122 119 this.template.get_static_content().map_or_else( 123 120 || { 124 - text_generator( 125 - this.interner, 126 - &this.markov.chain, 127 - 24, 128 - this.rng, 129 - ) 130 - .copied() 131 - .collect() 121 + text_generator(&this.markov.chain, 24, this.rng) 122 + .copied() 123 + .collect() 132 124 }, 133 125 |title| static_title(title).copied().collect(), 134 126 ) ··· 144 136 let handle = boxed_future_within(|| { 145 137 initial_content( 146 138 buffer, 147 - this.interner.clone(), 148 139 this.markov.chain.clone(), 149 140 this.config.clone(), 150 141 this.rng.fork(), ··· 188 179 let handle = boxed_future_within(|| { 189 180 footer( 190 181 buffer, 191 - this.interner.clone(), 192 182 this.markov.chain.clone(), 193 183 this.path.clone(), 194 184 this.config.clone(), ··· 237 227 let handle = boxed_future_within(|| { 238 228 main_content( 239 229 buffer, 240 - this.interner.clone(), 241 230 this.markov.chain.clone(), 242 231 this.config.clone(), 243 232 this.rng.fork(), ··· 287 276 } 288 277 289 278 impl MarkovGen { 290 - pub fn new(input: impl AsRef<Path>, interner: &mut Interner) -> Result<Self> { 279 + pub 
fn new(input: impl AsRef<Path>) -> Result<Self> { 291 280 let file = std::fs::read_to_string(input.as_ref())?; 292 281 293 - let chain = try_arc_within(|| NailKov::from_input(interner, &file))?; 282 + let chain = try_arc_within(|| NailKov::from_input(&file))?; 294 283 295 284 Ok(Self { chain }) 296 285 } ··· 300 289 self, 301 290 path: MatchedPath, 302 291 config: Arc<NailConfig>, 303 - interner: Arc<Interner>, 304 292 template: Template, 305 293 rng: FastRng, 306 294 ) -> MarkovStream { 307 - MarkovStream::new(self, path, config, interner, template, rng) 295 + MarkovStream::new(self, path, config, template, rng) 308 296 } 309 297 }
+3 -1
crates/nailkov/Cargo.toml
··· 13 13 [dependencies] 14 14 nailrng = { path = "../nailrng" } 15 15 rapidhash = { workspace = true, features = ["rand", "unsafe"] } 16 - rustc-hash = { version = "2.1.1", default-features = false } 17 16 hashbrown.workspace = true 18 17 rand.workspace = true 19 18 rand_distr.workspace = true ··· 21 20 unicode-segmentation.workspace = true 22 21 tracing.workspace = true 23 22 indexmap.workspace = true 23 + parking_lot.workspace = true 24 + estr = "1.2.0" 25 + crossbeam-utils = "0.8.21"
+8 -5
crates/nailkov/src/distribution.rs
··· 1 1 //! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with 2 2 //! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov). 3 3 4 + use core::hash::BuildHasherDefault; 5 + 6 + use estr::IdentityHasher; 4 7 use indexmap::IndexMap; 5 8 use rand::Rng; 6 9 use rand_distr::{Distribution, weighted::WeightedAliasIndex}; 7 10 8 - use crate::{RandomState, error::NailError, token::Token}; 11 + use crate::{error::NailError, token::Token}; 9 12 10 13 /// A distribution of choices and their likelihood. 11 14 #[derive(Clone, Debug)] ··· 29 32 #[derive(Clone, Debug)] 30 33 pub struct TokenWeightsBuilder { 31 34 /// Counts how many times a token is likely to appear. 32 - occurrences: IndexMap<Token, u32, RandomState>, 35 + occurrences: IndexMap<Token, u32, BuildHasherDefault<IdentityHasher>>, 33 36 } 34 37 35 38 impl TokenWeightsBuilder { 36 - pub fn new(hasher: RandomState) -> Self { 39 + pub fn new() -> Self { 37 40 Self { 38 - occurrences: IndexMap::with_hasher(hasher), 41 + occurrences: IndexMap::with_hasher(Default::default()), 39 42 } 40 43 } 41 44 ··· 64 67 65 68 impl Default for TokenWeightsBuilder { 66 69 fn default() -> Self { 67 - Self::new(RandomState::new()) 70 + Self::new() 68 71 } 69 72 }
-237
crates/nailkov/src/interner.rs
··· 1 - use hashbrown::{Equivalent, HashMap}; 2 - use rapidhash::fast::RandomState; 3 - 4 - use crate::token::Token; 5 - 6 - #[derive(Debug, Clone, Copy, PartialEq, Eq)] 7 - #[repr(transparent)] 8 - struct StringPtr(*const str); 9 - 10 - impl StringPtr { 11 - #[inline(always)] 12 - const fn cast(&self) -> &str { 13 - // SAFETY: The pointer is stable as it points to memory that is never 14 - // moved/invalidated while this struct lives, therefore can be safely 15 - // dereferenced back to a string slice. We own the String instance this 16 - // references, and all StringPtrs are used within the same scope as the 17 - // String instances, so when String drops, these will be dropped too. 18 - unsafe { &*self.0 } 19 - } 20 - } 21 - 22 - impl core::hash::Hash for StringPtr { 23 - #[inline] 24 - fn hash<H: std::hash::Hasher>(&self, state: &mut H) { 25 - self.cast().hash(state); 26 - } 27 - } 28 - // SAFETY: StringPtr contains a ptr to the heap, that is never moved or invalidated 29 - // while Interner lives, and all instances of StringPtr live as long as Interner. 30 - // Since the String type is `Send`, so is StringPtr 31 - unsafe impl Send for StringPtr {} 32 - // SAFETY: StringPtr contains a ptr to the heap, that is never moved or invalidated 33 - // while Interner lives, and all instances of StringPtr live as long as Interner. 34 - // Since the String type is `Sync`, so is StringPtr 35 - unsafe impl Sync for StringPtr {} 36 - 37 - #[derive(Debug, Clone)] 38 - pub struct Interner { 39 - collected: HashMap<StringPtr, Token, RandomState>, 40 - index: Vec<StringPtr>, 41 - buffer: String, 42 - stored: Vec<String>, 43 - } 44 - 45 - impl Default for Interner { 46 - fn default() -> Self { 47 - Self::with_capacity(256) 48 - } 49 - } 50 - 51 - impl Interner { 52 - /// # Safety 53 - /// The caller must ensure that the [`Token`] being passed in was allocated 54 - /// from the same [`Interner`] instance. 
55 - #[inline(always)] 56 - pub unsafe fn lookup(&self, id: Token) -> &str { 57 - // SAFETY: Safety is upheld by the caller ensuring the id was allocated 58 - // from the same interner. 59 - unsafe { self.index.get_unchecked(id.index()).cast() } 60 - } 61 - 62 - pub fn with_capacity(cap: usize) -> Interner { 63 - // This will get us just under 64KiB of interned storage before we 64 - // need to allocate more space for buffer storage. 65 - let stored = Vec::with_capacity(8); 66 - 67 - Interner { 68 - collected: HashMap::with_hasher(RandomState::new()), 69 - index: Vec::new(), 70 - stored, 71 - buffer: String::with_capacity(cap.next_power_of_two()), 72 - } 73 - } 74 - 75 - pub fn intern(&mut self, text: &str) -> Token { 76 - if let Some(&id) = self.collected.get(text) { 77 - return id; 78 - } 79 - 80 - // SAFETY: `alloc`` is never called elsewhere, nor the properties it controls 81 - // are modified outside of the method. Here we get a new StringPtr for `text` that 82 - // hasn't been stored before. 83 - let name = unsafe { self.alloc(text) }; 84 - let id = Token::new(self.index.len() as u32); 85 - self.collected.insert(name, id); 86 - self.index.push(name); 87 - 88 - // SAFETY: We are using the id allocated within the same function scope, 89 - // so it is always from the same source. 90 - unsafe { 91 - debug_assert!(self.lookup(id).equivalent(&name)); 92 - } 93 - debug_assert!(self.intern(name.cast()) == id); 94 - 95 - id 96 - } 97 - 98 - /// Allocates a new [`StringPtr`] for the given string input. If there is no more room 99 - /// in the current buffer, it allocates a new buffer and creates the StringPtr to reference 100 - /// the stored string in the new buffer, storing the old one. 101 - /// 102 - /// # Safety 103 - /// 104 - /// The caller must ensure that `self.buffers` and `self.active` are never modified elsewhere, 105 - /// and that this is called only for new instances of `text`. 
106 - unsafe fn alloc(&mut self, text: &str) -> StringPtr { 107 - let capacity = self.buffer.capacity(); 108 - 109 - if capacity < self.buffer.len() + text.len() { 110 - // If we ran out of capacity in our storage, allocate a new buffer with 111 - // larger capacity. 112 - let new_cap = (capacity.max(text.len()) + 1).next_power_of_two(); 113 - let old_buf = core::mem::replace(&mut self.buffer, String::with_capacity(new_cap)); 114 - 115 - self.stored.push(old_buf); 116 - } 117 - 118 - // Construct raw str slice to eliminate lifetime tracking as we manage its 119 - // lifetime within the Interner instance. 120 - let interned = { 121 - let start = self.buffer.len(); 122 - self.buffer.push_str(text); 123 - 124 - &raw const self.buffer[start..] 125 - }; 126 - 127 - StringPtr(interned) 128 - } 129 - } 130 - 131 - impl Equivalent<StringPtr> for str { 132 - #[inline(always)] 133 - fn equivalent(&self, key: &StringPtr) -> bool { 134 - key.cast().eq(self) 135 - } 136 - } 137 - 138 - #[cfg(test)] 139 - mod tests { 140 - use super::*; 141 - 142 - #[test] 143 - fn string_ptr_comparisons() { 144 - let one = "one"; 145 - let two = "two"; 146 - 147 - let one_ptr = StringPtr(one); 148 - let two_ptr = StringPtr(two); 149 - 150 - assert_ne!(one_ptr, two_ptr); 151 - 152 - assert!(one.equivalent(&one_ptr)); 153 - } 154 - 155 - #[test] 156 - fn is_able_to_intern_one_string() { 157 - let mut interner = Interner::default(); 158 - 159 - assert!(interner.buffer.is_empty()); 160 - 161 - let text = "Lorem ipsum"; 162 - 163 - let id = interner.intern(text); 164 - 165 - // SAFETY: It comes from the same source 166 - unsafe { 167 - assert_eq!(text, interner.lookup(id)); 168 - } 169 - assert_eq!(interner.buffer.len(), 11); 170 - 171 - let again = interner.intern(text); 172 - 173 - assert_eq!(id, again); 174 - assert_eq!(interner.buffer.len(), 11); 175 - } 176 - 177 - #[test] 178 - fn is_able_to_intern_many_strings() { 179 - let mut interner = Interner::with_capacity(32); 180 - 181 - let texts = [ 
182 - "Lorem ipsum", 183 - "dolor sit amet", 184 - "duplicated", 185 - "Other text", 186 - "Elevenses", 187 - "duplicated", 188 - "Gibberish", 189 - ]; 190 - 191 - let interned: Vec<Token> = texts.iter().map(|&text| interner.intern(text)).collect(); 192 - 193 - assert_eq!( 194 - interned.as_slice(), 195 - &[ 196 - Token::new(0), 197 - Token::new(1), 198 - Token::new(2), 199 - Token::new(3), 200 - Token::new(4), 201 - Token::new(2), 202 - Token::new(5) 203 - ] 204 - ); 205 - assert_eq!(interner.buffer.capacity(), 64); 206 - assert_eq!(interner.stored.len(), 1); 207 - assert_eq!(interner.stored[0].capacity(), 32); 208 - } 209 - 210 - #[test] 211 - fn is_thread_safe() { 212 - let mut interner = Interner::with_capacity(32); 213 - 214 - let texts = [ 215 - "Lorem ipsum", 216 - "dolor sit amet", 217 - "duplicated", 218 - "Other text", 219 - "Elevenses", 220 - "duplicated", 221 - "Gibberish", 222 - ]; 223 - 224 - let interned: Vec<Token> = texts.iter().map(|&text| interner.intern(text)).collect(); 225 - 226 - std::thread::scope(|s| { 227 - s.spawn(move || { 228 - for (id, expected) in interned.into_iter().zip(texts) { 229 - // SAFETY: It comes from the same source 230 - unsafe { 231 - assert_eq!(expected, interner.lookup(id)); 232 - } 233 - } 234 - }); 235 - }); 236 - } 237 - }
+11 -46
crates/nailkov/src/lib.rs
··· 4 4 5 5 mod distribution; 6 6 mod error; 7 - pub mod interner; 8 7 mod token; 9 8 9 + use crossbeam_utils::CachePadded; 10 10 use error::NailError; 11 11 use indexmap::IndexMap; 12 - use interner::Interner; 13 12 use itertools::Itertools; 14 - use nailrng::FastRng; 15 13 use rand::{RngCore, seq::IteratorRandom}; 16 14 use rand_distr::Distribution; 17 15 18 16 use distribution::{TokenWeights, TokenWeightsBuilder}; 19 - use rustc_hash::FxHasher; 17 + use rapidhash::fast::RandomState; 20 18 use token::{Token, TokenPair}; 21 19 use unicode_segmentation::UnicodeSegmentation; 22 20 23 - #[derive(Clone)] 24 - pub struct RandomState { 25 - seed: usize, 26 - } 27 - 28 - impl RandomState { 29 - fn new() -> Self { 30 - let mut rng = FastRng::default(); 31 - 32 - Self { 33 - seed: rng.next_u64() as usize, 34 - } 35 - } 36 - } 37 - 38 - impl Default for RandomState { 39 - fn default() -> Self { 40 - Self::new() 41 - } 42 - } 43 - 44 - impl core::hash::BuildHasher for RandomState { 45 - type Hasher = FxHasher; 46 - 47 - fn build_hasher(&self) -> Self::Hasher { 48 - FxHasher::with_seed(self.seed) 49 - } 50 - } 51 - 52 21 #[derive(Clone, Debug)] 53 22 pub struct NailKov { 54 - chain: IndexMap<TokenPair, TokenWeights, RandomState>, 23 + chain: CachePadded<IndexMap<TokenPair, TokenWeights, RandomState>>, 55 24 } 56 25 57 26 pub struct NailKovIter<'a, R: RngCore> { ··· 89 58 } 90 59 91 60 impl NailKov { 92 - pub fn from_input(interner: &mut Interner, input: &str) -> Result<NailKov, NailError> { 93 - NailBuilder::new(RandomState::new()).with_input(interner, input) 61 + pub fn from_input(input: &str) -> Result<NailKov, NailError> { 62 + NailBuilder::new(RandomState::new()).with_input(input) 94 63 } 95 64 } 96 65 ··· 105 74 } 106 75 } 107 76 108 - fn with_input(self, interned: &mut Interner, input: &str) -> Result<NailKov, NailError> { 109 - self.feed_str(interned, input)?.build() 77 + fn with_input(self, input: &str) -> Result<NailKov, NailError> { 78 + 
self.feed_str(input)?.build() 110 79 } 111 80 112 81 fn build(self) -> Result<NailKov, NailError> { ··· 128 97 return Err(NailError::EmptyInput); 129 98 } 130 99 131 - Ok(NailKov { chain }) 100 + Ok(NailKov { chain: CachePadded::new(chain) }) 132 101 } 133 102 134 103 /// Add the occurrence of `next` following `prev`. ··· 138 107 builder.add(next); 139 108 } 140 109 None => { 141 - let mut builder = TokenWeightsBuilder::new(self.chain.hasher().clone()); 110 + let mut builder = TokenWeightsBuilder::new(); 142 111 builder.add(next); 143 112 self.chain.insert(prev, builder); 144 113 } 145 114 } 146 115 } 147 116 148 - fn feed_str(self, interner: &mut Interner, content: &str) -> Result<Self, NailError> { 149 - self.feed_tokens( 150 - content 151 - .split_word_bounds() 152 - .map(|text| interner.intern(text)), 153 - ) 117 + fn feed_str(self, content: &str) -> Result<Self, NailError> { 118 + self.feed_tokens(content.split_word_bounds().map(Token::from)) 154 119 } 155 120 156 121 fn feed_tokens(mut self, tokens: impl Iterator<Item = Token>) -> Result<Self, NailError> {
+16 -43
crates/nailkov/src/token.rs
··· 1 - use std::ops::Deref; 1 + use estr::Estr; 2 2 3 3 /// Representation of a string segment. 4 4 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] 5 5 #[repr(transparent)] 6 - pub struct Token(u32); 6 + pub struct Token(Estr); 7 7 8 8 impl Token { 9 9 #[inline(always)] 10 - pub const fn new(ptr: u32) -> Self { 11 - Self(ptr) 10 + pub fn new(str: &str) -> Self { 11 + Self(Estr::from(str)) 12 12 } 13 13 14 - #[inline(always)] 15 - pub(crate) const fn index(&self) -> usize { 16 - self.0 as usize 14 + #[inline] 15 + pub fn as_str(&self) -> &'static str { 16 + self.0.as_str() 17 17 } 18 18 19 - #[inline(always)] 20 - const fn to_bits(self) -> u32 { 21 - self.0 19 + #[inline] 20 + pub fn as_bytes(&self) -> &'static [u8] { 21 + self.0.as_str().as_bytes() 22 22 } 23 23 } 24 24 25 - impl Deref for Token { 26 - type Target = u32; 27 - 28 - #[inline] 29 - fn deref(&self) -> &Self::Target { 30 - &self.0 25 + impl From<&str> for Token { 26 + fn from(value: &str) -> Self { 27 + Self::new(value) 31 28 } 32 29 } 33 30 ··· 37 34 // optimized codegen for `to_bits`, `PartialEq` 38 35 // Prior art taken from my contribution to Bevy: 39 36 // https://github.com/bevyengine/bevy/blob/main/crates/bevy_ecs/src/entity/mod.rs#L309 40 - #[repr(C, align(8))] 37 + #[repr(C, align(16))] 41 38 pub struct TokenPair { 42 39 // Do not reorder the fields here. The ordering is explicitly used by repr(C) 43 40 // to mirror the u128 that `to_bits` builds (left in the low 64 bits, right in the high 64 bits). 
··· 76 73 } 77 74 78 75 #[inline(always)] 79 - const fn to_bits(self) -> u64 { 80 - self.left.to_bits() as u64 | ((self.right.to_bits() as u64) << 32) 76 + fn to_bits(self) -> u128 { 77 + (self.left.0.digest().hash() as u128) | ((self.right.0.digest().hash() as u128) << 64) 81 78 } 82 79 } 83 80 ··· 87 84 self 88 85 } 89 86 } 90 - 91 - #[cfg(test)] 92 - mod tests { 93 - use super::*; 94 - 95 - #[test] 96 - fn token_smoke_testing() { 97 - let left = Token(0x2); 98 - let right = Token(0x2b); 99 - 100 - let pair = TokenPair::new(left, right); 101 - 102 - assert_eq!(pair.to_bits(), 0x2b00000002); 103 - assert_eq!(pair.left, left); 104 - assert_eq!(pair.right, right); 105 - 106 - let other_right = Token(0x2c); 107 - 108 - let other_pair = TokenPair::new(left, other_right); 109 - 110 - assert_eq!(other_pair.to_bits(), 0x2c00000002); 111 - assert_ne!(pair, other_pair); 112 - } 113 - }
-2
crates/nailroutes/src/lib.rs
··· 24 24 .into_stream( 25 25 matched, 26 26 config.clone_inner(), 27 - inputs.get_interner(), 28 27 inputs.get_warning_template(), 29 28 rng, 30 29 ) ··· 46 45 .into_stream( 47 46 matched, 48 47 config.clone_inner(), 49 - inputs.get_interner(), 50 48 inputs.get_generated_template(), 51 49 rng, 52 50 )
+3 -23
crates/nailstate/src/lib.rs
··· 3 3 use axum::extract::{FromRef, FromRequestParts}; 4 4 use nailconfig::NailConfig; 5 5 use nailgen::{GeneratedTemplate, MarkovGen, Template, WarningTemplate}; 6 - use nailkov::interner::Interner; 7 6 use nailrng::FastRng; 8 7 use nailspicy::SpicyPayloads; 9 8 use rand::seq::IndexedRandom; ··· 34 33 #[derive(Clone)] 35 34 pub struct NailInputs { 36 35 chains: Arc<[MarkovGen]>, 37 - interner: Arc<Interner>, 38 36 templates: Arc<Templates>, 39 37 } 40 38 41 39 impl NailInputs { 42 - pub fn new( 43 - chains: Arc<[MarkovGen]>, 44 - interner: Arc<Interner>, 45 - templates: Arc<Templates>, 46 - ) -> Self { 47 - Self { 48 - chains, 49 - interner, 50 - templates, 51 - } 40 + pub fn new(chains: Arc<[MarkovGen]>, templates: Arc<Templates>) -> Self { 41 + Self { chains, templates } 52 42 } 53 43 54 44 /// Pulls a random markov chain from the available list. Returns a cloned ··· 64 54 } 65 55 } 66 56 67 - #[inline] 68 - pub fn get_interner(&self) -> Arc<Interner> { 69 - self.interner.clone() 70 - } 71 - 72 57 #[inline] 73 58 pub fn get_warning_template(&self) -> Template { 74 59 Template::from(self.templates.warning.clone()) ··· 121 106 pub fn new( 122 107 config: impl Into<AppConfig>, 123 108 chains: Arc<[MarkovGen]>, 124 - interner: Arc<Interner>, 125 109 templates: Arc<Templates>, 126 110 spicy_payloads: Option<Arc<SpicyPayloads>>, 127 111 ) -> Self { ··· 129 113 130 114 Self { 131 115 config, 132 - inputs: NailInputs { 133 - chains, 134 - interner, 135 - templates, 136 - }, 116 + inputs: NailInputs { chains, templates }, 137 117 spicy_payloads: NailPayloads { spicy_payloads }, 138 118 } 139 119 }
+4 -11
src/inputs.rs
··· 2 2 3 3 use color_eyre::eyre::Context; 4 4 use glob::glob; 5 - use nailbox::{arc_within, try_arc_within}; 5 + use nailbox::try_arc_within; 6 6 use nailconfig::NailConfig; 7 7 use nailgen::{GeneratedTemplate, MarkovGen, WarningTemplate}; 8 - use nailkov::interner::Interner; 9 8 use nailstate::Templates; 10 9 11 10 /// Takes a glob for finding all input files and returns a read-only list of 12 11 /// all markov chains that can be generated. 13 - pub fn get_input_files( 14 - config: &NailConfig, 15 - ) -> color_eyre::Result<(Arc<[MarkovGen]>, Arc<Interner>)> { 16 - let mut interner = arc_within(|| Interner::with_capacity(512)); 17 - 18 - let interned_mut = Arc::get_mut(&mut interner).unwrap(); 19 - 12 + pub fn get_input_files(config: &NailConfig) -> color_eyre::Result<Arc<[MarkovGen]>> { 20 13 let inputs = glob(&config.generator.input_files)? 21 14 .filter_map(|path| { 22 15 path.inspect_err(|err| tracing::error!("IO Error: {err}")) 23 16 .ok() 24 17 }) 25 18 .filter_map(|input| { 26 - MarkovGen::new(input, interned_mut) 19 + MarkovGen::new(input) 27 20 .inspect_err(|err| tracing::error!("Markov Error: {err}")) 28 21 .ok() 29 22 }) ··· 33 26 color_eyre::eyre::bail!("No input files found! Exiting..."); 34 27 } 35 28 36 - Ok((inputs, interner)) 29 + Ok(inputs) 37 30 } 38 31 39 32 pub fn get_template_files(config: &NailConfig) -> color_eyre::Result<Arc<Templates>> {
+2 -2
src/main.rs
··· 34 34 35 35 let config = nailconfig::get_configuration()?; 36 36 37 - let (inputs, interner) = nailpit::inputs::get_input_files(config.as_ref())?; 37 + let inputs = nailpit::inputs::get_input_files(config.as_ref())?; 38 38 39 39 let templates = nailpit::inputs::get_template_files(config.as_ref())?; 40 40 41 41 let spicy = nailspicy::get_spicy_payload(config.as_ref()); 42 42 43 43 nailrt::start( 44 - nailstate::ServerState::new(config, inputs, interner, templates, spicy), 44 + nailstate::ServerState::new(config, inputs, templates, spicy), 45 45 spawn_axum_worker, 46 46 )?; 47 47