feat: Estrisation of Token #4

merged
opened by sachy.dev targeting main from token-refactor
Changed files
+132 -416
.tangled
workflows
crates
nailgen
nailkov
nailroutes
src
nailstate
src
src
+1 -1
.tangled/workflows/miri.yml
··· 16 rustup override set nightly 17 cargo miri setup 18 - name: Miri Test 19 - command: cargo miri test --locked -p nailkov -p nailbox 20 environment: 21 RUSTFLAGS: -Zrandomize-layout
··· 16 rustup override set nightly 17 cargo miri setup 18 - name: Miri Test 19 + command: cargo miri test --locked -p nailbox -p nailgen 20 environment: 21 RUSTFLAGS: -Zrandomize-layout
+71 -7
Cargo.lock
··· 26 "memchr", 27 ] 28 29 [[package]] 30 name = "anyhow" 31 version = "1.0.100" ··· 168 source = "registry+https://github.com/rust-lang/crates.io-index" 169 checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" 170 171 [[package]] 172 name = "bytes" 173 version = "1.11.0" ··· 238 "winapi", 239 ] 240 241 [[package]] 242 name = "diatomic-waker" 243 version = "0.2.3" ··· 256 source = "registry+https://github.com/rust-lang/crates.io-index" 257 checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" 258 259 [[package]] 260 name = "eyre" 261 version = "0.6.12" ··· 290 source = "registry+https://github.com/rust-lang/crates.io-index" 291 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" 292 293 [[package]] 294 name = "futures" 295 version = "0.3.31" ··· 491 version = "0.16.1" 492 source = "registry+https://github.com/rust-lang/crates.io-index" 493 checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" 494 495 [[package]] 496 name = "hermit-abi" ··· 657 source = "registry+https://github.com/rust-lang/crates.io-index" 658 checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 659 660 [[package]] 661 name = "libc" 662 version = "0.2.178" ··· 826 name = "nailkov" 827 version = "0.1.0" 828 dependencies = [ 829 "hashbrown 0.15.5", 830 "indexmap", 831 "itertools", 832 "nailrng", 833 "rand", 834 "rand_distr", 835 "rapidhash", 836 - "rustc-hash", 837 "tracing", 838 "unicode-segmentation", 839 ] ··· 1390 source = "registry+https://github.com/rust-lang/crates.io-index" 1391 checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" 1392 1393 - [[package]] 1394 - name = "rustc-hash" 1395 - version = "2.1.1" 1396 - source = "registry+https://github.com/rust-lang/crates.io-index" 1397 - checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" 1398 - 1399 [[package]] 1400 name = "rustversion" 1401 version = "1.0.22" 1402 source = "registry+https://github.com/rust-lang/crates.io-index" 1403 checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" 1404 1405 [[package]] 1406 name = "ryu" 1407 version = "1.0.20" ··· 1545 version = "0.10.0" 1546 source = "registry+https://github.com/rust-lang/crates.io-index" 1547 checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" 1548 1549 [[package]] 1550 name = "syn"
··· 26 "memchr", 27 ] 28 29 + [[package]] 30 + name = "allocator-api2" 31 + version = "0.2.21" 32 + source = "registry+https://github.com/rust-lang/crates.io-index" 33 + checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" 34 + 35 [[package]] 36 name = "anyhow" 37 version = "1.0.100" ··· 174 source = "registry+https://github.com/rust-lang/crates.io-index" 175 checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" 176 177 + [[package]] 178 + name = "byteorder" 179 + version = "1.5.0" 180 + source = "registry+https://github.com/rust-lang/crates.io-index" 181 + checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 182 + 183 [[package]] 184 name = "bytes" 185 version = "1.11.0" ··· 250 "winapi", 251 ] 252 253 + [[package]] 254 + name = "crossbeam-utils" 255 + version = "0.8.21" 256 + source = "registry+https://github.com/rust-lang/crates.io-index" 257 + checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" 258 + 259 + [[package]] 260 + name = "crossfig" 261 + version = "0.1.3" 262 + source = "registry+https://github.com/rust-lang/crates.io-index" 263 + checksum = "40a998414a3656e7a11ca59d55598ce7df58daafd742e783844e80bbd8d500dd" 264 + 265 [[package]] 266 name = "diatomic-waker" 267 version = "0.2.3" ··· 280 source = "registry+https://github.com/rust-lang/crates.io-index" 281 checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" 282 283 + [[package]] 284 + name = "estr" 285 + version = "1.2.0" 286 + source = "registry+https://github.com/rust-lang/crates.io-index" 287 + checksum = "e2e77c7dfb1a984132a140c98a805b5f12f8e8707420dda9a6ad698bc50fc041" 288 + dependencies = [ 289 + "byteorder", 290 + "crossfig", 291 + "hashbrown 0.16.1", 292 + "libabort", 293 + "lock_api", 294 + "rapidhash", 295 + "spin", 296 + ] 297 + 298 [[package]] 299 name = "eyre" 300 version = "0.6.12" ··· 329 source = "registry+https://github.com/rust-lang/crates.io-index" 330 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" 331 332 + [[package]] 333 + name = "foldhash" 334 + version = "0.2.0" 335 + source = "registry+https://github.com/rust-lang/crates.io-index" 336 + checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" 337 + 338 [[package]] 339 name = "futures" 340 version = "0.3.31" ··· 536 version = "0.16.1" 537 source = "registry+https://github.com/rust-lang/crates.io-index" 538 checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" 539 + dependencies = [ 540 + "allocator-api2", 541 + "equivalent", 542 + "foldhash", 543 + ] 544 545 [[package]] 546 name = "hermit-abi" ··· 707 source = "registry+https://github.com/rust-lang/crates.io-index" 708 checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 709 710 + [[package]] 711 + name = "libabort" 712 + version = "0.1.9" 713 + source = "registry+https://github.com/rust-lang/crates.io-index" 714 + checksum = "cec5d1db7977801dd3e593f88a15906cac3c4eb7a69e38b6cc162cb020b22d7d" 715 + dependencies = [ 716 + "rustversion-detect", 717 + ] 718 + 719 [[package]] 720 name = "libc" 721 version = "0.2.178" ··· 885 name = "nailkov" 886 version = "0.1.0" 887 dependencies = [ 888 + "crossbeam-utils", 889 + "estr", 890 "hashbrown 0.15.5", 891 "indexmap", 892 "itertools", 893 "nailrng", 894 + "parking_lot", 895 "rand", 896 "rand_distr", 897 "rapidhash", 898 "tracing", 899 "unicode-segmentation", 900 ] ··· 1451 source = "registry+https://github.com/rust-lang/crates.io-index" 1452 checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" 1453 1454 [[package]] 1455 name = "rustversion" 1456 version = "1.0.22" 1457 source = "registry+https://github.com/rust-lang/crates.io-index" 1458 checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" 1459 1460 + [[package]] 1461 + name = "rustversion-detect" 1462 + version = "0.1.3" 1463 + source = "registry+https://github.com/rust-lang/crates.io-index" 1464 + checksum = "4cfa9e87e97427c3a1b472eace073b2bc577ad0e1444c128d938b3d5bcdacb17" 1465 + 1466 [[package]] 1467 name = "ryu" 1468 version = "1.0.20" ··· 1606 version = "0.10.0" 1607 source = "registry+https://github.com/rust-lang/crates.io-index" 1608 checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" 1609 + dependencies = [ 1610 + "lock_api", 1611 + ] 1612 1613 [[package]] 1614 name = "syn"
+6 -19
crates/nailgen/src/html_gen.rs
··· 3 use axum::extract::MatchedPath; 4 use bytes::{Bytes, BytesMut}; 5 use nailconfig::NailConfig; 6 - use nailkov::{NailKov, interner::Interner}; 7 use nailrng::FastRng; 8 use rand::{Rng, RngCore, distr::Alphanumeric, seq::IndexedRandom}; 9 ··· 24 /// interned text from the interner. 25 #[inline] 26 pub fn text_generator<'a>( 27 - interner: &'a Interner, 28 chain: &'a NailKov, 29 size: usize, 30 rng: &'a mut impl RngCore, ··· 33 .generate_tokens(rng) 34 .take(size) 35 // SAFETY: The id comes from the same interner that allocated it 36 - .flat_map(|token| unsafe { interner.lookup(token).as_bytes() }) 37 .skip_while(|&text| !text.is_ascii_alphabetic()) 38 } 39 ··· 64 65 pub async fn initial_content( 66 buf_mut: BytesMut, 67 - interner: Arc<Interner>, 68 chain: Arc<NailKov>, 69 config: Arc<NailConfig>, 70 mut rng: FastRng, ··· 75 (0..max_paras) 76 .fold(buf_mut, |mut acc, _| { 77 acc.extend(paragraph( 78 - &interner, 79 &chain, 80 get_desired_size(&config, &mut rng), 81 &mut rng, ··· 88 89 pub async fn main_content( 90 mut buffer: BytesMut, 91 - interner: Arc<Interner>, 92 chain: Arc<NailKov>, 93 config: Arc<NailConfig>, 94 mut rng: FastRng, ··· 96 buffer.reserve(config.generator.chunk_size * 2); 97 98 loop { 99 - buffer.extend(header( 100 - &interner, 101 - &chain, 102 - config.generator.header_size, 103 - &mut rng, 104 - )); 105 106 // Randomise how many paragraphs we want per section 107 let paragraphs = rng.random_range(1..=4); 108 109 (0..paragraphs).for_each(|_| { 110 buffer.extend(paragraph( 111 - &interner, 112 &chain, 113 get_desired_size(&config, &mut rng), 114 &mut rng, ··· 146 147 pub async fn footer( 148 mut buf_mut: BytesMut, 149 - interner: Arc<Interner>, 150 chain: Arc<NailKov>, 151 path: MatchedPath, 152 config: Arc<NailConfig>, ··· 166 buf_mut.extend( 167 b"\">" 168 .iter() 169 - .chain(text_generator(&interner, &chain, 8, &mut rng)) 170 .chain(b"</a></li>\n"), 171 ); 172 } ··· 178 179 #[inline] 180 fn paragraph<'a>( 181 - interner: &'a Interner, 182 chain: &'a NailKov, 183 size: usize, 184 rng: &'a mut impl RngCore, 185 ) -> impl Iterator<Item = &'a u8> + 'a { 186 b"<p>" 187 .iter() 188 - .chain(text_generator(interner, chain, size, rng)) 189 .chain(b"</p>\n") 190 } 191 192 #[inline] 193 fn header<'a>( 194 - interner: &'a Interner, 195 chain: &'a NailKov, 196 size: usize, 197 rng: &'a mut impl RngCore, 198 ) -> impl Iterator<Item = &'a u8> + 'a { 199 b"\n<h2>" 200 .iter() 201 - .chain(text_generator(interner, chain, size, rng)) 202 .chain(b"</h2>\n") 203 }
··· 3 use axum::extract::MatchedPath; 4 use bytes::{Bytes, BytesMut}; 5 use nailconfig::NailConfig; 6 + use nailkov::NailKov; 7 use nailrng::FastRng; 8 use rand::{Rng, RngCore, distr::Alphanumeric, seq::IndexedRandom}; 9 ··· 24 /// interned text from the interner. 25 #[inline] 26 pub fn text_generator<'a>( 27 chain: &'a NailKov, 28 size: usize, 29 rng: &'a mut impl RngCore, ··· 32 .generate_tokens(rng) 33 .take(size) 34 // SAFETY: The id comes from the same interner that allocated it 35 + .flat_map(|token| token.as_bytes()) 36 .skip_while(|&text| !text.is_ascii_alphabetic()) 37 } 38 ··· 63 64 pub async fn initial_content( 65 buf_mut: BytesMut, 66 chain: Arc<NailKov>, 67 config: Arc<NailConfig>, 68 mut rng: FastRng, ··· 73 (0..max_paras) 74 .fold(buf_mut, |mut acc, _| { 75 acc.extend(paragraph( 76 &chain, 77 get_desired_size(&config, &mut rng), 78 &mut rng, ··· 85 86 pub async fn main_content( 87 mut buffer: BytesMut, 88 chain: Arc<NailKov>, 89 config: Arc<NailConfig>, 90 mut rng: FastRng, ··· 92 buffer.reserve(config.generator.chunk_size * 2); 93 94 loop { 95 + buffer.extend(header(&chain, config.generator.header_size, &mut rng)); 96 97 // Randomise how many paragraphs we want per section 98 let paragraphs = rng.random_range(1..=4); 99 100 (0..paragraphs).for_each(|_| { 101 buffer.extend(paragraph( 102 &chain, 103 get_desired_size(&config, &mut rng), 104 &mut rng, ··· 136 137 pub async fn footer( 138 mut buf_mut: BytesMut, 139 chain: Arc<NailKov>, 140 path: MatchedPath, 141 config: Arc<NailConfig>, ··· 155 buf_mut.extend( 156 b"\">" 157 .iter() 158 + .chain(text_generator(&chain, 8, &mut rng)) 159 .chain(b"</a></li>\n"), 160 ); 161 } ··· 167 168 #[inline] 169 fn paragraph<'a>( 170 chain: &'a NailKov, 171 size: usize, 172 rng: &'a mut impl RngCore, 173 ) -> impl Iterator<Item = &'a u8> + 'a { 174 b"<p>" 175 .iter() 176 + .chain(text_generator(chain, size, rng)) 177 .chain(b"</p>\n") 178 } 179 180 #[inline] 181 fn header<'a>( 182 chain: &'a NailKov, 183 size: usize, 184 rng: &'a mut impl RngCore, 185 ) -> impl Iterator<Item = &'a u8> + 'a { 186 b"\n<h2>" 187 .iter() 188 + .chain(text_generator(chain, size, rng)) 189 .chain(b"</h2>\n") 190 }
+7 -19
crates/nailgen/src/lib.rs
··· 16 use futures_lite::Stream; 17 use nailbox::{boxed_future_within, try_arc_within}; 18 use nailconfig::NailConfig; 19 - use nailkov::{NailKov, interner::Interner}; 20 use nailrng::FastRng; 21 use pin_project_lite::pin_project; 22 use tokio::time::Sleep; ··· 59 pub struct MarkovStream { 60 path: MatchedPath, 61 config: Arc<NailConfig>, 62 - interner: Arc<Interner>, 63 markov: MarkovGen, 64 start_time: Instant, 65 total_bytes: usize, ··· 77 markov: MarkovGen, 78 path: MatchedPath, 79 config: Arc<NailConfig>, 80 - interner: Arc<Interner>, 81 template: Template, 82 rng: FastRng, 83 ) -> Self { 84 Self { 85 path, 86 config, 87 - interner, 88 markov, 89 total_bytes: 0, 90 start_time: Instant::now(), ··· 121 let title = this.page_title.get_or_insert_with(|| { 122 this.template.get_static_content().map_or_else( 123 || { 124 - text_generator( 125 - this.interner, 126 - &this.markov.chain, 127 - 24, 128 - this.rng, 129 - ) 130 - .copied() 131 - .collect() 132 }, 133 |title| static_title(title).copied().collect(), 134 ) ··· 144 let handle = boxed_future_within(|| { 145 initial_content( 146 buffer, 147 - this.interner.clone(), 148 this.markov.chain.clone(), 149 this.config.clone(), 150 this.rng.fork(), ··· 188 let handle = boxed_future_within(|| { 189 footer( 190 buffer, 191 - this.interner.clone(), 192 this.markov.chain.clone(), 193 this.path.clone(), 194 this.config.clone(), ··· 237 let handle = boxed_future_within(|| { 238 main_content( 239 buffer, 240 - this.interner.clone(), 241 this.markov.chain.clone(), 242 this.config.clone(), 243 this.rng.fork(), ··· 287 } 288 289 impl MarkovGen { 290 - pub fn new(input: impl AsRef<Path>, interner: &mut Interner) -> Result<Self> { 291 let file = std::fs::read_to_string(input.as_ref())?; 292 293 - let chain = try_arc_within(|| NailKov::from_input(interner, &file))?; 294 295 Ok(Self { chain }) 296 } ··· 300 self, 301 path: MatchedPath, 302 config: Arc<NailConfig>, 303 - interner: Arc<Interner>, 304 template: Template, 305 rng: FastRng, 306 ) -> MarkovStream { 307 - MarkovStream::new(self, path, config, interner, template, rng) 308 } 309 }
··· 16 use futures_lite::Stream; 17 use nailbox::{boxed_future_within, try_arc_within}; 18 use nailconfig::NailConfig; 19 + use nailkov::NailKov; 20 use nailrng::FastRng; 21 use pin_project_lite::pin_project; 22 use tokio::time::Sleep; ··· 59 pub struct MarkovStream { 60 path: MatchedPath, 61 config: Arc<NailConfig>, 62 markov: MarkovGen, 63 start_time: Instant, 64 total_bytes: usize, ··· 76 markov: MarkovGen, 77 path: MatchedPath, 78 config: Arc<NailConfig>, 79 template: Template, 80 rng: FastRng, 81 ) -> Self { 82 Self { 83 path, 84 config, 85 markov, 86 total_bytes: 0, 87 start_time: Instant::now(), ··· 118 let title = this.page_title.get_or_insert_with(|| { 119 this.template.get_static_content().map_or_else( 120 || { 121 + text_generator(&this.markov.chain, 24, this.rng) 122 + .copied() 123 + .collect() 124 }, 125 |title| static_title(title).copied().collect(), 126 ) ··· 136 let handle = boxed_future_within(|| { 137 initial_content( 138 buffer, 139 this.markov.chain.clone(), 140 this.config.clone(), 141 this.rng.fork(), ··· 179 let handle = boxed_future_within(|| { 180 footer( 181 buffer, 182 this.markov.chain.clone(), 183 this.path.clone(), 184 this.config.clone(), ··· 227 let handle = boxed_future_within(|| { 228 main_content( 229 buffer, 230 this.markov.chain.clone(), 231 this.config.clone(), 232 this.rng.fork(), ··· 276 } 277 278 impl MarkovGen { 279 + pub fn new(input: impl AsRef<Path>) -> Result<Self> { 280 let file = std::fs::read_to_string(input.as_ref())?; 281 282 + let chain = try_arc_within(|| NailKov::from_input(&file))?; 283 284 Ok(Self { chain }) 285 } ··· 289 self, 290 path: MatchedPath, 291 config: Arc<NailConfig>, 292 template: Template, 293 rng: FastRng, 294 ) -> MarkovStream { 295 + MarkovStream::new(self, path, config, template, rng) 296 } 297 }
+3 -1
crates/nailkov/Cargo.toml
··· 13 [dependencies] 14 nailrng = { path = "../nailrng" } 15 rapidhash = { workspace = true, features = ["rand", "unsafe"] } 16 - rustc-hash = { version = "2.1.1", default-features = false } 17 hashbrown.workspace = true 18 rand.workspace = true 19 rand_distr.workspace = true ··· 21 unicode-segmentation.workspace = true 22 tracing.workspace = true 23 indexmap.workspace = true
··· 13 [dependencies] 14 nailrng = { path = "../nailrng" } 15 rapidhash = { workspace = true, features = ["rand", "unsafe"] } 16 hashbrown.workspace = true 17 rand.workspace = true 18 rand_distr.workspace = true ··· 20 unicode-segmentation.workspace = true 21 tracing.workspace = true 22 indexmap.workspace = true 23 + parking_lot.workspace = true 24 + estr = "1.2.0" 25 + crossbeam-utils = "0.8.21"
+8 -5
crates/nailkov/src/distribution.rs
··· 1 //! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with 2 //! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov). 3 4 use indexmap::IndexMap; 5 use rand::Rng; 6 use rand_distr::{Distribution, weighted::WeightedAliasIndex}; 7 8 - use crate::{RandomState, error::NailError, token::Token}; 9 10 /// A distribution of choices and their likelihood. 11 #[derive(Clone, Debug)] ··· 29 #[derive(Clone, Debug)] 30 pub struct TokenWeightsBuilder { 31 /// Counts how many times a token is likely to appear. 32 - occurrences: IndexMap<Token, u32, RandomState>, 33 } 34 35 impl TokenWeightsBuilder { 36 - pub fn new(hasher: RandomState) -> Self { 37 Self { 38 - occurrences: IndexMap::with_hasher(hasher), 39 } 40 } 41 ··· 64 65 impl Default for TokenWeightsBuilder { 66 fn default() -> Self { 67 - Self::new(RandomState::new()) 68 } 69 }
··· 1 //! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with 2 //! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov). 3 4 + use core::hash::BuildHasherDefault; 5 + 6 + use estr::IdentityHasher; 7 use indexmap::IndexMap; 8 use rand::Rng; 9 use rand_distr::{Distribution, weighted::WeightedAliasIndex}; 10 11 + use crate::{error::NailError, token::Token}; 12 13 /// A distribution of choices and their likelihood. 14 #[derive(Clone, Debug)] ··· 32 #[derive(Clone, Debug)] 33 pub struct TokenWeightsBuilder { 34 /// Counts how many times a token is likely to appear. 35 + occurrences: IndexMap<Token, u32, BuildHasherDefault<IdentityHasher>>, 36 } 37 38 impl TokenWeightsBuilder { 39 + pub fn new() -> Self { 40 Self { 41 + occurrences: IndexMap::with_hasher(Default::default()), 42 } 43 } 44 ··· 67 68 impl Default for TokenWeightsBuilder { 69 fn default() -> Self { 70 + Self::new() 71 } 72 }
-237
crates/nailkov/src/interner.rs
··· 1 - use hashbrown::{Equivalent, HashMap}; 2 - use rapidhash::fast::RandomState; 3 - 4 - use crate::token::Token; 5 - 6 - #[derive(Debug, Clone, Copy, PartialEq, Eq)] 7 - #[repr(transparent)] 8 - struct StringPtr(*const str); 9 - 10 - impl StringPtr { 11 - #[inline(always)] 12 - const fn cast(&self) -> &str { 13 - // SAFETY: The pointer is stable as it points to memory that is never 14 - // moved/invalidated while this struct lives, therefore can be safely 15 - // dereferenced back to a string slice. We own the String instance this 16 - // references, and all StringPtrs are used within the same scope as the 17 - // String instances, so when String drops, these will be dropped too. 18 - unsafe { &*self.0 } 19 - } 20 - } 21 - 22 - impl core::hash::Hash for StringPtr { 23 - #[inline] 24 - fn hash<H: std::hash::Hasher>(&self, state: &mut H) { 25 - self.cast().hash(state); 26 - } 27 - } 28 - // SAFETY: StringPtr contains a ptr to the heap, that is never moved or invalidated 29 - // while Interner lives, and all instances of StringPtr live as long as Interner. 30 - // Since the String type is `Send`, so is StringPtr 31 - unsafe impl Send for StringPtr {} 32 - // SAFETY: StringPtr contains a ptr to the heap, that is never moved or invalidated 33 - // while Interner lives, and all instances of StringPtr live as long as Interner. 34 - // Since the String type is `Sync`, so is StringPtr 35 - unsafe impl Sync for StringPtr {} 36 - 37 - #[derive(Debug, Clone)] 38 - pub struct Interner { 39 - collected: HashMap<StringPtr, Token, RandomState>, 40 - index: Vec<StringPtr>, 41 - buffer: String, 42 - stored: Vec<String>, 43 - } 44 - 45 - impl Default for Interner { 46 - fn default() -> Self { 47 - Self::with_capacity(256) 48 - } 49 - } 50 - 51 - impl Interner { 52 - /// # Safety 53 - /// The caller must ensure that the [`Token`] being passed in was allocated 54 - /// from the same [`Interner`] instance. 55 - #[inline(always)] 56 - pub unsafe fn lookup(&self, id: Token) -> &str { 57 - // SAFETY: Safety is upheld by the caller ensuring the id was allocated 58 - // from the same interner. 59 - unsafe { self.index.get_unchecked(id.index()).cast() } 60 - } 61 - 62 - pub fn with_capacity(cap: usize) -> Interner { 63 - // This will get us just under 64KiB of interned storage before we 64 - // need to allocate more space for buffer storage. 65 - let stored = Vec::with_capacity(8); 66 - 67 - Interner { 68 - collected: HashMap::with_hasher(RandomState::new()), 69 - index: Vec::new(), 70 - stored, 71 - buffer: String::with_capacity(cap.next_power_of_two()), 72 - } 73 - } 74 - 75 - pub fn intern(&mut self, text: &str) -> Token { 76 - if let Some(&id) = self.collected.get(text) { 77 - return id; 78 - } 79 - 80 - // SAFETY: `alloc`` is never called elsewhere, nor the properties it controls 81 - // are modified outside of the method. Here we get a new StringPtr for `text` that 82 - // hasn't been stored before. 83 - let name = unsafe { self.alloc(text) }; 84 - let id = Token::new(self.index.len() as u32); 85 - self.collected.insert(name, id); 86 - self.index.push(name); 87 - 88 - // SAFETY: We are using the id allocated within the same function scope, 89 - // so it is always from the same source. 90 - unsafe { 91 - debug_assert!(self.lookup(id).equivalent(&name)); 92 - } 93 - debug_assert!(self.intern(name.cast()) == id); 94 - 95 - id 96 - } 97 - 98 - /// Allocates a new [`StringPtr`] for the given string input. If there is no more room 99 - /// in the current buffer, it allocates a new buffer and creates the StringPtr to reference 100 - /// the stored string in the new buffer, storing the old one. 101 - /// 102 - /// # Safety 103 - /// 104 - /// The caller must ensure that `self.buffers` and `self.active` are never modified elsewhere, 105 - /// and that this is called only for new instances of `text`. 106 - unsafe fn alloc(&mut self, text: &str) -> StringPtr { 107 - let capacity = self.buffer.capacity(); 108 - 109 - if capacity < self.buffer.len() + text.len() { 110 - // If we ran out of capacity in our storage, allocate a new buffer with 111 - // larger capacity. 112 - let new_cap = (capacity.max(text.len()) + 1).next_power_of_two(); 113 - let old_buf = core::mem::replace(&mut self.buffer, String::with_capacity(new_cap)); 114 - 115 - self.stored.push(old_buf); 116 - } 117 - 118 - // Construct raw str slice to eliminate lifetime tracking as we manage its 119 - // lifetime within the Interner instance. 120 - let interned = { 121 - let start = self.buffer.len(); 122 - self.buffer.push_str(text); 123 - 124 - &raw const self.buffer[start..] 125 - }; 126 - 127 - StringPtr(interned) 128 - } 129 - } 130 - 131 - impl Equivalent<StringPtr> for str { 132 - #[inline(always)] 133 - fn equivalent(&self, key: &StringPtr) -> bool { 134 - key.cast().eq(self) 135 - } 136 - } 137 - 138 - #[cfg(test)] 139 - mod tests { 140 - use super::*; 141 - 142 - #[test] 143 - fn string_ptr_comparisons() { 144 - let one = "one"; 145 - let two = "two"; 146 - 147 - let one_ptr = StringPtr(one); 148 - let two_ptr = StringPtr(two); 149 - 150 - assert_ne!(one_ptr, two_ptr); 151 - 152 - assert!(one.equivalent(&one_ptr)); 153 - } 154 - 155 - #[test] 156 - fn is_able_to_intern_one_string() { 157 - let mut interner = Interner::default(); 158 - 159 - assert!(interner.buffer.is_empty()); 160 - 161 - let text = "Lorem ipsum"; 162 - 163 - let id = interner.intern(text); 164 - 165 - // SAFETY: It comes from the same source 166 - unsafe { 167 - assert_eq!(text, interner.lookup(id)); 168 - } 169 - assert_eq!(interner.buffer.len(), 11); 170 - 171 - let again = interner.intern(text); 172 - 173 - assert_eq!(id, again); 174 - assert_eq!(interner.buffer.len(), 11); 175 - } 176 - 177 - #[test] 178 - fn is_able_to_intern_many_strings() { 179 - let mut interner = Interner::with_capacity(32); 180 - 181 - let texts = [ 182 - "Lorem ipsum", 183 - "dolor sit amet", 184 - "duplicated", 185 - "Other text", 186 - "Elevenses", 187 - "duplicated", 188 - "Gibberish", 189 - ]; 190 - 191 - let interned: Vec<Token> = texts.iter().map(|&text| interner.intern(text)).collect(); 192 - 193 - assert_eq!( 194 - interned.as_slice(), 195 - &[ 196 - Token::new(0), 197 - Token::new(1), 198 - Token::new(2), 199 - Token::new(3), 200 - Token::new(4), 201 - Token::new(2), 202 - Token::new(5) 203 - ] 204 - ); 205 - assert_eq!(interner.buffer.capacity(), 64); 206 - assert_eq!(interner.stored.len(), 1); 207 - assert_eq!(interner.stored[0].capacity(), 32); 208 - } 209 - 210 - #[test] 211 - fn is_thread_safe() { 212 - let mut interner = Interner::with_capacity(32); 213 - 214 - let texts = [ 215 - "Lorem ipsum", 216 - "dolor sit amet", 217 - "duplicated", 218 - "Other text", 219 - "Elevenses", 220 - "duplicated", 221 - "Gibberish", 222 - ]; 223 - 224 - let interned: Vec<Token> = texts.iter().map(|&text| interner.intern(text)).collect(); 225 - 226 - std::thread::scope(|s| { 227 - s.spawn(move || { 228 - for (id, expected) in interned.into_iter().zip(texts) { 229 - // SAFETY: It comes from the same source 230 - unsafe { 231 - assert_eq!(expected, interner.lookup(id)); 232 - } 233 - } 234 - }); 235 - }); 236 - } 237 - }
···
+11 -46
crates/nailkov/src/lib.rs
··· 4 5 mod distribution; 6 mod error; 7 - pub mod interner; 8 mod token; 9 10 use error::NailError; 11 use indexmap::IndexMap; 12 - use interner::Interner; 13 use itertools::Itertools; 14 - use nailrng::FastRng; 15 use rand::{RngCore, seq::IteratorRandom}; 16 use rand_distr::Distribution; 17 18 use distribution::{TokenWeights, TokenWeightsBuilder}; 19 - use rustc_hash::FxHasher; 20 use token::{Token, TokenPair}; 21 use unicode_segmentation::UnicodeSegmentation; 22 23 - #[derive(Clone)] 24 - pub struct RandomState { 25 - seed: usize, 26 - } 27 - 28 - impl RandomState { 29 - fn new() -> Self { 30 - let mut rng = FastRng::default(); 31 - 32 - Self { 33 - seed: rng.next_u64() as usize, 34 - } 35 - } 36 - } 37 - 38 - impl Default for RandomState { 39 - fn default() -> Self { 40 - Self::new() 41 - } 42 - } 43 - 44 - impl core::hash::BuildHasher for RandomState { 45 - type Hasher = FxHasher; 46 - 47 - fn build_hasher(&self) -> Self::Hasher { 48 - FxHasher::with_seed(self.seed) 49 - } 50 - } 51 - 52 #[derive(Clone, Debug)] 53 pub struct NailKov { 54 - chain: IndexMap<TokenPair, TokenWeights, RandomState>, 55 } 56 57 pub struct NailKovIter<'a, R: RngCore> { ··· 89 } 90 91 impl NailKov { 92 - pub fn from_input(interner: &mut Interner, input: &str) -> Result<NailKov, NailError> { 93 - NailBuilder::new(RandomState::new()).with_input(interner, input) 94 } 95 } 96 ··· 105 } 106 } 107 108 - fn with_input(self, interned: &mut Interner, input: &str) -> Result<NailKov, NailError> { 109 - self.feed_str(interned, input)?.build() 110 } 111 112 fn build(self) -> Result<NailKov, NailError> { ··· 128 return Err(NailError::EmptyInput); 129 } 130 131 - Ok(NailKov { chain }) 132 } 133 134 /// Add the occurrence of `next` following `prev`. ··· 138 builder.add(next); 139 } 140 None => { 141 - let mut builder = TokenWeightsBuilder::new(self.chain.hasher().clone()); 142 builder.add(next); 143 self.chain.insert(prev, builder); 144 } 145 } 146 } 147 148 - fn feed_str(self, interner: &mut Interner, content: &str) -> Result<Self, NailError> { 149 - self.feed_tokens( 150 - content 151 - .split_word_bounds() 152 - .map(|text| interner.intern(text)), 153 - ) 154 } 155 156 fn feed_tokens(mut self, tokens: impl Iterator<Item = Token>) -> Result<Self, NailError> {
··· 4 5 mod distribution; 6 mod error; 7 mod token; 8 9 + use crossbeam_utils::CachePadded; 10 use error::NailError; 11 use indexmap::IndexMap; 12 use itertools::Itertools; 13 use rand::{RngCore, seq::IteratorRandom}; 14 use rand_distr::Distribution; 15 16 use distribution::{TokenWeights, TokenWeightsBuilder}; 17 + use rapidhash::fast::RandomState; 18 use token::{Token, TokenPair}; 19 use unicode_segmentation::UnicodeSegmentation; 20 21 #[derive(Clone, Debug)] 22 pub struct NailKov { 23 + chain: CachePadded<IndexMap<TokenPair, TokenWeights, RandomState>>, 24 } 25 26 pub struct NailKovIter<'a, R: RngCore> { ··· 58 } 59 60 impl NailKov { 61 + pub fn from_input(input: &str) -> Result<NailKov, NailError> { 62 + NailBuilder::new(RandomState::new()).with_input(input) 63 } 64 } 65 ··· 74 } 75 } 76 77 + fn with_input(self, input: &str) -> Result<NailKov, NailError> { 78 + self.feed_str(input)?.build() 79 } 80 81 fn build(self) -> Result<NailKov, NailError> { ··· 97 return Err(NailError::EmptyInput); 98 } 99 100 + Ok(NailKov { chain: CachePadded::new(chain) }) 101 } 102 103 /// Add the occurrence of `next` following `prev`. ··· 107 builder.add(next); 108 } 109 None => { 110 + let mut builder = TokenWeightsBuilder::new(); 111 builder.add(next); 112 self.chain.insert(prev, builder); 113 } 114 } 115 } 116 117 + fn feed_str(self, content: &str) -> Result<Self, NailError> { 118 + self.feed_tokens(content.split_word_bounds().map(Token::from)) 119 } 120 121 fn feed_tokens(mut self, tokens: impl Iterator<Item = Token>) -> Result<Self, NailError> {
+16 -43
crates/nailkov/src/token.rs
··· 1 - use std::ops::Deref; 2 3 /// Representation of a string segment. 4 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] 5 #[repr(transparent)] 6 - pub struct Token(u32); 7 8 impl Token { 9 #[inline(always)] 10 - pub const fn new(ptr: u32) -> Self { 11 - Self(ptr) 12 } 13 14 - #[inline(always)] 15 - pub(crate) const fn index(&self) -> usize { 16 - self.0 as usize 17 } 18 19 - #[inline(always)] 20 - const fn to_bits(self) -> u32 { 21 - self.0 22 } 23 } 24 25 - impl Deref for Token { 26 - type Target = u32; 27 - 28 - #[inline] 29 - fn deref(&self) -> &Self::Target { 30 - &self.0 31 } 32 } 33 ··· 37 // optimized codegen for `to_bits`, `PartialEq` 38 // Prior art taken from my contribution to Bevy: 39 // https://github.com/bevyengine/bevy/blob/main/crates/bevy_ecs/src/entity/mod.rs#L309 40 - #[repr(C, align(8))] 41 pub struct TokenPair { 42 // Do not reorder the fields here. The ordering is explicitly used by repr(C) 43 // to make this struct equivalent to a u64. ··· 76 } 77 78 #[inline(always)] 79 - const fn to_bits(self) -> u64 { 80 - self.left.to_bits() as u64 | ((self.right.to_bits() as u64) << 32) 81 } 82 } 83 ··· 87 self 88 } 89 } 90 - 91 - #[cfg(test)] 92 - mod tests { 93 - use super::*; 94 - 95 - #[test] 96 - fn token_smoke_testing() { 97 - let left = Token(0x2); 98 - let right = Token(0x2b); 99 - 100 - let pair = TokenPair::new(left, right); 101 - 102 - assert_eq!(pair.to_bits(), 0x2b00000002); 103 - assert_eq!(pair.left, left); 104 - assert_eq!(pair.right, right); 105 - 106 - let other_right = Token(0x2c); 107 - 108 - let other_pair = TokenPair::new(left, other_right); 109 - 110 - assert_eq!(other_pair.to_bits(), 0x2c00000002); 111 - assert_ne!(pair, other_pair); 112 - } 113 - }
··· 1 + use estr::Estr; 2 3 /// Representation of a string segment. 4 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] 5 #[repr(transparent)] 6 + pub struct Token(Estr); 7 8 impl Token { 9 #[inline(always)] 10 + pub fn new(str: &str) -> Self { 11 + Self(Estr::from(str)) 12 } 13 14 + #[inline] 15 + pub fn as_str(&self) -> &'static str { 16 + self.0.as_str() 17 } 18 19 + #[inline] 20 + pub fn as_bytes(&self) -> &'static [u8] { 21 + self.0.as_str().as_bytes() 22 } 23 } 24 25 + impl From<&str> for Token { 26 + fn from(value: &str) -> Self { 27 + Self::new(value) 28 } 29 } 30 ··· 34 // optimized codegen for `to_bits`, `PartialEq` 35 // Prior art taken from my contribution to Bevy: 36 // https://github.com/bevyengine/bevy/blob/main/crates/bevy_ecs/src/entity/mod.rs#L309 37 + #[repr(C, align(16))] 38 pub struct TokenPair { 39 // Do not reorder the fields here. The ordering is explicitly used by repr(C) 40 // to make this struct equivalent to a u64. ··· 73 } 74 75 #[inline(always)] 76 + fn to_bits(self) -> u128 { 77 + (self.left.0.digest().hash() as u128) | ((self.right.0.digest().hash() as u128) << 64) 78 } 79 } 80 ··· 84 self 85 } 86 }
-2
crates/nailroutes/src/lib.rs
··· 24 .into_stream( 25 matched, 26 config.clone_inner(), 27 - inputs.get_interner(), 28 inputs.get_warning_template(), 29 rng, 30 ) ··· 46 .into_stream( 47 matched, 48 config.clone_inner(), 49 - inputs.get_interner(), 50 inputs.get_generated_template(), 51 rng, 52 )
··· 24 .into_stream( 25 matched, 26 config.clone_inner(), 27 inputs.get_warning_template(), 28 rng, 29 ) ··· 45 .into_stream( 46 matched, 47 config.clone_inner(), 48 inputs.get_generated_template(), 49 rng, 50 )
+3 -23
crates/nailstate/src/lib.rs
··· 3 use axum::extract::{FromRef, FromRequestParts}; 4 use nailconfig::NailConfig; 5 use nailgen::{GeneratedTemplate, MarkovGen, Template, WarningTemplate}; 6 - use nailkov::interner::Interner; 7 use nailrng::FastRng; 8 use nailspicy::SpicyPayloads; 9 use rand::seq::IndexedRandom; ··· 34 #[derive(Clone)] 35 pub struct NailInputs { 36 chains: Arc<[MarkovGen]>, 37 - interner: Arc<Interner>, 38 templates: Arc<Templates>, 39 } 40 41 impl NailInputs { 42 - pub fn new( 43 - chains: Arc<[MarkovGen]>, 44 - interner: Arc<Interner>, 45 - templates: Arc<Templates>, 46 - ) -> Self { 47 - Self { 48 - chains, 49 - interner, 50 - templates, 51 - } 52 } 53 54 /// Pulls a random markov chain from the available list. Returns a cloned ··· 64 } 65 } 66 67 - #[inline] 68 - pub fn get_interner(&self) -> Arc<Interner> { 69 - self.interner.clone() 70 - } 71 - 72 #[inline] 73 pub fn get_warning_template(&self) -> Template { 74 Template::from(self.templates.warning.clone()) ··· 121 pub fn new( 122 config: impl Into<AppConfig>, 123 chains: Arc<[MarkovGen]>, 124 - interner: Arc<Interner>, 125 templates: Arc<Templates>, 126 spicy_payloads: Option<Arc<SpicyPayloads>>, 127 ) -> Self { ··· 129 130 Self { 131 config, 132 - inputs: NailInputs { 133 - chains, 134 - interner, 135 - templates, 136 - }, 137 spicy_payloads: NailPayloads { spicy_payloads }, 138 } 139 }
··· 3 use axum::extract::{FromRef, FromRequestParts}; 4 use nailconfig::NailConfig; 5 use nailgen::{GeneratedTemplate, MarkovGen, Template, WarningTemplate}; 6 use nailrng::FastRng; 7 use nailspicy::SpicyPayloads; 8 use rand::seq::IndexedRandom; ··· 33 #[derive(Clone)] 34 pub struct NailInputs { 35 chains: Arc<[MarkovGen]>, 36 templates: Arc<Templates>, 37 } 38 39 impl NailInputs { 40 + pub fn new(chains: Arc<[MarkovGen]>, templates: Arc<Templates>) -> Self { 41 + Self { chains, templates } 42 } 43 44 /// Pulls a random markov chain from the available list. Returns a cloned ··· 54 } 55 } 56 57 #[inline] 58 pub fn get_warning_template(&self) -> Template { 59 Template::from(self.templates.warning.clone()) ··· 106 pub fn new( 107 config: impl Into<AppConfig>, 108 chains: Arc<[MarkovGen]>, 109 templates: Arc<Templates>, 110 spicy_payloads: Option<Arc<SpicyPayloads>>, 111 ) -> Self { ··· 113 114 Self { 115 config, 116 + inputs: NailInputs { chains, templates }, 117 spicy_payloads: NailPayloads { spicy_payloads }, 118 } 119 }
+4 -11
src/inputs.rs
··· 2 3 use color_eyre::eyre::Context; 4 use glob::glob; 5 - use nailbox::{arc_within, try_arc_within}; 6 use nailconfig::NailConfig; 7 use nailgen::{GeneratedTemplate, MarkovGen, WarningTemplate}; 8 - use nailkov::interner::Interner; 9 use nailstate::Templates; 10 11 /// Takes a glob for finding all input files and returns a read-only list of 12 /// all markov chains that can be generated. 13 - pub fn get_input_files( 14 - config: &NailConfig, 15 - ) -> color_eyre::Result<(Arc<[MarkovGen]>, Arc<Interner>)> { 16 - let mut interner = arc_within(|| Interner::with_capacity(512)); 17 - 18 - let interned_mut = Arc::get_mut(&mut interner).unwrap(); 19 - 20 let inputs = glob(&config.generator.input_files)? 21 .filter_map(|path| { 22 path.inspect_err(|err| tracing::error!("IO Error: {err}")) 23 .ok() 24 }) 25 .filter_map(|input| { 26 - MarkovGen::new(input, interned_mut) 27 .inspect_err(|err| tracing::error!("Markov Error: {err}")) 28 .ok() 29 }) ··· 33 color_eyre::eyre::bail!("No input files found! Exiting..."); 34 } 35 36 - Ok((inputs, interner)) 37 } 38 39 pub fn get_template_files(config: &NailConfig) -> color_eyre::Result<Arc<Templates>> {
··· 2 3 use color_eyre::eyre::Context; 4 use glob::glob; 5 + use nailbox::try_arc_within; 6 use nailconfig::NailConfig; 7 use nailgen::{GeneratedTemplate, MarkovGen, WarningTemplate}; 8 use nailstate::Templates; 9 10 /// Takes a glob for finding all input files and returns a read-only list of 11 /// all markov chains that can be generated. 12 + pub fn get_input_files(config: &NailConfig) -> color_eyre::Result<Arc<[MarkovGen]>> { 13 let inputs = glob(&config.generator.input_files)? 14 .filter_map(|path| { 15 path.inspect_err(|err| tracing::error!("IO Error: {err}")) 16 .ok() 17 }) 18 .filter_map(|input| { 19 + MarkovGen::new(input) 20 .inspect_err(|err| tracing::error!("Markov Error: {err}")) 21 .ok() 22 }) ··· 26 color_eyre::eyre::bail!("No input files found! Exiting..."); 27 } 28 29 + Ok(inputs) 30 } 31 32 pub fn get_template_files(config: &NailConfig) -> color_eyre::Result<Arc<Templates>> {
+2 -2
src/main.rs
··· 34 35 let config = nailconfig::get_configuration()?; 36 37 - let (inputs, interner) = nailpit::inputs::get_input_files(config.as_ref())?; 38 39 let templates = nailpit::inputs::get_template_files(config.as_ref())?; 40 41 let spicy = nailspicy::get_spicy_payload(config.as_ref()); 42 43 nailrt::start( 44 - nailstate::ServerState::new(config, inputs, interner, templates, spicy), 45 spawn_axum_worker, 46 )?; 47
··· 34 35 let config = nailconfig::get_configuration()?; 36 37 + let inputs = nailpit::inputs::get_input_files(config.as_ref())?; 38 39 let templates = nailpit::inputs::get_template_files(config.as_ref())?; 40 41 let spicy = nailspicy::get_spicy_payload(config.as_ref()); 42 43 nailrt::start( 44 + nailstate::ServerState::new(config, inputs, templates, spicy), 45 spawn_axum_worker, 46 )?; 47