+1
-1
.tangled/workflows/miri.yml
+1
-1
.tangled/workflows/miri.yml
+71
-7
Cargo.lock
+71
-7
Cargo.lock
···
26
26
"memchr",
27
27
]
28
28
29
+
[[package]]
30
+
name = "allocator-api2"
31
+
version = "0.2.21"
32
+
source = "registry+https://github.com/rust-lang/crates.io-index"
33
+
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
34
+
29
35
[[package]]
30
36
name = "anyhow"
31
37
version = "1.0.100"
···
168
174
source = "registry+https://github.com/rust-lang/crates.io-index"
169
175
checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
170
176
177
+
[[package]]
178
+
name = "byteorder"
179
+
version = "1.5.0"
180
+
source = "registry+https://github.com/rust-lang/crates.io-index"
181
+
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
182
+
171
183
[[package]]
172
184
name = "bytes"
173
185
version = "1.11.0"
···
238
250
"winapi",
239
251
]
240
252
253
+
[[package]]
254
+
name = "crossbeam-utils"
255
+
version = "0.8.21"
256
+
source = "registry+https://github.com/rust-lang/crates.io-index"
257
+
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
258
+
259
+
[[package]]
260
+
name = "crossfig"
261
+
version = "0.1.3"
262
+
source = "registry+https://github.com/rust-lang/crates.io-index"
263
+
checksum = "40a998414a3656e7a11ca59d55598ce7df58daafd742e783844e80bbd8d500dd"
264
+
241
265
[[package]]
242
266
name = "diatomic-waker"
243
267
version = "0.2.3"
···
256
280
source = "registry+https://github.com/rust-lang/crates.io-index"
257
281
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
258
282
283
+
[[package]]
284
+
name = "estr"
285
+
version = "1.2.0"
286
+
source = "registry+https://github.com/rust-lang/crates.io-index"
287
+
checksum = "e2e77c7dfb1a984132a140c98a805b5f12f8e8707420dda9a6ad698bc50fc041"
288
+
dependencies = [
289
+
"byteorder",
290
+
"crossfig",
291
+
"hashbrown 0.16.1",
292
+
"libabort",
293
+
"lock_api",
294
+
"rapidhash",
295
+
"spin",
296
+
]
297
+
259
298
[[package]]
260
299
name = "eyre"
261
300
version = "0.6.12"
···
290
329
source = "registry+https://github.com/rust-lang/crates.io-index"
291
330
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
292
331
332
+
[[package]]
333
+
name = "foldhash"
334
+
version = "0.2.0"
335
+
source = "registry+https://github.com/rust-lang/crates.io-index"
336
+
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
337
+
293
338
[[package]]
294
339
name = "futures"
295
340
version = "0.3.31"
···
491
536
version = "0.16.1"
492
537
source = "registry+https://github.com/rust-lang/crates.io-index"
493
538
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
539
+
dependencies = [
540
+
"allocator-api2",
541
+
"equivalent",
542
+
"foldhash",
543
+
]
494
544
495
545
[[package]]
496
546
name = "hermit-abi"
···
657
707
source = "registry+https://github.com/rust-lang/crates.io-index"
658
708
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
659
709
710
+
[[package]]
711
+
name = "libabort"
712
+
version = "0.1.9"
713
+
source = "registry+https://github.com/rust-lang/crates.io-index"
714
+
checksum = "cec5d1db7977801dd3e593f88a15906cac3c4eb7a69e38b6cc162cb020b22d7d"
715
+
dependencies = [
716
+
"rustversion-detect",
717
+
]
718
+
660
719
[[package]]
661
720
name = "libc"
662
721
version = "0.2.178"
···
826
885
name = "nailkov"
827
886
version = "0.1.0"
828
887
dependencies = [
888
+
"crossbeam-utils",
889
+
"estr",
829
890
"hashbrown 0.15.5",
830
891
"indexmap",
831
892
"itertools",
832
893
"nailrng",
894
+
"parking_lot",
833
895
"rand",
834
896
"rand_distr",
835
897
"rapidhash",
836
-
"rustc-hash",
837
898
"tracing",
838
899
"unicode-segmentation",
839
900
]
···
1390
1451
source = "registry+https://github.com/rust-lang/crates.io-index"
1391
1452
checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace"
1392
1453
1393
-
[[package]]
1394
-
name = "rustc-hash"
1395
-
version = "2.1.1"
1396
-
source = "registry+https://github.com/rust-lang/crates.io-index"
1397
-
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
1398
-
1399
1454
[[package]]
1400
1455
name = "rustversion"
1401
1456
version = "1.0.22"
1402
1457
source = "registry+https://github.com/rust-lang/crates.io-index"
1403
1458
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
1404
1459
1460
+
[[package]]
1461
+
name = "rustversion-detect"
1462
+
version = "0.1.3"
1463
+
source = "registry+https://github.com/rust-lang/crates.io-index"
1464
+
checksum = "4cfa9e87e97427c3a1b472eace073b2bc577ad0e1444c128d938b3d5bcdacb17"
1465
+
1405
1466
[[package]]
1406
1467
name = "ryu"
1407
1468
version = "1.0.20"
···
1545
1606
version = "0.10.0"
1546
1607
source = "registry+https://github.com/rust-lang/crates.io-index"
1547
1608
checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591"
1609
+
dependencies = [
1610
+
"lock_api",
1611
+
]
1548
1612
1549
1613
[[package]]
1550
1614
name = "syn"
+1
-1
crates/nailgen/Cargo.toml
+1
-1
crates/nailgen/Cargo.toml
+6
-19
crates/nailgen/src/html_gen.rs
+6
-19
crates/nailgen/src/html_gen.rs
···
3
3
use axum::extract::MatchedPath;
4
4
use bytes::{Bytes, BytesMut};
5
5
use nailconfig::NailConfig;
6
-
use nailkov::{NailKov, interner::Interner};
6
+
use nailkov::NailKov;
7
7
use nailrng::FastRng;
8
8
use rand::{Rng, RngCore, distr::Alphanumeric, seq::IndexedRandom};
9
9
···
24
24
/// interned text from the interner.
25
25
#[inline]
26
26
pub fn text_generator<'a>(
27
-
interner: &'a Interner,
28
27
chain: &'a NailKov,
29
28
size: usize,
30
29
rng: &'a mut impl RngCore,
···
33
32
.generate_tokens(rng)
34
33
.take(size)
35
34
// Each token exposes its own `&'static` bytes, so no interner lookup (and no `unsafe`) is needed
36
-
.flat_map(|token| unsafe { interner.lookup(token).as_bytes() })
35
+
.flat_map(|token| token.as_bytes())
37
36
.skip_while(|&text| !text.is_ascii_alphabetic())
38
37
}
39
38
···
64
63
65
64
pub async fn initial_content(
66
65
buf_mut: BytesMut,
67
-
interner: Arc<Interner>,
68
66
chain: Arc<NailKov>,
69
67
config: Arc<NailConfig>,
70
68
mut rng: FastRng,
···
75
73
(0..max_paras)
76
74
.fold(buf_mut, |mut acc, _| {
77
75
acc.extend(paragraph(
78
-
&interner,
79
76
&chain,
80
77
get_desired_size(&config, &mut rng),
81
78
&mut rng,
···
88
85
89
86
pub async fn main_content(
90
87
mut buffer: BytesMut,
91
-
interner: Arc<Interner>,
92
88
chain: Arc<NailKov>,
93
89
config: Arc<NailConfig>,
94
90
mut rng: FastRng,
···
96
92
buffer.reserve(config.generator.chunk_size * 2);
97
93
98
94
loop {
99
-
buffer.extend(header(
100
-
&interner,
101
-
&chain,
102
-
config.generator.header_size,
103
-
&mut rng,
104
-
));
95
+
buffer.extend(header(&chain, config.generator.header_size, &mut rng));
105
96
106
97
// Randomise how many paragraphs we want per section
107
98
let paragraphs = rng.random_range(1..=4);
108
99
109
100
(0..paragraphs).for_each(|_| {
110
101
buffer.extend(paragraph(
111
-
&interner,
112
102
&chain,
113
103
get_desired_size(&config, &mut rng),
114
104
&mut rng,
···
146
136
147
137
pub async fn footer(
148
138
mut buf_mut: BytesMut,
149
-
interner: Arc<Interner>,
150
139
chain: Arc<NailKov>,
151
140
path: MatchedPath,
152
141
config: Arc<NailConfig>,
···
166
155
buf_mut.extend(
167
156
b"\">"
168
157
.iter()
169
-
.chain(text_generator(&interner, &chain, 8, &mut rng))
158
+
.chain(text_generator(&chain, 8, &mut rng))
170
159
.chain(b"</a></li>\n"),
171
160
);
172
161
}
···
178
167
179
168
#[inline]
180
169
fn paragraph<'a>(
181
-
interner: &'a Interner,
182
170
chain: &'a NailKov,
183
171
size: usize,
184
172
rng: &'a mut impl RngCore,
185
173
) -> impl Iterator<Item = &'a u8> + 'a {
186
174
b"<p>"
187
175
.iter()
188
-
.chain(text_generator(interner, chain, size, rng))
176
+
.chain(text_generator(chain, size, rng))
189
177
.chain(b"</p>\n")
190
178
}
191
179
192
180
#[inline]
193
181
fn header<'a>(
194
-
interner: &'a Interner,
195
182
chain: &'a NailKov,
196
183
size: usize,
197
184
rng: &'a mut impl RngCore,
198
185
) -> impl Iterator<Item = &'a u8> + 'a {
199
186
b"\n<h2>"
200
187
.iter()
201
-
.chain(text_generator(interner, chain, size, rng))
188
+
.chain(text_generator(chain, size, rng))
202
189
.chain(b"</h2>\n")
203
190
}
+7
-19
crates/nailgen/src/lib.rs
+7
-19
crates/nailgen/src/lib.rs
···
16
16
use futures_lite::Stream;
17
17
use nailbox::{boxed_future_within, try_arc_within};
18
18
use nailconfig::NailConfig;
19
-
use nailkov::{NailKov, interner::Interner};
19
+
use nailkov::NailKov;
20
20
use nailrng::FastRng;
21
21
use pin_project_lite::pin_project;
22
22
use tokio::time::Sleep;
···
59
59
pub struct MarkovStream {
60
60
path: MatchedPath,
61
61
config: Arc<NailConfig>,
62
-
interner: Arc<Interner>,
63
62
markov: MarkovGen,
64
63
start_time: Instant,
65
64
total_bytes: usize,
···
77
76
markov: MarkovGen,
78
77
path: MatchedPath,
79
78
config: Arc<NailConfig>,
80
-
interner: Arc<Interner>,
81
79
template: Template,
82
80
rng: FastRng,
83
81
) -> Self {
84
82
Self {
85
83
path,
86
84
config,
87
-
interner,
88
85
markov,
89
86
total_bytes: 0,
90
87
start_time: Instant::now(),
···
121
118
let title = this.page_title.get_or_insert_with(|| {
122
119
this.template.get_static_content().map_or_else(
123
120
|| {
124
-
text_generator(
125
-
this.interner,
126
-
&this.markov.chain,
127
-
24,
128
-
this.rng,
129
-
)
130
-
.copied()
131
-
.collect()
121
+
text_generator(&this.markov.chain, 24, this.rng)
122
+
.copied()
123
+
.collect()
132
124
},
133
125
|title| static_title(title).copied().collect(),
134
126
)
···
144
136
let handle = boxed_future_within(|| {
145
137
initial_content(
146
138
buffer,
147
-
this.interner.clone(),
148
139
this.markov.chain.clone(),
149
140
this.config.clone(),
150
141
this.rng.fork(),
···
188
179
let handle = boxed_future_within(|| {
189
180
footer(
190
181
buffer,
191
-
this.interner.clone(),
192
182
this.markov.chain.clone(),
193
183
this.path.clone(),
194
184
this.config.clone(),
···
237
227
let handle = boxed_future_within(|| {
238
228
main_content(
239
229
buffer,
240
-
this.interner.clone(),
241
230
this.markov.chain.clone(),
242
231
this.config.clone(),
243
232
this.rng.fork(),
···
287
276
}
288
277
289
278
impl MarkovGen {
290
-
pub fn new(input: impl AsRef<Path>, interner: &mut Interner) -> Result<Self> {
279
+
pub fn new(input: impl AsRef<Path>) -> Result<Self> {
291
280
let file = std::fs::read_to_string(input.as_ref())?;
292
281
293
-
let chain = try_arc_within(|| NailKov::from_input(interner, &file))?;
282
+
let chain = try_arc_within(|| NailKov::from_input(&file))?;
294
283
295
284
Ok(Self { chain })
296
285
}
···
300
289
self,
301
290
path: MatchedPath,
302
291
config: Arc<NailConfig>,
303
-
interner: Arc<Interner>,
304
292
template: Template,
305
293
rng: FastRng,
306
294
) -> MarkovStream {
307
-
MarkovStream::new(self, path, config, interner, template, rng)
295
+
MarkovStream::new(self, path, config, template, rng)
308
296
}
309
297
}
+3
-1
crates/nailkov/Cargo.toml
+3
-1
crates/nailkov/Cargo.toml
···
13
13
[dependencies]
14
14
nailrng = { path = "../nailrng" }
15
15
rapidhash = { workspace = true, features = ["rand", "unsafe"] }
16
-
rustc-hash = { version = "2.1.1", default-features = false }
17
16
hashbrown.workspace = true
18
17
rand.workspace = true
19
18
rand_distr.workspace = true
···
21
20
unicode-segmentation.workspace = true
22
21
tracing.workspace = true
23
22
indexmap.workspace = true
23
+
parking_lot.workspace = true
24
+
estr = "1.2.0"
25
+
crossbeam-utils = "0.8.21"
+8
-5
crates/nailkov/src/distribution.rs
+8
-5
crates/nailkov/src/distribution.rs
···
1
1
//! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with
2
2
//! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov).
3
3
4
+
use core::hash::BuildHasherDefault;
5
+
6
+
use estr::IdentityHasher;
4
7
use indexmap::IndexMap;
5
8
use rand::Rng;
6
9
use rand_distr::{Distribution, weighted::WeightedAliasIndex};
7
10
8
-
use crate::{RandomState, error::NailError, token::Token};
11
+
use crate::{error::NailError, token::Token};
9
12
10
13
/// A distribution of choices and their likelihood.
11
14
#[derive(Clone, Debug)]
···
29
32
#[derive(Clone, Debug)]
30
33
pub struct TokenWeightsBuilder {
31
34
/// Counts how many times a token is likely to appear.
32
-
occurrences: IndexMap<Token, u32, RandomState>,
35
+
occurrences: IndexMap<Token, u32, BuildHasherDefault<IdentityHasher>>,
33
36
}
34
37
35
38
impl TokenWeightsBuilder {
36
-
pub fn new(hasher: RandomState) -> Self {
39
+
pub fn new() -> Self {
37
40
Self {
38
-
occurrences: IndexMap::with_hasher(hasher),
41
+
occurrences: IndexMap::with_hasher(Default::default()),
39
42
}
40
43
}
41
44
···
64
67
65
68
impl Default for TokenWeightsBuilder {
66
69
fn default() -> Self {
67
-
Self::new(RandomState::new())
70
+
Self::new()
68
71
}
69
72
}
-237
crates/nailkov/src/interner.rs
-237
crates/nailkov/src/interner.rs
···
1
-
use hashbrown::{Equivalent, HashMap};
2
-
use rapidhash::fast::RandomState;
3
-
4
-
use crate::token::Token;
5
-
6
-
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
7
-
#[repr(transparent)]
8
-
struct StringPtr(*const str);
9
-
10
-
impl StringPtr {
11
-
#[inline(always)]
12
-
const fn cast(&self) -> &str {
13
-
// SAFETY: The pointer is stable as it points to memory that is never
14
-
// moved/invalidated while this struct lives, therefore can be safely
15
-
// dereferenced back to a string slice. We own the String instance this
16
-
// references, and all StringPtrs are used within the same scope as the
17
-
// String instances, so when String drops, these will be dropped too.
18
-
unsafe { &*self.0 }
19
-
}
20
-
}
21
-
22
-
impl core::hash::Hash for StringPtr {
23
-
#[inline]
24
-
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
25
-
self.cast().hash(state);
26
-
}
27
-
}
28
-
// SAFETY: StringPtr contains a ptr to the heap, that is never moved or invalidated
29
-
// while Interner lives, and all instances of StringPtr live as long as Interner.
30
-
// Since the String type is `Send`, so is StringPtr
31
-
unsafe impl Send for StringPtr {}
32
-
// SAFETY: StringPtr contains a ptr to the heap, that is never moved or invalidated
33
-
// while Interner lives, and all instances of StringPtr live as long as Interner.
34
-
// Since the String type is `Sync`, so is StringPtr
35
-
unsafe impl Sync for StringPtr {}
36
-
37
-
#[derive(Debug, Clone)]
38
-
pub struct Interner {
39
-
collected: HashMap<StringPtr, Token, RandomState>,
40
-
index: Vec<StringPtr>,
41
-
buffer: String,
42
-
stored: Vec<String>,
43
-
}
44
-
45
-
impl Default for Interner {
46
-
fn default() -> Self {
47
-
Self::with_capacity(256)
48
-
}
49
-
}
50
-
51
-
impl Interner {
52
-
/// # Safety
53
-
/// The caller must ensure that the [`Token`] being passed in was allocated
54
-
/// from the same [`Interner`] instance.
55
-
#[inline(always)]
56
-
pub unsafe fn lookup(&self, id: Token) -> &str {
57
-
// SAFETY: Safety is upheld by the caller ensuring the id was allocated
58
-
// from the same interner.
59
-
unsafe { self.index.get_unchecked(id.index()).cast() }
60
-
}
61
-
62
-
pub fn with_capacity(cap: usize) -> Interner {
63
-
// This will get us just under 64KiB of interned storage before we
64
-
// need to allocate more space for buffer storage.
65
-
let stored = Vec::with_capacity(8);
66
-
67
-
Interner {
68
-
collected: HashMap::with_hasher(RandomState::new()),
69
-
index: Vec::new(),
70
-
stored,
71
-
buffer: String::with_capacity(cap.next_power_of_two()),
72
-
}
73
-
}
74
-
75
-
pub fn intern(&mut self, text: &str) -> Token {
76
-
if let Some(&id) = self.collected.get(text) {
77
-
return id;
78
-
}
79
-
80
-
// SAFETY: `alloc`` is never called elsewhere, nor the properties it controls
81
-
// are modified outside of the method. Here we get a new StringPtr for `text` that
82
-
// hasn't been stored before.
83
-
let name = unsafe { self.alloc(text) };
84
-
let id = Token::new(self.index.len() as u32);
85
-
self.collected.insert(name, id);
86
-
self.index.push(name);
87
-
88
-
// SAFETY: We are using the id allocated within the same function scope,
89
-
// so it is always from the same source.
90
-
unsafe {
91
-
debug_assert!(self.lookup(id).equivalent(&name));
92
-
}
93
-
debug_assert!(self.intern(name.cast()) == id);
94
-
95
-
id
96
-
}
97
-
98
-
/// Allocates a new [`StringPtr`] for the given string input. If there is no more room
99
-
/// in the current buffer, it allocates a new buffer and creates the StringPtr to reference
100
-
/// the stored string in the new buffer, storing the old one.
101
-
///
102
-
/// # Safety
103
-
///
104
-
/// The caller must ensure that `self.buffers` and `self.active` are never modified elsewhere,
105
-
/// and that this is called only for new instances of `text`.
106
-
unsafe fn alloc(&mut self, text: &str) -> StringPtr {
107
-
let capacity = self.buffer.capacity();
108
-
109
-
if capacity < self.buffer.len() + text.len() {
110
-
// If we ran out of capacity in our storage, allocate a new buffer with
111
-
// larger capacity.
112
-
let new_cap = (capacity.max(text.len()) + 1).next_power_of_two();
113
-
let old_buf = core::mem::replace(&mut self.buffer, String::with_capacity(new_cap));
114
-
115
-
self.stored.push(old_buf);
116
-
}
117
-
118
-
// Construct raw str slice to eliminate lifetime tracking as we manage its
119
-
// lifetime within the Interner instance.
120
-
let interned = {
121
-
let start = self.buffer.len();
122
-
self.buffer.push_str(text);
123
-
124
-
&raw const self.buffer[start..]
125
-
};
126
-
127
-
StringPtr(interned)
128
-
}
129
-
}
130
-
131
-
impl Equivalent<StringPtr> for str {
132
-
#[inline(always)]
133
-
fn equivalent(&self, key: &StringPtr) -> bool {
134
-
key.cast().eq(self)
135
-
}
136
-
}
137
-
138
-
#[cfg(test)]
139
-
mod tests {
140
-
use super::*;
141
-
142
-
#[test]
143
-
fn string_ptr_comparisons() {
144
-
let one = "one";
145
-
let two = "two";
146
-
147
-
let one_ptr = StringPtr(one);
148
-
let two_ptr = StringPtr(two);
149
-
150
-
assert_ne!(one_ptr, two_ptr);
151
-
152
-
assert!(one.equivalent(&one_ptr));
153
-
}
154
-
155
-
#[test]
156
-
fn is_able_to_intern_one_string() {
157
-
let mut interner = Interner::default();
158
-
159
-
assert!(interner.buffer.is_empty());
160
-
161
-
let text = "Lorem ipsum";
162
-
163
-
let id = interner.intern(text);
164
-
165
-
// SAFETY: It comes from the same source
166
-
unsafe {
167
-
assert_eq!(text, interner.lookup(id));
168
-
}
169
-
assert_eq!(interner.buffer.len(), 11);
170
-
171
-
let again = interner.intern(text);
172
-
173
-
assert_eq!(id, again);
174
-
assert_eq!(interner.buffer.len(), 11);
175
-
}
176
-
177
-
#[test]
178
-
fn is_able_to_intern_many_strings() {
179
-
let mut interner = Interner::with_capacity(32);
180
-
181
-
let texts = [
182
-
"Lorem ipsum",
183
-
"dolor sit amet",
184
-
"duplicated",
185
-
"Other text",
186
-
"Elevenses",
187
-
"duplicated",
188
-
"Gibberish",
189
-
];
190
-
191
-
let interned: Vec<Token> = texts.iter().map(|&text| interner.intern(text)).collect();
192
-
193
-
assert_eq!(
194
-
interned.as_slice(),
195
-
&[
196
-
Token::new(0),
197
-
Token::new(1),
198
-
Token::new(2),
199
-
Token::new(3),
200
-
Token::new(4),
201
-
Token::new(2),
202
-
Token::new(5)
203
-
]
204
-
);
205
-
assert_eq!(interner.buffer.capacity(), 64);
206
-
assert_eq!(interner.stored.len(), 1);
207
-
assert_eq!(interner.stored[0].capacity(), 32);
208
-
}
209
-
210
-
#[test]
211
-
fn is_thread_safe() {
212
-
let mut interner = Interner::with_capacity(32);
213
-
214
-
let texts = [
215
-
"Lorem ipsum",
216
-
"dolor sit amet",
217
-
"duplicated",
218
-
"Other text",
219
-
"Elevenses",
220
-
"duplicated",
221
-
"Gibberish",
222
-
];
223
-
224
-
let interned: Vec<Token> = texts.iter().map(|&text| interner.intern(text)).collect();
225
-
226
-
std::thread::scope(|s| {
227
-
s.spawn(move || {
228
-
for (id, expected) in interned.into_iter().zip(texts) {
229
-
// SAFETY: It comes from the same source
230
-
unsafe {
231
-
assert_eq!(expected, interner.lookup(id));
232
-
}
233
-
}
234
-
});
235
-
});
236
-
}
237
-
}
+11
-46
crates/nailkov/src/lib.rs
+11
-46
crates/nailkov/src/lib.rs
···
4
4
5
5
mod distribution;
6
6
mod error;
7
-
pub mod interner;
8
7
mod token;
9
8
9
+
use crossbeam_utils::CachePadded;
10
10
use error::NailError;
11
11
use indexmap::IndexMap;
12
-
use interner::Interner;
13
12
use itertools::Itertools;
14
-
use nailrng::FastRng;
15
13
use rand::{RngCore, seq::IteratorRandom};
16
14
use rand_distr::Distribution;
17
15
18
16
use distribution::{TokenWeights, TokenWeightsBuilder};
19
-
use rustc_hash::FxHasher;
17
+
use rapidhash::fast::RandomState;
20
18
use token::{Token, TokenPair};
21
19
use unicode_segmentation::UnicodeSegmentation;
22
20
23
-
#[derive(Clone)]
24
-
pub struct RandomState {
25
-
seed: usize,
26
-
}
27
-
28
-
impl RandomState {
29
-
fn new() -> Self {
30
-
let mut rng = FastRng::default();
31
-
32
-
Self {
33
-
seed: rng.next_u64() as usize,
34
-
}
35
-
}
36
-
}
37
-
38
-
impl Default for RandomState {
39
-
fn default() -> Self {
40
-
Self::new()
41
-
}
42
-
}
43
-
44
-
impl core::hash::BuildHasher for RandomState {
45
-
type Hasher = FxHasher;
46
-
47
-
fn build_hasher(&self) -> Self::Hasher {
48
-
FxHasher::with_seed(self.seed)
49
-
}
50
-
}
51
-
52
21
#[derive(Clone, Debug)]
53
22
pub struct NailKov {
54
-
chain: IndexMap<TokenPair, TokenWeights, RandomState>,
23
+
chain: CachePadded<IndexMap<TokenPair, TokenWeights, RandomState>>,
55
24
}
56
25
57
26
pub struct NailKovIter<'a, R: RngCore> {
···
89
58
}
90
59
91
60
impl NailKov {
92
-
pub fn from_input(interner: &mut Interner, input: &str) -> Result<NailKov, NailError> {
93
-
NailBuilder::new(RandomState::new()).with_input(interner, input)
61
+
pub fn from_input(input: &str) -> Result<NailKov, NailError> {
62
+
NailBuilder::new(RandomState::new()).with_input(input)
94
63
}
95
64
}
96
65
···
105
74
}
106
75
}
107
76
108
-
fn with_input(self, interned: &mut Interner, input: &str) -> Result<NailKov, NailError> {
109
-
self.feed_str(interned, input)?.build()
77
+
fn with_input(self, input: &str) -> Result<NailKov, NailError> {
78
+
self.feed_str(input)?.build()
110
79
}
111
80
112
81
fn build(self) -> Result<NailKov, NailError> {
···
128
97
return Err(NailError::EmptyInput);
129
98
}
130
99
131
-
Ok(NailKov { chain })
100
+
Ok(NailKov { chain: CachePadded::new(chain) })
132
101
}
133
102
134
103
/// Add the occurrence of `next` following `prev`.
···
138
107
builder.add(next);
139
108
}
140
109
None => {
141
-
let mut builder = TokenWeightsBuilder::new(self.chain.hasher().clone());
110
+
let mut builder = TokenWeightsBuilder::new();
142
111
builder.add(next);
143
112
self.chain.insert(prev, builder);
144
113
}
145
114
}
146
115
}
147
116
148
-
fn feed_str(self, interner: &mut Interner, content: &str) -> Result<Self, NailError> {
149
-
self.feed_tokens(
150
-
content
151
-
.split_word_bounds()
152
-
.map(|text| interner.intern(text)),
153
-
)
117
+
fn feed_str(self, content: &str) -> Result<Self, NailError> {
118
+
self.feed_tokens(content.split_word_bounds().map(Token::from))
154
119
}
155
120
156
121
fn feed_tokens(mut self, tokens: impl Iterator<Item = Token>) -> Result<Self, NailError> {
+16
-43
crates/nailkov/src/token.rs
+16
-43
crates/nailkov/src/token.rs
···
1
-
use std::ops::Deref;
1
+
use estr::Estr;
2
2
3
3
/// Representation of a string segment.
4
4
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
5
5
#[repr(transparent)]
6
-
pub struct Token(u32);
6
+
pub struct Token(Estr);
7
7
8
8
impl Token {
9
9
#[inline(always)]
10
-
pub const fn new(ptr: u32) -> Self {
11
-
Self(ptr)
10
+
pub fn new(str: &str) -> Self {
11
+
Self(Estr::from(str))
12
12
}
13
13
14
-
#[inline(always)]
15
-
pub(crate) const fn index(&self) -> usize {
16
-
self.0 as usize
14
+
#[inline]
15
+
pub fn as_str(&self) -> &'static str {
16
+
self.0.as_str()
17
17
}
18
18
19
-
#[inline(always)]
20
-
const fn to_bits(self) -> u32 {
21
-
self.0
19
+
#[inline]
20
+
pub fn as_bytes(&self) -> &'static [u8] {
21
+
self.0.as_str().as_bytes()
22
22
}
23
23
}
24
24
25
-
impl Deref for Token {
26
-
type Target = u32;
27
-
28
-
#[inline]
29
-
fn deref(&self) -> &Self::Target {
30
-
&self.0
25
+
impl From<&str> for Token {
26
+
fn from(value: &str) -> Self {
27
+
Self::new(value)
31
28
}
32
29
}
33
30
···
37
34
// optimized codegen for `to_bits`, `PartialEq`
38
35
// Prior art taken from my contribution to Bevy:
39
36
// https://github.com/bevyengine/bevy/blob/main/crates/bevy_ecs/src/entity/mod.rs#L309
40
-
#[repr(C, align(8))]
37
+
#[repr(C, align(16))]
41
38
pub struct TokenPair {
42
39
// Do not reorder the fields here. The ordering is explicitly used by repr(C)
43
40
// to make this struct equivalent to a u128 (assumes each `Token` is 8 bytes wide; TODO confirm).
···
76
73
}
77
74
78
75
#[inline(always)]
79
-
const fn to_bits(self) -> u64 {
80
-
self.left.to_bits() as u64 | ((self.right.to_bits() as u64) << 32)
76
+
fn to_bits(self) -> u128 {
77
+
(self.left.0.digest().hash() as u128) | ((self.right.0.digest().hash() as u128) << 64)
81
78
}
82
79
}
83
80
···
87
84
self
88
85
}
89
86
}
90
-
91
-
#[cfg(test)]
92
-
mod tests {
93
-
use super::*;
94
-
95
-
#[test]
96
-
fn token_smoke_testing() {
97
-
let left = Token(0x2);
98
-
let right = Token(0x2b);
99
-
100
-
let pair = TokenPair::new(left, right);
101
-
102
-
assert_eq!(pair.to_bits(), 0x2b00000002);
103
-
assert_eq!(pair.left, left);
104
-
assert_eq!(pair.right, right);
105
-
106
-
let other_right = Token(0x2c);
107
-
108
-
let other_pair = TokenPair::new(left, other_right);
109
-
110
-
assert_eq!(other_pair.to_bits(), 0x2c00000002);
111
-
assert_ne!(pair, other_pair);
112
-
}
113
-
}
-2
crates/nailroutes/src/lib.rs
-2
crates/nailroutes/src/lib.rs
···
24
24
.into_stream(
25
25
matched,
26
26
config.clone_inner(),
27
-
inputs.get_interner(),
28
27
inputs.get_warning_template(),
29
28
rng,
30
29
)
···
46
45
.into_stream(
47
46
matched,
48
47
config.clone_inner(),
49
-
inputs.get_interner(),
50
48
inputs.get_generated_template(),
51
49
rng,
52
50
)
+3
-23
crates/nailstate/src/lib.rs
+3
-23
crates/nailstate/src/lib.rs
···
3
3
use axum::extract::{FromRef, FromRequestParts};
4
4
use nailconfig::NailConfig;
5
5
use nailgen::{GeneratedTemplate, MarkovGen, Template, WarningTemplate};
6
-
use nailkov::interner::Interner;
7
6
use nailrng::FastRng;
8
7
use nailspicy::SpicyPayloads;
9
8
use rand::seq::IndexedRandom;
···
34
33
#[derive(Clone)]
35
34
pub struct NailInputs {
36
35
chains: Arc<[MarkovGen]>,
37
-
interner: Arc<Interner>,
38
36
templates: Arc<Templates>,
39
37
}
40
38
41
39
impl NailInputs {
42
-
pub fn new(
43
-
chains: Arc<[MarkovGen]>,
44
-
interner: Arc<Interner>,
45
-
templates: Arc<Templates>,
46
-
) -> Self {
47
-
Self {
48
-
chains,
49
-
interner,
50
-
templates,
51
-
}
40
+
pub fn new(chains: Arc<[MarkovGen]>, templates: Arc<Templates>) -> Self {
41
+
Self { chains, templates }
52
42
}
53
43
54
44
/// Pulls a random markov chain from the available list. Returns a cloned
···
64
54
}
65
55
}
66
56
67
-
#[inline]
68
-
pub fn get_interner(&self) -> Arc<Interner> {
69
-
self.interner.clone()
70
-
}
71
-
72
57
#[inline]
73
58
pub fn get_warning_template(&self) -> Template {
74
59
Template::from(self.templates.warning.clone())
···
121
106
pub fn new(
122
107
config: impl Into<AppConfig>,
123
108
chains: Arc<[MarkovGen]>,
124
-
interner: Arc<Interner>,
125
109
templates: Arc<Templates>,
126
110
spicy_payloads: Option<Arc<SpicyPayloads>>,
127
111
) -> Self {
···
129
113
130
114
Self {
131
115
config,
132
-
inputs: NailInputs {
133
-
chains,
134
-
interner,
135
-
templates,
136
-
},
116
+
inputs: NailInputs { chains, templates },
137
117
spicy_payloads: NailPayloads { spicy_payloads },
138
118
}
139
119
}
+4
-11
src/inputs.rs
+4
-11
src/inputs.rs
···
2
2
3
3
use color_eyre::eyre::Context;
4
4
use glob::glob;
5
-
use nailbox::{arc_within, try_arc_within};
5
+
use nailbox::try_arc_within;
6
6
use nailconfig::NailConfig;
7
7
use nailgen::{GeneratedTemplate, MarkovGen, WarningTemplate};
8
-
use nailkov::interner::Interner;
9
8
use nailstate::Templates;
10
9
11
10
/// Takes a glob for finding all input files and returns a read-only list of
12
11
/// all markov chains that can be generated.
13
-
pub fn get_input_files(
14
-
config: &NailConfig,
15
-
) -> color_eyre::Result<(Arc<[MarkovGen]>, Arc<Interner>)> {
16
-
let mut interner = arc_within(|| Interner::with_capacity(512));
17
-
18
-
let interned_mut = Arc::get_mut(&mut interner).unwrap();
19
-
12
+
pub fn get_input_files(config: &NailConfig) -> color_eyre::Result<Arc<[MarkovGen]>> {
20
13
let inputs = glob(&config.generator.input_files)?
21
14
.filter_map(|path| {
22
15
path.inspect_err(|err| tracing::error!("IO Error: {err}"))
23
16
.ok()
24
17
})
25
18
.filter_map(|input| {
26
-
MarkovGen::new(input, interned_mut)
19
+
MarkovGen::new(input)
27
20
.inspect_err(|err| tracing::error!("Markov Error: {err}"))
28
21
.ok()
29
22
})
···
33
26
color_eyre::eyre::bail!("No input files found! Exiting...");
34
27
}
35
28
36
-
Ok((inputs, interner))
29
+
Ok(inputs)
37
30
}
38
31
39
32
pub fn get_template_files(config: &NailConfig) -> color_eyre::Result<Arc<Templates>> {
+2
-2
src/main.rs
+2
-2
src/main.rs
···
34
34
35
35
let config = nailconfig::get_configuration()?;
36
36
37
-
let (inputs, interner) = nailpit::inputs::get_input_files(config.as_ref())?;
37
+
let inputs = nailpit::inputs::get_input_files(config.as_ref())?;
38
38
39
39
let templates = nailpit::inputs::get_template_files(config.as_ref())?;
40
40
41
41
let spicy = nailspicy::get_spicy_payload(config.as_ref());
42
42
43
43
nailrt::start(
44
-
nailstate::ServerState::new(config, inputs, interner, templates, spicy),
44
+
nailstate::ServerState::new(config, inputs, templates, spicy),
45
45
spawn_axum_worker,
46
46
)?;
47
47