A pit full of rusty nails

perf: Improve TokenPair hashing for faster generation

+32 -33
+2 -5
crates/nailkov/src/distribution.rs
··· 1 //! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with 2 //! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov). 3 4 - use core::hash::BuildHasherDefault; 5 - 6 - use estr::IdentityHasher; 7 use indexmap::IndexMap; 8 use rand::Rng; 9 use rand_distr::{Distribution, weighted::WeightedAliasIndex}; 10 11 - use crate::{error::NailError, token::Token}; 12 13 /// A distribution of choices and their likelihood. 14 #[derive(Clone, Debug)] ··· 32 #[derive(Clone, Debug)] 33 pub struct TokenWeightsBuilder { 34 /// Counts how many times a token is likely to appear. 35 - occurrences: IndexMap<Token, u32, BuildHasherDefault<IdentityHasher>>, 36 } 37 38 impl TokenWeightsBuilder {
··· 1 //! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with 2 //! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov). 3 4 use indexmap::IndexMap; 5 use rand::Rng; 6 use rand_distr::{Distribution, weighted::WeightedAliasIndex}; 7 8 + use crate::{TokenHasher, error::NailError, token::Token}; 9 10 /// A distribution of choices and their likelihood. 11 #[derive(Clone, Debug)] ··· 29 #[derive(Clone, Debug)] 30 pub struct TokenWeightsBuilder { 31 /// Counts how many times a token is likely to appear. 32 + occurrences: IndexMap<Token, u32, TokenHasher>, 33 } 34 35 impl TokenWeightsBuilder {
+15 -7
crates/nailkov/src/lib.rs
··· 6 mod error; 7 mod token; 8 9 use crossbeam_utils::CachePadded; 10 use error::NailError; 11 use indexmap::IndexMap; 12 use itertools::Itertools; 13 use rand::{RngCore, seq::IteratorRandom}; 14 use rand_distr::Distribution; 15 16 use distribution::{TokenWeights, TokenWeightsBuilder}; 17 - use rapidhash::fast::RandomState; 18 use token::{Token, TokenPair}; 19 use unicode_segmentation::UnicodeSegmentation; 20 21 #[derive(Clone, Debug)] 22 pub struct NailKov { 23 - chain: CachePadded<IndexMap<TokenPair, TokenWeights, RandomState>>, 24 } 25 26 pub struct NailKovIter<'a, R: RngCore> { ··· 59 60 impl NailKov { 61 pub fn from_input(input: &str) -> Result<NailKov, NailError> { 62 - NailBuilder::new(RandomState::new()).with_input(input) 63 } 64 } 65 66 struct NailBuilder { 67 - chain: IndexMap<TokenPair, TokenWeightsBuilder, RandomState>, 68 } 69 70 impl NailBuilder { 71 - fn new(hasher: RandomState) -> Self { 72 Self { 73 chain: IndexMap::with_hasher(hasher), 74 } ··· 83 return Err(NailError::EmptyInput); 84 } 85 86 - let chain: IndexMap<TokenPair, TokenWeights, RandomState> = self 87 .chain 88 .into_iter() 89 .flat_map(|(pair, dist)| { ··· 97 return Err(NailError::EmptyInput); 98 } 99 100 - Ok(NailKov { chain: CachePadded::new(chain) }) 101 } 102 103 /// Add the occurrence of `next` following `prev`.
··· 6 mod error; 7 mod token; 8 9 + use core::hash::BuildHasherDefault; 10 + 11 use crossbeam_utils::CachePadded; 12 use error::NailError; 13 + use estr::IdentityHasher; 14 use indexmap::IndexMap; 15 use itertools::Itertools; 16 use rand::{RngCore, seq::IteratorRandom}; 17 use rand_distr::Distribution; 18 19 use distribution::{TokenWeights, TokenWeightsBuilder}; 20 use token::{Token, TokenPair}; 21 use unicode_segmentation::UnicodeSegmentation; 22 23 + /// `nailkov` relies on `estr`'s precomputed hashes, so we avoid 24 + /// hashing ourselves and can just use the precomputed hashes instead. 25 + type TokenHasher = BuildHasherDefault<IdentityHasher>; 26 + 27 #[derive(Clone, Debug)] 28 pub struct NailKov { 29 + chain: CachePadded<IndexMap<TokenPair, TokenWeights, TokenHasher>>, 30 } 31 32 pub struct NailKovIter<'a, R: RngCore> { ··· 65 66 impl NailKov { 67 pub fn from_input(input: &str) -> Result<NailKov, NailError> { 68 + NailBuilder::new(TokenHasher::new()).with_input(input) 69 } 70 } 71 72 struct NailBuilder { 73 + chain: IndexMap<TokenPair, TokenWeightsBuilder, TokenHasher>, 74 } 75 76 impl NailBuilder { 77 + fn new(hasher: TokenHasher) -> Self { 78 Self { 79 chain: IndexMap::with_hasher(hasher), 80 } ··· 89 return Err(NailError::EmptyInput); 90 } 91 92 + let chain: IndexMap<TokenPair, TokenWeights, TokenHasher> = self 93 .chain 94 .into_iter() 95 .flat_map(|(pair, dist)| { ··· 103 return Err(NailError::EmptyInput); 104 } 105 106 + Ok(NailKov { 107 + chain: CachePadded::new(chain), 108 + }) 109 } 110 111 /// Add the occurrence of `next` following `prev`.
+15 -21
crates/nailkov/src/token.rs
··· 29 } 30 31 /// An owned pair of [`Token`]s. 32 - #[derive(Copy, Clone, Debug)] 33 // Alignment repr necessary to allow LLVM to better output 34 - // optimized codegen for `to_bits`, `PartialEq` 35 // Prior art taken from my contribution to Bevy: 36 // https://github.com/bevyengine/bevy/blob/main/crates/bevy_ecs/src/entity/mod.rs#L309 37 #[repr(C, align(16))] 38 pub struct TokenPair { 39 // Do not reorder the fields here. The ordering is explicitly used by repr(C) 40 - // to make this struct equivalent to a u64. 41 #[cfg(target_endian = "little")] 42 pub left: Token, 43 pub right: Token, ··· 45 pub left: Token, 46 } 47 48 - // By not short-circuiting in comparisons, we get better codegen. 49 - // See <https://github.com/rust-lang/rust/issues/117800> 50 - impl PartialEq for TokenPair { 51 - #[inline(always)] 52 - fn eq(&self, other: &TokenPair) -> bool { 53 - // By using `to_bits`, the codegen can be optimized out even 54 - // further potentially. Relies on the correct alignment/field 55 - // order of `TokenPair`. 56 - self.to_bits() == other.to_bits() 57 - } 58 - } 59 - 60 - impl Eq for TokenPair {} 61 - 62 impl core::hash::Hash for TokenPair { 63 - #[inline(always)] 64 fn hash<H: core::hash::Hasher>(&self, state: &mut H) { 65 - self.to_bits().hash(state); 66 } 67 } 68 ··· 72 Self { left, right } 73 } 74 75 #[inline(always)] 76 - fn to_bits(self) -> u128 { 77 - (self.left.0.digest().hash() as u128) | ((self.right.0.digest().hash() as u128) << 64) 78 } 79 } 80
··· 29 } 30 31 /// An owned pair of [`Token`]s. 32 + #[derive(Copy, Clone, Debug, PartialEq, Eq)] 33 // Alignment repr necessary to allow LLVM to output better-optimized codegen. 34 // Prior art taken from my contribution to Bevy: 35 // https://github.com/bevyengine/bevy/blob/main/crates/bevy_ecs/src/entity/mod.rs#L309 36 #[repr(C, align(16))] 37 pub struct TokenPair { 38 // Do not reorder the fields here. The ordering is explicitly used by repr(C) 39 + // to make this struct equivalent to a u128. 40 #[cfg(target_endian = "little")] 41 pub left: Token, 42 pub right: Token, ··· 44 pub left: Token, 45 } 46 47 impl core::hash::Hash for TokenPair { 48 + #[inline] 49 fn hash<H: core::hash::Hasher>(&self, state: &mut H) { 50 + // Use only with an IdentityHasher so that you don't rehash the hash 51 + self.double_hash().hash(state); 52 } 53 } 54 ··· 58 Self { left, right } 59 } 60 61 + /// Use the precomputed hashes to generate a secondary hash. 62 + /// Method from [fastbloom](https://github.com/tomtomwombat/fastbloom/blob/main/src/hasher.rs#L190), 63 + /// which was in turn adapted from <https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf>. 64 #[inline(always)] 65 + fn double_hash(&self) -> u64 { 66 + self.left 67 + .0 68 + .digest() 69 + .hash() 70 + .wrapping_add(self.right.0.digest().hash()) 71 + .rotate_left(5) 72 } 73 } 74