A pit full of rusty nails

perf: Improve TokenPair hashing for faster generation

+32 -33
+2 -5
crates/nailkov/src/distribution.rs
··· 1 1 //! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with 2 2 //! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov). 3 3 4 - use core::hash::BuildHasherDefault; 5 - 6 - use estr::IdentityHasher; 7 4 use indexmap::IndexMap; 8 5 use rand::Rng; 9 6 use rand_distr::{Distribution, weighted::WeightedAliasIndex}; 10 7 11 - use crate::{error::NailError, token::Token}; 8 + use crate::{TokenHasher, error::NailError, token::Token}; 12 9 13 10 /// A distribution of choices and their likelihood. 14 11 #[derive(Clone, Debug)] ··· 32 29 #[derive(Clone, Debug)] 33 30 pub struct TokenWeightsBuilder { 34 31 /// Counts how many times a token is likely to appear. 35 - occurrences: IndexMap<Token, u32, BuildHasherDefault<IdentityHasher>>, 32 + occurrences: IndexMap<Token, u32, TokenHasher>, 36 33 } 37 34 38 35 impl TokenWeightsBuilder {
+15 -7
crates/nailkov/src/lib.rs
··· 6 6 mod error; 7 7 mod token; 8 8 9 + use core::hash::BuildHasherDefault; 10 + 9 11 use crossbeam_utils::CachePadded; 10 12 use error::NailError; 13 + use estr::IdentityHasher; 11 14 use indexmap::IndexMap; 12 15 use itertools::Itertools; 13 16 use rand::{RngCore, seq::IteratorRandom}; 14 17 use rand_distr::Distribution; 15 18 16 19 use distribution::{TokenWeights, TokenWeightsBuilder}; 17 - use rapidhash::fast::RandomState; 18 20 use token::{Token, TokenPair}; 19 21 use unicode_segmentation::UnicodeSegmentation; 20 22 23 + /// `nailkov` relies on `estr`'s precomputed hashes, so we avoid 24 + /// hashing ourselves and can just use the precomputed hashes instead. 25 + type TokenHasher = BuildHasherDefault<IdentityHasher>; 26 + 21 27 #[derive(Clone, Debug)] 22 28 pub struct NailKov { 23 - chain: CachePadded<IndexMap<TokenPair, TokenWeights, RandomState>>, 29 + chain: CachePadded<IndexMap<TokenPair, TokenWeights, TokenHasher>>, 24 30 } 25 31 26 32 pub struct NailKovIter<'a, R: RngCore> { ··· 59 65 60 66 impl NailKov { 61 67 pub fn from_input(input: &str) -> Result<NailKov, NailError> { 62 - NailBuilder::new(RandomState::new()).with_input(input) 68 + NailBuilder::new(TokenHasher::new()).with_input(input) 63 69 } 64 70 } 65 71 66 72 struct NailBuilder { 67 - chain: IndexMap<TokenPair, TokenWeightsBuilder, RandomState>, 73 + chain: IndexMap<TokenPair, TokenWeightsBuilder, TokenHasher>, 68 74 } 69 75 70 76 impl NailBuilder { 71 - fn new(hasher: RandomState) -> Self { 77 + fn new(hasher: TokenHasher) -> Self { 72 78 Self { 73 79 chain: IndexMap::with_hasher(hasher), 74 80 } ··· 83 89 return Err(NailError::EmptyInput); 84 90 } 85 91 86 - let chain: IndexMap<TokenPair, TokenWeights, RandomState> = self 92 + let chain: IndexMap<TokenPair, TokenWeights, TokenHasher> = self 87 93 .chain 88 94 .into_iter() 89 95 .flat_map(|(pair, dist)| { ··· 97 103 return Err(NailError::EmptyInput); 98 104 } 99 105 100 - Ok(NailKov { chain: CachePadded::new(chain) }) 106 + Ok(NailKov { 107 
+ chain: CachePadded::new(chain), 108 + }) 101 109 } 102 110 103 111 /// Add the occurrence of `next` following `prev`.
+15 -21
crates/nailkov/src/token.rs
··· 29 29 } 30 30 31 31 /// An owned pair of [`Token`]s. 32 - #[derive(Copy, Clone, Debug)] 32 + #[derive(Copy, Clone, Debug, PartialEq, Eq)] 33 - // Alignment repr necessary to allow LLVM to better output 34 - // optimized codegen for `to_bits`, `PartialEq` 33 + // Alignment repr necessary to allow LLVM to output optimized codegen. 35 34 // Prior art taken from my contribution to Bevy: 36 35 // https://github.com/bevyengine/bevy/blob/main/crates/bevy_ecs/src/entity/mod.rs#L309 37 36 #[repr(C, align(16))] 38 37 pub struct TokenPair { 39 38 // Do not reorder the fields here. The ordering is explicitly used by repr(C) 40 - // to make this struct equivalent to a u64. 39 + // to make this struct equivalent to a u128. 41 40 #[cfg(target_endian = "little")] 42 41 pub left: Token, 43 42 pub right: Token, ··· 45 44 pub left: Token, 46 45 } 47 46 48 - // By not short-circuiting in comparisons, we get better codegen. 49 - // See <https://github.com/rust-lang/rust/issues/117800> 50 - impl PartialEq for TokenPair { 51 - #[inline(always)] 52 - fn eq(&self, other: &TokenPair) -> bool { 53 - // By using `to_bits`, the codegen can be optimized out even 54 - // further potentially. Relies on the correct alignment/field 55 - // order of `TokenPair`. 56 - self.to_bits() == other.to_bits() 57 - } 58 - } 59 - 60 - impl Eq for TokenPair {} 61 - 62 47 impl core::hash::Hash for TokenPair { 63 - #[inline(always)] 48 + #[inline] 64 49 fn hash<H: core::hash::Hasher>(&self, state: &mut H) { 65 - self.to_bits().hash(state); 50 + // Use only with an IdentityHasher so that you don't rehash the hash 51 + self.double_hash().hash(state); 66 52 } 67 53 } 68 54 ··· 72 58 Self { left, right } 73 59 } 74 60 61 + /// Use the precomputed hashes to generate a secondary hash. 62 + /// Method from [fastbloom](https://github.com/tomtomwombat/fastbloom/blob/main/src/hasher.rs#L190), 63 + /// which was in turn adapted from <https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf>.
75 64 #[inline(always)] 76 - fn to_bits(self) -> u128 { 77 - (self.left.0.digest().hash() as u128) | ((self.right.0.digest().hash() as u128) << 64) 65 + fn double_hash(&self) -> u64 { 66 + self.left 67 + .0 68 + .digest() 69 + .hash() 70 + .wrapping_add(self.right.0.digest().hash()) 71 + .rotate_left(5) 78 72 } 79 73 } 80 74