···11//! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with
22//! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov).
3344-use core::hash::BuildHasherDefault;
55-66-use estr::IdentityHasher;
74use indexmap::IndexMap;
85use rand::Rng;
96use rand_distr::{Distribution, weighted::WeightedAliasIndex};
1071111-use crate::{error::NailError, token::Token};
88+use crate::{TokenHasher, error::NailError, token::Token};
1291310/// A distribution of choices and their likelihood.
1411#[derive(Clone, Debug)]
···3229#[derive(Clone, Debug)]
3330pub struct TokenWeightsBuilder {
3431 /// Counts how many times a token is likely to appear.
3535- occurrences: IndexMap<Token, u32, BuildHasherDefault<IdentityHasher>>,
3232+ occurrences: IndexMap<Token, u32, TokenHasher>,
3633}
37343835impl TokenWeightsBuilder {
···2929}
30303131/// An owned pair of [`Token`]s.
3232-#[derive(Copy, Clone, Debug)]
3232+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
3333// Alignment repr necessary to allow LLVM to better output
3434-// optimized codegen for `to_bits`, `PartialEq`
3534// Prior art taken from my contribution to Bevy:
3635// https://github.com/bevyengine/bevy/blob/main/crates/bevy_ecs/src/entity/mod.rs#L309
3736#[repr(C, align(16))]
3837pub struct TokenPair {
3938 // Do not reorder the fields here. The ordering is explicitly used by repr(C)
4040- // to make this struct equivalent to a u64.
3939+ // to make this struct equivalent to a u128.
4140 #[cfg(target_endian = "little")]
4241 pub left: Token,
4342 pub right: Token,
···4544 pub left: Token,
4645}
47464848-// By not short-circuiting in comparisons, we get better codegen.
4949-// See <https://github.com/rust-lang/rust/issues/117800>
5050-impl PartialEq for TokenPair {
5151- #[inline(always)]
5252- fn eq(&self, other: &TokenPair) -> bool {
5353- // By using `to_bits`, the codegen can be optimized out even
5454- // further potentially. Relies on the correct alignment/field
5555- // order of `TokenPair`.
5656- self.to_bits() == other.to_bits()
5757- }
5858-}
5959-6060-impl Eq for TokenPair {}
6161-6247impl core::hash::Hash for TokenPair {
6363- #[inline(always)]
4848+ #[inline]
6449 fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
6565- self.to_bits().hash(state);
5050+ // Use only with an IdentityHasher so that you don't rehash the hash
5151+ self.double_hash().hash(state);
6652 }
6753}
6854···7258 Self { left, right }
7359 }
74606161+ /// Use the precomputed hashes to generate a secondary hash.
6262+ /// Method from [fastbloom](https://github.com/tomtomwombat/fastbloom/blob/main/src/hasher.rs#L190),
6363+ /// which was in turn adapted from <https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf>.
7564 #[inline(always)]
7676- fn to_bits(self) -> u128 {
7777- (self.left.0.digest().hash() as u128) | ((self.right.0.digest().hash() as u128) << 64)
6565+ fn double_hash(&self) -> u64 {
6666+ self.left
6767+ .0
6868+ .digest()
6969+ .hash()
7070+ .wrapping_add(self.right.0.digest().hash())
7171+ .rotate_left(5)
7872 }
7973}
8074