//! hash functions used by spaCy/Thinc for token attribute IDs and hash embeddings. //! //! MurmurHash2_64A: string → uint64 (token attribute hashing) //! MurmurHash3_x86_128_uint64: uint64 → 4x uint32 (hash embedding bucket selection) const std = @import("std"); /// MurmurHash2_64A — the hash function spaCy uses for string→uint64 attribute IDs. /// matches preshed/murmurhash.pyx hash_string(). pub fn murmurhash2_64a(key: []const u8, seed: u64) u64 { const m: u64 = 0xc6a4a7935bd1e995; const r = 47; var h: u64 = seed ^ (key.len *% m); // process 8-byte chunks const n_blocks = key.len / 8; var i: usize = 0; while (i < n_blocks) : (i += 1) { var k = std.mem.readInt(u64, key[i * 8 ..][0..8], .little); k *%= m; k ^= k >> r; k *%= m; h ^= k; h *%= m; } // process remaining bytes const tail = key[n_blocks * 8 ..]; var remaining: u64 = 0; switch (tail.len) { 7 => remaining = @as(u64, tail[6]) << 48 | @as(u64, tail[5]) << 40 | @as(u64, tail[4]) << 32 | @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 6 => remaining = @as(u64, tail[5]) << 40 | @as(u64, tail[4]) << 32 | @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 5 => remaining = @as(u64, tail[4]) << 32 | @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 4 => remaining = @as(u64, tail[3]) << 24 | @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 3 => remaining = @as(u64, tail[2]) << 16 | @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 2 => remaining = @as(u64, tail[1]) << 8 | @as(u64, tail[0]), 1 => remaining = @as(u64, tail[0]), else => {}, } if (tail.len > 0) { h ^= remaining; h *%= m; } h ^= h >> r; h *%= m; h ^= h >> r; return h; } /// MurmurHash3_x86_128 adapted for a single uint64 input. /// this is the exact hash function Thinc uses in hash embeddings /// to produce 4 bucket indices from a token attribute ID. /// /// from thinc/backends/numpy_ops.pyx MurmurHash3_x86_128_uint64 pub fn murmurhash3_128_uint64(val: u64, seed: u32) [4]u32 { var h1: u64 = val; h1 *%= 0x87c37b91114253d5; h1 = std.math.rotl(u64, h1, 31); h1 *%= 0x4cf5ad432745937f; h1 ^= seed; h1 ^= 8; // length = 8 bytes var h2: u64 = seed; h2 ^= 8; // fmix64 h1 +%= h2; h2 +%= h1; h1 ^= h1 >> 33; h1 *%= 0xff51afd7ed558ccd; h1 ^= h1 >> 33; h1 *%= 0xc4ceb9fe1a85ec53; h1 ^= h1 >> 33; h2 ^= h2 >> 33; h2 *%= 0xff51afd7ed558ccd; h2 ^= h2 >> 33; h2 *%= 0xc4ceb9fe1a85ec53; h2 ^= h2 >> 33; h1 +%= h2; h2 +%= h1; return .{ @truncate(h1), @truncate(h1 >> 32), @truncate(h2), @truncate(h2 >> 32), }; } /// convenience: hash a string to the uint64 attribute ID spaCy uses, /// with the default seed of 1. pub fn hashString(s: []const u8) u64 { return murmurhash2_64a(s, 1); } // === tests === const testing = std.testing; // cross-validated against spaCy's hash_string() (preshed/murmurhash.pyx, seed=1) test "murmurhash2_64a matches spacy hash_string" { try testing.expectEqual(@as(u64, 0xdd6e45542c05f898), hashString("obama")); try testing.expectEqual(@as(u64, 0xd58ee95da168bb57), hashString("Barack")); try testing.expectEqual(@as(u64, 0x90b4b7068fc46e30), hashString("Paris")); try testing.expectEqual(@as(u64, 0xa30ebbc9c2b3d425), hashString("visited")); try testing.expectEqual(@as(u64, 0x6032b56374c05136), hashString("SpaceX")); try testing.expectEqual(@as(u64, 0xa52be5b3f6674b2a), hashString("a")); } test "murmurhash2_64a empty string" { // spaCy: hash_string("") = 0xc6a4a7935bd064dc (seed=1) try testing.expectEqual(@as(u64, 0xc6a4a7935bd064dc), hashString("")); } // cross-validated against thinc NumpyOps.hash() test "murmurhash3_128_uint64 matches thinc" { const r1 = murmurhash3_128_uint64(12345, 8); try testing.expectEqual(@as(u32, 1415810048), r1[0]); try testing.expectEqual(@as(u32, 2915517168), r1[1]); try testing.expectEqual(@as(u32, 2123715072), r1[2]); try testing.expectEqual(@as(u32, 78308456), r1[3]); const r2 = murmurhash3_128_uint64(12345, 9); try testing.expectEqual(@as(u32, 3518799221), r2[0]); try testing.expectEqual(@as(u32, 2668567277), r2[1]); try testing.expectEqual(@as(u32, 3200850228), r2[2]); try testing.expectEqual(@as(u32, 3937369458), r2[3]); const r3 = murmurhash3_128_uint64(42, 10); try testing.expectEqual(@as(u32, 4049717780), r3[0]); try testing.expectEqual(@as(u32, 353985546), r3[1]); try testing.expectEqual(@as(u32, 1712375736), r3[2]); try testing.expectEqual(@as(u32, 3784464606), r3[3]); // edge case: input 0 → all zeros const r4 = murmurhash3_128_uint64(0, 8); try testing.expectEqual(@as(u32, 0), r4[0]); try testing.expectEqual(@as(u32, 0), r4[1]); try testing.expectEqual(@as(u32, 0), r4[2]); try testing.expectEqual(@as(u32, 0), r4[3]); }