Example client for the highly experimental divepool embedding firehose
at main 29 lines 1.2 kB view raw
package main

import "fmt"

// Batch is the columnar wire format for the embedding firehose.
// Each field is a parallel array — index i across all fields describes one text.
//
// Embeddings are EmbeddingGemma-300m (Matryoshka) float32 vectors. Two per text:
//   - C (cluster): for clustering/similarity analysis
//   - R (retrieval): for semantic search
//
// Open clients receive 128d (L2-normalized Matryoshka truncation).
// Token-authenticated clients receive full 768d.
type Batch struct {
	DID  []string    `json:"did"`  // AT Protocol DID (e.g. "did:plc:abc123")
	Col  []string    `json:"col"`  // Collection NSID (e.g. "app.bsky.feed.post")
	Rkey []string    `json:"rkey"` // Record key
	Lang []string    `json:"lang"` // Detected language ("en", "de")
	C    [][]float32 `json:"c"`    // Cluster embeddings (128d open, 768d token)
	R    [][]float32 `json:"r"`    // Retrieval embeddings (128d open, 768d token)
}

// Len returns the number of items in the batch.
//
// The count is taken from the DID column only; the other columns are not
// inspected here and are expected to be parallel arrays of the same length.
func (b *Batch) Len() int { return len(b.DID) }

// ATURI returns the AT URI for item i, in the form "at://<did>/<collection>/<rkey>".
//
// It indexes DID, Col and Rkey directly, so it panics if i is out of range
// for any of those three columns.
func (b *Batch) ATURI(i int) string {
	return fmt.Sprintf("at://%s/%s/%s", b.DID[i], b.Col[i], b.Rkey[i])
}