// Example client for the highly experimental divepool embedding firehose.
1package main
2
3import "fmt"
4
// Batch is the columnar wire format for the embedding firehose.
//
// Every field is a parallel slice: index i across all fields describes one
// text. Nothing in this type enforces that the slices have equal length.
//
// Embeddings are EmbeddingGemma-300m (Matryoshka) float32 vectors. Each text
// carries two of them:
//   - C (cluster): for clustering/similarity analysis
//   - R (retrieval): for semantic search
//
// Open clients receive 128d vectors (L2-normalized Matryoshka truncation).
// Token-authenticated clients receive the full 768d vectors.
type Batch struct {
	DID  []string    `json:"did"`  // AT Protocol DID (e.g. "did:plc:abc123")
	Col  []string    `json:"col"`  // Collection NSID (e.g. "app.bsky.feed.post")
	Rkey []string    `json:"rkey"` // Record key
	Lang []string    `json:"lang"` // Detected language (e.g. "en", "de")
	C    [][]float32 `json:"c"`    // Cluster embeddings (128d open, 768d token)
	R    [][]float32 `json:"r"`    // Retrieval embeddings (128d open, 768d token)
}
22
23// Len returns the number of items in the batch.
24func (b *Batch) Len() int { return len(b.DID) }
25
26// ATURI returns the AT URI for item i.
27func (b *Batch) ATURI(i int) string {
28 return fmt.Sprintf("at://%s/%s/%s", b.DID[i], b.Col[i], b.Rkey[i])
29}