···1313 -- Content identifier from the record
1414 cid String,
15151616+ -- Repository revision (TID) - monotonically increasing per DID, used for dedup/ordering
1717+ rev String,
1818+1619 -- Full record as native JSON (schema-flexible, queryable with record.field.subfield)
1720 record JSON,
1821···2831 -- When we indexed this record
2932 indexed_at DateTime64(3) DEFAULT now64(3),
30333434+ -- Validation state: 'unchecked', 'valid', 'invalid_rev', 'invalid_gap', 'invalid_account'
3535+ -- Populated by async batch validation, not in the hot path
3636+ validation_state LowCardinality(String) DEFAULT 'unchecked',
3737+3138 -- Materialized AT URI for convenience
3232- uri String MATERIALIZED concat('at://', did, '/', collection, '/', rkey)
3939+ uri String MATERIALIZED concat('at://', did, '/', collection, '/', rkey),
4040+4141+ -- Projection for fast delete lookups by (did, cid)
4242+ -- Delete events include CID, so we can efficiently look up the original record
4343+ -- to know what to decrement (e.g., which notebook's like count); see the example query after this table
4444+ PROJECTION by_did_cid (
4545+ SELECT * ORDER BY (did, cid)
4646+ )
3347)
3448ENGINE = ReplacingMergeTree(indexed_at)
3535-ORDER BY (collection, did, rkey, indexed_at);
4949+ORDER BY (collection, did, rkey, event_time)
5050+SETTINGS deduplicate_merge_projection_mode = 'drop';
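-- Example read path (illustrative, not DDL; the did/cid literals are placeholders):
-- the delete lookup the indexer performs. ClickHouse can answer it from the by_did_cid
-- projection's (did, cid) ordering rather than the table's (collection, did, rkey, event_time) order.
SELECT did, collection, rkey, record
FROM raw_records
WHERE did = 'did:plc:example' AND cid = 'bafyexamplecid'
ORDER BY event_time DESC
LIMIT 1;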
···11+-- Per-account revision state tracking
22+-- Maintains latest rev/cid per DID for dedup and gap detection
33+--
44+-- AggregatingMergeTree with incremental MV from raw_records
55+-- Query with argMaxMerge/maxMerge to finalize aggregates
66+77+CREATE TABLE IF NOT EXISTS account_rev_state (
88+ -- Account DID
99+ did String,
1010+1111+ -- Latest revision (TID) seen for this account
1212+ last_rev AggregateFunction(argMax, String, DateTime64(3)),
1313+1414+ -- CID of the latest revision
1515+ last_cid AggregateFunction(argMax, String, DateTime64(3)),
1616+1717+ -- Latest sequence number seen
1818+ last_seq AggregateFunction(max, UInt64),
1919+2020+ -- Latest event time seen
2121+ last_event_time AggregateFunction(max, DateTime64(3))
2222+)
2323+ENGINE = AggregatingMergeTree()
2424+ORDER BY did
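-- Example read (illustrative): finalize the aggregate state, as the indexer does on
-- startup when warming its in-memory RevCache. argMaxMerge/maxMerge collapse the
-- AggregateFunction columns above into plain values.
SELECT
    did,
    argMaxMerge(last_rev) AS last_rev,
    argMaxMerge(last_cid) AS last_cid,
    maxMerge(last_seq) AS last_seq,
    maxMerge(last_event_time) AS last_event_time
FROM account_rev_state
GROUP BY did;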
···11+-- Incremental MV: fires on each insert to raw_records, maintains aggregate state
22+-- Must be created after both account_rev_state (target) and raw_records (source) exist
33+44+CREATE MATERIALIZED VIEW IF NOT EXISTS account_rev_state_mv TO account_rev_state AS
55+SELECT
66+ did,
77+ argMaxState(rev, event_time) as last_rev,
88+ argMaxState(cid, event_time) as last_cid,
99+ maxState(seq) as last_seq,
1010+ maxState(event_time) as last_event_time
1111+FROM raw_records
1212+GROUP BY did
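-- Illustrative backfill (an assumption, only relevant if raw_records already holds data
-- when this MV is created - an incremental MV only sees rows inserted after it exists):
INSERT INTO account_rev_state
SELECT
    did,
    argMaxState(rev, event_time) AS last_rev,
    argMaxState(cid, event_time) AS last_cid,
    maxState(seq) AS last_seq,
    maxState(event_time) AS last_event_time
FROM raw_records
GROUP BY did;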
···11use crate::error::{ConfigError, IndexError};
22+use dashmap::DashSet;
23use url::Url;
3445/// ClickHouse connection configuration
···8485 }
8586}
86878888+use smol_str::{SmolStr, ToSmolStr};
8989+9090+/// Pre-parsed collection filter for efficient matching
9191+#[derive(Debug, Clone)]
9292+pub struct CollectionFilter {
9393+ /// Prefix patterns (from "foo.*" -> "foo.")
9494+ prefixes: Vec<SmolStr>,
9595+ /// Exact match patterns (DashSet for O(1) lookup)
9696+ exact: DashSet<SmolStr>,
9797+ /// True if filter is empty (accept all)
9898+ accept_all: bool,
9999+}
100100+101101+impl CollectionFilter {
102102+ /// Parse filter patterns into prefixes and exact matches
103103+ pub fn new(patterns: Vec<SmolStr>) -> Self {
104104+ let mut prefixes = Vec::new();
105105+ let exact = DashSet::new();
106106+107107+ for pattern in patterns {
108108+ if let Some(prefix) = pattern.strip_suffix('*') {
109109+ prefixes.push(SmolStr::new(prefix));
110110+ } else {
111111+ exact.insert(SmolStr::new(&pattern));
112112+ }
113113+ }
114114+115115+ let accept_all = prefixes.is_empty() && exact.is_empty();
116116+ Self {
117117+ prefixes,
118118+ exact,
119119+ accept_all,
120120+ }
121121+ }
122122+123123+ /// Check if a collection matches any pattern
124124+ #[inline]
125125+ pub fn matches(&self, collection: &str) -> bool {
126126+ if self.accept_all {
127127+ return true;
128128+ }
129129+130130+ // O(1) exact match check first
131131+ if self.exact.contains(collection) {
132132+ return true;
133133+ }
134134+135135+ // Prefix check - for small N, linear scan is fine
136136+ // Accumulate without early return to help branch predictor
137137+ let mut matched = false;
138138+ for prefix in &self.prefixes {
139139+ matched |= collection.starts_with(prefix.as_str());
140140+ }
141141+ matched
142142+ }
143143+}
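// A minimal usage sketch of the filter's matching rules; the collection names here are
// illustrative, not taken from a real firehose.
#[cfg(test)]
mod collection_filter_example {
    use super::*;

    #[test]
    fn matches_prefix_and_exact_patterns() {
        let filter = CollectionFilter::new(vec![
            SmolStr::new_static("sh.weaver.*"),
            SmolStr::new_static("app.bsky.actor.profile"),
        ]);
        assert!(filter.matches("sh.weaver.notebook.like")); // prefix match on "sh.weaver."
        assert!(filter.matches("app.bsky.actor.profile")); // exact match
        assert!(!filter.matches("app.bsky.feed.post")); // no pattern matches
        assert!(CollectionFilter::new(vec![]).matches("anything")); // empty filter accepts all
    }
}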
144144+145145+/// Indexer runtime configuration
146146+#[derive(Debug, Clone)]
147147+pub struct IndexerConfig {
148148+ /// Maximum records to batch before flushing to ClickHouse
149149+ pub batch_size: usize,
150150+ /// Maximum time (ms) before flushing even if batch isn't full
151151+ pub flush_interval_ms: u64,
152152+ /// Collection filter (pre-parsed patterns)
153153+ pub collections: CollectionFilter,
154154+}
155155+156156+impl Default for IndexerConfig {
157157+ fn default() -> Self {
158158+ Self {
159159+ batch_size: 1000,
160160+ flush_interval_ms: 1000,
161161+ collections: CollectionFilter::new(vec![
162162+ SmolStr::new_static("sh.weaver.*"),
163163+ SmolStr::new_static("app.bsky.actor.profile"),
164164+ ]),
165165+ }
166166+ }
167167+}
168168+169169+impl IndexerConfig {
170170+ /// Load configuration from environment variables.
171171+ ///
172172+ /// Optional env vars:
173173+ /// - `INDEXER_BATCH_SIZE`: Max records per batch (default: 1000)
174174+ /// - `INDEXER_FLUSH_INTERVAL_MS`: Max ms between flushes (default: 1000)
175175+ /// - `INDEXER_COLLECTIONS`: Comma-separated collection patterns (default: sh.weaver.*,app.bsky.actor.profile).
176176+ ///   Use a `*` suffix for prefix matching; e.g., "sh.weaver.*" matches any collection under the sh.weaver namespace
177177+ pub fn from_env() -> Self {
178178+ let batch_size = std::env::var("INDEXER_BATCH_SIZE")
179179+ .ok()
180180+ .and_then(|s| s.parse().ok())
181181+ .unwrap_or(1000);
182182+183183+ let flush_interval_ms = std::env::var("INDEXER_FLUSH_INTERVAL_MS")
184184+ .ok()
185185+ .and_then(|s| s.parse().ok())
186186+ .unwrap_or(1000);
187187+188188+ let patterns: Vec<SmolStr> = std::env::var("INDEXER_COLLECTIONS")
189189+ .map(|s| s.split(',').map(|p| p.trim().to_smolstr()).collect())
190190+ .unwrap_or_else(|_| {
191191+ vec![
192192+ SmolStr::new_static("sh.weaver.*"),
193193+ SmolStr::new_static("app.bsky.actor.profile"),
194194+ ]
195195+ });
196196+197197+ Self {
198198+ batch_size,
199199+ flush_interval_ms,
200200+ collections: CollectionFilter::new(patterns),
201201+ }
202202+ }
203203+}
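// Example (illustrative): environment a deployment might set, and what from_env yields.
// The values are arbitrary; unset variables fall back to the defaults above.
//
//   INDEXER_BATCH_SIZE=5000
//   INDEXER_FLUSH_INTERVAL_MS=250
//   INDEXER_COLLECTIONS="sh.weaver.*,app.bsky.actor.profile"
//
// let config = IndexerConfig::from_env();
// assert_eq!(config.batch_size, 5000);
// assert_eq!(config.flush_interval_ms, 250);
// assert!(config.collections.matches("sh.weaver.notebook"));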
204204+87205/// Combined configuration for the indexer
88206#[derive(Debug, Clone)]
89207pub struct Config {
90208 pub clickhouse: ClickHouseConfig,
91209 pub firehose: FirehoseConfig,
210210+ pub indexer: IndexerConfig,
92211}
9321294213impl Config {
···97216 Ok(Self {
98217 clickhouse: ClickHouseConfig::from_env()?,
99218 firehose: FirehoseConfig::from_env()?,
219219+ indexer: IndexerConfig::from_env(),
100220 })
101221 }
102222}
+2-2
crates/weaver-index/src/firehose.rs
···22mod records;
3344pub use consumer::{
55- FirehoseConsumer, MessageStream, SubscribeReposMessage, Commit, Identity, Account, Sync,
55+ Account, Commit, FirehoseConsumer, Identity, MessageStream, SubscribeReposMessage, Sync,
66};
77-pub use records::{extract_records, ExtractedRecord};
77+pub use records::{ExtractedRecord, extract_records};
+10-6
crates/weaver-index/src/firehose/records.rs
···11use crate::error::{CarError, IndexError};
22use bytes::Bytes;
33+use chrono::{DateTime, Utc};
34use jacquard_repo::car::reader::parse_car_bytes;
45use smol_str::{SmolStr, ToSmolStr};
56···1516 /// Record key within the collection
1617 pub rkey: SmolStr,
1718 /// Content identifier
1818- pub cid: String,
1919+ pub cid: SmolStr,
2020+ /// Repository revision (TID) - monotonically increasing per DID
2121+ pub rev: SmolStr,
1922 /// Operation type: "create", "update", or "delete"
2023 pub operation: SmolStr,
2124 /// Raw DAG-CBOR bytes of the record (None for deletes)
2225 pub cbor_bytes: Option<Bytes>,
2326 /// Sequence number from the firehose event
2427 pub seq: i64,
2525- /// Event timestamp (milliseconds since epoch)
2626- pub event_time_ms: i64,
2828+ /// Event timestamp
2929+ pub event_time: DateTime<Utc>,
2730}
28312932impl ExtractedRecord {
···6164 message: e.to_string(),
6265 })?;
63666464- let event_time_ms = commit.time.as_ref().timestamp_millis();
6767+ let event_time = commit.time.as_ref().with_timezone(&Utc);
6568 let mut records = Vec::with_capacity(commit.ops.len());
66696770 for op in &commit.ops {
···7780 };
78817982 let operation = op.action.to_smolstr();
8080- let cid_str = op.cid.as_ref().map(|c| c.to_string()).unwrap_or_default();
8383+ let cid_str = op.cid.as_ref().map(|c| c.to_smolstr()).unwrap_or_default();
81848285 // For creates/updates, look up the record in the CAR blocks
8386 let cbor_bytes = if let Some(cid_link) = &op.cid {
···97100 collection,
98101 rkey,
99102 cid: cid_str,
103103+ rev: commit.rev.to_smolstr(),
100104 operation,
101105 cbor_bytes,
102106 seq: commit.seq,
103103- event_time_ms,
107107+ event_time,
104108 });
105109 }
106110
+512
crates/weaver-index/src/indexer.rs
···11+use std::sync::Arc;
22+use std::time::{Duration, Instant};
33+44+use chrono::{DateTime, Utc};
55+use dashmap::DashMap;
66+use n0_future::StreamExt;
77+use smol_str::{SmolStr, ToSmolStr};
88+use tracing::{debug, info, warn};
1111+1212+use crate::clickhouse::{
1313+ AccountRevState, Client, FirehoseCursor, RawAccountEvent, RawIdentityEvent, RawRecordInsert,
1414+};
1515+use crate::config::IndexerConfig;
1616+use crate::error::{IndexError, Result};
1717+use crate::firehose::{
1818+ Account, Commit, ExtractedRecord, FirehoseConsumer, Identity, MessageStream,
1919+ SubscribeReposMessage, extract_records,
2020+};
2121+2222+/// Default consumer ID for cursor tracking
2323+const CONSUMER_ID: &str = "main";
2424+2525+/// Per-account revision state for deduplication
2626+#[derive(Debug, Clone)]
2727+pub struct RevState {
2828+ pub last_rev: SmolStr,
2929+ pub last_cid: SmolStr,
3030+}
3131+3232+/// In-memory cache of per-account revision state
3333+///
3434+/// Used for fast deduplication without hitting ClickHouse on every event.
3535+/// Populated from account_rev_state table on startup, updated as events are processed.
3636+pub struct RevCache {
3737+ inner: DashMap<SmolStr, RevState>,
3838+}
3939+4040+impl RevCache {
4141+ pub fn new() -> Self {
4242+ Self {
4343+ inner: DashMap::new(),
4444+ }
4545+ }
4646+4747+ /// Load cache from ClickHouse account_rev_state table
4848+ pub async fn load_from_clickhouse(client: &Client) -> Result<Self> {
4949+ let query = r#"
5050+ SELECT
5151+ did,
5252+ argMaxMerge(last_rev) as last_rev,
5353+ argMaxMerge(last_cid) as last_cid,
5454+ maxMerge(last_seq) as last_seq,
5555+ maxMerge(last_event_time) as last_event_time
5656+ FROM account_rev_state
5757+ GROUP BY did
5858+ "#;
5959+6060+ let rows: Vec<AccountRevState> =
6161+ client.inner().query(query).fetch_all().await.map_err(|e| {
6262+ IndexError::ClickHouse(crate::error::ClickHouseError::Query {
6363+ message: "failed to load account rev state".into(),
6464+ source: e,
6565+ })
6666+ })?;
6767+6868+ let cache = Self::new();
6969+ for row in rows {
7070+ cache.inner.insert(
7171+ SmolStr::new(&row.did),
7272+ RevState {
7373+ last_rev: SmolStr::new(&row.last_rev),
7474+ last_cid: SmolStr::new(&row.last_cid),
7575+ },
7676+ );
7777+ }
7878+7979+ info!(
8080+ accounts = cache.inner.len(),
8181+ "loaded rev cache from clickhouse"
8282+ );
8383+ Ok(cache)
8484+ }
8585+8686+ /// Check if we should process this commit (returns false if already seen)
8787+ pub fn should_process(&self, did: &str, rev: &str) -> bool {
8888+ match self.inner.get(did) {
8989+ Some(state) => rev > state.last_rev.as_str(),
9090+ None => true, // new account, always process
9191+ }
9292+ }
9393+9494+ /// Update cache after processing a commit
9595+ pub fn update(&self, did: &SmolStr, rev: &SmolStr, cid: &SmolStr) {
9696+ self.inner.insert(
9797+ did.clone(),
9898+ RevState {
9999+ last_rev: rev.clone(),
100100+ last_cid: cid.clone(),
101101+ },
102102+ );
103103+ }
104104+105105+ /// Get current cache size (number of accounts tracked)
106106+ pub fn len(&self) -> usize {
107107+ self.inner.len()
108108+ }
109109+110110+ pub fn is_empty(&self) -> bool {
111111+ self.inner.is_empty()
112112+ }
113113+}
114114+115115+impl Default for RevCache {
116116+ fn default() -> Self {
117117+ Self::new()
118118+ }
119119+}
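// A sketch of the dedup contract; the DID and TID-style revs below are made up.
// Plain string comparison works because TIDs sort lexicographically in creation order.
#[cfg(test)]
mod rev_cache_example {
    use super::*;

    #[test]
    fn skips_replayed_and_older_revs() {
        let cache = RevCache::new();
        let did = SmolStr::new("did:plc:example");

        assert!(cache.should_process(&did, "3l5x2abc")); // unseen account: process
        cache.update(&did, &SmolStr::new("3l5x2abc"), &SmolStr::new("bafyexamplecid"));

        assert!(!cache.should_process(&did, "3l5x2abc")); // same rev already seen: skip
        assert!(!cache.should_process(&did, "3l5w9zzz")); // older rev: skip
        assert!(cache.should_process(&did, "3l5x3def")); // newer rev: process
    }
}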
120120+121121+/// Safety margin when resuming - back up this many sequence numbers
122122+/// to ensure no gaps from incomplete batches or race conditions
123123+const CURSOR_REWIND: i64 = 1000;
124124+125125+/// Load cursor from ClickHouse for resuming
126126+///
127127+/// Returns cursor with safety margin subtracted to ensure overlap
128128+pub async fn load_cursor(client: &Client) -> Result<Option<i64>> {
129129+ let query = format!(
130130+ r#"
131131+ SELECT consumer_id, seq, event_time
132132+ FROM firehose_cursor FINAL
133133+ WHERE consumer_id = '{}'
134134+ LIMIT 1
135135+ "#,
136136+ CONSUMER_ID
137137+ );
138138+139139+ let cursor: Option<FirehoseCursor> = client
140140+ .inner()
141141+ .query(&query)
142142+ .fetch_optional()
143143+ .await
144144+ .map_err(|e| crate::error::ClickHouseError::Query {
145145+ message: "failed to load cursor".into(),
146146+ source: e,
147147+ })?;
148148+149149+ if let Some(c) = &cursor {
150150+ let resume_at = (c.seq as i64).saturating_sub(CURSOR_REWIND);
151151+ info!(
152152+ saved_seq = c.seq,
153153+ resume_seq = resume_at,
154154+ rewind = CURSOR_REWIND,
155155+ "loaded cursor from clickhouse (with safety margin)"
156156+ );
157157+ Ok(Some(resume_at))
158158+ } else {
159159+ Ok(None)
160160+ }
161161+}
162162+163163+/// Main indexer that consumes firehose and writes to ClickHouse
164164+pub struct Indexer {
165165+ client: Arc<Client>,
166166+ consumer: FirehoseConsumer,
167167+ rev_cache: RevCache,
168168+ config: IndexerConfig,
169169+}
170170+171171+impl Indexer {
172172+ /// Create a new indexer
173173+ pub async fn new(
174174+ client: Client,
175175+ consumer: FirehoseConsumer,
176176+ config: IndexerConfig,
177177+ ) -> Result<Self> {
178178+ let client = Arc::new(client);
179179+180180+ // Load rev cache from ClickHouse
181181+ let rev_cache = RevCache::load_from_clickhouse(&client).await?;
182182+183183+ Ok(Self {
184184+ client,
185185+ consumer,
186186+ rev_cache,
187187+ config,
188188+ })
189189+ }
190190+191191+ /// Save cursor to ClickHouse
192192+ async fn save_cursor(&self, seq: u64, event_time: DateTime<Utc>) -> Result<()> {
193193+ let query = format!(
194194+ "INSERT INTO firehose_cursor (consumer_id, seq, event_time) VALUES ('{}', {}, fromUnixTimestamp64Milli({}))",
195195+ CONSUMER_ID,
196196+ seq,
197197+ event_time.timestamp_millis()
198198+ );
199199+200200+ self.client.execute(&query).await?;
201201+ debug!(seq, "saved cursor");
202202+ Ok(())
203203+ }
204204+205205+ /// Run the indexer loop
206206+ pub async fn run(&self) -> Result<()> {
207207+ info!("connecting to firehose...");
208208+ let mut stream: MessageStream = self.consumer.connect().await?;
209209+210210+ // Inserters handle batching internally based on config
211211+ let mut records = self.client.inserter::<RawRecordInsert>("raw_records");
212212+ let mut identities = self
213213+ .client
214214+ .inserter::<RawIdentityEvent>("raw_identity_events");
215215+ let mut accounts = self
216216+ .client
217217+ .inserter::<RawAccountEvent>("raw_account_events");
218218+219219+ // Stats and cursor tracking
220220+ let mut processed: u64 = 0;
221221+ let mut skipped: u64 = 0;
222222+ let mut last_seq: u64 = 0;
223223+ let mut last_event_time = Utc::now();
224224+ let mut last_stats = Instant::now();
225225+ let mut last_cursor_save = Instant::now();
226226+227227+ info!("starting indexer loop");
228228+229229+ while let Some(result) = stream.next().await {
230230+ let msg = match result {
231231+ Ok(msg) => msg,
232232+ Err(e) => {
233233+ warn!(error = ?e, "firehose stream error");
234234+ continue;
235235+ }
236236+ };
237237+238238+ // Track seq from any message type that has it
239239+ match &msg {
240240+ SubscribeReposMessage::Commit(c) => {
241241+ last_seq = c.seq as u64;
242242+ last_event_time = c.time.as_ref().with_timezone(&Utc);
243243+ }
244244+ SubscribeReposMessage::Identity(i) => {
245245+ last_seq = i.seq as u64;
246246+ last_event_time = i.time.as_ref().with_timezone(&Utc);
247247+ }
248248+ SubscribeReposMessage::Account(a) => {
249249+ last_seq = a.seq as u64;
250250+ last_event_time = a.time.as_ref().with_timezone(&Utc);
251251+ }
252252+ _ => {}
253253+ }
254254+255255+ match msg {
256256+ SubscribeReposMessage::Commit(commit) => {
257257+ if self
258258+ .process_commit(&commit, &mut records, &mut skipped)
259259+ .await?
260260+ {
261261+ processed += 1;
262262+ }
263263+ }
264264+ SubscribeReposMessage::Identity(identity) => {
265265+ write_identity(&identity, &mut identities).await?;
266266+ }
267267+ SubscribeReposMessage::Account(account) => {
268268+ write_account(&account, &mut accounts).await?;
269269+ }
270270+ SubscribeReposMessage::Sync(_) => {
271271+ debug!("received sync (tooBig) event, skipping");
272272+ }
273273+ _ => {}
274274+ }
            // commit() flushes an inserter once its internal thresholds are met, otherwise it's a no-op
            for res in [
                records.commit().await,
                identities.commit().await,
                accounts.commit().await,
            ] {
                res.map_err(|e| crate::error::ClickHouseError::Query {
                    message: "commit failed".into(),
                    source: e,
                })?;
            }
284284+285285+ // Periodic stats and cursor save (every 10s)
286286+ if last_stats.elapsed() >= Duration::from_secs(10) {
287287+ info!(
288288+ processed,
289289+ skipped,
290290+ last_seq,
291291+ rev_cache_size = self.rev_cache.len(),
292292+ "indexer stats"
293293+ );
294294+ last_stats = Instant::now();
295295+ }
296296+297297+ // Save cursor every 30s
298298+ if last_cursor_save.elapsed() >= Duration::from_secs(30) && last_seq > 0 {
299299+ if let Err(e) = self.save_cursor(last_seq, last_event_time).await {
300300+ warn!(error = ?e, "failed to save cursor");
301301+ }
302302+ last_cursor_save = Instant::now();
303303+ }
304304+ }
305305+306306+ // Final flush
307307+ records
308308+ .end()
309309+ .await
310310+ .map_err(|e| crate::error::ClickHouseError::Query {
311311+ message: "final flush failed".into(),
312312+ source: e,
313313+ })?;
314314+ identities
315315+ .end()
316316+ .await
317317+ .map_err(|e| crate::error::ClickHouseError::Query {
318318+ message: "final flush failed".into(),
319319+ source: e,
320320+ })?;
321321+ accounts
322322+ .end()
323323+ .await
324324+ .map_err(|e| crate::error::ClickHouseError::Query {
325325+ message: "final flush failed".into(),
326326+ source: e,
327327+ })?;
328328+329329+ // Final cursor save
330330+ if last_seq > 0 {
331331+ self.save_cursor(last_seq, last_event_time).await?;
332332+ }
333333+334334+ info!(last_seq, "firehose stream ended");
335335+ Ok(())
336336+ }
337337+338338+ async fn process_commit(
339339+ &self,
340340+ commit: &Commit<'_>,
341341+ inserter: &mut clickhouse::inserter::Inserter<RawRecordInsert>,
342342+ skipped: &mut u64,
343343+ ) -> Result<bool> {
344344+ let did = commit.repo.as_ref();
345345+ let rev = commit.rev.as_ref();
346346+347347+ // Dedup check
348348+ if !self.rev_cache.should_process(did, rev) {
349349+ *skipped += 1;
350350+ return Ok(false);
351351+ }
352352+353353+ // Extract and write records
354354+ for record in extract_records(commit).await? {
355355+ // Collection filter - skip early before JSON conversion
356356+ if !self.config.collections.matches(&record.collection) {
357357+ continue;
358358+ }
359359+360360+ let json = record.to_json()?.unwrap_or_else(|| "{}".to_string());
361361+362362+ // Fire and forget delete handling
363363+ if record.operation == "delete" {
364364+ let client = self.client.clone();
365365+ let record_clone = record.clone();
366366+ tokio::spawn(async move {
367367+ if let Err(e) = handle_delete(&client, record_clone).await {
368368+ warn!(error = ?e, "delete handling failed");
369369+ }
370370+ });
371371+ }
372372+373373+ inserter
374374+ .write(&RawRecordInsert {
375375+ did: record.did.clone(),
376376+ collection: record.collection.clone(),
377377+ rkey: record.rkey.clone(),
378378+ cid: record.cid.clone(),
379379+ rev: record.rev.clone(),
380380+ record: json.to_smolstr(),
381381+ operation: record.operation.clone(),
382382+ seq: record.seq as u64,
383383+ event_time: record.event_time,
384384+ })
385385+ .await
386386+ .map_err(|e| crate::error::ClickHouseError::Query {
387387+ message: "write failed".into(),
388388+ source: e,
389389+ })?;
390390+ }
391391+392392+ // Update rev cache
393393+ self.rev_cache.update(
394394+ &SmolStr::new(did),
395395+ &SmolStr::new(rev),
396396+ &commit.commit.0.to_smolstr(),
397397+ );
398398+399399+ Ok(true)
400400+ }
401401+}
402402+403403+async fn write_identity(
404404+ identity: &Identity<'_>,
405405+ inserter: &mut clickhouse::inserter::Inserter<RawIdentityEvent>,
406406+) -> Result<()> {
407407+ inserter
408408+ .write(&RawIdentityEvent {
409409+ did: identity.did.to_smolstr(),
410410+ handle: identity
411411+ .handle
412412+ .as_ref()
413413+ .map(|h| h.as_ref().to_smolstr())
414414+ .unwrap_or_default(),
415415+ seq: identity.seq as u64,
416416+ event_time: identity.time.as_ref().with_timezone(&Utc),
417417+ })
418418+ .await
419419+ .map_err(|e| crate::error::ClickHouseError::Query {
420420+ message: "write failed".into(),
421421+ source: e,
422422+ })?;
423423+ Ok(())
424424+}
425425+426426+async fn write_account(
427427+ account: &Account<'_>,
428428+ inserter: &mut clickhouse::inserter::Inserter<RawAccountEvent>,
429429+) -> Result<()> {
430430+ inserter
431431+ .write(&RawAccountEvent {
432432+ did: account.did.to_smolstr(),
433433+ active: if account.active { 1 } else { 0 },
434434+ status: account
435435+ .status
436436+ .as_ref()
437437+ .map(|s| s.as_ref().to_smolstr())
438438+ .unwrap_or_default(),
439439+ seq: account.seq as u64,
440440+ event_time: account.time.as_ref().with_timezone(&Utc),
441441+ })
442442+ .await
443443+ .map_err(|e| crate::error::ClickHouseError::Query {
444444+ message: "write failed".into(),
445445+ source: e,
446446+ })?;
447447+ Ok(())
448448+}
455455+/// Minimal struct for delete lookups - just the fields we need to process the delete
456456+#[derive(Debug, Clone, clickhouse::Row, serde::Deserialize)]
457457+struct LookupRawRecord {
458458+ did: SmolStr,
459459+ collection: SmolStr,
460460+ rkey: SmolStr,
461461+ record: SmolStr, // JSON string of the original record
462462+}
/// Handle a delete event with poll-then-stub logic
///
/// For deletes, we need to look up the original record to know what was deleted
/// (e.g., which notebook a like was for). If the record doesn't exist yet
/// (out-of-order events), we poll for up to 15 seconds before creating a stub tombstone.
464464+async fn handle_delete(client: &Client, record: ExtractedRecord) -> Result<()> {
465465+ let deadline = Instant::now() + Duration::from_secs(15);
466466+467467+ loop {
468468+ // Try to find the record by CID
469469+ let query = format!(
470470+ r#"
471471+ SELECT did, collection, rkey, record
472472+ FROM raw_records
473473+ WHERE did = '{}' AND cid = '{}'
474474+ ORDER BY event_time DESC
475475+ LIMIT 1
476476+ "#,
477477+ record.did, record.cid
478478+ );
479479+480480+ let original: Option<LookupRawRecord> = client
481481+ .inner()
482482+ .query(&query)
483483+ .fetch_optional()
484484+ .await
485485+ .map_err(|e| crate::error::ClickHouseError::Query {
486486+ message: "delete lookup failed".into(),
487487+ source: e,
488488+ })?;
489489+490490+ if let Some(_original) = original {
491491+ // Found the record - the main insert path already handles creating
492492+ // the delete row, so we're done. In phase 2, this is where we'd
493493+ // parse original.record and insert count deltas for denormalized tables.
494494+ debug!(did = %record.did, cid = %record.cid, "delete found original record");
495495+ return Ok(());
496496+ }
497497+498498+ if Instant::now() > deadline {
499499+ // Gave up - create stub tombstone
500500+ // The record will be inserted via the main batch path with operation='delete'
501501+ // and empty record content, which serves as our stub tombstone
502502+ warn!(
503503+ did = %record.did,
504504+ cid = %record.cid,
505505+ "delete timeout, stub tombstone will be created"
506506+ );
507507+ return Ok(());
508508+ }
509509+510510+ tokio::time::sleep(Duration::from_secs(1)).await;
511511+ }
512512+}
+2
crates/weaver-index/src/lib.rs
···22pub mod config;
33pub mod error;
44pub mod firehose;
55+pub mod indexer;
5667pub use config::Config;
78pub use error::{IndexError, Result};
99+pub use indexer::{Indexer, load_cursor};
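// Example (sketch): the startup sequence these exports imply, given a `client`
// (crate::clickhouse::Client) and `consumer` (FirehoseConsumer) built beforehand;
// their construction and the hand-off of the resumed cursor to the consumer are
// assumed to happen elsewhere.
//
// let config = weaver_index::Config::from_env()?;
// let _resume_seq = weaver_index::load_cursor(&client).await?; // Some(saved_seq - CURSOR_REWIND) if a cursor exists
// let indexer = weaver_index::Indexer::new(client, consumer, config.indexer).await?;
// indexer.run().await?;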