parallel tap

Orual 2064e111 e86033fb

+299 -227
+19 -4
crates/weaver-index/src/bin/weaver_indexer.rs
···
  6 };
  7 use weaver_index::firehose::FirehoseConsumer;
  8 use weaver_index::server::{AppState, ServerConfig, TelemetryConfig, telemetry};
  9 use weaver_index::{FirehoseIndexer, TapIndexer, load_cursor};
 10
 11 #[derive(Parser)]
···
166         }
167         SourceMode::Tap => {
168             let tap_config = TapConfig::from_env()?;
169 -           let indexer = TapIndexer::new(indexer_client, tap_config, indexer_config);
170 -           info!("Starting tap indexer");
171             tokio::spawn(async move { indexer.run().await })
172         }
173     };
···
239     tap_config: TapConfig,
240     indexer_config: IndexerConfig,
241 ) -> miette::Result<()> {
242 -   let indexer = TapIndexer::new(client, tap_config, indexer_config);
243
244 -   info!("Starting tap indexer");
245     indexer.run().await?;
246
247     Ok(())
···
  6 };
  7 use weaver_index::firehose::FirehoseConsumer;
  8 use weaver_index::server::{AppState, ServerConfig, TelemetryConfig, telemetry};
  9 + use weaver_index::clickhouse::InserterConfig;
 10 use weaver_index::{FirehoseIndexer, TapIndexer, load_cursor};
 11
 12 #[derive(Parser)]
···
167         }
168         SourceMode::Tap => {
169             let tap_config = TapConfig::from_env()?;
170 +           let num_workers = tap_config.num_workers;
171 +           let indexer = TapIndexer::new(
172 +               indexer_client,
173 +               tap_config,
174 +               InserterConfig::default(),
175 +               indexer_config,
176 +               num_workers,
177 +           );
178 +           info!("Starting tap indexer with {} workers", num_workers);
179             tokio::spawn(async move { indexer.run().await })
180         }
181     };
···
247     tap_config: TapConfig,
248     indexer_config: IndexerConfig,
249 ) -> miette::Result<()> {
250 +   let num_workers = tap_config.num_workers;
251 +   let indexer = TapIndexer::new(
252 +       client,
253 +       tap_config,
254 +       InserterConfig::default(),
255 +       indexer_config,
256 +       num_workers,
257 +   );
258
259 +   info!("Starting tap indexer with {} workers", num_workers);
260     indexer.run().await?;
261
262     Ok(())
-1
crates/weaver-index/src/clickhouse/resilient_inserter.rs
···
513
514 #[cfg(test)]
515 mod tests {
516 -     use super::*;
517
518     #[test]
519     fn test_extract_failing_row() {
···
513
514 #[cfg(test)]
515 mod tests {
516
517     #[test]
518     fn test_extract_failing_row() {
+14 -1
crates/weaver-index/src/config.rs
···
207 pub struct TapConfig {
208     pub url: Url,
209     pub send_acks: bool,
210 }
211
212 impl TapConfig {
213     /// Default tap URL (local)
214     pub const DEFAULT_URL: &'static str = "ws://localhost:2480/channel";
215
216     /// Load configuration from environment variables.
217     ///
218     /// Optional env vars:
219     /// - `TAP_URL`: Tap WebSocket URL (default: ws://localhost:2480/channel)
220     /// - `TAP_SEND_ACKS`: Whether to send acks (default: true)
221     pub fn from_env() -> Result<Self, IndexError> {
222         let url_str = std::env::var("TAP_URL").unwrap_or_else(|_| Self::DEFAULT_URL.to_string());
223
···
230             .map(|s| s.to_lowercase() != "false")
231             .unwrap_or(true);
232
233 -       Ok(Self { url, send_acks })
234     }
235 }
236
···
207 pub struct TapConfig {
208     pub url: Url,
209     pub send_acks: bool,
210 +   pub num_workers: usize,
211 }
212
213 impl TapConfig {
214     /// Default tap URL (local)
215     pub const DEFAULT_URL: &'static str = "ws://localhost:2480/channel";
216 +   /// Default number of parallel workers
217 +   pub const DEFAULT_WORKERS: usize = 4;
218
219     /// Load configuration from environment variables.
220     ///
221     /// Optional env vars:
222     /// - `TAP_URL`: Tap WebSocket URL (default: ws://localhost:2480/channel)
223     /// - `TAP_SEND_ACKS`: Whether to send acks (default: true)
224 +   /// - `TAP_WORKERS`: Number of parallel workers (default: 4)
225     pub fn from_env() -> Result<Self, IndexError> {
226         let url_str = std::env::var("TAP_URL").unwrap_or_else(|_| Self::DEFAULT_URL.to_string());
227
···
234             .map(|s| s.to_lowercase() != "false")
235             .unwrap_or(true);
236
237 +       let num_workers = std::env::var("TAP_WORKERS")
238 +           .ok()
239 +           .and_then(|s| s.parse().ok())
240 +           .unwrap_or(Self::DEFAULT_WORKERS);
241 +
242 +       Ok(Self {
243 +           url,
244 +           send_acks,
245 +           num_workers,
246 +       })
247     }
248 }
249
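The `TAP_WORKERS` value above falls back silently to `DEFAULT_WORKERS` when the variable is unset or unparsable. A small sketch of that behaviour, mirroring the parsing chain rather than calling `from_env` so no process env is mutated; the test name and literal inputs are illustrative and would live next to the config module's other tests, they are not part of this change:

#[test]
fn tap_workers_parsing_falls_back_to_default() {
    // Mirrors the TAP_WORKERS chain from TapConfig::from_env, fed with
    // literal inputs instead of real environment variables.
    let parse = |v: Option<&str>| {
        v.and_then(|s| s.parse::<usize>().ok())
            .unwrap_or(TapConfig::DEFAULT_WORKERS)
    };
    assert_eq!(parse(Some("8")), 8);
    assert_eq!(parse(Some("not-a-number")), TapConfig::DEFAULT_WORKERS);
    assert_eq!(parse(None), TapConfig::DEFAULT_WORKERS);
}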
+2 -220
crates/weaver-index/src/indexer.rs
···
  5 use dashmap::DashMap;
  6 use n0_future::StreamExt;
  7 use smol_str::{SmolStr, ToSmolStr};
  8 - use tracing::{debug, info, trace, warn};
  9
 10 use chrono::DateTime;
 11
···
 14     RawRecordInsert, ResilientRecordInserter,
 15 };
 16 use crate::config::IndexerConfig;
 17 - use crate::config::TapConfig;
 18 - use crate::error::{ClickHouseError, IndexError, Result};
 19 use crate::firehose::{
 20     Account, ExtractedRecord, FirehoseConsumer, Identity, MessageStream, SubscribeReposMessage,
 21     extract_records,
 22 };
 23 - use crate::tap::{TapConfig as TapConsumerConfig, TapConsumer, TapEvent};
 24
 25 /// Default consumer ID for cursor tracking
 26 const CONSUMER_ID: &str = "main";
···
523         tokio::time::sleep(Duration::from_secs(1)).await;
524     }
525 }
526 -
527 - // ============================================================================
528 - // TapIndexer - consumes from tap websocket
529 - // ============================================================================
530 -
531 - /// Consumer ID for tap cursor tracking
532 - const TAP_CONSUMER_ID: &str = "tap";
533 -
534 - /// Tap indexer that consumes from tap websocket and writes to ClickHouse
535 - pub struct TapIndexer {
536 -     client: Arc<Client>,
537 -     tap_config: TapConfig,
538 -     config: IndexerConfig,
539 - }
540 -
541 - impl TapIndexer {
542 -     /// Create a new tap indexer
543 -     pub fn new(client: Client, tap_config: TapConfig, config: IndexerConfig) -> Self {
544 -         Self {
545 -             client: Arc::new(client),
546 -             tap_config,
547 -             config,
548 -         }
549 -     }
550 -
551 -     /// Save tap cursor to ClickHouse for visibility
552 -     async fn save_cursor(&self, seq: u64) -> Result<()> {
553 -         let query = format!(
554 -             "INSERT INTO firehose_cursor (consumer_id, seq, event_time) VALUES ('{}', {}, now64(3))",
555 -             TAP_CONSUMER_ID, seq
556 -         );
557 -
558 -         self.client.execute(&query).await?;
559 -         debug!(seq, "saved tap cursor");
560 -         Ok(())
561 -     }
562 -
563 -     /// Run the tap indexer loop
564 -     pub async fn run(&self) -> Result<()> {
565 -         info!(url = %self.tap_config.url, "connecting to tap...");
566 -
567 -         let consumer_config = TapConsumerConfig::new(self.tap_config.url.clone())
568 -             .with_acks(self.tap_config.send_acks);
569 -         let consumer = TapConsumer::new(consumer_config);
570 -
571 -         let (mut events, ack_tx) = consumer.connect().await?;
572 -
573 -         // Use resilient inserter for records since that's where untrusted JSON enters
574 -         let mut records =
575 -             ResilientRecordInserter::new(self.client.inner().clone(), InserterConfig::default());
576 -         let mut identities = self
577 -             .client
578 -             .inserter::<RawIdentityEvent>("raw_identity_events");
579 -
580 -         let mut processed: u64 = 0;
581 -         let mut last_seq: u64 = 0;
582 -         let mut last_stats = Instant::now();
583 -         let mut last_cursor_save = Instant::now();
584 -
585 -         info!("starting tap indexer loop");
586 -
587 -         loop {
588 -             // Get time until next required flush - must commit before socket timeout (30s)
589 -             let records_time = records.time_left().unwrap_or(Duration::from_secs(10));
590 -             let identities_time = identities.time_left().unwrap_or(Duration::from_secs(10));
591 -             let time_left = records_time.min(identities_time);
592 -
593 -             let event = match tokio::time::timeout(time_left, events.recv()).await {
594 -                 Ok(Some(event)) => event,
595 -                 Ok(None) => {
596 -                     // Channel closed, exit loop
597 -                     break;
598 -                 }
599 -                 Err(_) => {
600 -                     // Timeout - flush inserters to keep INSERT alive
601 -                     trace!("flush timeout, committing inserters");
602 -                     records.commit().await?;
603 -                     identities
604 -                         .commit()
605 -                         .await
606 -                         .map_err(|e| ClickHouseError::Query {
607 -                             message: "periodic identities commit failed".into(),
608 -                             source: e,
609 -                         })?;
610 -                     continue;
611 -                 }
612 -             };
613 -
614 -             let event_id = event.id();
615 -             last_seq = event_id;
616 -
617 -             match event {
618 -                 TapEvent::Record(envelope) => {
619 -                     let record = &envelope.record;
620 -
621 -                     // Collection filter
622 -                     if !self.config.collections.matches(&record.collection) {
623 -                         // Still ack even if filtered
624 -                         let _ = ack_tx.send(event_id).await;
625 -                         continue;
626 -                     }
627 -
628 -                     let json = match &record.record {
629 -                         Some(v) => match serde_json::to_string(v) {
630 -                             Ok(s) => s,
631 -                             Err(e) => {
632 -                                 warn!(
633 -                                     did = %record.did,
634 -                                     collection = %record.collection,
635 -                                     rkey = %record.rkey,
636 -                                     error = ?e,
637 -                                     "failed to serialize record, sending to DLQ"
638 -                                 );
639 -                                 let raw_data = format!(
640 -                                     r#"{{"did":"{}","collection":"{}","rkey":"{}","cid":"{}","error":"serialization_failed"}}"#,
641 -                                     record.did, record.collection, record.rkey, record.cid
642 -                                 );
643 -                                 records
644 -                                     .write_raw_to_dlq(
645 -                                         record.action.as_str().to_smolstr(),
646 -                                         raw_data,
647 -                                         e.to_string(),
648 -                                         event_id,
649 -                                     )
650 -                                     .await?;
651 -                                 let _ = ack_tx.send(event_id).await;
652 -                                 continue;
653 -                             }
654 -                         },
655 -                         None => "{}".to_string(),
656 -                     };
657 -
658 -                     debug!(
659 -                         op = record.action.as_str(),
660 -                         id = event_id,
661 -                         len = json.len(),
662 -                         "writing record"
663 -                     );
664 -
665 -                     records
666 -                         .write(RawRecordInsert {
667 -                             did: record.did.clone(),
668 -                             collection: record.collection.clone(),
669 -                             rkey: record.rkey.clone(),
670 -                             cid: record.cid.clone(),
671 -                             rev: record.rev.clone(),
672 -                             record: json.to_smolstr(),
673 -                             operation: record.action.as_str().to_smolstr(),
674 -                             seq: event_id,
675 -                             event_time: Utc::now(),
676 -                             is_live: record.live,
677 -                         })
678 -                         .await?;
679 -                     records.commit().await?;
680 -
681 -                     processed += 1;
682 -                 }
683 -                 TapEvent::Identity(envelope) => {
684 -                     let identity = &envelope.identity;
685 -
686 -                     identities
687 -                         .write(&RawIdentityEvent {
688 -                             did: identity.did.clone(),
689 -                             handle: identity.handle.clone(),
690 -                             seq: event_id,
691 -                             event_time: Utc::now(),
692 -                         })
693 -                         .await
694 -                         .map_err(|e| ClickHouseError::Query {
695 -                             message: "identity write failed".into(),
696 -                             source: e,
697 -                         })?;
698 -                     identities
699 -                         .commit()
700 -                         .await
701 -                         .map_err(|e| ClickHouseError::Query {
702 -                             message: "identity commit failed".into(),
703 -                             source: e,
704 -                         })?;
705 -                 }
706 -             }
707 -
708 -             // Send ack after successful write+commit
709 -             let _ = ack_tx.send(event_id).await;
710 -
711 -             // Periodic stats
712 -             if last_stats.elapsed() >= Duration::from_secs(10) {
713 -                 info!(processed, last_seq, "tap indexer stats");
714 -                 last_stats = Instant::now();
715 -             }
716 -
717 -             // Save cursor every 30s for visibility
718 -             if last_cursor_save.elapsed() >= Duration::from_secs(30) && last_seq > 0 {
719 -                 if let Err(e) = self.save_cursor(last_seq).await {
720 -                     warn!(error = ?e, "failed to save tap cursor");
721 -                 }
722 -                 last_cursor_save = Instant::now();
723 -             }
724 -         }
725 -
726 -         // Final flush
727 -         records.end().await?;
728 -         identities.end().await.map_err(|e| ClickHouseError::Query {
729 -             message: "final identities flush failed".into(),
730 -             source: e,
731 -         })?;
732 -
733 -         // Final cursor save
734 -         if last_seq > 0 {
735 -             self.save_cursor(last_seq).await?;
736 -         }
737 -
738 -         info!(last_seq, "tap stream ended");
739 -         Ok(())
740 -     }
741 - }
···
  5 use dashmap::DashMap;
  6 use n0_future::StreamExt;
  7 use smol_str::{SmolStr, ToSmolStr};
  8 + use tracing::{debug, info, warn};
  9
 10 use chrono::DateTime;
 11
···
 14     RawRecordInsert, ResilientRecordInserter,
 15 };
 16 use crate::config::IndexerConfig;
 17 + use crate::error::{IndexError, Result};
 18 use crate::firehose::{
 19     Account, ExtractedRecord, FirehoseConsumer, Identity, MessageStream, SubscribeReposMessage,
 20     extract_records,
 21 };
 22
 23 /// Default consumer ID for cursor tracking
 24 const CONSUMER_ID: &str = "main";
···
521         tokio::time::sleep(Duration::from_secs(1)).await;
522     }
523 }
+3 -1
crates/weaver-index/src/lib.rs
···
 4 pub mod error;
 5 pub mod firehose;
 6 pub mod indexer;
 7 pub mod server;
 8 pub mod sqlite;
 9 pub mod tap;
10
11 pub use config::Config;
12 pub use error::{IndexError, Result};
13 - pub use indexer::{FirehoseIndexer, TapIndexer, load_cursor};
14 pub use server::{AppState, ServerConfig};
15 pub use sqlite::{ShardKey, ShardRouter, SqliteShard};
···
 4 pub mod error;
 5 pub mod firehose;
 6 pub mod indexer;
 7 + pub mod parallel_tap;
 8 pub mod server;
 9 pub mod sqlite;
10 pub mod tap;
11
12 pub use config::Config;
13 pub use error::{IndexError, Result};
14 + pub use indexer::{FirehoseIndexer, load_cursor};
15 + pub use parallel_tap::TapIndexer;
16 pub use server::{AppState, ServerConfig};
17 pub use sqlite::{ShardKey, ShardRouter, SqliteShard};
+261
crates/weaver-index/src/parallel_tap.rs
···
  1 + use std::sync::Arc;
  2 + use std::time::{Duration, Instant};
  3 +
  4 + use chrono::Utc;
  5 + use smol_str::ToSmolStr;
  6 + use tokio::task::JoinHandle;
  7 + use tracing::{debug, error, info, trace, warn};
  8 +
  9 + use crate::clickhouse::{
 10 +     Client, InserterConfig, RawIdentityEvent, RawRecordInsert, ResilientRecordInserter,
 11 + };
 12 + use crate::config::{IndexerConfig, TapConfig};
 13 + use crate::error::{ClickHouseError, Result};
 14 + use crate::tap::{TapConfig as TapConsumerConfig, TapConsumer, TapEvent};
 15 +
 16 + /// TAP indexer with multiple parallel websocket connections
 17 + ///
 18 + /// Each worker maintains its own websocket connection to TAP and its own
 19 + /// ClickHouse inserter. TAP distributes events across connected clients,
 20 + /// and its ack-gating mechanism ensures per-DID ordering is preserved
 21 + /// regardless of which worker handles which events.
 22 + pub struct TapIndexer {
 23 +     client: Arc<Client>,
 24 +     tap_config: TapConfig,
 25 +     inserter_config: InserterConfig,
 26 +     config: Arc<IndexerConfig>,
 27 +     num_workers: usize,
 28 + }
 29 +
 30 + impl TapIndexer {
 31 +     pub fn new(
 32 +         client: Client,
 33 +         tap_config: TapConfig,
 34 +         inserter_config: InserterConfig,
 35 +         config: IndexerConfig,
 36 +         num_workers: usize,
 37 +     ) -> Self {
 38 +         Self {
 39 +             client: Arc::new(client),
 40 +             tap_config,
 41 +             inserter_config,
 42 +             config: Arc::new(config),
 43 +             num_workers,
 44 +         }
 45 +     }
 46 +
 47 +     pub async fn run(&self) -> Result<()> {
 48 +         info!(
 49 +             num_workers = self.num_workers,
 50 +             url = %self.tap_config.url,
 51 +             "starting parallel tap indexer"
 52 +         );
 53 +
 54 +         let mut handles: Vec<JoinHandle<Result<()>>> = Vec::with_capacity(self.num_workers);
 55 +
 56 +         for worker_id in 0..self.num_workers {
 57 +             let client = self.client.clone();
 58 +             let tap_config = self.tap_config.clone();
 59 +             let inserter_config = self.inserter_config.clone();
 60 +             let config = self.config.clone();
 61 +
 62 +             let handle = tokio::spawn(async move {
 63 +                 run_tap_worker(worker_id, client, tap_config, inserter_config, config).await
 64 +             });
 65 +
 66 +             handles.push(handle);
 67 +         }
 68 +
 69 +         // Wait for all workers
 70 +         // TODO: Implement proper supervision - restart failed workers instead of propagating
 71 +         for (i, handle) in handles.into_iter().enumerate() {
 72 +             match handle.await {
 73 +                 Ok(Ok(())) => {
 74 +                     info!(worker_id = i, "tap worker finished cleanly");
 75 +                 }
 76 +                 Ok(Err(e)) => {
 77 +                     error!(worker_id = i, error = ?e, "tap worker failed");
 78 +                     return Err(e);
 79 +                 }
 80 +                 Err(e) => {
 81 +                     error!(worker_id = i, error = ?e, "tap worker panicked");
 82 +                     return Err(crate::error::FirehoseError::Stream {
 83 +                         message: format!("worker {} panicked: {}", i, e),
 84 +                     }
 85 +                     .into());
 86 +                 }
 87 +             }
 88 +         }
 89 +
 90 +         Ok(())
 91 +     }
 92 + }
 93 +
 94 + async fn run_tap_worker(
 95 +     worker_id: usize,
 96 +     client: Arc<Client>,
 97 +     tap_config: TapConfig,
 98 +     inserter_config: InserterConfig,
 99 +     config: Arc<IndexerConfig>,
100 + ) -> Result<()> {
101 +     info!(worker_id, url = %tap_config.url, "tap worker starting");
102 +
103 +     let consumer_config =
104 +         TapConsumerConfig::new(tap_config.url.clone()).with_acks(tap_config.send_acks);
105 +     let consumer = TapConsumer::new(consumer_config);
106 +
107 +     let (mut events, ack_tx) = consumer.connect().await?;
108 +
109 +     // Each worker has its own resilient inserter
110 +     let mut records = ResilientRecordInserter::new(client.inner().clone(), inserter_config);
111 +     let mut identities = client.inserter::<RawIdentityEvent>("raw_identity_events");
112 +
113 +     let mut processed: u64 = 0;
114 +     let mut last_stats = Instant::now();
115 +
116 +     info!(worker_id, "tap worker connected, starting event loop");
117 +
118 +     loop {
119 +         // Get time until next required flush
120 +         let records_time = records.time_left().unwrap_or(Duration::from_secs(10));
121 +         let identities_time = identities.time_left().unwrap_or(Duration::from_secs(10));
122 +         let time_left = records_time.min(identities_time);
123 +
124 +         let event = match tokio::time::timeout(time_left, events.recv()).await {
125 +             Ok(Some(event)) => event,
126 +             Ok(None) => {
127 +                 info!(worker_id, "tap channel closed, exiting");
128 +                 break;
129 +             }
130 +             Err(_) => {
131 +                 // Timeout - flush inserters
132 +                 trace!(worker_id, "flush timeout, committing inserters");
133 +                 records.commit().await?;
134 +                 identities
135 +                     .commit()
136 +                     .await
137 +                     .map_err(|e| ClickHouseError::Query {
138 +                         message: "periodic identities commit failed".into(),
139 +                         source: e,
140 +                     })?;
141 +                 continue;
142 +             }
143 +         };
144 +
145 +         let event_id = event.id();
146 +
147 +         match event {
148 +             TapEvent::Record(envelope) => {
149 +                 let record = &envelope.record;
150 +
151 +                 // Collection filter
152 +                 if !config.collections.matches(&record.collection) {
153 +                     let _ = ack_tx.send(event_id).await;
154 +                     continue;
155 +                 }
156 +
157 +                 // Serialize record
158 +                 let json = match &record.record {
159 +                     Some(v) => match serde_json::to_string(v) {
160 +                         Ok(s) => s,
161 +                         Err(e) => {
162 +                             warn!(
163 +                                 worker_id,
164 +                                 did = %record.did,
165 +                                 collection = %record.collection,
166 +                                 rkey = %record.rkey,
167 +                                 error = ?e,
168 +                                 "failed to serialize record, sending to DLQ"
169 +                             );
170 +                             let raw_data = format!(
171 +                                 r#"{{"did":"{}","collection":"{}","rkey":"{}","cid":"{}","error":"serialization_failed"}}"#,
172 +                                 record.did, record.collection, record.rkey, record.cid
173 +                             );
174 +                             records
175 +                                 .write_raw_to_dlq(
176 +                                     record.action.as_str().to_smolstr(),
177 +                                     raw_data,
178 +                                     e.to_string(),
179 +                                     event_id,
180 +                                 )
181 +                                 .await?;
182 +                             let _ = ack_tx.send(event_id).await;
183 +                             continue;
184 +                         }
185 +                     },
186 +                     None => "{}".to_string(),
187 +                 };
188 +
189 +                 debug!(
190 +                     worker_id,
191 +                     op = record.action.as_str(),
192 +                     id = event_id,
193 +                     len = json.len(),
194 +                     "writing record"
195 +                 );
196 +
197 +                 records
198 +                     .write(RawRecordInsert {
199 +                         did: record.did.clone(),
200 +                         collection: record.collection.clone(),
201 +                         rkey: record.rkey.clone(),
202 +                         cid: record.cid.clone(),
203 +                         rev: record.rev.clone(),
204 +                         record: json.to_smolstr(),
205 +                         operation: record.action.as_str().to_smolstr(),
206 +                         seq: event_id,
207 +                         event_time: Utc::now(),
208 +                         is_live: record.live,
209 +                     })
210 +                     .await?;
211 +                 records.commit().await?;
212 +
213 +                 // Ack after successful processing
214 +                 let _ = ack_tx.send(event_id).await;
215 +
216 +                 processed += 1;
217 +             }
218 +             TapEvent::Identity(envelope) => {
219 +                 let identity = &envelope.identity;
220 +
221 +                 identities
222 +                     .write(&RawIdentityEvent {
223 +                         did: identity.did.clone(),
224 +                         handle: identity.handle.clone(),
225 +                         seq: event_id,
226 +                         event_time: Utc::now(),
227 +                     })
228 +                     .await
229 +                     .map_err(|e| ClickHouseError::Query {
230 +                         message: "identity write failed".into(),
231 +                         source: e,
232 +                     })?;
233 +                 identities
234 +                     .commit()
235 +                     .await
236 +                     .map_err(|e| ClickHouseError::Query {
237 +                         message: "identity commit failed".into(),
238 +                         source: e,
239 +                     })?;
240 +
241 +                 let _ = ack_tx.send(event_id).await;
242 +             }
243 +         }
244 +
245 +         // Periodic stats
246 +         if last_stats.elapsed() > Duration::from_secs(30) {
247 +             info!(worker_id, processed, "tap worker stats");
248 +             last_stats = Instant::now();
249 +         }
250 +     }
251 +
252 +     // Clean shutdown
253 +     records.end().await?;
254 +     identities.end().await.map_err(|e| ClickHouseError::Query {
255 +         message: "identities end failed".into(),
256 +         source: e,
257 +     })?;
258 +
259 +     info!(worker_id, processed, "tap worker finished");
260 +     Ok(())
261 + }
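On the TODO at line 70 (restart failed workers instead of propagating): one possible shape for that supervision, as a hedged sketch only. It reuses run_tap_worker and the per-worker clones already made in run(); the 5-second backoff, the retry-forever policy, and the change of handles to JoinHandle<()> are illustrative choices, not part of this change. With this shape the join loop afterwards only has to handle panics.

// Sketch: would replace the spawn section of TapIndexer::run().
let mut handles: Vec<JoinHandle<()>> = Vec::with_capacity(self.num_workers);

for worker_id in 0..self.num_workers {
    let client = self.client.clone();
    let tap_config = self.tap_config.clone();
    let inserter_config = self.inserter_config.clone();
    let config = self.config.clone();

    handles.push(tokio::spawn(async move {
        loop {
            // Restart a failed worker after a short pause instead of
            // bubbling the error up and tearing down the whole indexer.
            match run_tap_worker(
                worker_id,
                client.clone(),
                tap_config.clone(),
                inserter_config.clone(),
                config.clone(),
            )
            .await
            {
                Ok(()) => break,
                Err(e) => {
                    error!(worker_id, error = ?e, "tap worker failed, restarting");
                    tokio::time::sleep(Duration::from_secs(5)).await;
                }
            }
        }
    }));
}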