initial consumer skeleton

Orual 40eee0db 0a223e0f

+1029 -300
+21
Cargo.lock
··· 5024 checksum = "edcd27d72f2f071c64249075f42e205ff93c9a4c5f6c6da53e79ed9f9832c285" 5025 5026 [[package]] 5027 name = "indexmap" 5028 version = "1.9.3" 5029 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 11464 "cid", 11465 "clap", 11466 "clickhouse", 11467 "dotenvy", 11468 "humansize", 11469 "jacquard", 11470 "jacquard-common", 11471 "jacquard-repo",
··· 5024 checksum = "edcd27d72f2f071c64249075f42e205ff93c9a4c5f6c6da53e79ed9f9832c285" 5025 5026 [[package]] 5027 + name = "include_dir" 5028 + version = "0.7.4" 5029 + source = "registry+https://github.com/rust-lang/crates.io-index" 5030 + checksum = "923d117408f1e49d914f1a379a309cffe4f18c05cf4e3d12e613a15fc81bd0dd" 5031 + dependencies = [ 5032 + "include_dir_macros", 5033 + ] 5034 + 5035 + [[package]] 5036 + name = "include_dir_macros" 5037 + version = "0.7.4" 5038 + source = "registry+https://github.com/rust-lang/crates.io-index" 5039 + checksum = "7cab85a7ed0bd5f0e76d93846e0147172bed2e2d3f859bcc33a8d9699cad1a75" 5040 + dependencies = [ 5041 + "proc-macro2", 5042 + "quote", 5043 + ] 5044 + 5045 + [[package]] 5046 name = "indexmap" 5047 version = "1.9.3" 5048 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 11483 "cid", 11484 "clap", 11485 "clickhouse", 11486 + "dashmap 6.1.0", 11487 "dotenvy", 11488 "humansize", 11489 + "include_dir", 11490 "jacquard", 11491 "jacquard-common", 11492 "jacquard-repo",
+146 -224
crates/weaver-api/src/com_atproto/sync/subscribe_repos.rs
··· 8 /// Represents a change to an account's status on a host (eg, PDS or Relay). The semantics of this event are that the status is at the host which emitted the event, not necessarily that at the currently active PDS. Eg, a Relay takedown would emit a takedown with active=false, even if the PDS is still active. 9 #[jacquard_derive::lexicon] 10 #[derive( 11 - serde::Serialize, 12 - serde::Deserialize, 13 - Debug, 14 - Clone, 15 - PartialEq, 16 - Eq, 17 - jacquard_derive::IntoStatic 18 )] 19 #[serde(rename_all = "camelCase")] 20 pub struct Account<'a> { ··· 32 33 pub mod account_state { 34 35 - pub use crate::builder_types::{Set, Unset, IsSet, IsUnset}; 36 #[allow(unused)] 37 use ::core::marker::PhantomData; 38 mod sealed { ··· 179 S::Seq: account_state::IsUnset, 180 { 181 /// Set the `seq` field (required) 182 - pub fn seq( 183 - mut self, 184 - value: impl Into<i64>, 185 - ) -> AccountBuilder<'a, account_state::SetSeq<S>> { 186 self.__unsafe_private_named.2 = ::core::option::Option::Some(value.into()); 187 AccountBuilder { 188 _phantom_state: ::core::marker::PhantomData, ··· 194 195 impl<'a, S: account_state::State> AccountBuilder<'a, S> { 196 /// Set the `status` field (optional) 197 - pub fn status( 198 - mut self, 199 - value: impl Into<Option<jacquard_common::CowStr<'a>>>, 200 - ) -> Self { 201 self.__unsafe_private_named.3 = value.into(); 202 self 203 } ··· 265 } 266 } 267 268 - fn lexicon_doc_com_atproto_sync_subscribeRepos() -> ::jacquard_lexicon::lexicon::LexiconDoc< 269 - 'static, 270 - > { 271 ::jacquard_lexicon::lexicon::LexiconDoc { 272 lexicon: ::jacquard_lexicon::lexicon::Lexicon::Lexicon1, 273 id: ::jacquard_common::CowStr::new_static("com.atproto.sync.subscribeRepos"), ··· 647 ); 648 map.insert( 649 ::jacquard_common::smol_str::SmolStr::new_static("info"), 650 - ::jacquard_lexicon::lexicon::LexUserType::Object(::jacquard_lexicon::lexicon::LexObject { 651 - description: None, 652 - required: Some( 653 - vec![::jacquard_common::smol_str::SmolStr::new_static("name")], 654 - ), 655 - nullable: None, 656 - properties: { 657 - #[allow(unused_mut)] 658 - let mut map = ::std::collections::BTreeMap::new(); 659 - map.insert( 660 - ::jacquard_common::smol_str::SmolStr::new_static("message"), 661 - ::jacquard_lexicon::lexicon::LexObjectProperty::String(::jacquard_lexicon::lexicon::LexString { 662 - description: None, 663 - format: None, 664 - default: None, 665 - min_length: None, 666 - max_length: None, 667 - min_graphemes: None, 668 - max_graphemes: None, 669 - r#enum: None, 670 - r#const: None, 671 - known_values: None, 672 - }), 673 - ); 674 - map.insert( 675 - ::jacquard_common::smol_str::SmolStr::new_static("name"), 676 - ::jacquard_lexicon::lexicon::LexObjectProperty::String(::jacquard_lexicon::lexicon::LexString { 677 - description: None, 678 - format: None, 679 - default: None, 680 - min_length: None, 681 - max_length: None, 682 - min_graphemes: None, 683 - max_graphemes: None, 684 - r#enum: None, 685 - r#const: None, 686 - known_values: None, 687 - }), 688 - ); 689 - map 690 }, 691 - }), 692 ); 693 map.insert( 694 ::jacquard_common::smol_str::SmolStr::new_static("main"), ··· 723 ); 724 map.insert( 725 ::jacquard_common::smol_str::SmolStr::new_static("repoOp"), 726 - ::jacquard_lexicon::lexicon::LexUserType::Object(::jacquard_lexicon::lexicon::LexObject { 727 - description: Some( 728 - ::jacquard_common::CowStr::new_static( 729 "A repo operation, ie a mutation of a single record.", 730 - ), 731 - ), 732 - required: Some( 733 - vec![ 734 
::jacquard_common::smol_str::SmolStr::new_static("action"), 735 ::jacquard_common::smol_str::SmolStr::new_static("path"), 736 - ::jacquard_common::smol_str::SmolStr::new_static("cid") 737 - ], 738 - ), 739 - nullable: None, 740 - properties: { 741 - #[allow(unused_mut)] 742 - let mut map = ::std::collections::BTreeMap::new(); 743 - map.insert( 744 - ::jacquard_common::smol_str::SmolStr::new_static("action"), 745 - ::jacquard_lexicon::lexicon::LexObjectProperty::String(::jacquard_lexicon::lexicon::LexString { 746 - description: None, 747 - format: None, 748 - default: None, 749 - min_length: None, 750 - max_length: None, 751 - min_graphemes: None, 752 - max_graphemes: None, 753 - r#enum: None, 754 - r#const: None, 755 - known_values: None, 756 - }), 757 - ); 758 - map.insert( 759 ::jacquard_common::smol_str::SmolStr::new_static("cid"), 760 - ::jacquard_lexicon::lexicon::LexObjectProperty::CidLink(::jacquard_lexicon::lexicon::LexCidLink { 761 - description: None, 762 - }), 763 - ); 764 - map.insert( 765 - ::jacquard_common::smol_str::SmolStr::new_static("path"), 766 - ::jacquard_lexicon::lexicon::LexObjectProperty::String(::jacquard_lexicon::lexicon::LexString { 767 - description: None, 768 - format: None, 769 - default: None, 770 - min_length: None, 771 - max_length: None, 772 - min_graphemes: None, 773 - max_graphemes: None, 774 - r#enum: None, 775 - r#const: None, 776 - known_values: None, 777 - }), 778 - ); 779 - map.insert( 780 - ::jacquard_common::smol_str::SmolStr::new_static("prev"), 781 - ::jacquard_lexicon::lexicon::LexObjectProperty::CidLink(::jacquard_lexicon::lexicon::LexCidLink { 782 - description: None, 783 - }), 784 - ); 785 - map 786 }, 787 - }), 788 ); 789 map.insert( 790 ::jacquard_common::smol_str::SmolStr::new_static("sync"), ··· 916 /// Represents an update of repository state. Note that empty commits are allowed, which include no repo data changes, but an update to rev and signature. 
917 #[jacquard_derive::lexicon] 918 #[derive( 919 - serde::Serialize, 920 - serde::Deserialize, 921 - Debug, 922 - Clone, 923 - PartialEq, 924 - Eq, 925 - jacquard_derive::IntoStatic 926 )] 927 #[serde(rename_all = "camelCase")] 928 pub struct Commit<'a> { ··· 959 960 pub mod commit_state { 961 962 - pub use crate::builder_types::{Set, Unset, IsSet, IsUnset}; 963 #[allow(unused)] 964 use ::core::marker::PhantomData; 965 mod sealed { ··· 1176 ::core::option::Option<Vec<jacquard_common::types::cid::CidLink<'a>>>, 1177 ::core::option::Option<bytes::Bytes>, 1178 ::core::option::Option<jacquard_common::types::cid::CidLink<'a>>, 1179 - ::core::option::Option< 1180 - Vec<crate::com_atproto::sync::subscribe_repos::RepoOp<'a>>, 1181 - >, 1182 ::core::option::Option<jacquard_common::types::cid::CidLink<'a>>, 1183 ::core::option::Option<bool>, 1184 ::core::option::Option<jacquard_common::types::string::Did<'a>>, ··· 1204 CommitBuilder { 1205 _phantom_state: ::core::marker::PhantomData, 1206 __unsafe_private_named: ( 1207 - None, 1208 - None, 1209 - None, 1210 - None, 1211 - None, 1212 - None, 1213 - None, 1214 - None, 1215 - None, 1216 - None, 1217 - None, 1218 - None, 1219 ), 1220 _phantom: ::core::marker::PhantomData, 1221 } ··· 1380 S::Seq: commit_state::IsUnset, 1381 { 1382 /// Set the `seq` field (required) 1383 - pub fn seq( 1384 - mut self, 1385 - value: impl Into<i64>, 1386 - ) -> CommitBuilder<'a, commit_state::SetSeq<S>> { 1387 self.__unsafe_private_named.8 = ::core::option::Option::Some(value.into()); 1388 CommitBuilder { 1389 _phantom_state: ::core::marker::PhantomData, ··· 1395 1396 impl<'a, S: commit_state::State> CommitBuilder<'a, S> { 1397 /// Set the `since` field (optional) 1398 - pub fn since( 1399 - mut self, 1400 - value: impl Into<Option<jacquard_common::types::string::Tid>>, 1401 - ) -> Self { 1402 self.__unsafe_private_named.9 = value.into(); 1403 self 1404 } 1405 /// Set the `since` field to an Option value (optional) 1406 - pub fn maybe_since( 1407 - mut self, 1408 - value: Option<jacquard_common::types::string::Tid>, 1409 - ) -> Self { 1410 self.__unsafe_private_named.9 = value; 1411 self 1412 } ··· 1526 #[allow(unused_comparisons)] 1527 if value.len() > 200usize { 1528 return Err(::jacquard_lexicon::validation::ConstraintError::MaxLength { 1529 - path: ::jacquard_lexicon::validation::ValidationPath::from_field( 1530 - "ops", 1531 - ), 1532 max: 200usize, 1533 actual: value.len(), 1534 }); ··· 1541 /// Represents a change to an account's identity. Could be an updated handle, signing key, or pds hosting endpoint. Serves as a prod to all downstream services to refresh their identity cache. 
1542 #[jacquard_derive::lexicon] 1543 #[derive( 1544 - serde::Serialize, 1545 - serde::Deserialize, 1546 - Debug, 1547 - Clone, 1548 - PartialEq, 1549 - Eq, 1550 - jacquard_derive::IntoStatic 1551 )] 1552 #[serde(rename_all = "camelCase")] 1553 pub struct Identity<'a> { ··· 1563 1564 pub mod identity_state { 1565 1566 - pub use crate::builder_types::{Set, Unset, IsSet, IsUnset}; 1567 #[allow(unused)] 1568 use ::core::marker::PhantomData; 1569 mod sealed { ··· 1693 S::Seq: identity_state::IsUnset, 1694 { 1695 /// Set the `seq` field (required) 1696 - pub fn seq( 1697 - mut self, 1698 - value: impl Into<i64>, 1699 - ) -> IdentityBuilder<'a, identity_state::SetSeq<S>> { 1700 self.__unsafe_private_named.2 = ::core::option::Option::Some(value.into()); 1701 IdentityBuilder { 1702 _phantom_state: ::core::marker::PhantomData, ··· 1786 PartialEq, 1787 Eq, 1788 jacquard_derive::IntoStatic, 1789 - Default 1790 )] 1791 #[serde(rename_all = "camelCase")] 1792 pub struct Info<'a> { ··· 1815 } 1816 1817 #[derive( 1818 - serde::Serialize, 1819 - serde::Deserialize, 1820 - Debug, 1821 - Clone, 1822 - PartialEq, 1823 - Eq, 1824 - jacquard_derive::IntoStatic 1825 )] 1826 #[serde(rename_all = "camelCase")] 1827 pub struct SubscribeRepos { ··· 1831 1832 pub mod subscribe_repos_state { 1833 1834 - pub use crate::builder_types::{Set, Unset, IsSet, IsUnset}; 1835 #[allow(unused)] 1836 use ::core::marker::PhantomData; 1837 mod sealed { ··· 1898 1899 #[jacquard_derive::open_union] 1900 #[derive( 1901 - serde::Serialize, 1902 - serde::Deserialize, 1903 - Debug, 1904 - Clone, 1905 - PartialEq, 1906 - Eq, 1907 - jacquard_derive::IntoStatic 1908 )] 1909 #[serde(tag = "$type")] 1910 #[serde(bound(deserialize = "'de: 'a"))] ··· 1926 pub fn decode_framed<'de: 'a>( 1927 bytes: &'de [u8], 1928 ) -> Result<SubscribeReposMessage<'a>, jacquard_common::error::DecodeError> { 1929 - let (header, body) = jacquard_common::xrpc::subscription::parse_event_header( 1930 - bytes, 1931 - )?; 1932 match header.t.as_str() { 1933 "#commit" => { 1934 let variant = serde_ipld_dagcbor::from_slice(body)?; ··· 1950 let variant = serde_ipld_dagcbor::from_slice(body)?; 1951 Ok(Self::Info(Box::new(variant))) 1952 } 1953 - unknown => { 1954 - Err( 1955 - jacquard_common::error::DecodeError::UnknownEventType(unknown.into()), 1956 - ) 1957 - } 1958 } 1959 } 1960 } ··· 1969 Eq, 1970 thiserror::Error, 1971 miette::Diagnostic, 1972 - jacquard_derive::IntoStatic 1973 )] 1974 #[serde(tag = "error", content = "message")] 1975 #[serde(bound(deserialize = "'de: 'a"))] ··· 2008 pub struct SubscribeReposStream; 2009 impl jacquard_common::xrpc::SubscriptionResp for SubscribeReposStream { 2010 const NSID: &'static str = "com.atproto.sync.subscribeRepos"; 2011 - const ENCODING: jacquard_common::xrpc::MessageEncoding = jacquard_common::xrpc::MessageEncoding::DagCbor; 2012 type Message<'de> = SubscribeReposMessage<'de>; 2013 type Error<'de> = SubscribeReposError<'de>; 2014 fn decode_message<'de>( ··· 2020 2021 impl jacquard_common::xrpc::XrpcSubscription for SubscribeRepos { 2022 const NSID: &'static str = "com.atproto.sync.subscribeRepos"; 2023 - const ENCODING: jacquard_common::xrpc::MessageEncoding = jacquard_common::xrpc::MessageEncoding::DagCbor; 2024 type Stream = SubscribeReposStream; 2025 } 2026 2027 pub struct SubscribeReposEndpoint; 2028 impl jacquard_common::xrpc::SubscriptionEndpoint for SubscribeReposEndpoint { 2029 const PATH: &'static str = "/xrpc/com.atproto.sync.subscribeRepos"; 2030 - const ENCODING: jacquard_common::xrpc::MessageEncoding = 
jacquard_common::xrpc::MessageEncoding::DagCbor; 2031 type Params<'de> = SubscribeRepos; 2032 type Stream = SubscribeReposStream; 2033 } ··· 2035 /// A repo operation, ie a mutation of a single record. 2036 #[jacquard_derive::lexicon] 2037 #[derive( 2038 - serde::Serialize, 2039 - serde::Deserialize, 2040 - Debug, 2041 - Clone, 2042 - PartialEq, 2043 - Eq, 2044 - jacquard_derive::IntoStatic 2045 )] 2046 #[serde(rename_all = "camelCase")] 2047 pub struct RepoOp<'a> { ··· 2060 2061 pub mod repo_op_state { 2062 2063 - pub use crate::builder_types::{Set, Unset, IsSet, IsUnset}; 2064 #[allow(unused)] 2065 use ::core::marker::PhantomData; 2066 mod sealed { ··· 2161 self 2162 } 2163 /// Set the `cid` field to an Option value (optional) 2164 - pub fn maybe_cid( 2165 - mut self, 2166 - value: Option<jacquard_common::types::cid::CidLink<'a>>, 2167 - ) -> Self { 2168 self.__unsafe_private_named.1 = value; 2169 self 2170 } ··· 2199 self 2200 } 2201 /// Set the `prev` field to an Option value (optional) 2202 - pub fn maybe_prev( 2203 - mut self, 2204 - value: Option<jacquard_common::types::cid::CidLink<'a>>, 2205 - ) -> Self { 2206 self.__unsafe_private_named.3 = value; 2207 self 2208 } ··· 2262 /// Updates the repo to a new state, without necessarily including that state on the firehose. Used to recover from broken commit streams, data loss incidents, or in situations where upstream host does not know recent state of the repository. 2263 #[jacquard_derive::lexicon] 2264 #[derive( 2265 - serde::Serialize, 2266 - serde::Deserialize, 2267 - Debug, 2268 - Clone, 2269 - PartialEq, 2270 - Eq, 2271 - jacquard_derive::IntoStatic 2272 )] 2273 #[serde(rename_all = "camelCase")] 2274 pub struct Sync<'a> { ··· 2289 2290 pub mod sync_state { 2291 2292 - pub use crate::builder_types::{Set, Unset, IsSet, IsUnset}; 2293 #[allow(unused)] 2294 use ::core::marker::PhantomData; 2295 mod sealed { ··· 2473 S::Seq: sync_state::IsUnset, 2474 { 2475 /// Set the `seq` field (required) 2476 - pub fn seq( 2477 - mut self, 2478 - value: impl Into<i64>, 2479 - ) -> SyncBuilder<'a, sync_state::SetSeq<S>> { 2480 self.__unsafe_private_named.3 = ::core::option::Option::Some(value.into()); 2481 SyncBuilder { 2482 _phantom_state: ::core::marker::PhantomData, ··· 2559 ) -> ::std::result::Result<(), ::jacquard_lexicon::validation::ConstraintError> { 2560 Ok(()) 2561 } 2562 - }
··· 8 /// Represents a change to an account's status on a host (eg, PDS or Relay). The semantics of this event are that the status is at the host which emitted the event, not necessarily that at the currently active PDS. Eg, a Relay takedown would emit a takedown with active=false, even if the PDS is still active. 9 #[jacquard_derive::lexicon] 10 #[derive( 11 + serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic, 12 )] 13 #[serde(rename_all = "camelCase")] 14 pub struct Account<'a> { ··· 26 27 pub mod account_state { 28 29 + pub use crate::builder_types::{IsSet, IsUnset, Set, Unset}; 30 #[allow(unused)] 31 use ::core::marker::PhantomData; 32 mod sealed { ··· 173 S::Seq: account_state::IsUnset, 174 { 175 /// Set the `seq` field (required) 176 + pub fn seq(mut self, value: impl Into<i64>) -> AccountBuilder<'a, account_state::SetSeq<S>> { 177 self.__unsafe_private_named.2 = ::core::option::Option::Some(value.into()); 178 AccountBuilder { 179 _phantom_state: ::core::marker::PhantomData, ··· 185 186 impl<'a, S: account_state::State> AccountBuilder<'a, S> { 187 /// Set the `status` field (optional) 188 + pub fn status(mut self, value: impl Into<Option<jacquard_common::CowStr<'a>>>) -> Self { 189 self.__unsafe_private_named.3 = value.into(); 190 self 191 } ··· 253 } 254 } 255 256 + fn lexicon_doc_com_atproto_sync_subscribeRepos() -> ::jacquard_lexicon::lexicon::LexiconDoc<'static> 257 + { 258 ::jacquard_lexicon::lexicon::LexiconDoc { 259 lexicon: ::jacquard_lexicon::lexicon::Lexicon::Lexicon1, 260 id: ::jacquard_common::CowStr::new_static("com.atproto.sync.subscribeRepos"), ··· 634 ); 635 map.insert( 636 ::jacquard_common::smol_str::SmolStr::new_static("info"), 637 + ::jacquard_lexicon::lexicon::LexUserType::Object( 638 + ::jacquard_lexicon::lexicon::LexObject { 639 + description: None, 640 + required: Some(vec![::jacquard_common::smol_str::SmolStr::new_static( 641 + "name", 642 + )]), 643 + nullable: None, 644 + properties: { 645 + #[allow(unused_mut)] 646 + let mut map = ::std::collections::BTreeMap::new(); 647 + map.insert( 648 + ::jacquard_common::smol_str::SmolStr::new_static("message"), 649 + ::jacquard_lexicon::lexicon::LexObjectProperty::String( 650 + ::jacquard_lexicon::lexicon::LexString { 651 + description: None, 652 + format: None, 653 + default: None, 654 + min_length: None, 655 + max_length: None, 656 + min_graphemes: None, 657 + max_graphemes: None, 658 + r#enum: None, 659 + r#const: None, 660 + known_values: None, 661 + }, 662 + ), 663 + ); 664 + map.insert( 665 + ::jacquard_common::smol_str::SmolStr::new_static("name"), 666 + ::jacquard_lexicon::lexicon::LexObjectProperty::String( 667 + ::jacquard_lexicon::lexicon::LexString { 668 + description: None, 669 + format: None, 670 + default: None, 671 + min_length: None, 672 + max_length: None, 673 + min_graphemes: None, 674 + max_graphemes: None, 675 + r#enum: None, 676 + r#const: None, 677 + known_values: None, 678 + }, 679 + ), 680 + ); 681 + map 682 + }, 683 }, 684 + ), 685 ); 686 map.insert( 687 ::jacquard_common::smol_str::SmolStr::new_static("main"), ··· 716 ); 717 map.insert( 718 ::jacquard_common::smol_str::SmolStr::new_static("repoOp"), 719 + ::jacquard_lexicon::lexicon::LexUserType::Object( 720 + ::jacquard_lexicon::lexicon::LexObject { 721 + description: Some(::jacquard_common::CowStr::new_static( 722 "A repo operation, ie a mutation of a single record.", 723 + )), 724 + required: Some(vec![ 725 ::jacquard_common::smol_str::SmolStr::new_static("action"), 726 
::jacquard_common::smol_str::SmolStr::new_static("path"), 727 ::jacquard_common::smol_str::SmolStr::new_static("cid"), 728 + ]), 729 + nullable: None, 730 + properties: { 731 + #[allow(unused_mut)] 732 + let mut map = ::std::collections::BTreeMap::new(); 733 + map.insert( 734 + ::jacquard_common::smol_str::SmolStr::new_static("action"), 735 + ::jacquard_lexicon::lexicon::LexObjectProperty::String( 736 + ::jacquard_lexicon::lexicon::LexString { 737 + description: None, 738 + format: None, 739 + default: None, 740 + min_length: None, 741 + max_length: None, 742 + min_graphemes: None, 743 + max_graphemes: None, 744 + r#enum: None, 745 + r#const: None, 746 + known_values: None, 747 + }, 748 + ), 749 + ); 750 + map.insert( 751 + ::jacquard_common::smol_str::SmolStr::new_static("cid"), 752 + ::jacquard_lexicon::lexicon::LexObjectProperty::CidLink( 753 + ::jacquard_lexicon::lexicon::LexCidLink { description: None }, 754 + ), 755 + ); 756 + map.insert( 757 + ::jacquard_common::smol_str::SmolStr::new_static("path"), 758 + ::jacquard_lexicon::lexicon::LexObjectProperty::String( 759 + ::jacquard_lexicon::lexicon::LexString { 760 + description: None, 761 + format: None, 762 + default: None, 763 + min_length: None, 764 + max_length: None, 765 + min_graphemes: None, 766 + max_graphemes: None, 767 + r#enum: None, 768 + r#const: None, 769 + known_values: None, 770 + }, 771 + ), 772 + ); 773 + map.insert( 774 + ::jacquard_common::smol_str::SmolStr::new_static("prev"), 775 + ::jacquard_lexicon::lexicon::LexObjectProperty::CidLink( 776 + ::jacquard_lexicon::lexicon::LexCidLink { description: None }, 777 + ), 778 + ); 779 + map 780 + }, 781 }, 782 + ), 783 ); 784 map.insert( 785 ::jacquard_common::smol_str::SmolStr::new_static("sync"), ··· 911 /// Represents an update of repository state. Note that empty commits are allowed, which include no repo data changes, but an update to rev and signature. 
912 #[jacquard_derive::lexicon] 913 #[derive( 914 + serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic, 915 )] 916 #[serde(rename_all = "camelCase")] 917 pub struct Commit<'a> { ··· 948 949 pub mod commit_state { 950 951 + pub use crate::builder_types::{IsSet, IsUnset, Set, Unset}; 952 #[allow(unused)] 953 use ::core::marker::PhantomData; 954 mod sealed { ··· 1165 ::core::option::Option<Vec<jacquard_common::types::cid::CidLink<'a>>>, 1166 ::core::option::Option<bytes::Bytes>, 1167 ::core::option::Option<jacquard_common::types::cid::CidLink<'a>>, 1168 + ::core::option::Option<Vec<crate::com_atproto::sync::subscribe_repos::RepoOp<'a>>>, 1169 ::core::option::Option<jacquard_common::types::cid::CidLink<'a>>, 1170 ::core::option::Option<bool>, 1171 ::core::option::Option<jacquard_common::types::string::Did<'a>>, ··· 1191 CommitBuilder { 1192 _phantom_state: ::core::marker::PhantomData, 1193 __unsafe_private_named: ( 1194 + None, None, None, None, None, None, None, None, None, None, None, None, 1195 ), 1196 _phantom: ::core::marker::PhantomData, 1197 } ··· 1356 S::Seq: commit_state::IsUnset, 1357 { 1358 /// Set the `seq` field (required) 1359 + pub fn seq(mut self, value: impl Into<i64>) -> CommitBuilder<'a, commit_state::SetSeq<S>> { 1360 self.__unsafe_private_named.8 = ::core::option::Option::Some(value.into()); 1361 CommitBuilder { 1362 _phantom_state: ::core::marker::PhantomData, ··· 1368 1369 impl<'a, S: commit_state::State> CommitBuilder<'a, S> { 1370 /// Set the `since` field (optional) 1371 + pub fn since(mut self, value: impl Into<Option<jacquard_common::types::string::Tid>>) -> Self { 1372 self.__unsafe_private_named.9 = value.into(); 1373 self 1374 } 1375 /// Set the `since` field to an Option value (optional) 1376 + pub fn maybe_since(mut self, value: Option<jacquard_common::types::string::Tid>) -> Self { 1377 self.__unsafe_private_named.9 = value; 1378 self 1379 } ··· 1493 #[allow(unused_comparisons)] 1494 if value.len() > 200usize { 1495 return Err(::jacquard_lexicon::validation::ConstraintError::MaxLength { 1496 + path: ::jacquard_lexicon::validation::ValidationPath::from_field("ops"), 1497 max: 200usize, 1498 actual: value.len(), 1499 }); ··· 1506 /// Represents a change to an account's identity. Could be an updated handle, signing key, or pds hosting endpoint. Serves as a prod to all downstream services to refresh their identity cache. 
1507 #[jacquard_derive::lexicon] 1508 #[derive( 1509 + serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic, 1510 )] 1511 #[serde(rename_all = "camelCase")] 1512 pub struct Identity<'a> { ··· 1522 1523 pub mod identity_state { 1524 1525 + pub use crate::builder_types::{IsSet, IsUnset, Set, Unset}; 1526 #[allow(unused)] 1527 use ::core::marker::PhantomData; 1528 mod sealed { ··· 1652 S::Seq: identity_state::IsUnset, 1653 { 1654 /// Set the `seq` field (required) 1655 + pub fn seq(mut self, value: impl Into<i64>) -> IdentityBuilder<'a, identity_state::SetSeq<S>> { 1656 self.__unsafe_private_named.2 = ::core::option::Option::Some(value.into()); 1657 IdentityBuilder { 1658 _phantom_state: ::core::marker::PhantomData, ··· 1742 PartialEq, 1743 Eq, 1744 jacquard_derive::IntoStatic, 1745 + Default, 1746 )] 1747 #[serde(rename_all = "camelCase")] 1748 pub struct Info<'a> { ··· 1771 } 1772 1773 #[derive( 1774 + serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic, 1775 )] 1776 #[serde(rename_all = "camelCase")] 1777 pub struct SubscribeRepos { ··· 1781 1782 pub mod subscribe_repos_state { 1783 1784 + pub use crate::builder_types::{IsSet, IsUnset, Set, Unset}; 1785 #[allow(unused)] 1786 use ::core::marker::PhantomData; 1787 mod sealed { ··· 1848 1849 #[jacquard_derive::open_union] 1850 #[derive( 1851 + serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic, 1852 )] 1853 #[serde(tag = "$type")] 1854 #[serde(bound(deserialize = "'de: 'a"))] ··· 1870 pub fn decode_framed<'de: 'a>( 1871 bytes: &'de [u8], 1872 ) -> Result<SubscribeReposMessage<'a>, jacquard_common::error::DecodeError> { 1873 + let (header, body) = jacquard_common::xrpc::subscription::parse_event_header(bytes)?; 1874 match header.t.as_str() { 1875 "#commit" => { 1876 let variant = serde_ipld_dagcbor::from_slice(body)?; ··· 1892 let variant = serde_ipld_dagcbor::from_slice(body)?; 1893 Ok(Self::Info(Box::new(variant))) 1894 } 1895 + unknown => Err(jacquard_common::error::DecodeError::UnknownEventType( 1896 + unknown.into(), 1897 + )), 1898 } 1899 } 1900 } ··· 1909 Eq, 1910 thiserror::Error, 1911 miette::Diagnostic, 1912 + jacquard_derive::IntoStatic, 1913 )] 1914 #[serde(tag = "error", content = "message")] 1915 #[serde(bound(deserialize = "'de: 'a"))] ··· 1948 pub struct SubscribeReposStream; 1949 impl jacquard_common::xrpc::SubscriptionResp for SubscribeReposStream { 1950 const NSID: &'static str = "com.atproto.sync.subscribeRepos"; 1951 + const ENCODING: jacquard_common::xrpc::MessageEncoding = 1952 + jacquard_common::xrpc::MessageEncoding::DagCbor; 1953 type Message<'de> = SubscribeReposMessage<'de>; 1954 type Error<'de> = SubscribeReposError<'de>; 1955 fn decode_message<'de>( ··· 1961 1962 impl jacquard_common::xrpc::XrpcSubscription for SubscribeRepos { 1963 const NSID: &'static str = "com.atproto.sync.subscribeRepos"; 1964 + const ENCODING: jacquard_common::xrpc::MessageEncoding = 1965 + jacquard_common::xrpc::MessageEncoding::DagCbor; 1966 type Stream = SubscribeReposStream; 1967 } 1968 1969 pub struct SubscribeReposEndpoint; 1970 impl jacquard_common::xrpc::SubscriptionEndpoint for SubscribeReposEndpoint { 1971 const PATH: &'static str = "/xrpc/com.atproto.sync.subscribeRepos"; 1972 + const ENCODING: jacquard_common::xrpc::MessageEncoding = 1973 + jacquard_common::xrpc::MessageEncoding::DagCbor; 1974 type Params<'de> = SubscribeRepos; 1975 type Stream = SubscribeReposStream; 1976 } ··· 1978 /// A repo 
operation, ie a mutation of a single record. 1979 #[jacquard_derive::lexicon] 1980 #[derive( 1981 + serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic, 1982 )] 1983 #[serde(rename_all = "camelCase")] 1984 pub struct RepoOp<'a> { ··· 1997 1998 pub mod repo_op_state { 1999 2000 + pub use crate::builder_types::{IsSet, IsUnset, Set, Unset}; 2001 #[allow(unused)] 2002 use ::core::marker::PhantomData; 2003 mod sealed { ··· 2098 self 2099 } 2100 /// Set the `cid` field to an Option value (optional) 2101 + pub fn maybe_cid(mut self, value: Option<jacquard_common::types::cid::CidLink<'a>>) -> Self { 2102 self.__unsafe_private_named.1 = value; 2103 self 2104 } ··· 2133 self 2134 } 2135 /// Set the `prev` field to an Option value (optional) 2136 + pub fn maybe_prev(mut self, value: Option<jacquard_common::types::cid::CidLink<'a>>) -> Self { 2137 self.__unsafe_private_named.3 = value; 2138 self 2139 } ··· 2193 /// Updates the repo to a new state, without necessarily including that state on the firehose. Used to recover from broken commit streams, data loss incidents, or in situations where upstream host does not know recent state of the repository. 2194 #[jacquard_derive::lexicon] 2195 #[derive( 2196 + serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic, 2197 )] 2198 #[serde(rename_all = "camelCase")] 2199 pub struct Sync<'a> { ··· 2214 2215 pub mod sync_state { 2216 2217 + pub use crate::builder_types::{IsSet, IsUnset, Set, Unset}; 2218 #[allow(unused)] 2219 use ::core::marker::PhantomData; 2220 mod sealed { ··· 2398 S::Seq: sync_state::IsUnset, 2399 { 2400 /// Set the `seq` field (required) 2401 + pub fn seq(mut self, value: impl Into<i64>) -> SyncBuilder<'a, sync_state::SetSeq<S>> { 2402 self.__unsafe_private_named.3 = ::core::option::Option::Some(value.into()); 2403 SyncBuilder { 2404 _phantom_state: ::core::marker::PhantomData, ··· 2481 ) -> ::std::result::Result<(), ::jacquard_lexicon::validation::ConstraintError> { 2482 Ok(()) 2483 } 2484 + }
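A note on consuming these frames: `decode_framed` dispatches on the `#`-prefixed type field in the event header, and the message enum is an open union, so callers should keep a catch-all arm for event types added after codegen. A hedged sketch, assuming `frame_bytes: &[u8]` holds one complete binary websocket frame:

    match SubscribeReposMessage::decode_framed(frame_bytes)? {
        SubscribeReposMessage::Commit(commit) => {
            // hand commit.ops / commit.blocks to record extraction
        }
        SubscribeReposMessage::Identity(_) | SubscribeReposMessage::Account(_) => {
            // refresh identity / account-status caches
        }
        SubscribeReposMessage::Sync(_) => {
            // repo state reset; re-sync from the provided snapshot
        }
        SubscribeReposMessage::Info(info) => tracing::warn!("info frame: {:?}", info),
        _ => {} // open union: tolerate unknown event types
    }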
+2
crates/weaver-index/Cargo.toml
··· 64 # Utilities 65 humansize = "2.0" 66 base64 = "0.22"
··· 64 # Utilities 65 humansize = "2.0" 66 base64 = "0.22" 67 + dashmap = "6" 68 + include_dir = "0.7.4"
+17 -2
crates/weaver-index/migrations/clickhouse/001_raw_records.sql
··· 13 -- Content identifier from the record 14 cid String, 15 16 -- Full record as native JSON (schema-flexible, queryable with record.field.subfield) 17 record JSON, 18 ··· 28 -- When we indexed this record 29 indexed_at DateTime64(3) DEFAULT now64(3), 30 31 -- Materialized AT URI for convenience 32 - uri String MATERIALIZED concat('at://', did, '/', collection, '/', rkey) 33 ) 34 ENGINE = ReplacingMergeTree(indexed_at) 35 - ORDER BY (collection, did, rkey, indexed_at);
··· 13 -- Content identifier from the record 14 cid String, 15 16 + -- Repository revision (TID) - monotonically increasing per DID, used for dedup/ordering 17 + rev String, 18 + 19 -- Full record as native JSON (schema-flexible, queryable with record.field.subfield) 20 record JSON, 21 ··· 31 -- When we indexed this record 32 indexed_at DateTime64(3) DEFAULT now64(3), 33 34 + -- Validation state: 'unchecked', 'valid', 'invalid_rev', 'invalid_gap', 'invalid_account' 35 + -- Populated by async batch validation, not in hot path 36 + validation_state LowCardinality(String) DEFAULT 'unchecked', 37 + 38 -- Materialized AT URI for convenience 39 + uri String MATERIALIZED concat('at://', did, '/', collection, '/', rkey), 40 + 41 + -- Projection for fast delete lookups by (did, cid) 42 + -- Delete events include CID, so we can O(1) lookup the original record 43 + -- to know what to decrement (e.g., which notebook's like count) 44 + PROJECTION by_did_cid ( 45 + SELECT * ORDER BY (did, cid) 46 + ) 47 ) 48 ENGINE = ReplacingMergeTree(indexed_at) 49 + ORDER BY (collection, did, rkey, event_time) 50 + SETTINGS deduplicate_merge_projection_mode = 'drop';
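The `by_did_cid` projection supports the delete path: per the comment above, a delete event's CID lets us resolve the original row (and thus what to decrement) without scanning the `(collection, did, rkey, ...)` primary order, and ClickHouse uses the projection automatically when the predicate matches its ORDER BY. A sketch of that lookup through the `clickhouse` crate, using the `client.inner()` accessor and `RawRecordInsert` row type introduced later in this commit (`did`/`cid` are placeholders):

    // Hypothetical delete resolution: find the record a (did, cid) pair refers to.
    let original: Option<RawRecordInsert> = client
        .inner()
        .query("SELECT ?fields FROM raw_records WHERE did = ? AND cid = ? LIMIT 1")
        .bind(did)
        .bind(cid)
        .fetch_optional()
        .await?;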
+24
crates/weaver-index/migrations/clickhouse/006_account_rev_state.sql
···
··· 1 + -- Per-account revision state tracking 2 + -- Maintains latest rev/cid per DID for dedup and gap detection 3 + -- 4 + -- AggregatingMergeTree with incremental MV from raw_records 5 + -- Query with argMaxMerge/maxMerge to finalize aggregates 6 + 7 + CREATE TABLE IF NOT EXISTS account_rev_state ( 8 + -- Account DID 9 + did String, 10 + 11 + -- Latest revision (TID) seen for this account 12 + last_rev AggregateFunction(argMax, String, DateTime64(3)), 13 + 14 + -- CID of the latest revision 15 + last_cid AggregateFunction(argMax, String, DateTime64(3)), 16 + 17 + -- Latest sequence number seen 18 + last_seq AggregateFunction(max, UInt64), 19 + 20 + -- Latest event time seen 21 + last_event_time AggregateFunction(max, DateTime64(3)) 22 + ) 23 + ENGINE = AggregatingMergeTree() 24 + ORDER BY did
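Because these columns hold AggregateFunction states, reads must finalize them with the matching -Merge combinators. This is the same query the indexer uses below to warm its in-memory rev cache, shown here against the `AccountRevState` row type added in this commit:

    let rows: Vec<AccountRevState> = client
        .inner()
        .query(
            "SELECT did, \
                    argMaxMerge(last_rev) AS last_rev, \
                    argMaxMerge(last_cid) AS last_cid, \
                    maxMerge(last_seq) AS last_seq, \
                    maxMerge(last_event_time) AS last_event_time \
             FROM account_rev_state GROUP BY did",
        )
        .fetch_all()
        .await?;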
+12
crates/weaver-index/migrations/clickhouse/007_account_rev_state_mv.sql
···
··· 1 + -- Incremental MV: fires on each insert to raw_records, maintains aggregate state 2 + -- Must be created after both account_rev_state (target) and raw_records (source) exist 3 + 4 + CREATE MATERIALIZED VIEW IF NOT EXISTS account_rev_state_mv TO account_rev_state AS 5 + SELECT 6 + did, 7 + argMaxState(rev, event_time) as last_rev, 8 + argMaxState(cid, event_time) as last_cid, 9 + maxState(seq) as last_seq, 10 + maxState(event_time) as last_event_time 11 + FROM raw_records 12 + GROUP BY did
+4 -5
crates/weaver-index/src/bin/storage_benchmark.rs
··· 305 } 306 }; 307 308 - let event_time = DateTime::from_timestamp_millis(record.event_time_ms).unwrap(); 309 // Insert JSON record 310 json_inserter 311 .write(&RawRecordJson { 312 did: record.did.clone(), 313 collection: record.collection.clone(), 314 rkey: record.rkey.clone(), 315 - cid: record.cid.clone(), 316 record: json_str, 317 operation: record.operation.clone(), 318 seq: record.seq as u64, 319 - event_time: event_time.clone(), 320 }) 321 .await 322 .map_err(|e| weaver_index::error::ClickHouseError::Insert { ··· 330 did: record.did, 331 collection: record.collection, 332 rkey: record.rkey, 333 - cid: record.cid, 334 record: cbor_bytes.clone(), 335 operation: record.operation, 336 seq: record.seq as u64, 337 - event_time, 338 }) 339 .await 340 .map_err(|e| weaver_index::error::ClickHouseError::Insert {
··· 305 } 306 }; 307 308 // Insert JSON record 309 json_inserter 310 .write(&RawRecordJson { 311 did: record.did.clone(), 312 collection: record.collection.clone(), 313 rkey: record.rkey.clone(), 314 + cid: record.cid.to_string(), 315 record: json_str, 316 operation: record.operation.clone(), 317 seq: record.seq as u64, 318 + event_time: record.event_time, 319 }) 320 .await 321 .map_err(|e| weaver_index::error::ClickHouseError::Insert { ··· 329 did: record.did, 330 collection: record.collection, 331 rkey: record.rkey, 332 + cid: record.cid.to_string(), 333 record: cbor_bytes.clone(), 334 operation: record.operation, 335 seq: record.seq as u64, 336 + event_time: record.event_time, 337 }) 338 .await 339 .map_err(|e| weaver_index::error::ClickHouseError::Insert {
+71 -10
crates/weaver-index/src/bin/weaver_indexer.rs
··· 1 use clap::{Parser, Subcommand}; 2 - use tracing::info; 3 - use weaver_index::clickhouse::{Client, Migrator}; 4 - use weaver_index::config::ClickHouseConfig; 5 6 #[derive(Parser)] 7 - #[command(name = "weaver-indexer")] 8 #[command(about = "Weaver index service - firehose ingestion and query serving")] 9 struct Args { 10 #[command(subcommand)] ··· 18 /// Show what would be run without executing 19 #[arg(long)] 20 dry_run: bool, 21 }, 22 23 /// Check database connectivity ··· 31 async fn main() -> miette::Result<()> { 32 dotenvy::dotenv().ok(); 33 34 tracing_subscriber::fmt() 35 .with_env_filter( 36 - tracing_subscriber::EnvFilter::from_default_env() 37 - .add_directive("weaver_index=info".parse().unwrap()) 38 - .add_directive("weaver_indexer=info".parse().unwrap()), 39 ) 40 .init(); 41 42 let args = Args::parse(); 43 44 match args.command { 45 - Command::Migrate { dry_run } => run_migrate(dry_run).await, 46 Command::Health => run_health().await, 47 Command::Run => run_indexer().await, 48 } 49 } 50 51 - async fn run_migrate(dry_run: bool) -> miette::Result<()> { 52 let config = ClickHouseConfig::from_env()?; 53 info!( 54 "Connecting to ClickHouse at {} (database: {})", ··· 56 ); 57 58 let client = Client::new(&config)?; 59 let migrator = Migrator::new(&client); 60 61 if dry_run { ··· 93 } 94 95 async fn run_indexer() -> miette::Result<()> { 96 - info!("Indexer not yet implemented"); 97 Ok(()) 98 }
··· 1 use clap::{Parser, Subcommand}; 2 + use miette::IntoDiagnostic; 3 + use tracing::{Level, info, warn}; 4 + use tracing_subscriber::EnvFilter; 5 + use weaver_index::clickhouse::{Client, Migrator, Tables}; 6 + use weaver_index::config::{ClickHouseConfig, FirehoseConfig, IndexerConfig}; 7 + use weaver_index::firehose::FirehoseConsumer; 8 + use weaver_index::{Indexer, load_cursor}; 9 10 #[derive(Parser)] 11 + #[command(name = "indexer")] 12 #[command(about = "Weaver index service - firehose ingestion and query serving")] 13 struct Args { 14 #[command(subcommand)] ··· 22 /// Show what would be run without executing 23 #[arg(long)] 24 dry_run: bool, 25 + 26 + /// Drop all tables before running migrations (for testing) 27 + #[arg(long)] 28 + reset: bool, 29 }, 30 31 /// Check database connectivity ··· 39 async fn main() -> miette::Result<()> { 40 dotenvy::dotenv().ok(); 41 42 + let console_level = if cfg!(debug_assertions) { 43 + Level::DEBUG 44 + } else { 45 + Level::INFO 46 + }; 47 + 48 tracing_subscriber::fmt() 49 .with_env_filter( 50 + tracing_subscriber::EnvFilter::builder() 51 + .from_env_lossy() 52 + .add_directive(console_level.into()) 53 + .add_directive("hyper_util=info".parse().into_diagnostic()?), 54 ) 55 .init(); 56 57 let args = Args::parse(); 58 59 match args.command { 60 + Command::Migrate { dry_run, reset } => run_migrate(dry_run, reset).await, 61 Command::Health => run_health().await, 62 Command::Run => run_indexer().await, 63 } 64 } 65 66 + async fn run_migrate(dry_run: bool, reset: bool) -> miette::Result<()> { 67 let config = ClickHouseConfig::from_env()?; 68 info!( 69 "Connecting to ClickHouse at {} (database: {})", ··· 71 ); 72 73 let client = Client::new(&config)?; 74 + 75 + if reset { 76 + if dry_run { 77 + info!("Would drop tables:"); 78 + for table in Tables::ALL { 79 + info!(" - {}", table); 80 + } 81 + } else { 82 + info!("Dropping all tables..."); 83 + for table in Tables::ALL { 84 + let query = format!("DROP TABLE IF EXISTS {}", table); 85 + match client.execute(&query).await { 86 + Ok(_) => info!(" dropped {}", table), 87 + Err(e) => warn!(" failed to drop {}: {}", table, e), 88 + } 89 + } 90 + } 91 + } 92 + 93 let migrator = Migrator::new(&client); 94 95 if dry_run { ··· 127 } 128 129 async fn run_indexer() -> miette::Result<()> { 130 + let ch_config = ClickHouseConfig::from_env()?; 131 + let mut firehose_config = FirehoseConfig::from_env()?; 132 + let indexer_config = IndexerConfig::from_env(); 133 + 134 + info!( 135 + "Connecting to ClickHouse at {} (database: {})", 136 + ch_config.url, ch_config.database 137 + ); 138 + let client = Client::new(&ch_config)?; 139 + 140 + // Load cursor from ClickHouse if not overridden by env var 141 + if firehose_config.cursor.is_none() { 142 + if let Some(cursor) = load_cursor(&client).await? { 143 + firehose_config.cursor = Some(cursor); 144 + } 145 + } 146 + 147 + info!( 148 + "Connecting to firehose at {} (cursor: {:?})", 149 + firehose_config.relay_url, firehose_config.cursor 150 + ); 151 + let consumer = FirehoseConsumer::new(firehose_config); 152 + 153 + let indexer = Indexer::new(client, consumer, indexer_config).await?; 154 + 155 + info!("Starting indexer"); 156 + indexer.run().await?; 157 + 158 Ok(()) 159 }
+2 -1
crates/weaver-index/src/clickhouse.rs
··· 5 pub use client::{Client, TableSize}; 6 pub use migrations::{MigrationResult, Migrator}; 7 pub use schema::{ 8 - FirehoseCursor, RawAccountEvent, RawEventDlq, RawIdentityEvent, RawRecord, Tables, 9 };
··· 5 pub use client::{Client, TableSize}; 6 pub use migrations::{MigrationResult, Migrator}; 7 pub use schema::{ 8 + AccountRevState, FirehoseCursor, RawAccountEvent, RawEventDlq, RawIdentityEvent, 9 + RawRecordInsert, Tables, 10 };
+5 -1
crates/weaver-index/src/clickhouse/client.rs
··· 15 .with_url(config.url.as_str()) 16 .with_database(&config.database) 17 .with_user(&config.user) 18 - .with_password(&config.password); 19 20 Ok(Self { inner }) 21 }
··· 15 .with_url(config.url.as_str()) 16 .with_database(&config.database) 17 .with_user(&config.user) 18 + .with_password(&config.password) 19 + // Enable JSON type support (treated as string at transport level) 20 + .with_option("allow_experimental_json_type", "1") 21 + .with_option("input_format_binary_read_json_as_string", "1") 22 + .with_option("output_format_binary_write_json_as_string", "1"); 23 24 Ok(Self { inner }) 25 }
+24 -34
crates/weaver-index/src/clickhouse/migrations.rs
··· 1 use crate::error::{ClickHouseError, IndexError}; 2 use tracing::info; 3 4 use super::Client; 5 6 - /// Embedded migrations - compiled into the binary 7 - const MIGRATIONS: &[(&str, &str)] = &[ 8 - ( 9 - "000_migrations.sql", 10 - include_str!("../../migrations/clickhouse/000_migrations.sql"), 11 - ), 12 - ( 13 - "001_raw_records.sql", 14 - include_str!("../../migrations/clickhouse/001_raw_records.sql"), 15 - ), 16 - ( 17 - "002_identity_events.sql", 18 - include_str!("../../migrations/clickhouse/002_identity_events.sql"), 19 - ), 20 - ( 21 - "003_account_events.sql", 22 - include_str!("../../migrations/clickhouse/003_account_events.sql"), 23 - ), 24 - ( 25 - "004_events_dlq.sql", 26 - include_str!("../../migrations/clickhouse/004_events_dlq.sql"), 27 - ), 28 - ( 29 - "005_firehose_cursor.sql", 30 - include_str!("../../migrations/clickhouse/005_firehose_cursor.sql"), 31 - ), 32 - ]; 33 34 /// Migration runner for ClickHouse 35 pub struct Migrator<'a> { ··· 41 Self { client } 42 } 43 44 /// Run all pending migrations 45 pub async fn run(&self) -> Result<MigrationResult, IndexError> { 46 // First, ensure the migrations table exists (bootstrap) ··· 52 let mut applied_count = 0; 53 let mut skipped_count = 0; 54 55 - for (name, sql) in MIGRATIONS { 56 // Skip the bootstrap migration after first run 57 - if *name == "000_migrations.sql" && applied.contains(&"000_migrations.sql".to_string()) 58 - { 59 skipped_count += 1; 60 continue; 61 } ··· 86 Err(_) => vec![], 87 }; 88 89 - let pending: Vec<String> = MIGRATIONS 90 - .iter() 91 .filter(|(name, _)| !applied.contains(&name.to_string())) 92 .map(|(name, _)| name.to_string()) 93 .collect(); ··· 97 98 async fn ensure_migrations_table(&self) -> Result<(), IndexError> { 99 // Run the bootstrap migration directly 100 - let (_, sql) = MIGRATIONS 101 - .iter() 102 .find(|(name, _)| *name == "000_migrations.sql") 103 .expect("bootstrap migration must exist"); 104
··· 1 use crate::error::{ClickHouseError, IndexError}; 2 + use include_dir::{Dir, include_dir}; 3 use tracing::info; 4 5 use super::Client; 6 7 + /// Embedded migrations directory - compiled into the binary 8 + static MIGRATIONS_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/migrations/clickhouse"); 9 10 /// Migration runner for ClickHouse 11 pub struct Migrator<'a> { ··· 17 Self { client } 18 } 19 20 + /// Get sorted list of migration files from embedded directory 21 + fn migrations() -> Vec<(&'static str, &'static str)> { 22 + let mut files: Vec<_> = MIGRATIONS_DIR 23 + .files() 24 + .filter(|f| f.path().extension().is_some_and(|ext| ext == "sql")) 25 + .filter_map(|f| { 26 + let name = f.path().file_name()?.to_str()?; 27 + let contents = f.contents_utf8()?; 28 + Some((name, contents)) 29 + }) 30 + .collect(); 31 + files.sort_by_key(|(name, _)| *name); 32 + files 33 + } 34 + 35 /// Run all pending migrations 36 pub async fn run(&self) -> Result<MigrationResult, IndexError> { 37 // First, ensure the migrations table exists (bootstrap) ··· 43 let mut applied_count = 0; 44 let mut skipped_count = 0; 45 46 + for (name, sql) in Self::migrations() { 47 // Skip the bootstrap migration after first run 48 + if name == "000_migrations.sql" && applied.contains(&"000_migrations.sql".to_string()) { 49 skipped_count += 1; 50 continue; 51 } ··· 76 Err(_) => vec![], 77 }; 78 79 + let pending: Vec<String> = Self::migrations() 80 + .into_iter() 81 .filter(|(name, _)| !applied.contains(&name.to_string())) 82 .map(|(name, _)| name.to_string()) 83 .collect(); ··· 87 88 async fn ensure_migrations_table(&self) -> Result<(), IndexError> { 89 // Run the bootstrap migration directly 90 + let (_, sql) = Self::migrations() 91 + .into_iter() 92 .find(|(name, _)| *name == "000_migrations.sql") 93 .expect("bootstrap migration must exist"); 94
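Worth noting: `migrations()` orders files lexicographically by name, so the zero-padded `NNN_` prefix convention is load-bearing ("010_foo.sql" sorts after "009_bar.sql", but an unpadded "10_foo.sql" would sort before "2_bar.sql"). The embedded directory also means new migration files are picked up at compile time without editing this module.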
+55 -15
crates/weaver-index/src/clickhouse/schema.rs
··· 1 use chrono::{DateTime, Utc}; 2 use clickhouse::Row; 3 4 /// Table names for production schema 5 pub struct Tables; ··· 10 pub const RAW_ACCOUNT_EVENTS: &'static str = "raw_account_events"; 11 pub const RAW_EVENTS_DLQ: &'static str = "raw_events_dlq"; 12 pub const FIREHOSE_CURSOR: &'static str = "firehose_cursor"; 13 } 14 15 /// Row type for raw_records table 16 /// Schema defined in migrations/clickhouse/001_raw_records.sql 17 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)] 18 - pub struct RawRecord { 19 - pub did: String, 20 - pub collection: String, 21 - pub rkey: String, 22 - pub cid: String, 23 - pub record: String, // JSON string - ClickHouse JSON type accepts string 24 - pub operation: String, 25 pub seq: u64, 26 #[serde(with = "clickhouse::serde::chrono::datetime64::millis")] 27 pub event_time: DateTime<Utc>, 28 } 29 30 /// Row type for raw_identity_events table 31 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)] 32 pub struct RawIdentityEvent { 33 - pub did: String, 34 - pub handle: String, 35 pub seq: u64, 36 #[serde(with = "clickhouse::serde::chrono::datetime64::millis")] 37 pub event_time: DateTime<Utc>, ··· 40 /// Row type for raw_account_events table 41 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)] 42 pub struct RawAccountEvent { 43 - pub did: String, 44 pub active: u8, 45 - pub status: String, 46 pub seq: u64, 47 #[serde(with = "clickhouse::serde::chrono::datetime64::millis")] 48 pub event_time: DateTime<Utc>, ··· 51 /// Row type for raw_events_dlq table 52 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)] 53 pub struct RawEventDlq { 54 - pub event_type: String, 55 - pub raw_data: String, // JSON string 56 - pub error_message: String, 57 pub seq: u64, 58 } 59 60 /// Row type for firehose_cursor table 61 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)] 62 pub struct FirehoseCursor { 63 - pub consumer_id: String, 64 pub seq: u64, 65 #[serde(with = "clickhouse::serde::chrono::datetime64::millis")] 66 pub event_time: DateTime<Utc>, 67 }
··· 1 use chrono::{DateTime, Utc}; 2 use clickhouse::Row; 3 + use smol_str::SmolStr; 4 5 /// Table names for production schema 6 pub struct Tables; ··· 11 pub const RAW_ACCOUNT_EVENTS: &'static str = "raw_account_events"; 12 pub const RAW_EVENTS_DLQ: &'static str = "raw_events_dlq"; 13 pub const FIREHOSE_CURSOR: &'static str = "firehose_cursor"; 14 + pub const ACCOUNT_REV_STATE: &'static str = "account_rev_state"; 15 + pub const ACCOUNT_REV_STATE_MV: &'static str = "account_rev_state_mv"; 16 + pub const MIGRATIONS: &'static str = "_migrations"; 17 + 18 + /// All tables and views in drop order (MVs before their source tables) 19 + pub const ALL: &'static [&'static str] = &[ 20 + Self::ACCOUNT_REV_STATE_MV, // MV first, depends on raw_records 21 + Self::ACCOUNT_REV_STATE, 22 + Self::RAW_RECORDS, 23 + Self::RAW_IDENTITY_EVENTS, 24 + Self::RAW_ACCOUNT_EVENTS, 25 + Self::RAW_EVENTS_DLQ, 26 + Self::FIREHOSE_CURSOR, 27 + Self::MIGRATIONS, 28 + ]; 29 + } 30 + 31 + /// Validation states for records 32 + pub mod validation { 33 + pub const UNCHECKED: &str = "unchecked"; 34 + pub const VALID: &str = "valid"; 35 + pub const INVALID_REV: &str = "invalid_rev"; 36 + pub const INVALID_GAP: &str = "invalid_gap"; 37 + pub const INVALID_ACCOUNT: &str = "invalid_account"; 38 } 39 40 /// Row type for raw_records table 41 /// Schema defined in migrations/clickhouse/001_raw_records.sql 42 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)] 43 + pub struct RawRecordInsert { 44 + pub did: SmolStr, 45 + pub collection: SmolStr, 46 + pub rkey: SmolStr, 47 + pub cid: SmolStr, 48 + pub rev: SmolStr, 49 + pub record: SmolStr, // JSON string - ClickHouse JSON type accepts string 50 + pub operation: SmolStr, 51 pub seq: u64, 52 #[serde(with = "clickhouse::serde::chrono::datetime64::millis")] 53 pub event_time: DateTime<Utc>, 54 + // Note: indexed_at has DEFAULT now64(3), omit from insert 55 + // Note: validation_state has DEFAULT 'unchecked', omit from insert 56 } 57 58 /// Row type for raw_identity_events table 59 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)] 60 pub struct RawIdentityEvent { 61 + pub did: SmolStr, 62 + pub handle: SmolStr, 63 pub seq: u64, 64 #[serde(with = "clickhouse::serde::chrono::datetime64::millis")] 65 pub event_time: DateTime<Utc>, ··· 68 /// Row type for raw_account_events table 69 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)] 70 pub struct RawAccountEvent { 71 + pub did: SmolStr, 72 pub active: u8, 73 + pub status: SmolStr, 74 pub seq: u64, 75 #[serde(with = "clickhouse::serde::chrono::datetime64::millis")] 76 pub event_time: DateTime<Utc>, ··· 79 /// Row type for raw_events_dlq table 80 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)] 81 pub struct RawEventDlq { 82 + pub event_type: SmolStr, 83 + pub raw_data: SmolStr, // JSON string 84 + pub error_message: SmolStr, 85 pub seq: u64, 86 } 87 88 /// Row type for firehose_cursor table 89 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)] 90 pub struct FirehoseCursor { 91 + pub consumer_id: SmolStr, 92 pub seq: u64, 93 #[serde(with = "clickhouse::serde::chrono::datetime64::millis")] 94 pub event_time: DateTime<Utc>, 95 } 96 + 97 + /// Row type for reading finalized account_rev_state 98 + /// Query with: SELECT did, argMaxMerge(last_rev), argMaxMerge(last_cid), maxMerge(last_seq), maxMerge(last_event_time) FROM account_rev_state GROUP BY did 99 + #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)] 100 + pub struct AccountRevState { 101 + pub 
did: SmolStr, 102 + pub last_rev: SmolStr, 103 + pub last_cid: SmolStr, 104 + pub last_seq: u64, 105 + #[serde(with = "clickhouse::serde::chrono::datetime64::millis")] 106 + pub last_event_time: DateTime<Utc>, 107 + }
+120
crates/weaver-index/src/config.rs
··· 1 use crate::error::{ConfigError, IndexError}; 2 use url::Url; 3 4 /// ClickHouse connection configuration ··· 84 } 85 } 86 87 /// Combined configuration for the indexer 88 #[derive(Debug, Clone)] 89 pub struct Config { 90 pub clickhouse: ClickHouseConfig, 91 pub firehose: FirehoseConfig, 92 } 93 94 impl Config { ··· 97 Ok(Self { 98 clickhouse: ClickHouseConfig::from_env()?, 99 firehose: FirehoseConfig::from_env()?, 100 }) 101 } 102 }
··· 1 use crate::error::{ConfigError, IndexError}; 2 + use dashmap::DashSet; 3 use url::Url; 4 5 /// ClickHouse connection configuration ··· 85 } 86 } 87 88 + use smol_str::{SmolStr, ToSmolStr}; 89 + 90 + /// Pre-parsed collection filter for efficient matching 91 + #[derive(Debug, Clone)] 92 + pub struct CollectionFilter { 93 + /// Prefix patterns (from "foo.*" -> "foo.") 94 + prefixes: Vec<SmolStr>, 95 + /// Exact match patterns (DashSet for O(1) lookup) 96 + exact: DashSet<SmolStr>, 97 + /// True if filter is empty (accept all) 98 + accept_all: bool, 99 + } 100 + 101 + impl CollectionFilter { 102 + /// Parse filter patterns into prefixes and exact matches 103 + pub fn new(patterns: Vec<SmolStr>) -> Self { 104 + let mut prefixes = Vec::new(); 105 + let exact = DashSet::new(); 106 + 107 + for pattern in patterns { 108 + if let Some(prefix) = pattern.strip_suffix('*') { 109 + prefixes.push(SmolStr::new(prefix)); 110 + } else { 111 + exact.insert(SmolStr::new(&pattern)); 112 + } 113 + } 114 + 115 + let accept_all = prefixes.is_empty() && exact.is_empty(); 116 + Self { 117 + prefixes, 118 + exact, 119 + accept_all, 120 + } 121 + } 122 + 123 + /// Check if a collection matches any pattern 124 + #[inline] 125 + pub fn matches(&self, collection: &str) -> bool { 126 + if self.accept_all { 127 + return true; 128 + } 129 + 130 + // O(1) exact match check first 131 + if self.exact.contains(collection) { 132 + return true; 133 + } 134 + 135 + // Prefix check - for small N, linear scan is fine 136 + // Accumulate without early return to help branch predictor 137 + let mut matched = false; 138 + for prefix in &self.prefixes { 139 + matched |= collection.starts_with(prefix.as_str()); 140 + } 141 + matched 142 + } 143 + } 144 + 145 + /// Indexer runtime configuration 146 + #[derive(Debug, Clone)] 147 + pub struct IndexerConfig { 148 + /// Maximum records to batch before flushing to ClickHouse 149 + pub batch_size: usize, 150 + /// Maximum time (ms) before flushing even if batch isn't full 151 + pub flush_interval_ms: u64, 152 + /// Collection filter (pre-parsed patterns) 153 + pub collections: CollectionFilter, 154 + } 155 + 156 + impl Default for IndexerConfig { 157 + fn default() -> Self { 158 + Self { 159 + batch_size: 1000, 160 + flush_interval_ms: 1000, 161 + collections: CollectionFilter::new(vec![ 162 + SmolStr::new_static("sh.weaver.*"), 163 + SmolStr::new_static("app.bsky.actor.profile"), 164 + ]), 165 + } 166 + } 167 + } 168 + 169 + impl IndexerConfig { 170 + /// Load configuration from environment variables.
171 + /// 172 + /// Optional env vars: 173 + /// - `INDEXER_BATCH_SIZE`: Max records per batch (default: 1000) 174 + /// - `INDEXER_FLUSH_INTERVAL_MS`: Max ms between flushes (default: 1000) 175 + /// - `INDEXER_COLLECTIONS`: Comma-separated collection patterns (default: sh.weaver.*,app.bsky.actor.profile) 176 + /// Use * suffix for prefix matching, e.g., "sh.weaver.*" matches all sh.weaver.* collections 177 + pub fn from_env() -> Self { 178 + let batch_size = std::env::var("INDEXER_BATCH_SIZE") 179 + .ok() 180 + .and_then(|s| s.parse().ok()) 181 + .unwrap_or(1000); 182 + 183 + let flush_interval_ms = std::env::var("INDEXER_FLUSH_INTERVAL_MS") 184 + .ok() 185 + .and_then(|s| s.parse().ok()) 186 + .unwrap_or(1000); 187 + 188 + let patterns: Vec<SmolStr> = std::env::var("INDEXER_COLLECTIONS") 189 + .map(|s| s.split(',').map(|p| p.trim().to_smolstr()).collect()) 190 + .unwrap_or_else(|_| { 191 + vec![ 192 + SmolStr::new_static("sh.weaver.*"), 193 + SmolStr::new_static("app.bsky.actor.profile"), 194 + ] 195 + }); 196 + 197 + Self { 198 + batch_size, 199 + flush_interval_ms, 200 + collections: CollectionFilter::new(patterns), 201 + } 202 + } 203 + } 204 + 205 /// Combined configuration for the indexer 206 #[derive(Debug, Clone)] 207 pub struct Config { 208 pub clickhouse: ClickHouseConfig, 209 pub firehose: FirehoseConfig, 210 + pub indexer: IndexerConfig, 211 } 212 213 impl Config { ··· 216 Ok(Self { 217 clickhouse: ClickHouseConfig::from_env()?, 218 firehose: FirehoseConfig::from_env()?, 219 + indexer: IndexerConfig::from_env(), 220 }) 221 } 222 }
+2 -2
crates/weaver-index/src/firehose.rs
··· 2 mod records; 3 4 pub use consumer::{ 5 - FirehoseConsumer, MessageStream, SubscribeReposMessage, Commit, Identity, Account, Sync, 6 }; 7 - pub use records::{extract_records, ExtractedRecord};
··· 2 mod records; 3 4 pub use consumer::{ 5 + Account, Commit, FirehoseConsumer, Identity, MessageStream, SubscribeReposMessage, Sync, 6 }; 7 + pub use records::{ExtractedRecord, extract_records};
+10 -6
crates/weaver-index/src/firehose/records.rs
··· 1 use crate::error::{CarError, IndexError}; 2 use bytes::Bytes; 3 use jacquard_repo::car::reader::parse_car_bytes; 4 use smol_str::{SmolStr, ToSmolStr}; 5 ··· 15 /// Record key within the collection 16 pub rkey: SmolStr, 17 /// Content identifier 18 - pub cid: String, 19 /// Operation type: "create", "update", or "delete" 20 pub operation: SmolStr, 21 /// Raw DAG-CBOR bytes of the record (None for deletes) 22 pub cbor_bytes: Option<Bytes>, 23 /// Sequence number from the firehose event 24 pub seq: i64, 25 - /// Event timestamp (milliseconds since epoch) 26 - pub event_time_ms: i64, 27 } 28 29 impl ExtractedRecord { ··· 61 message: e.to_string(), 62 })?; 63 64 - let event_time_ms = commit.time.as_ref().timestamp_millis(); 65 let mut records = Vec::with_capacity(commit.ops.len()); 66 67 for op in &commit.ops { ··· 77 }; 78 79 let operation = op.action.to_smolstr(); 80 - let cid_str = op.cid.as_ref().map(|c| c.to_string()).unwrap_or_default(); 81 82 // For creates/updates, look up the record in the CAR blocks 83 let cbor_bytes = if let Some(cid_link) = &op.cid { ··· 97 collection, 98 rkey, 99 cid: cid_str, 100 operation, 101 cbor_bytes, 102 seq: commit.seq, 103 - event_time_ms, 104 }); 105 } 106
··· 1 use crate::error::{CarError, IndexError}; 2 use bytes::Bytes; 3 + use chrono::{DateTime, Utc}; 4 use jacquard_repo::car::reader::parse_car_bytes; 5 use smol_str::{SmolStr, ToSmolStr}; 6 ··· 16 /// Record key within the collection 17 pub rkey: SmolStr, 18 /// Content identifier 19 + pub cid: SmolStr, 20 + /// Repository revision (TID) - monotonically increasing per DID 21 + pub rev: SmolStr, 22 /// Operation type: "create", "update", or "delete" 23 pub operation: SmolStr, 24 /// Raw DAG-CBOR bytes of the record (None for deletes) 25 pub cbor_bytes: Option<Bytes>, 26 /// Sequence number from the firehose event 27 pub seq: i64, 28 + /// Event timestamp 29 + pub event_time: DateTime<Utc>, 30 } 31 32 impl ExtractedRecord { ··· 64 message: e.to_string(), 65 })?; 66 67 + let event_time = commit.time.as_ref().with_timezone(&Utc); 68 let mut records = Vec::with_capacity(commit.ops.len()); 69 70 for op in &commit.ops { ··· 80 }; 81 82 let operation = op.action.to_smolstr(); 83 + let cid_str = op.cid.as_ref().map(|c| c.to_smolstr()).unwrap_or_default(); 84 85 // For creates/updates, look up the record in the CAR blocks 86 let cbor_bytes = if let Some(cid_link) = &op.cid { ··· 100 collection, 101 rkey, 102 cid: cid_str, 103 + rev: commit.rev.to_smolstr(), 104 operation, 105 cbor_bytes, 106 seq: commit.seq, 107 + event_time, 108 }); 109 } 110
+512
crates/weaver-index/src/indexer.rs
···
··· 1 + use std::sync::Arc; 2 + use std::time::{Duration, Instant}; 3 + 4 + use chrono::Utc; 5 + use dashmap::DashMap; 6 + use n0_future::StreamExt; 7 + use smol_str::{SmolStr, ToSmolStr}; 8 + use tracing::{debug, info, warn}; 9 + 10 + use chrono::DateTime; 11 + 12 + use crate::clickhouse::{ 13 + AccountRevState, Client, FirehoseCursor, RawAccountEvent, RawIdentityEvent, RawRecordInsert, 14 + }; 15 + use crate::config::IndexerConfig; 16 + use crate::error::{IndexError, Result}; 17 + use crate::firehose::{ 18 + Account, Commit, ExtractedRecord, FirehoseConsumer, Identity, MessageStream, 19 + SubscribeReposMessage, extract_records, 20 + }; 21 + 22 + /// Default consumer ID for cursor tracking 23 + const CONSUMER_ID: &str = "main"; 24 + 25 + /// Per-account revision state for deduplication 26 + #[derive(Debug, Clone)] 27 + pub struct RevState { 28 + pub last_rev: SmolStr, 29 + pub last_cid: SmolStr, 30 + } 31 + 32 + /// In-memory cache of per-account revision state 33 + /// 34 + /// Used for fast deduplication without hitting ClickHouse on every event. 35 + /// Populated from account_rev_state table on startup, updated as events are processed. 36 + pub struct RevCache { 37 + inner: DashMap<SmolStr, RevState>, 38 + } 39 + 40 + impl RevCache { 41 + pub fn new() -> Self { 42 + Self { 43 + inner: DashMap::new(), 44 + } 45 + } 46 + 47 + /// Load cache from ClickHouse account_rev_state table 48 + pub async fn load_from_clickhouse(client: &Client) -> Result<Self> { 49 + let query = r#" 50 + SELECT 51 + did, 52 + argMaxMerge(last_rev) as last_rev, 53 + argMaxMerge(last_cid) as last_cid, 54 + maxMerge(last_seq) as last_seq, 55 + maxMerge(last_event_time) as last_event_time 56 + FROM account_rev_state 57 + GROUP BY did 58 + "#; 59 + 60 + let rows: Vec<AccountRevState> = 61 + client.inner().query(query).fetch_all().await.map_err(|e| { 62 + IndexError::ClickHouse(crate::error::ClickHouseError::Query { 63 + message: "failed to load account rev state".into(), 64 + source: e, 65 + }) 66 + })?; 67 + 68 + let cache = Self::new(); 69 + for row in rows { 70 + cache.inner.insert( 71 + SmolStr::new(&row.did), 72 + RevState { 73 + last_rev: SmolStr::new(&row.last_rev), 74 + last_cid: SmolStr::new(&row.last_cid), 75 + }, 76 + ); 77 + } 78 + 79 + info!( 80 + accounts = cache.inner.len(), 81 + "loaded rev cache from clickhouse" 82 + ); 83 + Ok(cache) 84 + } 85 + 86 + /// Check if we should process this commit (returns false if already seen) 87 + pub fn should_process(&self, did: &str, rev: &str) -> bool { 88 + match self.inner.get(did) { 89 + Some(state) => rev > state.last_rev.as_str(), 90 + None => true, // new account, always process 91 + } 92 + } 93 + 94 + /// Update cache after processing a commit 95 + pub fn update(&self, did: &SmolStr, rev: &SmolStr, cid: &SmolStr) { 96 + self.inner.insert( 97 + did.clone(), 98 + RevState { 99 + last_rev: rev.clone(), 100 + last_cid: cid.clone(), 101 + }, 102 + ); 103 + } 104 + 105 + /// Get current cache size (number of accounts tracked) 106 + pub fn len(&self) -> usize { 107 + self.inner.len() 108 + } 109 + 110 + pub fn is_empty(&self) -> bool { 111 + self.inner.is_empty() 112 + } 113 + } 114 + 115 + impl Default for RevCache { 116 + fn default() -> Self { 117 + Self::new() 118 + } 119 + } 120 + 121 + /// Safety margin when resuming - back up this many sequence numbers 122 + /// to ensure no gaps from incomplete batches or race conditions 123 + const CURSOR_REWIND: i64 = 1000; 124 + 125 + /// Load cursor from ClickHouse for resuming 126 + /// 127 + /// Returns cursor with 
safety margin subtracted to ensure overlap 128 + pub async fn load_cursor(client: &Client) -> Result<Option<i64>> { 129 + let query = format!( 130 + r#" 131 + SELECT consumer_id, seq, event_time 132 + FROM firehose_cursor FINAL 133 + WHERE consumer_id = '{}' 134 + LIMIT 1 135 + "#, 136 + CONSUMER_ID 137 + ); 138 + 139 + let cursor: Option<FirehoseCursor> = client 140 + .inner() 141 + .query(&query) 142 + .fetch_optional() 143 + .await 144 + .map_err(|e| crate::error::ClickHouseError::Query { 145 + message: "failed to load cursor".into(), 146 + source: e, 147 + })?; 148 + 149 + if let Some(c) = &cursor { 150 + let resume_at = (c.seq as i64).saturating_sub(CURSOR_REWIND); 151 + info!( 152 + saved_seq = c.seq, 153 + resume_seq = resume_at, 154 + rewind = CURSOR_REWIND, 155 + "loaded cursor from clickhouse (with safety margin)" 156 + ); 157 + Ok(Some(resume_at)) 158 + } else { 159 + Ok(None) 160 + } 161 + } 162 + 163 + /// Main indexer that consumes firehose and writes to ClickHouse 164 + pub struct Indexer { 165 + client: Arc<Client>, 166 + consumer: FirehoseConsumer, 167 + rev_cache: RevCache, 168 + config: IndexerConfig, 169 + } 170 + 171 + impl Indexer { 172 + /// Create a new indexer 173 + pub async fn new( 174 + client: Client, 175 + consumer: FirehoseConsumer, 176 + config: IndexerConfig, 177 + ) -> Result<Self> { 178 + let client = Arc::new(client); 179 + 180 + // Load rev cache from ClickHouse 181 + let rev_cache = RevCache::load_from_clickhouse(&client).await?; 182 + 183 + Ok(Self { 184 + client, 185 + consumer, 186 + rev_cache, 187 + config, 188 + }) 189 + } 190 + 191 + /// Save cursor to ClickHouse 192 + async fn save_cursor(&self, seq: u64, event_time: DateTime<Utc>) -> Result<()> { 193 + let query = format!( 194 + "INSERT INTO firehose_cursor (consumer_id, seq, event_time) VALUES ('{}', {}, {})", 195 + CONSUMER_ID, 196 + seq, 197 + event_time.timestamp_millis() 198 + ); 199 + 200 + self.client.execute(&query).await?; 201 + debug!(seq, "saved cursor"); 202 + Ok(()) 203 + } 204 + 205 + /// Run the indexer loop 206 + pub async fn run(&self) -> Result<()> { 207 + info!("connecting to firehose..."); 208 + let mut stream: MessageStream = self.consumer.connect().await?; 209 + 210 + // Inserters handle batching internally based on config 211 + let mut records = self.client.inserter::<RawRecordInsert>("raw_records"); 212 + let mut identities = self 213 + .client 214 + .inserter::<RawIdentityEvent>("raw_identity_events"); 215 + let mut accounts = self 216 + .client 217 + .inserter::<RawAccountEvent>("raw_account_events"); 218 + 219 + // Stats and cursor tracking 220 + let mut processed: u64 = 0; 221 + let mut skipped: u64 = 0; 222 + let mut last_seq: u64 = 0; 223 + let mut last_event_time = Utc::now(); 224 + let mut last_stats = Instant::now(); 225 + let mut last_cursor_save = Instant::now(); 226 + 227 + info!("starting indexer loop"); 228 + 229 + while let Some(result) = stream.next().await { 230 + let msg = match result { 231 + Ok(msg) => msg, 232 + Err(e) => { 233 + warn!(error = ?e, "firehose stream error"); 234 + continue; 235 + } 236 + }; 237 + 238 + // Track seq from any message type that has it 239 + match &msg { 240 + SubscribeReposMessage::Commit(c) => { 241 + last_seq = c.seq as u64; 242 + last_event_time = c.time.as_ref().with_timezone(&Utc); 243 + } 244 + SubscribeReposMessage::Identity(i) => { 245 + last_seq = i.seq as u64; 246 + last_event_time = i.time.as_ref().with_timezone(&Utc); 247 + } 248 + SubscribeReposMessage::Account(a) => { 249 + last_seq = a.seq as u64; 250 + 
last_event_time = a.time.as_ref().with_timezone(&Utc); 251 + } 252 + _ => {} 253 + } 254 + 255 + match msg { 256 + SubscribeReposMessage::Commit(commit) => { 257 + if self 258 + .process_commit(&commit, &mut records, &mut skipped) 259 + .await? 260 + { 261 + processed += 1; 262 + } 263 + } 264 + SubscribeReposMessage::Identity(identity) => { 265 + write_identity(&identity, &mut identities).await?; 266 + } 267 + SubscribeReposMessage::Account(account) => { 268 + write_account(&account, &mut accounts).await?; 269 + } 270 + SubscribeReposMessage::Sync(_) => { 271 + debug!("received sync (tooBig) event, skipping"); 272 + } 273 + _ => {} 274 + } 275 + 276 + // commit() flushes if internal thresholds met, otherwise no-op 277 + records 278 + .commit() 279 + .await 280 + .map_err(|e| crate::error::ClickHouseError::Query { 281 + message: "commit failed".into(), 282 + source: e, 283 + })?; 284 + 285 + // Periodic stats and cursor save (every 10s) 286 + if last_stats.elapsed() >= Duration::from_secs(10) { 287 + info!( 288 + processed, 289 + skipped, 290 + last_seq, 291 + rev_cache_size = self.rev_cache.len(), 292 + "indexer stats" 293 + ); 294 + last_stats = Instant::now(); 295 + } 296 + 297 + // Save cursor every 30s 298 + if last_cursor_save.elapsed() >= Duration::from_secs(30) && last_seq > 0 { 299 + if let Err(e) = self.save_cursor(last_seq, last_event_time).await { 300 + warn!(error = ?e, "failed to save cursor"); 301 + } 302 + last_cursor_save = Instant::now(); 303 + } 304 + } 305 + 306 + // Final flush 307 + records 308 + .end() 309 + .await 310 + .map_err(|e| crate::error::ClickHouseError::Query { 311 + message: "final flush failed".into(), 312 + source: e, 313 + })?; 314 + identities 315 + .end() 316 + .await 317 + .map_err(|e| crate::error::ClickHouseError::Query { 318 + message: "final flush failed".into(), 319 + source: e, 320 + })?; 321 + accounts 322 + .end() 323 + .await 324 + .map_err(|e| crate::error::ClickHouseError::Query { 325 + message: "final flush failed".into(), 326 + source: e, 327 + })?; 328 + 329 + // Final cursor save 330 + if last_seq > 0 { 331 + self.save_cursor(last_seq, last_event_time).await?; 332 + } 333 + 334 + info!(last_seq, "firehose stream ended"); 335 + Ok(()) 336 + } 337 + 338 + async fn process_commit( 339 + &self, 340 + commit: &Commit<'_>, 341 + inserter: &mut clickhouse::inserter::Inserter<RawRecordInsert>, 342 + skipped: &mut u64, 343 + ) -> Result<bool> { 344 + let did = commit.repo.as_ref(); 345 + let rev = commit.rev.as_ref(); 346 + 347 + // Dedup check 348 + if !self.rev_cache.should_process(did, rev) { 349 + *skipped += 1; 350 + return Ok(false); 351 + } 352 + 353 + // Extract and write records 354 + for record in extract_records(commit).await? 
{ 355 + // Collection filter - skip early before JSON conversion 356 + if !self.config.collections.matches(&record.collection) { 357 + continue; 358 + } 359 + 360 + let json = record.to_json()?.unwrap_or_else(|| "{}".to_string()); 361 + 362 + // Fire and forget delete handling 363 + if record.operation == "delete" { 364 + let client = self.client.clone(); 365 + let record_clone = record.clone(); 366 + tokio::spawn(async move { 367 + if let Err(e) = handle_delete(&client, record_clone).await { 368 + warn!(error = ?e, "delete handling failed"); 369 + } 370 + }); 371 + } 372 + 373 + inserter 374 + .write(&RawRecordInsert { 375 + did: record.did.clone(), 376 + collection: record.collection.clone(), 377 + rkey: record.rkey.clone(), 378 + cid: record.cid.clone(), 379 + rev: record.rev.clone(), 380 + record: json.to_smolstr(), 381 + operation: record.operation.clone(), 382 + seq: record.seq as u64, 383 + event_time: record.event_time, 384 + }) 385 + .await 386 + .map_err(|e| crate::error::ClickHouseError::Query { 387 + message: "write failed".into(), 388 + source: e, 389 + })?; 390 + } 391 + 392 + // Update rev cache 393 + self.rev_cache.update( 394 + &SmolStr::new(did), 395 + &SmolStr::new(rev), 396 + &commit.commit.0.to_smolstr(), 397 + ); 398 + 399 + Ok(true) 400 + } 401 + } 402 + 403 + async fn write_identity( 404 + identity: &Identity<'_>, 405 + inserter: &mut clickhouse::inserter::Inserter<RawIdentityEvent>, 406 + ) -> Result<()> { 407 + inserter 408 + .write(&RawIdentityEvent { 409 + did: identity.did.to_smolstr(), 410 + handle: identity 411 + .handle 412 + .as_ref() 413 + .map(|h| h.as_ref().to_smolstr()) 414 + .unwrap_or_default(), 415 + seq: identity.seq as u64, 416 + event_time: identity.time.as_ref().with_timezone(&Utc), 417 + }) 418 + .await 419 + .map_err(|e| crate::error::ClickHouseError::Query { 420 + message: "write failed".into(), 421 + source: e, 422 + })?; 423 + Ok(()) 424 + } 425 + 426 + async fn write_account( 427 + account: &Account<'_>, 428 + inserter: &mut clickhouse::inserter::Inserter<RawAccountEvent>, 429 + ) -> Result<()> { 430 + inserter 431 + .write(&RawAccountEvent { 432 + did: account.did.to_smolstr(), 433 + active: if account.active { 1 } else { 0 }, 434 + status: account 435 + .status 436 + .as_ref() 437 + .map(|s| s.as_ref().to_smolstr()) 438 + .unwrap_or_default(), 439 + seq: account.seq as u64, 440 + event_time: account.time.as_ref().with_timezone(&Utc), 441 + }) 442 + .await 443 + .map_err(|e| crate::error::ClickHouseError::Query { 444 + message: "write failed".into(), 445 + source: e, 446 + })?; 447 + Ok(()) 448 + } 449 + 450 + /// Handle a delete event with poll-then-stub logic 451 + /// 452 + /// For deletes, we need to look up the original record to know what was deleted 453 + /// (e.g., which notebook a like was for). If the record doesn't exist yet 454 + /// (out-of-order events), we poll for up to 15 seconds before creating a stub tombstone. 
455 + async fn handle_delete(client: &Client, record: ExtractedRecord) -> Result<()> {
456 + let deadline = Instant::now() + Duration::from_secs(15);
457 +
458 + loop {
459 + // Try to find the record by CID
460 + let query = format!(
461 + r#"
462 + SELECT did, collection, rkey, record
463 + FROM raw_records
464 + WHERE did = '{}' AND cid = '{}'
465 + ORDER BY event_time DESC
466 + LIMIT 1
467 + "#,
468 + record.did, record.cid
469 + );
470 +
471 + let original: Option<LookupRawRecord> = client
472 + .inner()
473 + .query(&query)
474 + .fetch_optional()
475 + .await
476 + .map_err(|e| crate::error::ClickHouseError::Query {
477 + message: "delete lookup failed".into(),
478 + source: e,
479 + })?;
480 +
481 + if let Some(_original) = original {
482 + // Found the record - the main insert path already handles creating
483 + // the delete row, so we're done. In phase 2, this is where we'd
484 + // parse original.record and insert count deltas for denormalized tables.
485 + debug!(did = %record.did, cid = %record.cid, "delete found original record");
486 + return Ok(());
487 + }
488 +
489 + if Instant::now() > deadline {
490 + // Gave up - create stub tombstone
491 + // The record will be inserted via the main batch path with operation='delete'
492 + // and empty record content, which serves as our stub tombstone
493 + warn!(
494 + did = %record.did,
495 + cid = %record.cid,
496 + "delete timeout, stub tombstone will be created"
497 + );
498 + return Ok(());
499 + }
500 +
501 + tokio::time::sleep(Duration::from_secs(1)).await;
502 + }
503 + }
504 +
505 + /// Minimal struct for delete lookups - just the fields we need to process the delete
506 + #[derive(Debug, Clone, clickhouse::Row, serde::Deserialize)]
507 + struct LookupRawRecord {
508 + did: SmolStr,
509 + collection: SmolStr,
510 + rkey: SmolStr,
511 + record: SmolStr, // JSON string of the original record
512 + }
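The `should_process` check above leans on atproto revisions being TIDs — fixed-width, base32-sortable strings whose lexicographic order matches their timestamp order — which is why a plain `&str` comparison suffices. A small usage sketch of `RevCache` as defined in this file; the DID, TIDs, and CID below are made up:

```rust
use smol_str::SmolStr;
use weaver_index::indexer::RevCache;

fn main() {
    let cache = RevCache::new();
    let did = "did:plc:exampleexample"; // hypothetical DID

    // Unknown accounts are always processed.
    assert!(cache.should_process(did, "3jzfcijpj2z2a"));

    cache.update(
        &SmolStr::new(did),
        &SmolStr::new("3jzfcijpj2z2a"),
        &SmolStr::new("bafyrei-example-cid"), // illustrative CID
    );

    // Replays and older revs sort <= the cached TID and are skipped...
    assert!(!cache.should_process(did, "3jzfcijpj2z2a"));
    // ...while any lexicographically larger TID passes.
    assert!(cache.should_process(did, "3jzfcijpj3aaa"));
    assert_eq!(cache.len(), 1);
}
```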
+2
crates/weaver-index/src/lib.rs
··· 2 pub mod config; 3 pub mod error; 4 pub mod firehose; 5 6 pub use config::Config; 7 pub use error::{IndexError, Result};
··· 2 pub mod config;
3 pub mod error;
4 pub mod firehose;
5 + pub mod indexer;
6
7 pub use config::Config;
8 pub use error::{IndexError, Result};
9 + pub use indexer::{Indexer, load_cursor};
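For completeness, a sketch of how these exports might be wired in the binary. The `Client` and `FirehoseConsumer` constructors are guesses — neither appears in this diff — so treat everything below other than `Config::from_env`, `load_cursor`, and `Indexer::{new, run}` as assumed:

```rust
use weaver_index::{Config, Indexer, load_cursor};
use weaver_index::clickhouse::Client; // module path assumed to be public
use weaver_index::firehose::FirehoseConsumer;

#[tokio::main]
async fn main() -> weaver_index::Result<()> {
    let config = Config::from_env()?;

    // Hypothetical constructors — not part of this change.
    let client = Client::new(&config.clickhouse)?;
    let cursor = load_cursor(&client).await?; // already rewound by CURSOR_REWIND

    // Resume the firehose from the saved cursor, if any.
    let consumer = FirehoseConsumer::new(&config.firehose, cursor);

    let indexer = Indexer::new(client, consumer, config.indexer).await?;
    indexer.run().await
}
```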