initial consumer skeleton

Orual 40eee0db 0a223e0f

+1029 -300
Cargo.lock (+21)
```diff
@@
 checksum = "edcd27d72f2f071c64249075f42e205ff93c9a4c5f6c6da53e79ed9f9832c285"
 
 [[package]]
+name = "include_dir"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "923d117408f1e49d914f1a379a309cffe4f18c05cf4e3d12e613a15fc81bd0dd"
+dependencies = [
+ "include_dir_macros",
+]
+
+[[package]]
+name = "include_dir_macros"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cab85a7ed0bd5f0e76d93846e0147172bed2e2d3f859bcc33a8d9699cad1a75"
+dependencies = [
+ "proc-macro2",
+ "quote",
+]
+
+[[package]]
 name = "indexmap"
 version = "1.9.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@
  "cid",
  "clap",
  "clickhouse",
+ "dashmap 6.1.0",
  "dotenvy",
  "humansize",
+ "include_dir",
  "jacquard",
  "jacquard-common",
  "jacquard-repo",
```
crates/weaver-api/src/com_atproto/sync/subscribe_repos.rs (+146 -224)
```diff
@@
 /// Represents a change to an account's status on a host (eg, PDS or Relay). The semantics of this event are that the status is at the host which emitted the event, not necessarily that at the currently active PDS. Eg, a Relay takedown would emit a takedown with active=false, even if the PDS is still active.
 #[jacquard_derive::lexicon]
 #[derive(
-    serde::Serialize,
-    serde::Deserialize,
-    Debug,
-    Clone,
-    PartialEq,
-    Eq,
-    jacquard_derive::IntoStatic
+    serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic,
 )]
 #[serde(rename_all = "camelCase")]
 pub struct Account<'a> {
@@
 
 pub mod account_state {
 
-    pub use crate::builder_types::{Set, Unset, IsSet, IsUnset};
+    pub use crate::builder_types::{IsSet, IsUnset, Set, Unset};
     #[allow(unused)]
     use ::core::marker::PhantomData;
     mod sealed {
@@
     S::Seq: account_state::IsUnset,
 {
     /// Set the `seq` field (required)
-    pub fn seq(
-        mut self,
-        value: impl Into<i64>,
-    ) -> AccountBuilder<'a, account_state::SetSeq<S>> {
+    pub fn seq(mut self, value: impl Into<i64>) -> AccountBuilder<'a, account_state::SetSeq<S>> {
         self.__unsafe_private_named.2 = ::core::option::Option::Some(value.into());
         AccountBuilder {
             _phantom_state: ::core::marker::PhantomData,
@@
 impl<'a, S: account_state::State> AccountBuilder<'a, S> {
     /// Set the `status` field (optional)
-    pub fn status(
-        mut self,
-        value: impl Into<Option<jacquard_common::CowStr<'a>>>,
-    ) -> Self {
+    pub fn status(mut self, value: impl Into<Option<jacquard_common::CowStr<'a>>>) -> Self {
         self.__unsafe_private_named.3 = value.into();
         self
     }
@@
     }
 }
 
-fn lexicon_doc_com_atproto_sync_subscribeRepos() -> ::jacquard_lexicon::lexicon::LexiconDoc<
-    'static,
-> {
+fn lexicon_doc_com_atproto_sync_subscribeRepos() -> ::jacquard_lexicon::lexicon::LexiconDoc<'static>
+{
     ::jacquard_lexicon::lexicon::LexiconDoc {
         lexicon: ::jacquard_lexicon::lexicon::Lexicon::Lexicon1,
         id: ::jacquard_common::CowStr::new_static("com.atproto.sync.subscribeRepos"),
@@
         );
         map.insert(
             ::jacquard_common::smol_str::SmolStr::new_static("info"),
-            ::jacquard_lexicon::lexicon::LexUserType::Object(::jacquard_lexicon::lexicon::LexObject {
-                description: None,
-                required: Some(
-                    vec![::jacquard_common::smol_str::SmolStr::new_static("name")],
-                ),
-                nullable: None,
-                properties: {
-                    #[allow(unused_mut)]
-                    let mut map = ::std::collections::BTreeMap::new();
-                    map.insert(
-                        ::jacquard_common::smol_str::SmolStr::new_static("message"),
-                        ::jacquard_lexicon::lexicon::LexObjectProperty::String(::jacquard_lexicon::lexicon::LexString {
-                            description: None,
-                            format: None,
-                            default: None,
-                            min_length: None,
-                            max_length: None,
-                            min_graphemes: None,
-                            max_graphemes: None,
-                            r#enum: None,
-                            r#const: None,
-                            known_values: None,
-                        }),
-                    );
-                    map.insert(
-                        ::jacquard_common::smol_str::SmolStr::new_static("name"),
-                        ::jacquard_lexicon::lexicon::LexObjectProperty::String(::jacquard_lexicon::lexicon::LexString {
-                            description: None,
-                            format: None,
-                            default: None,
-                            min_length: None,
-                            max_length: None,
-                            min_graphemes: None,
-                            max_graphemes: None,
-                            r#enum: None,
-                            r#const: None,
-                            known_values: None,
-                        }),
-                    );
-                    map
-                },
-            }),
+            ::jacquard_lexicon::lexicon::LexUserType::Object(
+                ::jacquard_lexicon::lexicon::LexObject {
+                    description: None,
+                    required: Some(vec![::jacquard_common::smol_str::SmolStr::new_static(
+                        "name",
+                    )]),
+                    nullable: None,
+                    properties: {
+                        #[allow(unused_mut)]
+                        let mut map = ::std::collections::BTreeMap::new();
+                        map.insert(
+                            ::jacquard_common::smol_str::SmolStr::new_static("message"),
+                            ::jacquard_lexicon::lexicon::LexObjectProperty::String(
+                                ::jacquard_lexicon::lexicon::LexString {
+                                    description: None,
+                                    format: None,
+                                    default: None,
+                                    min_length: None,
+                                    max_length: None,
+                                    min_graphemes: None,
+                                    max_graphemes: None,
+                                    r#enum: None,
+                                    r#const: None,
+                                    known_values: None,
+                                },
+                            ),
+                        );
+                        map.insert(
+                            ::jacquard_common::smol_str::SmolStr::new_static("name"),
+                            ::jacquard_lexicon::lexicon::LexObjectProperty::String(
+                                ::jacquard_lexicon::lexicon::LexString {
+                                    description: None,
+                                    format: None,
+                                    default: None,
+                                    min_length: None,
+                                    max_length: None,
+                                    min_graphemes: None,
+                                    max_graphemes: None,
+                                    r#enum: None,
+                                    r#const: None,
+                                    known_values: None,
+                                },
+                            ),
+                        );
+                        map
+                    },
+                },
+            ),
         );
         map.insert(
             ::jacquard_common::smol_str::SmolStr::new_static("main"),
@@
         );
         map.insert(
             ::jacquard_common::smol_str::SmolStr::new_static("repoOp"),
-            ::jacquard_lexicon::lexicon::LexUserType::Object(::jacquard_lexicon::lexicon::LexObject {
-                description: Some(
-                    ::jacquard_common::CowStr::new_static(
-                        "A repo operation, ie a mutation of a single record.",
-                    ),
-                ),
-                required: Some(
-                    vec![
-                        ::jacquard_common::smol_str::SmolStr::new_static("action"),
-                        ::jacquard_common::smol_str::SmolStr::new_static("path"),
-                        ::jacquard_common::smol_str::SmolStr::new_static("cid")
-                    ],
-                ),
-                nullable: None,
-                properties: {
-                    #[allow(unused_mut)]
-                    let mut map = ::std::collections::BTreeMap::new();
-                    map.insert(
-                        ::jacquard_common::smol_str::SmolStr::new_static("action"),
-                        ::jacquard_lexicon::lexicon::LexObjectProperty::String(::jacquard_lexicon::lexicon::LexString {
-                            description: None,
-                            format: None,
-                            default: None,
-                            min_length: None,
-                            max_length: None,
-                            min_graphemes: None,
-                            max_graphemes: None,
-                            r#enum: None,
-                            r#const: None,
-                            known_values: None,
-                        }),
-                    );
-                    map.insert(
-                        ::jacquard_common::smol_str::SmolStr::new_static("cid"),
-                        ::jacquard_lexicon::lexicon::LexObjectProperty::CidLink(::jacquard_lexicon::lexicon::LexCidLink {
-                            description: None,
-                        }),
-                    );
-                    map.insert(
-                        ::jacquard_common::smol_str::SmolStr::new_static("path"),
-                        ::jacquard_lexicon::lexicon::LexObjectProperty::String(::jacquard_lexicon::lexicon::LexString {
-                            description: None,
-                            format: None,
-                            default: None,
-                            min_length: None,
-                            max_length: None,
-                            min_graphemes: None,
-                            max_graphemes: None,
-                            r#enum: None,
-                            r#const: None,
-                            known_values: None,
-                        }),
-                    );
-                    map.insert(
-                        ::jacquard_common::smol_str::SmolStr::new_static("prev"),
-                        ::jacquard_lexicon::lexicon::LexObjectProperty::CidLink(::jacquard_lexicon::lexicon::LexCidLink {
-                            description: None,
-                        }),
-                    );
-                    map
-                },
-            }),
+            ::jacquard_lexicon::lexicon::LexUserType::Object(
+                ::jacquard_lexicon::lexicon::LexObject {
+                    description: Some(::jacquard_common::CowStr::new_static(
+                        "A repo operation, ie a mutation of a single record.",
+                    )),
+                    required: Some(vec![
+                        ::jacquard_common::smol_str::SmolStr::new_static("action"),
+                        ::jacquard_common::smol_str::SmolStr::new_static("path"),
+                        ::jacquard_common::smol_str::SmolStr::new_static("cid"),
+                    ]),
+                    nullable: None,
+                    properties: {
+                        #[allow(unused_mut)]
+                        let mut map = ::std::collections::BTreeMap::new();
+                        map.insert(
+                            ::jacquard_common::smol_str::SmolStr::new_static("action"),
+                            ::jacquard_lexicon::lexicon::LexObjectProperty::String(
+                                ::jacquard_lexicon::lexicon::LexString {
+                                    description: None,
+                                    format: None,
+                                    default: None,
+                                    min_length: None,
+                                    max_length: None,
+                                    min_graphemes: None,
+                                    max_graphemes: None,
+                                    r#enum: None,
+                                    r#const: None,
+                                    known_values: None,
+                                },
+                            ),
+                        );
+                        map.insert(
+                            ::jacquard_common::smol_str::SmolStr::new_static("cid"),
+                            ::jacquard_lexicon::lexicon::LexObjectProperty::CidLink(
+                                ::jacquard_lexicon::lexicon::LexCidLink { description: None },
+                            ),
+                        );
+                        map.insert(
+                            ::jacquard_common::smol_str::SmolStr::new_static("path"),
+                            ::jacquard_lexicon::lexicon::LexObjectProperty::String(
+                                ::jacquard_lexicon::lexicon::LexString {
+                                    description: None,
+                                    format: None,
+                                    default: None,
+                                    min_length: None,
+                                    max_length: None,
+                                    min_graphemes: None,
+                                    max_graphemes: None,
+                                    r#enum: None,
+                                    r#const: None,
+                                    known_values: None,
+                                },
+                            ),
+                        );
+                        map.insert(
+                            ::jacquard_common::smol_str::SmolStr::new_static("prev"),
+                            ::jacquard_lexicon::lexicon::LexObjectProperty::CidLink(
+                                ::jacquard_lexicon::lexicon::LexCidLink { description: None },
+                            ),
+                        );
+                        map
+                    },
+                },
+            ),
         );
         map.insert(
             ::jacquard_common::smol_str::SmolStr::new_static("sync"),
@@
 /// Represents an update of repository state. Note that empty commits are allowed, which include no repo data changes, but an update to rev and signature.
 #[jacquard_derive::lexicon]
 #[derive(
-    serde::Serialize,
-    serde::Deserialize,
-    Debug,
-    Clone,
-    PartialEq,
-    Eq,
-    jacquard_derive::IntoStatic
+    serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic,
 )]
 #[serde(rename_all = "camelCase")]
 pub struct Commit<'a> {
@@
 
 pub mod commit_state {
 
-    pub use crate::builder_types::{Set, Unset, IsSet, IsUnset};
+    pub use crate::builder_types::{IsSet, IsUnset, Set, Unset};
     #[allow(unused)]
     use ::core::marker::PhantomData;
     mod sealed {
@@
     ::core::option::Option<Vec<jacquard_common::types::cid::CidLink<'a>>>,
     ::core::option::Option<bytes::Bytes>,
     ::core::option::Option<jacquard_common::types::cid::CidLink<'a>>,
-    ::core::option::Option<
-        Vec<crate::com_atproto::sync::subscribe_repos::RepoOp<'a>>,
-    >,
+    ::core::option::Option<Vec<crate::com_atproto::sync::subscribe_repos::RepoOp<'a>>>,
     ::core::option::Option<jacquard_common::types::cid::CidLink<'a>>,
     ::core::option::Option<bool>,
     ::core::option::Option<jacquard_common::types::string::Did<'a>>,
@@
         CommitBuilder {
             _phantom_state: ::core::marker::PhantomData,
             __unsafe_private_named: (
-                None,
-                None,
-                None,
-                None,
-                None,
-                None,
-                None,
-                None,
-                None,
-                None,
-                None,
-                None,
+                None, None, None, None, None, None, None, None, None, None, None, None,
             ),
             _phantom: ::core::marker::PhantomData,
         }
@@
     S::Seq: commit_state::IsUnset,
 {
     /// Set the `seq` field (required)
-    pub fn seq(
-        mut self,
-        value: impl Into<i64>,
-    ) -> CommitBuilder<'a, commit_state::SetSeq<S>> {
+    pub fn seq(mut self, value: impl Into<i64>) -> CommitBuilder<'a, commit_state::SetSeq<S>> {
         self.__unsafe_private_named.8 = ::core::option::Option::Some(value.into());
         CommitBuilder {
             _phantom_state: ::core::marker::PhantomData,
@@
 impl<'a, S: commit_state::State> CommitBuilder<'a, S> {
     /// Set the `since` field (optional)
-    pub fn since(
-        mut self,
-        value: impl Into<Option<jacquard_common::types::string::Tid>>,
-    ) -> Self {
+    pub fn since(mut self, value: impl Into<Option<jacquard_common::types::string::Tid>>) -> Self {
         self.__unsafe_private_named.9 = value.into();
         self
     }
     /// Set the `since` field to an Option value (optional)
-    pub fn maybe_since(
-        mut self,
-        value: Option<jacquard_common::types::string::Tid>,
-    ) -> Self {
+    pub fn maybe_since(mut self, value: Option<jacquard_common::types::string::Tid>) -> Self {
         self.__unsafe_private_named.9 = value;
         self
     }
@@
     #[allow(unused_comparisons)]
     if value.len() > 200usize {
         return Err(::jacquard_lexicon::validation::ConstraintError::MaxLength {
-            path: ::jacquard_lexicon::validation::ValidationPath::from_field(
-                "ops",
-            ),
+            path: ::jacquard_lexicon::validation::ValidationPath::from_field("ops"),
             max: 200usize,
             actual: value.len(),
         });
@@
 /// Represents a change to an account's identity. Could be an updated handle, signing key, or pds hosting endpoint. Serves as a prod to all downstream services to refresh their identity cache.
 #[jacquard_derive::lexicon]
 #[derive(
-    serde::Serialize,
-    serde::Deserialize,
-    Debug,
-    Clone,
-    PartialEq,
-    Eq,
-    jacquard_derive::IntoStatic
+    serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic,
 )]
 #[serde(rename_all = "camelCase")]
 pub struct Identity<'a> {
@@
 
 pub mod identity_state {
 
-    pub use crate::builder_types::{Set, Unset, IsSet, IsUnset};
+    pub use crate::builder_types::{IsSet, IsUnset, Set, Unset};
     #[allow(unused)]
     use ::core::marker::PhantomData;
     mod sealed {
@@
     S::Seq: identity_state::IsUnset,
 {
     /// Set the `seq` field (required)
-    pub fn seq(
-        mut self,
-        value: impl Into<i64>,
-    ) -> IdentityBuilder<'a, identity_state::SetSeq<S>> {
+    pub fn seq(mut self, value: impl Into<i64>) -> IdentityBuilder<'a, identity_state::SetSeq<S>> {
         self.__unsafe_private_named.2 = ::core::option::Option::Some(value.into());
         IdentityBuilder {
             _phantom_state: ::core::marker::PhantomData,
@@
     PartialEq,
     Eq,
     jacquard_derive::IntoStatic,
-    Default
+    Default,
 )]
 #[serde(rename_all = "camelCase")]
 pub struct Info<'a> {
@@
 }
 
 #[derive(
-    serde::Serialize,
-    serde::Deserialize,
-    Debug,
-    Clone,
-    PartialEq,
-    Eq,
-    jacquard_derive::IntoStatic
+    serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic,
 )]
 #[serde(rename_all = "camelCase")]
 pub struct SubscribeRepos {
@@
 
 pub mod subscribe_repos_state {
 
-    pub use crate::builder_types::{Set, Unset, IsSet, IsUnset};
+    pub use crate::builder_types::{IsSet, IsUnset, Set, Unset};
     #[allow(unused)]
     use ::core::marker::PhantomData;
     mod sealed {
@@
 
 #[jacquard_derive::open_union]
 #[derive(
-    serde::Serialize,
-    serde::Deserialize,
-    Debug,
-    Clone,
-    PartialEq,
-    Eq,
-    jacquard_derive::IntoStatic
+    serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic,
 )]
 #[serde(tag = "$type")]
 #[serde(bound(deserialize = "'de: 'a"))]
@@
     pub fn decode_framed<'de: 'a>(
         bytes: &'de [u8],
     ) -> Result<SubscribeReposMessage<'a>, jacquard_common::error::DecodeError> {
-        let (header, body) = jacquard_common::xrpc::subscription::parse_event_header(
-            bytes,
-        )?;
+        let (header, body) = jacquard_common::xrpc::subscription::parse_event_header(bytes)?;
         match header.t.as_str() {
             "#commit" => {
                 let variant = serde_ipld_dagcbor::from_slice(body)?;
@@
                 let variant = serde_ipld_dagcbor::from_slice(body)?;
                 Ok(Self::Info(Box::new(variant)))
             }
-            unknown => {
-                Err(
-                    jacquard_common::error::DecodeError::UnknownEventType(unknown.into()),
-                )
-            }
+            unknown => Err(jacquard_common::error::DecodeError::UnknownEventType(
+                unknown.into(),
+            )),
         }
     }
 }
@@
     Eq,
     thiserror::Error,
     miette::Diagnostic,
-    jacquard_derive::IntoStatic
+    jacquard_derive::IntoStatic,
 )]
 #[serde(tag = "error", content = "message")]
 #[serde(bound(deserialize = "'de: 'a"))]
@@
 pub struct SubscribeReposStream;
 impl jacquard_common::xrpc::SubscriptionResp for SubscribeReposStream {
     const NSID: &'static str = "com.atproto.sync.subscribeRepos";
-    const ENCODING: jacquard_common::xrpc::MessageEncoding = jacquard_common::xrpc::MessageEncoding::DagCbor;
+    const ENCODING: jacquard_common::xrpc::MessageEncoding =
+        jacquard_common::xrpc::MessageEncoding::DagCbor;
     type Message<'de> = SubscribeReposMessage<'de>;
     type Error<'de> = SubscribeReposError<'de>;
     fn decode_message<'de>(
@@
 
 impl jacquard_common::xrpc::XrpcSubscription for SubscribeRepos {
     const NSID: &'static str = "com.atproto.sync.subscribeRepos";
-    const ENCODING: jacquard_common::xrpc::MessageEncoding = jacquard_common::xrpc::MessageEncoding::DagCbor;
+    const ENCODING: jacquard_common::xrpc::MessageEncoding =
+        jacquard_common::xrpc::MessageEncoding::DagCbor;
     type Stream = SubscribeReposStream;
 }
 
 pub struct SubscribeReposEndpoint;
 impl jacquard_common::xrpc::SubscriptionEndpoint for SubscribeReposEndpoint {
     const PATH: &'static str = "/xrpc/com.atproto.sync.subscribeRepos";
-    const ENCODING: jacquard_common::xrpc::MessageEncoding = jacquard_common::xrpc::MessageEncoding::DagCbor;
+    const ENCODING: jacquard_common::xrpc::MessageEncoding =
+        jacquard_common::xrpc::MessageEncoding::DagCbor;
     type Params<'de> = SubscribeRepos;
     type Stream = SubscribeReposStream;
 }
@@
 /// A repo operation, ie a mutation of a single record.
 #[jacquard_derive::lexicon]
 #[derive(
-    serde::Serialize,
-    serde::Deserialize,
-    Debug,
-    Clone,
-    PartialEq,
-    Eq,
-    jacquard_derive::IntoStatic
+    serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic,
 )]
 #[serde(rename_all = "camelCase")]
 pub struct RepoOp<'a> {
@@
 
 pub mod repo_op_state {
 
-    pub use crate::builder_types::{Set, Unset, IsSet, IsUnset};
+    pub use crate::builder_types::{IsSet, IsUnset, Set, Unset};
     #[allow(unused)]
     use ::core::marker::PhantomData;
     mod sealed {
@@
         self
     }
     /// Set the `cid` field to an Option value (optional)
-    pub fn maybe_cid(
-        mut self,
-        value: Option<jacquard_common::types::cid::CidLink<'a>>,
-    ) -> Self {
+    pub fn maybe_cid(mut self, value: Option<jacquard_common::types::cid::CidLink<'a>>) -> Self {
         self.__unsafe_private_named.1 = value;
         self
     }
@@
         self
     }
     /// Set the `prev` field to an Option value (optional)
-    pub fn maybe_prev(
-        mut self,
-        value: Option<jacquard_common::types::cid::CidLink<'a>>,
-    ) -> Self {
+    pub fn maybe_prev(mut self, value: Option<jacquard_common::types::cid::CidLink<'a>>) -> Self {
         self.__unsafe_private_named.3 = value;
         self
     }
@@
 /// Updates the repo to a new state, without necessarily including that state on the firehose. Used to recover from broken commit streams, data loss incidents, or in situations where upstream host does not know recent state of the repository.
 #[jacquard_derive::lexicon]
 #[derive(
-    serde::Serialize,
-    serde::Deserialize,
-    Debug,
-    Clone,
-    PartialEq,
-    Eq,
-    jacquard_derive::IntoStatic
+    serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, jacquard_derive::IntoStatic,
 )]
 #[serde(rename_all = "camelCase")]
 pub struct Sync<'a> {
@@
 
 pub mod sync_state {
 
-    pub use crate::builder_types::{Set, Unset, IsSet, IsUnset};
+    pub use crate::builder_types::{IsSet, IsUnset, Set, Unset};
     #[allow(unused)]
     use ::core::marker::PhantomData;
     mod sealed {
@@
     S::Seq: sync_state::IsUnset,
 {
     /// Set the `seq` field (required)
-    pub fn seq(
-        mut self,
-        value: impl Into<i64>,
-    ) -> SyncBuilder<'a, sync_state::SetSeq<S>> {
+    pub fn seq(mut self, value: impl Into<i64>) -> SyncBuilder<'a, sync_state::SetSeq<S>> {
         self.__unsafe_private_named.3 = ::core::option::Option::Some(value.into());
         SyncBuilder {
             _phantom_state: ::core::marker::PhantomData,
@@
     ) -> ::std::result::Result<(), ::jacquard_lexicon::validation::ConstraintError> {
         Ok(())
     }
-}
\ No newline at end of file
+}
```
crates/weaver-index/Cargo.toml (+2)
```diff
@@
 # Utilities
 humansize = "2.0"
 base64 = "0.22"
+dashmap = "6"
+include_dir = "0.7.4"
```
crates/weaver-index/migrations/clickhouse/001_raw_records.sql (+17 -2)
```diff
@@
     -- Content identifier from the record
     cid String,
 
+    -- Repository revision (TID) - monotonically increasing per DID, used for dedup/ordering
+    rev String,
+
     -- Full record as native JSON (schema-flexible, queryable with record.field.subfield)
     record JSON,
 
@@
     -- When we indexed this record
     indexed_at DateTime64(3) DEFAULT now64(3),
 
+    -- Validation state: 'unchecked', 'valid', 'invalid_rev', 'invalid_gap', 'invalid_account'
+    -- Populated by async batch validation, not in hot path
+    validation_state LowCardinality(String) DEFAULT 'unchecked',
+
     -- Materialized AT URI for convenience
-    uri String MATERIALIZED concat('at://', did, '/', collection, '/', rkey)
+    uri String MATERIALIZED concat('at://', did, '/', collection, '/', rkey),
+
+    -- Projection for fast delete lookups by (did, cid)
+    -- Delete events include CID, so we can O(1) lookup the original record
+    -- to know what to decrement (e.g., which notebook's like count)
+    PROJECTION by_did_cid (
+        SELECT * ORDER BY (did, cid)
+    )
 )
 ENGINE = ReplacingMergeTree(indexed_at)
-ORDER BY (collection, did, rkey, indexed_at);
+ORDER BY (collection, did, rkey, event_time)
+SETTINGS deduplicate_merge_projection_mode = 'drop';
```
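A note on the read side: because the sort key now ends in `event_time` and the indexer deliberately rewinds its cursor on restart (see `CURSOR_REWIND` in `indexer.rs` below), replayed events arrive as exact-key duplicates and collapse at merge time. A minimal sketch of a dedup-safe read, assuming the same `clickhouse` crate client used elsewhere in this crate; the row shape and query are illustrative, not part of this commit:

```rust
use clickhouse::Client;

// Illustrative row type: field order must match the SELECT list.
#[derive(Debug, clickhouse::Row, serde::Deserialize)]
struct RecordRow {
    did: String,
    rkey: String,
    cid: String,
}

async fn latest_records(
    client: &Client,
    collection: &str,
) -> clickhouse::error::Result<Vec<RecordRow>> {
    // FINAL applies ReplacingMergeTree dedup at query time: replayed rows
    // share the same (collection, did, rkey, event_time) key and collapse
    // to the copy with the newest indexed_at version column.
    client
        .query(
            "SELECT did, rkey, cid FROM raw_records FINAL \
             WHERE collection = ? ORDER BY event_time DESC LIMIT 100",
        )
        .bind(collection)
        .fetch_all::<RecordRow>()
        .await
}
```

Plain reads without `FINAL` may still see not-yet-merged duplicates, which is fine for rough stats but not for exact lookups.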
crates/weaver-index/migrations/clickhouse/006_account_rev_state.sql (+24, new file)
```sql
-- Per-account revision state tracking
-- Maintains latest rev/cid per DID for dedup and gap detection
--
-- AggregatingMergeTree with incremental MV from raw_records
-- Query with argMaxMerge/maxMerge to finalize aggregates

CREATE TABLE IF NOT EXISTS account_rev_state (
    -- Account DID
    did String,

    -- Latest revision (TID) seen for this account
    last_rev AggregateFunction(argMax, String, DateTime64(3)),

    -- CID of the latest revision
    last_cid AggregateFunction(argMax, String, DateTime64(3)),

    -- Latest sequence number seen
    last_seq AggregateFunction(max, UInt64),

    -- Latest event time seen
    last_event_time AggregateFunction(max, DateTime64(3))
)
ENGINE = AggregatingMergeTree()
ORDER BY did
```
crates/weaver-index/migrations/clickhouse/007_account_rev_state_mv.sql (+12, new file)
```sql
-- Incremental MV: fires on each insert to raw_records, maintains aggregate state
-- Must be created after both account_rev_state (target) and raw_records (source) exist

CREATE MATERIALIZED VIEW IF NOT EXISTS account_rev_state_mv TO account_rev_state AS
SELECT
    did,
    argMaxState(rev, event_time) as last_rev,
    argMaxState(cid, event_time) as last_cid,
    maxState(seq) as last_seq,
    maxState(event_time) as last_event_time
FROM raw_records
GROUP BY did
```
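Since the `AggregateFunction` columns store partial aggregation states, reads must finalize them with the matching `-Merge` combinators; this is the same query shape `RevCache::load_from_clickhouse` uses in `indexer.rs` below. A single-account sketch with an illustrative row type:

```rust
use clickhouse::Client;

// Illustrative row type for the finalized aggregates.
#[derive(Debug, clickhouse::Row, serde::Deserialize)]
struct RevRow {
    last_rev: String,
    last_cid: String,
    last_seq: u64,
}

async fn rev_state_for(client: &Client, did: &str) -> clickhouse::error::Result<Option<RevRow>> {
    client
        .query(
            "SELECT argMaxMerge(last_rev) AS last_rev, \
                    argMaxMerge(last_cid) AS last_cid, \
                    maxMerge(last_seq) AS last_seq \
             FROM account_rev_state WHERE did = ? GROUP BY did",
        )
        .bind(did)
        .fetch_optional::<RevRow>()
        .await
}
```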
crates/weaver-index/src/bin/storage_benchmark.rs (+4 -5)
```diff
@@
         }
     };
 
-    let event_time = DateTime::from_timestamp_millis(record.event_time_ms).unwrap();
     // Insert JSON record
     json_inserter
         .write(&RawRecordJson {
             did: record.did.clone(),
             collection: record.collection.clone(),
             rkey: record.rkey.clone(),
-            cid: record.cid.clone(),
+            cid: record.cid.to_string(),
             record: json_str,
             operation: record.operation.clone(),
             seq: record.seq as u64,
-            event_time: event_time.clone(),
+            event_time: record.event_time,
         })
         .await
         .map_err(|e| weaver_index::error::ClickHouseError::Insert {
@@
             did: record.did,
             collection: record.collection,
             rkey: record.rkey,
-            cid: record.cid,
+            cid: record.cid.to_string(),
             record: cbor_bytes.clone(),
             operation: record.operation,
             seq: record.seq as u64,
-            event_time,
+            event_time: record.event_time,
         })
         .await
         .map_err(|e| weaver_index::error::ClickHouseError::Insert {
```
crates/weaver-index/src/bin/weaver_indexer.rs (+71 -10)
```diff
@@
 use clap::{Parser, Subcommand};
-use tracing::info;
-use weaver_index::clickhouse::{Client, Migrator};
-use weaver_index::config::ClickHouseConfig;
+use miette::IntoDiagnostic;
+use tracing::{Level, info, warn};
+use tracing_subscriber::EnvFilter;
+use weaver_index::clickhouse::{Client, Migrator, Tables};
+use weaver_index::config::{ClickHouseConfig, FirehoseConfig, IndexerConfig};
+use weaver_index::firehose::FirehoseConsumer;
+use weaver_index::{Indexer, load_cursor};
 
 #[derive(Parser)]
-#[command(name = "weaver-indexer")]
+#[command(name = "indexer")]
 #[command(about = "Weaver index service - firehose ingestion and query serving")]
 struct Args {
     #[command(subcommand)]
@@
         /// Show what would be run without executing
         #[arg(long)]
         dry_run: bool,
+
+        /// Drop all tables before running migrations (for testing)
+        #[arg(long)]
+        reset: bool,
     },
 
     /// Check database connectivity
@@
 async fn main() -> miette::Result<()> {
     dotenvy::dotenv().ok();
 
+    let console_level = if cfg!(debug_assertions) {
+        Level::DEBUG
+    } else {
+        Level::INFO
+    };
+
     tracing_subscriber::fmt()
         .with_env_filter(
-            tracing_subscriber::EnvFilter::from_default_env()
-                .add_directive("weaver_index=info".parse().unwrap())
-                .add_directive("weaver_indexer=info".parse().unwrap()),
+            tracing_subscriber::EnvFilter::builder()
+                .from_env_lossy()
+                .add_directive(console_level.into())
+                .add_directive("hyper_util=info".parse().into_diagnostic()?),
         )
         .init();
 
     let args = Args::parse();
 
     match args.command {
-        Command::Migrate { dry_run } => run_migrate(dry_run).await,
+        Command::Migrate { dry_run, reset } => run_migrate(dry_run, reset).await,
         Command::Health => run_health().await,
         Command::Run => run_indexer().await,
     }
 }
 
-async fn run_migrate(dry_run: bool) -> miette::Result<()> {
+async fn run_migrate(dry_run: bool, reset: bool) -> miette::Result<()> {
     let config = ClickHouseConfig::from_env()?;
     info!(
         "Connecting to ClickHouse at {} (database: {})",
@@
     );
 
     let client = Client::new(&config)?;
+
+    if reset {
+        if dry_run {
+            info!("Would drop tables:");
+            for table in Tables::ALL {
+                info!("  - {}", table);
+            }
+        } else {
+            info!("Dropping all tables...");
+            for table in Tables::ALL {
+                let query = format!("DROP TABLE IF EXISTS {}", table);
+                match client.execute(&query).await {
+                    Ok(_) => info!("  dropped {}", table),
+                    Err(e) => warn!("  failed to drop {}: {}", table, e),
+                }
+            }
+        }
+    }
+
     let migrator = Migrator::new(&client);
 
     if dry_run {
@@
 }
 
 async fn run_indexer() -> miette::Result<()> {
-    info!("Indexer not yet implemented");
+    let ch_config = ClickHouseConfig::from_env()?;
+    let mut firehose_config = FirehoseConfig::from_env()?;
+    let indexer_config = IndexerConfig::from_env();
+
+    info!(
+        "Connecting to ClickHouse at {} (database: {})",
+        ch_config.url, ch_config.database
+    );
+    let client = Client::new(&ch_config)?;
+
+    // Load cursor from ClickHouse if not overridden by env var
+    if firehose_config.cursor.is_none() {
+        if let Some(cursor) = load_cursor(&client).await? {
+            firehose_config.cursor = Some(cursor);
+        }
+    }
+
+    info!(
+        "Connecting to firehose at {} (cursor: {:?})",
+        firehose_config.relay_url, firehose_config.cursor
+    );
+    let consumer = FirehoseConsumer::new(firehose_config);
+
+    let indexer = Indexer::new(client, consumer, indexer_config).await?;
+
+    info!("Starting indexer");
+    indexer.run().await?;
+
     Ok(())
 }
```
crates/weaver-index/src/clickhouse.rs (+2 -1)
```diff
@@
 pub use client::{Client, TableSize};
 pub use migrations::{MigrationResult, Migrator};
 pub use schema::{
-    FirehoseCursor, RawAccountEvent, RawEventDlq, RawIdentityEvent, RawRecord, Tables,
+    AccountRevState, FirehoseCursor, RawAccountEvent, RawEventDlq, RawIdentityEvent,
+    RawRecordInsert, Tables,
 };
```
crates/weaver-index/src/clickhouse/client.rs (+5 -1)
```diff
@@
         .with_url(config.url.as_str())
         .with_database(&config.database)
         .with_user(&config.user)
-        .with_password(&config.password);
+        .with_password(&config.password)
+        // Enable JSON type support (treated as string at transport level)
+        .with_option("allow_experimental_json_type", "1")
+        .with_option("input_format_binary_read_json_as_string", "1")
+        .with_option("output_format_binary_write_json_as_string", "1");
 
     Ok(Self { inner })
 }
```
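With these three settings the server-side `JSON` column travels as a plain string over RowBinary, so the row structs in `schema.rs` can keep `record` as a string while ClickHouse still answers path queries against the parsed document. A minimal sketch, assuming the `raw_records` table from this crate's migrations; the `record.subject.uri` path and the collection name are illustrative:

```rust
// Illustrative one-field row for a count query.
#[derive(Debug, clickhouse::Row, serde::Deserialize)]
struct CountRow {
    n: u64,
}

async fn count_with_subject(client: &clickhouse::Client) -> clickhouse::error::Result<u64> {
    let row = client
        .query(
            // JSON path access happens server-side; a missing path reads as NULL.
            "SELECT count() AS n FROM raw_records \
             WHERE collection = ? AND record.subject.uri IS NOT NULL",
        )
        .bind("sh.weaver.notebook.like")
        .fetch_one::<CountRow>()
        .await?;
    Ok(row.n)
}
```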
crates/weaver-index/src/clickhouse/migrations.rs (+24 -34)
```diff
@@
 use crate::error::{ClickHouseError, IndexError};
+use include_dir::{Dir, include_dir};
 use tracing::info;
 
 use super::Client;
 
-/// Embedded migrations - compiled into the binary
-const MIGRATIONS: &[(&str, &str)] = &[
-    (
-        "000_migrations.sql",
-        include_str!("../../migrations/clickhouse/000_migrations.sql"),
-    ),
-    (
-        "001_raw_records.sql",
-        include_str!("../../migrations/clickhouse/001_raw_records.sql"),
-    ),
-    (
-        "002_identity_events.sql",
-        include_str!("../../migrations/clickhouse/002_identity_events.sql"),
-    ),
-    (
-        "003_account_events.sql",
-        include_str!("../../migrations/clickhouse/003_account_events.sql"),
-    ),
-    (
-        "004_events_dlq.sql",
-        include_str!("../../migrations/clickhouse/004_events_dlq.sql"),
-    ),
-    (
-        "005_firehose_cursor.sql",
-        include_str!("../../migrations/clickhouse/005_firehose_cursor.sql"),
-    ),
-];
+/// Embedded migrations directory - compiled into the binary
+static MIGRATIONS_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/migrations/clickhouse");
 
 /// Migration runner for ClickHouse
 pub struct Migrator<'a> {
@@
         Self { client }
     }
 
+    /// Get sorted list of migration files from embedded directory
+    fn migrations() -> Vec<(&'static str, &'static str)> {
+        let mut files: Vec<_> = MIGRATIONS_DIR
+            .files()
+            .filter(|f| f.path().extension().is_some_and(|ext| ext == "sql"))
+            .filter_map(|f| {
+                let name = f.path().file_name()?.to_str()?;
+                let contents = f.contents_utf8()?;
+                Some((name, contents))
+            })
+            .collect();
+        files.sort_by_key(|(name, _)| *name);
+        files
+    }
+
     /// Run all pending migrations
     pub async fn run(&self) -> Result<MigrationResult, IndexError> {
         // First, ensure the migrations table exists (bootstrap)
@@
         let mut applied_count = 0;
         let mut skipped_count = 0;
 
-        for (name, sql) in MIGRATIONS {
+        for (name, sql) in Self::migrations() {
             // Skip the bootstrap migration after first run
-            if *name == "000_migrations.sql" && applied.contains(&"000_migrations.sql".to_string())
-            {
+            if name == "000_migrations.sql" && applied.contains(&"000_migrations.sql".to_string()) {
                 skipped_count += 1;
                 continue;
             }
@@
             Err(_) => vec![],
         };
 
-        let pending: Vec<String> = MIGRATIONS
-            .iter()
+        let pending: Vec<String> = Self::migrations()
+            .into_iter()
             .filter(|(name, _)| !applied.contains(&name.to_string()))
             .map(|(name, _)| name.to_string())
             .collect();
@@
 
     async fn ensure_migrations_table(&self) -> Result<(), IndexError> {
         // Run the bootstrap migration directly
-        let (_, sql) = MIGRATIONS
-            .iter()
+        let (_, sql) = Self::migrations()
+            .into_iter()
             .find(|(name, _)| *name == "000_migrations.sql")
             .expect("bootstrap migration must exist");
```
crates/weaver-index/src/clickhouse/schema.rs (+55 -15)
```diff
@@
 use chrono::{DateTime, Utc};
 use clickhouse::Row;
+use smol_str::SmolStr;
 
 /// Table names for production schema
 pub struct Tables;
@@
     pub const RAW_ACCOUNT_EVENTS: &'static str = "raw_account_events";
     pub const RAW_EVENTS_DLQ: &'static str = "raw_events_dlq";
     pub const FIREHOSE_CURSOR: &'static str = "firehose_cursor";
+    pub const ACCOUNT_REV_STATE: &'static str = "account_rev_state";
+    pub const ACCOUNT_REV_STATE_MV: &'static str = "account_rev_state_mv";
+    pub const MIGRATIONS: &'static str = "_migrations";
+
+    /// All tables and views in drop order (MVs before their source tables)
+    pub const ALL: &'static [&'static str] = &[
+        Self::ACCOUNT_REV_STATE_MV, // MV first, depends on raw_records
+        Self::ACCOUNT_REV_STATE,
+        Self::RAW_RECORDS,
+        Self::RAW_IDENTITY_EVENTS,
+        Self::RAW_ACCOUNT_EVENTS,
+        Self::RAW_EVENTS_DLQ,
+        Self::FIREHOSE_CURSOR,
+        Self::MIGRATIONS,
+    ];
+}
+
+/// Validation states for records
+pub mod validation {
+    pub const UNCHECKED: &str = "unchecked";
+    pub const VALID: &str = "valid";
+    pub const INVALID_REV: &str = "invalid_rev";
+    pub const INVALID_GAP: &str = "invalid_gap";
+    pub const INVALID_ACCOUNT: &str = "invalid_account";
 }
 
 /// Row type for raw_records table
 /// Schema defined in migrations/clickhouse/001_raw_records.sql
 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)]
-pub struct RawRecord {
-    pub did: String,
-    pub collection: String,
-    pub rkey: String,
-    pub cid: String,
-    pub record: String, // JSON string - ClickHouse JSON type accepts string
-    pub operation: String,
+pub struct RawRecordInsert {
+    pub did: SmolStr,
+    pub collection: SmolStr,
+    pub rkey: SmolStr,
+    pub cid: SmolStr,
+    pub rev: SmolStr,
+    pub record: SmolStr, // JSON string - ClickHouse JSON type accepts string
+    pub operation: SmolStr,
     pub seq: u64,
     #[serde(with = "clickhouse::serde::chrono::datetime64::millis")]
     pub event_time: DateTime<Utc>,
+    // Note: indexed_at has DEFAULT now64(3), omit from insert
+    // Note: validation_state has DEFAULT 'unchecked', omit from insert
 }
 
 /// Row type for raw_identity_events table
 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)]
 pub struct RawIdentityEvent {
-    pub did: String,
-    pub handle: String,
+    pub did: SmolStr,
+    pub handle: SmolStr,
     pub seq: u64,
     #[serde(with = "clickhouse::serde::chrono::datetime64::millis")]
     pub event_time: DateTime<Utc>,
@@
 /// Row type for raw_account_events table
 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)]
 pub struct RawAccountEvent {
-    pub did: String,
+    pub did: SmolStr,
     pub active: u8,
-    pub status: String,
+    pub status: SmolStr,
     pub seq: u64,
     #[serde(with = "clickhouse::serde::chrono::datetime64::millis")]
     pub event_time: DateTime<Utc>,
@@
 /// Row type for raw_events_dlq table
 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)]
 pub struct RawEventDlq {
-    pub event_type: String,
-    pub raw_data: String, // JSON string
-    pub error_message: String,
+    pub event_type: SmolStr,
+    pub raw_data: SmolStr, // JSON string
+    pub error_message: SmolStr,
     pub seq: u64,
 }
 
 /// Row type for firehose_cursor table
 #[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)]
 pub struct FirehoseCursor {
-    pub consumer_id: String,
+    pub consumer_id: SmolStr,
     pub seq: u64,
     #[serde(with = "clickhouse::serde::chrono::datetime64::millis")]
     pub event_time: DateTime<Utc>,
 }
+
+/// Row type for reading finalized account_rev_state
+/// Query with: SELECT did, argMaxMerge(last_rev), argMaxMerge(last_cid), maxMerge(last_seq), maxMerge(last_event_time) FROM account_rev_state GROUP BY did
+#[derive(Debug, Clone, Row, serde::Serialize, serde::Deserialize)]
+pub struct AccountRevState {
+    pub did: SmolStr,
+    pub last_rev: SmolStr,
+    pub last_cid: SmolStr,
+    pub last_seq: u64,
+    #[serde(with = "clickhouse::serde::chrono::datetime64::millis")]
+    pub last_event_time: DateTime<Utc>,
+}
```
crates/weaver-index/src/config.rs (+120)
```diff
@@
 use crate::error::{ConfigError, IndexError};
+use dashmap::DashSet;
 use url::Url;
 
 /// ClickHouse connection configuration
@@
     }
 }
 
+use smol_str::{SmolStr, ToSmolStr};
+
+/// Pre-parsed collection filter for efficient matching
+#[derive(Debug, Clone)]
+pub struct CollectionFilter {
+    /// Prefix patterns (from "foo.*" -> "foo.")
+    prefixes: Vec<SmolStr>,
+    /// Exact match patterns (HashSet for O(1) lookup)
+    exact: DashSet<SmolStr>,
+    /// True if filter is empty (accept all)
+    accept_all: bool,
+}
+
+impl CollectionFilter {
+    /// Parse filter patterns into prefixes and exact matches
+    pub fn new(patterns: Vec<SmolStr>) -> Self {
+        let mut prefixes = Vec::new();
+        let exact = DashSet::new();
+
+        for pattern in patterns {
+            if let Some(prefix) = pattern.strip_suffix('*') {
+                prefixes.push(SmolStr::new(prefix));
+            } else {
+                exact.insert(SmolStr::new(&pattern));
+            }
+        }
+
+        let accept_all = prefixes.is_empty() && exact.is_empty();
+        Self {
+            prefixes,
+            exact,
+            accept_all,
+        }
+    }
+
+    /// Check if a collection matches any pattern
+    #[inline]
+    pub fn matches(&self, collection: &str) -> bool {
+        if self.accept_all {
+            return true;
+        }
+
+        // O(1) exact match check first
+        if self.exact.contains(collection) {
+            return true;
+        }
+
+        // Prefix check - for small N, linear scan is fine
+        // Accumulate without early return to help branch predictor
+        let mut matched = false;
+        for prefix in &self.prefixes {
+            matched |= collection.starts_with(prefix.as_str());
+        }
+        matched
+    }
+}
+
+/// Indexer runtime configuration
+#[derive(Debug, Clone)]
+pub struct IndexerConfig {
+    /// Maximum records to batch before flushing to ClickHouse
+    pub batch_size: usize,
+    /// Maximum time (ms) before flushing even if batch isn't full
+    pub flush_interval_ms: u64,
+    /// Collection filter (pre-parsed patterns)
+    pub collections: CollectionFilter,
+}
+
+impl Default for IndexerConfig {
+    fn default() -> Self {
+        Self {
+            batch_size: 1000,
+            flush_interval_ms: 1000,
+            collections: CollectionFilter::new(vec![
+                SmolStr::new_static("sh.weaver.*"),
+                SmolStr::new_static("app.bsky.actor.profile"),
+            ]),
+        }
+    }
+}
+
+impl IndexerConfig {
+    /// Load configuration from environment variables.
+    ///
+    /// Optional env vars:
+    /// - `INDEXER_BATCH_SIZE`: Max records per batch (default: 1000)
+    /// - `INDEXER_FLUSH_INTERVAL_MS`: Max ms between flushes (default: 1000)
+    /// - `INDEXER_COLLECTIONS`: Comma-separated collection patterns (default: sh.weaver.*,app.bsky.actor.profile)
+    ///   Use * suffix for prefix matching, e.g., "sh.weaver.*" matches all sh.weaver.* collections
+    pub fn from_env() -> Self {
+        let batch_size = std::env::var("INDEXER_BATCH_SIZE")
+            .ok()
+            .and_then(|s| s.parse().ok())
+            .unwrap_or(1000);
+
+        let flush_interval_ms = std::env::var("INDEXER_FLUSH_INTERVAL_MS")
+            .ok()
+            .and_then(|s| s.parse().ok())
+            .unwrap_or(1000);
+
+        let patterns: Vec<SmolStr> = std::env::var("INDEXER_COLLECTIONS")
+            .map(|s| s.split(',').map(|p| p.trim().to_smolstr()).collect())
+            .unwrap_or_else(|_| {
+                vec![
+                    SmolStr::new_static("sh.weaver.*"),
+                    SmolStr::new_static("app.bsky.actor.profile"),
+                ]
+            });
+
+        Self {
+            batch_size,
+            flush_interval_ms,
+            collections: CollectionFilter::new(patterns),
+        }
+    }
+}
+
 /// Combined configuration for the indexer
 #[derive(Debug, Clone)]
 pub struct Config {
     pub clickhouse: ClickHouseConfig,
     pub firehose: FirehoseConfig,
+    pub indexer: IndexerConfig,
 }
 
 impl Config {
@@
         Ok(Self {
             clickhouse: ClickHouseConfig::from_env()?,
             firehose: FirehoseConfig::from_env()?,
+            indexer: IndexerConfig::from_env(),
         })
     }
 }
```
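A usage sketch for the filter semantics defined above; the collection names beyond the two defaults are made up:

```rust
use smol_str::SmolStr;
use weaver_index::config::CollectionFilter;

fn main() {
    let filter = CollectionFilter::new(vec![
        SmolStr::new_static("sh.weaver.*"), // becomes prefix "sh.weaver."
        SmolStr::new_static("app.bsky.actor.profile"), // exact match
    ]);

    assert!(filter.matches("sh.weaver.notebook.entry")); // prefix hit
    assert!(filter.matches("app.bsky.actor.profile")); // exact hit
    assert!(!filter.matches("app.bsky.feed.post")); // filtered out

    // An empty pattern list means "accept everything".
    assert!(CollectionFilter::new(vec![]).matches("anything.at.all"));
}
```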
crates/weaver-index/src/firehose.rs (+2 -2)
```diff
@@
 mod records;
 
 pub use consumer::{
-    FirehoseConsumer, MessageStream, SubscribeReposMessage, Commit, Identity, Account, Sync,
+    Account, Commit, FirehoseConsumer, Identity, MessageStream, SubscribeReposMessage, Sync,
 };
-pub use records::{extract_records, ExtractedRecord};
+pub use records::{ExtractedRecord, extract_records};
```
crates/weaver-index/src/firehose/records.rs (+10 -6)
```diff
@@
 use crate::error::{CarError, IndexError};
 use bytes::Bytes;
+use chrono::{DateTime, Utc};
 use jacquard_repo::car::reader::parse_car_bytes;
 use smol_str::{SmolStr, ToSmolStr};
@@
     /// Record key within the collection
     pub rkey: SmolStr,
     /// Content identifier
-    pub cid: String,
+    pub cid: SmolStr,
+    /// Repository revision (TID) - monotonically increasing per DID
+    pub rev: SmolStr,
     /// Operation type: "create", "update", or "delete"
     pub operation: SmolStr,
     /// Raw DAG-CBOR bytes of the record (None for deletes)
     pub cbor_bytes: Option<Bytes>,
     /// Sequence number from the firehose event
     pub seq: i64,
-    /// Event timestamp (milliseconds since epoch)
-    pub event_time_ms: i64,
+    /// Event timestamp
+    pub event_time: DateTime<Utc>,
 }
 
 impl ExtractedRecord {
@@
             message: e.to_string(),
         })?;
 
-    let event_time_ms = commit.time.as_ref().timestamp_millis();
+    let event_time = commit.time.as_ref().with_timezone(&Utc);
     let mut records = Vec::with_capacity(commit.ops.len());
 
     for op in &commit.ops {
@@
         };
 
         let operation = op.action.to_smolstr();
-        let cid_str = op.cid.as_ref().map(|c| c.to_string()).unwrap_or_default();
+        let cid_str = op.cid.as_ref().map(|c| c.to_smolstr()).unwrap_or_default();
 
         // For creates/updates, look up the record in the CAR blocks
         let cbor_bytes = if let Some(cid_link) = &op.cid {
@@
             collection,
             rkey,
             cid: cid_str,
+            rev: commit.rev.to_smolstr(),
             operation,
             cbor_bytes,
             seq: commit.seq,
-            event_time_ms,
+            event_time,
         });
     }
```
crates/weaver-index/src/indexer.rs (+512, new file)
```rust
use std::sync::Arc;
use std::time::{Duration, Instant};

use chrono::Utc;
use dashmap::DashMap;
use n0_future::StreamExt;
use smol_str::{SmolStr, ToSmolStr};
use tracing::{debug, info, warn};

use chrono::DateTime;

use crate::clickhouse::{
    AccountRevState, Client, FirehoseCursor, RawAccountEvent, RawIdentityEvent, RawRecordInsert,
};
use crate::config::IndexerConfig;
use crate::error::{IndexError, Result};
use crate::firehose::{
    Account, Commit, ExtractedRecord, FirehoseConsumer, Identity, MessageStream,
    SubscribeReposMessage, extract_records,
};

/// Default consumer ID for cursor tracking
const CONSUMER_ID: &str = "main";

/// Per-account revision state for deduplication
#[derive(Debug, Clone)]
pub struct RevState {
    pub last_rev: SmolStr,
    pub last_cid: SmolStr,
}

/// In-memory cache of per-account revision state
///
/// Used for fast deduplication without hitting ClickHouse on every event.
/// Populated from account_rev_state table on startup, updated as events are processed.
pub struct RevCache {
    inner: DashMap<SmolStr, RevState>,
}

impl RevCache {
    pub fn new() -> Self {
        Self {
            inner: DashMap::new(),
        }
    }

    /// Load cache from ClickHouse account_rev_state table
    pub async fn load_from_clickhouse(client: &Client) -> Result<Self> {
        let query = r#"
            SELECT
                did,
                argMaxMerge(last_rev) as last_rev,
                argMaxMerge(last_cid) as last_cid,
                maxMerge(last_seq) as last_seq,
                maxMerge(last_event_time) as last_event_time
            FROM account_rev_state
            GROUP BY did
        "#;

        let rows: Vec<AccountRevState> =
            client.inner().query(query).fetch_all().await.map_err(|e| {
                IndexError::ClickHouse(crate::error::ClickHouseError::Query {
                    message: "failed to load account rev state".into(),
                    source: e,
                })
            })?;

        let cache = Self::new();
        for row in rows {
            cache.inner.insert(
                SmolStr::new(&row.did),
                RevState {
                    last_rev: SmolStr::new(&row.last_rev),
                    last_cid: SmolStr::new(&row.last_cid),
                },
            );
        }

        info!(
            accounts = cache.inner.len(),
            "loaded rev cache from clickhouse"
        );
        Ok(cache)
    }

    /// Check if we should process this commit (returns false if already seen)
    pub fn should_process(&self, did: &str, rev: &str) -> bool {
        match self.inner.get(did) {
            Some(state) => rev > state.last_rev.as_str(),
            None => true, // new account, always process
        }
    }

    /// Update cache after processing a commit
    pub fn update(&self, did: &SmolStr, rev: &SmolStr, cid: &SmolStr) {
        self.inner.insert(
            did.clone(),
            RevState {
                last_rev: rev.clone(),
                last_cid: cid.clone(),
            },
        );
    }

    /// Get current cache size (number of accounts tracked)
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }
}

impl Default for RevCache {
    fn default() -> Self {
        Self::new()
    }
}

/// Safety margin when resuming - back up this many sequence numbers
/// to ensure no gaps from incomplete batches or race conditions
const CURSOR_REWIND: i64 = 1000;

/// Load cursor from ClickHouse for resuming
///
/// Returns cursor with safety margin subtracted to ensure overlap
pub async fn load_cursor(client: &Client) -> Result<Option<i64>> {
    let query = format!(
        r#"
        SELECT consumer_id, seq, event_time
        FROM firehose_cursor FINAL
        WHERE consumer_id = '{}'
        LIMIT 1
        "#,
        CONSUMER_ID
    );

    let cursor: Option<FirehoseCursor> = client
        .inner()
        .query(&query)
        .fetch_optional()
        .await
        .map_err(|e| crate::error::ClickHouseError::Query {
            message: "failed to load cursor".into(),
            source: e,
        })?;

    if let Some(c) = &cursor {
        let resume_at = (c.seq as i64).saturating_sub(CURSOR_REWIND);
        info!(
            saved_seq = c.seq,
            resume_seq = resume_at,
            rewind = CURSOR_REWIND,
            "loaded cursor from clickhouse (with safety margin)"
        );
        Ok(Some(resume_at))
    } else {
        Ok(None)
    }
}

/// Main indexer that consumes firehose and writes to ClickHouse
pub struct Indexer {
    client: Arc<Client>,
    consumer: FirehoseConsumer,
    rev_cache: RevCache,
    config: IndexerConfig,
}

impl Indexer {
    /// Create a new indexer
    pub async fn new(
        client: Client,
        consumer: FirehoseConsumer,
        config: IndexerConfig,
    ) -> Result<Self> {
        let client = Arc::new(client);

        // Load rev cache from ClickHouse
        let rev_cache = RevCache::load_from_clickhouse(&client).await?;

        Ok(Self {
            client,
            consumer,
            rev_cache,
            config,
        })
    }

    /// Save cursor to ClickHouse
    async fn save_cursor(&self, seq: u64, event_time: DateTime<Utc>) -> Result<()> {
        let query = format!(
            "INSERT INTO firehose_cursor (consumer_id, seq, event_time) VALUES ('{}', {}, {})",
            CONSUMER_ID,
            seq,
            event_time.timestamp_millis()
        );

        self.client.execute(&query).await?;
        debug!(seq, "saved cursor");
        Ok(())
    }

    /// Run the indexer loop
    pub async fn run(&self) -> Result<()> {
        info!("connecting to firehose...");
        let mut stream: MessageStream = self.consumer.connect().await?;

        // Inserters handle batching internally based on config
        let mut records = self.client.inserter::<RawRecordInsert>("raw_records");
        let mut identities = self
            .client
            .inserter::<RawIdentityEvent>("raw_identity_events");
        let mut accounts = self
            .client
            .inserter::<RawAccountEvent>("raw_account_events");

        // Stats and cursor tracking
        let mut processed: u64 = 0;
        let mut skipped: u64 = 0;
        let mut last_seq: u64 = 0;
        let mut last_event_time = Utc::now();
        let mut last_stats = Instant::now();
        let mut last_cursor_save = Instant::now();

        info!("starting indexer loop");

        while let Some(result) = stream.next().await {
            let msg = match result {
                Ok(msg) => msg,
                Err(e) => {
                    warn!(error = ?e, "firehose stream error");
                    continue;
                }
            };

            // Track seq from any message type that has it
            match &msg {
                SubscribeReposMessage::Commit(c) => {
                    last_seq = c.seq as u64;
                    last_event_time = c.time.as_ref().with_timezone(&Utc);
                }
                SubscribeReposMessage::Identity(i) => {
                    last_seq = i.seq as u64;
                    last_event_time = i.time.as_ref().with_timezone(&Utc);
                }
                SubscribeReposMessage::Account(a) => {
                    last_seq = a.seq as u64;
                    last_event_time = a.time.as_ref().with_timezone(&Utc);
                }
                _ => {}
            }

            match msg {
                SubscribeReposMessage::Commit(commit) => {
                    if self
                        .process_commit(&commit, &mut records, &mut skipped)
                        .await?
                    {
                        processed += 1;
                    }
                }
                SubscribeReposMessage::Identity(identity) => {
                    write_identity(&identity, &mut identities).await?;
                }
                SubscribeReposMessage::Account(account) => {
                    write_account(&account, &mut accounts).await?;
                }
                SubscribeReposMessage::Sync(_) => {
                    debug!("received sync (tooBig) event, skipping");
                }
                _ => {}
            }

            // commit() flushes if internal thresholds met, otherwise no-op
            records
                .commit()
                .await
                .map_err(|e| crate::error::ClickHouseError::Query {
                    message: "commit failed".into(),
                    source: e,
                })?;

            // Periodic stats and cursor save (every 10s)
            if last_stats.elapsed() >= Duration::from_secs(10) {
                info!(
                    processed,
                    skipped,
                    last_seq,
                    rev_cache_size = self.rev_cache.len(),
                    "indexer stats"
                );
                last_stats = Instant::now();
            }

            // Save cursor every 30s
            if last_cursor_save.elapsed() >= Duration::from_secs(30) && last_seq > 0 {
                if let Err(e) = self.save_cursor(last_seq, last_event_time).await {
                    warn!(error = ?e, "failed to save cursor");
                }
                last_cursor_save = Instant::now();
            }
        }

        // Final flush
        records
            .end()
            .await
            .map_err(|e| crate::error::ClickHouseError::Query {
                message: "final flush failed".into(),
                source: e,
            })?;
        identities
            .end()
            .await
            .map_err(|e| crate::error::ClickHouseError::Query {
                message: "final flush failed".into(),
                source: e,
            })?;
        accounts
            .end()
            .await
            .map_err(|e| crate::error::ClickHouseError::Query {
                message: "final flush failed".into(),
                source: e,
            })?;

        // Final cursor save
        if last_seq > 0 {
            self.save_cursor(last_seq, last_event_time).await?;
        }

        info!(last_seq, "firehose stream ended");
        Ok(())
    }

    async fn process_commit(
        &self,
        commit: &Commit<'_>,
        inserter: &mut clickhouse::inserter::Inserter<RawRecordInsert>,
        skipped: &mut u64,
    ) -> Result<bool> {
        let did = commit.repo.as_ref();
        let rev = commit.rev.as_ref();

        // Dedup check
        if !self.rev_cache.should_process(did, rev) {
            *skipped += 1;
            return Ok(false);
        }

        // Extract and write records
        for record in extract_records(commit).await? {
            // Collection filter - skip early before JSON conversion
            if !self.config.collections.matches(&record.collection) {
                continue;
            }

            let json = record.to_json()?.unwrap_or_else(|| "{}".to_string());

            // Fire and forget delete handling
            if record.operation == "delete" {
                let client = self.client.clone();
                let record_clone = record.clone();
                tokio::spawn(async move {
                    if let Err(e) = handle_delete(&client, record_clone).await {
                        warn!(error = ?e, "delete handling failed");
                    }
                });
            }

            inserter
                .write(&RawRecordInsert {
                    did: record.did.clone(),
                    collection: record.collection.clone(),
                    rkey: record.rkey.clone(),
                    cid: record.cid.clone(),
                    rev: record.rev.clone(),
                    record: json.to_smolstr(),
                    operation: record.operation.clone(),
                    seq: record.seq as u64,
                    event_time: record.event_time,
                })
                .await
                .map_err(|e| crate::error::ClickHouseError::Query {
                    message: "write failed".into(),
                    source: e,
                })?;
        }

        // Update rev cache
        self.rev_cache.update(
            &SmolStr::new(did),
            &SmolStr::new(rev),
            &commit.commit.0.to_smolstr(),
        );

        Ok(true)
    }
}

async fn write_identity(
    identity: &Identity<'_>,
    inserter: &mut clickhouse::inserter::Inserter<RawIdentityEvent>,
) -> Result<()> {
    inserter
        .write(&RawIdentityEvent {
            did: identity.did.to_smolstr(),
            handle: identity
                .handle
                .as_ref()
                .map(|h| h.as_ref().to_smolstr())
                .unwrap_or_default(),
            seq: identity.seq as u64,
            event_time: identity.time.as_ref().with_timezone(&Utc),
        })
        .await
        .map_err(|e| crate::error::ClickHouseError::Query {
            message: "write failed".into(),
            source: e,
        })?;
    Ok(())
}

async fn write_account(
    account: &Account<'_>,
    inserter: &mut clickhouse::inserter::Inserter<RawAccountEvent>,
) -> Result<()> {
    inserter
        .write(&RawAccountEvent {
            did: account.did.to_smolstr(),
            active: if account.active { 1 } else { 0 },
            status: account
                .status
                .as_ref()
                .map(|s| s.as_ref().to_smolstr())
                .unwrap_or_default(),
            seq: account.seq as u64,
            event_time: account.time.as_ref().with_timezone(&Utc),
        })
        .await
        .map_err(|e| crate::error::ClickHouseError::Query {
            message: "write failed".into(),
            source: e,
        })?;
    Ok(())
}

/// Handle a delete event with poll-then-stub logic
///
/// For deletes, we need to look up the original record to know what was deleted
/// (e.g., which notebook a like was for). If the record doesn't exist yet
/// (out-of-order events), we poll for up to 15 seconds before creating a stub tombstone.
/// Minimal struct for delete lookups - just the fields we need to process the delete
#[derive(Debug, Clone, clickhouse::Row, serde::Deserialize)]
struct LookupRawRecord {
    did: SmolStr,
    collection: SmolStr,
    rkey: SmolStr,
    record: SmolStr, // JSON string of the original record
}

async fn handle_delete(client: &Client, record: ExtractedRecord) -> Result<()> {
    let deadline = Instant::now() + Duration::from_secs(15);

    loop {
        // Try to find the record by CID
        let query = format!(
            r#"
            SELECT did, collection, rkey, record
            FROM raw_records
            WHERE did = '{}' AND cid = '{}'
            ORDER BY event_time DESC
            LIMIT 1
            "#,
            record.did, record.cid
        );

        let original: Option<LookupRawRecord> = client
            .inner()
            .query(&query)
            .fetch_optional()
            .await
            .map_err(|e| crate::error::ClickHouseError::Query {
                message: "delete lookup failed".into(),
                source: e,
            })?;

        if let Some(_original) = original {
            // Found the record - the main insert path already handles creating
            // the delete row, so we're done. In phase 2, this is where we'd
            // parse original.record and insert count deltas for denormalized tables.
            debug!(did = %record.did, cid = %record.cid, "delete found original record");
            return Ok(());
        }

        if Instant::now() > deadline {
            // Gave up - create stub tombstone
            // The record will be inserted via the main batch path with operation='delete'
            // and empty record content, which serves as our stub tombstone
            warn!(
                did = %record.did,
                cid = %record.cid,
                "delete timeout, stub tombstone will be created"
            );
            return Ok(());
        }

        tokio::time::sleep(Duration::from_secs(1)).await;
    }
}
```
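One subtlety worth noting: `RevCache::should_process` compares revs as plain strings (`rev > state.last_rev.as_str()`). That is sound because revs are TIDs, fixed-width base32-sortable timestamps, so lexicographic order agrees with temporal order. A tiny self-contained check; the TIDs are illustrative but well-formed:

```rust
fn main() {
    // A later TID sorts lexicographically after an earlier one.
    let older = "3jzfcijpj2z2a";
    let newer = "3kao2cm7l4s2m";
    assert!(newer > older);

    // Replays and duplicates compare <= the cached rev and are skipped,
    // mirroring the `rev > last_rev` test in should_process.
    let last_seen = newer;
    let incoming = newer;
    assert!(!(incoming > last_seen));
}
```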
crates/weaver-index/src/lib.rs (+2)
```diff
@@
 pub mod config;
 pub mod error;
 pub mod firehose;
+pub mod indexer;
 
 pub use config::Config;
 pub use error::{IndexError, Result};
+pub use indexer::{load_cursor, Indexer};
```