···
 -- Raw records from firehose/jetstream
 -- Core table for all AT Protocol records before denormalization
 --
--- Uses ReplacingMergeTree to deduplicate on (collection, did, rkey) keeping latest indexed_at
 -- JSON column stores full record, extract fields only when needed for ORDER BY/WHERE/JOINs

 CREATE TABLE IF NOT EXISTS raw_records (
···
     collection LowCardinality(String),
     rkey String,

-    -- Content identifier from the record
     cid String,

-    -- Repository revision (TID) - monotonically increasing per DID, used for dedup/ordering
     rev String,

     -- Full record as native JSON (schema-flexible, queryable with record.field.subfield)
     record JSON,

-    -- Operation: 'create', 'update', 'delete'
     operation LowCardinality(String),

     -- Firehose sequence number (metadata only, not for ordering - can jump on relay restart)
···
         SELECT * ORDER BY (did, cid)
     )
 )
-ENGINE = ReplacingMergeTree(indexed_at)
-ORDER BY (collection, did, rkey, event_time, indexed_at)
-SETTINGS deduplicate_merge_projection_mode = 'drop';
···
 -- Raw records from firehose/jetstream
 -- Core table for all AT Protocol records before denormalization
 --
+-- Append-only log using plain MergeTree - all versions preserved for audit/rollback.
+-- Query-time deduplication via ORDER BY + LIMIT or window functions.
 -- JSON column stores full record, extract fields only when needed for ORDER BY/WHERE/JOINs

 CREATE TABLE IF NOT EXISTS raw_records (
···
     collection LowCardinality(String),
     rkey String,

+    -- Content identifier from the record (content-addressed hash)
     cid String,

+    -- Repository revision (TID) - monotonically increasing per DID, used for ordering
     rev String,

     -- Full record as native JSON (schema-flexible, queryable with record.field.subfield)
     record JSON,

+    -- Operation: 'create', 'update', 'delete', 'cache' (fetched on-demand)
     operation LowCardinality(String),

     -- Firehose sequence number (metadata only, not for ordering - can jump on relay restart)
···
         SELECT * ORDER BY (did, cid)
     )
 )
+ENGINE = MergeTree()
+ORDER BY (collection, did, rkey, event_time, indexed_at);
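
As a usage sketch (not part of the migration), this is the query-time deduplication the new header comment describes. It assumes the elided columns include did, event_time, and indexed_at, all of which appear in the ORDER BY key; the DID and rkey are placeholders.

-- Latest visible version of a single record: newest event wins, indexed_at breaks ties
SELECT cid, record, operation
FROM raw_records
WHERE did = 'did:plc:example'              -- placeholder DID
  AND collection = 'sh.weaver.notebook.entry'
  AND rkey = '3kabc123'                    -- placeholder rkey
ORDER BY event_time DESC, indexed_at DESC
LIMIT 1;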
···
+-- Auto-populate handle_mappings from identity events when handle is present
+
+CREATE MATERIALIZED VIEW IF NOT EXISTS handle_mappings_from_identity_mv TO handle_mappings AS
+SELECT
+    handle,
+    did,
+    0 as freed,
+    'active' as account_status,
+    'identity' as source,
+    event_time,
+    now64(3) as indexed_at
+FROM raw_identity_events
+WHERE handle != ''
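
A lookup sketch against the populated table. The diff doesn't show handle_mappings' engine, so the latest-row-wins choice is made explicitly at query time; the handle is a placeholder.

-- Resolve a handle to its current DID
SELECT did
FROM handle_mappings
WHERE handle = 'alice.example.com'
  AND freed = 0
ORDER BY event_time DESC, indexed_at DESC
LIMIT 1;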
···
+-- Auto-populate freed status from account events
+-- JOINs against handle_mappings to find the current handle for the DID
+-- If no mapping exists yet, the JOIN matches nothing and the event is skipped
+-- (can't free unknown handles)
+
+CREATE MATERIALIZED VIEW IF NOT EXISTS handle_mappings_from_account_mv TO handle_mappings AS
+SELECT
+    h.handle,
+    a.did,
+    1 as freed,
+    a.status as account_status,
+    'account' as source,
+    a.event_time,
+    now64(3) as indexed_at
+FROM raw_account_events a
+INNER JOIN handle_mappings h ON h.did = a.did AND h.freed = 0
+WHERE a.active = 0 AND a.status != ''
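
For illustration, a sketch of the question these two MVs answer together: is a handle free to claim? Column types are assumed from the MV SELECTs above; the handle is a placeholder.

-- Claimable when no mapping exists at all, or the most recent row is freed
SELECT if(count() = 0, 1, argMax(freed, indexed_at)) AS claimable
FROM handle_mappings
WHERE handle = 'alice.example.com';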
···
+-- Profile counts aggregated from graph tables
+-- Updated by MVs from follows, notebooks, entries (added later with those tables)
+-- Joined with profiles at query time
+
+CREATE TABLE IF NOT EXISTS profile_counts (
+    did String,
+
+    -- Signed for increment/decrement from MVs
+    follower_count Int64 DEFAULT 0,
+    following_count Int64 DEFAULT 0,
+    notebook_count Int64 DEFAULT 0,
+    entry_count Int64 DEFAULT 0,
+
+    indexed_at DateTime64(3) DEFAULT now64(3)
+)
+ENGINE = SummingMergeTree((follower_count, following_count, notebook_count, entry_count))
+ORDER BY did
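
SummingMergeTree collapses rows only during background merges, so reads must aggregate explicitly. A read sketch (placeholder DID):

-- Current totals for one profile; sum() also covers rows not yet merged
SELECT
    sum(follower_count)  AS followers,
    sum(following_count) AS following,
    sum(notebook_count)  AS notebooks,
    sum(entry_count)     AS entries
FROM profile_counts
WHERE did = 'did:plc:example';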
···
+-- Notebook engagement counts
+-- Updated by MVs from likes, bookmarks, subscriptions (added later with graph tables)
+-- Joined with notebooks at query time
+
+CREATE TABLE IF NOT EXISTS notebook_counts (
+    did String,
+    rkey String,
+
+    -- Signed for increment/decrement from MVs
+    like_count Int64 DEFAULT 0,
+    bookmark_count Int64 DEFAULT 0,
+    subscriber_count Int64 DEFAULT 0,
+
+    indexed_at DateTime64(3) DEFAULT now64(3)
+)
+ENGINE = SummingMergeTree((like_count, bookmark_count, subscriber_count))
+ORDER BY (did, rkey)
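
A sketch of the signed increment/decrement pattern the comment describes. The likes source arrives in a later migration, so the table and column names below are hypothetical; the point is mapping create/delete operations to +1/-1 rows that SummingMergeTree folds together.

-- Hypothetical feeder MV: a like create adds 1, a like delete adds -1
CREATE MATERIALIZED VIEW IF NOT EXISTS notebook_like_counts_mv TO notebook_counts AS
SELECT
    subject_did AS did,                              -- hypothetical source columns
    subject_rkey AS rkey,
    if(operation = 'delete', -1, 1) AS like_count,
    0 AS bookmark_count,
    0 AS subscriber_count,
    now64(3) AS indexed_at
FROM likes                                           -- hypothetical source table
WHERE subject_collection = 'sh.weaver.notebook.book';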
···
+-- Entry engagement counts
+-- Updated by MVs from likes, bookmarks (added later with graph tables)
+-- Joined with entries at query time
+
+CREATE TABLE IF NOT EXISTS entry_counts (
+    did String,
+    rkey String,
+
+    -- Signed for increment/decrement from MVs
+    like_count Int64 DEFAULT 0,
+    bookmark_count Int64 DEFAULT 0,
+
+    indexed_at DateTime64(3) DEFAULT now64(3)
+)
+ENGINE = SummingMergeTree((like_count, bookmark_count))
+ORDER BY (did, rkey)
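
The "joined at query time" pattern from these comments, sketched for entries. The entries table's key columns are assumed from context, and grouping by the entry key absorbs count rows that haven't merged yet.

SELECT
    e.did,
    e.rkey,
    sum(c.like_count)     AS likes,
    sum(c.bookmark_count) AS bookmarks
FROM entries e
LEFT JOIN entry_counts c ON c.did = e.did AND c.rkey = e.rkey
GROUP BY e.did, e.rkey;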
···
+-- Edit heads per resource
+-- Refreshable MV that tracks all branch heads for each resource
+-- A head is a node with no children (nothing has prev pointing to it)
+-- Multiple heads = divergent branches needing merge
+
+CREATE MATERIALIZED VIEW IF NOT EXISTS edit_heads
+REFRESH EVERY 1 MINUTE
+ENGINE = ReplacingMergeTree(indexed_at)
+ORDER BY (resource_did, resource_collection, resource_rkey, head_did, head_rkey)
+AS
+WITH
+    -- All nodes
+    all_nodes AS (
+        SELECT
+            did, rkey, cid, collection, node_type,
+            resource_did, resource_collection, resource_rkey,
+            root_did, root_rkey,
+            prev_did, prev_rkey,
+            created_at
+        FROM edit_nodes
+        WHERE resource_did != ''
+    ),
+    -- Nodes that are pointed to by prev (have children)
+    has_children AS (
+        SELECT DISTINCT prev_did as did, prev_rkey as rkey
+        FROM all_nodes
+        WHERE prev_did != ''
+    ),
+    -- Root CIDs lookup
+    root_cids AS (
+        SELECT did, rkey, cid
+        FROM edit_nodes
+        WHERE node_type = 'root'
+    )
+-- Heads are nodes with no children
+SELECT
+    n.resource_did,
+    n.resource_collection,
+    n.resource_rkey,
+    concat('at://', n.resource_did, '/', n.resource_collection, '/', n.resource_rkey) as resource_uri,
+
+    -- This head
+    n.did as head_did,
+    n.rkey as head_rkey,
+    n.cid as head_cid,
+    n.collection as head_collection,
+    n.node_type as head_type,
+    concat('at://', n.did, '/', n.collection, '/', n.rkey) as head_uri,
+
+    -- Root for this branch (with CID)
+    if(n.node_type = 'root', n.did, n.root_did) as root_did,
+    if(n.node_type = 'root', n.rkey, n.root_rkey) as root_rkey,
+    if(n.node_type = 'root', n.cid, coalesce(r.cid, '')) as root_cid,
+    if(n.node_type = 'root',
+        concat('at://', n.did, '/', n.collection, '/', n.rkey),
+        if(n.root_did != '', concat('at://', n.root_did, '/sh.weaver.edit.root/', n.root_rkey), '')
+    ) as root_uri,
+
+    n.created_at as head_created_at,
+    now64(3) as indexed_at
+FROM all_nodes n
+LEFT ANTI JOIN has_children c ON n.did = c.did AND n.rkey = c.rkey
+LEFT JOIN root_cids r ON r.did = n.root_did AND r.rkey = n.root_rkey
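
A consumer-side sketch: list resources whose edit graph has diverged, i.e. more than one head. FINAL applies the ReplacingMergeTree dedup declared on the view's target.

-- Resources needing a merge
SELECT
    resource_uri,
    count() AS head_count,
    groupArray(head_uri) AS heads
FROM edit_heads FINAL
GROUP BY resource_uri
HAVING head_count > 1;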
···
+-- Resource permissions
+-- Refreshable MV that computes who can access each resource
+-- Combines: owners (resource creator) + collaborators (invite+accept pairs)
+
+CREATE MATERIALIZED VIEW IF NOT EXISTS permissions
+REFRESH EVERY 1 MINUTE
+ENGINE = ReplacingMergeTree(indexed_at)
+ORDER BY (resource_did, resource_collection, resource_rkey, grantee_did)
+AS
+-- Owners: resource creator has owner permission
+SELECT
+    did as resource_did,
+    'sh.weaver.notebook.entry' as resource_collection,
+    rkey as resource_rkey,
+    concat('at://', did, '/sh.weaver.notebook.entry/', rkey) as resource_uri,
+
+    did as grantee_did,
+    'owner' as scope,
+
+    -- Source is the resource itself
+    did as source_did,
+    'sh.weaver.notebook.entry' as source_collection,
+    rkey as source_rkey,
+
+    created_at as granted_at,
+    now64(3) as indexed_at
+FROM entries
+
+UNION ALL
+
+SELECT
+    did as resource_did,
+    'sh.weaver.notebook.book' as resource_collection,
+    rkey as resource_rkey,
+    concat('at://', did, '/sh.weaver.notebook.book/', rkey) as resource_uri,
+
+    did as grantee_did,
+    'owner' as scope,
+
+    did as source_did,
+    'sh.weaver.notebook.book' as source_collection,
+    rkey as source_rkey,
+
+    created_at as granted_at,
+    now64(3) as indexed_at
+FROM notebooks
+
+UNION ALL
+
+-- Collaborators: invite+accept pairs grant permission
+SELECT
+    resource_did,
+    resource_collection,
+    resource_rkey,
+    resource_uri,
+
+    collaborator_did as grantee_did,
+    if(scope != '', scope, 'collaborator') as scope,
+
+    invite_did as source_did,
+    'sh.weaver.collab.invite' as source_collection,
+    invite_rkey as source_rkey,
+
+    accepted_at as granted_at,
+    indexed_at
+FROM collaborators
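
A sketch of the access check this view enables; both DIDs and the rkey are placeholders, and owners pass the same check as collaborators since both appear as grantees.

-- May this user touch this entry?
SELECT count() > 0 AS allowed
FROM permissions FINAL
WHERE resource_did = 'did:plc:owner'
  AND resource_collection = 'sh.weaver.notebook.entry'
  AND resource_rkey = '3kentry1'
  AND grantee_did = 'did:plc:visitor';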
+10-9
crates/weaver-index/src/bin/weaver_indexer.rs
···
 use clap::{Parser, Subcommand};
 use tracing::{error, info, warn};
-use weaver_index::clickhouse::{Client, Migrator, Tables};
 use weaver_index::config::{
     ClickHouseConfig, FirehoseConfig, IndexerConfig, ShardConfig, SourceMode, TapConfig,
 };
···
     let client = Client::new(&config)?;

     if reset {
         if dry_run {
-            info!("Would drop tables:");
-            for table in Tables::ALL {
-                info!("  - {}", table);
             }
         } else {
-            info!("Dropping all tables...");
-            for table in Tables::ALL {
-                let query = format!("DROP TABLE IF EXISTS {}", table);
                 match client.execute(&query).await {
-                    Ok(_) => info!("  dropped {}", table),
-                    Err(e) => warn!("  failed to drop {}: {}", table, e),
                 }
             }
         }
···
 use clap::{Parser, Subcommand};
 use tracing::{error, info, warn};
+use weaver_index::clickhouse::{Client, Migrator};
 use weaver_index::config::{
     ClickHouseConfig, FirehoseConfig, IndexerConfig, ShardConfig, SourceMode, TapConfig,
 };
···
     let client = Client::new(&config)?;

     if reset {
+        let objects = Migrator::all_objects();
         if dry_run {
+            info!("Would drop {} objects:", objects.len());
+            for obj in &objects {
+                info!("  - {} ({:?})", obj.name, obj.object_type);
             }
         } else {
+            info!("Dropping all tables and views...");
+            for obj in &objects {
+                let query = obj.drop_statement();
                 match client.execute(&query).await {
+                    Ok(_) => info!("  dropped {} ({:?})", obj.name, obj.object_type),
+                    Err(e) => warn!("  failed to drop {}: {}", obj.name, e),
                 }
             }
         }
+1-1
crates/weaver-index/src/clickhouse.rs
···
 mod schema;

 pub use client::{Client, TableSize};
-pub use migrations::{MigrationResult, Migrator};
 pub use schema::{
     AccountRevState, FirehoseCursor, RawAccountEvent, RawEventDlq, RawIdentityEvent,
     RawRecordInsert, Tables,
···
 mod schema;

 pub use client::{Client, TableSize};
+pub use migrations::{DbObject, MigrationResult, Migrator, ObjectType};
 pub use schema::{
     AccountRevState, FirehoseCursor, RawAccountEvent, RawEventDlq, RawIdentityEvent,
     RawRecordInsert, Tables,
+20-12
crates/weaver-index/src/clickhouse/client.rs
···
         collection: &str,
         rkey: &str,
     ) -> Result<Option<RecordRow>, IndexError> {
-        // FINAL ensures ReplacingMergeTree deduplication is applied
         // Order by event_time first (firehose data wins), then indexed_at as tiebreaker
         // Include deletes so we can return not-found for deleted records
         let query = r#"
             SELECT cid, record, operation
-            FROM raw_records FINAL
             WHERE did = ?
               AND collection = ?
               AND rkey = ?
···
     /// List records for a repo+collection
     ///
     /// Returns non-deleted records ordered by rkey, with cursor-based pagination.
     pub async fn list_records(
         &self,
         did: &str,
···
         let order = if reverse { "DESC" } else { "ASC" };
         let cursor_op = if reverse { "<" } else { ">" };

-        // Build query with optional cursor
         let query = if cursor.is_some() {
             format!(
                 r#"
                 SELECT rkey, cid, record
-                FROM raw_records FINAL
-                WHERE did = ?
-                  AND collection = ?
-                  AND rkey {cursor_op} ?
-                  AND operation != 'delete'
                 ORDER BY rkey {order}
                 LIMIT ?
                 "#,
···
             format!(
                 r#"
                 SELECT rkey, cid, record
-                FROM raw_records FINAL
-                WHERE did = ?
-                  AND collection = ?
-                  AND operation != 'delete'
                 ORDER BY rkey {order}
                 LIMIT ?
                 "#,
···
         collection: &str,
         rkey: &str,
     ) -> Result<Option<RecordRow>, IndexError> {
         // Order by event_time first (firehose data wins), then indexed_at as tiebreaker
         // Include deletes so we can return not-found for deleted records
         let query = r#"
             SELECT cid, record, operation
+            FROM raw_records
             WHERE did = ?
               AND collection = ?
               AND rkey = ?
···
     /// List records for a repo+collection
     ///
     /// Returns non-deleted records ordered by rkey, with cursor-based pagination.
+    /// Uses window function to get latest operation per rkey and filter out deletes.
     pub async fn list_records(
         &self,
         did: &str,
···
         let order = if reverse { "DESC" } else { "ASC" };
         let cursor_op = if reverse { "<" } else { ">" };

+        // Use window function to get latest version per rkey, then filter out deletes
         let query = if cursor.is_some() {
             format!(
                 r#"
                 SELECT rkey, cid, record
+                FROM (
+                    SELECT rkey, cid, record, operation,
+                        ROW_NUMBER() OVER (PARTITION BY rkey ORDER BY event_time DESC, indexed_at DESC) as rn
+                    FROM raw_records
+                    WHERE did = ?
+                      AND collection = ?
+                      AND rkey {cursor_op} ?
+                )
+                WHERE rn = 1 AND operation != 'delete'
                 ORDER BY rkey {order}
                 LIMIT ?
                 "#,
···
             format!(
                 r#"
                 SELECT rkey, cid, record
+                FROM (
+                    SELECT rkey, cid, record, operation,
+                        ROW_NUMBER() OVER (PARTITION BY rkey ORDER BY event_time DESC, indexed_at DESC) as rn
+                    FROM raw_records
+                    WHERE did = ?
+                      AND collection = ?
+                )
+                WHERE rn = 1 AND operation != 'delete'
                 ORDER BY rkey {order}
                 LIMIT ?
                 "#,
+87-1
crates/weaver-index/src/clickhouse/migrations.rs
···
 use crate::error::{ClickHouseError, IndexError};
 use include_dir::{Dir, include_dir};
 use tracing::info;

 use super::Client;
···
 /// Embedded migrations directory - compiled into the binary
 static MIGRATIONS_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/migrations/clickhouse");

 /// Migration runner for ClickHouse
 pub struct Migrator<'a> {
     client: &'a Client,
···
         files
     }

     /// Run all pending migrations
     pub async fn run(&self) -> Result<MigrationResult, IndexError> {
         // First, ensure the migrations table exists (bootstrap)
···
             }

             info!(migration = %name, "applying migration");
-            self.client.execute(sql).await?;
             self.record_migration(name).await?;
             applied_count += 1;
         }
···
             applied: applied_count,
             skipped: skipped_count,
         })
     }

     /// Check which migrations would be applied without running them
···
 use crate::error::{ClickHouseError, IndexError};
 use include_dir::{Dir, include_dir};
+use regex::Regex;
 use tracing::info;

 use super::Client;
···
 /// Embedded migrations directory - compiled into the binary
 static MIGRATIONS_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/migrations/clickhouse");

+/// Type of database object
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ObjectType {
+    Table,
+    MaterializedView,
+    View,
+}
+
+/// A database object (table or view) extracted from migrations
+#[derive(Debug, Clone)]
+pub struct DbObject {
+    pub name: String,
+    pub object_type: ObjectType,
+}
+
+impl DbObject {
+    /// Get the DROP statement for this object
+    pub fn drop_statement(&self) -> String {
+        match self.object_type {
+            ObjectType::Table => format!("DROP TABLE IF EXISTS {}", self.name),
+            ObjectType::MaterializedView | ObjectType::View => {
+                format!("DROP VIEW IF EXISTS {}", self.name)
+            }
+        }
+    }
+}
+
 /// Migration runner for ClickHouse
 pub struct Migrator<'a> {
     client: &'a Client,
···
         files
     }
+
+    /// Extract all database objects (tables, views) from migrations
+    /// Returns them in reverse order for safe dropping (MVs before their source tables)
+    pub fn all_objects() -> Vec<DbObject> {
+        let table_re =
+            Regex::new(r"(?i)CREATE\s+TABLE\s+IF\s+NOT\s+EXISTS\s+(\w+)").unwrap();
+        let mv_re =
+            Regex::new(r"(?i)CREATE\s+MATERIALIZED\s+VIEW\s+IF\s+NOT\s+EXISTS\s+(\w+)").unwrap();
+        let view_re =
+            Regex::new(r"(?i)CREATE\s+VIEW\s+IF\s+NOT\s+EXISTS\s+(\w+)").unwrap();
+
+        let mut objects = Vec::new();
+
+        for (_, sql) in Self::migrations() {
+            // Push tables first, then views, then MVs, so the final reverse()
+            // yields MVs, views, tables within each migration as well as across them
+            for caps in table_re.captures_iter(sql) {
+                objects.push(DbObject {
+                    name: caps[1].to_string(),
+                    object_type: ObjectType::Table,
+                });
+            }
+            // Regular views; view_re cannot match materialized views, since
+            // "MATERIALIZED" sits between CREATE and VIEW, so no dedup is needed
+            for caps in view_re.captures_iter(sql) {
+                objects.push(DbObject {
+                    name: caps[1].to_string(),
+                    object_type: ObjectType::View,
+                });
+            }
+            // Materialized views
+            for caps in mv_re.captures_iter(sql) {
+                objects.push(DbObject {
+                    name: caps[1].to_string(),
+                    object_type: ObjectType::MaterializedView,
+                });
+            }
+        }
+
+        // Reverse so MVs/views come before their source tables
+        objects.reverse();
+        objects
+    }
+
     /// Run all pending migrations
     pub async fn run(&self) -> Result<MigrationResult, IndexError> {
         // First, ensure the migrations table exists (bootstrap)
···
             }

             info!(migration = %name, "applying migration");
+
+            // Split by semicolons and execute each statement
+            for statement in Self::split_statements(sql) {
+                self.client.execute(statement).await?;
+            }
+
             self.record_migration(name).await?;
             applied_count += 1;
         }
···
             applied: applied_count,
             skipped: skipped_count,
         })
+    }
+
+    /// Split SQL into individual statements
+    /// (naive split on ';' - assumes no semicolons inside string literals or
+    /// comments, which holds for these migration files)
+    fn split_statements(sql: &str) -> Vec<&str> {
+        sql.split(';')
+            .map(|s| s.trim())
+            .filter(|s| !s.is_empty())
+            .collect()
     }

     /// Check which migrations would be applied without running them