Rust AppView - highly experimental!
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat: token based search

+309
+7
Cargo.lock
··· 684 684 "tokio-util", 685 685 "tracing", 686 686 "tracing-subscriber", 687 + "unicode-segmentation", 687 688 "urlencoding", 688 689 "zstd", 689 690 ] ··· 4518 4519 version = "0.1.3" 4519 4520 source = "registry+https://github.com/rust-lang/crates.io-index" 4520 4521 checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" 4522 + 4523 + [[package]] 4524 + name = "unicode-segmentation" 4525 + version = "1.12.0" 4526 + source = "registry+https://github.com/rust-lang/crates.io-index" 4527 + checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" 4521 4528 4522 4529 [[package]] 4523 4530 name = "unsigned-varint"
+1
consumer/Cargo.toml
··· 46 46 futures-channel = "0.3.31" 47 47 urlencoding = "2.1" 48 48 color-eyre = "0.6.5" 49 + unicode-segmentation = "1.10" 49 50 50 51 51 52 [lints.rust]
+2
consumer/src/database_writer/operations/mod.rs
··· 119 119 pub mod handlers; 120 120 pub mod notifications; 121 121 pub mod processor; 122 + pub mod search_index; 122 123 pub mod types; 123 124 124 125 pub use processor::process_record_to_operations; 126 + pub use search_index::SearchIndexWriter; 125 127 pub use types::{AggregateDelta, DatabaseOperation}; 126 128 // SelfLabels available via types::SelfLabels if needed, but most code imports from lexica directly 127 129 // Notification helpers re-exported for convenience
+69
consumer/src/database_writer/operations/search_index.rs
··· 1 + use crate::search::SearchTokenizer; 2 + use chrono::Utc; 3 + use deadpool_postgres::GenericClient; 4 + use eyre::Context as _; 5 + 6 + pub struct SearchIndexWriter { 7 + tokenizer: SearchTokenizer, 8 + } 9 + 10 + impl SearchIndexWriter { 11 + pub fn new() -> Self { 12 + Self { 13 + tokenizer: SearchTokenizer::new(), 14 + } 15 + } 16 + 17 + /// Index posts for search 18 + /// 19 + /// Takes post data (id, content, author_did, etc.) and creates tokenized search records. 20 + /// This is designed to be called from the database writer after posts are inserted. 21 + pub async fn index_posts_raw<C: GenericClient>( 22 + &self, 23 + conn: &C, 24 + post_data: &[(i64, Option<Vec<u8>>, String, Option<String>, bool, bool)], // (post_id, content, author_did, lang, has_media, has_links) 25 + ) -> eyre::Result<()> { 26 + if post_data.is_empty() { 27 + return Ok(()); 28 + } 29 + 30 + let now = Utc::now(); 31 + 32 + for (post_id, content, author_did, lang, has_media, has_links) in post_data { 33 + // Decompress and tokenize content 34 + let tokens = if let Some(content_bytes) = content { 35 + let decompressed = zstd::decode_all(content_bytes.as_slice()) 36 + .wrap_err("Failed to decompress post content")?; 37 + let text = String::from_utf8(decompressed) 38 + .wrap_err("Failed to decode post content as UTF-8")?; 39 + self.tokenizer.tokenize(&text) 40 + } else { 41 + // Empty post or stub - no tokens 42 + vec![] 43 + }; 44 + 45 + // Insert or update search tokens 46 + conn.execute( 47 + "INSERT INTO post_search_tokens (post_id, indexed_at, tokens, lang, has_media, has_links, author_did) 48 + VALUES ($1, $2, $3, $4, $5, $6, $7) 49 + ON CONFLICT (post_id) DO UPDATE SET 50 + tokens = EXCLUDED.tokens, 51 + indexed_at = EXCLUDED.indexed_at, 52 + lang = EXCLUDED.lang, 53 + has_media = EXCLUDED.has_media, 54 + has_links = EXCLUDED.has_links", 55 + &[post_id, &now, &tokens, lang, has_media, has_links, author_did], 56 + ) 57 + .await 58 + .wrap_err("Failed to insert/update search 
tokens")?; 59 + } 60 + 61 + Ok(()) 62 + } 63 + } 64 + 65 + impl Default for SearchIndexWriter { 66 + fn default() -> Self { 67 + Self::new() 68 + } 69 + }
+1
consumer/src/lib.rs
··· 15 15 // mod label_indexer; // Disabled - will be reimplemented with new cursor system 16 16 pub mod parsing; 17 17 mod relay; 18 + pub mod search; 18 19 mod sources; 19 20 pub mod types; 20 21 mod utils;
+2
consumer/src/search/mod.rs
//! Token-based search support: tokenization of post text into search tokens.
pub mod tokenizer;
pub use tokenizer::SearchTokenizer;
+154
consumer/src/search/tokenizer.rs
··· 1 + use regex::Regex; 2 + use std::collections::HashSet; 3 + use std::sync::LazyLock; 4 + use unicode_segmentation::UnicodeSegmentation; 5 + 6 + static HASHTAG_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"#([\p{L}\p{N}_]+)").unwrap()); 7 + static MENTION_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"@([\w-]+)\.[\w.-]+").unwrap()); 8 + 9 + pub struct SearchTokenizer { 10 + min_token_length: usize, 11 + max_tokens_per_post: usize, 12 + stopwords: HashSet<String>, 13 + } 14 + 15 + impl SearchTokenizer { 16 + pub fn new() -> Self { 17 + Self { 18 + min_token_length: 2, 19 + max_tokens_per_post: 100, 20 + stopwords: Self::load_stopwords(), 21 + } 22 + } 23 + 24 + pub fn tokenize(&self, content: &str) -> Vec<String> { 25 + let mut tokens = Vec::new(); 26 + let mut seen = HashSet::new(); 27 + 28 + // Extract hashtags (preserve as single tokens) 29 + for cap in HASHTAG_RE.captures_iter(content) { 30 + let tag = cap[1].to_lowercase(); 31 + if tag.len() >= self.min_token_length && seen.insert(tag.clone()) { 32 + tokens.push(tag); 33 + } 34 + } 35 + 36 + // Extract mentions (username only) 37 + for cap in MENTION_RE.captures_iter(content) { 38 + let username = cap[1].to_lowercase(); 39 + if username.len() >= self.min_token_length && seen.insert(username.clone()) { 40 + tokens.push(username); 41 + } 42 + } 43 + 44 + // Extract regular words 45 + for word in UnicodeSegmentation::unicode_words(content) { 46 + // Skip URLs 47 + if word.starts_with("http") || word.contains(".com") || word.contains(".org") { 48 + continue; 49 + } 50 + 51 + let normalized = word.to_lowercase(); 52 + 53 + // Filter: too short, stopwords, already seen 54 + if normalized.len() < self.min_token_length { 55 + continue; 56 + } 57 + if self.stopwords.contains(&normalized) { 58 + continue; 59 + } 60 + if !seen.insert(normalized.clone()) { 61 + continue; 62 + } 63 + 64 + tokens.push(normalized); 65 + 66 + if tokens.len() >= self.max_tokens_per_post { 67 + break; 68 + } 69 + } 70 + 71 + 
tokens 72 + } 73 + 74 + fn load_stopwords() -> HashSet<String> { 75 + // Top 100 English stopwords 76 + let words = vec![ 77 + "a", "an", "and", "are", "as", "at", "be", "by", "for", 78 + "from", "has", "he", "in", "is", "it", "its", "of", "on", 79 + "that", "the", "to", "was", "will", "with", "this", "but", 80 + "they", "have", "had", "what", "when", "where", "who", "which", 81 + "not", "or", "if", "so", "can", "been", "would", "could", 82 + "should", "may", "do", "does", "did", "me", "we", "us", 83 + "our", "your", "their", "his", "her", "she", "him", "them", 84 + "my", "you", "i", "am", "up", "out", "about", "into", 85 + "than", "then", "now", "all", "some", "any", "no", "more", 86 + "most", "just", "only", "very", "too", "also", "such", 87 + "much", "many", "even", "still", "how", "why", "here", 88 + "there", "said", "each", "which", "these", "those", "other", 89 + "another", "both", "few", "own", "same", "get", "got", 90 + ]; 91 + words.into_iter().map(String::from).collect() 92 + } 93 + } 94 + 95 + impl Default for SearchTokenizer { 96 + fn default() -> Self { 97 + Self::new() 98 + } 99 + } 100 + 101 + #[cfg(test)] 102 + mod tests { 103 + use super::*; 104 + 105 + #[test] 106 + fn test_hashtag_extraction() { 107 + let tokenizer = SearchTokenizer::new(); 108 + let tokens = tokenizer.tokenize("Love #Rust and #Bluesky! 
#AI2024"); 109 + 110 + assert!(tokens.contains(&"rust".to_string())); 111 + assert!(tokens.contains(&"bluesky".to_string())); 112 + assert!(tokens.contains(&"ai2024".to_string())); 113 + } 114 + 115 + #[test] 116 + fn test_mention_extraction() { 117 + let tokenizer = SearchTokenizer::new(); 118 + let tokens = tokenizer.tokenize("Hey @alice.bsky.social check this!"); 119 + 120 + assert!(tokens.contains(&"alice".to_string())); 121 + assert!(tokens.contains(&"hey".to_string())); 122 + assert!(tokens.contains(&"check".to_string())); 123 + } 124 + 125 + #[test] 126 + fn test_stopword_filtering() { 127 + let tokenizer = SearchTokenizer::new(); 128 + let tokens = tokenizer.tokenize("The quick brown fox"); 129 + 130 + assert!(!tokens.contains(&"the".to_string())); 131 + assert!(tokens.contains(&"quick".to_string())); 132 + assert!(tokens.contains(&"brown".to_string())); 133 + assert!(tokens.contains(&"fox".to_string())); 134 + } 135 + 136 + #[test] 137 + fn test_url_skipping() { 138 + let tokenizer = SearchTokenizer::new(); 139 + let tokens = tokenizer.tokenize("Check out https://example.com/article"); 140 + 141 + assert!(tokens.contains(&"check".to_string())); 142 + assert!(!tokens.iter().any(|t| t.contains("http"))); 143 + assert!(!tokens.iter().any(|t| t.contains("example"))); 144 + } 145 + 146 + #[test] 147 + fn test_deduplication() { 148 + let tokenizer = SearchTokenizer::new(); 149 + let tokens = tokenizer.tokenize("rust rust rust #Rust"); 150 + 151 + // Should only appear once (deduplicated) 152 + assert_eq!(tokens.iter().filter(|t| *t == "rust").count(), 1); 153 + } 154 + }
+1
migrations/2025-11-15-225000_create_post_search_tokens/down.sql
-- Revert 2025-11-15-225000_create_post_search_tokens.
-- CASCADE also drops dependent objects (indexes, FK constraints).
DROP TABLE IF EXISTS post_search_tokens CASCADE;
+31
migrations/2025-11-15-225000_create_post_search_tokens/up.sql
-- Create search tokens table (regular PostgreSQL table)
-- TimescaleDB conversion will happen in a future migration
CREATE TABLE post_search_tokens (
    post_id BIGINT PRIMARY KEY,                      -- one row per post
    indexed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),   -- refreshed on re-index
    tokens TEXT[] NOT NULL,                          -- lowercased search tokens, queried via GIN
    lang TEXT,                                       -- nullable: language may be unknown
    has_media BOOLEAN NOT NULL DEFAULT false,
    has_links BOOLEAN NOT NULL DEFAULT false,
    author_did TEXT NOT NULL,

    -- Rows disappear automatically when the post is deleted.
    CONSTRAINT fk_post FOREIGN KEY (post_id)
        REFERENCES posts(id) ON DELETE CASCADE
);

-- Indexes
-- Array-containment token search (tokens @> / &&).
CREATE INDEX idx_post_search_tokens_gin
    ON post_search_tokens USING GIN (tokens);

-- Per-author recency queries.
CREATE INDEX idx_post_search_author
    ON post_search_tokens (author_did, indexed_at DESC);

-- Per-language recency queries; partial to skip unknown-language rows.
CREATE INDEX idx_post_search_lang
    ON post_search_tokens (lang, indexed_at DESC)
    WHERE lang IS NOT NULL;

-- Global recency scans (e.g. re-index sweeps).
CREATE INDEX idx_post_search_indexed_at
    ON post_search_tokens (indexed_at DESC);

-- Note: We'll convert this to a TimescaleDB hypertable in a future migration
-- See .claude/plans/1115-2249-TOKEN_SEARCH_IMPLEMENTATION.md appendix for TimescaleDB conversion plan
+27
parakeet-db/src/models.rs
··· 235 235 pub mentioned_actor_id: i32, // PK: FK to actors 236 236 } 237 237 238 + // Post search tokens - for token-based full-text search 239 + #[derive(Clone, Debug, Queryable, Selectable, Identifiable)] 240 + #[diesel(table_name = crate::schema::post_search_tokens)] 241 + #[diesel(primary_key(post_id))] 242 + #[diesel(check_for_backend(diesel::pg::Pg))] 243 + pub struct PostSearchToken { 244 + pub post_id: i64, // PK: FK to posts 245 + pub indexed_at: DateTime<Utc>, 246 + pub tokens: Vec<Option<String>>, // Token array for GIN index search 247 + pub lang: Option<String>, 248 + pub has_media: bool, 249 + pub has_links: bool, 250 + pub author_did: String, 251 + } 252 + 253 + #[derive(Debug, Insertable)] 254 + #[diesel(table_name = crate::schema::post_search_tokens)] 255 + pub struct NewPostSearchToken { 256 + pub post_id: i64, 257 + pub indexed_at: DateTime<Utc>, 258 + pub tokens: Vec<String>, // Non-nullable tokens for insertion 259 + pub lang: Option<String>, 260 + pub has_media: bool, 261 + pub has_links: bool, 262 + pub author_did: String, 263 + } 264 + 238 265 #[derive(Debug, Queryable, Selectable, Identifiable)] 239 266 #[diesel(table_name = crate::schema::uris)] 240 267 #[diesel(primary_key(id))]
+14
parakeet-db/src/schema.rs
··· 540 540 } 541 541 542 542 diesel::table! { 543 + post_search_tokens (post_id) { 544 + post_id -> Int8, 545 + indexed_at -> Timestamptz, 546 + tokens -> Array<Nullable<Text>>, 547 + lang -> Nullable<Text>, 548 + has_media -> Bool, 549 + has_links -> Bool, 550 + author_did -> Text, 551 + } 552 + } 553 + 554 + diesel::table! { 543 555 postgate_detached (postgate_id, detached_post_id) { 544 556 postgate_id -> Int8, 545 557 detached_post_id -> Int8, ··· 760 772 diesel::joinable!(post_facets -> uris (link_uri_id)); 761 773 diesel::joinable!(post_mentions -> actors (mentioned_actor_id)); 762 774 diesel::joinable!(post_mentions -> posts (post_id)); 775 + diesel::joinable!(post_search_tokens -> posts (post_id)); 763 776 diesel::joinable!(postgate_detached -> postgates (postgate_id)); 764 777 diesel::joinable!(postgate_detached -> posts (detached_post_id)); 765 778 diesel::joinable!(postgates -> actors (actor_id)); ··· 817 830 post_embed_video_captions, 818 831 post_facets, 819 832 post_mentions, 833 + post_search_tokens, 820 834 postgate_detached, 821 835 postgates, 822 836 posts,