search for standard sites pub-search.waow.tech
search zig blog atproto
12
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 135 lines 4.0 kB view raw
1#!/usr/bin/env -S uv run --script --quiet 2# /// script 3# requires-python = ">=3.12" 4# dependencies = ["httpx", "pydantic-settings"] 5# /// 6"""Delete junk vectors from turbopuffer. 7 8Finds documents with short content or test titles in turso, 9hashes their URIs, and deletes the corresponding vectors from tpuf. 10""" 11 12import hashlib 13import os 14 15import httpx 16from pydantic_settings import BaseSettings, SettingsConfigDict 17 18 19class Settings(BaseSettings): 20 model_config = SettingsConfigDict( 21 env_file=os.environ.get("ENV_FILE", ".env"), extra="ignore" 22 ) 23 turso_url: str 24 turso_token: str 25 turbopuffer_api_key: str 26 turbopuffer_namespace: str = "leaflet-search" 27 28 @property 29 def turso_host(self) -> str: 30 url = self.turso_url 31 if url.startswith("libsql://"): 32 url = url[len("libsql://"):] 33 return url 34 35 36def turso_query(settings: Settings, sql: str, args: list | None = None): 37 stmt: dict = {"sql": sql} 38 if args: 39 stmt["args"] = [{"type": "text", "value": str(a)} for a in args] 40 41 response = httpx.post( 42 f"https://{settings.turso_host}/v2/pipeline", 43 headers={ 44 "Authorization": f"Bearer {settings.turso_token}", 45 "Content-Type": "application/json", 46 }, 47 json={ 48 "requests": [ 49 {"type": "execute", "stmt": stmt}, 50 {"type": "close"}, 51 ], 52 }, 53 timeout=30, 54 ) 55 response.raise_for_status() 56 return response.json() 57 58 59def hash_id(uri: str) -> str: 60 """Match tpuf.zig hashId: first 32 hex chars of SHA256.""" 61 return hashlib.sha256(uri.encode()).hexdigest()[:32] 62 63 64def tpuf_delete(settings: Settings, ids: list[str]): 65 """Delete vectors by ID from turbopuffer.""" 66 url = f"https://api.turbopuffer.com/v2/namespaces/{settings.turbopuffer_namespace}" 67 response = httpx.post( 68 url, 69 headers={ 70 "Authorization": f"Bearer {settings.turbopuffer_api_key}", 71 "Content-Type": "application/json", 72 }, 73 json={"deletes": ids}, 74 timeout=30, 75 ) 76 response.raise_for_status() 77 return response.json() 78 79 80settings = Settings() # type: ignore 81 82# find junk docs: short content OR test titles 83sql = """ 84SELECT uri, title, LENGTH(content) as content_len 85FROM documents 86WHERE embedded_at IS NOT NULL 87 AND ( 88 LENGTH(content) <= 50 89 OR LOWER(title) IN ('test', 'testing', 'untitled', 'test test', 'hello world') 90 OR LOWER(title) LIKE 'test %' 91 OR LOWER(title) LIKE '% test' 92 ) 93ORDER BY content_len ASC 94""" 95 96result = turso_query(settings, sql) 97rows = result["results"][0]["response"]["result"]["rows"] 98 99print(f"found {len(rows)} junk documents with vectors") 100 101if not rows: 102 print("nothing to clean up") 103 raise SystemExit(0) 104 105# show what we'll delete 106for row in rows[:20]: 107 uri = row[0]["value"] if isinstance(row[0], dict) else row[0] 108 title = row[1]["value"] if isinstance(row[1], dict) else row[1] 109 content_len = row[2]["value"] if isinstance(row[2], dict) else row[2] 110 print(f" [{content_len:>5} chars] {title!r:40s} {uri[:60]}") 111 112if len(rows) > 20: 113 print(f" ... and {len(rows) - 20} more") 114 115# compute tpuf IDs 116uris = [row[0]["value"] if isinstance(row[0], dict) else row[0] for row in rows] 117tpuf_ids = [hash_id(uri) for uri in uris] 118 119print(f"\ndeleting {len(tpuf_ids)} vectors from turbopuffer...") 120 121# batch delete (tpuf accepts up to 1000 per request) 122for i in range(0, len(tpuf_ids), 100): 123 batch = tpuf_ids[i : i + 100] 124 tpuf_delete(settings, batch) 125 print(f" deleted batch {i // 100 + 1} ({len(batch)} vectors)") 126 127print("done") 128 129# also clear embedded_at so these docs don't get re-embedded 130# (the new embedder filter will skip them anyway, but belt + suspenders) 131print("\nclearing embedded_at on junk docs...") 132for uri in uris: 133 turso_query(settings, "UPDATE documents SET embedded_at = NULL WHERE uri = ?", [uri]) 134 135print(f"cleared {len(uris)} embedded_at timestamps")