search for standard sites
pub-search.waow.tech
search
zig
blog
atproto
1#!/usr/bin/env -S uv run --script --quiet
2# /// script
3# requires-python = ">=3.12"
4# dependencies = ["httpx", "pydantic-settings"]
5# ///
6"""Delete junk vectors from turbopuffer.
7
8Finds documents with short content or test titles in turso,
9hashes their URIs, and deletes the corresponding vectors from tpuf.
10"""
11
12import hashlib
13import os
14
15import httpx
16from pydantic_settings import BaseSettings, SettingsConfigDict
17
18
19class Settings(BaseSettings):
20 model_config = SettingsConfigDict(
21 env_file=os.environ.get("ENV_FILE", ".env"), extra="ignore"
22 )
23 turso_url: str
24 turso_token: str
25 turbopuffer_api_key: str
26 turbopuffer_namespace: str = "leaflet-search"
27
28 @property
29 def turso_host(self) -> str:
30 url = self.turso_url
31 if url.startswith("libsql://"):
32 url = url[len("libsql://"):]
33 return url
34
35
36def turso_query(settings: Settings, sql: str, args: list | None = None):
37 stmt: dict = {"sql": sql}
38 if args:
39 stmt["args"] = [{"type": "text", "value": str(a)} for a in args]
40
41 response = httpx.post(
42 f"https://{settings.turso_host}/v2/pipeline",
43 headers={
44 "Authorization": f"Bearer {settings.turso_token}",
45 "Content-Type": "application/json",
46 },
47 json={
48 "requests": [
49 {"type": "execute", "stmt": stmt},
50 {"type": "close"},
51 ],
52 },
53 timeout=30,
54 )
55 response.raise_for_status()
56 return response.json()
57
58
59def hash_id(uri: str) -> str:
60 """Match tpuf.zig hashId: first 32 hex chars of SHA256."""
61 return hashlib.sha256(uri.encode()).hexdigest()[:32]
62
63
64def tpuf_delete(settings: Settings, ids: list[str]):
65 """Delete vectors by ID from turbopuffer."""
66 url = f"https://api.turbopuffer.com/v2/namespaces/{settings.turbopuffer_namespace}"
67 response = httpx.post(
68 url,
69 headers={
70 "Authorization": f"Bearer {settings.turbopuffer_api_key}",
71 "Content-Type": "application/json",
72 },
73 json={"deletes": ids},
74 timeout=30,
75 )
76 response.raise_for_status()
77 return response.json()
78
79
80settings = Settings() # type: ignore
81
82# find junk docs: short content OR test titles
83sql = """
84SELECT uri, title, LENGTH(content) as content_len
85FROM documents
86WHERE embedded_at IS NOT NULL
87 AND (
88 LENGTH(content) <= 50
89 OR LOWER(title) IN ('test', 'testing', 'untitled', 'test test', 'hello world')
90 OR LOWER(title) LIKE 'test %'
91 OR LOWER(title) LIKE '% test'
92 )
93ORDER BY content_len ASC
94"""
95
96result = turso_query(settings, sql)
97rows = result["results"][0]["response"]["result"]["rows"]
98
99print(f"found {len(rows)} junk documents with vectors")
100
101if not rows:
102 print("nothing to clean up")
103 raise SystemExit(0)
104
105# show what we'll delete
106for row in rows[:20]:
107 uri = row[0]["value"] if isinstance(row[0], dict) else row[0]
108 title = row[1]["value"] if isinstance(row[1], dict) else row[1]
109 content_len = row[2]["value"] if isinstance(row[2], dict) else row[2]
110 print(f" [{content_len:>5} chars] {title!r:40s} {uri[:60]}")
111
112if len(rows) > 20:
113 print(f" ... and {len(rows) - 20} more")
114
115# compute tpuf IDs
116uris = [row[0]["value"] if isinstance(row[0], dict) else row[0] for row in rows]
117tpuf_ids = [hash_id(uri) for uri in uris]
118
119print(f"\ndeleting {len(tpuf_ids)} vectors from turbopuffer...")
120
121# batch delete (tpuf accepts up to 1000 per request)
122for i in range(0, len(tpuf_ids), 100):
123 batch = tpuf_ids[i : i + 100]
124 tpuf_delete(settings, batch)
125 print(f" deleted batch {i // 100 + 1} ({len(batch)} vectors)")
126
127print("done")
128
129# also clear embedded_at so these docs don't get re-embedded
130# (the new embedder filter will skip them anyway, but belt + suspenders)
131print("\nclearing embedded_at on junk docs...")
132for uri in uris:
133 turso_query(settings, "UPDATE documents SET embedded_at = NULL WHERE uri = ?", [uri])
134
135print(f"cleared {len(uris)} embedded_at timestamps")