···
-# Twisted
+# Twisted Monorepo
 
-A mobile client for [Tangled](https://tangled.org).
+- `apps/twisted`: Ionic/Vue client
+- `packages/api`: Go API copied from `~/Projects/TWISTER`
 
 ## Development
 
-Run the mobile apps with Capacitor:
+Use the top-level `justfile` for common tasks:
 
 ```bash
-pnpm cap run ios
-pnpm cap run android
+just dev
+just build
+just test
 ```
 
-Or to test the web version:
-
-```bash
-pnpm dev
-```
+The existing client package still works directly from `apps/twisted`.
···
# Twisted

A mobile client for [Tangled](https://tangled.org).

## Development

Run the mobile apps with Capacitor:

```bash
pnpm cap run ios
pnpm cap run android
```

Or to test the web version:

```bash
pnpm dev
```
···
# If you prefer the allow list template instead of the deny list, see community template:
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
#
# Binaries for programs and plugins
twister
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, built with `go test -c`
*.test

# Code coverage profiles and other test artifacts
*.out
coverage.*
*.coverprofile
profile.cov

# Dependency directories (remove the comment below to include it)
# vendor/

# Go workspace file
go.work
go.work.sum

# env file
.env

# Editor/IDE
# .idea/
# .vscode/
···
# Twister

Tap-based search engine for Tangled.
packages/api/docs/specs/01-architecture.md (+292)
---
title: "Spec 01 — Architecture"
updated: 2026-03-22
---

## 1. Purpose

Build a Go-based search service for Tangled content on AT Protocol that:

* ingests Tangled records through **Tap** (already deployed on Railway)
* denormalizes them into internal search documents
* indexes them in **Turso/libSQL**
* exposes a search API with **keyword**, **semantic**, and **hybrid** retrieval modes

## 2. Functional Goals

The system shall:

* index Tangled-specific ATProto collections under the `sh.tangled.*` namespace
* support initial backfill and continuous incremental sync via Tap
* support lexical retrieval using Turso's Tantivy-backed FTS
* support semantic retrieval using vector embeddings
* support hybrid ranking combining lexical and semantic signals
* expose stable HTTP APIs for search and document lookup
* support deployment on **Railway**

## 3. Non-Functional Goals

The system shall prioritize:

* **correctness of sync** — cursors never advance ahead of committed data
* **operational simplicity** — single binary, subcommand-driven
* **incremental delivery** — keyword search ships before embeddings
* **small deployable services** — process groups, not microservices
* **reindexability** — any document or collection can be re-normalized and re-indexed
* **low coupling** — sync, indexing, and serving are independent concerns

## 4. Out of Scope (v1)

* code-aware symbol search
* Sourcegraph-style structural search
* personalized ranking
* access control beyond public/private visibility flags in indexed records
* full analytics pipeline
* custom ANN infrastructure outside Turso/libSQL

## 5. Design Principles

1. **Tap owns synchronization correctness.** The application does not consume the raw firehose. Tap handles connection, cryptographic verification, backfill, and filtering.

2. **The indexer owns denormalization.** Raw ATProto records are never queried directly by the public API.

3. **Search serves denormalized documents.** Search ranking depends on the document model, not transport.

4. **Keyword search is the baseline.** Semantic and hybrid search are layered on top.

5. **Embeddings are asynchronous.** Ingestion is never blocked on vector generation unless explicitly configured.

## 6. External Systems

- **AT Protocol network** — source of all Tangled content
- **Tap** — filtered event delivery from the AT Protocol firehose (deployed on Railway)
- **Turso/libSQL** — relational storage, Tantivy-backed FTS, and native vector search
- **Embedding provider** — generates vectors for semantic search
- **Railway** — deployment platform for Twister services and Tap

## 7. Architecture Summary

```text
ATProto Firehose / PDS
        │
        ▼
   Tap (Railway)
        │  WebSocket / webhook JSON events
        ▼
 Go Indexer Service
   ├─ decode Tap events
   ├─ normalize records → documents
   ├─ upsert documents
   ├─ schedule embeddings
   └─ persist sync cursor
        │
        ▼
   Turso/libSQL
   ├─ documents table
   ├─ document_embeddings table
   ├─ FTS index (Tantivy-backed)
   ├─ vector index (DiskANN)
   └─ sync_state table
        │
        ▼
  Go Search API
   ├─ keyword search (fts_match / fts_score)
   ├─ semantic search (vector_top_k)
   ├─ hybrid search (weighted merge)
   └─ document fetch
```
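The hybrid search stage in the diagram merges lexical and semantic results. A minimal sketch of a weighted merge, assuming both score sets have already been normalized to a comparable range — the `alpha` weight and the function itself are illustrative, not part of the spec:

```go
package main

import "sort"

// hybridMerge combines normalized lexical (BM25) and semantic (cosine)
// scores per document ID. alpha weights the lexical side; a document
// found by only one retriever contributes zero for the other.
func hybridMerge(lexical, semantic map[string]float64, alpha float64, k int) []string {
	combined := map[string]float64{}
	for id, s := range lexical {
		combined[id] += alpha * s
	}
	for id, s := range semantic {
		combined[id] += (1 - alpha) * s
	}
	ids := make([]string, 0, len(combined))
	for id := range combined {
		ids = append(ids, id)
	}
	// Highest combined score first; ties broken by ID for determinism.
	sort.Slice(ids, func(i, j int) bool {
		if combined[ids[i]] != combined[ids[j]] {
			return combined[ids[i]] > combined[ids[j]]
		}
		return ids[i] < ids[j]
	})
	if len(ids) > k {
		ids = ids[:k]
	}
	return ids
}
```

Other merge schemes (e.g., reciprocal rank fusion) would slot into the same place in the pipeline.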
## 8. Runtime Units

| Unit           | Role                                | Deployment                 |
| -------------- | ----------------------------------- | -------------------------- |
| `api`          | HTTP search and document API        | Railway service (public)   |
| `indexer`      | Tap consumer, normalizer, DB writer | Railway service (internal) |
| `embed-worker` | Async embedding generation          | Optional Railway service   |
| `tap`          | ATProto sync                        | Railway (already deployed) |

## 9. Repository Structure

```text
main.go

internal/
  api/           # HTTP handlers, middleware, routes
  config/        # Config struct, env parsing
  embed/         # Embedding provider abstraction, worker
  index/         # FTS and vector index management
  ingest/        # Tap event consumer, ingestion loop
  normalize/     # Per-collection record → document adapters
  observability/ # Structured logging, metrics
  ranking/       # Score normalization, hybrid merge
  search/        # Search orchestration (keyword, semantic, hybrid)
  store/         # DB access layer, migrations, domain types
  tapclient/     # Tap WebSocket/webhook client
```

## 10. Binary Subcommands

```bash
twister api          # Start HTTP search API
twister indexer      # Start Tap consumer / indexer
twister embed-worker # Start async embedding worker
twister reindex      # Re-normalize and upsert documents
twister reembed      # Re-generate embeddings
twister backfill     # Bootstrap index from seed users
twister healthcheck  # One-shot health probe
```
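One way to wire these subcommands is a simple name→function dispatch in `main.go` — a hedged sketch only; the run functions below are placeholders, and the actual command wiring is not fixed by this spec:

```go
package main

import "fmt"

// commands maps each twister subcommand to its entry point.
// The bodies are stubs standing in for the real services.
var commands = map[string]func() error{
	"api":          func() error { return nil }, // HTTP search API
	"indexer":      func() error { return nil }, // Tap consumer / indexer
	"embed-worker": func() error { return nil }, // async embedding worker
	"healthcheck":  func() error { return nil }, // one-shot probe
}

// dispatch looks up and runs the named subcommand,
// returning an error for unknown names.
func dispatch(name string) error {
	cmd, ok := commands[name]
	if !ok {
		return fmt.Errorf("unknown subcommand %q", name)
	}
	return cmd()
}
```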
## 11. Technology Choices

### Language: Go

Go is the implementation language for the API server, indexer, embedding worker, and CLI commands. Rationale: straightforward long-running services, excellent HTTP support, good concurrency model, small container footprint.

### Sync Layer: Tap

Tap is the only supported sync source in v1. It handles firehose connection, cryptographic verification, backfill, and filtering, then delivers simple JSON events via WebSocket or webhook.

**Tap is already deployed on Railway.** Twister connects to it as a WebSocket client.

#### Tap Capabilities

- Validates repo structure, MST integrity, and identity signatures
- Automatic backfill fetches full repo history from the PDS when repos are added
- Filtered output by DID list, collection, or full network mode
- Ordering guarantees: historical events (`live: false`) delivered before live events (`live: true`)

#### Tap Delivery Modes

| Mode                       | Config                  | Behavior                                          |
| -------------------------- | ----------------------- | ------------------------------------------------- |
| WebSocket + acks (default) | —                       | Client acks each event; no data loss              |
| Fire-and-forget            | `TAP_DISABLE_ACKS=true` | Events marked acked on receipt; simpler but lossy |
| Webhook                    | `TAP_WEBHOOK_URL=...`   | Events POSTed as JSON; acked on HTTP 200          |

#### Tap API Endpoints (reference)

| Endpoint              | Method | Purpose                               |
| --------------------- | ------ | ------------------------------------- |
| `/health`             | GET    | Health check                          |
| `/channel`            | WS     | WebSocket event stream                |
| `/repos/add`          | POST   | Add DIDs to track                     |
| `/repos/remove`       | POST   | Stop tracking a repo                  |
| `/info/:did`          | GET    | Repo state, rev, record count, errors |
| `/stats/repo-count`   | GET    | Total tracked repos                   |
| `/stats/record-count` | GET    | Total tracked records                 |
| `/stats/cursors`      | GET    | Firehose and list-repos cursors       |

#### Key Tap Configuration

| Variable                 | Default | Purpose                                                                            |
| ------------------------ | ------- | ---------------------------------------------------------------------------------- |
| `TAP_SIGNAL_COLLECTION`  | —       | Auto-track repos with records in this collection                                   |
| `TAP_COLLECTION_FILTERS` | —       | Comma-separated collection filters (e.g., `sh.tangled.repo,sh.tangled.repo.issue`) |
| `TAP_ADMIN_PASSWORD`     | —       | Basic auth for API access                                                          |
| `TAP_DISABLE_ACKS`       | `false` | Fire-and-forget mode                                                               |
| `TAP_WEBHOOK_URL`        | —       | Webhook delivery URL                                                               |

### Storage and Search: Turso/libSQL

Turso/libSQL is used for relational metadata storage, Tantivy-backed full-text search, and native vector search.

#### Go SDK Options

| Package                                            | CGo | Embedded Replicas | Remote |
| -------------------------------------------------- | --- | ----------------- | ------ |
| `github.com/tursodatabase/go-libsql`               | Yes | Yes               | Yes    |
| `github.com/tursodatabase/libsql-client-go/libsql` | No  | No                | Yes    |

Both register as `database/sql` drivers under `"libsql"`. They cannot be imported in the same binary.

**Recommendation:** Use `libsql-client-go` (pure Go, remote-only) unless embedded replicas are needed for local read performance.

#### Connection Patterns

```go
// Remote only (pure Go, no CGo)
import _ "github.com/tursodatabase/libsql-client-go/libsql"

db, err := sql.Open("libsql", "libsql://your-db.turso.io?authToken=TOKEN")

// Embedded replica (CGo required)
import "github.com/tursodatabase/go-libsql"

connector, err := libsql.NewEmbeddedReplicaConnector(
    "local.db", "libsql://your-db.turso.io",
    libsql.WithAuthToken("TOKEN"),
    libsql.WithSyncInterval(time.Minute),
)
db := sql.OpenDB(connector)
```
#### Full-Text Search (Tantivy-backed)

Turso FTS is **not** standard SQLite FTS5. It uses Tantivy under the hood.

```sql
-- Create FTS index with per-column tokenizers and weights
CREATE INDEX idx_docs_fts ON documents USING fts (
    title WITH tokenizer=default,
    body WITH tokenizer=default,
    summary WITH tokenizer=default,
    repo_name WITH tokenizer=simple,
    author_handle WITH tokenizer=raw
) WITH (weights='title=3.0,repo_name=2.5,author_handle=2.0,summary=1.5,body=1.0');

-- Filter by match
SELECT id, title FROM documents
WHERE fts_match(title, body, summary, repo_name, author_handle, 'search query');

-- BM25 scoring
SELECT id, title, fts_score(title, body, summary, repo_name, author_handle, 'search query') AS score
FROM documents
ORDER BY score DESC;

-- Highlighting
SELECT fts_highlight(title, '<b>', '</b>', 'search query') AS highlighted
FROM documents;
```

**Available tokenizers:** `default` (Unicode-aware), `raw` (exact match), `simple` (whitespace + punctuation), `whitespace`, `ngram` (2–3 char n-grams).

**Query syntax (Tantivy):** `database AND search`, `database NOT nosql`, `"exact phrase"`, `data*` (prefix), `title:database` (field-specific), `title:database^2` (boosting).

**Limitations:** No snippet function (use highlighting). No automatic segment merging (manual `OPTIMIZE INDEX` required). No read-your-writes within a transaction. No MATCH operator (use the `fts_match()` function).

#### Vector Search

```sql
-- Vector column type
embedding F32_BLOB(768)

-- Insert
INSERT INTO document_embeddings (document_id, embedding, ...)
VALUES (?, vector32(?), ...); -- ? is a JSON array string '[0.1, 0.2, ...]'

-- Brute-force similarity search
SELECT d.id, vector_distance_cos(e.embedding, vector32(?)) AS distance
FROM documents d
JOIN document_embeddings e ON d.id = e.document_id
ORDER BY distance ASC LIMIT 20;

-- Create ANN index (DiskANN)
CREATE INDEX idx_embeddings ON document_embeddings(
    libsql_vector_idx(embedding, 'metric=cosine')
);

-- ANN search via index
SELECT d.id, d.title
FROM vector_top_k('idx_embeddings', vector32(?), 20) AS v
JOIN document_embeddings e ON e.rowid = v.id
JOIN documents d ON d.id = e.document_id;
```

**Vector types:** `F32_BLOB` (recommended), `F16_BLOB`, `F64_BLOB`, `F8_BLOB`, `F1BIT_BLOB`.

**Distance functions:** `vector_distance_cos` (cosine), `vector_distance_l2` (Euclidean).

**Max dimensions:** 65,536. The dimension is fixed at table creation.
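As the insert example above notes, the parameter bound to `vector32(?)` is a JSON array string. A small sketch of producing that string from a Go `[]float32` — the helper name is ours, not part of either libSQL SDK:

```go
package main

import "encoding/json"

// vectorJSON renders an embedding as the JSON array string that
// Turso's vector32(?) conversion function expects as its argument.
func vectorJSON(v []float32) (string, error) {
	b, err := json.Marshal(v)
	if err != nil {
		return "", err
	}
	return string(b), nil
}
```

The result is then passed as an ordinary `database/sql` parameter, e.g. `db.QueryContext(ctx, query, s)`.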
### Deployment: Railway

Railway is the deployment platform. It supports health checks, autodeploy, per-service scaling, and internal networking. Tap is already deployed here. Twister deploys as separate Railway services (api, indexer, embed-worker) within the same project.
packages/api/docs/specs/02-tangled-lexicons.md (+192)
---
title: "Spec 02 — Tangled Lexicons"
updated: 2026-03-22
source: https://github.com/mary-ext/atcute/tree/trunk/packages/definitions/tangled/lexicons/sh/tangled
---

All Tangled records use the `sh.tangled.*` namespace. Records use TID keys unless noted otherwise.

## 1. Searchable Record Types

These are the primary records Twister indexes for search.

### sh.tangled.repo

Repository metadata. Key: `tid`.

| Field         | Type     | Required | Description                                    |
| ------------- | -------- | -------- | ---------------------------------------------- |
| `name`        | string   | yes      | Repository name                                |
| `knot`        | string   | yes      | Knot (hosting node) where the repo was created |
| `spindle`     | string   | no       | CI runner for jobs                             |
| `description` | string   | no       | 1–140 graphemes                                |
| `website`     | uri      | no       | Related URI                                    |
| `topics`      | string[] | no       | Up to 50 topic tags, each 1–50 chars           |
| `source`      | uri      | no       | Upstream source                                |
| `labels`      | at-uri[] | no       | Label definitions this repo subscribes to      |
| `createdAt`   | datetime | yes      |                                                |
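For illustration, the `sh.tangled.repo` record above could be decoded into a Go struct like this. The struct and its JSON tags are our sketch derived from the field table, not a published SDK type:

```go
package main

import "encoding/json"

// RepoRecord mirrors the sh.tangled.repo lexicon fields listed above.
// Optional fields are omitempty; at-uris and uris are kept as strings.
type RepoRecord struct {
	Type        string   `json:"$type"`
	Name        string   `json:"name"`
	Knot        string   `json:"knot"`
	Spindle     string   `json:"spindle,omitempty"`
	Description string   `json:"description,omitempty"`
	Website     string   `json:"website,omitempty"`
	Topics      []string `json:"topics,omitempty"`
	Source      string   `json:"source,omitempty"`
	Labels      []string `json:"labels,omitempty"` // at-uris
	CreatedAt   string   `json:"createdAt"`
}

// decodeRepoRecord parses a raw record payload (e.g., from a Tap event).
func decodeRepoRecord(raw []byte) (RepoRecord, error) {
	var r RepoRecord
	err := json.Unmarshal(raw, &r)
	return r, err
}
```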
### sh.tangled.repo.issue

Issue on a repository. Key: `tid`.

| Field        | Type     | Required | Description                      |
| ------------ | -------- | -------- | -------------------------------- |
| `repo`       | at-uri   | yes      | AT-URI of the parent repo record |
| `title`      | string   | yes      | Issue title                      |
| `body`       | string   | no       | Issue body (markdown)            |
| `createdAt`  | datetime | yes      |                                  |
| `mentions`   | did[]    | no       | Mentioned users                  |
| `references` | at-uri[] | no       | Referenced records               |

### sh.tangled.repo.pull

Pull request. Key: `tid`.

| Field        | Type     | Required | Description                                        |
| ------------ | -------- | -------- | -------------------------------------------------- |
| `target`     | object   | yes      | `{repo: at-uri, branch: string}`                   |
| `title`      | string   | yes      | PR title                                           |
| `body`       | string   | no       | PR description (markdown)                          |
| `patchBlob`  | blob     | yes      | Patch content (`text/x-patch`)                     |
| `source`     | object   | no       | `{branch: string, sha: string(40), repo?: at-uri}` |
| `createdAt`  | datetime | yes      |                                                    |
| `mentions`   | did[]    | no       | Mentioned users                                    |
| `references` | at-uri[] | no       | Referenced records                                 |

### sh.tangled.string

Code snippet / gist. Key: `tid`.

| Field         | Type     | Required | Description         |
| ------------- | -------- | -------- | ------------------- |
| `filename`    | string   | yes      | 1–140 graphemes     |
| `description` | string   | yes      | Up to 280 graphemes |
| `createdAt`   | datetime | yes      |                     |
| `contents`    | string   | yes      | Snippet content     |

### sh.tangled.actor.profile

User profile. Key: `literal:self` (singleton per account).

| Field                | Type     | Required | Description                  |
| -------------------- | -------- | -------- | ---------------------------- |
| `avatar`             | blob     | no       | PNG/JPEG, max 1MB            |
| `description`        | string   | no       | Bio, up to 256 graphemes     |
| `links`              | uri[]    | no       | Up to 5 social/website links |
| `stats`              | string[] | no       | Up to 2 vanity stat types    |
| `bluesky`            | boolean  | yes      | Show Bluesky link            |
| `location`           | string   | no       | Up to 40 graphemes           |
| `pinnedRepositories` | at-uri[] | no       | Up to 6 pinned repos         |
| `pronouns`           | string   | no       | Up to 40 chars               |

## 2. Interaction Record Types

These records represent social interactions. They may be indexed for counts/signals but are lower priority for text search.

### sh.tangled.feed.star

Star/favorite on a record. Key: `tid`.

| Field       | Type     | Required |
| ----------- | -------- | -------- |
| `subject`   | at-uri   | yes      |
| `createdAt` | datetime | yes      |

### sh.tangled.feed.reaction

Emoji reaction on a record. Key: `tid`.

| Field       | Type     | Required | Description                     |
| ----------- | -------- | -------- | ------------------------------- |
| `subject`   | at-uri   | yes      |                                 |
| `reaction`  | string   | yes      | One of: 👍 👎 😆 🎉 🫤 ❤️ 🚀 👀 |
| `createdAt` | datetime | yes      |                                 |

### sh.tangled.graph.follow

Follow a user. Key: `tid`.

| Field       | Type     | Required |
| ----------- | -------- | -------- |
| `subject`   | did      | yes      |
| `createdAt` | datetime | yes      |

## 3. State Record Types

These records track mutable state of issues and PRs.

### sh.tangled.repo.issue.state

| Field   | Type   | Required | Description                                                                |
| ------- | ------ | -------- | -------------------------------------------------------------------------- |
| `issue` | at-uri | yes      |                                                                            |
| `state` | string | yes      | `sh.tangled.repo.issue.state.open` or `sh.tangled.repo.issue.state.closed` |

### sh.tangled.repo.pull.status

| Field    | Type   | Required | Description                                                 |
| -------- | ------ | -------- | ----------------------------------------------------------- |
| `pull`   | at-uri | yes      |                                                             |
| `status` | string | yes      | `sh.tangled.repo.pull.status.open`, `.closed`, or `.merged` |

## 4. Comment Record Types

### sh.tangled.repo.issue.comment

| Field        | Type     | Required | Description                    |
| ------------ | -------- | -------- | ------------------------------ |
| `issue`      | at-uri   | yes      | Parent issue                   |
| `body`       | string   | yes      | Comment body                   |
| `createdAt`  | datetime | yes      |                                |
| `replyTo`    | at-uri   | no       | Parent comment (for threading) |
| `mentions`   | did[]    | no       |                                |
| `references` | at-uri[] | no       |                                |

### sh.tangled.repo.pull.comment

| Field        | Type     | Required | Description  |
| ------------ | -------- | -------- | ------------ |
| `pull`       | at-uri   | yes      | Parent PR    |
| `body`       | string   | yes      | Comment body |
| `createdAt`  | datetime | yes      |              |
| `mentions`   | did[]    | no       |              |
| `references` | at-uri[] | no       |              |

## 5. Infrastructure Record Types

These are not indexed for search but may be consumed for operational context.

| Collection                    | Description                                          |
| ----------------------------- | ---------------------------------------------------- |
| `sh.tangled.label.definition` | Label definitions with name, valueType, scope, color |
| `sh.tangled.label.op`         | Label application operations                         |
| `sh.tangled.git.refUpdate`    | Git reference update events                          |
| `sh.tangled.knot.member`      | Knot membership                                      |
| `sh.tangled.spindle.member`   | Spindle (CI runner) membership                       |
| `sh.tangled.pipeline.status`  | CI pipeline status                                   |

## 6. Collection Priority for v1 Indexing

| Priority | Collection                      | Rationale                            |
| -------- | ------------------------------- | ------------------------------------ |
| P0       | `sh.tangled.repo`               | Core searchable content              |
| P0       | `sh.tangled.repo.issue`         | High-signal text content             |
| P0       | `sh.tangled.repo.pull`          | High-signal text content             |
| P1       | `sh.tangled.string`             | Searchable code snippets             |
| P1       | `sh.tangled.actor.profile`      | User/org discovery                   |
| P2       | `sh.tangled.repo.issue.comment` | Body text, high volume               |
| P2       | `sh.tangled.repo.pull.comment`  | Body text, high volume               |
| P2       | `sh.tangled.repo.issue.state`   | State for filtering, not text search |
| P2       | `sh.tangled.repo.pull.status`   | State for filtering, not text search |
| P3       | `sh.tangled.feed.star`          | Ranking signal (star count)          |
| P3       | `sh.tangled.feed.reaction`      | Ranking signal                       |
| P3       | `sh.tangled.graph.follow`       | Ranking signal                       |

### Tap Collection Filter for v1

```sh
TAP_COLLECTION_FILTERS=sh.tangled.repo,sh.tangled.repo.issue,sh.tangled.repo.issue.comment,sh.tangled.repo.issue.state,sh.tangled.repo.pull,sh.tangled.repo.pull.comment,sh.tangled.repo.pull.status,sh.tangled.string,sh.tangled.actor.profile,sh.tangled.feed.star

# or sh.tangled.*
```
packages/api/docs/specs/03-data-model.md (+153)
---
title: "Spec 03 — Data Model"
updated: 2026-03-22
---

## 1. Search Document

A **search document** is the internal denormalized representation used for retrieval. It is derived from one or more ATProto records via normalization.

### Stable Identifier

```sh
id = did + "|" + collection + "|" + rkey
```

Example: `did:plc:abc123|sh.tangled.repo|3kb3fge5lm32x`
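A sketch of constructing the identifier in Go (the helper name is ours):

```go
package main

import "fmt"

// docID builds the stable composite document identifier:
// id = did + "|" + collection + "|" + rkey.
func docID(did, collection, rkey string) string {
	return fmt.Sprintf("%s|%s|%s", did, collection, rkey)
}
```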
### Required Fields

| Field           | Type    | Description                                                                |
| --------------- | ------- | -------------------------------------------------------------------------- |
| `id`            | TEXT PK | Stable composite identifier                                                |
| `did`           | TEXT    | Author DID                                                                 |
| `collection`    | TEXT    | ATProto collection NSID                                                    |
| `rkey`          | TEXT    | Record key (TID)                                                           |
| `at_uri`        | TEXT    | Full AT-URI                                                                |
| `cid`           | TEXT    | Content identifier (hash)                                                  |
| `record_type`   | TEXT    | Normalized type label (e.g., `repo`, `issue`, `pull`, `string`, `profile`) |
| `title`         | TEXT    | Normalized title                                                           |
| `body`          | TEXT    | Normalized body text                                                       |
| `summary`       | TEXT    | Short summary / description                                                |
| `repo_did`      | TEXT    | DID of the repo owner (resolved from at-uri for issues/PRs)                |
| `repo_name`     | TEXT    | Repository name (resolved)                                                 |
| `author_handle` | TEXT    | Author handle (resolved via identity)                                      |
| `tags_json`     | TEXT    | JSON array of tags/topics                                                  |
| `language`      | TEXT    | Detected or declared language                                              |
| `created_at`    | TEXT    | Record creation timestamp (ISO 8601)                                       |
| `updated_at`    | TEXT    | Last record update timestamp                                               |
| `indexed_at`    | TEXT    | When this document was last indexed                                        |
| `deleted_at`    | TEXT    | Soft-delete timestamp (tombstone)                                          |

### Derived Fields (not stored in documents table)

| Field            | Location                               | Description                    |
| ---------------- | -------------------------------------- | ------------------------------ |
| Embedding vector | `document_embeddings` table            | F32_BLOB(N)                    |
| FTS index        | Turso FTS index                        | Tantivy-backed full-text index |
| Star count       | Aggregated from `sh.tangled.feed.star` | Ranking signal                 |

## 2. Core Documents Table

```sql
CREATE TABLE documents (
    id            TEXT PRIMARY KEY,
    did           TEXT NOT NULL,
    collection    TEXT NOT NULL,
    rkey          TEXT NOT NULL,
    at_uri        TEXT NOT NULL,
    cid           TEXT NOT NULL,
    record_type   TEXT NOT NULL,
    title         TEXT,
    body          TEXT,
    summary       TEXT,
    repo_did      TEXT,
    repo_name     TEXT,
    author_handle TEXT,
    tags_json     TEXT,
    language      TEXT,
    created_at    TEXT,
    updated_at    TEXT,
    indexed_at    TEXT NOT NULL,
    deleted_at    TEXT
);

CREATE INDEX idx_documents_did ON documents(did);
CREATE INDEX idx_documents_collection ON documents(collection);
CREATE INDEX idx_documents_record_type ON documents(record_type);
CREATE INDEX idx_documents_repo_did ON documents(repo_did);
CREATE INDEX idx_documents_created_at ON documents(created_at);
CREATE INDEX idx_documents_deleted_at ON documents(deleted_at);
```

## 3. FTS Index

```sql
CREATE INDEX idx_documents_fts ON documents USING fts (
    title WITH tokenizer=default,
    body WITH tokenizer=default,
    summary WITH tokenizer=default,
    repo_name WITH tokenizer=simple,
    author_handle WITH tokenizer=raw,
    tags_json WITH tokenizer=simple
) WITH (weights='title=3.0,repo_name=2.5,author_handle=2.0,summary=1.5,tags_json=1.2,body=1.0');
```

## 4. Embeddings Table

```sql
CREATE TABLE document_embeddings (
    document_id     TEXT PRIMARY KEY REFERENCES documents(id),
    embedding       F32_BLOB(768),
    embedding_model TEXT NOT NULL,
    embedded_at     TEXT NOT NULL
);

CREATE INDEX idx_embeddings_vec ON document_embeddings(
    libsql_vector_idx(embedding, 'metric=cosine')
);
```

The vector dimension (768) is configurable by model. Changing models requires a new column or table migration.

## 5. Sync State Table

```sql
CREATE TABLE sync_state (
    consumer_name   TEXT PRIMARY KEY,
    cursor          TEXT NOT NULL,
    high_water_mark TEXT,
    updated_at      TEXT NOT NULL
);
```

Stores the Tap event ID that has been successfully committed. On restart, the indexer resumes from this cursor.

## 6. Embedding Jobs Table

```sql
CREATE TABLE embedding_jobs (
    document_id  TEXT PRIMARY KEY REFERENCES documents(id),
    status       TEXT NOT NULL, -- 'pending', 'processing', 'completed', 'failed'
    attempts     INTEGER NOT NULL DEFAULT 0,
    last_error   TEXT,
    scheduled_at TEXT NOT NULL,
    updated_at   TEXT NOT NULL
);

CREATE INDEX idx_embedding_jobs_status ON embedding_jobs(status);
```

## 7. Issue/PR State Cache (optional)

To support filtering search results by issue state or PR status without joining back to the raw records:

```sql
CREATE TABLE record_state (
    subject_uri TEXT PRIMARY KEY, -- at-uri of the issue or PR
    state       TEXT NOT NULL,    -- 'open', 'closed', 'merged'
    updated_at  TEXT NOT NULL
);
```

Updated when `sh.tangled.repo.issue.state` or `sh.tangled.repo.pull.status` events are ingested.
packages/api/docs/specs/04-data-pipeline.md (+360)
---
title: "Spec 04 — Data Pipeline"
updated: 2026-03-22
---

Covers the full data path: Tap event ingestion, record normalization, and failure handling.

## 1. Tap Event Format

### Record Events

```json
{
  "id": 12345,
  "type": "record",
  "record": {
    "live": true,
    "rev": "3kb3fge5lm32x",
    "did": "did:plc:abc123",
    "collection": "sh.tangled.repo",
    "rkey": "3kb3fge5lm32x",
    "action": "create",
    "cid": "bafyreig...",
    "record": {
      "$type": "sh.tangled.repo",
      "name": "my-project",
      "knot": "knot.tangled.org",
      "description": "A cool project",
      "topics": ["go", "search"],
      "createdAt": "2026-03-22T12:00:00.000Z"
    }
  }
}
```

Key fields:

- `id` — monotonic event ID, used as cursor
- `type` — `"record"` or `"identity"`
- `record.live` — `true` for real-time events, `false` for backfill
- `record.action` — `"create"`, `"update"`, or `"delete"`
- `record.did` — author DID
- `record.collection` — ATProto collection NSID
- `record.rkey` — record key
- `record.cid` — content identifier
- `record.record` — the full ATProto record payload (absent on delete)
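The envelope above can be decoded into Go structs along these lines. The types are a sketch derived from the sample JSON; keeping the inner payload as `json.RawMessage` lets collection-specific adapters decode it themselves:

```go
package main

import "encoding/json"

// TapEvent is the outer Tap envelope.
type TapEvent struct {
	ID     int64           `json:"id"`
	Type   string          `json:"type"` // "record" or "identity"
	Record *TapRecordEvent `json:"record,omitempty"`
}

// TapRecordEvent carries one record change.
type TapRecordEvent struct {
	Live       bool            `json:"live"`
	Rev        string          `json:"rev"`
	DID        string          `json:"did"`
	Collection string          `json:"collection"`
	Rkey       string          `json:"rkey"`
	Action     string          `json:"action"` // create | update | delete
	CID        string          `json:"cid"`
	Record     json.RawMessage `json:"record,omitempty"` // absent on delete
}

// decodeTapEvent parses one JSON event from the WebSocket or webhook.
func decodeTapEvent(raw []byte) (TapEvent, error) {
	var e TapEvent
	err := json.Unmarshal(raw, &e)
	return e, err
}
```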
### Identity Events

```json
{
  "id": 12346,
  "type": "identity",
  "identity": {
    "did": "did:plc:abc123",
    "handle": "alice.tangled.org",
    "isActive": true,
    "status": "active"
  }
}
```

Identity events are always delivered for tracked repos, regardless of collection filters.

## 2. WebSocket Protocol

### Connection

Connect to `wss://<tap-host>/channel` (or `ws://` for local dev).

If `TAP_ADMIN_PASSWORD` is set, authenticate with HTTP Basic auth (`admin:<password>`).

### Acknowledgment Protocol

Default mode requires the client to ack each event by sending the event `id` back over the WebSocket. Events are retried after `TAP_RETRY_TIMEOUT` (default 60s) if unacked.

For simpler development, set `TAP_DISABLE_ACKS=true` on Tap for fire-and-forget delivery.

### Ordering Guarantees

Events are ordered **per-repo** (per-DID), not globally:

- **Historical events** (`live: false`) may be sent concurrently within a repo
- **Live events** (`live: true`) are synchronization barriers — all prior events for that repo must complete before a live event is sent
- No ordering guarantee across different repos

Example sequence for one repo: `H1, H2, L1, H3, H4, L2`

- H1 and H2 sent concurrently
- Wait for completion, send L1 alone
- Wait for L1, send H3 and H4 concurrently
- Wait for completion, send L2 alone

### Delivery Guarantee

Events are delivered **at least once**. Duplicates may occur on crashes or ack timeouts. The indexer must handle idempotent upserts.

## 3. Ingestion Contract

For each event, the indexer:

1. Validates `type` is `"record"` (identity events are handled separately)
2. Checks `record.collection` against the allowlist
3. Maps `record.action` to an operation:
   - `create` → upsert document
   - `update` → upsert document
   - `delete` → tombstone document (`deleted_at = now`)
4. Decodes `record.record` into the collection-specific struct
5. Normalizes to internal `Document`
6. Upserts into the documents table
7. Schedules embedding job if eligible
8. Persists cursor (`event.id`) **only after successful DB commit**

### Cursor Persistence Rules

- If DB commit fails → cursor does not advance → event will be retried
- If normalization fails → log error, optionally dead-letter, skip → cursor advances
- If embedding scheduling fails → document remains keyword-searchable → cursor advances
119119+120120+## 4. Backfill Behavior
121121+122122+When a repo is added to Tap (via `/repos/add`, signal collection, or full network mode):
123123+124124+1. Tap fetches full repo history from PDS via `com.atproto.sync.getRepo`
125125+2. Firehose events for that repo are buffered during backfill
126126+3. Historical events (`live: false`) are delivered first
127127+4. After backfill completes, buffered live events drain
128128+5. New firehose events stream normally (`live: true`)
129129+130130+### Application-Level Backfill Support
131131+132132+The indexer also supports:
133133+134134+- Full reindex from existing corpus (re-normalize all stored documents)
135135+- Targeted reindex by collection
136136+- Targeted reindex by DID
137137+138138+These do not involve Tap — they re-process documents already in the database.
139139+140140+## 5. Normalization
141141+142142+Normalization converts heterogeneous `sh.tangled.*` records into the common `Document` shape defined in [03-data-model.md](03-data-model.md).
143143+144144+### Adapter Interface
145145+146146+Each indexed collection provides an adapter:
147147+148148+```go
149149+type RecordAdapter interface {
150150+ Collection() string
151151+ RecordType() string
152152+ Normalize(event TapRecordEvent) (*Document, error)
153153+ Searchable(record map[string]any) bool
154154+}
155155+```
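A hypothetical adapter for `sh.tangled.repo` might look like the following; the `TapRecordEvent` and `Document` shapes are simplified stand-ins for the types in [03-data-model.md](03-data-model.md):

```go
package main

import (
	"errors"
	"fmt"
)

// Simplified stand-ins for the spec's types.
type TapRecordEvent struct {
	DID    string
	Record map[string]any
}

type Document struct {
	Title, Body, RepoName, RepoDID string
}

type RecordAdapter interface {
	Collection() string
	RecordType() string
	Normalize(event TapRecordEvent) (*Document, error)
	Searchable(record map[string]any) bool
}

// RepoAdapter normalizes sh.tangled.repo records per the field mapping
// in this spec.
type RepoAdapter struct{}

func (RepoAdapter) Collection() string { return "sh.tangled.repo" }
func (RepoAdapter) RecordType() string { return "repo" }

func (RepoAdapter) Normalize(ev TapRecordEvent) (*Document, error) {
	name, _ := ev.Record["name"].(string)
	if name == "" {
		return nil, errors.New("repo record missing name")
	}
	desc, _ := ev.Record["description"].(string)
	return &Document{Title: name, Body: desc, RepoName: name, RepoDID: ev.DID}, nil
}

func (RepoAdapter) Searchable(record map[string]any) bool {
	name, _ := record["name"].(string)
	return name != "" // searchable unless the name is empty
}

func main() {
	var a RecordAdapter = RepoAdapter{}
	doc, err := a.Normalize(TapRecordEvent{DID: "did:plc:abc",
		Record: map[string]any{"name": "glow-rs", "description": "TUI viewer"}})
	if err != nil {
		panic(err)
	}
	fmt.Println(doc.Title, doc.RepoDID)
}
```

A sketch only — a production adapter would decode `record.record` into a typed struct rather than asserting out of a map.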
156156+157157+### Per-Collection Normalization
158158+159159+#### sh.tangled.repo → `repo`
160160+161161+| Document Field | Source |
162162+| -------------- | -------------------------------- |
163163+| `title` | `record.name` |
164164+| `body` | `record.description` |
165165+| `summary` | `record.description` (truncated) |
166166+| `repo_name` | `record.name` |
167167+| `repo_did` | `event.did` |
168168+| `tags_json` | `json(record.topics)` |
169169+| `created_at` | `record.createdAt` |
170170+171171+**Searchable:** Always, unless `record.name` is empty.
172172+173173+#### sh.tangled.repo.issue → `issue`
174174+175175+| Document Field | Source |
176176+| -------------- | ------------------------------------------- |
177177+| `title` | `record.title` |
178178+| `body` | `record.body` |
179179+| `summary` | First ~200 chars of `record.body` |
180180+| `repo_did` | Extracted from `record.repo` AT-URI |
181181+| `repo_name` | Resolved from repo AT-URI |
182182+| `tags_json` | `[]` (labels resolved separately if needed) |
183183+| `created_at` | `record.createdAt` |
184184+185185+**Searchable:** Always.
186186+187187+#### sh.tangled.repo.pull → `pull`
188188+189189+| Document Field | Source |
190190+| -------------- | ------------------------------------------ |
191191+| `title` | `record.title` |
192192+| `body` | `record.body` |
193193+| `summary` | First ~200 chars of `record.body` |
194194+| `repo_did` | Extracted from `record.target.repo` AT-URI |
195195+| `repo_name` | Resolved from target repo AT-URI |
196196+| `tags_json` | `[]` |
197197+| `created_at` | `record.createdAt` |
198198+199199+**Searchable:** Always.
200200+201201+#### sh.tangled.string → `string`
202202+203203+| Document Field | Source |
204204+| -------------- | -------------------- |
205205+| `title` | `record.filename` |
206206+| `body` | `record.contents` |
207207+| `summary` | `record.description` |
208208+| `repo_name` | — |
209209+| `repo_did` | — |
210210+| `tags_json` | `[]` |
211211+| `created_at` | `record.createdAt` |
212212+213213+**Searchable:** Always (content is required).
214214+215215+#### sh.tangled.actor.profile → `profile`
216216+217217+| Document Field | Source |
218218+| -------------- | ---------------------------------------------------- |
219219+| `title` | Author handle (resolved from DID) |
220220+| `body` | `record.description` |
221221+| `summary` | `record.description` (truncated) + `record.location` |
222222+| `repo_name` | — |
223223+| `repo_did` | — |
224224+| `tags_json` | `[]` |
225225+| `created_at` | — (profiles don't have createdAt) |
226226+227227+**Searchable:** If `description` is non-empty.
228228+229229+#### sh.tangled.repo.issue.comment → `issue_comment`
230230+231231+| Document Field | Source |
232232+| -------------- | ----------------------------------------- |
233233+| `title` | — (derived: "Comment on {issue title}") |
234234+| `body` | `record.body` |
235235+| `summary` | First ~200 chars of `record.body` |
236236+| `repo_did` | Resolved from `record.issue` AT-URI chain |
237237+| `repo_name` | Resolved |
238238+| `created_at` | `record.createdAt` |
239239+240240+**Searchable:** If body is non-empty.
241241+242242+#### sh.tangled.repo.pull.comment → `pull_comment`
243243+244244+Same pattern as issue comments, using `record.pull` instead of `record.issue`.
245245+246246+### State Event Handling
247247+248248+State and status records (`sh.tangled.repo.issue.state`, `sh.tangled.repo.pull.status`) do **not** produce new search documents. Instead, they update the `record_state` cache table (see [03-data-model.md](03-data-model.md)).
249249+250250+### Interaction Event Handling
251251+252252+Stars (`sh.tangled.feed.star`) and reactions (`sh.tangled.feed.reaction`) do not produce search documents. They may be aggregated for ranking signals in later phases.
253253+254254+### Embedding Input Text
255255+256256+For documents eligible for embedding, compose the input as:
257257+258258+```text
259259+{title}\n{repo_name}\n{author_handle}\n{tags}\n{summary}\n{body}
260260+```
261261+262262+Fields are joined with newlines. Empty fields are omitted.
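A minimal sketch of that composition (the function name and signature are illustrative):

```go
package main

import (
	"fmt"
	"strings"
)

// embeddingInput joins the non-empty fields with newlines, in the
// order shown above; empty fields are omitted entirely.
func embeddingInput(title, repoName, authorHandle, tags, summary, body string) string {
	var parts []string
	for _, f := range []string{title, repoName, authorHandle, tags, summary, body} {
		if f != "" {
			parts = append(parts, f)
		}
	}
	return strings.Join(parts, "\n")
}

func main() {
	fmt.Println(embeddingInput("glow-rs", "glow-rs", "desertthunder.dev",
		"rust tui markdown", "Rust TUI markdown viewer", ""))
}
```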
263263+264264+### Repo Name Resolution
265265+266266+Issues, PRs, and comments reference their parent repo via AT-URI (e.g., `at://did:plc:abc/sh.tangled.repo/tid`). Resolving the repo name requires either:
267267+268268+1. Looking up the repo document in the local `documents` table
269269+2. Caching repo metadata in a lightweight lookup table
270270+271271+Option 1 is preferred for v1. If the repo document hasn't been indexed yet, `repo_name` is left empty and backfilled on the next reindex pass.
272272+273273+## 6. Identity Event Handling
274274+275275+Identity events should be used to maintain an author handle cache:
276276+277277+```text
278278+did → handle mapping
279279+```
280280+281281+When an identity event arrives with a new handle, update `author_handle` on all documents with that DID. This ensures search by handle returns current results.
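One way to apply such an update, sketched against in-memory stand-ins (in the real service this would be two SQL statements: an upsert into the handle cache and an `UPDATE` on `documents`):

```go
package main

import "fmt"

// In-memory stand-ins for the handle cache and documents table.
type doc struct{ DID, AuthorHandle string }

var (
	handles   = map[string]string{}
	documents []doc
)

// applyIdentityEvent refreshes the did → handle mapping, then rewrites
// author_handle on every document owned by that DID so search by
// handle returns current results.
func applyIdentityEvent(did, handle string) (updated int) {
	handles[did] = handle
	for i := range documents {
		if documents[i].DID == did && documents[i].AuthorHandle != handle {
			documents[i].AuthorHandle = handle
			updated++
		}
	}
	return updated
}

func main() {
	documents = append(documents, doc{DID: "did:plc:abc", AuthorHandle: "old.handle"})
	fmt.Println(applyIdentityEvent("did:plc:abc", "new.handle"))
}
```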
282282+283283+## 7. Repo Management
284284+285285+To add repos for tracking, POST to Tap's `/repos/add` endpoint:
286286+287287+```bash
288288+curl -u admin:PASSWORD -X POST https://tap-host/repos/add \
289289+ -H "Content-Type: application/json" \
290290+ -d '{"dids": ["did:plc:abc123", "did:plc:def456"]}'
291291+```
292292+293293+Alternatively, use `TAP_SIGNAL_COLLECTION=sh.tangled.repo` to auto-track any repo that has Tangled repo records.
294294+295295+## 8. Failure Handling
296296+297297+### Ingestion Failures
298298+299299+If Tap event processing fails before DB commit:
300300+301301+- Log the failure with event ID, DID, collection, rkey, and error class
302302+- Retry with exponential backoff (for transient errors like DB timeouts)
303303+- Do **not** advance cursor — the event will be re-delivered by Tap
304304+- After max retries for a persistent error, log and skip (cursor advances)
305305+306306+### Normalization Failures
307307+308308+If a record cannot be normalized:
309309+310310+- Log collection, DID, rkey, CID, and error class
311311+- Do not crash the process
312312+- Skip the event and advance cursor
313313+- Optionally insert into a `dead_letter` table for manual inspection
314314+315315+### Embedding Failures
316316+317317+If embedding generation fails:
318318+319319+- The document remains keyword-searchable
320320+- The embedding job is marked `failed` with `last_error` and incremented `attempts`
321321+- Jobs are retried with exponential backoff up to a max attempt count
322322+- After max attempts, the job enters `dead` state
323323+- The embed-worker exposes failed job count as a metric
324324+325325+### DB Failures
326326+327327+If Turso/libSQL is unreachable:
328328+329329+- **API** returns `503` for search endpoints; `/healthz` still returns 200 (liveness), `/readyz` returns 503
330330+- **Indexer** pauses event processing and retries DB connection with backoff; cursor does not advance
331331+- **Embed-worker** pauses job processing and retries
332332+333333+### Tap Connection Failures
334334+335335+If the WebSocket connection to Tap drops:
336336+337337+- Reconnect with exponential backoff
338338+- Resume from the last persisted cursor
339339+- Log reconnection attempts and success
340340+341341+Tap itself handles firehose reconnection independently — a Tap restart does not require indexer intervention beyond reconnecting the WebSocket.
342342+343343+### Duplicate Event Handling
344344+345345+Tap delivers events **at least once**. Duplicates are handled by:
346346+347347+- Using `id = did|collection|rkey` as the primary key
347347+- Making every write an upsert (`INSERT OR REPLACE` / `ON CONFLICT ... DO UPDATE`)
348348+- Comparing CIDs to distinguish true no-ops (same content) from actual updates
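Under those rules, a duplicate-safe upsert can be sketched in memory (the real implementation writes to Turso with an `ON CONFLICT` clause):

```go
package main

import "fmt"

type stored struct{ CID, Title string }

var table = map[string]stored{} // keyed by id = did|collection|rkey

// upsert is duplicate-safe: the composite key makes redelivery
// idempotent, and comparing CIDs separates a true no-op (same content)
// from an actual update.
func upsert(did, collection, rkey, cid, title string) (changed bool) {
	id := did + "|" + collection + "|" + rkey
	if prev, ok := table[id]; ok && prev.CID == cid {
		return false // redelivered event with identical content
	}
	table[id] = stored{CID: cid, Title: title}
	return true
}

func main() {
	fmt.Println(upsert("did:plc:abc", "sh.tangled.repo", "3k", "cid1", "glow-rs"))
	fmt.Println(upsert("did:plc:abc", "sh.tangled.repo", "3k", "cid1", "glow-rs"))
}
```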
350350+351351+### Startup Recovery
352352+353353+On indexer startup:
354354+355355+1. Read `cursor` from `sync_state` table
356356+2. Connect to Tap WebSocket
357357+3. Tap replays events from the stored cursor position
358358+4. Processing resumes normally
359359+360360+If no cursor exists (first run), Tap delivers all historical events from backfill.
+296
packages/api/docs/specs/05-search.md
···11+---
22+title: "Spec 05 — Search"
33+updated: 2026-03-22
44+---
55+66+Covers all search modes, the API contract, scoring, and filtering.
77+88+## 1. Search Modes
99+1010+| Mode | Backing | Available |
1111+|------|---------|-----------|
1212+| `keyword` | Turso Tantivy-backed FTS | MVP |
1313+| `semantic` | Vector similarity (DiskANN index) | Phase 2 |
1414+| `hybrid` | Weighted merge of keyword + semantic | Phase 3 |
1515+1616+## 2. Keyword Search
1717+1818+### Implementation
1919+2020+Uses Turso's `fts_score()` function for BM25 ranking:
2121+2222+```sql
2323+SELECT
2424+ d.id, d.title, d.summary, d.repo_name, d.author_handle,
2525+ d.collection, d.record_type, d.updated_at,
2626+ fts_score(d.title, d.body, d.summary, d.repo_name, d.author_handle, d.tags_json, ?) AS score
2727+FROM documents d
2828+WHERE fts_match(d.title, d.body, d.summary, d.repo_name, d.author_handle, d.tags_json, ?)
2929+ AND d.deleted_at IS NULL
3030+ORDER BY score DESC
3131+LIMIT ? OFFSET ?;
3232+```
3333+3434+### Field Weights
3535+3636+Configured in the FTS index definition:
3737+3838+| Field | Weight | Rationale |
3939+|-------|--------|-----------|
4040+| `title` | 3.0 | Highest signal for relevance |
4141+| `repo_name` | 2.5 | Exact repo lookups should rank first |
4242+| `author_handle` | 2.0 | Author search is common |
4343+| `summary` | 1.5 | More focused than body |
4444+| `tags_json` | 1.2 | Topic matching |
4545+| `body` | 1.0 | Baseline |
4646+4747+### Query Features
4848+4949+Tantivy query syntax is exposed to users:
5050+5151+- Boolean: `go AND search`, `rust NOT unsafe`
5252+- Phrase: `"pull request"`
5353+- Prefix: `tang*`
5454+- Field-specific: `title:parser`
5555+5656+### Snippets
5757+5858+Use `fts_highlight()` to generate highlighted snippets:
5959+6060+```sql
6161+fts_highlight(d.body, '<mark>', '</mark>', ?) AS body_snippet
6262+```
6363+6464+## 3. Semantic Search
6565+6666+### Query Flow
6767+6868+1. Convert user query text to embedding via the configured provider
6969+2. Query `vector_top_k` for nearest neighbors
7070+3. Join back to `documents` to get metadata
7171+4. Filter out deleted/hidden documents
7272+5. Return results with distance as score
7373+7474+```sql
7575+SELECT d.id, d.title, d.summary, d.repo_name, d.author_handle,
7676+ d.collection, d.record_type, d.updated_at
7777+FROM vector_top_k('idx_embeddings_vec', vector32(?), ?) AS v
7878+JOIN document_embeddings e ON e.rowid = v.id
7979+JOIN documents d ON d.id = e.document_id
8080+WHERE d.deleted_at IS NULL;
8181+```
8282+8383+### Score Normalization
8484+8585+Cosine distance ranges from 0 (identical) to 2 (opposite). Normalize to a 0–1 relevance score:
8686+8787+```
8888+semantic_score = 1.0 - (distance / 2.0)
8989+```
9090+9191+## 4. Hybrid Search
9292+9393+### v1: Weighted Score Blending
9494+9595+```
9696+hybrid_score = 0.65 * keyword_score_normalized + 0.35 * semantic_score_normalized
9797+```
9898+9999+### Score Normalization for Blending
100100+101101+Keyword (BM25) scores are unbounded. Normalize using min-max within the result set:
102102+103103+```
104104+keyword_normalized = (score - min_score) / (max_score - min_score)
105105+```
106106+107107+Semantic scores are already bounded after the distance-to-relevance conversion.
108108+109109+### Merge Strategy
110110+111111+1. Fetch top N keyword results (e.g., N=50)
112112+2. Fetch top N semantic results
113113+3. Merge on `document_id`
114114+4. For documents appearing in both sets, combine scores
115115+5. For documents in only one set, use that score (with 0 for the missing signal)
116116+6. Sort by `hybrid_score` descending
117117+7. Deduplicate
118118+8. Apply limit/offset
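The merge strategy can be sketched as follows, with hypothetical types and the v1 weights:

```go
package main

import (
	"fmt"
	"sort"
)

type hit struct {
	ID    string
	Score float64
}

// hybridMerge implements the steps above: min-max normalize the
// unbounded keyword (BM25) scores, merge on document ID with a missing
// signal contributing 0, and sort by the weighted hybrid score.
func hybridMerge(keyword, semantic []hit, kwWeight, semWeight float64) []hit {
	scores := map[string]float64{}
	if len(keyword) > 0 {
		lo, hi := keyword[0].Score, keyword[0].Score
		for _, h := range keyword {
			if h.Score < lo {
				lo = h.Score
			}
			if h.Score > hi {
				hi = h.Score
			}
		}
		for _, h := range keyword {
			norm := 1.0
			if hi > lo {
				norm = (h.Score - lo) / (hi - lo)
			}
			scores[h.ID] += kwWeight * norm
		}
	}
	for _, h := range semantic { // already in [0,1] after conversion
		scores[h.ID] += semWeight * h.Score
	}
	merged := make([]hit, 0, len(scores))
	for id, s := range scores {
		merged = append(merged, hit{ID: id, Score: s})
	}
	sort.Slice(merged, func(i, j int) bool { return merged[i].Score > merged[j].Score })
	return merged
}

func main() {
	kw := []hit{{"doc-a", 12.4}, {"doc-b", 3.1}}
	sem := []hit{{"doc-a", 0.8}, {"doc-c", 0.6}}
	for _, h := range hybridMerge(kw, sem, 0.65, 0.35) {
		fmt.Printf("%s %.3f\n", h.ID, h.Score)
	}
}
```

Merging into a score map deduplicates as a side effect, so steps 3–7 collapse into one pass.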
119119+120120+### v2: Reciprocal Rank Fusion (future)
121121+122122+If keyword and semantic score scales prove unstable under weighted blending, replace with RRF:
123123+124124+```
125125+rrf_score = Σ 1 / (k + rank_i)
126126+```
127127+128128+where `k` is a constant (typically 60) and `rank_i` is the document's rank in each result list.
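A minimal RRF sketch over ranked ID lists:

```go
package main

import (
	"fmt"
	"sort"
)

// rrf fuses ranked ID lists with reciprocal rank fusion:
// score(d) = Σ 1/(k + rank_i), with ranks starting at 1.
func rrf(k float64, lists ...[]string) []string {
	scores := map[string]float64{}
	for _, list := range lists {
		for rank, id := range list {
			scores[id] += 1.0 / (k + float64(rank+1))
		}
	}
	ids := make([]string, 0, len(scores))
	for id := range scores {
		ids = append(ids, id)
	}
	sort.Slice(ids, func(i, j int) bool { return scores[ids[i]] > scores[ids[j]] })
	return ids
}

func main() {
	// "glow-rs" ranks 2nd in both lists; the others appear only once.
	fmt.Println(rrf(60, []string{"other", "glow-rs"}, []string{"misc", "glow-rs"}))
}
```

Because only ranks are used, RRF is immune to the scale-mismatch problem that motivates it.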
129129+130130+## 5. Filtering
131131+132132+All search modes support these filters, applied as SQL WHERE clauses:
133133+134134+| Filter | Parameter | SQL |
135135+|--------|-----------|-----|
136136+| Collection | `collection` | `d.collection = ?` |
137137+| Author | `author` | `d.author_handle = ?` or `d.did = ?` |
138138+| Repo | `repo` | `d.repo_name = ?` or `d.repo_did = ?` |
139139+| Record type | `type` | `d.record_type = ?` |
140140+| Language | `language` | `d.language = ?` |
141141+| Date range | `from`, `to` | `d.created_at >= ?` and `d.created_at <= ?` |
142142+| State | `state` | Join to `record_state` table |
143143+144144+## 6. Embedding Eligibility
145145+146146+A document is eligible for embedding if:
147147+148148+- `deleted_at IS NULL`
149149+- `record_type` is one of: `repo`, `issue`, `pull`, `string`, `profile`
150150+- At least one of `title`, `body`, or `summary` is non-empty
151151+- Total text length exceeds a minimum threshold (e.g., 20 characters)
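Those criteria translate to a small predicate; the function name and threshold constant are illustrative:

```go
package main

import "fmt"

var embeddableTypes = map[string]bool{
	"repo": true, "issue": true, "pull": true, "string": true, "profile": true,
}

const minTextLen = 20 // example threshold from the spec

// eligibleForEmbedding applies the four criteria above: not deleted,
// an embeddable record type, some non-empty text, and enough of it.
func eligibleForEmbedding(recordType, title, body, summary string, deleted bool) bool {
	if deleted || !embeddableTypes[recordType] {
		return false
	}
	if title == "" && body == "" && summary == "" {
		return false
	}
	return len(title)+len(body)+len(summary) > minTextLen
}

func main() {
	fmt.Println(eligibleForEmbedding("repo", "glow-rs",
		"A TUI markdown viewer inspired by Glow, written in Rust.", "", false))
}
```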
152152+153153+## 7. API Endpoints
154154+155155+### Health
156156+157157+| Method | Path | Description |
158158+| ------ | ---------- | -------------------------------- |
159159+| GET | `/healthz` | Liveness — process is responsive |
160160+| GET | `/readyz` | Readiness — DB is reachable |
161161+162162+### Search
163163+164164+| Method | Path | Description |
165165+| ------ | ------------------ | ------------------------------------------------ |
166166+| GET | `/search` | Search with configurable mode (default: keyword) |
167167+| GET | `/search/keyword` | Keyword-only search |
168168+| GET | `/search/semantic` | Semantic-only search |
169169+| GET | `/search/hybrid` | Hybrid search |
170170+171171+### Documents
172172+173173+| Method | Path | Description |
174174+| ------ | ----------------- | ----------------------------- |
175175+| GET | `/documents/{id}` | Fetch a single document by ID |
176176+177177+### Admin
178178+179179+| Method | Path | Description |
180180+| ------ | ---------------- | -------------------- |
181181+| POST | `/admin/reindex` | Trigger reindex |
182182+| POST | `/admin/reembed` | Trigger re-embedding |
183183+184184+Admin endpoints are disabled by default. Enable with `ENABLE_ADMIN_ENDPOINTS=true`.
185185+186186+## 8. Query Parameters
187187+188188+| Parameter | Type | Default | Description |
189189+| ------------ | ------ | --------- | -------------------------------------------------------------------- |
190190+| `q` | string | required | Search query |
191191+| `mode` | string | `keyword` | `keyword`, `semantic`, or `hybrid` |
192192+| `limit` | int | 20 | Results per page (max: `SEARCH_MAX_LIMIT`) |
193193+| `offset` | int | 0 | Pagination offset |
194194+| `collection` | string | — | Filter by `sh.tangled.*` collection |
195195+| `type` | string | — | Filter by record type (`repo`, `issue`, `pull`, `string`, `profile`) |
196196+| `author` | string | — | Filter by author handle or DID |
197197+| `repo` | string | — | Filter by repo name or repo DID |
198198+| `language` | string | — | Filter by language |
199199+| `from` | string | — | Created after (ISO 8601) |
200200+| `to` | string | — | Created before (ISO 8601) |
201201+| `state` | string | — | Filter by state (`open`, `closed`, `merged`) |
202202+203203+## 9. Search Response
204204+205205+```json
206206+{
207207+ "query": "rust markdown tui",
208208+ "mode": "hybrid",
209209+ "total": 142,
210210+ "limit": 20,
211211+ "offset": 0,
212212+ "results": [
213213+ {
214214+ "id": "did:plc:abc|sh.tangled.repo|3kb3fge5lm32x",
215215+ "collection": "sh.tangled.repo",
216216+ "record_type": "repo",
217217+ "title": "glow-rs",
218218+ "body_snippet": "A TUI markdown viewer inspired by <mark>Glow</mark>...",
219219+ "summary": "Rust TUI markdown viewer",
220220+ "repo_name": "glow-rs",
221221+ "author_handle": "desertthunder.dev",
222222+ "score": 0.842,
223223+ "matched_by": ["keyword", "semantic"],
224224+ "created_at": "2026-03-20T10:00:00Z",
225225+ "updated_at": "2026-03-22T15:03:11Z"
226226+ }
227227+ ]
228228+}
229229+```
230230+231231+### Result Fields
232232+233233+| Field | Type | Description |
234234+| --------------- | -------- | --------------------------------------- |
235235+| `id` | string | Document stable ID |
236236+| `collection` | string | ATProto collection NSID |
237237+| `record_type` | string | Normalized type label |
238238+| `title` | string | Document title |
239239+| `body_snippet` | string | Highlighted body excerpt |
240240+| `summary` | string | Short description |
241241+| `repo_name` | string | Repository name (if applicable) |
242242+| `author_handle` | string | Author handle |
243243+| `score` | float | Relevance score (0–1) |
244244+| `matched_by` | string[] | Which search modes produced this result |
245245+| `created_at` | string | ISO 8601 creation timestamp |
246246+| `updated_at` | string | ISO 8601 last update timestamp |
247247+248248+## 10. Document Response
249249+250250+`GET /documents/{id}` returns the full document:
251251+252252+```json
253253+{
254254+ "id": "did:plc:abc|sh.tangled.repo|3kb3fge5lm32x",
255255+ "did": "did:plc:abc",
256256+ "collection": "sh.tangled.repo",
257257+ "rkey": "3kb3fge5lm32x",
258258+ "at_uri": "at://did:plc:abc/sh.tangled.repo/3kb3fge5lm32x",
259259+ "cid": "bafyreig...",
260260+ "record_type": "repo",
261261+ "title": "glow-rs",
262262+ "body": "A TUI markdown viewer inspired by Glow, written in Rust.",
263263+ "summary": "Rust TUI markdown viewer",
264264+ "repo_name": "glow-rs",
265265+ "author_handle": "desertthunder.dev",
266266+ "tags_json": "[\"rust\", \"tui\", \"markdown\"]",
267267+ "language": "en",
268268+ "created_at": "2026-03-20T10:00:00Z",
269269+ "updated_at": "2026-03-22T15:03:11Z",
270270+ "indexed_at": "2026-03-22T15:05:00Z",
271271+ "has_embedding": true
272272+}
273273+```
274274+275275+## 11. Error Responses
276276+277277+| Status | Condition |
278278+| ------ | ------------------------------------------------------------------ |
279279+| 400 | Missing `q` parameter, invalid `limit`/`offset`, malformed filters |
280280+| 404 | Document not found |
281281+| 503 | DB unreachable (readiness failure) |
282282+283283+```json
284284+{
285285+ "error": "invalid_parameter",
286286+ "message": "limit must be between 1 and 100"
287287+}
288288+```
289289+290290+## 12. API Behavior
291291+292292+- `keyword` returns only lexical matches via `fts_match`/`fts_score`
293293+- `semantic` returns only embedding-backed matches via `vector_top_k`
294294+- `hybrid` merges both result sets and reranks
295295+- All modes exclude documents with `deleted_at IS NOT NULL` by default
296296+- Pagination uses `limit`/`offset` (cursor-based pagination deferred)
+325
packages/api/docs/specs/06-operations.md
···11+---
22+title: "Spec 06 — Operations"
33+updated: 2026-03-22
44+---
55+66+Covers configuration, observability, security, and deployment.
77+88+## 1. Configuration
99+1010+All configuration is via environment variables.
1111+1212+### Required
1313+1414+| Variable | Description |
1515+| --------------------- | ----------------------------------------------------------- |
1616+| `TAP_URL` | Tap WebSocket URL (e.g., `wss://tap.example.com/channel`) |
1717+| `TAP_AUTH_PASSWORD` | Tap admin password for Basic auth (if set on Tap) |
1818+| `TURSO_DATABASE_URL` | Turso connection URL (e.g., `libsql://db-name.turso.io`) |
1919+| `TURSO_AUTH_TOKEN` | Turso JWT auth token |
2020+| `INDEXED_COLLECTIONS` | Comma-separated list of `sh.tangled.*` collections to index |
2121+2222+### Search
2323+2424+| Variable | Default | Description |
2525+| ---------------------- | --------- | ------------------------ |
2626+| `SEARCH_DEFAULT_LIMIT` | `20` | Default results per page |
2727+| `SEARCH_MAX_LIMIT` | `100` | Maximum results per page |
2828+| `SEARCH_DEFAULT_MODE` | `keyword` | Default search mode |
2929+3030+### Embedding
3131+3232+| Variable | Default | Description |
3333+| ---------------------- | ------- | ---------------------------------------------------- |
3434+| `EMBEDDING_PROVIDER` | — | Provider name (e.g., `openai`, `ollama`, `voyageai`) |
3535+| `EMBEDDING_MODEL` | — | Model name (e.g., `text-embedding-3-small`) |
3636+| `EMBEDDING_API_KEY` | — | Provider API key |
3737+| `EMBEDDING_API_URL` | — | Provider base URL (for self-hosted) |
3838+| `EMBEDDING_DIM` | `768` | Vector dimensionality |
3939+| `EMBEDDING_BATCH_SIZE` | `32` | Batch size for embed-worker |
4040+4141+### Hybrid Search
4242+4343+| Variable | Default | Description |
4444+| ------------------------ | ------- | --------------------------------------- |
4545+| `HYBRID_KEYWORD_WEIGHT` | `0.65` | Keyword score weight in hybrid ranking |
4646+| `HYBRID_SEMANTIC_WEIGHT` | `0.35` | Semantic score weight in hybrid ranking |
4747+4848+### Server
4949+5050+| Variable | Default | Description |
5151+| ------------------------ | ------- | ------------------------------------------- |
5252+| `HTTP_BIND_ADDR` | `:8080` | API server bind address |
5353+| `LOG_LEVEL` | `info` | Log level: `debug`, `info`, `warn`, `error` |
5454+| `LOG_FORMAT` | `json` | Log format: `json` or `text` |
5555+| `ENABLE_ADMIN_ENDPOINTS` | `false` | Enable `/admin/*` endpoints |
5656+| `ADMIN_AUTH_TOKEN` | — | Bearer token for admin endpoints |
5757+5858+### Example `.env`
5959+6060+```bash
6161+# Tap (deployed on Railway)
6262+TAP_URL=wss://tap-instance.up.railway.app/channel
6363+TAP_AUTH_PASSWORD=your-tap-admin-password
6464+6565+# Turso
6666+TURSO_DATABASE_URL=libsql://twister-db.turso.io
6767+TURSO_AUTH_TOKEN=eyJhbGci...
6868+6969+# Collections
7070+INDEXED_COLLECTIONS=sh.tangled.repo,sh.tangled.repo.issue,sh.tangled.repo.pull,sh.tangled.string,sh.tangled.actor.profile,sh.tangled.repo.issue.comment,sh.tangled.repo.pull.comment,sh.tangled.repo.issue.state,sh.tangled.repo.pull.status,sh.tangled.feed.star
7171+7272+# Search
7373+SEARCH_DEFAULT_LIMIT=20
7474+SEARCH_MAX_LIMIT=100
7575+7676+# Embedding (Phase 2)
7777+# EMBEDDING_PROVIDER=openai
7878+# EMBEDDING_MODEL=text-embedding-3-small
7979+# EMBEDDING_API_KEY=sk-...
8080+# EMBEDDING_DIM=768
8181+8282+# Server
8383+HTTP_BIND_ADDR=:8080
8484+LOG_LEVEL=info
8585+ENABLE_ADMIN_ENDPOINTS=false
8686+```
8787+8888+## 2. Observability
8989+9090+### Structured Logging
9191+9292+Use Go's `slog` with JSON output. Every log entry includes:
9393+9494+| Field | Description |
9595+| --------- | ----------------------------------- |
9696+| `ts` | Timestamp (RFC 3339) |
9797+| `level` | Log level |
9898+| `service` | `api`, `indexer`, or `embed-worker` |
9999+| `msg` | Human-readable message |
100100+101101+#### Context Fields (where applicable)
102102+103103+| Field | When |
104104+| ------------- | ------------------------ |
105105+| `event_name` | Tap event processing |
106106+| `event_id` | Tap event ID |
107107+| `document_id` | Document operations |
108108+| `did` | Any DID-scoped operation |
109109+| `collection` | Record processing |
110110+| `rkey` | Record processing |
111111+| `cursor` | Cursor persistence |
112112+| `error_class` | Error handling |
113113+| `duration_ms` | Timed operations |
114114+115115+### Metrics
116116+117117+Recommended counters and gauges (via logs, Prometheus, or platform metrics):
118118+119119+#### Ingestion
120120+121121+| Metric | Type | Description |
122122+| ------------------------------ | --------- | ---------------------------------- |
123123+| `events_processed_total` | counter | Total Tap events processed |
124124+| `events_failed_total` | counter | Events that failed processing |
125125+| `normalization_failures_total` | counter | Normalization errors by collection |
126126+| `upsert_duration_ms` | histogram | DB upsert latency |
127127+| `cursor_position` | gauge | Current Tap cursor position |
128128+129129+#### Embedding
130130+131131+| Metric | Type | Description |
132132+| -------------------------- | --------- | ------------------------------ |
133133+| `embedding_queue_depth` | gauge | Pending embedding jobs |
134134+| `embedding_failures_total` | counter | Failed embedding attempts |
135135+| `embedding_duration_ms` | histogram | Per-document embedding latency |
136136+137137+#### Search
138138+139139+| Metric | Type | Description |
140140+| ----------------------- | --------- | -------------------------- |
141141+| `search_requests_total` | counter | Requests by mode |
142142+| `search_duration_ms` | histogram | Query latency by mode |
143143+| `search_results_count` | histogram | Results returned per query |
144144+145145+### Health Checks
146146+147147+#### API Process
148148+149149+| Endpoint | Check | Healthy |
150150+| -------------- | --------------------- | ------------------- |
151151+| `GET /healthz` | Process is responsive | Always (liveness) |
152152+| `GET /readyz` | DB connection works | `SELECT 1` succeeds |
153153+154154+#### Indexer Process
155155+156156+The indexer exposes its own health probe, separate from the API's HTTP routes:
157157+158158+- Tap WebSocket connected or reconnecting
159159+- Cursor advancing or intentionally idle
160160+- DB reachable
161161+162162+On Railway, this is a health check endpoint on a separate port (9090).
163163+164164+#### Embed Worker
165165+166166+- DB reachable
167167+- Embedding provider reachable (periodic test call)
168168+- Job queue not stalled (jobs processing within expected timeframe)
169169+170170+## 3. Security
171171+172172+### Secrets Management
173173+174174+Secrets are injected through platform secret management:
175175+176176+- **Railway:** Environment variables in the dashboard or `railway variables`
177177+178178+Secrets are never stored in code, config files, or Docker images.
179179+180180+Required secrets:
181181+182182+| Secret | Purpose |
183183+| ------------------- | --------------------------------- |
184184+| `TURSO_AUTH_TOKEN` | Turso database authentication |
185185+| `TAP_AUTH_PASSWORD` | Tap admin API authentication |
186186+| `EMBEDDING_API_KEY` | Embedding provider authentication |
187187+| `ADMIN_AUTH_TOKEN` | Admin endpoint authentication |
188188+189189+### Admin Endpoints
190190+191191+Admin endpoints (`/admin/reindex`, `/admin/reembed`) are:
192192+193193+- Disabled by default (`ENABLE_ADMIN_ENDPOINTS=false`)
194194+- When enabled, protected by bearer token (`ADMIN_AUTH_TOKEN`)
195195+- Alternatively, exposed only on internal networking (Railway private networking)
196196+197197+### Input Validation
198198+199199+The search API shall:
200200+201201+- Validate `limit` is between 1 and `SEARCH_MAX_LIMIT`
202202+- Validate `offset` is non-negative
203203+- Reject unknown or malformed filter parameters with 400
204204+- Sanitize query strings before passing to FTS (Tantivy query parser handles this, but validate basic structure)
205205+- Bound hybrid requests (limit concurrent vector searches)
206206+207207+### Tap Authentication
208208+209209+The indexer authenticates to Tap using HTTP Basic auth (`admin:<TAP_AUTH_PASSWORD>`). The WebSocket upgrade request includes the auth header.
210210+211211+### Data Privacy
212212+213213+- All indexed content is public ATProto data
214214+- No private or authenticated content is ingested
215215+- Deleted records are tombstoned (`deleted_at` set) and excluded from search results
216216+- Tombstoned documents are periodically purged (configurable retention)
217217+218218+## 4. Deployment
219219+220220+### Railway (Primary)
221221+222222+All Twister services deploy as separate Railway services within the same project. Tap is already deployed here.
223223+224224+#### Service Layout
225225+226226+| Service | Start Command | Health Check | Public |
227227+| ------------ | ---------------------- | ------------------ | ------ |
228228+| tap | (already deployed) | `GET /health` | no |
229229+| api | `twister api` | `GET /healthz` | yes |
230230+| indexer | `twister indexer` | `GET :9090/health` | no |
231231+| embed-worker | `twister embed-worker` | `GET :9091/health` | no |
232232+233233+All services share the same Docker image. Railway uses the start command to select the subcommand.
234234+235235+#### Environment Variables
236236+237237+Set per-service in the Railway dashboard or via `railway variables`:
238238+239239+```bash
240240+# Shared across services
241241+TURSO_DATABASE_URL=libsql://twister-db.turso.io
242242+TURSO_AUTH_TOKEN=eyJ...
243243+LOG_LEVEL=info
244244+LOG_FORMAT=json
245245+246246+# API service
247247+HTTP_BIND_ADDR=:8080
248248+SEARCH_DEFAULT_LIMIT=20
249249+SEARCH_MAX_LIMIT=100
250250+ENABLE_ADMIN_ENDPOINTS=false
251251+252252+# Indexer service
253253+TAP_URL=wss://${{tap.RAILWAY_PUBLIC_DOMAIN}}/channel # Railway service reference
254254+TAP_AUTH_PASSWORD=...
255255+INDEXED_COLLECTIONS=sh.tangled.repo,sh.tangled.repo.issue,sh.tangled.repo.pull,sh.tangled.string,sh.tangled.actor.profile
256256+257257+# Embed-worker (Phase 2)
258258+# EMBEDDING_PROVIDER=openai
259259+# EMBEDDING_MODEL=text-embedding-3-small
260260+# EMBEDDING_API_KEY=sk-...
261261+```
262262+263263+Railway supports referencing other services' variables with `${{service.VAR}}` syntax, which is useful for linking the indexer to Tap's domain.
264264+265265+#### Health Checks
266266+267267+Railway activates deployments based on health check responses. Configure per-service:
268268+269269+- **api:** HTTP health check on `/healthz` port 8080
270270+- **indexer:** HTTP health check on `/health` port 9090
271271+- **embed-worker:** HTTP health check on `/health` port 9091
272272+273273+#### Autodeploy
274274+275275+Connect the GitHub repository for automatic deployments on push. Railway builds from the Dockerfile and uses the start command configured per service.
276276+277277+#### Internal Networking
278278+279279+Railway services within the same project can communicate over private networking using `service.railway.internal` hostnames. The indexer connects to Tap via this internal network when both are in the same project.
280280+281281+### Dockerfile
282282+283283+```dockerfile
284284+FROM golang:1.24-alpine AS builder
285285+286286+WORKDIR /app
287287+288288+COPY go.mod go.sum ./
289289+RUN go mod download
290290+291291+COPY . .
292292+293293+RUN CGO_ENABLED=0 GOOS=linux go build \
294294+ -ldflags="-s -w" \
295295+ -o /app/twister \
296296+ ./main.go
297297+298298+FROM alpine:3.21
299299+300300+RUN apk add --no-cache ca-certificates tzdata
301301+302302+COPY --from=builder /app/twister /usr/local/bin/twister
303303+304304+EXPOSE 8080 9090 9091
305305+306306+CMD ["twister", "api"]
307307+```
308308+309309+Notes:
310310+311311+- `CGO_ENABLED=0` produces a static binary (works with the pure-Go `libsql-client-go`; not compatible with `go-libsql`, which requires CGo)
312312+- Railway overrides `CMD` with the start command configured per service
313313+- Multiple ports exposed: 8080 (API), 9090 (indexer health), 9091 (embed-worker health)
314314+315315+### Graceful Shutdown
316316+317317+All processes handle `SIGTERM` and `SIGINT`:
318318+319319+1. Stop accepting new requests/events
320320+2. Drain in-flight work (with timeout)
321321+3. Persist current cursor (indexer)
322322+4. Close DB connections
323323+5. Exit 0
324324+325325+Railway sends `SIGTERM` during deployments and restarts.
+137
packages/api/docs/specs/07-graph-backfill.md
···11+---
22+title: "Spec 07 — Graph Backfill"
33+updated: 2026-03-22
44+---
55+66+## 1. Purpose
77+88+Bootstrap the search index with existing Tangled content by discovering users from a seed set and triggering Tap backfill for their repositories. Without this, the index only captures new events after deployment.
99+1010+## 2. Seed Set
1111+1212+A manually curated list of known Tangled users (DIDs or handles), stored in a plain text file:
1313+1414+```text
1515+# Known active Tangled users
1616+did:plc:abc123
1717+did:plc:def456
1818+alice.tangled.sh
1919+bob.tangled.sh
2020+# Add more as discovered
2121+```
2222+2323+Format:
2424+- One entry per line
2525+- Lines starting with `#` are comments
2626+- Blank lines are ignored
2727+- Entries can be DIDs (`did:plc:...`) or handles (`alice.tangled.sh`)
2828+- Handles are resolved to DIDs before processing
2929+3030+## 3. Fan-Out Strategy
3131+3232+From each seed user, discover connected users to expand the crawl set:
3333+3434+### Discovery Sources
3535+3636+1. **Follows**: Fetch `sh.tangled.graph.follow` records for the user → extract `subject` DIDs
3737+2. **Collaborators**: For repos owned by the user, identify other users who have created issues, PRs, or comments → extract their DIDs
3838+3939+### Depth Limit
4040+4141+Fan-out is configurable with a max hops parameter (default: 2):
4242+4343+- **Hop 0**: Seed users themselves
4444+- **Hop 1**: Direct follows and collaborators of seed users
4545+- **Hop 2**: Follows and collaborators of hop-1 users
4646+4747+Higher hop counts discover more users but increase crawl time and may pull in loosely related accounts. Start with 2 hops and adjust based on the size of the Tangled network.
4848+4949+### Crawl Queue
5050+5151+Discovered DIDs are added to a queue, deduplicated by DID. Each entry tracks:
5252+- DID
5353+- Discovery hop (distance from seed)
5454+- Source (which seed/user led to discovery)
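The fan-out above reduces to a breadth-first walk with a visited set. In this sketch, `discover` stands in for the follows/collaborator lookups:

```go
package main

import "fmt"

// crawlEntry mirrors the queue fields above: DID, discovery hop, and source.
type crawlEntry struct {
	DID    string
	Hop    int
	Source string
}

// fanOut walks the graph breadth-first up to maxHops, deduplicating by DID.
// discover stands in for fetching a user's follows and collaborators.
func fanOut(seeds []string, maxHops int, discover func(did string) []string) []crawlEntry {
	visited := map[string]bool{}
	var out, frontier []crawlEntry
	for _, s := range seeds {
		frontier = append(frontier, crawlEntry{DID: s, Hop: 0, Source: s})
	}
	for hop := 0; hop <= maxHops; hop++ {
		var next []crawlEntry
		for _, e := range frontier {
			if visited[e.DID] {
				continue // user-level dedup
			}
			visited[e.DID] = true
			out = append(out, e)
			if hop < maxHops {
				for _, d := range discover(e.DID) {
					next = append(next, crawlEntry{DID: d, Hop: hop + 1, Source: e.DID})
				}
			}
		}
		frontier = next
	}
	return out
}

func main() {
	graph := map[string][]string{"did:a": {"did:b"}, "did:b": {"did:c"}}
	entries := fanOut([]string{"did:a"}, 2, func(d string) []string { return graph[d] })
	fmt.Println(len(entries)) // 3: seed plus two hops
}
```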
5555+5656+## 4. Backfill Mechanism
5757+5858+For each discovered user:
5959+6060+1. **Check if already tracked**: Query Tap's `/info/:did` endpoint — if the repo is already tracked and backfilled, skip
6161+2. **Register with Tap**: POST to `/repos/add` with the DID — Tap handles the actual repo export and event delivery
6262+3. **Tap backfill flow**: Tap fetches full repo history from PDS via `com.atproto.sync.getRepo`, then delivers historical events (`live: false`) through the normal WebSocket channel
6363+4. **Indexer processes normally**: The indexer's existing ingestion loop handles backfill events the same as live events — no special backfill code path needed
6464+6565+### Rate Limiting
6666+6767+- Batch `/repos/add` calls (e.g., 10 DIDs per request)
6868+- Add configurable delay between batches to avoid overwhelming Tap
6969+- Respect Tap's processing capacity — monitor `/stats/repo-count` to track progress
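The batching and delay policy can be sketched as follows; the `submit` callback stands in for the actual POST to Tap's `/repos/add` (whose request body shape is not assumed here):

```go
package main

import (
	"fmt"
	"time"
)

// submitBatches splits DIDs into fixed-size batches and calls submit for
// each, sleeping between batches so Tap is not overwhelmed.
func submitBatches(dids []string, size int, delay time.Duration, submit func([]string) error) error {
	for i := 0; i < len(dids); i += size {
		end := i + size
		if end > len(dids) {
			end = len(dids)
		}
		if err := submit(dids[i:end]); err != nil {
			return fmt.Errorf("batch %d: %w", i/size, err)
		}
		if end < len(dids) {
			time.Sleep(delay) // configurable --batch-delay
		}
	}
	return nil
}

func main() {
	var calls int
	_ = submitBatches([]string{"a", "b", "c"}, 2, 0, func(b []string) error {
		calls++
		return nil
	})
	fmt.Println(calls) // 2 batches for 3 DIDs at size 2
}
```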
7070+7171+## 5. Deduplication
7272+7373+- **User-level**: Maintain a visited set of DIDs during fan-out; skip already-seen DIDs
7474+- **Tap-level**: Tap's `/repos/add` is idempotent — adding an already-tracked DID is a no-op
7575+- **Record-level**: The indexer's upsert logic (keyed on `did|collection|rkey`) handles duplicate events naturally
7676+7777+## 6. CLI Interface
7878+7979+```bash
8080+# Basic backfill from seed file
8181+twister backfill --seeds seeds.txt
8282+8383+# Limit fan-out depth
8484+twister backfill --seeds seeds.txt --max-hops 1
8585+8686+# Preview discovered users without triggering backfill
8787+twister backfill --seeds seeds.txt --dry-run
8888+8989+# Control parallelism
9090+twister backfill --seeds seeds.txt --concurrency 5
9191+```
9292+9393+### Flags
9494+9595+| Flag | Default | Description |
9696+|------|---------|-------------|
9797+| `--seeds` | required | Path to seed file |
9898+| `--max-hops` | `2` | Max fan-out depth from seed users |
9999+| `--dry-run` | `false` | List discovered users without submitting to Tap |
100100+| `--concurrency` | `5` | Parallel discovery workers |
101101+| `--batch-size` | `10` | DIDs per `/repos/add` call |
102102+| `--batch-delay` | `1s` | Delay between batches |
103103+104104+### Output
105105+106106+Progress is logged to stdout:
107107+108108+```text
109109+[hop 0] Processing 5 seed users...
110110+[hop 0] did:plc:abc123 → 12 follows, 3 collaborators
111111+[hop 0] did:plc:def456 → 8 follows, 1 collaborator
112112+[hop 1] Processing 24 discovered users (18 new)...
113113+...
114114+[done] Discovered 142 unique users across 2 hops
115115+[done] Submitted 98 new DIDs to Tap (44 already tracked)
116116+```
117117+118118+## 7. Idempotency
119119+120120+The entire backfill process is safe to re-run:
121121+122122+- Seed file parsing is stateless
123123+- Fan-out discovery is deterministic for a given network state
124124+- Tap's `/repos/add` is idempotent
125125+- The indexer's upsert logic handles re-delivered events
126126+- No local state is persisted between runs (the crawl queue is in-memory)
127127+128128+## 8. Configuration
129129+130130+| Variable | Default | Description |
131131+|----------|---------|-------------|
132132+| `TAP_URL` | (existing) | Tap base URL for API calls |
133133+| `TAP_AUTH_PASSWORD` | (existing) | Tap admin auth |
134134+| `TURSO_DATABASE_URL` | (existing) | For checking existing records |
135135+| `TURSO_AUTH_TOKEN` | (existing) | DB auth |
136136+137137+No new environment variables are needed — backfill reuses existing Tap and DB configuration.
+21
packages/api/docs/specs/README.md
···11+---
22+title: "Twister — Technical Specification Index"
33+updated: 2026-03-22
44+---
55+66+# Twister Technical Specifications
77+88+Twister is a Go-based search service for [Tangled](https://tangled.org) content on AT Protocol.
99+It ingests records through [Tap](https://github.com/bluesky-social/indigo/tree/main/cmd/tap), denormalizes them into search documents, indexes them in [Turso/libSQL](https://docs.turso.tech), and exposes keyword, semantic, and hybrid search APIs.
1010+1111+## Specifications
1212+1313+| # | Document | Description |
1414+|---|----------|-------------|
1515+| 1 | [Architecture](01-architecture.md) | Purpose, goals, design principles, system context, tech choices |
1616+| 2 | [Tangled Lexicons](02-tangled-lexicons.md) | `sh.tangled.*` record schemas and fields |
1717+| 3 | [Data Model](03-data-model.md) | Database schema, search documents, sync state |
1818+| 4 | [Data Pipeline](04-data-pipeline.md) | Tap integration, normalization, failure handling |
1919+| 5 | [Search](05-search.md) | Search modes, API contract, scoring, filtering |
2020+| 6 | [Operations](06-operations.md) | Configuration, observability, security, deployment |
2121+| 7 | [Graph Backfill](07-graph-backfill.md) | Seed-based user discovery and content backfill |
+38
packages/api/docs/tasks/README.md
···11+---
22+title: "Twister — Task Index"
33+updated: 2026-03-22
44+---
55+66+# Twister Tasks
77+88+Assumes Go, Tap (deployed on Railway), Turso/libSQL, and Railway for deployment.
99+1010+## Delivery Strategy
1111+1212+Build in four phases:
1313+1414+1. **MVP** — ingestion, keyword search, deployment, operational tooling, graph backfill
1515+2. **Semantic Search** — embeddings, vector retrieval
1616+3. **Hybrid Search** — weighted merge of keyword + semantic
1717+4. **Quality Polish** — ranking refinement, advanced filters, analytics
1818+1919+Ship keyword search before embeddings. That gives a testable, inspectable baseline before introducing model behavior.
2020+2121+## Phases
2222+2323+| Phase | Title | Document | Status |
2424+| ----- | ----- | -------- | ------ |
2525+| 1 | MVP | [phase-1-mvp.md](phase-1-mvp.md) | In progress (M0–M2 complete) |
2626+| 2 | Semantic Search | [phase-2-semantic.md](phase-2-semantic.md) | Not started |
2727+| 3 | Hybrid Search | [phase-3-hybrid.md](phase-3-hybrid.md) | Not started |
2828+| 4 | Quality Polish | [phase-4-quality.md](phase-4-quality.md) | Not started |
2929+3030+## MVP Complete When
3131+3232+- Tap ingests tracked `sh.tangled.*` records
3333+- Documents normalize into a stable store
3434+- Keyword search works publicly
3535+- API and indexer are deployed on Railway
3636+- Restart does not lose sync position
3737+- Reindex exists for repair
3838+- Graph backfill populates initial content from seed users
+407
packages/api/docs/tasks/phase-1-mvp.md
···11+---
22+title: "Phase 1 — MVP"
33+updated: 2026-03-22
44+---
55+66+# Phase 1 — MVP
77+88+Get a searchable product online: ingestion, keyword search, deployment, and operational tooling.
99+1010+## MVP Complete When
1111+1212+- Tap ingests tracked `sh.tangled.*` records
1313+- Documents normalize into a stable store
1414+- Keyword search works publicly
1515+- API and indexer are deployed on Railway
1616+- Restart does not lose sync position
1717+- Reindex exists for repair
1818+- Graph backfill populates initial content from seed users
1919+2020+---
2121+2222+## M0 — Repository Bootstrap ✅
2323+2424+Executable layout, local tooling, and development conventions (completed 2026-03-22).
2525+2626+---
2727+2828+## M1 — Database Schema and Store Layer ✅
2929+3030+refs: [specs/03-data-model.md](../specs/03-data-model.md)
3131+3232+Implemented the Turso/libSQL schema and Go store package for document persistence.
3333+3434+---
3535+3636+## M2 — Normalization Layer ✅
3737+3838+refs: [specs/02-tangled-lexicons.md](../specs/02-tangled-lexicons.md), [specs/04-data-pipeline.md](../specs/04-data-pipeline.md)
3939+4040+Translate `sh.tangled.*` records into internal search documents.
4141+4242+---
4343+4444+## M3 — Tap Client and Ingestion Loop
4545+4646+refs: [specs/04-data-pipeline.md](../specs/04-data-pipeline.md), [specs/01-architecture.md](../specs/01-architecture.md)
4747+4848+### Goal
4949+5050+Connect the indexer to Tap (on Railway) and process live events into the store.
5151+5252+### Why Now
5353+5454+Tap is the point of truth for synchronized ATProto ingestion. It is already deployed on Railway.
5555+5656+### Deliverables
5757+5858+- Tap WebSocket client package (`internal/tapclient/`)
5959+- Event decode layer (record events + identity events)
6060+- Ingestion loop with retry/backoff
6161+- Cursor persistence coupled to successful DB commits
6262+- Identity event handler (DID → handle cache)
6363+6464+### Tasks
6565+6666+- [ ] Define Tap event DTOs matching the documented event shape:
6767+6868+ ```go
6969+ type TapEvent struct {
7070+ ID int64 `json:"id"`
7171+ Type string `json:"type"` // "record" or "identity"
7272+ Record *TapRecord `json:"record"`
7373+ Identity *TapIdentity `json:"identity"`
7474+ }
7575+ type TapRecord struct {
7676+ Live bool `json:"live"`
7777+ Rev string `json:"rev"`
7878+ DID string `json:"did"`
7979+ Collection string `json:"collection"`
8080+ RKey string `json:"rkey"`
8181+ Action string `json:"action"` // "create", "update", "delete"
8282+ CID string `json:"cid"`
8383+ Record json.RawMessage `json:"record"`
8484+ }
8585+ type TapIdentity struct {
8686+ DID string `json:"did"`
8787+ Handle string `json:"handle"`
8888+ IsActive bool `json:"isActive"`
8989+ Status string `json:"status"`
9090+ }
9191+ ```
9292+9393+- [ ] Implement WebSocket client:
9494+ - Connect to `TAP_URL` (e.g., `wss://tap.railway.internal/channel`)
9595+ - HTTP Basic auth with `admin:TAP_AUTH_PASSWORD`
9696+ - Auto-reconnect with exponential backoff
9797+ - Ack protocol: send event `id` back after successful processing
9898+- [ ] Implement ingestion loop:
9999+ 1. Receive event from WebSocket
100100+ 2. If `type == "identity"` → update handle cache, ack, continue
101101+ 3. If `type == "record"` → check collection allowlist
102102+ 4. Map `action` to operation (create/update → upsert, delete → tombstone)
103103+ 5. Decode `record.record` via adapter registry
104104+ 6. Normalize to `Document`
105105+ 7. Upsert to store
106106+ 8. Schedule embedding job if eligible (Phase 2)
107107+ 9. Persist cursor (event ID) after successful DB commit
108108+ 10. Ack the event
109109+- [ ] Implement collection allowlist from `INDEXED_COLLECTIONS` config
110110+- [ ] Handle state events (`sh.tangled.repo.issue.state`, `sh.tangled.repo.pull.status`) → update `record_state`
111111+- [ ] Handle normalization failures: log, skip, advance cursor
112112+- [ ] Handle DB failures: retry with backoff, do not advance cursor
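The allowlist check and action mapping (steps 3-4 above) can be isolated as a pure dispatch function; `recordEvent` here is a minimal stand-in for the full `TapRecord` DTO:

```go
package main

import "fmt"

// recordEvent mirrors only the fields dispatch needs from TapRecord.
type recordEvent struct {
	Collection string
	Action     string // "create", "update", "delete"
}

// dispatch maps an event to a store operation: skip unlisted collections,
// upsert on create/update, tombstone on delete.
func dispatch(ev recordEvent, allowed map[string]bool) string {
	if !allowed[ev.Collection] {
		return "skip"
	}
	switch ev.Action {
	case "create", "update":
		return "upsert"
	case "delete":
		return "tombstone"
	default:
		return "skip" // unknown action: log, ack, advance cursor
	}
}

func main() {
	allowed := map[string]bool{"sh.tangled.repo": true}
	fmt.Println(dispatch(recordEvent{"sh.tangled.repo", "create"}, allowed))    // upsert
	fmt.Println(dispatch(recordEvent{"sh.tangled.repo", "delete"}, allowed))    // tombstone
	fmt.Println(dispatch(recordEvent{"app.bsky.feed.post", "create"}, allowed)) // skip
}
```

Keeping the mapping pure makes the "unsupported collections are silently skipped" verification item a unit test rather than an integration test.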
113113+114114+### Verification
115115+116116+- [ ] Indexer connects to Tap via WebSocket in development
117117+- [ ] A newly created tracked record appears in `documents` table
118118+- [ ] An updated record changes the existing row (CID changes)
119119+- [ ] A delete event tombstones the row (`deleted_at` set)
120120+- [ ] Killing and restarting the indexer resumes from persisted cursor without duplication
121121+- [ ] Identity events update handle cache
122122+- [ ] Unsupported collections are silently skipped
123123+- [ ] Connection drops trigger automatic reconnection
124124+125125+### Exit Criteria
126126+127127+The system continuously ingests and persists `sh.tangled.*` records from Tap.
128128+129129+---
130130+131131+## M4 — Keyword Search API
132132+133133+refs: [specs/05-search.md](../specs/05-search.md)
134134+135135+### Goal
136136+137137+Expose a usable public search API backed by Turso's Tantivy-based FTS.
138138+139139+### Why Now
140140+141141+First real product milestone. Searchable Tangled content without waiting for embeddings.
142142+143143+### Deliverables
144144+145145+- HTTP server (chi or net/http)
146146+- `GET /healthz` — liveness
147147+- `GET /readyz` — readiness (DB connectivity)
148148+- `GET /search` — keyword search with configurable mode
149149+- `GET /search/keyword` — keyword-only search
150150+- `GET /documents/{id}` — document lookup
151151+- Search repository layer (FTS queries isolated from handlers)
152152+- Pagination, filtering, snippets
153153+154154+### Tasks
155155+156156+- [ ] Set up HTTP server with chi router
157157+- [ ] Implement `/healthz` (always 200) and `/readyz` (SELECT 1 against DB)
158158+- [ ] Implement search repository with FTS queries:
159159+160160+ ```sql
161161+ SELECT id, title, summary, repo_name, author_handle, collection, record_type,
162162+ created_at, updated_at,
163163+ fts_score(title, body, summary, repo_name, author_handle, tags_json, ?) AS score,
164164+ fts_highlight(body, '<mark>', '</mark>', ?) AS body_snippet
165165+ FROM documents
166166+ WHERE fts_match(title, body, summary, repo_name, author_handle, tags_json, ?)
167167+ AND deleted_at IS NULL
168168+ ORDER BY score DESC
169169+ LIMIT ? OFFSET ?;
170170+ ```
171171+172172+- [ ] Implement request validation:
173173+ - `q` required, non-empty
174174+ - `limit` 1–100, default 20
175175+ - `offset` >= 0, default 0
176176+ - Reject unknown parameters with 400
177177+- [ ] Implement filters (as WHERE clauses):
178178+ - `collection` → `d.collection = ?`
179179+ - `type` → `d.record_type = ?`
180180+ - `author` → `d.author_handle = ?` or `d.did = ?`
181181+ - `repo` → `d.repo_name = ?`
182182+- [ ] Implement `/documents/{id}` — full document response
183183+- [ ] Implement stable JSON response contract (see spec 05-search.md)
184184+- [ ] Exclude tombstoned documents (`deleted_at IS NOT NULL`) by default
185185+- [ ] Add request logging middleware (method, path, status, duration)
186186+- [ ] Add CORS headers if needed
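The validation rules can be sketched as a small helper over raw query-string values; `validateSearch` is an illustrative name, not an existing function:

```go
package main

import (
	"fmt"
	"strconv"
)

// validateSearch applies the rules above: q required and non-empty,
// limit 1-100 (default 20), offset >= 0 (default 0).
func validateSearch(q, limitStr, offsetStr string) (limit, offset int, err error) {
	if q == "" {
		return 0, 0, fmt.Errorf("q is required")
	}
	limit = 20
	if limitStr != "" {
		limit, err = strconv.Atoi(limitStr)
		if err != nil || limit < 1 || limit > 100 {
			return 0, 0, fmt.Errorf("limit must be between 1 and 100")
		}
	}
	if offsetStr != "" {
		offset, err = strconv.Atoi(offsetStr)
		if err != nil || offset < 0 {
			return 0, 0, fmt.Errorf("offset must be >= 0")
		}
	}
	return limit, offset, nil
}

func main() {
	l, o, err := validateSearch("tangled", "", "")
	fmt.Println(l, o, err) // 20 0 <nil>
}
```

A handler would translate any returned error into a 400 with the error-JSON contract from spec 05.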
187187+188188+### Verification
189189+190190+- [ ] Searching by exact repo name returns the expected repo first
191191+- [ ] Searching by title term returns expected documents
192192+- [ ] Searching by author handle returns relevant docs
193193+- [ ] Tombstoned documents do not appear
194194+- [ ] Malformed query parameters return 400 with error JSON
195195+- [ ] DB outage causes `/readyz` to fail (503)
196196+- [ ] Pagination works: `offset=0&limit=5` then `offset=5&limit=5` returns different results
197197+- [ ] Filter by collection returns only matching docs
198198+199199+### Exit Criteria
200200+201201+A user can search Tangled content reliably with keyword search.
202202+203203+---
204204+205205+## M5 — Railway Deployment
206206+207207+refs: [specs/06-operations.md](../specs/06-operations.md)
208208+209209+### Goal
210210+211211+Deploy the API and indexer as Railway services alongside Tap.
212212+213213+### Why Now
214214+215215+At this point, the product is useful enough to run continuously.
216216+217217+### Deliverables
218218+219219+- Finalized Dockerfile
220220+- Railway project with services: `api`, `indexer`
221221+- Health checks configured per service
222222+- Secrets/env vars set
223223+- Production startup commands documented
224224+225225+### Tasks
226226+227227+- [ ] Finalize Dockerfile (multi-stage, CGO_ENABLED=0, Alpine runtime)
228228+- [ ] Create Railway services:
229229+ - `api` — start command: `twister api`
230230+ - `indexer` — start command: `twister indexer`
231231+- [ ] Configure environment variables per service:
232232+ - Shared: `TURSO_DATABASE_URL`, `TURSO_AUTH_TOKEN`, `LOG_LEVEL`, `LOG_FORMAT`
233233+ - API: `HTTP_BIND_ADDR`, `SEARCH_DEFAULT_LIMIT`, `SEARCH_MAX_LIMIT`
234234+ - Indexer: `TAP_URL` (reference Tap service domain), `TAP_AUTH_PASSWORD`, `INDEXED_COLLECTIONS`
235235+- [ ] Configure health checks:
236236+ - API: HTTP check on `/healthz` port 8080
237237+ - Indexer: HTTP check on `/health` port 9090
238238+- [ ] Use Railway internal networking for indexer → Tap connection
239239+- [ ] Connect GitHub repo for autodeploy
240240+- [ ] Test graceful shutdown on redeploy (SIGTERM handling)
241241+- [ ] Document deploy steps
242242+243243+### Verification
244244+245245+- [ ] API service becomes healthy and routable (public URL)
246246+- [ ] Indexer service starts and stays healthy
247247+- [ ] A new Tangled record ingested post-deploy becomes searchable
248248+- [ ] A redeploy preserves API availability
249249+- [ ] A restart does not lose sync position (cursor persisted)
250250+- [ ] Health checks correctly report status
251251+252252+### Exit Criteria
253253+254254+The system runs as a deployed service with health-checked processes on Railway.
255255+256256+---
257257+258258+## M6 — Reindex and Repair
259259+260260+refs: [specs/05-search.md](../specs/05-search.md)
261261+262262+### Goal
263263+264264+Make the system recoverable and operable with repair tools.
265265+266266+### Why Now
267267+268268+Search systems are never perfect on first ingestion. Repair tools are needed before production.
269269+270270+### Deliverables
271271+272272+- `twister reindex` command with scoping options
273273+- Dry-run mode
274274+- Admin reindex endpoint (optional)
275275+- Progress logging and error summary
276276+277277+### Tasks
278278+279279+- [ ] Implement `reindex` subcommand with flags:
280280+ - `--collection` — reindex one collection
281281+ - `--did` — reindex one DID's documents
282282+ - `--document` — reindex one document by ID
283283+ - `--dry-run` — show intended work without writes
284284+ - No flags → reindex all
285285+- [ ] Implement reindex logic:
286286+ 1. Select documents matching scope
287287+ 2. For each document, re-run normalization from stored fields (or re-fetch if the source record is available)
288288+ 3. Update FTS-relevant fields
289289+ 4. Upsert back to store
290290+ 5. Log progress (N/total, errors)
291291+- [ ] Implement `POST /admin/reindex` endpoint (behind `ENABLE_ADMIN_ENDPOINTS` + `ADMIN_AUTH_TOKEN`)
292292+- [ ] Add error summary output on completion
293293+- [ ] Exit non-zero on unrecoverable failures
294294+295295+### Verification
296296+297297+- [ ] Reindexing one document updates its stored normalized text
298298+- [ ] Reindexing one collection repairs intentionally corrupted rows
299299+- [ ] Dry-run shows intended work without writes
300300+- [ ] Reindex command exits non-zero on failures
301301+- [ ] Admin endpoint triggers reindex when enabled
302302+303303+### Exit Criteria
304304+305305+Operators can repair bad indexes without rebuilding everything manually.
306306+307307+---
308308+309309+## M7 — Observability
310310+311311+refs: [specs/06-operations.md](../specs/06-operations.md)
312312+313313+### Goal
314314+315315+Make the system diagnosable in production.
316316+317317+### Deliverables
318318+319319+- Structured slog fields across all services
320320+- Error classification
321321+- Ingestion lag visibility
322322+- Periodic state logs
323323+- Operator documentation
324324+325325+### Tasks
326326+327327+- [ ] Standardize slog fields across all packages:
328328+ - `service`, `event_name`, `event_id`, `did`, `collection`, `rkey`, `document_id`, `cursor`, `error_class`, `duration_ms`
329329+- [ ] Add error classification (normalize_error, db_error, tap_error, embed_error)
330330+- [ ] Add periodic state logs in indexer:
331331+ - Current cursor position
332332+ - Events processed since last log
333333+ - Documents in store (count)
334334+- [ ] Add request logging in API (method, path, status, duration, query)
335335+- [ ] Add search latency logging per query mode
336336+- [ ] Write operator documentation:
337337+ - Restart procedure
338338+ - Reindex procedure
339339+ - Backfill notes
340340+ - Failure triage guide
341341+342342+### Verification
343343+344344+- [ ] A failed Tap decode surfaces enough context to debug (collection, DID, rkey, error class)
345345+- [ ] DB connectivity failures are visible in logs and readiness
346346+- [ ] Operator can follow the runbook to diagnose a broken indexer
347347+- [ ] Search latency is logged per request
348348+349349+### Exit Criteria
350350+351351+The system is maintainable without guesswork.
352352+353353+---
354354+355355+## M-New — Graph Backfill from Seed Users
356356+357357+refs: [specs/07-graph-backfill.md](../specs/07-graph-backfill.md)
358358+359359+### Goal
360360+361361+Bootstrap the search index with existing Tangled content by discovering and backfilling users from a seed set.
362362+363363+### Why Now
364364+365365+Before MVP launch, the index needs existing content. Live ingestion only captures new events — backfill populates historical data.
366366+367367+### Deliverables
368368+369369+- `twister backfill` CLI command
370370+- Seed file parser
371371+- Graph fan-out discovery (follows/collaborators)
372372+- Tap `/repos/add` integration for discovered users
373373+- Deduplication against already-indexed users
374374+- Progress logging
375375+376376+### Tasks
377377+378378+- [ ] Implement `backfill` subcommand with flags:
379379+ - `--seeds <file>` — path to seed file (one DID or handle per line)
380380+ - `--max-hops <n>` — depth limit for fan-out (default: 2)
381381+ - `--dry-run` — show discovered users without triggering backfill
382382+ - `--concurrency <n>` — parallel discovery workers (default: 5)
383383+- [ ] Implement seed file parser (supports DIDs and handles, comments with `#`)
384384+- [ ] Implement graph fan-out:
385385+ 1. For each seed user, resolve DID if handle provided
386386+ 2. Fetch `sh.tangled.graph.follow` records for the user
387387+ 3. Fetch collaborators from repos owned by the user
388388+ 4. Add discovered DIDs to the crawl queue
389389+ 5. Repeat up to `max-hops` depth
390390+- [ ] Integrate with Tap `/repos/add` to register discovered DIDs for tracking
391391+- [ ] Deduplicate: skip DIDs already tracked by Tap (check via `/info/:did`)
392392+- [ ] Log progress: seeds processed, users discovered per hop, DIDs submitted to Tap
393393+- [ ] Handle rate limiting and errors gracefully (retry with backoff)
394394+- [ ] Make idempotent: safe to re-run; Tap handles duplicate `/repos/add` calls
395395+396396+### Verification
397397+398398+- [ ] Running with a seed file of 3 known users discovers their followers
399399+- [ ] `--max-hops 1` limits discovery to direct connections only
400400+- [ ] `--dry-run` lists discovered DIDs without calling Tap
401401+- [ ] Already-tracked users are skipped
402402+- [ ] Re-running the same seed file produces no duplicate work
403403+- [ ] Tap begins backfilling records for newly added DIDs
404404+405405+### Exit Criteria
406406+407407+The index contains historical content from the seed user graph, not just new events.
+124
packages/api/docs/tasks/phase-2-semantic.md
···11+---
22+title: "Phase 2 — Semantic Search"
33+updated: 2026-03-22
44+---
55+66+# Phase 2 — Semantic Search
77+88+Add embedding generation and vector-based retrieval on top of the keyword baseline.
99+1010+---
1111+1212+## M8 — Embedding Pipeline
1313+1414+refs: [specs/03-data-model.md](../specs/03-data-model.md), [specs/05-search.md](../specs/05-search.md)
1515+1616+### Goal
1717+1818+Add asynchronous embedding generation without blocking ingestion.
1919+2020+### Why Now
2121+2222+Only after keyword search is stable should semantic complexity be added.
2323+2424+### Deliverables
2525+2626+- `embedding_jobs` table operational (schema from M1)
2727+- `embed-worker` subcommand
2828+- Embedding provider abstraction (OpenAI, Voyage, Ollama)
2929+- Retry and dead-letter behavior
3030+- `twister reembed` command
3131+3232+### Tasks
3333+3434+- [ ] Define embedding provider interface:
3535+3636+ ```go
3737+ type EmbeddingProvider interface {
3838+ Embed(ctx context.Context, texts []string) ([][]float32, error)
3939+ Model() string
4040+ Dimension() int
4141+ }
4242+ ```
4343+4444+- [ ] Implement OpenAI provider (or preferred provider)
4545+- [ ] Implement embedding input text composition (see spec 04-data-pipeline.md, section 5):
4646+ `title\nrepo_name\nauthor_handle\ntags\nsummary\nbody`
4747+- [ ] Add job enqueueing: on document upsert, insert `embedding_jobs` row with `status=pending`
4848+- [ ] Implement `embed-worker` loop:
4949+ 1. Poll for `pending` jobs (batch by `EMBEDDING_BATCH_SIZE`)
5050+ 2. Compose input text per document
5151+ 3. Call embedding provider
5252+ 4. Store vectors in `document_embeddings` with `vector32(?)`
5353+ 5. Mark job `completed`
5454+ 6. On failure: increment `attempts`, set `last_error`, backoff
5555+ 7. After max attempts: mark `dead`
5656+- [ ] Create DiskANN vector index: `CREATE INDEX idx_embeddings_vec ON document_embeddings(libsql_vector_idx(embedding, 'metric=cosine'))`
5757+- [ ] Implement `reembed` command (re-generate all embeddings, useful for model migration)
5858+- [ ] Skip deleted documents in embedding pipeline
5959+- [ ] Add health check endpoint for embed-worker (port 9091)
6060+6161+### Verification
6262+6363+- [ ] Creating a new searchable document enqueues an embedding job
6464+- [ ] Worker processes the job and stores a vector in `document_embeddings`
6565+- [ ] Failed embedding calls retry with bounded attempts
6666+- [ ] Keyword search still works when embed-worker is down
6767+- [ ] `reembed` regenerates embeddings for all eligible documents
6868+6969+### Exit Criteria
7070+7171+Embeddings are produced asynchronously and stored durably.
7272+7373+---
7474+7575+## M9 — Semantic Search
7676+7777+refs: [specs/05-search.md](../specs/05-search.md)
7878+7979+### Goal
8080+8181+Expose vector-based semantic retrieval.
8282+8383+### Why Now
8484+8585+Natural next step once embeddings exist. Turso/libSQL has native vector search with `vector_top_k`.
8686+8787+### Deliverables
8888+8989+- `GET /search/semantic` endpoint
9090+- Query-time embedding (convert query text → vector)
9191+- Vector similarity search via `vector_top_k`
9292+- Response parity with keyword search
9393+9494+### Tasks
9595+9696+- [ ] Implement query embedding: call embedding provider with user's query text
9797+- [ ] Implement semantic search repository:
9898+9999+ ```sql
100100+ SELECT d.id, d.title, d.summary, d.repo_name, d.author_handle,
101101+ d.collection, d.record_type, d.created_at, d.updated_at
102102+ FROM vector_top_k('idx_embeddings_vec', vector32(?), ?) AS v
103103+ JOIN document_embeddings e ON e.rowid = v.id
104104+ JOIN documents d ON d.id = e.document_id
105105+ WHERE d.deleted_at IS NULL;
106106+ ```
107107+108108+- [ ] Normalize distance to relevance score: `score = 1.0 - (distance / 2.0)`
109109+- [ ] Apply same filters as keyword search (collection, author, repo, type)
110110+- [ ] Add timeout and cost controls (limit vector search to reasonable K)
111111+- [ ] Wire `/search/semantic` handler
112112+- [ ] Return `matched_by: ["semantic"]` in results
113113+114114+### Verification
115115+116116+- [ ] Semantically similar queries retrieve expected documents even with little lexical overlap
117117+- [ ] Documents without embeddings are omitted from semantic results
118118+- [ ] Semantic search returns the same JSON schema as keyword search
119119+- [ ] Latency is acceptable under small test load
120120+- [ ] Filters work correctly with semantic results
121121+122122+### Exit Criteria
123123+124124+The API supports true semantic search over Tangled documents.
+53
packages/api/docs/tasks/phase-3-hybrid.md
···11+---
22+title: "Phase 3 — Hybrid Search"
33+updated: 2026-03-22
44+---
55+66+# Phase 3 — Hybrid Search
77+88+Merge lexical and semantic search into the default high-quality retrieval mode.
99+1010+---
1111+1212+## M10 — Hybrid Search
1313+1414+refs: [specs/05-search.md](../specs/05-search.md)
1515+1616+### Deliverables
1717+1818+- `GET /search/hybrid` endpoint
1919+- Weighted score blending (keyword 0.65 + semantic 0.35)
2020+- Score normalization
2121+- Result deduplication
2222+- `matched_by` metadata showing which modes contributed
2323+2424+### Tasks
2525+2626+- [ ] Implement hybrid search orchestrator:
2727+ 1. Fetch top N keyword results (N=50 or configurable)
2828+ 2. Fetch top N semantic results
2929+ 3. Normalize keyword scores (min-max within result set)
3030+ 4. Semantic scores already normalized (0–1)
3131+ 5. Merge on `document_id`
3232+ 6. For documents in both sets: `hybrid_score = 0.65 * keyword + 0.35 * semantic`
3333+ 7. For documents in one set: use available score (other = 0)
3434+ 8. Sort by hybrid_score descending
3535+ 9. Deduplicate
3636+ 10. Apply limit/offset
3737+- [ ] Populate `matched_by` field: `["keyword"]`, `["semantic"]`, or `["keyword", "semantic"]`
3838+- [ ] Make weights configurable via `HYBRID_KEYWORD_WEIGHT` / `HYBRID_SEMANTIC_WEIGHT`
3939+- [ ] Wire `/search/hybrid` handler
4040+- [ ] Make `/search?mode=hybrid` work
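The merge steps reduce to a pure function over pre-normalized score maps (names here are illustrative):

```go
package main

import (
	"fmt"
	"sort"
)

type scored struct {
	ID    string
	Score float64
}

// mergeHybrid blends normalized keyword and semantic scores; a document
// present in only one set contributes 0 for the missing mode, and merging
// on ID deduplicates documents found by both.
func mergeHybrid(keyword, semantic map[string]float64, wKw, wSem float64) []scored {
	ids := map[string]bool{}
	for id := range keyword {
		ids[id] = true
	}
	for id := range semantic {
		ids[id] = true
	}
	var out []scored
	for id := range ids {
		out = append(out, scored{id, wKw*keyword[id] + wSem*semantic[id]})
	}
	sort.Slice(out, func(i, j int) bool { return out[i].Score > out[j].Score })
	return out
}

func main() {
	kw := map[string]float64{"doc1": 1.0, "doc2": 0.4}
	sem := map[string]float64{"doc1": 0.8, "doc3": 0.9}
	for _, r := range mergeHybrid(kw, sem, 0.65, 0.35) {
		fmt.Printf("%s %.3f\n", r.ID, r.Score)
	}
	// doc1: 0.65*1.0 + 0.35*0.8 = 0.930; doc3: 0.315; doc2: 0.260
}
```

The `matched_by` field falls out of the same merge: a document's modes are whichever input maps contained its ID.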
4141+4242+### Verification
4343+4444+- [ ] Hybrid returns documents found by either source
4545+- [ ] Duplicates are merged correctly (no duplicate IDs in results)
4646+- [ ] Exact-match queries still favor lexical relevance
4747+- [ ] Exploratory natural-language queries improve over keyword-only results
4848+- [ ] Score ordering is stable across repeated runs on the same corpus
4949+- [ ] `matched_by` accurately reflects which modes produced each result
5050+5151+### Exit Criteria
5252+5353+Hybrid search becomes the preferred default search mode.
+49
packages/api/docs/tasks/phase-4-quality.md
···11+---
22+title: "Phase 4 — Ranking and Quality Polish"
33+updated: 2026-03-22
44+---
55+66+# Phase 4 — Ranking and Quality Polish
77+88+Improve search quality without changing the core architecture.
99+1010+---
1111+1212+## M11 — Ranking and Quality Polish
1313+1414+refs: [specs/05-search.md](../specs/05-search.md)
1515+1616+### Deliverables
1717+1818+- Boosted field weighting refinement
1919+- Recency boost
2020+- Collection-aware ranking
2121+- Better snippets/highlights
2222+- Issue/PR state filtering
2323+- Star count as ranking signal
2424+- Optional query analytics
2525+2626+### Tasks
2727+2828+- [ ] Tune FTS index weights based on real query results
2929+- [ ] Add small recency boost to ranking (e.g., decay function on `created_at`)
3030+- [ ] Add collection-aware ranking adjustments (repos ranked differently from comments)
3131+- [ ] Index `sh.tangled.repo.issue.comment` and `sh.tangled.repo.pull.comment` (P2 collections)
3232+- [ ] Aggregate `sh.tangled.feed.star` counts per repo and use as ranking signal
3333+- [ ] Implement `state` filter (open/closed/merged) using `record_state` table
3434+- [ ] Improve snippets: better truncation, multi-field highlights
3535+- [ ] Add curated relevance test fixtures (expected queries → expected top results)
3636+- [ ] Run `OPTIMIZE INDEX idx_documents_fts` as maintenance task
3737+- [ ] Optional: log queries for analytics (anonymized)
3838+3939+### Verification
4040+4141+- [ ] Exact repo lookups reliably rank the repo first
4242+- [ ] Recent, active content gets a modest boost without overwhelming exact-match relevance
4343+- [ ] Snippets show useful matched context
4444+- [ ] Ranking regression tests catch obvious degradations
4545+- [ ] State filter correctly excludes closed/merged items when requested
4646+4747+### Exit Criteria
4848+4949+Search quality is noticeably improved and more predictable.
···11+CREATE TABLE IF NOT EXISTS documents (
22+ id TEXT PRIMARY KEY,
33+ did TEXT NOT NULL,
44+ collection TEXT NOT NULL,
55+ rkey TEXT NOT NULL,
66+ at_uri TEXT NOT NULL,
77+ cid TEXT NOT NULL,
88+ record_type TEXT NOT NULL,
99+ title TEXT,
1010+ body TEXT,
1111+ summary TEXT,
1212+ repo_did TEXT,
1313+ repo_name TEXT,
1414+ author_handle TEXT,
1515+ tags_json TEXT,
1616+ language TEXT,
1717+ created_at TEXT,
1818+ updated_at TEXT,
1919+ indexed_at TEXT NOT NULL,
2020+ deleted_at TEXT
2121+);
2222+2323+CREATE INDEX IF NOT EXISTS idx_documents_did ON documents(did);
2424+2525+CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection);
2626+2727+CREATE INDEX IF NOT EXISTS idx_documents_record_type ON documents(record_type);
2828+2929+CREATE INDEX IF NOT EXISTS idx_documents_repo_did ON documents(repo_did);
3030+3131+CREATE INDEX IF NOT EXISTS idx_documents_created_at ON documents(created_at);
3232+3333+CREATE INDEX IF NOT EXISTS idx_documents_deleted_at ON documents(deleted_at);
3434+3535+CREATE INDEX IF NOT EXISTS idx_documents_fts ON documents USING fts (
3636+ title WITH tokenizer=default,
3737+ body WITH tokenizer=default,
3838+ summary WITH tokenizer=default,
3939+ repo_name WITH tokenizer=simple,
4040+ author_handle WITH tokenizer=raw,
4141+ tags_json WITH tokenizer=simple
4242+) WITH (weights='title=3.0,repo_name=2.5,author_handle=2.0,summary=1.5,tags_json=1.2,body=1.0');
4343+4444+CREATE TABLE IF NOT EXISTS sync_state (
4545+ consumer_name TEXT PRIMARY KEY,
4646+ cursor TEXT NOT NULL,
4747+ high_water_mark TEXT,
4848+ updated_at TEXT NOT NULL
4949+);
5050+5151+CREATE TABLE IF NOT EXISTS document_embeddings (
5252+ document_id TEXT PRIMARY KEY REFERENCES documents(id),
5353+ embedding F32_BLOB(768),
5454+ embedding_model TEXT NOT NULL,
5555+ embedded_at TEXT NOT NULL
5656+);
5757+5858+CREATE INDEX IF NOT EXISTS idx_embeddings_vec ON document_embeddings(
5959+ libsql_vector_idx(embedding, 'metric=cosine')
6060+);
6161+6262+CREATE TABLE IF NOT EXISTS embedding_jobs (
6363+ document_id TEXT PRIMARY KEY REFERENCES documents(id),
6464+ status TEXT NOT NULL,
6565+ attempts INTEGER NOT NULL DEFAULT 0,
6666+ last_error TEXT,
6767+ scheduled_at TEXT NOT NULL,
6868+ updated_at TEXT NOT NULL
6969+);
7070+7171+CREATE INDEX IF NOT EXISTS idx_embedding_jobs_status ON embedding_jobs(status);
7272+7373+CREATE TABLE IF NOT EXISTS record_state (
7474+ subject_uri TEXT PRIMARY KEY,
7575+ state TEXT NOT NULL,
7676+ updated_at TEXT NOT NULL
7777+);
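For reference, `F32_BLOB(768)` columns hold raw 32-bit floats, so a Go caller can bind an embedding as a little-endian byte blob. This helper is a sketch under that assumption (libSQL also accepts the `vector32('[...]')` text form; check your driver's docs for which it expects):

```go
package main

import (
	"encoding/binary"
	"fmt"
	"math"
)

// EncodeF32 packs a float32 slice into a raw little-endian blob,
// the layout assumed here for binding to an F32_BLOB column.
func EncodeF32(vec []float32) []byte {
	buf := make([]byte, 4*len(vec))
	for i, v := range vec {
		binary.LittleEndian.PutUint32(buf[4*i:], math.Float32bits(v))
	}
	return buf
}

// DecodeF32 reverses EncodeF32, for reading embeddings back out.
func DecodeF32(blob []byte) []float32 {
	out := make([]float32, len(blob)/4)
	for i := range out {
		out[i] = math.Float32frombits(binary.LittleEndian.Uint32(blob[4*i:]))
	}
	return out
}

func main() {
	v := []float32{0.1, -0.5, 2}
	blob := EncodeF32(v)
	fmt.Println(len(blob), DecodeF32(blob))
}
```

A 768-dimension embedding therefore occupies 3,072 bytes per row in `document_embeddings`, which is worth keeping in mind when sizing the Turso database.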