Compare changes

-4
.gitignore
```diff
 .env
 server.log
 test_*.py
-
-# zig build artifacts
-.zig-cache/
-zig-out/
```
-1
Cargo.lock
```diff
 "opentelemetry 0.26.0",
 "opentelemetry-instrumentation-actix-web",
 "opentelemetry-otlp 0.26.0",
-"regex",
 "reqwest",
 "serde",
 "serde_json",
```
-1
Cargo.toml
```diff
 opentelemetry = { version = "0.26", features = ["trace", "metrics"] }
 opentelemetry-instrumentation-actix-web = { version = "0.23", features = ["metrics"] }
 opentelemetry-otlp = { version = "0.26", features = ["trace", "http-proto", "reqwest-client", "reqwest-rustls"] }
-regex = "1.12"
```
+1 -1
README.md
```diff
 
 hybrid semantic + keyword search for the bufo zone
 
-**live at: [find-bufo.com](https://find-bufo.com/)**
+**live at: [find-bufo.fly.dev](https://find-bufo.fly.dev/)**
 
 ## overview
 
```
-32
bot/Dockerfile
```diff
-# build stage
-FROM debian:bookworm-slim AS builder
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    ca-certificates \
-    curl \
-    xz-utils \
-    && rm -rf /var/lib/apt/lists/*
-
-# install zig 0.15.2
-RUN curl -L https://ziglang.org/download/0.15.2/zig-x86_64-linux-0.15.2.tar.xz | tar -xJ -C /usr/local \
-    && ln -s /usr/local/zig-x86_64-linux-0.15.2/zig /usr/local/bin/zig
-
-WORKDIR /app
-COPY build.zig build.zig.zon ./
-COPY src ./src
-
-RUN zig build -Doptimize=ReleaseSafe
-
-# runtime stage
-FROM debian:bookworm-slim
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    ca-certificates \
-    && rm -rf /var/lib/apt/lists/* \
-    # prefer IPv4 over IPv6 for outbound connections (IPv6 times out in Fly.io)
-    && echo 'precedence ::ffff:0:0/96 100' >> /etc/gai.conf
-
-WORKDIR /app
-COPY --from=builder /app/zig-out/bin/bufo-bot .
-
-CMD ["./bufo-bot"]
```
-53
bot/README.md
````diff
-# bufo-bot
-
-bluesky bot that listens to the jetstream firehose and quote-posts matching bufo images.
-
-## how it works
-
-1. connects to bluesky jetstream (firehose)
-2. for each post, checks if text contains an exact phrase matching a bufo name
-3. if matched, quote-posts with the corresponding bufo image
-
-## matching logic
-
-- extracts phrase from bufo filename (e.g., `bufo-let-them-eat-cake` -> `let them eat cake`)
-- requires exact consecutive word match in post text
-- configurable minimum phrase length (default: 4 words)
-
-## configuration
-
-| env var | default | description |
-|---------|---------|-------------|
-| `BSKY_HANDLE` | required | bluesky handle (e.g., `find-bufo.com`) |
-| `BSKY_APP_PASSWORD` | required | app password from bsky settings |
-| `MIN_PHRASE_WORDS` | `4` | minimum words in phrase to match |
-| `POSTING_ENABLED` | `false` | must be `true` to actually post |
-| `COOLDOWN_MINUTES` | `120` | don't repost same bufo within this time |
-| `EXCLUDE_PATTERNS` | `...` | exclude bufos matching these patterns |
-| `JETSTREAM_ENDPOINT` | `jetstream2.us-east.bsky.network` | jetstream server |
-
-## local dev
-
-```bash
-# build
-zig build
-
-# run locally (dry run by default)
-./zig-out/bin/bufo-bot
-```
-
-## deploy
-
-```bash
-# set secrets (once)
-fly secrets set BSKY_HANDLE=find-bufo.com BSKY_APP_PASSWORD=xxxx -a bufo-bot
-
-# deploy
-fly deploy
-
-# enable posting
-fly secrets set POSTING_ENABLED=true -a bufo-bot
-
-# check logs
-fly logs -a bufo-bot
-```
````
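The matching rule described in the removed README is small enough to sketch. The following is illustrative only and is not code from the bot (the real implementation is `bot/src/matcher.zig`, removed further down in this diff); it just restates the README's own example of turning `bufo-let-them-eat-cake` into a 4-word phrase:

```zig
const std = @import("std");

// Illustrative restatement of the README's filename -> phrase rule:
// strip the "bufo-" prefix and the image extension, then split the slug on '-'.
test "bufo-let-them-eat-cake yields a 4-word phrase" {
    const name = "bufo-let-them-eat-cake.png";
    const slug = name["bufo-".len .. name.len - ".png".len]; // "let-them-eat-cake"

    var word_count: usize = 0;
    var iter = std.mem.splitScalar(u8, slug, '-');
    while (iter.next()) |word| {
        if (word.len > 0) word_count += 1;
    }

    // 4 words meets the default MIN_PHRASE_WORDS, so this bufo would be eligible
    try std.testing.expect(word_count == 4);
}
```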
-34
bot/build.zig
```diff
-const std = @import("std");
-
-pub fn build(b: *std.Build) void {
-    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
-
-    const websocket = b.dependency("websocket", .{
-        .target = target,
-        .optimize = optimize,
-    });
-
-    const exe = b.addExecutable(.{
-        .name = "bufo-bot",
-        .root_module = b.createModule(.{
-            .root_source_file = b.path("src/main.zig"),
-            .target = target,
-            .optimize = optimize,
-            .imports = &.{
-                .{ .name = "websocket", .module = websocket.module("websocket") },
-            },
-        }),
-    });
-
-    b.installArtifact(exe);
-
-    const run_cmd = b.addRunArtifact(exe);
-    run_cmd.step.dependOn(b.getInstallStep());
-    if (b.args) |args| {
-        run_cmd.addArgs(args);
-    }
-
-    const run_step = b.step("run", "Run the bot");
-    run_step.dependOn(&run_cmd.step);
-}
```
-17
bot/build.zig.zon
```diff
-.{
-    .name = .bufo_bot,
-    .version = "0.0.1",
-    .fingerprint = 0xe143490f82fa96db,
-    .minimum_zig_version = "0.15.0",
-    .dependencies = .{
-        .websocket = .{
-            .url = "https://github.com/karlseguin/websocket.zig/archive/refs/heads/master.tar.gz",
-            .hash = "websocket-0.1.0-ZPISdRNzAwAGszh62EpRtoQxu8wb1MSMVI6Ow0o2dmyJ",
-        },
-    },
-    .paths = .{
-        "build.zig",
-        "build.zig.zon",
-        "src",
-    },
-}
```
-31
bot/fly.toml
```diff
-app = "bufo-bot"
-primary_region = "ewr"
-
-[build]
-dockerfile = "Dockerfile"
-
-[env]
-JETSTREAM_ENDPOINT = "jetstream2.us-east.bsky.network"
-STATS_PORT = "8080"
-
-[http_service]
-internal_port = 8080
-force_https = true
-auto_stop_machines = "off"
-auto_start_machines = true
-min_machines_running = 1
-max_machines_running = 1 # IMPORTANT: only 1 instance - bot consumes jetstream firehose
-
-[[vm]]
-memory = "256mb"
-cpu_kind = "shared"
-cpus = 1
-
-[mounts]
-source = "bufo_data"
-destination = "/data"
-
-# secrets to set via: fly secrets set KEY=value -a bufo-bot
-# - BSKY_HANDLE (e.g., find-bufo.com)
-# - BSKY_APP_PASSWORD (app password from bsky settings)
-# - POSTING_ENABLED=true (to enable posting, default is false)
```
-25
bot/justfile
```diff
-# bot/justfile
-set shell := ["bash", "-eu", "-o", "pipefail", "-c"]
-
-default:
-    @just --list
-
-# build the bot
-build:
-    zig build
-
-# run the bot locally
-run:
-    zig build run
-
-# deploy to fly.io
-deploy:
-    fly deploy --wait-timeout 180
-
-# check logs
-logs:
-    fly logs -a bufo-bot
-
-# set secrets (run once)
-secrets HANDLE PASSWORD:
-    fly secrets set BSKY_HANDLE={{ HANDLE }} BSKY_APP_PASSWORD={{ PASSWORD }} -a bufo-bot
```
-525
bot/src/bsky.zig
··· 1 - const std = @import("std"); 2 - const mem = std.mem; 3 - const json = std.json; 4 - const http = std.http; 5 - const Allocator = mem.Allocator; 6 - const Io = std.Io; 7 - 8 - pub const BskyClient = struct { 9 - allocator: Allocator, 10 - handle: []const u8, 11 - app_password: []const u8, 12 - access_jwt: ?[]const u8 = null, 13 - did: ?[]const u8 = null, 14 - pds_host: ?[]const u8 = null, 15 - 16 - pub fn init(allocator: Allocator, handle: []const u8, app_password: []const u8) BskyClient { 17 - return .{ 18 - .allocator = allocator, 19 - .handle = handle, 20 - .app_password = app_password, 21 - }; 22 - } 23 - 24 - pub fn deinit(self: *BskyClient) void { 25 - if (self.access_jwt) |jwt| self.allocator.free(jwt); 26 - if (self.did) |did| self.allocator.free(did); 27 - if (self.pds_host) |host| self.allocator.free(host); 28 - } 29 - 30 - fn httpClient(self: *BskyClient) http.Client { 31 - return .{ .allocator = self.allocator }; 32 - } 33 - 34 - pub fn login(self: *BskyClient) !void { 35 - std.debug.print("logging in as {s}...\n", .{self.handle}); 36 - 37 - var client = self.httpClient(); 38 - defer client.deinit(); 39 - 40 - var body_buf: std.ArrayList(u8) = .{}; 41 - defer body_buf.deinit(self.allocator); 42 - try body_buf.print(self.allocator, "{{\"identifier\":\"{s}\",\"password\":\"{s}\"}}", .{ self.handle, self.app_password }); 43 - 44 - var aw: Io.Writer.Allocating = .init(self.allocator); 45 - defer aw.deinit(); 46 - 47 - const result = client.fetch(.{ 48 - .location = .{ .url = "https://bsky.social/xrpc/com.atproto.server.createSession" }, 49 - .method = .POST, 50 - .headers = .{ .content_type = .{ .override = "application/json" } }, 51 - .payload = body_buf.items, 52 - .response_writer = &aw.writer, 53 - }) catch |err| { 54 - std.debug.print("login request failed: {}\n", .{err}); 55 - return err; 56 - }; 57 - 58 - if (result.status != .ok) { 59 - std.debug.print("login failed with status: {}\n", .{result.status}); 60 - return error.LoginFailed; 61 - } 62 - 63 - const response = aw.toArrayList(); 64 - const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError; 65 - defer parsed.deinit(); 66 - 67 - const root = parsed.value.object; 68 - 69 - const jwt_val = root.get("accessJwt") orelse return error.NoJwt; 70 - if (jwt_val != .string) return error.NoJwt; 71 - 72 - const did_val = root.get("did") orelse return error.NoDid; 73 - if (did_val != .string) return error.NoDid; 74 - 75 - self.access_jwt = try self.allocator.dupe(u8, jwt_val.string); 76 - self.did = try self.allocator.dupe(u8, did_val.string); 77 - 78 - // fetch PDS host from PLC directory 79 - try self.fetchPdsHost(); 80 - 81 - std.debug.print("logged in as {s} (did: {s}, pds: {s})\n", .{ self.handle, self.did.?, self.pds_host.? 
}); 82 - } 83 - 84 - fn fetchPdsHost(self: *BskyClient) !void { 85 - var client = self.httpClient(); 86 - defer client.deinit(); 87 - 88 - var url_buf: [256]u8 = undefined; 89 - const url = std.fmt.bufPrint(&url_buf, "https://plc.directory/{s}", .{self.did.?}) catch return error.UrlTooLong; 90 - 91 - var aw: Io.Writer.Allocating = .init(self.allocator); 92 - defer aw.deinit(); 93 - 94 - const result = client.fetch(.{ 95 - .location = .{ .url = url }, 96 - .method = .GET, 97 - .response_writer = &aw.writer, 98 - }) catch |err| { 99 - std.debug.print("fetch PDS host failed: {}\n", .{err}); 100 - return err; 101 - }; 102 - 103 - if (result.status != .ok) { 104 - std.debug.print("fetch PDS host failed with status: {}\n", .{result.status}); 105 - return error.PlcLookupFailed; 106 - } 107 - 108 - const response = aw.toArrayList(); 109 - const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError; 110 - defer parsed.deinit(); 111 - 112 - // find the atproto_pds service endpoint 113 - const service = parsed.value.object.get("service") orelse return error.NoService; 114 - if (service != .array) return error.NoService; 115 - 116 - for (service.array.items) |svc| { 117 - if (svc != .object) continue; 118 - const id_val = svc.object.get("id") orelse continue; 119 - if (id_val != .string) continue; 120 - if (!mem.eql(u8, id_val.string, "#atproto_pds")) continue; 121 - 122 - const endpoint_val = svc.object.get("serviceEndpoint") orelse continue; 123 - if (endpoint_val != .string) continue; 124 - 125 - // extract host from URL like "https://phellinus.us-west.host.bsky.network" 126 - const endpoint = endpoint_val.string; 127 - const prefix = "https://"; 128 - if (mem.startsWith(u8, endpoint, prefix)) { 129 - self.pds_host = try self.allocator.dupe(u8, endpoint[prefix.len..]); 130 - return; 131 - } 132 - } 133 - 134 - return error.NoPdsService; 135 - } 136 - 137 - pub fn uploadBlob(self: *BskyClient, data: []const u8, content_type: []const u8) ![]const u8 { 138 - if (self.access_jwt == null) return error.NotLoggedIn; 139 - 140 - var client = self.httpClient(); 141 - defer client.deinit(); 142 - 143 - var auth_buf: [512]u8 = undefined; 144 - const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong; 145 - 146 - var aw: Io.Writer.Allocating = .init(self.allocator); 147 - defer aw.deinit(); 148 - 149 - const result = client.fetch(.{ 150 - .location = .{ .url = "https://bsky.social/xrpc/com.atproto.repo.uploadBlob" }, 151 - .method = .POST, 152 - .headers = .{ 153 - .content_type = .{ .override = content_type }, 154 - .authorization = .{ .override = auth_header }, 155 - }, 156 - .payload = data, 157 - .response_writer = &aw.writer, 158 - }) catch |err| { 159 - std.debug.print("upload blob failed: {}\n", .{err}); 160 - return err; 161 - }; 162 - 163 - if (result.status != .ok) { 164 - const err_response = aw.toArrayList(); 165 - std.debug.print("upload blob failed with status: {} - {s}\n", .{ result.status, err_response.items }); 166 - // check for expired token 167 - if (mem.indexOf(u8, err_response.items, "ExpiredToken") != null) { 168 - return error.ExpiredToken; 169 - } 170 - return error.UploadFailed; 171 - } 172 - 173 - const response = aw.toArrayList(); 174 - const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError; 175 - defer parsed.deinit(); 176 - 177 - const root = parsed.value.object; 178 - const blob = root.get("blob") orelse return 
error.NoBlobRef; 179 - if (blob != .object) return error.NoBlobRef; 180 - 181 - return json.Stringify.valueAlloc(self.allocator, blob, .{}) catch return error.SerializeError; 182 - } 183 - 184 - pub fn createQuotePost(self: *BskyClient, quote_uri: []const u8, quote_cid: []const u8, blob_json: []const u8, alt_text: []const u8) !void { 185 - if (self.access_jwt == null or self.did == null) return error.NotLoggedIn; 186 - 187 - var client = self.httpClient(); 188 - defer client.deinit(); 189 - 190 - var body_buf: std.ArrayList(u8) = .{}; 191 - defer body_buf.deinit(self.allocator); 192 - 193 - var ts_buf: [30]u8 = undefined; 194 - try body_buf.print(self.allocator, 195 - \\{{"repo":"{s}","collection":"app.bsky.feed.post","record":{{"$type":"app.bsky.feed.post","text":"","createdAt":"{s}","embed":{{"$type":"app.bsky.embed.recordWithMedia","record":{{"$type":"app.bsky.embed.record","record":{{"uri":"{s}","cid":"{s}"}}}},"media":{{"$type":"app.bsky.embed.images","images":[{{"image":{s},"alt":"{s}"}}]}}}}}}}} 196 - , .{ self.did.?, getIsoTimestamp(&ts_buf), quote_uri, quote_cid, blob_json, alt_text }); 197 - 198 - var auth_buf: [512]u8 = undefined; 199 - const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong; 200 - 201 - var aw: Io.Writer.Allocating = .init(self.allocator); 202 - defer aw.deinit(); 203 - 204 - const result = client.fetch(.{ 205 - .location = .{ .url = "https://bsky.social/xrpc/com.atproto.repo.createRecord" }, 206 - .method = .POST, 207 - .headers = .{ 208 - .content_type = .{ .override = "application/json" }, 209 - .authorization = .{ .override = auth_header }, 210 - }, 211 - .payload = body_buf.items, 212 - .response_writer = &aw.writer, 213 - }) catch |err| { 214 - std.debug.print("create post failed: {}\n", .{err}); 215 - return err; 216 - }; 217 - 218 - if (result.status != .ok) { 219 - const response = aw.toArrayList(); 220 - std.debug.print("create post failed with status: {} - {s}\n", .{ result.status, response.items }); 221 - return error.PostFailed; 222 - } 223 - 224 - std.debug.print("posted successfully!\n", .{}); 225 - } 226 - 227 - pub fn getPostCid(self: *BskyClient, uri: []const u8) ![]const u8 { 228 - if (self.access_jwt == null) return error.NotLoggedIn; 229 - 230 - var client = self.httpClient(); 231 - defer client.deinit(); 232 - 233 - var parts = mem.splitScalar(u8, uri[5..], '/'); 234 - const did = parts.next() orelse return error.InvalidUri; 235 - _ = parts.next(); 236 - const rkey = parts.next() orelse return error.InvalidUri; 237 - 238 - var url_buf: [512]u8 = undefined; 239 - const url = std.fmt.bufPrint(&url_buf, "https://bsky.social/xrpc/com.atproto.repo.getRecord?repo={s}&collection=app.bsky.feed.post&rkey={s}", .{ did, rkey }) catch return error.UrlTooLong; 240 - 241 - var auth_buf: [512]u8 = undefined; 242 - const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong; 243 - 244 - var aw: Io.Writer.Allocating = .init(self.allocator); 245 - defer aw.deinit(); 246 - 247 - const result = client.fetch(.{ 248 - .location = .{ .url = url }, 249 - .method = .GET, 250 - .headers = .{ .authorization = .{ .override = auth_header } }, 251 - .response_writer = &aw.writer, 252 - }) catch |err| { 253 - std.debug.print("get record failed: {}\n", .{err}); 254 - return err; 255 - }; 256 - 257 - if (result.status != .ok) { 258 - return error.GetRecordFailed; 259 - } 260 - 261 - const response = aw.toArrayList(); 262 - const parsed = 
json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError; 263 - defer parsed.deinit(); 264 - 265 - const cid_val = parsed.value.object.get("cid") orelse return error.NoCid; 266 - if (cid_val != .string) return error.NoCid; 267 - 268 - return try self.allocator.dupe(u8, cid_val.string); 269 - } 270 - 271 - pub fn fetchImage(self: *BskyClient, url: []const u8) ![]const u8 { 272 - var client = self.httpClient(); 273 - defer client.deinit(); 274 - 275 - var aw: Io.Writer.Allocating = .init(self.allocator); 276 - errdefer aw.deinit(); 277 - 278 - const result = client.fetch(.{ 279 - .location = .{ .url = url }, 280 - .method = .GET, 281 - .response_writer = &aw.writer, 282 - }) catch |err| { 283 - std.debug.print("fetch image failed: {}\n", .{err}); 284 - return err; 285 - }; 286 - 287 - if (result.status != .ok) { 288 - aw.deinit(); 289 - return error.FetchFailed; 290 - } 291 - 292 - return try aw.toOwnedSlice(); 293 - } 294 - 295 - pub fn getServiceAuth(self: *BskyClient) ![]const u8 { 296 - if (self.access_jwt == null or self.did == null or self.pds_host == null) return error.NotLoggedIn; 297 - 298 - var client = self.httpClient(); 299 - defer client.deinit(); 300 - 301 - var url_buf: [512]u8 = undefined; 302 - const url = std.fmt.bufPrint(&url_buf, "https://bsky.social/xrpc/com.atproto.server.getServiceAuth?aud=did:web:{s}&lxm=com.atproto.repo.uploadBlob", .{self.pds_host.?}) catch return error.UrlTooLong; 303 - 304 - var auth_buf: [512]u8 = undefined; 305 - const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong; 306 - 307 - var aw: Io.Writer.Allocating = .init(self.allocator); 308 - defer aw.deinit(); 309 - 310 - const result = client.fetch(.{ 311 - .location = .{ .url = url }, 312 - .method = .GET, 313 - .headers = .{ .authorization = .{ .override = auth_header } }, 314 - .response_writer = &aw.writer, 315 - }) catch |err| { 316 - std.debug.print("get service auth failed: {}\n", .{err}); 317 - return err; 318 - }; 319 - 320 - if (result.status != .ok) { 321 - const err_response = aw.toArrayList(); 322 - std.debug.print("get service auth failed with status: {} - {s}\n", .{ result.status, err_response.items }); 323 - // check for expired token 324 - if (mem.indexOf(u8, err_response.items, "ExpiredToken") != null) { 325 - return error.ExpiredToken; 326 - } 327 - return error.ServiceAuthFailed; 328 - } 329 - 330 - const response = aw.toArrayList(); 331 - const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError; 332 - defer parsed.deinit(); 333 - 334 - const token_val = parsed.value.object.get("token") orelse return error.NoToken; 335 - if (token_val != .string) return error.NoToken; 336 - 337 - return try self.allocator.dupe(u8, token_val.string); 338 - } 339 - 340 - pub fn uploadVideo(self: *BskyClient, data: []const u8, filename: []const u8) ![]const u8 { 341 - if (self.did == null) return error.NotLoggedIn; 342 - 343 - // get service auth token 344 - const service_token = try self.getServiceAuth(); 345 - defer self.allocator.free(service_token); 346 - 347 - var client = self.httpClient(); 348 - defer client.deinit(); 349 - 350 - var url_buf: [512]u8 = undefined; 351 - const url = std.fmt.bufPrint(&url_buf, "https://video.bsky.app/xrpc/app.bsky.video.uploadVideo?did={s}&name={s}", .{ self.did.?, filename }) catch return error.UrlTooLong; 352 - 353 - var auth_buf: [512]u8 = undefined; 354 - const auth_header = 
std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{service_token}) catch return error.AuthTooLong; 355 - 356 - var aw: Io.Writer.Allocating = .init(self.allocator); 357 - defer aw.deinit(); 358 - 359 - const result = client.fetch(.{ 360 - .location = .{ .url = url }, 361 - .method = .POST, 362 - .headers = .{ 363 - .content_type = .{ .override = "image/gif" }, 364 - .authorization = .{ .override = auth_header }, 365 - }, 366 - .payload = data, 367 - .response_writer = &aw.writer, 368 - }) catch |err| { 369 - std.debug.print("upload video failed: {}\n", .{err}); 370 - return err; 371 - }; 372 - 373 - const response = aw.toArrayList(); 374 - 375 - // handle both .ok and .conflict (already_exists) as success 376 - if (result.status != .ok and result.status != .conflict) { 377 - std.debug.print("upload video failed with status: {}\n", .{result.status}); 378 - return error.VideoUploadFailed; 379 - } 380 - 381 - const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError; 382 - defer parsed.deinit(); 383 - 384 - // for conflict responses, jobId is at root level; for ok responses, it's in jobStatus 385 - var job_id_val: ?json.Value = null; 386 - if (parsed.value.object.get("jobStatus")) |job_status| { 387 - if (job_status == .object) { 388 - job_id_val = job_status.object.get("jobId"); 389 - } 390 - } 391 - // fallback to root level jobId (conflict case) 392 - if (job_id_val == null) { 393 - job_id_val = parsed.value.object.get("jobId"); 394 - } 395 - 396 - const job_id = job_id_val orelse { 397 - std.debug.print("no jobId in response\n", .{}); 398 - return error.NoJobId; 399 - }; 400 - if (job_id != .string) return error.NoJobId; 401 - 402 - return try self.allocator.dupe(u8, job_id.string); 403 - } 404 - 405 - pub fn waitForVideo(self: *BskyClient, job_id: []const u8) ![]const u8 { 406 - const service_token = try self.getServiceAuth(); 407 - defer self.allocator.free(service_token); 408 - 409 - var url_buf: [512]u8 = undefined; 410 - const url = std.fmt.bufPrint(&url_buf, "https://video.bsky.app/xrpc/app.bsky.video.getJobStatus?jobId={s}", .{job_id}) catch return error.UrlTooLong; 411 - 412 - var auth_buf: [512]u8 = undefined; 413 - const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{service_token}) catch return error.AuthTooLong; 414 - 415 - var attempts: u32 = 0; 416 - while (attempts < 60) : (attempts += 1) { 417 - var client = self.httpClient(); 418 - defer client.deinit(); 419 - 420 - var aw: Io.Writer.Allocating = .init(self.allocator); 421 - defer aw.deinit(); 422 - 423 - const result = client.fetch(.{ 424 - .location = .{ .url = url }, 425 - .method = .GET, 426 - .headers = .{ .authorization = .{ .override = auth_header } }, 427 - .response_writer = &aw.writer, 428 - }) catch |err| { 429 - std.debug.print("get job status failed: {}\n", .{err}); 430 - return err; 431 - }; 432 - 433 - if (result.status != .ok) { 434 - std.debug.print("get job status failed with status: {}\n", .{result.status}); 435 - return error.JobStatusFailed; 436 - } 437 - 438 - const response = aw.toArrayList(); 439 - const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError; 440 - defer parsed.deinit(); 441 - 442 - const job_status = parsed.value.object.get("jobStatus") orelse return error.NoJobStatus; 443 - if (job_status != .object) return error.NoJobStatus; 444 - 445 - const state_val = job_status.object.get("state") orelse continue; 446 - if (state_val != .string) continue; 447 - 448 - if 
(mem.eql(u8, state_val.string, "JOB_STATE_COMPLETED")) { 449 - const blob = job_status.object.get("blob") orelse return error.NoBlobRef; 450 - if (blob != .object) return error.NoBlobRef; 451 - return json.Stringify.valueAlloc(self.allocator, blob, .{}) catch return error.SerializeError; 452 - } else if (mem.eql(u8, state_val.string, "JOB_STATE_FAILED")) { 453 - std.debug.print("video processing failed\n", .{}); 454 - return error.VideoProcessingFailed; 455 - } 456 - 457 - std.Thread.sleep(1 * std.time.ns_per_s); 458 - } 459 - 460 - return error.VideoTimeout; 461 - } 462 - 463 - pub fn createVideoQuotePost(self: *BskyClient, quote_uri: []const u8, quote_cid: []const u8, blob_json: []const u8, alt_text: []const u8) !void { 464 - if (self.access_jwt == null or self.did == null) return error.NotLoggedIn; 465 - 466 - var client = self.httpClient(); 467 - defer client.deinit(); 468 - 469 - var body_buf: std.ArrayList(u8) = .{}; 470 - defer body_buf.deinit(self.allocator); 471 - 472 - var ts_buf: [30]u8 = undefined; 473 - try body_buf.print(self.allocator, 474 - \\{{"repo":"{s}","collection":"app.bsky.feed.post","record":{{"$type":"app.bsky.feed.post","text":"","createdAt":"{s}","embed":{{"$type":"app.bsky.embed.recordWithMedia","record":{{"$type":"app.bsky.embed.record","record":{{"uri":"{s}","cid":"{s}"}}}},"media":{{"$type":"app.bsky.embed.video","video":{s},"alt":"{s}"}}}}}}}} 475 - , .{ self.did.?, getIsoTimestamp(&ts_buf), quote_uri, quote_cid, blob_json, alt_text }); 476 - 477 - var auth_buf: [512]u8 = undefined; 478 - const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong; 479 - 480 - var aw: Io.Writer.Allocating = .init(self.allocator); 481 - defer aw.deinit(); 482 - 483 - const result = client.fetch(.{ 484 - .location = .{ .url = "https://bsky.social/xrpc/com.atproto.repo.createRecord" }, 485 - .method = .POST, 486 - .headers = .{ 487 - .content_type = .{ .override = "application/json" }, 488 - .authorization = .{ .override = auth_header }, 489 - }, 490 - .payload = body_buf.items, 491 - .response_writer = &aw.writer, 492 - }) catch |err| { 493 - std.debug.print("create video post failed: {}\n", .{err}); 494 - return err; 495 - }; 496 - 497 - if (result.status != .ok) { 498 - const response = aw.toArrayList(); 499 - std.debug.print("create video post failed with status: {} - {s}\n", .{ result.status, response.items }); 500 - return error.PostFailed; 501 - } 502 - 503 - std.debug.print("posted video successfully!\n", .{}); 504 - } 505 - }; 506 - 507 - fn getIsoTimestamp(buf: *[30]u8) []const u8 { 508 - const ts = std.time.timestamp(); 509 - const epoch_secs: u64 = @intCast(ts); 510 - const epoch = std.time.epoch.EpochSeconds{ .secs = epoch_secs }; 511 - const day = epoch.getEpochDay(); 512 - const year_day = day.calculateYearDay(); 513 - const month_day = year_day.calculateMonthDay(); 514 - const day_secs = epoch.getDaySeconds(); 515 - 516 - const len = std.fmt.bufPrint(buf, "{d:0>4}-{d:0>2}-{d:0>2}T{d:0>2}:{d:0>2}:{d:0>2}.000Z", .{ 517 - year_day.year, 518 - month_day.month.numeric(), 519 - month_day.day_index + 1, 520 - day_secs.getHoursIntoDay(), 521 - day_secs.getMinutesIntoHour(), 522 - day_secs.getSecondsIntoMinute(), 523 - }) catch return "2025-01-01T00:00:00.000Z"; 524 - return buf[0..len.len]; 525 - }
-47
bot/src/config.zig
```diff
-const std = @import("std");
-const posix = std.posix;
-
-pub const Config = struct {
-    bsky_handle: []const u8,
-    bsky_app_password: []const u8,
-    jetstream_endpoint: []const u8,
-    min_phrase_words: u32,
-    posting_enabled: bool,
-    cooldown_minutes: u32,
-    exclude_patterns: []const u8,
-    stats_port: u16,
-
-    pub fn fromEnv() Config {
-        return .{
-            .bsky_handle = posix.getenv("BSKY_HANDLE") orelse "find-bufo.com",
-            .bsky_app_password = posix.getenv("BSKY_APP_PASSWORD") orelse "",
-            .jetstream_endpoint = posix.getenv("JETSTREAM_ENDPOINT") orelse "jetstream2.us-east.bsky.network",
-            .min_phrase_words = parseU32(posix.getenv("MIN_PHRASE_WORDS"), 4),
-            .posting_enabled = parseBool(posix.getenv("POSTING_ENABLED")),
-            .cooldown_minutes = parseU32(posix.getenv("COOLDOWN_MINUTES"), 120),
-            .exclude_patterns = posix.getenv("EXCLUDE_PATTERNS") orelse "what-have-you-done,what-have-i-done,sad,crying,cant-take",
-            .stats_port = parseU16(posix.getenv("STATS_PORT"), 8080),
-        };
-    }
-};
-
-fn parseU16(str: ?[]const u8, default: u16) u16 {
-    if (str) |s| {
-        return std.fmt.parseInt(u16, s, 10) catch default;
-    }
-    return default;
-}
-
-fn parseU32(str: ?[]const u8, default: u32) u32 {
-    if (str) |s| {
-        return std.fmt.parseInt(u32, s, 10) catch default;
-    }
-    return default;
-}
-
-fn parseBool(str: ?[]const u8) bool {
-    if (str) |s| {
-        return std.mem.eql(u8, s, "true") or std.mem.eql(u8, s, "1");
-    }
-    return false;
-}
```
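One behavior of `fromEnv` worth noting: boolean flags count as enabled only for the exact strings `true` or `1`; any other value (`TRUE`, `yes`, or an unset variable) leaves posting disabled. A standalone restatement of that rule, for illustration only (not the module itself):

```zig
const std = @import("std");

// Mirrors the parseBool convention used by config.zig; illustrative only.
fn envFlagEnabled(value: ?[]const u8) bool {
    const s = value orelse return false;
    return std.mem.eql(u8, s, "true") or std.mem.eql(u8, s, "1");
}

test "POSTING_ENABLED accepts only \"true\" or \"1\"" {
    try std.testing.expect(envFlagEnabled("true"));
    try std.testing.expect(envFlagEnabled("1"));
    try std.testing.expect(!envFlagEnabled("TRUE"));
    try std.testing.expect(!envFlagEnabled("yes"));
    try std.testing.expect(!envFlagEnabled(null));
}
```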
-143
bot/src/jetstream.zig
```diff
-const std = @import("std");
-const mem = std.mem;
-const json = std.json;
-const posix = std.posix;
-const Allocator = mem.Allocator;
-const websocket = @import("websocket");
-
-pub const Post = struct {
-    uri: []const u8,
-    text: []const u8,
-    did: []const u8,
-    rkey: []const u8,
-};
-
-pub const JetstreamClient = struct {
-    allocator: Allocator,
-    host: []const u8,
-    callback: *const fn (Post) void,
-
-    pub fn init(allocator: Allocator, host: []const u8, callback: *const fn (Post) void) JetstreamClient {
-        return .{
-            .allocator = allocator,
-            .host = host,
-            .callback = callback,
-        };
-    }
-
-    pub fn run(self: *JetstreamClient) void {
-        // exponential backoff: 1s -> 2s -> 4s -> ... -> 60s cap
-        var backoff: u64 = 1;
-        const max_backoff: u64 = 60;
-
-        while (true) {
-            self.connect() catch |err| {
-                std.debug.print("jetstream error: {}, reconnecting in {}s...\n", .{ err, backoff });
-            };
-            posix.nanosleep(backoff, 0);
-            backoff = @min(backoff * 2, max_backoff);
-        }
-    }
-
-    fn connect(self: *JetstreamClient) !void {
-        const path = "/subscribe?wantedCollections=app.bsky.feed.post";
-
-        std.debug.print("connecting to wss://{s}{s}\n", .{ self.host, path });
-
-        var client = websocket.Client.init(self.allocator, .{
-            .host = self.host,
-            .port = 443,
-            .tls = true,
-            .max_size = 1024 * 1024, // 1MB - some jetstream messages are large
-        }) catch |err| {
-            std.debug.print("websocket client init failed: {}\n", .{err});
-            return err;
-        };
-        defer client.deinit();
-
-        var host_header_buf: [256]u8 = undefined;
-        const host_header = std.fmt.bufPrint(&host_header_buf, "Host: {s}\r\n", .{self.host}) catch self.host;
-
-        client.handshake(path, .{ .headers = host_header }) catch |err| {
-            std.debug.print("websocket handshake failed: {}\n", .{err});
-            return err;
-        };
-
-        std.debug.print("jetstream connected!\n", .{});
-
-        var handler = Handler{ .allocator = self.allocator, .callback = self.callback };
-        client.readLoop(&handler) catch |err| {
-            std.debug.print("websocket read loop error: {}\n", .{err});
-            return err;
-        };
-    }
-};
-
-const Handler = struct {
-    allocator: Allocator,
-    callback: *const fn (Post) void,
-
-    pub fn serverMessage(self: *Handler, data: []const u8) !void {
-        self.processMessage(data) catch |err| {
-            if (err != error.NotAPost) {
-                std.debug.print("message processing error: {}\n", .{err});
-            }
-        };
-    }
-
-    pub fn close(_: *Handler) void {
-        std.debug.print("jetstream connection closed\n", .{});
-    }
-
-    fn processMessage(self: *Handler, payload: []const u8) !void {
-        // jetstream format:
-        // { "did": "...", "kind": "commit", "commit": { "collection": "app.bsky.feed.post", "rkey": "...", "record": { "text": "...", ... } } }
-        const parsed = json.parseFromSlice(json.Value, self.allocator, payload, .{}) catch return error.ParseError;
-        defer parsed.deinit();
-
-        const root = parsed.value.object;
-
-        // check kind
-        const kind = root.get("kind") orelse return error.NotAPost;
-        if (kind != .string or !mem.eql(u8, kind.string, "commit")) return error.NotAPost;
-
-        // get did
-        const did_val = root.get("did") orelse return error.NotAPost;
-        if (did_val != .string) return error.NotAPost;
-
-        // get commit
-        const commit = root.get("commit") orelse return error.NotAPost;
-        if (commit != .object) return error.NotAPost;
-
-        // check collection
-        const collection = commit.object.get("collection") orelse return error.NotAPost;
-        if (collection != .string or !mem.eql(u8, collection.string, "app.bsky.feed.post")) return error.NotAPost;
-
-        // check operation (create only)
-        const operation = commit.object.get("operation") orelse return error.NotAPost;
-        if (operation != .string or !mem.eql(u8, operation.string, "create")) return error.NotAPost;
-
-        // get rkey
-        const rkey_val = commit.object.get("rkey") orelse return error.NotAPost;
-        if (rkey_val != .string) return error.NotAPost;
-
-        // get record
-        const record = commit.object.get("record") orelse return error.NotAPost;
-        if (record != .object) return error.NotAPost;
-
-        // get text
-        const text_val = record.object.get("text") orelse return error.NotAPost;
-        if (text_val != .string) return error.NotAPost;
-
-        // construct uri
-        var uri_buf: [256]u8 = undefined;
-        const uri = std.fmt.bufPrint(&uri_buf, "at://{s}/app.bsky.feed.post/{s}", .{ did_val.string, rkey_val.string }) catch return error.UriTooLong;
-
-        self.callback(.{
-            .uri = uri,
-            .text = text_val.string,
-            .did = did_val.string,
-            .rkey = rkey_val.string,
-        });
-    }
-};
```
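The comment in `processMessage` documents the jetstream commit shape the handler expects. A minimal sketch that exercises the same `std.json` calls against a hand-written sample event (the DID, rkey, and text below are made up for illustration):

```zig
const std = @import("std");

test "pull the post text out of a jetstream commit event" {
    // hand-written sample in the shape processMessage expects; values are illustrative
    const payload =
        \\{"did":"did:plc:example","kind":"commit",
        \\ "commit":{"collection":"app.bsky.feed.post","operation":"create",
        \\ "rkey":"3abc","record":{"text":"let them eat cake"}}}
    ;

    const parsed = try std.json.parseFromSlice(std.json.Value, std.testing.allocator, payload, .{});
    defer parsed.deinit();

    const commit = parsed.value.object.get("commit").?.object;
    const record = commit.get("record").?.object;
    try std.testing.expectEqualStrings("let them eat cake", record.get("text").?.string);
}
```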
-243
bot/src/main.zig
··· 1 - const std = @import("std"); 2 - const mem = std.mem; 3 - const json = std.json; 4 - const http = std.http; 5 - const Thread = std.Thread; 6 - const Allocator = mem.Allocator; 7 - const config = @import("config.zig"); 8 - const matcher = @import("matcher.zig"); 9 - const jetstream = @import("jetstream.zig"); 10 - const bsky = @import("bsky.zig"); 11 - const stats = @import("stats.zig"); 12 - 13 - var global_state: ?*BotState = null; 14 - 15 - const BotState = struct { 16 - allocator: Allocator, 17 - config: config.Config, 18 - matcher: matcher.Matcher, 19 - bsky_client: bsky.BskyClient, 20 - recent_bufos: std.StringHashMap(i64), // name -> timestamp 21 - mutex: Thread.Mutex = .{}, 22 - stats: stats.Stats, 23 - }; 24 - 25 - pub fn main() !void { 26 - var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 27 - defer _ = gpa.deinit(); 28 - const allocator = gpa.allocator(); 29 - 30 - std.debug.print("starting bufo bot...\n", .{}); 31 - 32 - const cfg = config.Config.fromEnv(); 33 - 34 - // load bufos from API 35 - var m = matcher.Matcher.init(allocator, cfg.min_phrase_words); 36 - try loadBufos(allocator, &m, cfg.exclude_patterns); 37 - std.debug.print("loaded {} bufos with >= {} word phrases\n", .{ m.count(), cfg.min_phrase_words }); 38 - 39 - if (m.count() == 0) { 40 - std.debug.print("no bufos loaded, exiting\n", .{}); 41 - return; 42 - } 43 - 44 - // init bluesky client 45 - var bsky_client = bsky.BskyClient.init(allocator, cfg.bsky_handle, cfg.bsky_app_password); 46 - defer bsky_client.deinit(); 47 - 48 - if (cfg.posting_enabled) { 49 - try bsky_client.login(); 50 - } else { 51 - std.debug.print("posting disabled, running in dry-run mode\n", .{}); 52 - } 53 - 54 - // init stats 55 - var bot_stats = stats.Stats.init(allocator); 56 - defer bot_stats.deinit(); 57 - bot_stats.setBufosLoaded(@intCast(m.count())); 58 - 59 - // init state 60 - var state = BotState{ 61 - .allocator = allocator, 62 - .config = cfg, 63 - .matcher = m, 64 - .bsky_client = bsky_client, 65 - .recent_bufos = std.StringHashMap(i64).init(allocator), 66 - .stats = bot_stats, 67 - }; 68 - defer state.recent_bufos.deinit(); 69 - 70 - global_state = &state; 71 - 72 - // start stats server on background thread 73 - var stats_server = stats.StatsServer.init(allocator, &state.stats, cfg.stats_port); 74 - const stats_thread = Thread.spawn(.{}, stats.StatsServer.run, .{&stats_server}) catch |err| { 75 - std.debug.print("failed to start stats server: {}\n", .{err}); 76 - return err; 77 - }; 78 - defer stats_thread.join(); 79 - 80 - // start jetstream consumer 81 - var js = jetstream.JetstreamClient.init(allocator, cfg.jetstream_endpoint, onPost); 82 - js.run(); 83 - } 84 - 85 - fn onPost(post: jetstream.Post) void { 86 - const state = global_state orelse return; 87 - 88 - state.stats.incPostsChecked(); 89 - 90 - // check for match 91 - const match = state.matcher.findMatch(post.text) orelse return; 92 - 93 - state.stats.incMatchesFound(); 94 - state.stats.incBufoMatch(match.name, match.url); 95 - std.debug.print("match: {s}\n", .{match.name}); 96 - 97 - if (!state.config.posting_enabled) { 98 - std.debug.print("posting disabled, skipping\n", .{}); 99 - return; 100 - } 101 - 102 - state.mutex.lock(); 103 - defer state.mutex.unlock(); 104 - 105 - // check cooldown 106 - const now = std.time.timestamp(); 107 - const cooldown_secs = @as(i64, @intCast(state.config.cooldown_minutes)) * 60; 108 - 109 - if (state.recent_bufos.get(match.name)) |last_posted| { 110 - if (now - last_posted < cooldown_secs) { 111 - 
state.stats.incCooldownsHit(); 112 - std.debug.print("cooldown: {s} posted recently, skipping\n", .{match.name}); 113 - return; 114 - } 115 - } 116 - 117 - // try to post, with one retry on token expiration 118 - tryPost(state, post, match, now) catch |err| { 119 - if (err == error.ExpiredToken) { 120 - std.debug.print("token expired, re-logging in...\n", .{}); 121 - state.bsky_client.login() catch |login_err| { 122 - std.debug.print("failed to re-login: {}\n", .{login_err}); 123 - state.stats.incErrors(); 124 - return; 125 - }; 126 - std.debug.print("re-login successful, retrying post...\n", .{}); 127 - tryPost(state, post, match, now) catch |retry_err| { 128 - std.debug.print("retry failed: {}\n", .{retry_err}); 129 - state.stats.incErrors(); 130 - }; 131 - } else { 132 - state.stats.incErrors(); 133 - } 134 - }; 135 - } 136 - 137 - fn tryPost(state: *BotState, post: jetstream.Post, match: matcher.Match, now: i64) !void { 138 - // fetch bufo image 139 - const img_data = try state.bsky_client.fetchImage(match.url); 140 - defer state.allocator.free(img_data); 141 - 142 - const is_gif = mem.endsWith(u8, match.url, ".gif"); 143 - 144 - // build alt text (name without extension, dashes to spaces) 145 - var alt_buf: [128]u8 = undefined; 146 - var alt_len: usize = 0; 147 - for (match.name) |c| { 148 - if (c == '-') { 149 - alt_buf[alt_len] = ' '; 150 - } else if (c == '.') { 151 - break; // stop at extension 152 - } else { 153 - alt_buf[alt_len] = c; 154 - } 155 - alt_len += 1; 156 - if (alt_len >= alt_buf.len - 1) break; 157 - } 158 - const alt_text = alt_buf[0..alt_len]; 159 - 160 - // get post CID for quote 161 - const cid = try state.bsky_client.getPostCid(post.uri); 162 - defer state.allocator.free(cid); 163 - 164 - if (is_gif) { 165 - // upload as video for animated GIFs 166 - std.debug.print("uploading {d} bytes as video\n", .{img_data.len}); 167 - const job_id = try state.bsky_client.uploadVideo(img_data, match.name); 168 - defer state.allocator.free(job_id); 169 - 170 - std.debug.print("waiting for video processing (job: {s})...\n", .{job_id}); 171 - const blob_json = try state.bsky_client.waitForVideo(job_id); 172 - defer state.allocator.free(blob_json); 173 - 174 - try state.bsky_client.createVideoQuotePost(post.uri, cid, blob_json, alt_text); 175 - } else { 176 - // upload as image 177 - const content_type = if (mem.endsWith(u8, match.url, ".png")) 178 - "image/png" 179 - else 180 - "image/jpeg"; 181 - 182 - std.debug.print("uploading {d} bytes as {s}\n", .{ img_data.len, content_type }); 183 - const blob_json = try state.bsky_client.uploadBlob(img_data, content_type); 184 - defer state.allocator.free(blob_json); 185 - 186 - try state.bsky_client.createQuotePost(post.uri, cid, blob_json, alt_text); 187 - } 188 - std.debug.print("posted bufo quote: {s}\n", .{match.name}); 189 - state.stats.incPostsCreated(); 190 - 191 - // update cooldown cache 192 - state.recent_bufos.put(match.name, now) catch {}; 193 - } 194 - 195 - fn loadBufos(allocator: Allocator, m: *matcher.Matcher, exclude_patterns: []const u8) !void { 196 - var client = http.Client{ .allocator = allocator }; 197 - defer client.deinit(); 198 - 199 - var url_buf: [512]u8 = undefined; 200 - const url = std.fmt.bufPrint(&url_buf, "https://find-bufo.com/api/search?query=bufo&top_k=2000&alpha=0&exclude={s}", .{exclude_patterns}) catch return error.UrlTooLong; 201 - 202 - var aw: std.Io.Writer.Allocating = .init(allocator); 203 - defer aw.deinit(); 204 - 205 - const result = client.fetch(.{ 206 - .location = .{ .url = url }, 207 - 
.method = .GET, 208 - .response_writer = &aw.writer, 209 - }) catch |err| { 210 - std.debug.print("failed to fetch bufos: {}\n", .{err}); 211 - return err; 212 - }; 213 - 214 - if (result.status != .ok) { 215 - std.debug.print("failed to fetch bufos, status: {}\n", .{result.status}); 216 - return error.FetchFailed; 217 - } 218 - 219 - const response_list = aw.toArrayList(); 220 - const response = response_list.items; 221 - 222 - const parsed = json.parseFromSlice(json.Value, allocator, response, .{}) catch return error.ParseError; 223 - defer parsed.deinit(); 224 - 225 - const results = parsed.value.object.get("results") orelse return; 226 - if (results != .array) return; 227 - 228 - var loaded: usize = 0; 229 - for (results.array.items) |item| { 230 - if (item != .object) continue; 231 - 232 - const name_val = item.object.get("name") orelse continue; 233 - if (name_val != .string) continue; 234 - 235 - const url_val = item.object.get("url") orelse continue; 236 - if (url_val != .string) continue; 237 - 238 - m.addBufo(name_val.string, url_val.string) catch continue; 239 - loaded += 1; 240 - } 241 - 242 - std.debug.print("loaded {} bufos from API\n", .{loaded}); 243 - }
-152
bot/src/matcher.zig
```diff
-const std = @import("std");
-const mem = std.mem;
-const Allocator = mem.Allocator;
-
-pub const Bufo = struct {
-    name: []const u8,
-    url: []const u8,
-    phrase: []const []const u8,
-};
-
-pub const Match = struct {
-    name: []const u8,
-    url: []const u8,
-};
-
-pub const Matcher = struct {
-    bufos: std.ArrayList(Bufo) = .{},
-    allocator: Allocator,
-    min_words: u32,
-
-    pub fn init(allocator: Allocator, min_words: u32) Matcher {
-        return .{
-            .allocator = allocator,
-            .min_words = min_words,
-        };
-    }
-
-    pub fn deinit(self: *Matcher) void {
-        for (self.bufos.items) |bufo| {
-            self.allocator.free(bufo.name);
-            self.allocator.free(bufo.url);
-            for (bufo.phrase) |word| {
-                self.allocator.free(word);
-            }
-            self.allocator.free(bufo.phrase);
-        }
-        self.bufos.deinit(self.allocator);
-    }
-
-    pub fn addBufo(self: *Matcher, name: []const u8, url: []const u8) !void {
-        const phrase = try extractPhrase(self.allocator, name);
-
-        if (phrase.len < self.min_words) {
-            for (phrase) |word| self.allocator.free(word);
-            self.allocator.free(phrase);
-            return;
-        }
-
-        try self.bufos.append(self.allocator, .{
-            .name = try self.allocator.dupe(u8, name),
-            .url = try self.allocator.dupe(u8, url),
-            .phrase = phrase,
-        });
-    }
-
-    pub fn findMatch(self: *Matcher, text: []const u8) ?Match {
-        var words: std.ArrayList([]const u8) = .{};
-        defer words.deinit(self.allocator);
-
-        var i: usize = 0;
-        while (i < text.len) {
-            while (i < text.len and !isAlpha(text[i])) : (i += 1) {}
-            if (i >= text.len) break;
-
-            const start = i;
-            while (i < text.len and isAlpha(text[i])) : (i += 1) {}
-
-            const word = text[start..i];
-            if (word.len > 0) {
-                words.append(self.allocator, word) catch continue;
-            }
-        }
-
-        for (self.bufos.items) |bufo| {
-            if (containsPhrase(words.items, bufo.phrase)) {
-                return .{
-                    .name = bufo.name,
-                    .url = bufo.url,
-                };
-            }
-        }
-        return null;
-    }
-
-    pub fn count(self: *Matcher) usize {
-        return self.bufos.items.len;
-    }
-};
-
-fn extractPhrase(allocator: Allocator, name: []const u8) ![]const []const u8 {
-    var start: usize = 0;
-    if (mem.startsWith(u8, name, "bufo-")) {
-        start = 5;
-    }
-    var end = name.len;
-    if (mem.endsWith(u8, name, ".gif")) {
-        end -= 4;
-    } else if (mem.endsWith(u8, name, ".png")) {
-        end -= 4;
-    } else if (mem.endsWith(u8, name, ".jpg")) {
-        end -= 4;
-    } else if (mem.endsWith(u8, name, ".jpeg")) {
-        end -= 5;
-    }
-
-    const slug = name[start..end];
-
-    var words: std.ArrayList([]const u8) = .{};
-    errdefer {
-        for (words.items) |word| allocator.free(word);
-        words.deinit(allocator);
-    }
-
-    var iter = mem.splitScalar(u8, slug, '-');
-    while (iter.next()) |word| {
-        if (word.len > 0) {
-            const lower = try allocator.alloc(u8, word.len);
-            for (word, 0..) |c, j| {
-                lower[j] = std.ascii.toLower(c);
-            }
-            try words.append(allocator, lower);
-        }
-    }
-
-    return try words.toOwnedSlice(allocator);
-}
-
-fn containsPhrase(post_words: []const []const u8, phrase: []const []const u8) bool {
-    if (phrase.len == 0 or post_words.len < phrase.len) return false;
-
-    outer: for (0..post_words.len - phrase.len + 1) |i| {
-        for (phrase, 0..) |phrase_word, j| {
-            if (!eqlIgnoreCase(post_words[i + j], phrase_word)) {
-                continue :outer;
-            }
-        }
-        return true;
-    }
-    return false;
-}
-
-fn eqlIgnoreCase(a: []const u8, b: []const u8) bool {
-    if (a.len != b.len) return false;
-    for (a, b) |ca, cb| {
-        if (std.ascii.toLower(ca) != std.ascii.toLower(cb)) return false;
-    }
-    return true;
-}
-
-fn isAlpha(c: u8) bool {
-    return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');
-}
```
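A usage sketch against the matcher's public API (`init`, `addBufo`, `findMatch`), assuming a test file compiled alongside `matcher.zig`; the bufo filename and URL here are examples, not fixtures from the repo:

```zig
const std = @import("std");
const matcher = @import("matcher.zig");

test "findMatch requires the exact consecutive phrase" {
    var m = matcher.Matcher.init(std.testing.allocator, 4);
    defer m.deinit();

    try m.addBufo("bufo-let-them-eat-cake.png", "https://all-the.bufo.zone/bufo-let-them-eat-cake.png");

    // consecutive and case-insensitive: matches
    try std.testing.expect(m.findMatch("well, Let Them Eat Cake I guess") != null);
    // all four words present but not consecutive: no match
    try std.testing.expect(m.findMatch("let them go eat some cake") == null);
}
```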
-401
bot/src/stats.zig
··· 1 - const std = @import("std"); 2 - const mem = std.mem; 3 - const json = std.json; 4 - const fs = std.fs; 5 - const Allocator = mem.Allocator; 6 - const Thread = std.Thread; 7 - const template = @import("stats_template.zig"); 8 - 9 - const STATS_PATH = "/data/stats.json"; 10 - 11 - pub const Stats = struct { 12 - allocator: Allocator, 13 - start_time: i64, 14 - prior_uptime: u64 = 0, // cumulative uptime from previous runs 15 - posts_checked: std.atomic.Value(u64) = .init(0), 16 - matches_found: std.atomic.Value(u64) = .init(0), 17 - posts_created: std.atomic.Value(u64) = .init(0), 18 - cooldowns_hit: std.atomic.Value(u64) = .init(0), 19 - errors: std.atomic.Value(u64) = .init(0), 20 - bufos_loaded: u64 = 0, 21 - 22 - // track per-bufo match counts: name -> {count, url} 23 - bufo_matches: std.StringHashMap(BufoMatchData), 24 - bufo_mutex: Thread.Mutex = .{}, 25 - 26 - const BufoMatchData = struct { 27 - count: u64, 28 - url: []const u8, 29 - }; 30 - 31 - pub fn init(allocator: Allocator) Stats { 32 - var self = Stats{ 33 - .allocator = allocator, 34 - .start_time = std.time.timestamp(), 35 - .bufo_matches = std.StringHashMap(BufoMatchData).init(allocator), 36 - }; 37 - self.load(); 38 - return self; 39 - } 40 - 41 - pub fn deinit(self: *Stats) void { 42 - self.save(); 43 - var iter = self.bufo_matches.iterator(); 44 - while (iter.next()) |entry| { 45 - self.allocator.free(entry.key_ptr.*); 46 - self.allocator.free(entry.value_ptr.url); 47 - } 48 - self.bufo_matches.deinit(); 49 - } 50 - 51 - fn load(self: *Stats) void { 52 - const file = fs.openFileAbsolute(STATS_PATH, .{}) catch return; 53 - defer file.close(); 54 - 55 - var buf: [64 * 1024]u8 = undefined; 56 - const len = file.readAll(&buf) catch return; 57 - if (len == 0) return; 58 - 59 - const parsed = json.parseFromSlice(json.Value, self.allocator, buf[0..len], .{}) catch return; 60 - defer parsed.deinit(); 61 - 62 - const root = parsed.value.object; 63 - 64 - if (root.get("posts_checked")) |v| if (v == .integer) { 65 - self.posts_checked.store(@intCast(@max(0, v.integer)), .monotonic); 66 - }; 67 - if (root.get("matches_found")) |v| if (v == .integer) { 68 - self.matches_found.store(@intCast(@max(0, v.integer)), .monotonic); 69 - }; 70 - if (root.get("posts_created")) |v| if (v == .integer) { 71 - self.posts_created.store(@intCast(@max(0, v.integer)), .monotonic); 72 - }; 73 - if (root.get("cooldowns_hit")) |v| if (v == .integer) { 74 - self.cooldowns_hit.store(@intCast(@max(0, v.integer)), .monotonic); 75 - }; 76 - if (root.get("errors")) |v| if (v == .integer) { 77 - self.errors.store(@intCast(@max(0, v.integer)), .monotonic); 78 - }; 79 - if (root.get("cumulative_uptime")) |v| if (v == .integer) { 80 - self.prior_uptime = @intCast(@max(0, v.integer)); 81 - }; 82 - 83 - // load bufo_matches (or legacy bufo_posts) 84 - const matches_key = if (root.get("bufo_matches") != null) "bufo_matches" else "bufo_posts"; 85 - if (root.get(matches_key)) |bp| { 86 - if (bp == .object) { 87 - var iter = bp.object.iterator(); 88 - while (iter.next()) |entry| { 89 - if (entry.value_ptr.* == .object) { 90 - // format: {"count": N, "url": "..."} 91 - const obj = entry.value_ptr.object; 92 - const count_val = obj.get("count") orelse continue; 93 - const url_val = obj.get("url") orelse continue; 94 - if (count_val != .integer or url_val != .string) continue; 95 - 96 - const key = self.allocator.dupe(u8, entry.key_ptr.*) catch continue; 97 - const url = self.allocator.dupe(u8, url_val.string) catch { 98 - self.allocator.free(key); 99 - continue; 
100 - }; 101 - self.bufo_matches.put(key, .{ 102 - .count = @intCast(@max(0, count_val.integer)), 103 - .url = url, 104 - }) catch { 105 - self.allocator.free(key); 106 - self.allocator.free(url); 107 - }; 108 - } else if (entry.value_ptr.* == .integer) { 109 - // legacy format: just integer count - construct URL from name 110 - const key = self.allocator.dupe(u8, entry.key_ptr.*) catch continue; 111 - var url_buf: [256]u8 = undefined; 112 - const constructed_url = std.fmt.bufPrint(&url_buf, "https://all-the.bufo.zone/{s}", .{entry.key_ptr.*}) catch continue; 113 - const url = self.allocator.dupe(u8, constructed_url) catch { 114 - self.allocator.free(key); 115 - continue; 116 - }; 117 - self.bufo_matches.put(key, .{ 118 - .count = @intCast(@max(0, entry.value_ptr.integer)), 119 - .url = url, 120 - }) catch { 121 - self.allocator.free(key); 122 - self.allocator.free(url); 123 - }; 124 - } 125 - } 126 - } 127 - } 128 - 129 - std.debug.print("loaded stats from {s}\n", .{STATS_PATH}); 130 - } 131 - 132 - pub fn save(self: *Stats) void { 133 - self.bufo_mutex.lock(); 134 - defer self.bufo_mutex.unlock(); 135 - self.saveUnlocked(); 136 - } 137 - 138 - pub fn totalUptime(self: *Stats) i64 { 139 - const now = std.time.timestamp(); 140 - const session: i64 = now - self.start_time; 141 - return @as(i64, @intCast(self.prior_uptime)) + session; 142 - } 143 - 144 - pub fn incPostsChecked(self: *Stats) void { 145 - _ = self.posts_checked.fetchAdd(1, .monotonic); 146 - } 147 - 148 - pub fn incMatchesFound(self: *Stats) void { 149 - _ = self.matches_found.fetchAdd(1, .monotonic); 150 - } 151 - 152 - pub fn incBufoMatch(self: *Stats, bufo_name: []const u8, bufo_url: []const u8) void { 153 - self.bufo_mutex.lock(); 154 - defer self.bufo_mutex.unlock(); 155 - 156 - if (self.bufo_matches.getPtr(bufo_name)) |data| { 157 - data.count += 1; 158 - } else { 159 - const key = self.allocator.dupe(u8, bufo_name) catch return; 160 - const url = self.allocator.dupe(u8, bufo_url) catch { 161 - self.allocator.free(key); 162 - return; 163 - }; 164 - self.bufo_matches.put(key, .{ .count = 1, .url = url }) catch { 165 - self.allocator.free(key); 166 - self.allocator.free(url); 167 - }; 168 - } 169 - self.saveUnlocked(); 170 - } 171 - 172 - pub fn incPostsCreated(self: *Stats) void { 173 - _ = self.posts_created.fetchAdd(1, .monotonic); 174 - } 175 - 176 - fn saveUnlocked(self: *Stats) void { 177 - // called when mutex is already held 178 - const file = fs.createFileAbsolute(STATS_PATH, .{}) catch return; 179 - defer file.close(); 180 - 181 - const now = std.time.timestamp(); 182 - const session_uptime: u64 = @intCast(@max(0, now - self.start_time)); 183 - const total_uptime = self.prior_uptime + session_uptime; 184 - 185 - var buf: [64 * 1024]u8 = undefined; 186 - var fbs = std.io.fixedBufferStream(&buf); 187 - const writer = fbs.writer(); 188 - 189 - writer.writeAll("{") catch return; 190 - std.fmt.format(writer, "\"posts_checked\":{},", .{self.posts_checked.load(.monotonic)}) catch return; 191 - std.fmt.format(writer, "\"matches_found\":{},", .{self.matches_found.load(.monotonic)}) catch return; 192 - std.fmt.format(writer, "\"posts_created\":{},", .{self.posts_created.load(.monotonic)}) catch return; 193 - std.fmt.format(writer, "\"cooldowns_hit\":{},", .{self.cooldowns_hit.load(.monotonic)}) catch return; 194 - std.fmt.format(writer, "\"errors\":{},", .{self.errors.load(.monotonic)}) catch return; 195 - std.fmt.format(writer, "\"cumulative_uptime\":{},", .{total_uptime}) catch return; 196 - 
writer.writeAll("\"bufo_matches\":{") catch return; 197 - 198 - var first = true; 199 - var iter = self.bufo_matches.iterator(); 200 - while (iter.next()) |entry| { 201 - if (!first) writer.writeAll(",") catch return; 202 - first = false; 203 - std.fmt.format(writer, "\"{s}\":{{\"count\":{},\"url\":\"{s}\"}}", .{ entry.key_ptr.*, entry.value_ptr.count, entry.value_ptr.url }) catch return; 204 - } 205 - 206 - writer.writeAll("}}") catch return; 207 - file.writeAll(fbs.getWritten()) catch return; 208 - } 209 - 210 - pub fn incCooldownsHit(self: *Stats) void { 211 - _ = self.cooldowns_hit.fetchAdd(1, .monotonic); 212 - } 213 - 214 - pub fn incErrors(self: *Stats) void { 215 - _ = self.errors.fetchAdd(1, .monotonic); 216 - } 217 - 218 - pub fn setBufosLoaded(self: *Stats, count: u64) void { 219 - self.bufos_loaded = count; 220 - } 221 - 222 - fn formatUptime(seconds: i64, buf: []u8) []const u8 { 223 - const s: u64 = @intCast(@max(0, seconds)); 224 - const days = s / 86400; 225 - const hours = (s % 86400) / 3600; 226 - const mins = (s % 3600) / 60; 227 - const secs = s % 60; 228 - 229 - if (days > 0) { 230 - return std.fmt.bufPrint(buf, "{}d {}h {}m", .{ days, hours, mins }) catch "?"; 231 - } else if (hours > 0) { 232 - return std.fmt.bufPrint(buf, "{}h {}m {}s", .{ hours, mins, secs }) catch "?"; 233 - } else if (mins > 0) { 234 - return std.fmt.bufPrint(buf, "{}m {}s", .{ mins, secs }) catch "?"; 235 - } else { 236 - return std.fmt.bufPrint(buf, "{}s", .{secs}) catch "?"; 237 - } 238 - } 239 - 240 - pub fn renderHtml(self: *Stats, allocator: Allocator) ![]const u8 { 241 - const uptime = self.totalUptime(); 242 - 243 - var uptime_buf: [64]u8 = undefined; 244 - const uptime_str = formatUptime(uptime, &uptime_buf); 245 - 246 - const BufoEntry = struct { 247 - name: []const u8, 248 - count: u64, 249 - url: []const u8, 250 - 251 - fn compare(_: void, a: @This(), b: @This()) bool { 252 - return a.count > b.count; 253 - } 254 - }; 255 - 256 - // collect top bufos 257 - var top_bufos: std.ArrayList(BufoEntry) = .{}; 258 - defer top_bufos.deinit(allocator); 259 - 260 - { 261 - self.bufo_mutex.lock(); 262 - defer self.bufo_mutex.unlock(); 263 - 264 - var iter = self.bufo_matches.iterator(); 265 - while (iter.next()) |entry| { 266 - try top_bufos.append(allocator, .{ .name = entry.key_ptr.*, .count = entry.value_ptr.count, .url = entry.value_ptr.url }); 267 - } 268 - } 269 - 270 - // sort by count descending 271 - mem.sort(BufoEntry, top_bufos.items, {}, BufoEntry.compare); 272 - 273 - // build top bufos grid html 274 - var top_html: std.ArrayList(u8) = .{}; 275 - defer top_html.deinit(allocator); 276 - 277 - const limit = @min(top_bufos.items.len, 20); 278 - 279 - // find max count for scaling 280 - var max_count: u64 = 1; 281 - for (top_bufos.items[0..limit]) |entry| { 282 - if (entry.count > max_count) max_count = entry.count; 283 - } 284 - 285 - for (top_bufos.items[0..limit]) |entry| { 286 - // scale size: min 60px, max 160px based on count ratio 287 - const ratio = @as(f64, @floatFromInt(entry.count)) / @as(f64, @floatFromInt(max_count)); 288 - const size: u32 = @intFromFloat(60.0 + ratio * 100.0); 289 - 290 - // strip extension for display name 291 - var display_name = entry.name; 292 - if (mem.endsWith(u8, entry.name, ".gif")) { 293 - display_name = entry.name[0 .. entry.name.len - 4]; 294 - } else if (mem.endsWith(u8, entry.name, ".png")) { 295 - display_name = entry.name[0 .. entry.name.len - 4]; 296 - } else if (mem.endsWith(u8, entry.name, ".jpg")) { 297 - display_name = entry.name[0 .. 
entry.name.len - 4]; 298 - } 299 - 300 - try std.fmt.format(top_html.writer(allocator), 301 - \\<div class="bufo-card" style="width:{}px;height:{}px;" title="{s} ({} matches)" data-name="{s}" onclick="showPosts(this)"> 302 - \\<img src="{s}" alt="{s}" loading="lazy"> 303 - \\<span class="bufo-count">{}</span> 304 - \\</div> 305 - , .{ size, size, display_name, entry.count, display_name, entry.url, display_name, entry.count }); 306 - } 307 - 308 - const top_section = if (top_bufos.items.len > 0) top_html.items else "<p class=\"no-bufos\">no posts yet</p>"; 309 - 310 - const html = try std.fmt.allocPrint(allocator, template.html, .{ 311 - uptime, 312 - uptime_str, 313 - self.posts_checked.load(.monotonic), 314 - self.posts_checked.load(.monotonic), 315 - self.matches_found.load(.monotonic), 316 - self.matches_found.load(.monotonic), 317 - self.posts_created.load(.monotonic), 318 - self.posts_created.load(.monotonic), 319 - self.cooldowns_hit.load(.monotonic), 320 - self.cooldowns_hit.load(.monotonic), 321 - self.errors.load(.monotonic), 322 - self.errors.load(.monotonic), 323 - self.bufos_loaded, 324 - self.bufos_loaded, 325 - top_section, 326 - }); 327 - 328 - return html; 329 - } 330 - }; 331 - 332 - pub const StatsServer = struct { 333 - allocator: Allocator, 334 - stats: *Stats, 335 - port: u16, 336 - 337 - pub fn init(allocator: Allocator, stats: *Stats, port: u16) StatsServer { 338 - return .{ 339 - .allocator = allocator, 340 - .stats = stats, 341 - .port = port, 342 - }; 343 - } 344 - 345 - pub fn run(self: *StatsServer) void { 346 - // spawn periodic save ticker (every 60s) 347 - _ = Thread.spawn(.{}, saveTicker, .{self.stats}) catch {}; 348 - 349 - self.serve() catch |err| { 350 - std.debug.print("stats server error: {}\n", .{err}); 351 - }; 352 - } 353 - 354 - fn saveTicker(s: *Stats) void { 355 - while (true) { 356 - std.Thread.sleep(60 * std.time.ns_per_s); 357 - s.save(); 358 - } 359 - } 360 - 361 - fn serve(self: *StatsServer) !void { 362 - const addr = std.net.Address.initIp4(.{ 0, 0, 0, 0 }, self.port); 363 - 364 - var server = try addr.listen(.{ .reuse_address = true }); 365 - defer server.deinit(); 366 - 367 - std.debug.print("stats server listening on http://0.0.0.0:{}\n", .{self.port}); 368 - 369 - while (true) { 370 - const conn = server.accept() catch |err| { 371 - std.debug.print("accept error: {}\n", .{err}); 372 - continue; 373 - }; 374 - 375 - self.handleConnection(conn) catch |err| { 376 - std.debug.print("connection error: {}\n", .{err}); 377 - }; 378 - } 379 - } 380 - 381 - fn handleConnection(self: *StatsServer, conn: std.net.Server.Connection) !void { 382 - defer conn.stream.close(); 383 - 384 - // read request (we don't really care about it, just serve stats) 385 - var buf: [1024]u8 = undefined; 386 - _ = conn.stream.read(&buf) catch {}; 387 - 388 - const html = self.stats.renderHtml(self.allocator) catch |err| { 389 - std.debug.print("render error: {}\n", .{err}); 390 - return; 391 - }; 392 - defer self.allocator.free(html); 393 - 394 - // write raw HTTP response 395 - var response_buf: [128]u8 = undefined; 396 - const header = std.fmt.bufPrint(&response_buf, "HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nContent-Length: {}\r\nConnection: close\r\n\r\n", .{html.len}) catch return; 397 - 398 - _ = conn.stream.write(header) catch return; 399 - _ = conn.stream.write(html) catch return; 400 - } 401 - };
-224
bot/src/stats_template.zig
··· 1 - // HTML template for stats page 2 - // format args: uptime_secs, uptime_str, posts_checked (x2), matches_found (x2), 3 - // posts_created (x2), cooldowns_hit (x2), errors (x2), bufos_loaded (x2), top_section 4 - 5 - pub const html = 6 - \\<!DOCTYPE html> 7 - \\<html> 8 - \\<head> 9 - \\<meta charset="utf-8"> 10 - \\<meta name="viewport" content="width=device-width, initial-scale=1"> 11 - \\<title>bufo-bot stats</title> 12 - \\<style> 13 - \\ body {{ 14 - \\ font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace; 15 - \\ max-width: 600px; 16 - \\ margin: 40px auto; 17 - \\ padding: 20px; 18 - \\ background: #1a1a2e; 19 - \\ color: #eee; 20 - \\ font-size: 14px; 21 - \\ }} 22 - \\ h1 {{ color: #7bed9f; margin-bottom: 30px; }} 23 - \\ .stat {{ 24 - \\ display: flex; 25 - \\ justify-content: space-between; 26 - \\ padding: 12px 0; 27 - \\ border-bottom: 1px solid #333; 28 - \\ }} 29 - \\ .stat-label {{ color: #aaa; }} 30 - \\ .stat-value {{ font-weight: bold; }} 31 - \\ h2 {{ color: #7bed9f; margin-top: 40px; font-size: 1.2em; }} 32 - \\ .bufo-grid {{ 33 - \\ display: flex; 34 - \\ flex-wrap: wrap; 35 - \\ gap: 8px; 36 - \\ justify-content: flex-start; 37 - \\ align-items: flex-start; 38 - \\ margin-top: 16px; 39 - \\ }} 40 - \\ .bufo-card {{ 41 - \\ position: relative; 42 - \\ border-radius: 8px; 43 - \\ overflow: hidden; 44 - \\ background: #252542; 45 - \\ transition: transform 0.2s; 46 - \\ cursor: pointer; 47 - \\ }} 48 - \\ .bufo-card:hover {{ 49 - \\ transform: scale(1.1); 50 - \\ z-index: 10; 51 - \\ }} 52 - \\ .bufo-card img {{ 53 - \\ width: 100%; 54 - \\ height: 100%; 55 - \\ object-fit: cover; 56 - \\ }} 57 - \\ .bufo-count {{ 58 - \\ position: absolute; 59 - \\ bottom: 4px; 60 - \\ right: 4px; 61 - \\ background: rgba(0,0,0,0.7); 62 - \\ color: #7bed9f; 63 - \\ padding: 2px 6px; 64 - \\ border-radius: 4px; 65 - \\ font-size: 11px; 66 - \\ }} 67 - \\ .no-bufos {{ color: #666; text-align: center; }} 68 - \\ .footer {{ 69 - \\ margin-top: 40px; 70 - \\ padding-top: 20px; 71 - \\ border-top: 1px solid #333; 72 - \\ color: #666; 73 - \\ font-size: 0.9em; 74 - \\ }} 75 - \\ a {{ color: #7bed9f; }} 76 - \\ .modal {{ 77 - \\ display: none; 78 - \\ position: fixed; 79 - \\ top: 0; left: 0; right: 0; bottom: 0; 80 - \\ background: rgba(0,0,0,0.8); 81 - \\ z-index: 100; 82 - \\ justify-content: center; 83 - \\ align-items: center; 84 - \\ }} 85 - \\ .modal.show {{ display: flex; }} 86 - \\ .modal-content {{ 87 - \\ background: #252542; 88 - \\ padding: 20px; 89 - \\ border-radius: 8px; 90 - \\ width: 90vw; 91 - \\ max-width: 600px; 92 - \\ height: 85vh; 93 - \\ display: flex; 94 - \\ flex-direction: column; 95 - \\ }} 96 - \\ .modal-content h3 {{ margin-top: 0; color: #7bed9f; }} 97 - \\ .modal-content .close {{ cursor: pointer; float: right; font-size: 20px; }} 98 - \\ .modal-content .no-posts {{ color: #666; text-align: center; padding: 20px; }} 99 - \\ .embed-wrap {{ flex: 1; overflow: hidden; }} 100 - \\ .embed-wrap iframe {{ border: none; width: 100%; height: 100%; border-radius: 8px; }} 101 - \\ .nav {{ display: flex; justify-content: space-between; align-items: center; margin-top: 10px; gap: 10px; }} 102 - \\ .nav button {{ background: #7bed9f; color: #1a1a2e; border: none; padding: 6px 12px; border-radius: 4px; cursor: pointer; }} 103 - \\ .nav button:disabled {{ opacity: 0.3; cursor: default; }} 104 - \\ .nav span {{ color: #aaa; font-size: 12px; }} 105 - \\</style> 106 - \\</head> 107 - \\<body> 108 - \\<h1>bufo-bot 
stats</h1> 109 - \\ 110 - \\<div class="stat"> 111 - \\ <span class="stat-label">uptime</span> 112 - \\ <span class="stat-value" id="uptime" data-seconds="{}">{s}</span> 113 - \\</div> 114 - \\<div class="stat"> 115 - \\ <span class="stat-label">posts checked</span> 116 - \\ <span class="stat-value" data-num="{}">{}</span> 117 - \\</div> 118 - \\<div class="stat"> 119 - \\ <span class="stat-label">matches found</span> 120 - \\ <span class="stat-value" data-num="{}">{}</span> 121 - \\</div> 122 - \\<div class="stat"> 123 - \\ <span class="stat-label">bufos posted</span> 124 - \\ <span class="stat-value" data-num="{}">{}</span> 125 - \\</div> 126 - \\<div class="stat"> 127 - \\ <span class="stat-label">cooldowns hit</span> 128 - \\ <span class="stat-value" data-num="{}">{}</span> 129 - \\</div> 130 - \\<div class="stat"> 131 - \\ <span class="stat-label">errors</span> 132 - \\ <span class="stat-value" data-num="{}">{}</span> 133 - \\</div> 134 - \\<div class="stat"> 135 - \\ <span class="stat-label">bufos available</span> 136 - \\ <span class="stat-value" data-num="{}">{}</span> 137 - \\</div> 138 - \\ 139 - \\<h2>top bufos</h2> 140 - \\<div class="bufo-grid"> 141 - \\{s} 142 - \\</div> 143 - \\ 144 - \\<div class="footer"> 145 - \\ <a href="https://find-bufo.com">find-bufo.com</a> | 146 - \\ <a href="https://bsky.app/profile/find-bufo.com">@find-bufo.com</a> 147 - \\</div> 148 - \\<div id="modal" class="modal" onclick="if(event.target===this)closeModal()"> 149 - \\ <div class="modal-content"> 150 - \\ <span class="close" onclick="closeModal()">&times;</span> 151 - \\ <h3 id="modal-title">posts</h3> 152 - \\ <div id="embed-wrap" class="embed-wrap"></div> 153 - \\ <div id="nav" class="nav" style="display:none"> 154 - \\ <button onclick="showEmbed(-1)">&larr;</button> 155 - \\ <span id="nav-info"></span> 156 - \\ <button onclick="showEmbed(1)">&rarr;</button> 157 - \\ </div> 158 - \\ </div> 159 - \\</div> 160 - \\<script> 161 - \\(function() {{ 162 - \\ document.querySelectorAll('[data-num]').forEach(el => {{ 163 - \\ el.textContent = parseInt(el.dataset.num).toLocaleString(); 164 - \\ }}); 165 - \\ const uptimeEl = document.getElementById('uptime'); 166 - \\ let secs = parseInt(uptimeEl.dataset.seconds); 167 - \\ function fmt(s) {{ 168 - \\ const d = Math.floor(s / 86400); 169 - \\ const h = Math.floor((s % 86400) / 3600); 170 - \\ const m = Math.floor((s % 3600) / 60); 171 - \\ const sec = s % 60; 172 - \\ if (d > 0) return d + 'd ' + h + 'h ' + m + 'm'; 173 - \\ if (h > 0) return h + 'h ' + m + 'm ' + sec + 's'; 174 - \\ if (m > 0) return m + 'm ' + sec + 's'; 175 - \\ return sec + 's'; 176 - \\ }} 177 - \\ setInterval(() => {{ secs++; uptimeEl.textContent = fmt(secs); }}, 1000); 178 - \\}})(); 179 - \\let posts = [], idx = 0; 180 - \\async function showPosts(el) {{ 181 - \\ const name = el.dataset.name; 182 - \\ document.getElementById('modal-title').textContent = name; 183 - \\ document.getElementById('embed-wrap').innerHTML = '<p class="no-posts">loading...</p>'; 184 - \\ document.getElementById('nav').style.display = 'none'; 185 - \\ document.getElementById('modal').classList.add('show'); 186 - \\ try {{ 187 - \\ const r = await fetch('https://public.api.bsky.app/xrpc/app.bsky.feed.getAuthorFeed?actor=find-bufo.com&limit=100'); 188 - \\ const data = await r.json(); 189 - \\ const search = name.replace('bufo-','').replace(/-/g,' '); 190 - \\ posts = data.feed.filter(p => {{ 191 - \\ const embed = p.post.embed; 192 - \\ if (!embed) return false; 193 - \\ const img = embed.images?.[0] || 
embed.media?.images?.[0]; 194 - \\ if (img?.alt?.includes(search)) return true; 195 - \\ if (embed.alt?.includes(search)) return true; 196 - \\ if (embed.media?.alt?.includes(search)) return true; 197 - \\ return false; 198 - \\ }}); 199 - \\ idx = 0; 200 - \\ if (posts.length === 0) {{ 201 - \\ document.getElementById('embed-wrap').innerHTML = '<p class="no-posts">no posts found</p>'; 202 - \\ }} else {{ 203 - \\ showEmbed(0); 204 - \\ }} 205 - \\ }} catch(e) {{ 206 - \\ document.getElementById('embed-wrap').innerHTML = '<p class="no-posts">failed to load</p>'; 207 - \\ }} 208 - \\}} 209 - \\function showEmbed(d) {{ 210 - \\ idx = Math.max(0, Math.min(posts.length - 1, idx + d)); 211 - \\ const uri = posts[idx].post.uri.replace('at://',''); 212 - \\ document.getElementById('embed-wrap').innerHTML = '<iframe src="https://embed.bsky.app/embed/' + uri + '"></iframe>'; 213 - \\ document.getElementById('nav').style.display = 'flex'; 214 - \\ document.getElementById('nav-info').textContent = (idx + 1) + ' of ' + posts.length; 215 - \\ document.querySelectorAll('.nav button')[0].disabled = idx === 0; 216 - \\ document.querySelectorAll('.nav button')[1].disabled = idx === posts.length - 1; 217 - \\}} 218 - \\function closeModal() {{ 219 - \\ document.getElementById('modal').classList.remove('show'); 220 - \\}} 221 - \\</script> 222 - \\</body> 223 - \\</html> 224 - ;
-573
docs/zig-atproto-sdk-wishlist.md
··· 1 - # zig atproto sdk wishlist 2 - 3 - a pie-in-the-sky wishlist for what a zig AT protocol sdk could provide, based on building [bufo-bot](../bot) - a bluesky firehose bot that quote-posts matching images. 4 - 5 - --- 6 - 7 - ## 1. typed lexicon schemas 8 - 9 - the single biggest pain point: everything is `json.Value` with manual field extraction. 10 - 11 - ### what we have now 12 - 13 - ```zig 14 - const parsed = json.parseFromSlice(json.Value, allocator, response.items, .{}); 15 - const root = parsed.value.object; 16 - const jwt_val = root.get("accessJwt") orelse return error.NoJwt; 17 - if (jwt_val != .string) return error.NoJwt; 18 - self.access_jwt = try self.allocator.dupe(u8, jwt_val.string); 19 - ``` 20 - 21 - this pattern repeats hundreds of times. it's verbose, error-prone, and provides zero compile-time safety. 22 - 23 - ### what we want 24 - 25 - ```zig 26 - const atproto = @import("atproto"); 27 - 28 - // codegen from lexicon json schemas 29 - const session = try atproto.server.createSession(allocator, .{ 30 - .identifier = handle, 31 - .password = app_password, 32 - }); 33 - // session.accessJwt is already []const u8 34 - // session.did is already []const u8 35 - // session.handle is already []const u8 36 - ``` 37 - 38 - ideally: 39 - - generate zig structs from lexicon json files at build time (build.zig integration) 40 - - full type safety - if a field is optional in the lexicon, it's `?T` in zig 41 - - proper union types for lexicon unions (e.g., embed types) 42 - - automatic serialization/deserialization 43 - 44 - ### lexicon unions are especially painful 45 - 46 - ```zig 47 - // current: manual $type dispatch 48 - const embed_type = record.object.get("$type") orelse return error.NoType; 49 - if (mem.eql(u8, embed_type.string, "app.bsky.embed.images")) { 50 - // handle images... 51 - } else if (mem.eql(u8, embed_type.string, "app.bsky.embed.video")) { 52 - // handle video... 53 - } else if (mem.eql(u8, embed_type.string, "app.bsky.embed.record")) { 54 - // handle quote... 55 - } else if (mem.eql(u8, embed_type.string, "app.bsky.embed.recordWithMedia")) { 56 - // handle quote with media... 57 - } 58 - 59 - // wanted: tagged union 60 - switch (record.embed) { 61 - .images => |imgs| { ... }, 62 - .video => |vid| { ... }, 63 - .record => |quote| { ... }, 64 - .recordWithMedia => |rwm| { ... }, 65 - } 66 - ``` 67 - 68 - --- 69 - 70 - ## 2. session management 71 - 72 - authentication is surprisingly complex and we had to handle it all manually. 
73 - 74 - ### what we had to build 75 - 76 - - login with identifier + app password 77 - - store access JWT and refresh JWT 78 - - detect `ExpiredToken` errors in response bodies 79 - - re-login on expiration (we just re-login, didn't implement refresh) 80 - - resolve DID to PDS host via plc.directory lookup 81 - - get service auth tokens for video upload 82 - 83 - ### what we want 84 - 85 - ```zig 86 - const atproto = @import("atproto"); 87 - 88 - var agent = try atproto.Agent.init(allocator, .{ 89 - .service = "https://bsky.social", 90 - }); 91 - 92 - // login with automatic token refresh 93 - try agent.login(handle, app_password); 94 - 95 - // agent automatically: 96 - // - refreshes tokens before expiration 97 - // - retries on ExpiredToken errors 98 - // - resolves DID -> PDS host 99 - // - handles service auth for video.bsky.app 100 - 101 - // just use it, auth is handled 102 - const blob = try agent.uploadBlob(data, "image/png"); 103 - ``` 104 - 105 - ### service auth is particularly gnarly 106 - 107 - for video uploads, you need: 108 - 1. get a service auth token scoped to `did:web:video.bsky.app` with lexicon `com.atproto.repo.uploadBlob` 109 - 2. use that token (not your session token) for the upload 110 - 3. the endpoint is different (`video.bsky.app` not `bsky.social`) 111 - 112 - we had to figure this out from reading other implementations. an sdk should abstract this entirely. 113 - 114 - --- 115 - 116 - ## 3. blob and media handling 117 - 118 - uploading media requires too much manual work. 119 - 120 - ### current pain 121 - 122 - ```zig 123 - // upload blob, get back raw json string 124 - const blob_json = try client.uploadBlob(data, content_type); 125 - // later, interpolate that json string into another json blob 126 - try body_buf.print(allocator, 127 - \\{{"image":{s},"alt":"{s}"}} 128 - , .{ blob_json, alt_text }); 129 - ``` 130 - 131 - we're passing around json strings and interpolating them. this is fragile. 132 - 133 - ### what we want 134 - 135 - ```zig 136 - // upload returns a typed BlobRef 137 - const blob = try agent.uploadBlob(data, .{ .mime_type = "image/png" }); 138 - 139 - // use it directly in a struct 140 - const post = atproto.feed.Post{ 141 - .text = "", 142 - .embed = .{ .images = .{ 143 - .images = &[_]atproto.embed.Image{ 144 - .{ .image = blob, .alt = "a bufo" }, 145 - }, 146 - }}, 147 - }; 148 - try agent.createRecord("app.bsky.feed.post", post); 149 - ``` 150 - 151 - ### video upload is even worse 152 - 153 - ```zig 154 - // current: manual job polling 155 - const job_id = try client.uploadVideo(data, filename); 156 - var attempts: u32 = 0; 157 - while (attempts < 60) : (attempts += 1) { 158 - // poll job status 159 - // check for JOB_STATE_COMPLETED or JOB_STATE_FAILED 160 - // sleep 1 second between polls 161 - } 162 - 163 - // wanted: one call that handles the async nature 164 - const video_blob = try agent.uploadVideo(data, .{ 165 - .filename = "bufo.gif", 166 - .mime_type = "image/gif", 167 - // sdk handles polling internally 168 - }); 169 - ``` 170 - 171 - --- 172 - 173 - ## 4. AT-URI utilities 174 - 175 - we parse AT-URIs by hand with string splitting. 
176 - 177 - ```zig 178 - // current 179 - var parts = mem.splitScalar(u8, uri[5..], '/'); // skip "at://" 180 - const did = parts.next() orelse return error.InvalidUri; 181 - _ = parts.next(); // skip collection 182 - const rkey = parts.next() orelse return error.InvalidUri; 183 - 184 - // wanted 185 - const parsed = atproto.AtUri.parse(uri); 186 - // parsed.repo (the DID) 187 - // parsed.collection 188 - // parsed.rkey 189 - ``` 190 - 191 - also want: 192 - - `AtUri.format()` to construct URIs 193 - - validation (is this a valid DID? valid rkey?) 194 - - CID parsing/validation 195 - 196 - --- 197 - 198 - ## 5. jetstream / firehose client 199 - 200 - we used a separate websocket library and manually parsed jetstream messages. 201 - 202 - ### current 203 - 204 - ```zig 205 - const websocket = @import("websocket"); // third party 206 - 207 - // manual connection with exponential backoff 208 - // manual message parsing 209 - // manual event dispatch 210 - ``` 211 - 212 - ### what we want 213 - 214 - ```zig 215 - const atproto = @import("atproto"); 216 - 217 - var jetstream = atproto.Jetstream.init(allocator, .{ 218 - .endpoint = "jetstream2.us-east.bsky.network", 219 - .collections = &[_][]const u8{"app.bsky.feed.post"}, 220 - }); 221 - 222 - // typed events! 223 - while (try jetstream.next()) |event| { 224 - switch (event) { 225 - .commit => |commit| { 226 - switch (commit.operation) { 227 - .create => |record| { 228 - // record is already typed based on collection 229 - if (commit.collection == .feed_post) { 230 - const post: atproto.feed.Post = record; 231 - std.debug.print("new post: {s}\n", .{post.text}); 232 - } 233 - }, 234 - .delete => { ... }, 235 - } 236 - }, 237 - .identity => |identity| { ... }, 238 - .account => |account| { ... }, 239 - } 240 - } 241 - ``` 242 - 243 - bonus points: 244 - - automatic reconnection with configurable backoff 245 - - cursor support for resuming from a position 246 - - filtering (dids, collections) built-in 247 - - automatic decompression if using zstd streams 248 - 249 - --- 250 - 251 - ## 6. record operations 252 - 253 - CRUD for records is manual json construction. 254 - 255 - ### current 256 - 257 - ```zig 258 - var body_buf: std.ArrayList(u8) = .{}; 259 - try body_buf.print(allocator, 260 - \\{{"repo":"{s}","collection":"app.bsky.feed.post","record":{{...}}}} 261 - , .{ did, ... }); 262 - 263 - const result = client.fetch(.{ 264 - .location = .{ .url = "https://bsky.social/xrpc/com.atproto.repo.createRecord" }, 265 - .method = .POST, 266 - .headers = .{ .content_type = .{ .override = "application/json" }, ... }, 267 - .payload = body_buf.items, 268 - ... 269 - }); 270 - ``` 271 - 272 - ### what we want 273 - 274 - ```zig 275 - // create 276 - const result = try agent.createRecord("app.bsky.feed.post", .{ 277 - .text = "hello world", 278 - .createdAt = atproto.Datetime.now(), 279 - }); 280 - // result.uri, result.cid are typed 281 - 282 - // read 283 - const record = try agent.getRecord(atproto.feed.Post, uri); 284 - 285 - // delete 286 - try agent.deleteRecord(uri); 287 - 288 - // list 289 - var iter = agent.listRecords("app.bsky.feed.post", .{ .limit = 50 }); 290 - while (try iter.next()) |record| { ... } 291 - ``` 292 - 293 - --- 294 - 295 - ## 7. rich text / facets 296 - 297 - we avoided facets entirely because they're complex. an sdk should make them easy. 
298 - 299 - ### what we want 300 - 301 - ```zig 302 - const rt = atproto.RichText.init(allocator); 303 - try rt.append("check out "); 304 - try rt.appendLink("this repo", "https://github.com/..."); 305 - try rt.append(" by "); 306 - try rt.appendMention("@someone.bsky.social"); 307 - try rt.append(" "); 308 - try rt.appendTag("zig"); 309 - 310 - const post = atproto.feed.Post{ 311 - .text = rt.text(), 312 - .facets = rt.facets(), 313 - }; 314 - ``` 315 - 316 - the sdk should: 317 - - handle unicode byte offsets correctly (this is notoriously tricky) 318 - - auto-detect links/mentions/tags in plain text 319 - - validate handles resolve to real DIDs 320 - 321 - --- 322 - 323 - ## 8. rate limiting and retries 324 - 325 - we have no rate limiting. when we hit limits, we just fail. 326 - 327 - ### what we want 328 - 329 - ```zig 330 - var agent = atproto.Agent.init(allocator, .{ 331 - .rate_limit = .{ 332 - .strategy = .wait, // or .error 333 - .max_retries = 3, 334 - }, 335 - }); 336 - 337 - // agent automatically: 338 - // - respects rate limit headers 339 - // - waits and retries on 429 340 - // - exponential backoff on transient errors 341 - ``` 342 - 343 - --- 344 - 345 - ## 9. pagination helpers 346 - 347 - listing records or searching requires manual cursor handling. 348 - 349 - ```zig 350 - // current: manual 351 - var cursor: ?[]const u8 = null; 352 - while (true) { 353 - const response = try fetch(cursor); 354 - for (response.records) |record| { ... } 355 - cursor = response.cursor orelse break; 356 - } 357 - 358 - // wanted: iterator 359 - var iter = agent.listRecords("app.bsky.feed.post", .{}); 360 - while (try iter.next()) |record| { 361 - // handles pagination transparently 362 - } 363 - 364 - // or collect all 365 - const all_records = try iter.collect(); // fetches all pages 366 - ``` 367 - 368 - --- 369 - 370 - ## 10. did resolution 371 - 372 - we manually hit plc.directory to resolve DIDs. 373 - 374 - ```zig 375 - // current 376 - var url_buf: [256]u8 = undefined; 377 - const url = std.fmt.bufPrint(&url_buf, "https://plc.directory/{s}", .{did}); 378 - // fetch, parse, find service endpoint... 379 - 380 - // wanted 381 - const doc = try atproto.resolveDid(did); 382 - // doc.pds - the PDS endpoint 383 - // doc.handle - verified handle 384 - // doc.signingKey, doc.rotationKeys, etc. 385 - ``` 386 - 387 - should support: 388 - - did:plc via plc.directory 389 - - did:web via .well-known 390 - - caching with TTL 391 - 392 - --- 393 - 394 - ## 11. build.zig integration 395 - 396 - ### lexicon codegen 397 - 398 - ```zig 399 - // build.zig 400 - const atproto = @import("atproto"); 401 - 402 - pub fn build(b: *std.Build) void { 403 - // generate zig types from lexicon schemas 404 - const lexicons = atproto.addLexiconCodegen(b, .{ 405 - .lexicon_dirs = &.{"lexicons/"}, 406 - // or fetch from network 407 - .fetch_lexicons = &.{ 408 - "app.bsky.feed.*", 409 - "app.bsky.actor.*", 410 - "com.atproto.repo.*", 411 - }, 412 - }); 413 - 414 - exe.root_module.addImport("lexicons", lexicons); 415 - } 416 - ``` 417 - 418 - ### bundled CA certs 419 - 420 - TLS in zig requires CA certs. would be nice if the sdk bundled mozilla's CA bundle or made it easy to configure. 421 - 422 - --- 423 - 424 - ## 12. 
testing utilities 425 - 426 - ### mocks 427 - 428 - ```zig 429 - const atproto = @import("atproto"); 430 - 431 - test "bot responds to matching posts" { 432 - var mock = atproto.testing.MockAgent.init(allocator); 433 - defer mock.deinit(); 434 - 435 - // set up expected calls 436 - mock.expectCreateRecord("app.bsky.feed.post", .{ 437 - .text = "", 438 - // ... 439 - }); 440 - 441 - // run test code 442 - try handlePost(&mock, test_post); 443 - 444 - // verify 445 - try mock.verify(); 446 - } 447 - ``` 448 - 449 - ### jetstream replay 450 - 451 - ```zig 452 - // replay recorded jetstream events for testing 453 - var replay = atproto.testing.JetstreamReplay.init("testdata/events.jsonl"); 454 - while (try replay.next()) |event| { 455 - try handleEvent(event); 456 - } 457 - ``` 458 - 459 - --- 460 - 461 - ## 13. logging / observability 462 - 463 - ### structured logging 464 - 465 - ```zig 466 - var agent = atproto.Agent.init(allocator, .{ 467 - .logger = myLogger, // compatible with std.log or custom 468 - }); 469 - 470 - // logs requests, responses, retries, rate limits 471 - ``` 472 - 473 - ### metrics 474 - 475 - ```zig 476 - var agent = atproto.Agent.init(allocator, .{ 477 - .metrics = .{ 478 - .requests_total = &my_counter, 479 - .request_duration = &my_histogram, 480 - .rate_limit_waits = &my_counter, 481 - }, 482 - }); 483 - ``` 484 - 485 - --- 486 - 487 - ## 14. error handling 488 - 489 - ### typed errors with context 490 - 491 - ```zig 492 - // current: generic errors 493 - error.PostFailed 494 - 495 - // wanted: rich errors 496 - atproto.Error.RateLimit => |e| { 497 - std.debug.print("rate limited, reset at {}\n", .{e.reset_at}); 498 - }, 499 - atproto.Error.InvalidRecord => |e| { 500 - std.debug.print("validation failed: {s}\n", .{e.message}); 501 - }, 502 - atproto.Error.ExpiredToken => { 503 - // sdk should handle this automatically, but if not... 504 - }, 505 - ``` 506 - 507 - --- 508 - 509 - ## 15. moderation / labels 510 - 511 - we didn't need this for bufo-bot, but a complete sdk should support: 512 - 513 - ```zig 514 - // applying labels 515 - try agent.createLabels(.{ 516 - .src = agent.did, 517 - .uri = post_uri, 518 - .val = "spam", 519 - }); 520 - 521 - // reading labels on content 522 - const labels = try agent.getLabels(uri); 523 - for (labels) |label| { 524 - if (mem.eql(u8, label.val, "nsfw")) { 525 - // handle... 526 - } 527 - } 528 - ``` 529 - 530 - --- 531 - 532 - ## 16. feed generators and custom feeds 533 - 534 - ```zig 535 - // serving a feed generator 536 - var server = atproto.FeedGenerator.init(allocator, .{ 537 - .did = my_feed_did, 538 - .hostname = "feed.example.com", 539 - }); 540 - 541 - server.addFeed("trending-bufos", struct { 542 - fn getFeed(ctx: *Context, params: GetFeedParams) !GetFeedResponse { 543 - // return skeleton 544 - } 545 - }.getFeed); 546 - 547 - try server.listen(8080); 548 - ``` 549 - 550 - --- 551 - 552 - ## summary 553 - 554 - the core theme: **let us write application logic, not protocol plumbing**. 
555 - 556 - right now building an atproto app in zig means: 557 - - manual json construction/parsing everywhere 558 - - hand-rolling authentication flows 559 - - string interpolation for record creation 560 - - manual http request management 561 - - third-party websocket libraries for firehose 562 - - no compile-time safety for lexicon types 563 - 564 - a good sdk would give us: 565 - - typed lexicon schemas (codegen) 566 - - managed sessions with automatic refresh 567 - - high-level record CRUD 568 - - built-in jetstream client with typed events 569 - - utilities for rich text, AT-URIs, DIDs 570 - - rate limiting and retry logic 571 - - testing helpers 572 - 573 - the dream is writing a bot like bufo-bot in ~100 lines instead of ~1000.
-1
justfile
··· 1 1 # bufo search justfile 2 - mod bot 3 2 4 3 # re-index all bufos with new embeddings 5 4 re-index:
-153
scripts/add_one_bufo.py
··· 1 - #!/usr/bin/env python3 2 - # /// script 3 - # requires-python = ">=3.11" 4 - # dependencies = [ 5 - # "httpx", 6 - # "python-dotenv", 7 - # "pillow", 8 - # ] 9 - # /// 10 - """ 11 - Add a single bufo to turbopuffer. 12 - Usage: uv run scripts/add_one_bufo.py <path_to_image> 13 - """ 14 - 15 - import asyncio 16 - import base64 17 - import hashlib 18 - import os 19 - import sys 20 - from io import BytesIO 21 - from pathlib import Path 22 - 23 - import httpx 24 - from PIL import Image 25 - from dotenv import load_dotenv 26 - 27 - load_dotenv(Path(__file__).parent.parent / ".env") 28 - 29 - 30 - async def embed_image(client: httpx.AsyncClient, image_path: Path, api_key: str) -> list[float] | None: 31 - """Generate embedding for an image using Voyage AI""" 32 - try: 33 - image = Image.open(image_path) 34 - is_animated = hasattr(image, 'n_frames') and image.n_frames > 1 35 - filename_text = image_path.stem.replace("-", " ").replace("_", " ") 36 - 37 - content = [{"type": "text", "text": filename_text}] 38 - 39 - if is_animated: 40 - num_frames = image.n_frames 41 - max_frames = min(5, num_frames) 42 - frame_indices = [int(i * (num_frames - 1) / (max_frames - 1)) for i in range(max_frames)] 43 - for frame_idx in frame_indices: 44 - image.seek(frame_idx) 45 - buffered = BytesIO() 46 - image.convert("RGB").save(buffered, format="WEBP", lossless=True) 47 - img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") 48 - content.append({ 49 - "type": "image_base64", 50 - "image_base64": f"data:image/webp;base64,{img_base64}", 51 - }) 52 - else: 53 - buffered = BytesIO() 54 - image.convert("RGB").save(buffered, format="WEBP", lossless=True) 55 - img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") 56 - content.append({ 57 - "type": "image_base64", 58 - "image_base64": f"data:image/webp;base64,{img_base64}", 59 - }) 60 - 61 - response = await client.post( 62 - "https://api.voyageai.com/v1/multimodalembeddings", 63 - headers={ 64 - "Authorization": f"Bearer {api_key}", 65 - "Content-Type": "application/json", 66 - }, 67 - json={ 68 - "inputs": [{"content": content}], 69 - "model": "voyage-multimodal-3", 70 - "input_type": "document", 71 - }, 72 - timeout=60.0, 73 - ) 74 - response.raise_for_status() 75 - result = response.json() 76 - return result["data"][0]["embedding"] 77 - except Exception as e: 78 - print(f"error embedding {image_path.name}: {e}") 79 - return None 80 - 81 - 82 - async def upload_to_turbopuffer(filename: str, embedding: list[float], api_key: str, namespace: str): 83 - """Upload single embedding to turbopuffer""" 84 - file_hash = hashlib.sha256(filename.encode()).hexdigest()[:16] 85 - name = filename.rsplit(".", 1)[0] 86 - url = f"https://find-bufo.com/static/{filename}" 87 - 88 - async with httpx.AsyncClient() as client: 89 - response = await client.post( 90 - f"https://api.turbopuffer.com/v1/vectors/{namespace}", 91 - headers={ 92 - "Authorization": f"Bearer {api_key}", 93 - "Content-Type": "application/json", 94 - }, 95 - json={ 96 - "ids": [file_hash], 97 - "vectors": [embedding], 98 - "distance_metric": "cosine_distance", 99 - "attributes": { 100 - "url": [url], 101 - "name": [name], 102 - "filename": [filename], 103 - }, 104 - "schema": { 105 - "name": {"type": "string", "full_text_search": True}, 106 - "filename": {"type": "string", "full_text_search": True}, 107 - }, 108 - }, 109 - timeout=30.0, 110 - ) 111 - if response.status_code != 200: 112 - print(f"turbopuffer error: {response.text}") 113 - response.raise_for_status() 114 - 115 - 
print(f"uploaded {filename} to turbopuffer") 116 - 117 - 118 - async def main(): 119 - if len(sys.argv) < 2: 120 - print("usage: uv run scripts/add_one_bufo.py <path_to_image>") 121 - sys.exit(1) 122 - 123 - image_path = Path(sys.argv[1]) 124 - if not image_path.exists(): 125 - print(f"file not found: {image_path}") 126 - sys.exit(1) 127 - 128 - voyage_api_key = os.getenv("VOYAGE_API_TOKEN") 129 - if not voyage_api_key: 130 - print("VOYAGE_API_TOKEN not set") 131 - sys.exit(1) 132 - 133 - tpuf_api_key = os.getenv("TURBOPUFFER_API_KEY") 134 - if not tpuf_api_key: 135 - print("TURBOPUFFER_API_KEY not set") 136 - sys.exit(1) 137 - 138 - tpuf_namespace = os.getenv("TURBOPUFFER_NAMESPACE", "bufos") 139 - 140 - print(f"adding {image_path.name}...") 141 - 142 - async with httpx.AsyncClient() as client: 143 - embedding = await embed_image(client, image_path, voyage_api_key) 144 - if not embedding: 145 - print("failed to generate embedding") 146 - sys.exit(1) 147 - 148 - await upload_to_turbopuffer(image_path.name, embedding, tpuf_api_key, tpuf_namespace) 149 - print("done!") 150 - 151 - 152 - if __name__ == "__main__": 153 - asyncio.run(main())
+20 -35
src/embedding.rs
··· 1 - //! voyage AI embedding implementation 2 - //! 3 - //! implements the `Embedder` trait for voyage's multimodal-3 model. 4 - 5 - use crate::providers::{Embedder, EmbeddingError}; 1 + use anyhow::{Context, Result}; 6 2 use reqwest::Client; 7 3 use serde::{Deserialize, Serialize}; 8 4 9 - const VOYAGE_API_URL: &str = "https://api.voyageai.com/v1/multimodalembeddings"; 10 - const VOYAGE_MODEL: &str = "voyage-multimodal-3"; 11 - 12 5 #[derive(Debug, Serialize)] 13 - struct VoyageRequest { 6 + struct VoyageEmbeddingRequest { 14 7 inputs: Vec<MultimodalInput>, 15 8 model: String, 16 9 #[serde(skip_serializing_if = "Option::is_none")] ··· 29 22 } 30 23 31 24 #[derive(Debug, Deserialize)] 32 - struct VoyageResponse { 25 + struct VoyageEmbeddingResponse { 33 26 data: Vec<VoyageEmbeddingData>, 34 27 } 35 28 ··· 38 31 embedding: Vec<f32>, 39 32 } 40 33 41 - /// voyage AI multimodal embedding client 42 - /// 43 - /// uses the voyage-multimodal-3 model which produces 1024-dimensional vectors. 44 - /// designed for early fusion of text and image content. 45 - #[derive(Clone)] 46 - pub struct VoyageEmbedder { 34 + pub struct EmbeddingClient { 47 35 client: Client, 48 36 api_key: String, 49 37 } 50 38 51 - impl VoyageEmbedder { 39 + impl EmbeddingClient { 52 40 pub fn new(api_key: String) -> Self { 53 41 Self { 54 42 client: Client::new(), 55 43 api_key, 56 44 } 57 45 } 58 - } 59 46 60 - impl Embedder for VoyageEmbedder { 61 - async fn embed(&self, text: &str) -> Result<Vec<f32>, EmbeddingError> { 62 - let request = VoyageRequest { 47 + pub async fn embed_text(&self, text: &str) -> Result<Vec<f32>> { 48 + let request = VoyageEmbeddingRequest { 63 49 inputs: vec![MultimodalInput { 64 50 content: vec![ContentSegment::Text { 65 51 text: text.to_string(), 66 52 }], 67 53 }], 68 - model: VOYAGE_MODEL.to_string(), 54 + model: "voyage-multimodal-3".to_string(), 69 55 input_type: Some("query".to_string()), 70 56 }; 71 57 72 58 let response = self 73 59 .client 74 - .post(VOYAGE_API_URL) 60 + .post("https://api.voyageai.com/v1/multimodalembeddings") 75 61 .header("Authorization", format!("Bearer {}", self.api_key)) 76 62 .json(&request) 77 63 .send() 78 - .await?; 64 + .await 65 + .context("failed to send embedding request")?; 79 66 80 67 if !response.status().is_success() { 81 - let status = response.status().as_u16(); 68 + let status = response.status(); 82 69 let body = response.text().await.unwrap_or_default(); 83 - return Err(EmbeddingError::Api { status, body }); 70 + anyhow::bail!("voyage api error ({}): {}", status, body); 84 71 } 85 72 86 - let voyage_response: VoyageResponse = response.json().await.map_err(|e| { 87 - EmbeddingError::Other(anyhow::anyhow!("failed to parse response: {}", e)) 88 - })?; 73 + let embedding_response: VoyageEmbeddingResponse = response 74 + .json() 75 + .await 76 + .context("failed to parse embedding response")?; 89 77 90 - voyage_response 78 + let embedding = embedding_response 91 79 .data 92 80 .into_iter() 93 81 .next() 94 82 .map(|d| d.embedding) 95 - .ok_or(EmbeddingError::EmptyResponse) 96 - } 83 + .context("no embedding returned")?; 97 84 98 - fn name(&self) -> &'static str { 99 - "voyage-multimodal-3" 85 + Ok(embedding) 100 86 } 101 87 } 102 -
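for context, a minimal sketch of how the simplified `EmbeddingClient` above is meant to be called from the search path; the `embed_query` helper and its parameter names are illustrative, not code from this diff:

```rust
use anyhow::Result;

use crate::embedding::EmbeddingClient;

// illustrative helper (not in the diff): embed a user query with the
// voyage-multimodal-3 model before handing the vector to turbopuffer
async fn embed_query(api_key: &str, query: &str) -> Result<Vec<f32>> {
    let client = EmbeddingClient::new(api_key.to_string());
    // embed_text sends input_type "query" and returns a single embedding vector
    client.embed_text(query).await
}
```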
-193
src/filter.rs
··· 1 - //! composable result filters 2 - //! 3 - //! filters are predicates that can be combined to create complex filtering logic. 4 - 5 - use regex::Regex; 6 - 7 - /// a single search result that can be filtered 8 - pub trait Filterable { 9 - fn name(&self) -> &str; 10 - } 11 - 12 - /// a predicate that can accept or reject items 13 - pub trait Filter<T: Filterable>: Send + Sync { 14 - /// returns true if the item should be kept 15 - fn matches(&self, item: &T) -> bool; 16 - } 17 - 18 - /// filters out inappropriate content based on a blocklist 19 - struct BlocklistFilter { 20 - blocklist: Vec<&'static str>, 21 - } 22 - 23 - impl BlocklistFilter { 24 - fn inappropriate_bufos() -> Self { 25 - Self { 26 - blocklist: vec![ 27 - "bufo-juicy", 28 - "good-news-bufo-offers-suppository", 29 - "bufo-declines-your-suppository-offer", 30 - "tsa-bufo-gropes-you", 31 - ], 32 - } 33 - } 34 - } 35 - 36 - impl<T: Filterable> Filter<T> for BlocklistFilter { 37 - fn matches(&self, item: &T) -> bool { 38 - !self.blocklist.iter().any(|blocked| item.name().contains(blocked)) 39 - } 40 - } 41 - 42 - /// filters out items matching any of the given regex patterns 43 - struct ExcludePatternFilter { 44 - patterns: Vec<Regex>, 45 - } 46 - 47 - impl ExcludePatternFilter { 48 - fn from_comma_separated(pattern_str: &str) -> Self { 49 - let patterns = pattern_str 50 - .split(',') 51 - .map(|p| p.trim()) 52 - .filter(|p| !p.is_empty()) 53 - .filter_map(|p| Regex::new(p).ok()) 54 - .collect(); 55 - 56 - Self { patterns } 57 - } 58 - 59 - fn empty() -> Self { 60 - Self { patterns: vec![] } 61 - } 62 - } 63 - 64 - impl<T: Filterable> Filter<T> for ExcludePatternFilter { 65 - fn matches(&self, item: &T) -> bool { 66 - !self.patterns.iter().any(|p| p.is_match(item.name())) 67 - } 68 - } 69 - 70 - /// combined filter that handles family-friendly mode and include/exclude patterns 71 - pub struct ContentFilter { 72 - family_friendly: bool, 73 - blocklist: BlocklistFilter, 74 - exclude: ExcludePatternFilter, 75 - include_patterns: Vec<Regex>, 76 - } 77 - 78 - impl ContentFilter { 79 - pub fn new( 80 - family_friendly: bool, 81 - exclude_str: Option<&str>, 82 - include_str: Option<&str>, 83 - ) -> Self { 84 - let exclude = exclude_str 85 - .map(ExcludePatternFilter::from_comma_separated) 86 - .unwrap_or_else(ExcludePatternFilter::empty); 87 - 88 - let include_patterns: Vec<Regex> = include_str 89 - .map(|s| { 90 - s.split(',') 91 - .map(|p| p.trim()) 92 - .filter(|p| !p.is_empty()) 93 - .filter_map(|p| Regex::new(p).ok()) 94 - .collect() 95 - }) 96 - .unwrap_or_default(); 97 - 98 - Self { 99 - family_friendly, 100 - blocklist: BlocklistFilter::inappropriate_bufos(), 101 - exclude, 102 - include_patterns, 103 - } 104 - } 105 - 106 - pub fn exclude_pattern_count(&self) -> usize { 107 - self.exclude.patterns.len() 108 - } 109 - 110 - pub fn exclude_patterns_str(&self) -> String { 111 - self.exclude 112 - .patterns 113 - .iter() 114 - .map(|r| r.as_str()) 115 - .collect::<Vec<_>>() 116 - .join(",") 117 - } 118 - } 119 - 120 - impl<T: Filterable> Filter<T> for ContentFilter { 121 - fn matches(&self, item: &T) -> bool { 122 - // check family-friendly blocklist 123 - if self.family_friendly && !self.blocklist.matches(item) { 124 - return false; 125 - } 126 - 127 - // check if explicitly included (overrides exclude) 128 - let matches_include = self.include_patterns.iter().any(|p| p.is_match(item.name())); 129 - if matches_include { 130 - return true; 131 - } 132 - 133 - // check exclude patterns 134 - self.exclude.matches(item) 135 - } 
136 - } 137 - 138 - #[cfg(test)] 139 - mod tests { 140 - use super::*; 141 - 142 - struct TestItem { 143 - name: String, 144 - } 145 - 146 - impl Filterable for TestItem { 147 - fn name(&self) -> &str { 148 - &self.name 149 - } 150 - } 151 - 152 - #[test] 153 - fn test_blocklist_filter() { 154 - let filter = BlocklistFilter::inappropriate_bufos(); 155 - let good = TestItem { 156 - name: "bufo-happy".into(), 157 - }; 158 - let bad = TestItem { 159 - name: "bufo-juicy".into(), 160 - }; 161 - 162 - assert!(filter.matches(&good)); 163 - assert!(!filter.matches(&bad)); 164 - } 165 - 166 - #[test] 167 - fn test_exclude_pattern_filter() { 168 - let filter = ExcludePatternFilter::from_comma_separated("test, draft"); 169 - let good = TestItem { 170 - name: "bufo-happy".into(), 171 - }; 172 - let bad = TestItem { 173 - name: "bufo-test-mode".into(), 174 - }; 175 - 176 - assert!(filter.matches(&good)); 177 - assert!(!filter.matches(&bad)); 178 - } 179 - 180 - #[test] 181 - fn test_include_overrides_exclude() { 182 - let filter = ContentFilter::new(false, Some("party"), Some("birthday-party")); 183 - let excluded = TestItem { 184 - name: "bufo-party".into(), 185 - }; 186 - let included = TestItem { 187 - name: "bufo-birthday-party".into(), 188 - }; 189 - 190 - assert!(!filter.matches(&excluded)); 191 - assert!(filter.matches(&included)); 192 - } 193 - }
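a small sketch of how the removed filter composed with results; `apply_content_filter` is illustrative (in the old `src/search.rs`, `BufoResult` implemented `Filterable` and the filtering happened inline):

```rust
use crate::filter::{ContentFilter, Filter, Filterable};

// illustrative only: keep items the combined filter accepts
// (family-friendly blocklist, then include patterns overriding excludes)
fn apply_content_filter<T: Filterable>(filter: &ContentFilter, items: Vec<T>) -> Vec<T> {
    items
        .into_iter()
        .filter(|item| filter.matches(item))
        .collect()
}
```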
-3
src/main.rs
··· 1 1 mod config; 2 2 mod embedding; 3 - mod filter; 4 - mod providers; 5 - mod scoring; 6 3 mod search; 7 4 mod turbopuffer; 8 5
-99
src/providers.rs
··· 1 - //! provider abstractions for embedding and vector search backends 2 - //! 3 - //! these traits allow swapping implementations (e.g., voyage โ†’ openai embeddings) 4 - //! without changing the search logic. 5 - //! 6 - //! ## design notes 7 - //! 8 - //! we use `async fn` in traits directly (stabilized in rust 1.75). for this crate's 9 - //! use case (single-threaded actix-web), the Send bound issue doesn't apply. 10 - //! 11 - //! the trait design follows patterns from: 12 - //! - async-openai's `Config` trait for backend abstraction 13 - //! - tower's `Service` trait for composability (though simpler here) 14 - 15 - use std::future::Future; 16 - use thiserror::Error; 17 - 18 - /// errors that can occur when generating embeddings 19 - #[derive(Debug, Error)] 20 - pub enum EmbeddingError { 21 - #[error("failed to send request: {0}")] 22 - Request(#[from] reqwest::Error), 23 - 24 - #[error("api error ({status}): {body}")] 25 - Api { status: u16, body: String }, 26 - 27 - #[error("no embedding returned from provider")] 28 - EmptyResponse, 29 - 30 - #[error("{0}")] 31 - Other(#[from] anyhow::Error), 32 - } 33 - 34 - /// a provider that can generate embeddings for text 35 - /// 36 - /// implementations should be cheap to clone (wrap expensive resources in Arc). 37 - /// 38 - /// # example 39 - /// 40 - /// ```ignore 41 - /// let client = VoyageEmbedder::new(api_key); 42 - /// let embedding = client.embed("hello world").await?; 43 - /// ``` 44 - pub trait Embedder: Send + Sync { 45 - /// generate an embedding vector for the given text 46 - fn embed(&self, text: &str) -> impl Future<Output = Result<Vec<f32>, EmbeddingError>> + Send; 47 - 48 - /// human-readable name for logging/debugging 49 - fn name(&self) -> &'static str; 50 - } 51 - 52 - /// errors that can occur during vector search 53 - #[derive(Debug, Error)] 54 - pub enum VectorSearchError { 55 - #[error("request failed: {0}")] 56 - Request(#[from] reqwest::Error), 57 - 58 - #[error("api error ({status}): {body}")] 59 - Api { status: u16, body: String }, 60 - 61 - #[error("query too long: {message}")] 62 - QueryTooLong { message: String }, 63 - 64 - #[error("parse error: {0}")] 65 - Parse(String), 66 - 67 - #[error("{0}")] 68 - Other(#[from] anyhow::Error), 69 - } 70 - 71 - /// a single result from a vector search 72 - #[derive(Debug, Clone)] 73 - pub struct SearchResult { 74 - pub id: String, 75 - /// raw distance/score from the backend (interpretation varies by method) 76 - pub score: f32, 77 - /// arbitrary key-value attributes 78 - pub attributes: std::collections::HashMap<String, String>, 79 - } 80 - 81 - /// a provider that can perform vector similarity search 82 - pub trait VectorStore: Send + Sync { 83 - /// search by vector embedding (ANN/cosine similarity) 84 - fn search_by_vector( 85 - &self, 86 - embedding: &[f32], 87 - top_k: usize, 88 - ) -> impl Future<Output = Result<Vec<SearchResult>, VectorSearchError>> + Send; 89 - 90 - /// search by keyword (BM25 full-text search) 91 - fn search_by_keyword( 92 - &self, 93 - query: &str, 94 - top_k: usize, 95 - ) -> impl Future<Output = Result<Vec<SearchResult>, VectorSearchError>> + Send; 96 - 97 - /// human-readable name for logging/debugging 98 - fn name(&self) -> &'static str; 99 - }
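a short sketch of what the removed traits were for: search code written generically against `Embedder`/`VectorStore`, so a backend swap (the module's stated goal) would not touch it. the `gather_candidates` helper is illustrative, not code from the repo:

```rust
use crate::providers::{Embedder, SearchResult, VectorSearchError, VectorStore};

// illustrative only: fetch semantic + keyword candidates through the traits,
// never naming voyage or turbopuffer directly
async fn gather_candidates<E: Embedder, V: VectorStore>(
    embedder: &E,
    store: &V,
    query: &str,
    top_k: usize,
) -> Result<(Vec<SearchResult>, Vec<SearchResult>), VectorSearchError> {
    let embedding = embedder
        .embed(query)
        .await
        .map_err(|e| VectorSearchError::Other(anyhow::anyhow!("embedding failed: {}", e)))?;
    let semantic = store.search_by_vector(&embedding, top_k).await?;
    let keyword = store.search_by_keyword(query, top_k).await?;
    Ok((semantic, keyword))
}
```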
-164
src/scoring.rs
··· 1 - //! score fusion and normalization for hybrid search 2 - //! 3 - //! this module handles the weighted combination of semantic (vector) and 4 - //! keyword (BM25) search scores. 5 - //! 6 - //! ## normalization strategies 7 - //! 8 - //! - **cosine distance โ†’ similarity**: `1.0 - (distance / 2.0)` maps [0, 2] โ†’ [1, 0] 9 - //! - **BM25 max-scaling**: divide by max score so top result = 1.0 10 - //! 11 - //! ## fusion formula 12 - //! 13 - //! ```text 14 - //! score = ฮฑ * semantic + (1 - ฮฑ) * keyword 15 - //! ``` 16 - //! 17 - //! reference: https://opensourceconnections.com/blog/2023/02/27/hybrid-vigor-winning-at-hybrid-search/ 18 - 19 - use std::collections::HashMap; 20 - 21 - /// configuration for score fusion 22 - #[derive(Debug, Clone)] 23 - pub struct FusionConfig { 24 - /// weight for semantic scores (0.0 = pure keyword, 1.0 = pure semantic) 25 - pub alpha: f32, 26 - /// minimum fused score to include in results (filters noise) 27 - pub min_score: f32, 28 - } 29 - 30 - impl Default for FusionConfig { 31 - fn default() -> Self { 32 - Self { 33 - alpha: 0.7, 34 - min_score: 0.001, 35 - } 36 - } 37 - } 38 - 39 - impl FusionConfig { 40 - pub fn new(alpha: f32) -> Self { 41 - Self { 42 - alpha, 43 - ..Default::default() 44 - } 45 - } 46 - } 47 - 48 - /// normalize cosine distance to similarity score 49 - /// 50 - /// cosine distance ranges from 0 (identical) to 2 (opposite). 51 - /// we convert to similarity: 1.0 (identical) to 0.0 (opposite). 52 - #[inline] 53 - pub fn cosine_distance_to_similarity(distance: f32) -> f32 { 54 - 1.0 - (distance / 2.0) 55 - } 56 - 57 - /// normalize BM25 scores using max-scaling 58 - /// 59 - /// divides all scores by the maximum score, ensuring: 60 - /// - top result gets score 1.0 61 - /// - relative spacing is preserved 62 - /// - handles edge cases (empty results, identical scores) 63 - pub fn normalize_bm25_scores(scores: &[(String, f32)]) -> HashMap<String, f32> { 64 - let max_score = scores 65 - .iter() 66 - .map(|(_, s)| *s) 67 - .fold(f32::NEG_INFINITY, f32::max) 68 - .max(0.001); // avoid division by zero 69 - 70 - scores 71 - .iter() 72 - .map(|(id, score)| (id.clone(), (score / max_score).min(1.0))) 73 - .collect() 74 - } 75 - 76 - /// fuse semantic and keyword scores using weighted combination 77 - /// 78 - /// returns items sorted by fused score (descending), filtered by min_score. 
79 - pub fn fuse_scores( 80 - semantic_scores: &HashMap<String, f32>, 81 - keyword_scores: &HashMap<String, f32>, 82 - config: &FusionConfig, 83 - ) -> Vec<(String, f32)> { 84 - // collect all unique IDs 85 - let all_ids: std::collections::HashSet<_> = semantic_scores 86 - .keys() 87 - .chain(keyword_scores.keys()) 88 - .collect(); 89 - 90 - let mut fused: Vec<(String, f32)> = all_ids 91 - .into_iter() 92 - .map(|id| { 93 - let semantic = semantic_scores.get(id).copied().unwrap_or(0.0); 94 - let keyword = keyword_scores.get(id).copied().unwrap_or(0.0); 95 - let score = config.alpha * semantic + (1.0 - config.alpha) * keyword; 96 - (id.clone(), score) 97 - }) 98 - .filter(|(_, score)| *score > config.min_score) 99 - .collect(); 100 - 101 - // sort descending by score 102 - fused.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); 103 - 104 - fused 105 - } 106 - 107 - #[cfg(test)] 108 - mod tests { 109 - use super::*; 110 - 111 - #[test] 112 - fn test_cosine_distance_to_similarity() { 113 - assert!((cosine_distance_to_similarity(0.0) - 1.0).abs() < 0.001); 114 - assert!((cosine_distance_to_similarity(2.0) - 0.0).abs() < 0.001); 115 - assert!((cosine_distance_to_similarity(1.0) - 0.5).abs() < 0.001); 116 - } 117 - 118 - #[test] 119 - fn test_normalize_bm25_scores() { 120 - let scores = vec![ 121 - ("a".to_string(), 10.0), 122 - ("b".to_string(), 5.0), 123 - ("c".to_string(), 2.5), 124 - ]; 125 - 126 - let normalized = normalize_bm25_scores(&scores); 127 - 128 - assert!((normalized["a"] - 1.0).abs() < 0.001); 129 - assert!((normalized["b"] - 0.5).abs() < 0.001); 130 - assert!((normalized["c"] - 0.25).abs() < 0.001); 131 - } 132 - 133 - #[test] 134 - fn test_fuse_scores_pure_semantic() { 135 - let mut semantic = HashMap::new(); 136 - semantic.insert("a".to_string(), 0.9); 137 - semantic.insert("b".to_string(), 0.5); 138 - 139 - let mut keyword = HashMap::new(); 140 - keyword.insert("a".to_string(), 0.1); 141 - keyword.insert("c".to_string(), 1.0); 142 - 143 - let config = FusionConfig::new(1.0); // pure semantic 144 - let fused = fuse_scores(&semantic, &keyword, &config); 145 - 146 - assert_eq!(fused[0].0, "a"); 147 - assert!((fused[0].1 - 0.9).abs() < 0.001); 148 - } 149 - 150 - #[test] 151 - fn test_fuse_scores_balanced() { 152 - let mut semantic = HashMap::new(); 153 - semantic.insert("a".to_string(), 0.8); 154 - 155 - let mut keyword = HashMap::new(); 156 - keyword.insert("a".to_string(), 0.4); 157 - 158 - let config = FusionConfig::new(0.5); // balanced 159 - let fused = fuse_scores(&semantic, &keyword, &config); 160 - 161 - // 0.5 * 0.8 + 0.5 * 0.4 = 0.6 162 - assert!((fused[0].1 - 0.6).abs() < 0.001); 163 - } 164 - }
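for reference, a worked instance of the fusion documented in the removed module (numbers are illustrative): a cosine distance of 0.4 and a BM25 score of 5.0 against a max of 10.0, fused with the default alpha of 0.7:

```latex
\text{semantic} = 1 - \frac{0.4}{2} = 0.8, \qquad
\text{keyword} = \frac{5.0}{10.0} = 0.5, \qquad
\text{score} = 0.7 \cdot 0.8 + (1 - 0.7) \cdot 0.5 = 0.71
```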
+218 -205
src/search.rs
··· 27 27 //! - `ฮฑ=0.5`: balanced (equal weight to semantic and keyword signals) 28 28 //! - `ฮฑ=0.0`: pure keyword (best for exact filename searches) 29 29 //! 30 + //! ## empirical behavior 31 + //! 32 + //! query: "happy", top_k=3 33 + //! - ฮฑ=1.0: ["proud-bufo-is-excited", "bufo-hehe", "bufo-excited"] (semantic similarity) 34 + //! - ฮฑ=0.5: ["bufo-is-happy-youre-happy", ...] (exact match rises to top) 35 + //! - ฮฑ=0.0: ["bufo-is-happy-youre-happy" (1.0), others (0.0)] (only exact matches score) 36 + //! 30 37 //! ## references 31 38 //! 32 39 //! - voyage multimodal embeddings: https://docs.voyageai.com/docs/multimodal-embeddings ··· 34 41 //! - weighted fusion: standard approach in modern hybrid search systems (2024) 35 42 36 43 use crate::config::Config; 37 - use crate::embedding::VoyageEmbedder; 38 - use crate::filter::{ContentFilter, Filter, Filterable}; 39 - use crate::providers::{Embedder, VectorSearchError, VectorStore}; 40 - use crate::scoring::{cosine_distance_to_similarity, fuse_scores, normalize_bm25_scores, FusionConfig}; 41 - use crate::turbopuffer::TurbopufferStore; 44 + use crate::embedding::EmbeddingClient; 45 + use crate::turbopuffer::{QueryRequest, TurbopufferClient, TurbopufferError}; 42 46 use actix_web::{web, HttpRequest, HttpResponse, Result as ActixResult}; 43 47 use serde::{Deserialize, Serialize}; 44 48 use std::collections::hash_map::DefaultHasher; 45 - use std::collections::HashMap; 46 49 use std::hash::{Hash, Hasher}; 47 50 48 51 #[derive(Debug, Deserialize)] ··· 57 60 /// family-friendly mode: filters out inappropriate content (default true) 58 61 #[serde(default = "default_family_friendly")] 59 62 pub family_friendly: bool, 60 - /// comma-separated regex patterns to exclude from results (e.g., "excited,party") 61 - #[serde(default)] 62 - pub exclude: Option<String>, 63 - /// comma-separated regex patterns to include (overrides exclude) 64 - #[serde(default)] 65 - pub include: Option<String>, 66 63 } 67 64 68 65 fn default_top_k() -> usize { ··· 77 74 true 78 75 } 79 76 77 + /// blocklist of inappropriate bufos (filtered when family_friendly=true) 78 + fn get_inappropriate_bufos() -> Vec<&'static str> { 79 + vec![ 80 + "bufo-juicy", 81 + "good-news-bufo-offers-suppository", 82 + "bufo-declines-your-suppository-offer", 83 + "tsa-bufo-gropes-you", 84 + ] 85 + } 86 + 80 87 #[derive(Debug, Serialize)] 81 88 pub struct SearchResponse { 82 89 pub results: Vec<BufoResult>, 83 90 } 84 91 85 - #[derive(Debug, Serialize, Clone)] 92 + #[derive(Debug, Serialize)] 86 93 pub struct BufoResult { 87 94 pub id: String, 88 95 pub url: String, 89 96 pub name: String, 90 - pub score: f32, 91 - } 92 - 93 - impl Filterable for BufoResult { 94 - fn name(&self) -> &str { 95 - &self.name 96 - } 97 - } 98 - 99 - /// errors that can occur during search 100 - #[derive(Debug, thiserror::Error)] 101 - pub enum SearchError { 102 - #[error("embedding error: {0}")] 103 - Embedding(#[from] crate::providers::EmbeddingError), 104 - 105 - #[error("vector search error: {0}")] 106 - VectorSearch(#[from] VectorSearchError), 107 - } 108 - 109 - impl SearchError { 110 - fn into_actix_error(self) -> actix_web::Error { 111 - match &self { 112 - SearchError::VectorSearch(VectorSearchError::QueryTooLong { .. }) => { 113 - actix_web::error::ErrorBadRequest( 114 - "search query is too long (max 1024 characters for text search). try a shorter query." 
115 - ) 116 - } 117 - _ => actix_web::error::ErrorInternalServerError(self.to_string()), 118 - } 119 - } 97 + pub score: f32, // normalized 0-1 score for display 120 98 } 121 99 122 100 /// generate etag for caching based on query parameters 123 - fn generate_etag( 124 - query: &str, 125 - top_k: usize, 126 - alpha: f32, 127 - family_friendly: bool, 128 - exclude: &Option<String>, 129 - include: &Option<String>, 130 - ) -> String { 101 + fn generate_etag(query: &str, top_k: usize, alpha: f32, family_friendly: bool) -> String { 131 102 let mut hasher = DefaultHasher::new(); 132 103 query.hash(&mut hasher); 133 104 top_k.hash(&mut hasher); 105 + // convert f32 to bits for consistent hashing 134 106 alpha.to_bits().hash(&mut hasher); 135 107 family_friendly.hash(&mut hasher); 136 - exclude.hash(&mut hasher); 137 - include.hash(&mut hasher); 138 108 format!("\"{}\"", hasher.finish()) 139 109 } 140 110 141 - /// execute hybrid search using the provided embedder and vector store 142 - async fn execute_hybrid_search<E: Embedder, V: VectorStore>( 143 - query: &str, 144 - top_k: usize, 145 - fusion_config: &FusionConfig, 146 - embedder: &E, 147 - vector_store: &V, 148 - ) -> Result<Vec<(String, f32, HashMap<String, String>)>, SearchError> { 149 - // fetch extra results to ensure we have enough after filtering 150 - let search_top_k = top_k * 5; 151 - let query_owned = query.to_string(); 111 + /// shared search implementation used by both POST and GET handlers 112 + async fn perform_search( 113 + query_text: String, 114 + top_k_val: usize, 115 + alpha: f32, 116 + family_friendly: bool, 117 + config: &Config, 118 + ) -> ActixResult<SearchResponse> { 152 119 153 - // generate query embedding 154 - let _embed_span = logfire::span!( 155 - "embedding.generate", 156 - query = &query_owned, 157 - model = embedder.name() 158 - ) 159 - .entered(); 120 + let _search_span = logfire::span!( 121 + "bufo_search", 122 + query = &query_text, 123 + top_k = top_k_val as i64, 124 + alpha = alpha as f64, 125 + family_friendly = family_friendly 126 + ).entered(); 160 127 161 - let query_embedding = embedder.embed(query).await?; 128 + logfire::info!( 129 + "search request received", 130 + query = &query_text, 131 + top_k = top_k_val as i64, 132 + alpha = alpha as f64 133 + ); 134 + 135 + let embedding_client = EmbeddingClient::new(config.voyage_api_key.clone()); 136 + let tpuf_client = TurbopufferClient::new( 137 + config.turbopuffer_api_key.clone(), 138 + config.turbopuffer_namespace.clone(), 139 + ); 140 + 141 + // generate embedding for user query 142 + let query_embedding = { 143 + let _span = logfire::span!( 144 + "voyage.embed_text", 145 + query = &query_text, 146 + model = "voyage-3-lite" 147 + ).entered(); 148 + 149 + embedding_client 150 + .embed_text(&query_text) 151 + .await 152 + .map_err(|e| { 153 + let error_msg = e.to_string(); 154 + logfire::error!( 155 + "embedding generation failed", 156 + error = error_msg, 157 + query = &query_text 158 + ); 159 + actix_web::error::ErrorInternalServerError(format!( 160 + "failed to generate embedding: {}", 161 + e 162 + )) 163 + })? 164 + }; 162 165 163 166 logfire::info!( 164 167 "embedding generated", 165 - query = &query_owned, 168 + query = &query_text, 166 169 embedding_dim = query_embedding.len() as i64 167 170 ); 168 171 169 - // run both searches in sequence (could parallelize with tokio::join! 
if needed)
170 -     let namespace = vector_store.name().to_string();
172 +     // run vector search (semantic)
173 +     let search_top_k = top_k_val * 2; // get more results for better fusion
174 +     let vector_request = QueryRequest {
175 +         rank_by: vec![
176 +             serde_json::json!("vector"),
177 +             serde_json::json!("ANN"),
178 +             serde_json::json!(query_embedding),
179 +         ],
180 +         top_k: search_top_k,
181 +         include_attributes: Some(vec!["url".to_string(), "name".to_string(), "filename".to_string()]),
182 +     };
171 183
184 +     let namespace = config.turbopuffer_namespace.clone();
172 185     let vector_results = {
173 186         let _span = logfire::span!(
174 187             "turbopuffer.vector_search",
175 -             query = &query_owned,
188 +             query = &query_text,
176 189             top_k = search_top_k as i64,
177 190             namespace = &namespace
178 -         )
179 -         .entered();
191 +         ).entered();
180 192
181 -         vector_store
182 -             .search_by_vector(&query_embedding, search_top_k)
183 -             .await?
193 +         tpuf_client.query(vector_request).await.map_err(|e| {
194 +             let error_msg = e.to_string();
195 +             logfire::error!(
196 +                 "vector search failed",
197 +                 error = error_msg,
198 +                 query = &query_text,
199 +                 top_k = search_top_k as i64
200 +             );
201 +             actix_web::error::ErrorInternalServerError(format!(
202 +                 "failed to query turbopuffer (vector): {}",
203 +                 e
204 +             ))
205 +         })?
184 206     };
185 207
186 208     logfire::info!(
187 209         "vector search completed",
188 -         query = &query_owned,
210 +         query = &query_text,
189 211         results_found = vector_results.len() as i64
190 212     );
191 213
214 +     // run BM25 text search (keyword)
192 215     let bm25_results = {
193 216         let _span = logfire::span!(
194 217             "turbopuffer.bm25_search",
195 -             query = &query_owned,
218 +             query = &query_text,
196 219             top_k = search_top_k as i64,
197 220             namespace = &namespace
198 -         )
199 -         .entered();
221 +         ).entered();
200 222
201 -         vector_store.search_by_keyword(query, search_top_k).await?
223 +         tpuf_client.bm25_query(&query_text, search_top_k).await.map_err(|e| {
224 +             let error_msg = e.to_string();
225 +             logfire::error!(
226 +                 "bm25 search failed",
227 +                 error = error_msg,
228 +                 query = &query_text,
229 +                 top_k = search_top_k as i64
230 +             );
231 +
232 +             // return appropriate HTTP status based on error type
233 +             match e {
234 +                 TurbopufferError::QueryTooLong { .. } => {
235 +                     actix_web::error::ErrorBadRequest(
236 +                         "search query is too long (max 1024 characters for text search). try a shorter query."
237 +                     )
238 +                 }
239 +                 _ => {
240 +                     actix_web::error::ErrorInternalServerError(format!(
241 +                         "failed to query turbopuffer (BM25): {}",
242 +                         e
243 +                     ))
244 +                 }
245 +             }
246 +         })?
202 247     };
203 248
204 -     // normalize scores
205 -     let semantic_scores: HashMap<String, f32> = vector_results
206 -         .iter()
207 -         .map(|r| (r.id.clone(), cosine_distance_to_similarity(r.score)))
208 -         .collect();
249 +     // weighted fusion: combine vector and BM25 results
250 +     use std::collections::HashMap;
251 +
252 +     // normalize vector scores (cosine distance -> 0-1 similarity)
253 +     let mut semantic_scores: HashMap<String, f32> = HashMap::new();
254 +     for row in &vector_results {
255 +         let score = 1.0 - (row.dist / 2.0);
256 +         semantic_scores.insert(row.id.clone(), score);
257 +     }
209 258
210 -     let bm25_raw: Vec<(String, f32)> = bm25_results
211 -         .iter()
212 -         .map(|r| (r.id.clone(), r.score))
213 -         .collect();
214 -     let keyword_scores = normalize_bm25_scores(&bm25_raw);
259 +     // normalize BM25 scores using max normalization (BM25-max-scaled approach)
260 +     // this preserves relative spacing and handles edge cases (single result, similar scores)
261 +     // reference: https://opensourceconnections.com/blog/2023/02/27/hybrid-vigor-winning-at-hybrid-search/
262 +     let bm25_scores_vec: Vec<f32> = bm25_results.iter().map(|r| r.dist).collect();
263 +     let max_bm25 = bm25_scores_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max).max(0.001); // avoid division by zero
215 264
216 -     let max_bm25 = bm25_raw
217 -         .iter()
218 -         .map(|(_, s)| *s)
219 -         .fold(f32::NEG_INFINITY, f32::max);
265 +     let mut keyword_scores: HashMap<String, f32> = HashMap::new();
266 +     for row in &bm25_results {
267 +         // divide by max to ensure top result gets 1.0, others scale proportionally
268 +         let normalized_score = (row.dist / max_bm25).min(1.0);
269 +         keyword_scores.insert(row.id.clone(), normalized_score);
270 +     }
220 271
221 272     logfire::info!(
222 273         "bm25 search completed",
223 -         query = &query_owned,
274 +         query = &query_text,
224 275         results_found = bm25_results.len() as i64,
225 276         max_bm25 = max_bm25 as f64,
226 -         top_bm25_raw = bm25_raw.first().map(|(_, s)| *s).unwrap_or(0.0) as f64
227 -     );
228 -
229 -     // fuse scores
230 -     let fused = fuse_scores(&semantic_scores, &keyword_scores, fusion_config);
231 -
232 -     logfire::info!(
233 -         "weighted fusion completed",
234 -         total_candidates = (vector_results.len() + bm25_results.len()) as i64,
235 -         alpha = fusion_config.alpha as f64,
236 -         pre_filter_results = fused.len() as i64
277 +         top_bm25_raw = bm25_scores_vec.first().copied().unwrap_or(0.0) as f64,
278 +         top_bm25_normalized = keyword_scores.values().cloned().fold(f32::NEG_INFINITY, f32::max) as f64
237 279     );
238 280
239 -     // collect attributes from both result sets
240 -     let mut all_attributes: HashMap<String, HashMap<String, String>> = HashMap::new();
241 -     for result in vector_results.into_iter().chain(bm25_results.into_iter()) {
242 -         all_attributes
243 -             .entry(result.id.clone())
244 -             .or_insert(result.attributes);
281 +     // collect all unique results and compute weighted fusion scores
282 +     let mut all_results: HashMap<String, crate::turbopuffer::QueryRow> = HashMap::new();
283 +     for row in vector_results.into_iter().chain(bm25_results.into_iter()) {
284 +         all_results.entry(row.id.clone()).or_insert(row);
245 285     }
246 286
247 -     // return fused results with attributes
248 -     Ok(fused
249 -         .into_iter()
250 -         .map(|(id, score)| {
251 -             let attrs = all_attributes.remove(&id).unwrap_or_default();
252 -             (id, score, attrs)
287 +     let mut fused_scores: Vec<(String, f32)> = all_results
288 +         .keys()
289 +         .map(|id| {
290 +             let semantic = semantic_scores.get(id).copied().unwrap_or(0.0);
291 +             let keyword = keyword_scores.get(id).copied().unwrap_or(0.0);
292 +             let fused = alpha * semantic + (1.0 - alpha) * keyword;
293 +             (id.clone(), fused)
253 294         })
254 -         .collect())
255 - }
295 +         .collect();
256 296
257 - /// shared search implementation used by both POST and GET handlers
258 - async fn perform_search(
259 -     query_text: String,
260 -     top_k_val: usize,
261 -     alpha: f32,
262 -     family_friendly: bool,
263 -     exclude: Option<String>,
264 -     include: Option<String>,
265 -     config: &Config,
266 - ) -> ActixResult<SearchResponse> {
267 -     let content_filter = ContentFilter::new(
268 -         family_friendly,
269 -         exclude.as_deref(),
270 -         include.as_deref(),
271 -     );
297 +     // filter out zero-scored results (irrelevant matches from the other search method)
298 +     // this prevents vector-only results from appearing when alpha=0.0 (pure keyword)
299 +     // and keyword-only results from appearing when alpha=1.0 (pure semantic)
300 +     fused_scores.retain(|(_, score)| *score > 0.001);
272 301
273 -     let _search_span = logfire::span!(
274 -         "bufo_search",
275 -         query = &query_text,
276 -         top_k = top_k_val as i64,
277 -         alpha = alpha as f64,
278 -         family_friendly = family_friendly,
279 -         exclude_patterns_count = content_filter.exclude_pattern_count() as i64
280 -     )
281 -     .entered();
302 +     // sort by fused score (descending) and take top_k
303 +     fused_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
304 +     fused_scores.truncate(top_k_val);
282 305
283 306     logfire::info!(
284 -         "search request received",
285 -         query = &query_text,
286 -         top_k = top_k_val as i64,
307 +         "weighted fusion completed",
308 +         total_candidates = all_results.len() as i64,
287 309         alpha = alpha as f64,
288 -         exclude_patterns = &content_filter.exclude_patterns_str()
310 +         final_results = fused_scores.len() as i64
289 311     );
290 312
291 -     // create clients
292 -     let embedder = VoyageEmbedder::new(config.voyage_api_key.clone());
293 -     let vector_store = TurbopufferStore::new(
294 -         config.turbopuffer_api_key.clone(),
295 -         config.turbopuffer_namespace.clone(),
296 -     );
313 +     // convert to bufo results
314 +     let inappropriate_bufos = get_inappropriate_bufos();
315 +     let results: Vec<BufoResult> = fused_scores
316 +         .into_iter()
317 +         .filter_map(|(id, score)| {
318 +             all_results.get(&id).map(|row| {
319 +                 let url = row
320 +                     .attributes
321 +                     .get("url")
322 +                     .and_then(|v| v.as_str())
323 +                     .unwrap_or("")
324 +                     .to_string();
297 325
298 -     let fusion_config = FusionConfig::new(alpha);
326 +                 let name = row
327 +                     .attributes
328 +                     .get("name")
329 +                     .and_then(|v| v.as_str())
330 +                     .unwrap_or(&row.id)
331 +                     .to_string();
299 332
300 -     // execute hybrid search
301 -     let fused_results = execute_hybrid_search(
302 -         &query_text,
303 -         top_k_val,
304 -         &fusion_config,
305 -         &embedder,
306 -         &vector_store,
307 -     )
308 -     .await
309 -     .map_err(|e| e.into_actix_error())?;
310 -
311 -     // convert to BufoResults and apply filtering
312 -     let results: Vec<BufoResult> = fused_results
313 -         .into_iter()
314 -         .map(|(id, score, attrs)| BufoResult {
315 -             id: id.clone(),
316 -             url: attrs.get("url").cloned().unwrap_or_default(),
317 -             name: attrs.get("name").cloned().unwrap_or_else(|| id.clone()),
318 -             score,
333 +                 BufoResult {
334 +                     id: row.id.clone(),
335 +                     url,
336 +                     name,
337 +                     score,
338 +                 }
339 +             })
340 +         })
341 +         .filter(|result| {
342 +             // filter out inappropriate bufos if family_friendly mode is enabled
343 +             if family_friendly {
344 +                 !inappropriate_bufos.iter().any(|&blocked| result.name.contains(blocked))
345 +             } else {
346 +                 true
347 +             }
319 348         })
320 -         .filter(|result| content_filter.matches(result))
321 -         .take(top_k_val)
322 349         .collect();
323 350
324 351     let results_count = results.len() as i64;
325 -     let top_result_name = results
326 -         .first()
327 -         .map(|r| r.name.clone())
328 -         .unwrap_or_else(|| "none".to_string());
352 +     let top_result_name = results.first().map(|r| r.name.clone()).unwrap_or_else(|| "none".to_string());
329 353     let top_score_val = results.first().map(|r| r.score as f64).unwrap_or(0.0);
330 354     let avg_score_val = if !results.is_empty() {
331 355         results.iter().map(|r| r.score as f64).sum::<f64>() / results.len() as f64
···
355 379         query.top_k,
356 380         query.alpha,
357 381         query.family_friendly,
358 -         query.exclude.clone(),
359 -         query.include.clone(),
360 -         &config,
361 -     )
362 -     .await?;
382 +         &config
383 +     ).await?;
363 384     Ok(HttpResponse::Ok().json(response))
364 385 }
365 386
···
369 390     config: web::Data<Config>,
370 391     req: HttpRequest,
371 392 ) -> ActixResult<HttpResponse> {
372 -     let etag = generate_etag(
373 -         &query.query,
374 -         query.top_k,
375 -         query.alpha,
376 -         query.family_friendly,
377 -         &query.exclude,
378 -         &query.include,
379 -     );
393 +     // generate etag for caching
394 +     let etag = generate_etag(&query.query, query.top_k, query.alpha, query.family_friendly);
380 395
396 +     // check if client has cached version
381 397     if let Some(if_none_match) = req.headers().get("if-none-match") {
382 398         if if_none_match.to_str().unwrap_or("") == etag {
383 399             return Ok(HttpResponse::NotModified()
···
391 407         query.top_k,
392 408         query.alpha,
393 409         query.family_friendly,
394 -         query.exclude.clone(),
395 -         query.include.clone(),
396 -         &config,
397 -     )
398 -     .await?;
410 +         &config
411 +     ).await?;
399 412
400 413     Ok(HttpResponse::Ok()
401 414         .insert_header(("etag", etag.clone()))
402 -         .insert_header(("cache-control", "public, max-age=300"))
415 +         .insert_header(("cache-control", "public, max-age=300")) // cache for 5 minutes
403 416         .json(response))
404 417 }
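Review note: the fusion step above can be read in isolation. The sketch below is hypothetical (the free function `fuse_scores_sketch` and its tuple inputs are stand-ins for the handler's `QueryRow` lists, not code from this change), but it mirrors the same assumptions: vector hits carry a cosine distance, keyword hits carry a raw BM25 score, BM25 is max-scaled, and `alpha` blends the two.

```rust
use std::collections::{HashMap, HashSet};

/// hypothetical mirror of the inline fusion above.
/// `vector_hits` holds (id, cosine distance); `keyword_hits` holds (id, raw BM25 score).
fn fuse_scores_sketch(
    vector_hits: &[(String, f32)],
    keyword_hits: &[(String, f32)],
    alpha: f32,
    top_k: usize,
) -> Vec<(String, f32)> {
    // cosine distance (0..2) -> similarity (0..1)
    let semantic: HashMap<&str, f32> = vector_hits
        .iter()
        .map(|(id, dist)| (id.as_str(), 1.0 - dist / 2.0))
        .collect();

    // BM25-max scaling: the top keyword hit maps to 1.0, the rest scale proportionally
    let max_bm25 = keyword_hits
        .iter()
        .map(|(_, s)| *s)
        .fold(f32::NEG_INFINITY, f32::max)
        .max(0.001); // avoid dividing by zero when there are no keyword hits
    let keyword: HashMap<&str, f32> = keyword_hits
        .iter()
        .map(|(id, s)| (id.as_str(), (s / max_bm25).min(1.0)))
        .collect();

    // union the candidates, blend by alpha, drop effectively zero scores
    let ids: HashSet<&str> = semantic.keys().chain(keyword.keys()).copied().collect();
    let mut fused: Vec<(String, f32)> = ids
        .into_iter()
        .map(|id| {
            let s = semantic.get(id).copied().unwrap_or(0.0);
            let k = keyword.get(id).copied().unwrap_or(0.0);
            (id.to_string(), alpha * s + (1.0 - alpha) * k)
        })
        .filter(|(_, score)| *score > 0.001)
        .collect();

    fused.sort_by(|a, b| b.1.total_cmp(&a.1));
    fused.truncate(top_k);
    fused
}
```

Because the best keyword hit always lands at 1.0 after max-scaling, `alpha` directly controls the trade-off: 1.0 is pure semantic, 0.0 is pure keyword, and the `> 0.001` cutoff drops candidates that only appeared in the de-weighted list.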
+92 -100
src/turbopuffer.rs
···
1 - //! turbopuffer vector database implementation
2 - //!
3 - //! implements the `VectorStore` trait for turbopuffer's hybrid search API.
4 -
5 - use crate::providers::{SearchResult, VectorSearchError, VectorStore};
1 + use anyhow::{Context, Result};
6 2 use reqwest::Client;
7 3 use serde::{Deserialize, Serialize};
8 -
9 - const TURBOPUFFER_API_BASE: &str = "https://api.turbopuffer.com/v1/vectors";
10 -
11 - /// raw response row from turbopuffer API
12 - #[derive(Debug, Deserialize, Serialize, Clone)]
13 - pub struct QueryRow {
14 -     pub id: String,
15 -     pub dist: f32,
16 -     pub attributes: serde_json::Map<String, serde_json::Value>,
17 - }
18 -
19 - impl From<QueryRow> for SearchResult {
20 -     fn from(row: QueryRow) -> Self {
21 -         let attributes = row
22 -             .attributes
23 -             .iter()
24 -             .filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string())))
25 -             .collect();
4 + use thiserror::Error;
26 5
27 -         SearchResult {
28 -             id: row.id,
29 -             score: row.dist,
30 -             attributes,
31 -         }
32 -     }
6 + #[derive(Debug, Error)]
7 + pub enum TurbopufferError {
8 +     #[error("query too long: {message}")]
9 +     QueryTooLong { message: String },
10 +     #[error("turbopuffer API error: {0}")]
11 +     ApiError(String),
12 +     #[error("request failed: {0}")]
13 +     RequestFailed(#[from] reqwest::Error),
14 +     #[error("{0}")]
15 +     Other(#[from] anyhow::Error),
33 16 }
34 17
35 18 #[derive(Debug, Deserialize)]
36 - struct ErrorResponse {
19 + struct TurbopufferErrorResponse {
37 20     error: String,
38 21     #[allow(dead_code)]
39 22     status: String,
40 23 }
41 24
42 - /// turbopuffer vector database client
43 - ///
44 - /// supports both ANN vector search and BM25 full-text search.
45 - #[derive(Clone)]
46 - pub struct TurbopufferStore {
25 + #[derive(Debug, Serialize)]
26 + pub struct QueryRequest {
27 +     pub rank_by: Vec<serde_json::Value>,
28 +     pub top_k: usize,
29 +     #[serde(skip_serializing_if = "Option::is_none")]
30 +     pub include_attributes: Option<Vec<String>>,
31 + }
32 +
33 + pub type QueryResponse = Vec<QueryRow>;
34 +
35 + #[derive(Debug, Deserialize, Serialize, Clone)]
36 + pub struct QueryRow {
37 +     pub id: String,
38 +     pub dist: f32, // for vector: cosine distance; for BM25: BM25 score
39 +     pub attributes: serde_json::Map<String, serde_json::Value>,
40 + }
41 +
42 + pub struct TurbopufferClient {
47 43     client: Client,
48 44     api_key: String,
49 45     namespace: String,
50 46 }
51 47
52 - impl TurbopufferStore {
48 + impl TurbopufferClient {
53 49     pub fn new(api_key: String, namespace: String) -> Self {
54 50         Self {
55 51             client: Client::new(),
···
58 54         }
59 55     }
60 56
61 -     fn query_url(&self) -> String {
62 -         format!("{}/{}/query", TURBOPUFFER_API_BASE, self.namespace)
63 -     }
57 +     pub async fn query(&self, request: QueryRequest) -> Result<QueryResponse> {
58 +         let url = format!(
59 +             "https://api.turbopuffer.com/v1/vectors/{}/query",
60 +             self.namespace
61 +         );
64 62
65 -     async fn execute_query(
66 -         &self,
67 -         request: serde_json::Value,
68 -     ) -> Result<Vec<QueryRow>, VectorSearchError> {
63 +         let request_json = serde_json::to_string_pretty(&request)?;
64 +         log::debug!("turbopuffer query request: {}", request_json);
65 +
69 66         let response = self
70 67             .client
71 -             .post(self.query_url())
68 +             .post(&url)
72 69             .header("Authorization", format!("Bearer {}", self.api_key))
73 70             .json(&request)
74 71             .send()
75 -             .await?;
72 +             .await
73 +             .context("failed to send query request")?;
76 74
77 75         if !response.status().is_success() {
78 -             let status = response.status().as_u16();
76 +             let status = response.status();
79 77             let body = response.text().await.unwrap_or_default();
80 -
81 -             // check for specific error types
82 -             if let Ok(error_resp) = serde_json::from_str::<ErrorResponse>(&body) {
83 -                 if error_resp.error.contains("too long") && error_resp.error.contains("max 1024") {
84 -                     return Err(VectorSearchError::QueryTooLong {
85 -                         message: error_resp.error,
86 -                     });
87 -                 }
88 -             }
89 -
90 -             return Err(VectorSearchError::Api { status, body });
78 +             anyhow::bail!("turbopuffer query failed with status {}: {}", status, body);
91 79         }
92 80
93 -         let body = response.text().await.map_err(|e| {
94 -             VectorSearchError::Other(anyhow::anyhow!("failed to read response: {}", e))
95 -         })?;
81 +         let body = response.text().await.context("failed to read response body")?;
96 82
97 83         serde_json::from_str(&body)
98 -             .map_err(|e| VectorSearchError::Parse(format!("failed to parse response: {}", e)))
84 +             .context(format!("failed to parse query response: {}", body))
99 85     }
100 - }
101 86
102 - impl VectorStore for TurbopufferStore {
103 -     async fn search_by_vector(
104 -         &self,
105 -         embedding: &[f32],
106 -         top_k: usize,
107 -     ) -> Result<Vec<SearchResult>, VectorSearchError> {
87 +     pub async fn bm25_query(&self, query_text: &str, top_k: usize) -> Result<QueryResponse, TurbopufferError> {
88 +         let url = format!(
89 +             "https://api.turbopuffer.com/v1/vectors/{}/query",
90 +             self.namespace
91 +         );
92 +
108 93         let request = serde_json::json!({
109 -             "rank_by": ["vector", "ANN", embedding],
94 +             "rank_by": ["name", "BM25", query_text],
110 95             "top_k": top_k,
111 96             "include_attributes": ["url", "name", "filename"],
112 97         });
113 98
114 -         log::debug!(
115 -             "turbopuffer vector query: {}",
116 -             serde_json::to_string_pretty(&request).unwrap_or_default()
117 -         );
99 +         if let Ok(pretty) = serde_json::to_string_pretty(&request) {
100 +             log::debug!("turbopuffer BM25 query request: {}", pretty);
101 +         }
118 102
119 -         let rows = self.execute_query(request).await?;
120 -         Ok(rows.into_iter().map(SearchResult::from).collect())
121 -     }
103 +         let response = self
104 +             .client
105 +             .post(&url)
106 +             .header("Authorization", format!("Bearer {}", self.api_key))
107 +             .json(&request)
108 +             .send()
109 +             .await?;
122 110
123 -     async fn search_by_keyword(
124 -         &self,
125 -         query: &str,
126 -         top_k: usize,
127 -     ) -> Result<Vec<SearchResult>, VectorSearchError> {
128 -         let request = serde_json::json!({
129 -             "rank_by": ["name", "BM25", query],
130 -             "top_k": top_k,
131 -             "include_attributes": ["url", "name", "filename"],
132 -         });
111 +         if !response.status().is_success() {
112 +             let status = response.status();
113 +             let body = response.text().await.unwrap_or_default();
133 114
134 -         log::debug!(
135 -             "turbopuffer BM25 query: {}",
136 -             serde_json::to_string_pretty(&request).unwrap_or_default()
137 -         );
115 +             // try to parse turbopuffer error response
116 +             if let Ok(error_resp) = serde_json::from_str::<TurbopufferErrorResponse>(&body) {
117 +                 // check if it's a query length error
118 +                 if error_resp.error.contains("too long") && error_resp.error.contains("max 1024") {
119 +                     return Err(TurbopufferError::QueryTooLong {
120 +                         message: error_resp.error,
121 +                     });
122 +                 }
123 +             }
138 124
139 -         let rows = self.execute_query(request).await?;
125 +             return Err(TurbopufferError::ApiError(format!(
126 +                 "turbopuffer BM25 query failed with status {}: {}",
127 +                 status, body
128 +             )));
129 +         }
140 130
141 -         if let Some(first) = rows.first() {
142 -             log::info!(
143 -                 "BM25 first result - id: {}, dist: {}, name: {:?}",
131 +         let body = response.text().await
132 +             .map_err(|e| TurbopufferError::Other(anyhow::anyhow!("failed to read response body: {}", e)))?;
133 +         log::debug!("turbopuffer BM25 response: {}", body);
134 +
135 +         let parsed: QueryResponse = serde_json::from_str(&body)
136 +             .map_err(|e| TurbopufferError::Other(anyhow::anyhow!("failed to parse BM25 query response: {}", e)))?;
137 +
138 +         // DEBUG: log first result to see what BM25 returns
139 +         if let Some(first) = parsed.first() {
140 +             log::info!("BM25 first result - id: {}, dist: {}, name: {:?}",
144 141                 first.id,
145 142                 first.dist,
146 143                 first.attributes.get("name")
147 144             );
148 145         }
149 146
150 -         Ok(rows.into_iter().map(SearchResult::from).collect())
151 -     }
152 -
153 -     fn name(&self) -> &'static str {
154 -         "turbopuffer"
147 +         Ok(parsed)
155 148     }
156 149 }
157 -
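Review note: for anyone wiring the new client up outside the handler, a minimal usage sketch (the function name `example_queries`, the `top_k` of 40, and where the inputs come from are illustrative; the handler in src/main.rs actually passes `top_k_val * 2` and its own embedding):

```rust
use crate::turbopuffer::{QueryRequest, TurbopufferClient};

/// hypothetical caller: `api_key`/`namespace` are assumed to come from config,
/// `query_embedding`/`query_text` from the embedding step in src/main.rs.
async fn example_queries(
    api_key: String,
    namespace: String,
    query_embedding: Vec<f32>,
    query_text: &str,
) -> anyhow::Result<()> {
    let client = TurbopufferClient::new(api_key, namespace);

    // ANN vector search: rank_by is the ["vector", "ANN", <embedding>] triple
    let ann = client
        .query(QueryRequest {
            rank_by: vec![
                serde_json::json!("vector"),
                serde_json::json!("ANN"),
                serde_json::json!(query_embedding),
            ],
            top_k: 40,
            include_attributes: Some(vec!["url".into(), "name".into(), "filename".into()]),
        })
        .await?;

    // BM25 keyword search over the `name` attribute
    let bm25 = client.bm25_query(query_text, 40).await?;

    log::info!("ann hits: {}, bm25 hits: {}", ann.len(), bm25.len());
    Ok(())
}
```

Keeping `TurbopufferError::QueryTooLong` as its own variant is what lets the handler map that case to a 400 rather than a blanket 500.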
static/bufo-is-trapped-in-a-cameron-winter-phase.png

This is a binary file and will not be displayed.

+1 -46
static/index.html
···
5 5     <meta name="viewport" content="width=device-width, initial-scale=1.0">
6 6     <title>find bufo</title>
7 7     <link rel="icon" type="image/png" href="/static/favicon.png">
8 -     <link rel="apple-touch-icon" href="/static/favicon.png">
9 -     <link rel="manifest" href="/static/manifest.json">
10 -     <meta name="theme-color" content="#8ba888">
11 8     <style>
12 9         * {
13 10             margin: 0;
···
211 208             height: 18px;
212 209             cursor: pointer;
213 210             accent-color: #667eea;
214 -         }
215 -
216 -         .option-group a {
217 -             color: #667eea;
218 -             text-decoration: none;
219 -         }
220 -
221 -         .option-group a:hover {
222 -             text-decoration: underline;
223 211         }
224 212
225 213         .sample-queries-container {
···
553 541                 id="searchInput"
554 542                 placeholder="describe the bufo you seek..."
555 543                 autocomplete="off"
556 -                 autofocus
557 544             >
558 545             <button id="searchButton">search</button>
559 546         </div>
···
610 597                     <span>enabled</span>
611 598                 </label>
612 599             </div>
613 -
614 -             <div class="option-group">
615 -                 <div class="option-label">
616 -                     <span class="option-name">exclude patterns</span>
617 -                 </div>
618 -                 <div class="option-description">
619 -                     comma-separated <a href="https://regex101.com/" target="_blank">regex</a> patterns to exclude (e.g., excited,party)
620 -                     <br>
621 -                     <span style="color: #999; font-size: 0.9em;">new to regex? <a href="https://claude.ai" target="_blank">claude</a> can write patterns for you</span>
622 -                 </div>
623 -                 <input
624 -                     type="text"
625 -                     id="excludeInput"
626 -                     placeholder="pattern1,pattern2"
627 -                     style="width: 100%; padding: 10px; font-size: 14px;"
628 -                 >
629 -             </div>
630 600         </div>
631 601     </div>
632 602
···
659 629         const alphaSlider = document.getElementById('alphaSlider');
660 630         const alphaValue = document.getElementById('alphaValue');
661 631         const familyFriendlyCheckbox = document.getElementById('familyFriendlyCheckbox');
662 -         const excludeInput = document.getElementById('excludeInput');
663 632
664 633         let hasSearched = false;
665 634
···
683 652
684 653             const alpha = parseFloat(alphaSlider.value);
685 654             const familyFriendly = familyFriendlyCheckbox.checked;
686 -             const exclude = excludeInput.value.trim();
687 655
688 656             // hide bufo after first search
689 657             if (!hasSearched) {
···
698 666                 params.set('top_k', '20');
699 667                 params.set('alpha', alpha.toString());
700 668                 params.set('family_friendly', familyFriendly.toString());
701 -                 if (exclude) params.set('exclude', exclude);
702 669                 const newUrl = `${window.location.pathname}?${params.toString()}`;
703 -                 window.history.pushState({ query, alpha, familyFriendly, exclude }, '', newUrl);
670 +                 window.history.pushState({ query, alpha, familyFriendly }, '', newUrl);
704 671             }
705 672
706 673             searchButton.disabled = true;
···
715 682             params.set('top_k', '20');
716 683             params.set('alpha', alpha.toString());
717 684             params.set('family_friendly', familyFriendly.toString());
718 -             if (exclude) params.set('exclude', exclude);
719 685
720 686             const response = await fetch(`/api/search?${params.toString()}`, {
721 687                 method: 'GET',
···
792 758             if (e.state.familyFriendly !== undefined) {
793 759                 familyFriendlyCheckbox.checked = e.state.familyFriendly;
794 760             }
795 -             if (e.state.exclude !== undefined) {
796 -                 excludeInput.value = e.state.exclude;
797 -             }
798 761             search(false);
799 762         }
800 763     });
···
805 768         const query = params.get('q');
806 769         const alpha = params.get('alpha');
807 770         const familyFriendly = params.get('family_friendly');
808 -         const exclude = params.get('exclude');
809 771
810 772         if (alpha) {
                alphaSlider.value = alpha;
···
816 778             familyFriendlyCheckbox.checked = familyFriendly === 'true';
817 779         }
818 780
819 -         if (exclude) {
820 -             excludeInput.value = exclude;
821 -         }
822 -
823 781         if (query) {
824 782             searchInput.value = query;
825 783             search(false); // don't update URL since we're already loading from it
826 784         }
827 -
828 -         // ensure focus on the search input
829 -         searchInput.focus();
830 785     });
831 786
832 787     // handle sample query button clicks
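Review note: with the exclude box gone, the page sends exactly four query parameters. A hedged sketch of the resulting GET /api/search contract from any client (the base URL, the `reqwest` usage, and the example values are illustrative, not part of this change):

```rust
// hypothetical client-side view of the search endpoint after this change
async fn search_example() -> Result<(), reqwest::Error> {
    let client = reqwest::Client::new();
    let response = client
        .get("http://localhost:8080/api/search") // base URL is illustrative
        .query(&[
            ("q", "dancing frog"),        // example query text
            ("top_k", "20"),              // the page always asks for 20
            ("alpha", "0.5"),             // semantic/keyword blend from the slider
            ("family_friendly", "true"),  // checkbox state
        ])
        .send()
        .await?;

    // responses carry an etag and `cache-control: public, max-age=300`,
    // so a repeat request with `if-none-match` can come back 304 Not Modified
    if let Some(etag) = response.headers().get("etag") {
        println!("etag: {:?}", etag);
    }
    println!("{}", response.text().await?);
    Ok(())
}
```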
-17
static/manifest.json
···
1 - {
2 -   "name": "find bufo",
3 -   "short_name": "find bufo",
4 -   "description": "hybrid search for bufo.zone",
5 -   "start_url": "/",
6 -   "display": "standalone",
7 -   "background_color": "#8ba888",
8 -   "theme_color": "#8ba888",
9 -   "icons": [
10 -     {
11 -       "src": "/static/favicon.png",
12 -       "sizes": "112x112",
13 -       "type": "image/png",
14 -       "purpose": "any maskable"
15 -     }
16 -   ]
17 - }