-4
.gitignore
-4
.gitignore
-1
Cargo.lock
-1
Cargo.lock
-1
Cargo.toml
-1
Cargo.toml
···
24
24
opentelemetry = { version = "0.26", features = ["trace", "metrics"] }
25
25
opentelemetry-instrumentation-actix-web = { version = "0.23", features = ["metrics"] }
26
26
opentelemetry-otlp = { version = "0.26", features = ["trace", "http-proto", "reqwest-client", "reqwest-rustls"] }
27
-
regex = "1.12"
+1
-1
README.md
+1
-1
README.md
-32
bot/Dockerfile
-32
bot/Dockerfile
···
1
-
# build stage
2
-
FROM debian:bookworm-slim AS builder
3
-
4
-
RUN apt-get update && apt-get install -y --no-install-recommends \
5
-
ca-certificates \
6
-
curl \
7
-
xz-utils \
8
-
&& rm -rf /var/lib/apt/lists/*
9
-
10
-
# install zig 0.15.2
11
-
RUN curl -L https://ziglang.org/download/0.15.2/zig-x86_64-linux-0.15.2.tar.xz | tar -xJ -C /usr/local \
12
-
&& ln -s /usr/local/zig-x86_64-linux-0.15.2/zig /usr/local/bin/zig
13
-
14
-
WORKDIR /app
15
-
COPY build.zig build.zig.zon ./
16
-
COPY src ./src
17
-
18
-
RUN zig build -Doptimize=ReleaseSafe
19
-
20
-
# runtime stage
21
-
FROM debian:bookworm-slim
22
-
23
-
RUN apt-get update && apt-get install -y --no-install-recommends \
24
-
ca-certificates \
25
-
&& rm -rf /var/lib/apt/lists/* \
26
-
# prefer IPv4 over IPv6 for outbound connections (IPv6 times out in Fly.io)
27
-
&& echo 'precedence ::ffff:0:0/96 100' >> /etc/gai.conf
28
-
29
-
WORKDIR /app
30
-
COPY --from=builder /app/zig-out/bin/bufo-bot .
31
-
32
-
CMD ["./bufo-bot"]
-53
bot/README.md
-53
bot/README.md
···
1
-
# bufo-bot
2
-
3
-
bluesky bot that listens to the jetstream firehose and quote-posts matching bufo images.
4
-
5
-
## how it works
6
-
7
-
1. connects to bluesky jetstream (firehose)
8
-
2. for each post, checks if text contains an exact phrase matching a bufo name
9
-
3. if matched, quote-posts with the corresponding bufo image
10
-
11
-
## matching logic
12
-
13
-
- extracts phrase from bufo filename (e.g., `bufo-let-them-eat-cake` -> `let them eat cake`)
14
-
- requires exact consecutive word match in post text
15
-
- configurable minimum phrase length (default: 4 words)
16
-
17
-
## configuration
18
-
19
-
| env var | default | description |
20
-
|---------|---------|-------------|
21
-
| `BSKY_HANDLE` | `find-bufo.com` | bluesky handle the bot logs in as |
22
-
| `BSKY_APP_PASSWORD` | required | app password from bsky settings |
23
-
| `MIN_PHRASE_WORDS` | `4` | minimum words in phrase to match |
24
-
| `POSTING_ENABLED` | `false` | must be `true` to actually post |
25
-
| `COOLDOWN_MINUTES` | `120` | don't repost same bufo within this time |
26
-
| `EXCLUDE_PATTERNS` | `what-have-you-done,what-have-i-done,sad,crying,cant-take` | exclude bufos matching these patterns |
27
-
| `JETSTREAM_ENDPOINT` | `jetstream2.us-east.bsky.network` | jetstream server |
28
-
29
-
## local dev
30
-
31
-
```bash
32
-
# build
33
-
zig build
34
-
35
-
# run locally (dry run by default)
36
-
./zig-out/bin/bufo-bot
37
-
```
38
-
39
-
## deploy
40
-
41
-
```bash
42
-
# set secrets (once)
43
-
fly secrets set BSKY_HANDLE=find-bufo.com BSKY_APP_PASSWORD=xxxx -a bufo-bot
44
-
45
-
# deploy
46
-
fly deploy
47
-
48
-
# enable posting
49
-
fly secrets set POSTING_ENABLED=true -a bufo-bot
50
-
51
-
# check logs
52
-
fly logs -a bufo-bot
53
-
```
-34
bot/build.zig
-34
bot/build.zig
···
1
-
const std = @import("std");
2
-
3
-
/// Build script entry point: declares the `bufo-bot` executable, wires in the
/// `websocket` dependency, and registers a `run` step.
pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});

    // Dependency declared in build.zig.zon, built with the same target/mode.
    const websocket_dep = b.dependency("websocket", .{
        .target = target,
        .optimize = optimize,
    });

    const root_module = b.createModule(.{
        .root_source_file = b.path("src/main.zig"),
        .target = target,
        .optimize = optimize,
        .imports = &.{
            .{ .name = "websocket", .module = websocket_dep.module("websocket") },
        },
    });

    const bot_exe = b.addExecutable(.{
        .name = "bufo-bot",
        .root_module = root_module,
    });
    b.installArtifact(bot_exe);

    // `zig build run -- <args>` installs first, then launches the binary.
    const launch = b.addRunArtifact(bot_exe);
    launch.step.dependOn(b.getInstallStep());
    if (b.args) |extra_args| launch.addArgs(extra_args);

    b.step("run", "Run the bot").dependOn(&launch.step);
}
-17
bot/build.zig.zon
-17
bot/build.zig.zon
···
1
-
.{
2
-
.name = .bufo_bot,
3
-
.version = "0.0.1",
4
-
.fingerprint = 0xe143490f82fa96db,
5
-
.minimum_zig_version = "0.15.0",
6
-
.dependencies = .{
7
-
.websocket = .{
8
-
.url = "https://github.com/karlseguin/websocket.zig/archive/refs/heads/master.tar.gz",
9
-
.hash = "websocket-0.1.0-ZPISdRNzAwAGszh62EpRtoQxu8wb1MSMVI6Ow0o2dmyJ",
10
-
},
11
-
},
12
-
.paths = .{
13
-
"build.zig",
14
-
"build.zig.zon",
15
-
"src",
16
-
},
17
-
}
-31
bot/fly.toml
-31
bot/fly.toml
···
1
-
app = "bufo-bot"
2
-
primary_region = "ewr"
3
-
4
-
[build]
5
-
dockerfile = "Dockerfile"
6
-
7
-
[env]
8
-
JETSTREAM_ENDPOINT = "jetstream2.us-east.bsky.network"
9
-
STATS_PORT = "8080"
10
-
11
-
[http_service]
12
-
internal_port = 8080
13
-
force_https = true
14
-
auto_stop_machines = "off"
15
-
auto_start_machines = true
16
-
min_machines_running = 1
17
-
max_machines_running = 1 # IMPORTANT: only 1 instance - bot consumes jetstream firehose
18
-
19
-
[[vm]]
20
-
memory = "256mb"
21
-
cpu_kind = "shared"
22
-
cpus = 1
23
-
24
-
[mounts]
25
-
source = "bufo_data"
26
-
destination = "/data"
27
-
28
-
# secrets to set via: fly secrets set KEY=value -a bufo-bot
29
-
# - BSKY_HANDLE (e.g., find-bufo.com)
30
-
# - BSKY_APP_PASSWORD (app password from bsky settings)
31
-
# - POSTING_ENABLED=true (to enable posting, default is false)
-25
bot/justfile
-25
bot/justfile
···
1
-
# bot/justfile
2
-
set shell := ["bash", "-eu", "-o", "pipefail", "-c"]
3
-
4
-
default:
5
-
@just --list
6
-
7
-
# build the bot
8
-
build:
9
-
zig build
10
-
11
-
# run the bot locally
12
-
run:
13
-
zig build run
14
-
15
-
# deploy to fly.io
16
-
deploy:
17
-
fly deploy --wait-timeout 180
18
-
19
-
# check logs
20
-
logs:
21
-
fly logs -a bufo-bot
22
-
23
-
# set secrets (run once)
24
-
secrets HANDLE PASSWORD:
25
-
fly secrets set BSKY_HANDLE={{ HANDLE }} BSKY_APP_PASSWORD={{ PASSWORD }} -a bufo-bot
-525
bot/src/bsky.zig
-525
bot/src/bsky.zig
···
1
-
const std = @import("std");
2
-
const mem = std.mem;
3
-
const json = std.json;
4
-
const http = std.http;
5
-
const Allocator = mem.Allocator;
6
-
const Io = std.Io;
7
-
8
-
pub const BskyClient = struct {
9
-
allocator: Allocator,
10
-
handle: []const u8,
11
-
app_password: []const u8,
12
-
access_jwt: ?[]const u8 = null,
13
-
did: ?[]const u8 = null,
14
-
pds_host: ?[]const u8 = null,
15
-
16
-
/// Creates a client that has not logged in yet.
///
/// `handle` and `app_password` are stored as borrowed slices (not copied);
/// `allocator` is kept for later requests and for the session strings that
/// `deinit` releases.
pub fn init(allocator: Allocator, handle: []const u8, app_password: []const u8) BskyClient {
    const fresh: BskyClient = .{
        .app_password = app_password,
        .handle = handle,
        .allocator = allocator,
    };
    return fresh;
}
23
-
24
-
/// Frees the session strings owned by the client (`access_jwt`, `did`,
/// `pds_host`) and resets them to `null`, so calling `deinit` twice or
/// reusing the client afterwards cannot double-free or read freed memory.
/// `handle` and `app_password` are borrowed and are not freed here.
pub fn deinit(self: *BskyClient) void {
    if (self.access_jwt) |jwt| self.allocator.free(jwt);
    if (self.did) |did| self.allocator.free(did);
    if (self.pds_host) |host| self.allocator.free(host);

    self.access_jwt = null;
    self.did = null;
    self.pds_host = null;
}
29
-
30
-
/// Builds a fresh `std.http.Client` backed by this client's allocator.
/// The caller is responsible for calling `deinit` on the returned value.
fn httpClient(self: *BskyClient) http.Client {
    const fresh_client: http.Client = .{ .allocator = self.allocator };
    return fresh_client;
}
33
-
34
-
/// Creates a session on bsky.social with the stored handle and app password.
///
/// On success `access_jwt` and `did` hold freshly allocated copies taken from
/// the response (previous values are freed first, so logging in again to
/// refresh a session does not leak) and `fetchPdsHost` has been run.
///
/// Errors: `error.LoginFailed` on a non-200 status, `error.ParseError` when
/// the body is not a JSON object, `error.NoJwt` / `error.NoDid` when those
/// string fields are missing, plus whatever the HTTP fetch, allocation, or
/// `fetchPdsHost` return.
pub fn login(self: *BskyClient) !void {
    std.debug.print("logging in as {s}...\n", .{self.handle});

    var client = self.httpClient();
    defer client.deinit();

    // Serialize through std.json so quotes or backslashes inside the
    // credentials are escaped instead of corrupting the request body.
    const body = try json.Stringify.valueAlloc(self.allocator, .{
        .identifier = self.handle,
        .password = self.app_password,
    }, .{});
    defer self.allocator.free(body);

    var aw: Io.Writer.Allocating = .init(self.allocator);
    defer aw.deinit();

    const result = client.fetch(.{
        .location = .{ .url = "https://bsky.social/xrpc/com.atproto.server.createSession" },
        .method = .POST,
        .headers = .{ .content_type = .{ .override = "application/json" } },
        .payload = body,
        .response_writer = &aw.writer,
    }) catch |err| {
        std.debug.print("login request failed: {}\n", .{err});
        return err;
    };

    if (result.status != .ok) {
        std.debug.print("login failed with status: {}\n", .{result.status});
        return error.LoginFailed;
    }

    // toArrayList() hands the buffer over to the returned list, so the list
    // (not `aw`) must release it.
    var response = aw.toArrayList();
    defer response.deinit(self.allocator);

    const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError;
    defer parsed.deinit();

    // Accessing `.object` on a non-object value is a checked-illegal union access.
    if (parsed.value != .object) return error.ParseError;
    const root = parsed.value.object;

    const jwt_val = root.get("accessJwt") orelse return error.NoJwt;
    if (jwt_val != .string) return error.NoJwt;

    const did_val = root.get("did") orelse return error.NoDid;
    if (did_val != .string) return error.NoDid;

    // Copy both values before touching the stored session so a failed
    // allocation leaves the previous session intact.
    const new_jwt = try self.allocator.dupe(u8, jwt_val.string);
    const new_did = self.allocator.dupe(u8, did_val.string) catch |err| {
        self.allocator.free(new_jwt);
        return err;
    };

    if (self.access_jwt) |old_jwt| self.allocator.free(old_jwt);
    if (self.did) |old_did| self.allocator.free(old_did);
    self.access_jwt = new_jwt;
    self.did = new_did;

    // fetch PDS host from PLC directory
    try self.fetchPdsHost();

    std.debug.print("logged in as {s} (did: {s}, pds: {s})\n", .{ self.handle, self.did.?, self.pds_host.? });
}
83
-
84
-
/// Looks up the account's DID document on plc.directory and stores the host
/// part of its `#atproto_pds` service endpoint in `pds_host`.
///
/// Requires `self.did` to be set (it is unwrapped with `.?`). Any previously
/// stored `pds_host` is freed before being replaced.
///
/// Errors: `error.UrlTooLong`, `error.PlcLookupFailed` (non-200),
/// `error.ParseError`, `error.NoService` (missing or non-array `service`),
/// `error.NoPdsService` (no https `#atproto_pds` entry), plus fetch and
/// allocation errors.
fn fetchPdsHost(self: *BskyClient) !void {
    var client = self.httpClient();
    defer client.deinit();

    var url_buf: [256]u8 = undefined;
    const url = std.fmt.bufPrint(&url_buf, "https://plc.directory/{s}", .{self.did.?}) catch return error.UrlTooLong;

    var aw: Io.Writer.Allocating = .init(self.allocator);
    defer aw.deinit();

    const result = client.fetch(.{
        .location = .{ .url = url },
        .method = .GET,
        .response_writer = &aw.writer,
    }) catch |err| {
        std.debug.print("fetch PDS host failed: {}\n", .{err});
        return err;
    };

    if (result.status != .ok) {
        std.debug.print("fetch PDS host failed with status: {}\n", .{result.status});
        return error.PlcLookupFailed;
    }

    // The list returned by toArrayList() owns the buffer; free it here.
    var response = aw.toArrayList();
    defer response.deinit(self.allocator);

    const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError;
    defer parsed.deinit();

    if (parsed.value != .object) return error.ParseError;

    // find the atproto_pds service endpoint
    const service = parsed.value.object.get("service") orelse return error.NoService;
    if (service != .array) return error.NoService;

    const prefix = "https://";
    for (service.array.items) |svc| {
        if (svc != .object) continue;

        const id_val = svc.object.get("id") orelse continue;
        if (id_val != .string) continue;
        if (!mem.eql(u8, id_val.string, "#atproto_pds")) continue;

        const endpoint_val = svc.object.get("serviceEndpoint") orelse continue;
        if (endpoint_val != .string) continue;

        // extract host from URL like "https://phellinus.us-west.host.bsky.network"
        const endpoint = endpoint_val.string;
        if (!mem.startsWith(u8, endpoint, prefix)) continue;

        const new_host = try self.allocator.dupe(u8, endpoint[prefix.len..]);
        if (self.pds_host) |old_host| self.allocator.free(old_host);
        self.pds_host = new_host;
        return;
    }

    return error.NoPdsService;
}
136
-
137
-
/// Uploads `data` as a blob with the given `content_type` and returns the
/// `blob` object from the response, re-serialized as JSON text.
///
/// The caller owns the returned slice and must free it with the client's
/// allocator.
///
/// Errors: `error.NotLoggedIn`, `error.AuthTooLong`, `error.ExpiredToken`
/// (non-200 whose body mentions "ExpiredToken"), `error.UploadFailed` (any
/// other non-200), `error.ParseError`, `error.NoBlobRef`,
/// `error.SerializeError`, plus fetch errors.
pub fn uploadBlob(self: *BskyClient, data: []const u8, content_type: []const u8) ![]const u8 {
    const jwt = self.access_jwt orelse return error.NotLoggedIn;

    var client = self.httpClient();
    defer client.deinit();

    var auth_buf: [512]u8 = undefined;
    const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{jwt}) catch return error.AuthTooLong;

    var aw: Io.Writer.Allocating = .init(self.allocator);
    defer aw.deinit();

    const result = client.fetch(.{
        .location = .{ .url = "https://bsky.social/xrpc/com.atproto.repo.uploadBlob" },
        .method = .POST,
        .headers = .{
            .content_type = .{ .override = content_type },
            .authorization = .{ .override = auth_header },
        },
        .payload = data,
        .response_writer = &aw.writer,
    }) catch |err| {
        std.debug.print("upload blob failed: {}\n", .{err});
        return err;
    };

    // Take the body once; the returned list owns the buffer and is freed on
    // every exit path (success and failure alike).
    var response = aw.toArrayList();
    defer response.deinit(self.allocator);

    if (result.status != .ok) {
        std.debug.print("upload blob failed with status: {} - {s}\n", .{ result.status, response.items });
        // check for expired token
        if (mem.indexOf(u8, response.items, "ExpiredToken") != null) {
            return error.ExpiredToken;
        }
        return error.UploadFailed;
    }

    const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError;
    defer parsed.deinit();

    if (parsed.value != .object) return error.ParseError;

    const blob = parsed.value.object.get("blob") orelse return error.NoBlobRef;
    if (blob != .object) return error.NoBlobRef;

    return json.Stringify.valueAlloc(self.allocator, blob, .{}) catch return error.SerializeError;
}
183
-
184
-
/// Creates an empty-text post that quotes (`quote_uri`, `quote_cid`) and
/// embeds one image.
///
/// `blob_json` is inserted verbatim as the image blob reference and must
/// already be valid JSON (e.g. the value returned by `uploadBlob`).
/// `quote_uri`, `quote_cid` and `alt_text` are JSON-escaped before insertion.
///
/// Errors: `error.NotLoggedIn`, `error.AuthTooLong`, `error.PostFailed`
/// (non-200), plus allocation and fetch errors.
pub fn createQuotePost(self: *BskyClient, quote_uri: []const u8, quote_cid: []const u8, blob_json: []const u8, alt_text: []const u8) !void {
    if (self.access_jwt == null or self.did == null) return error.NotLoggedIn;

    var client = self.httpClient();
    defer client.deinit();

    // Each of these is a complete JSON string literal (quotes included), so a
    // quote or backslash in the input cannot break the record.
    const uri_json = try json.Stringify.valueAlloc(self.allocator, quote_uri, .{});
    defer self.allocator.free(uri_json);
    const cid_json = try json.Stringify.valueAlloc(self.allocator, quote_cid, .{});
    defer self.allocator.free(cid_json);
    const alt_json = try json.Stringify.valueAlloc(self.allocator, alt_text, .{});
    defer self.allocator.free(alt_json);

    var body_buf: std.ArrayList(u8) = .{};
    defer body_buf.deinit(self.allocator);

    var ts_buf: [30]u8 = undefined;
    try body_buf.print(self.allocator,
        \\{{"repo":"{s}","collection":"app.bsky.feed.post","record":{{"$type":"app.bsky.feed.post","text":"","createdAt":"{s}","embed":{{"$type":"app.bsky.embed.recordWithMedia","record":{{"$type":"app.bsky.embed.record","record":{{"uri":{s},"cid":{s}}}}},"media":{{"$type":"app.bsky.embed.images","images":[{{"image":{s},"alt":{s}}}]}}}}}}}}
    , .{ self.did.?, getIsoTimestamp(&ts_buf), uri_json, cid_json, blob_json, alt_json });

    var auth_buf: [512]u8 = undefined;
    const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong;

    var aw: Io.Writer.Allocating = .init(self.allocator);
    defer aw.deinit();

    const result = client.fetch(.{
        .location = .{ .url = "https://bsky.social/xrpc/com.atproto.repo.createRecord" },
        .method = .POST,
        .headers = .{
            .content_type = .{ .override = "application/json" },
            .authorization = .{ .override = auth_header },
        },
        .payload = body_buf.items,
        .response_writer = &aw.writer,
    }) catch |err| {
        std.debug.print("create post failed: {}\n", .{err});
        return err;
    };

    if (result.status != .ok) {
        // The list returned by toArrayList() owns the body; free it after logging.
        var response = aw.toArrayList();
        defer response.deinit(self.allocator);
        std.debug.print("create post failed with status: {} - {s}\n", .{ result.status, response.items });
        return error.PostFailed;
    }

    std.debug.print("posted successfully!\n", .{});
}
226
-
227
-
/// Fetches the record behind an `at://<did>/<collection>/<rkey>` URI and
/// returns its `cid`.
///
/// The caller owns the returned slice and must free it with the client's
/// allocator.
///
/// Errors: `error.NotLoggedIn`, `error.InvalidUri` (missing `at://` prefix or
/// empty/missing did or rkey segment), `error.UrlTooLong`,
/// `error.AuthTooLong`, `error.GetRecordFailed` (non-200),
/// `error.ParseError`, `error.NoCid`, plus fetch and allocation errors.
pub fn getPostCid(self: *BskyClient, uri: []const u8) ![]const u8 {
    const jwt = self.access_jwt orelse return error.NotLoggedIn;

    // Validate the scheme before slicing; slicing a shorter string would be
    // an out-of-bounds access.
    const at_prefix = "at://";
    if (!mem.startsWith(u8, uri, at_prefix)) return error.InvalidUri;

    var parts = mem.splitScalar(u8, uri[at_prefix.len..], '/');
    const did = parts.next() orelse return error.InvalidUri;
    _ = parts.next(); // collection segment; the request hard-codes app.bsky.feed.post
    const rkey = parts.next() orelse return error.InvalidUri;
    if (did.len == 0 or rkey.len == 0) return error.InvalidUri;

    var client = self.httpClient();
    defer client.deinit();

    var url_buf: [512]u8 = undefined;
    const url = std.fmt.bufPrint(&url_buf, "https://bsky.social/xrpc/com.atproto.repo.getRecord?repo={s}&collection=app.bsky.feed.post&rkey={s}", .{ did, rkey }) catch return error.UrlTooLong;

    var auth_buf: [512]u8 = undefined;
    const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{jwt}) catch return error.AuthTooLong;

    var aw: Io.Writer.Allocating = .init(self.allocator);
    defer aw.deinit();

    const result = client.fetch(.{
        .location = .{ .url = url },
        .method = .GET,
        .headers = .{ .authorization = .{ .override = auth_header } },
        .response_writer = &aw.writer,
    }) catch |err| {
        std.debug.print("get record failed: {}\n", .{err});
        return err;
    };

    if (result.status != .ok) {
        return error.GetRecordFailed;
    }

    // The list returned by toArrayList() owns the buffer; free it here.
    var response = aw.toArrayList();
    defer response.deinit(self.allocator);

    const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError;
    defer parsed.deinit();

    if (parsed.value != .object) return error.ParseError;

    const cid_val = parsed.value.object.get("cid") orelse return error.NoCid;
    if (cid_val != .string) return error.NoCid;

    return try self.allocator.dupe(u8, cid_val.string);
}
270
-
271
-
/// Downloads `url` with a plain GET and returns the response body.
///
/// The caller owns the returned slice and must free it with the client's
/// allocator.
///
/// Errors: `error.FetchFailed` on a non-200 status, plus fetch and
/// allocation errors.
pub fn fetchImage(self: *BskyClient, url: []const u8) ![]const u8 {
    var client = self.httpClient();
    defer client.deinit();

    var aw: Io.Writer.Allocating = .init(self.allocator);
    // Sole cleanup for every error exit below. Do not also call
    // `aw.deinit()` by hand before returning an error: `deinit` leaves the
    // writer undefined and this errdefer would then run on it a second time.
    errdefer aw.deinit();

    const result = client.fetch(.{
        .location = .{ .url = url },
        .method = .GET,
        .response_writer = &aw.writer,
    }) catch |err| {
        std.debug.print("fetch image failed: {}\n", .{err});
        return err;
    };

    if (result.status != .ok) return error.FetchFailed;

    return try aw.toOwnedSlice();
}
294
-
295
-
/// Requests a service-auth token scoped to `com.atproto.repo.uploadBlob`
/// with audience `did:web:<pds_host>`.
///
/// The caller owns the returned token and must free it with the client's
/// allocator.
///
/// Errors: `error.NotLoggedIn`, `error.UrlTooLong`, `error.AuthTooLong`,
/// `error.ExpiredToken` (non-200 whose body mentions "ExpiredToken"),
/// `error.ServiceAuthFailed` (any other non-200), `error.ParseError`,
/// `error.NoToken`, plus fetch and allocation errors.
pub fn getServiceAuth(self: *BskyClient) ![]const u8 {
    if (self.access_jwt == null or self.did == null or self.pds_host == null) return error.NotLoggedIn;

    var client = self.httpClient();
    defer client.deinit();

    var url_buf: [512]u8 = undefined;
    const url = std.fmt.bufPrint(&url_buf, "https://bsky.social/xrpc/com.atproto.server.getServiceAuth?aud=did:web:{s}&lxm=com.atproto.repo.uploadBlob", .{self.pds_host.?}) catch return error.UrlTooLong;

    var auth_buf: [512]u8 = undefined;
    const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong;

    var aw: Io.Writer.Allocating = .init(self.allocator);
    defer aw.deinit();

    const result = client.fetch(.{
        .location = .{ .url = url },
        .method = .GET,
        .headers = .{ .authorization = .{ .override = auth_header } },
        .response_writer = &aw.writer,
    }) catch |err| {
        std.debug.print("get service auth failed: {}\n", .{err});
        return err;
    };

    // The list returned by toArrayList() owns the buffer; free it on every
    // exit path.
    var response = aw.toArrayList();
    defer response.deinit(self.allocator);

    if (result.status != .ok) {
        std.debug.print("get service auth failed with status: {} - {s}\n", .{ result.status, response.items });
        // check for expired token
        if (mem.indexOf(u8, response.items, "ExpiredToken") != null) {
            return error.ExpiredToken;
        }
        return error.ServiceAuthFailed;
    }

    const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError;
    defer parsed.deinit();

    if (parsed.value != .object) return error.ParseError;

    const token_val = parsed.value.object.get("token") orelse return error.NoToken;
    if (token_val != .string) return error.NoToken;

    return try self.allocator.dupe(u8, token_val.string);
}
339
-
340
-
/// Uploads `data` to video.bsky.app under `filename` and returns the
/// processing job id.
///
/// Both 200 and 409 (conflict) responses are treated as success; the job id
/// is read from `jobStatus.jobId` and, failing that, from a root-level
/// `jobId`. The caller owns the returned slice.
///
/// NOTE(review): `filename` is placed into the query string as-is; callers
/// presumably pass URL-safe names — confirm.
///
/// Errors: `error.NotLoggedIn`, `error.UrlTooLong`, `error.AuthTooLong`,
/// `error.VideoUploadFailed`, `error.ParseError`, `error.NoJobId`, plus
/// whatever `getServiceAuth`, the fetch, or allocation return.
pub fn uploadVideo(self: *BskyClient, data: []const u8, filename: []const u8) ![]const u8 {
    const did = self.did orelse return error.NotLoggedIn;

    // get service auth token
    const service_token = try self.getServiceAuth();
    defer self.allocator.free(service_token);

    var client = self.httpClient();
    defer client.deinit();

    var url_buf: [512]u8 = undefined;
    const url = std.fmt.bufPrint(&url_buf, "https://video.bsky.app/xrpc/app.bsky.video.uploadVideo?did={s}&name={s}", .{ did, filename }) catch return error.UrlTooLong;

    var auth_buf: [512]u8 = undefined;
    const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{service_token}) catch return error.AuthTooLong;

    var aw: Io.Writer.Allocating = .init(self.allocator);
    defer aw.deinit();

    const result = client.fetch(.{
        .location = .{ .url = url },
        .method = .POST,
        .headers = .{
            .content_type = .{ .override = "image/gif" },
            .authorization = .{ .override = auth_header },
        },
        .payload = data,
        .response_writer = &aw.writer,
    }) catch |err| {
        std.debug.print("upload video failed: {}\n", .{err});
        return err;
    };

    // The list returned by toArrayList() owns the buffer; free it on every
    // exit path.
    var response = aw.toArrayList();
    defer response.deinit(self.allocator);

    // handle both .ok and .conflict (already_exists) as success
    if (result.status != .ok and result.status != .conflict) {
        std.debug.print("upload video failed with status: {}\n", .{result.status});
        return error.VideoUploadFailed;
    }

    const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError;
    defer parsed.deinit();

    if (parsed.value != .object) return error.ParseError;
    const root = parsed.value.object;

    // for conflict responses, jobId is at root level; for ok responses, it's in jobStatus
    var job_id_val: ?json.Value = null;
    if (root.get("jobStatus")) |job_status| {
        if (job_status == .object) {
            job_id_val = job_status.object.get("jobId");
        }
    }
    // fallback to root level jobId (conflict case)
    if (job_id_val == null) {
        job_id_val = root.get("jobId");
    }

    const job_id = job_id_val orelse {
        std.debug.print("no jobId in response\n", .{});
        return error.NoJobId;
    };
    if (job_id != .string) return error.NoJobId;

    return try self.allocator.dupe(u8, job_id.string);
}
404
-
405
-
/// Polls the video job status up to 60 times, sleeping one second between
/// polls, until the job completes or fails.
///
/// On `JOB_STATE_COMPLETED` returns the job's `blob` object serialized as
/// JSON text; the caller owns the returned slice.
///
/// Errors: `error.UrlTooLong`, `error.AuthTooLong`, `error.JobStatusFailed`
/// (non-200), `error.ParseError`, `error.NoJobStatus`, `error.NoBlobRef`,
/// `error.SerializeError`, `error.VideoProcessingFailed`,
/// `error.VideoTimeout` (still unfinished after 60 polls), plus whatever
/// `getServiceAuth` and the fetch return.
pub fn waitForVideo(self: *BskyClient, job_id: []const u8) ![]const u8 {
    const service_token = try self.getServiceAuth();
    defer self.allocator.free(service_token);

    var url_buf: [512]u8 = undefined;
    const url = std.fmt.bufPrint(&url_buf, "https://video.bsky.app/xrpc/app.bsky.video.getJobStatus?jobId={s}", .{job_id}) catch return error.UrlTooLong;

    var auth_buf: [512]u8 = undefined;
    const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{service_token}) catch return error.AuthTooLong;

    var attempts: u32 = 0;
    while (attempts < 60) : (attempts += 1) {
        var client = self.httpClient();
        defer client.deinit();

        var aw: Io.Writer.Allocating = .init(self.allocator);
        defer aw.deinit();

        const result = client.fetch(.{
            .location = .{ .url = url },
            .method = .GET,
            .headers = .{ .authorization = .{ .override = auth_header } },
            .response_writer = &aw.writer,
        }) catch |err| {
            std.debug.print("get job status failed: {}\n", .{err});
            return err;
        };

        if (result.status != .ok) {
            std.debug.print("get job status failed with status: {}\n", .{result.status});
            return error.JobStatusFailed;
        }

        // The list returned by toArrayList() owns the buffer; release it at
        // the end of each iteration so polling does not accumulate bodies.
        var response = aw.toArrayList();
        defer response.deinit(self.allocator);

        const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError;
        defer parsed.deinit();

        if (parsed.value != .object) return error.ParseError;

        const job_status = parsed.value.object.get("jobStatus") orelse return error.NoJobStatus;
        if (job_status != .object) return error.NoJobStatus;

        // A missing or non-string state is treated as "not finished yet" and
        // falls through to the sleep below rather than re-polling at once.
        const state: []const u8 = blk: {
            const state_val = job_status.object.get("state") orelse break :blk "";
            break :blk if (state_val == .string) state_val.string else "";
        };

        if (mem.eql(u8, state, "JOB_STATE_COMPLETED")) {
            const blob = job_status.object.get("blob") orelse return error.NoBlobRef;
            if (blob != .object) return error.NoBlobRef;
            return json.Stringify.valueAlloc(self.allocator, blob, .{}) catch return error.SerializeError;
        } else if (mem.eql(u8, state, "JOB_STATE_FAILED")) {
            std.debug.print("video processing failed\n", .{});
            return error.VideoProcessingFailed;
        }

        std.Thread.sleep(1 * std.time.ns_per_s);
    }

    return error.VideoTimeout;
}
462
-
463
-
/// Creates an empty-text post that quotes (`quote_uri`, `quote_cid`) and
/// embeds one video.
///
/// `blob_json` is inserted verbatim as the video blob reference and must
/// already be valid JSON (e.g. the value returned by `waitForVideo`).
/// `quote_uri`, `quote_cid` and `alt_text` are JSON-escaped before insertion.
///
/// Errors: `error.NotLoggedIn`, `error.AuthTooLong`, `error.PostFailed`
/// (non-200), plus allocation and fetch errors.
pub fn createVideoQuotePost(self: *BskyClient, quote_uri: []const u8, quote_cid: []const u8, blob_json: []const u8, alt_text: []const u8) !void {
    if (self.access_jwt == null or self.did == null) return error.NotLoggedIn;

    var client = self.httpClient();
    defer client.deinit();

    // Each of these is a complete JSON string literal (quotes included), so a
    // quote or backslash in the input cannot break the record.
    const uri_json = try json.Stringify.valueAlloc(self.allocator, quote_uri, .{});
    defer self.allocator.free(uri_json);
    const cid_json = try json.Stringify.valueAlloc(self.allocator, quote_cid, .{});
    defer self.allocator.free(cid_json);
    const alt_json = try json.Stringify.valueAlloc(self.allocator, alt_text, .{});
    defer self.allocator.free(alt_json);

    var body_buf: std.ArrayList(u8) = .{};
    defer body_buf.deinit(self.allocator);

    var ts_buf: [30]u8 = undefined;
    try body_buf.print(self.allocator,
        \\{{"repo":"{s}","collection":"app.bsky.feed.post","record":{{"$type":"app.bsky.feed.post","text":"","createdAt":"{s}","embed":{{"$type":"app.bsky.embed.recordWithMedia","record":{{"$type":"app.bsky.embed.record","record":{{"uri":{s},"cid":{s}}}}},"media":{{"$type":"app.bsky.embed.video","video":{s},"alt":{s}}}}}}}}}
    , .{ self.did.?, getIsoTimestamp(&ts_buf), uri_json, cid_json, blob_json, alt_json });

    var auth_buf: [512]u8 = undefined;
    const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong;

    var aw: Io.Writer.Allocating = .init(self.allocator);
    defer aw.deinit();

    const result = client.fetch(.{
        .location = .{ .url = "https://bsky.social/xrpc/com.atproto.repo.createRecord" },
        .method = .POST,
        .headers = .{
            .content_type = .{ .override = "application/json" },
            .authorization = .{ .override = auth_header },
        },
        .payload = body_buf.items,
        .response_writer = &aw.writer,
    }) catch |err| {
        std.debug.print("create video post failed: {}\n", .{err});
        return err;
    };

    if (result.status != .ok) {
        // The list returned by toArrayList() owns the body; free it after logging.
        var response = aw.toArrayList();
        defer response.deinit(self.allocator);
        std.debug.print("create video post failed with status: {} - {s}\n", .{ result.status, response.items });
        return error.PostFailed;
    }

    std.debug.print("posted video successfully!\n", .{});
}
505
-
};
506
-
507
-
/// Formats the current wall-clock time as `YYYY-MM-DDTHH:MM:SS.000Z` into
/// `buf` and returns the written slice.
///
/// Returns the fixed string "2025-01-01T00:00:00.000Z" when the system clock
/// reports a time before the Unix epoch (which cannot be represented as
/// `EpochSeconds`) or when formatting does not fit in `buf`.
fn getIsoTimestamp(buf: *[30]u8) []const u8 {
    const fallback = "2025-01-01T00:00:00.000Z";

    // std.time.timestamp() is signed; a checked cast avoids an illegal
    // @intCast if the clock is set before 1970.
    const epoch_secs = std.math.cast(u64, std.time.timestamp()) orelse return fallback;
    const epoch: std.time.epoch.EpochSeconds = .{ .secs = epoch_secs };

    const year_day = epoch.getEpochDay().calculateYearDay();
    const month_day = year_day.calculateMonthDay();
    const day_secs = epoch.getDaySeconds();

    return std.fmt.bufPrint(buf, "{d:0>4}-{d:0>2}-{d:0>2}T{d:0>2}:{d:0>2}:{d:0>2}.000Z", .{
        year_day.year,
        month_day.month.numeric(),
        month_day.day_index + 1, // day_index is zero-based
        day_secs.getHoursIntoDay(),
        day_secs.getMinutesIntoHour(),
        day_secs.getSecondsIntoMinute(),
    }) catch fallback;
}
-47
bot/src/config.zig
-47
bot/src/config.zig
···
1
-
const std = @import("std");
2
-
const posix = std.posix;
3
-
4
-
/// Runtime settings for the bot, read from environment variables.
pub const Config = struct {
    bsky_handle: []const u8,
    bsky_app_password: []const u8,
    jetstream_endpoint: []const u8,
    min_phrase_words: u32,
    posting_enabled: bool,
    cooldown_minutes: u32,
    exclude_patterns: []const u8,
    stats_port: u16,

    /// Reads every setting from the process environment, substituting the
    /// defaults below for variables that are unset (or, for numeric values,
    /// unparsable). String fields borrow the environment's memory.
    pub fn fromEnv() Config {
        const handle = posix.getenv("BSKY_HANDLE") orelse "find-bufo.com";
        const app_password = posix.getenv("BSKY_APP_PASSWORD") orelse "";
        const endpoint = posix.getenv("JETSTREAM_ENDPOINT") orelse "jetstream2.us-east.bsky.network";
        const excludes = posix.getenv("EXCLUDE_PATTERNS") orelse "what-have-you-done,what-have-i-done,sad,crying,cant-take";

        return .{
            .bsky_handle = handle,
            .bsky_app_password = app_password,
            .jetstream_endpoint = endpoint,
            .min_phrase_words = parseU32(posix.getenv("MIN_PHRASE_WORDS"), 4),
            .posting_enabled = parseBool(posix.getenv("POSTING_ENABLED")),
            .cooldown_minutes = parseU32(posix.getenv("COOLDOWN_MINUTES"), 120),
            .exclude_patterns = excludes,
            .stats_port = parseU16(posix.getenv("STATS_PORT"), 8080),
        };
    }
};
27
-
28
-
/// Parses `str` as a base-10 `u16`; returns `default` when `str` is null or
/// is not a valid in-range number.
fn parseU16(str: ?[]const u8, default: u16) u16 {
    const text = str orelse return default;
    return std.fmt.parseInt(u16, text, 10) catch default;
}
34
-
35
-
/// Parses `str` as a base-10 `u32`; returns `default` when `str` is null or
/// is not a valid in-range number.
fn parseU32(str: ?[]const u8, default: u32) u32 {
    const text = str orelse return default;
    return std.fmt.parseInt(u32, text, 10) catch default;
}
41
-
42
-
/// Returns true only when `str` is exactly "true" or "1" (case-sensitive);
/// null and every other value yield false.
fn parseBool(str: ?[]const u8) bool {
    const text = str orelse return false;
    if (std.mem.eql(u8, text, "true")) return true;
    return std.mem.eql(u8, text, "1");
}
-143
bot/src/jetstream.zig
-143
bot/src/jetstream.zig
···
1
-
const std = @import("std");
2
-
const mem = std.mem;
3
-
const json = std.json;
4
-
const posix = std.posix;
5
-
const Allocator = mem.Allocator;
6
-
const websocket = @import("websocket");
7
-
8
-
/// One newly created `app.bsky.feed.post` record as delivered to the
/// jetstream callback.
///
/// All slices are borrowed: `uri` points into a stack buffer and the other
/// fields into the parsed message, both of which are released once the
/// callback returns. Copy anything that must outlive the callback.
pub const Post = struct {
    /// `at://<did>/app.bsky.feed.post/<rkey>`
    uri: []const u8,
    /// The record's `text` field.
    text: []const u8,
    /// DID taken from the message's top-level `did` field.
    did: []const u8,
    /// Record key taken from `commit.rkey`.
    rkey: []const u8,
};
14
-
15
-
/// Websocket consumer for a jetstream endpoint that forwards every newly
/// created post to `callback`.
pub const JetstreamClient = struct {
    allocator: Allocator,
    host: []const u8,
    callback: *const fn (Post) void,

    /// Stores the arguments; no connection is made until `run` is called.
    /// `host` is borrowed, not copied.
    pub fn init(allocator: Allocator, host: []const u8, callback: *const fn (Post) void) JetstreamClient {
        return .{
            .allocator = allocator,
            .host = host,
            .callback = callback,
        };
    }

    /// Connects and reconnects forever; never returns.
    ///
    /// Waits between attempts with exponential backoff
    /// (1s -> 2s -> 4s -> ... -> 60s cap). The delay restarts at 1s after any
    /// session that lasted at least `healthy_session_secs`, so a long-lived
    /// connection that drops is retried promptly instead of inheriting the
    /// delay from failures that happened long before.
    pub fn run(self: *JetstreamClient) void {
        const max_backoff: u64 = 60;
        const healthy_session_secs: i64 = 60;
        var backoff: u64 = 1;

        while (true) {
            const started = std.time.timestamp();
            const outcome = self.connect();
            if (std.time.timestamp() - started >= healthy_session_secs) backoff = 1;

            outcome catch |err| {
                std.debug.print("jetstream error: {}, reconnecting in {}s...\n", .{ err, backoff });
            };
            posix.nanosleep(backoff, 0);
            backoff = @min(backoff * 2, max_backoff);
        }
    }

    /// Opens a TLS websocket to `host`, subscribes to `app.bsky.feed.post`,
    /// and runs the read loop until it ends or fails.
    ///
    /// Errors: `error.HostTooLong` when the Host header does not fit its
    /// buffer, plus whatever the websocket client's init, handshake, or
    /// read loop return.
    fn connect(self: *JetstreamClient) !void {
        const path = "/subscribe?wantedCollections=app.bsky.feed.post";

        std.debug.print("connecting to wss://{s}{s}\n", .{ self.host, path });

        var client = websocket.Client.init(self.allocator, .{
            .host = self.host,
            .port = 443,
            .tls = true,
            .max_size = 1024 * 1024, // 1MB - some jetstream messages are large
        }) catch |err| {
            std.debug.print("websocket client init failed: {}\n", .{err});
            return err;
        };
        defer client.deinit();

        // Fail instead of sending the bare host name as the header block,
        // which would not be a well-formed header line.
        var host_header_buf: [256]u8 = undefined;
        const host_header = std.fmt.bufPrint(&host_header_buf, "Host: {s}\r\n", .{self.host}) catch return error.HostTooLong;

        client.handshake(path, .{ .headers = host_header }) catch |err| {
            std.debug.print("websocket handshake failed: {}\n", .{err});
            return err;
        };

        std.debug.print("jetstream connected!\n", .{});

        var handler = Handler{ .allocator = self.allocator, .callback = self.callback };
        client.readLoop(&handler) catch |err| {
            std.debug.print("websocket read loop error: {}\n", .{err});
            return err;
        };
    }
};
75
-
76
-
/// Websocket message handler that turns jetstream commit events into `Post`
/// callbacks.
const Handler = struct {
    allocator: Allocator,
    callback: *const fn (Post) void,

    /// Called for each server message. Never fails: processing errors are
    /// logged, except `error.NotAPost`, which is ignored silently.
    pub fn serverMessage(self: *Handler, data: []const u8) !void {
        self.processMessage(data) catch |err| {
            if (err != error.NotAPost) {
                std.debug.print("message processing error: {}\n", .{err});
            }
        };
    }

    /// Called when the connection closes; only logs.
    pub fn close(_: *Handler) void {
        std.debug.print("jetstream connection closed\n", .{});
    }

    /// Parses one jetstream message and invokes the callback when it is a
    /// `create` commit in `app.bsky.feed.post` with a string `text`.
    ///
    /// The `Post` passed to the callback borrows from the parsed message and
    /// from a local buffer; it is only valid during the call.
    ///
    /// Errors: `error.ParseError` (invalid JSON or a non-object root),
    /// `error.NotAPost` (any missing or mismatched field),
    /// `error.UriTooLong`.
    fn processMessage(self: *Handler, payload: []const u8) !void {
        // jetstream format:
        // { "did": "...", "kind": "commit", "commit": { "collection": "app.bsky.feed.post", "rkey": "...", "record": { "text": "...", ... } } }
        const parsed = json.parseFromSlice(json.Value, self.allocator, payload, .{}) catch return error.ParseError;
        defer parsed.deinit();

        // Reading `.object` from a non-object value is a checked-illegal
        // union access, so reject such messages up front.
        if (parsed.value != .object) return error.ParseError;
        const root = parsed.value.object;

        // check kind
        const kind = root.get("kind") orelse return error.NotAPost;
        if (kind != .string or !mem.eql(u8, kind.string, "commit")) return error.NotAPost;

        // get did
        const did_val = root.get("did") orelse return error.NotAPost;
        if (did_val != .string) return error.NotAPost;

        // get commit
        const commit = root.get("commit") orelse return error.NotAPost;
        if (commit != .object) return error.NotAPost;

        // check collection
        const collection = commit.object.get("collection") orelse return error.NotAPost;
        if (collection != .string or !mem.eql(u8, collection.string, "app.bsky.feed.post")) return error.NotAPost;

        // check operation (create only)
        const operation = commit.object.get("operation") orelse return error.NotAPost;
        if (operation != .string or !mem.eql(u8, operation.string, "create")) return error.NotAPost;

        // get rkey
        const rkey_val = commit.object.get("rkey") orelse return error.NotAPost;
        if (rkey_val != .string) return error.NotAPost;

        // get record
        const record = commit.object.get("record") orelse return error.NotAPost;
        if (record != .object) return error.NotAPost;

        // get text
        const text_val = record.object.get("text") orelse return error.NotAPost;
        if (text_val != .string) return error.NotAPost;

        // construct uri
        var uri_buf: [256]u8 = undefined;
        const uri = std.fmt.bufPrint(&uri_buf, "at://{s}/app.bsky.feed.post/{s}", .{ did_val.string, rkey_val.string }) catch return error.UriTooLong;

        self.callback(.{
            .uri = uri,
            .text = text_val.string,
            .did = did_val.string,
            .rkey = rkey_val.string,
        });
    }
};
-243
bot/src/main.zig
-243
bot/src/main.zig
···
1
-
const std = @import("std");
2
-
const mem = std.mem;
3
-
const json = std.json;
4
-
const http = std.http;
5
-
const Thread = std.Thread;
6
-
const Allocator = mem.Allocator;
7
-
const config = @import("config.zig");
8
-
const matcher = @import("matcher.zig");
9
-
const jetstream = @import("jetstream.zig");
10
-
const bsky = @import("bsky.zig");
11
-
const stats = @import("stats.zig");
12
-
13
-
// Process-wide handle to the bot state. `main` points it at its local
// `BotState`; `onPost` reads it because the jetstream callback receives only
// the post and no context argument.
var global_state: ?*BotState = null;
14
-
15
-
/// Everything the post callback needs, bundled so it can be reached through
/// the single `global_state` pointer.
const BotState = struct {
    /// Allocator used to free buffers returned by the bluesky client.
    allocator: Allocator,
    /// Runtime configuration read at startup.
    config: config.Config,
    /// Phrase matcher loaded with the bufo catalogue.
    matcher: matcher.Matcher,
    /// Client used to fetch images, upload media and create quote posts.
    bsky_client: bsky.BskyClient,
    /// Cooldown cache: bufo name -> unix timestamp of the last post.
    recent_bufos: std.StringHashMap(i64),
    /// Held while checking the cooldown and posting.
    mutex: Thread.Mutex = .{},
    /// Counters and per-bufo match data shown by the stats server.
    stats: stats.Stats,
};
24
-
25
-
/// Program entry point: loads the bufo catalogue, logs in to bluesky (unless
/// posting is disabled), starts the stats server thread and then runs the
/// jetstream consumer with `onPost` as its callback.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    std.debug.print("starting bufo bot...\n", .{});

    const cfg = config.Config.fromEnv();

    // load bufos from API
    var m = matcher.Matcher.init(allocator, cfg.min_phrase_words);
    // The matcher is only mutated while loading, so releasing it through this
    // instance stays correct after it is copied into `state`. Without this the
    // catalogue leaked on the early return and on every error path below.
    defer m.deinit();
    try loadBufos(allocator, &m, cfg.exclude_patterns);
    std.debug.print("loaded {} bufos with >= {} word phrases\n", .{ m.count(), cfg.min_phrase_words });

    if (m.count() == 0) {
        std.debug.print("no bufos loaded, exiting\n", .{});
        return;
    }

    // init bluesky client; the inner errdefer only covers a failed login,
    // afterwards ownership moves into `state`.
    const bsky_client = blk: {
        var client = bsky.BskyClient.init(allocator, cfg.bsky_handle, cfg.bsky_app_password);
        errdefer client.deinit();

        if (cfg.posting_enabled) {
            try client.login();
        } else {
            std.debug.print("posting disabled, running in dry-run mode\n", .{});
        }
        break :blk client;
    };

    // init state. Stats and the bluesky client are mutated through `state` at
    // runtime (match map growth, re-login), so they must also be torn down
    // through `state`; deinitialising a stale pre-copy instance would save
    // outdated counters and free the wrong buffers.
    var state = BotState{
        .allocator = allocator,
        .config = cfg,
        .matcher = m,
        .bsky_client = bsky_client,
        .recent_bufos = std.StringHashMap(i64).init(allocator),
        .stats = stats.Stats.init(allocator),
    };
    defer state.bsky_client.deinit();
    defer state.stats.deinit();
    defer state.recent_bufos.deinit();
    state.stats.setBufosLoaded(@intCast(m.count()));

    global_state = &state;

    // start stats server on background thread
    var stats_server = stats.StatsServer.init(allocator, &state.stats, cfg.stats_port);
    const stats_thread = Thread.spawn(.{}, stats.StatsServer.run, .{&stats_server}) catch |err| {
        std.debug.print("failed to start stats server: {}\n", .{err});
        return err;
    };
    defer stats_thread.join();

    // start jetstream consumer
    var js = jetstream.JetstreamClient.init(allocator, cfg.jetstream_endpoint, onPost);
    js.run();
}
84
-
85
-
/// Jetstream callback: checks one post against the bufo catalogue and, when
/// posting is enabled and the matched bufo is not on cooldown, quote-posts it.
/// All failures are logged and counted; nothing is propagated to the caller.
fn onPost(post: jetstream.Post) void {
    const state = global_state orelse return;

    state.stats.incPostsChecked();

    // check for match
    const match = state.matcher.findMatch(post.text) orelse return;

    state.stats.incMatchesFound();
    state.stats.incBufoMatch(match.name, match.url);
    std.debug.print("match: {s}\n", .{match.name});

    if (!state.config.posting_enabled) {
        std.debug.print("posting disabled, skipping\n", .{});
        return;
    }

    state.mutex.lock();
    defer state.mutex.unlock();

    // check cooldown
    const now = std.time.timestamp();
    const cooldown_secs = @as(i64, @intCast(state.config.cooldown_minutes)) * 60;

    if (state.recent_bufos.get(match.name)) |last_posted| {
        if (now - last_posted < cooldown_secs) {
            state.stats.incCooldownsHit();
            std.debug.print("cooldown: {s} posted recently, skipping\n", .{match.name});
            return;
        }
    }

    // try to post, with one retry on token expiration
    tryPost(state, post, match, now) catch |err| {
        if (err != error.ExpiredToken) {
            // Previously this only bumped the counter, leaving no trace of
            // which error actually occurred.
            std.debug.print("failed to post {s}: {}\n", .{ match.name, err });
            state.stats.incErrors();
            return;
        }

        std.debug.print("token expired, re-logging in...\n", .{});
        state.bsky_client.login() catch |login_err| {
            std.debug.print("failed to re-login: {}\n", .{login_err});
            state.stats.incErrors();
            return;
        };
        std.debug.print("re-login successful, retrying post...\n", .{});
        tryPost(state, post, match, now) catch |retry_err| {
            std.debug.print("retry failed: {}\n", .{retry_err});
            state.stats.incErrors();
        };
    };
}
136
-
137
-
/// Writes the alt text for a bufo into `buf`: the name up to its first '.',
/// with dashes turned into spaces, capped at `buf.len - 1` bytes.
fn buildAltText(name: []const u8, buf: []u8) []const u8 {
    if (buf.len == 0) return buf[0..0];

    var len: usize = 0;
    for (name) |c| {
        if (c == '.') break; // stop at extension
        buf[len] = if (c == '-') ' ' else c;
        len += 1;
        if (len >= buf.len - 1) break;
    }
    return buf[0..len];
}

/// Performs one attempt at quote-posting `match` in reply to `post`: downloads
/// the bufo, uploads it (GIFs as video, everything else as an image blob),
/// creates the quote post and records `now` in the cooldown cache.
/// Any client error is returned to the caller, which decides whether to retry.
fn tryPost(state: *BotState, post: jetstream.Post, match: matcher.Match, now: i64) !void {
    // fetch bufo image
    const img_data = try state.bsky_client.fetchImage(match.url);
    defer state.allocator.free(img_data);

    var alt_buf: [128]u8 = undefined;
    const alt_text = buildAltText(match.name, &alt_buf);

    // get post CID for quote
    const cid = try state.bsky_client.getPostCid(post.uri);
    defer state.allocator.free(cid);

    if (mem.endsWith(u8, match.url, ".gif")) {
        // upload as video for animated GIFs
        std.debug.print("uploading {d} bytes as video\n", .{img_data.len});
        const job_id = try state.bsky_client.uploadVideo(img_data, match.name);
        defer state.allocator.free(job_id);

        std.debug.print("waiting for video processing (job: {s})...\n", .{job_id});
        const blob_json = try state.bsky_client.waitForVideo(job_id);
        defer state.allocator.free(blob_json);

        try state.bsky_client.createVideoQuotePost(post.uri, cid, blob_json, alt_text);
    } else {
        // upload as image
        const content_type = if (mem.endsWith(u8, match.url, ".png"))
            "image/png"
        else
            "image/jpeg";

        std.debug.print("uploading {d} bytes as {s}\n", .{ img_data.len, content_type });
        const blob_json = try state.bsky_client.uploadBlob(img_data, content_type);
        defer state.allocator.free(blob_json);

        try state.bsky_client.createQuotePost(post.uri, cid, blob_json, alt_text);
    }
    std.debug.print("posted bufo quote: {s}\n", .{match.name});
    state.stats.incPostsCreated();

    // update cooldown cache; the post already succeeded, so a failure here is
    // reported rather than returned (it only means the cooldown is not enforced).
    state.recent_bufos.put(match.name, now) catch |err| {
        std.debug.print("failed to record cooldown for {s}: {}\n", .{ match.name, err });
    };
}
194
-
195
-
/// Fetches the bufo catalogue from the find-bufo search API and registers every
/// entry that has a string `name` and `url` with the matcher.
///
/// `exclude_patterns` is inserted verbatim into the query string.
/// Errors: `UrlTooLong`, any HTTP client error, `FetchFailed` for a non-200
/// status and `ParseError` for a body that is not a JSON object. A response
/// without a `results` array loads nothing and is not an error.
fn loadBufos(allocator: Allocator, m: *matcher.Matcher, exclude_patterns: []const u8) !void {
    var client = http.Client{ .allocator = allocator };
    defer client.deinit();

    var url_buf: [512]u8 = undefined;
    const url = std.fmt.bufPrint(&url_buf, "https://find-bufo.com/api/search?query=bufo&top_k=2000&alpha=0&exclude={s}", .{exclude_patterns}) catch return error.UrlTooLong;

    var aw: std.Io.Writer.Allocating = .init(allocator);
    defer aw.deinit();

    const result = client.fetch(.{
        .location = .{ .url = url },
        .method = .GET,
        .response_writer = &aw.writer,
    }) catch |err| {
        std.debug.print("failed to fetch bufos: {}\n", .{err});
        return err;
    };

    if (result.status != .ok) {
        std.debug.print("failed to fetch bufos, status: {}\n", .{result.status});
        return error.FetchFailed;
    }

    // `toArrayList` moves the buffer out of the writer, so the list now owns
    // the response bytes and must be released here (it used to leak).
    var response_list = aw.toArrayList();
    defer response_list.deinit(allocator);
    const response = response_list.items;

    const parsed = json.parseFromSlice(json.Value, allocator, response, .{}) catch return error.ParseError;
    defer parsed.deinit();

    // Reading `.object` on a non-object root is illegal behaviour; reject it.
    if (parsed.value != .object) return error.ParseError;

    const results = parsed.value.object.get("results") orelse return;
    if (results != .array) return;

    var loaded: usize = 0;
    for (results.array.items) |item| {
        if (item != .object) continue;

        const name_val = item.object.get("name") orelse continue;
        if (name_val != .string) continue;

        const url_val = item.object.get("url") orelse continue;
        if (url_val != .string) continue;

        // Best effort: an entry that cannot be added is skipped.
        m.addBufo(name_val.string, url_val.string) catch continue;
        loaded += 1;
    }

    std.debug.print("loaded {} bufos from API\n", .{loaded});
}
-152
bot/src/matcher.zig
-152
bot/src/matcher.zig
···
1
-
const std = @import("std");
2
-
const mem = std.mem;
3
-
const Allocator = mem.Allocator;
4
-
5
-
/// One catalogue entry together with the phrase derived from its name.
pub const Bufo = struct {
    /// File name as reported by the API.
    name: []const u8,
    /// Location of the image.
    url: []const u8,
    /// Lower-cased words that must appear consecutively in a post.
    phrase: []const []const u8,
};
10
-
11
-
/// Result of a successful lookup: identifies the matched bufo.
pub const Match = struct {
    /// Name of the matched bufo.
    name: []const u8,
    /// Image location of the matched bufo.
    url: []const u8,
};
15
-
16
-
/// Holds the bufo catalogue and finds the first bufo whose phrase occurs, as
/// consecutive words, in a piece of text.
pub const Matcher = struct {
    /// Registered bufos; every name, url and phrase word is owned by the matcher.
    bufos: std.ArrayList(Bufo) = .{},
    allocator: Allocator,
    /// Bufos whose phrase has fewer words than this are not registered.
    min_words: u32,

    /// Creates an empty matcher. Release it with `deinit`.
    pub fn init(allocator: Allocator, min_words: u32) Matcher {
        return .{
            .allocator = allocator,
            .min_words = min_words,
        };
    }

    /// Frees every registered bufo and the list itself.
    pub fn deinit(self: *Matcher) void {
        for (self.bufos.items) |bufo| {
            self.allocator.free(bufo.name);
            self.allocator.free(bufo.url);
            freePhrase(self.allocator, bufo.phrase);
        }
        self.bufos.deinit(self.allocator);
    }

    /// Frees a phrase produced by `extractPhrase`: each word, then the slice.
    fn freePhrase(allocator: Allocator, phrase: []const []const u8) void {
        for (phrase) |word| allocator.free(word);
        allocator.free(phrase);
    }

    /// Registers a bufo, copying `name` and `url`. Names whose phrase has fewer
    /// than `min_words` words are silently ignored.
    /// On error nothing is registered and nothing is leaked.
    pub fn addBufo(self: *Matcher, name: []const u8, url: []const u8) !void {
        const phrase = try extractPhrase(self.allocator, name);

        if (phrase.len < self.min_words) {
            freePhrase(self.allocator, phrase);
            return;
        }
        // From here on every failure must release what was already acquired.
        errdefer freePhrase(self.allocator, phrase);

        const name_copy = try self.allocator.dupe(u8, name);
        errdefer self.allocator.free(name_copy);

        const url_copy = try self.allocator.dupe(u8, url);
        errdefer self.allocator.free(url_copy);

        try self.bufos.append(self.allocator, .{
            .name = name_copy,
            .url = url_copy,
            .phrase = phrase,
        });
    }

    /// Splits `text` into runs of ASCII letters and returns the first
    /// registered bufo whose phrase matches consecutive words, ignoring case.
    /// The returned slices are owned by the matcher. Returns null when nothing
    /// matches or when the word list cannot be allocated.
    pub fn findMatch(self: *Matcher, text: []const u8) ?Match {
        var words: std.ArrayList([]const u8) = .{};
        defer words.deinit(self.allocator);

        var i: usize = 0;
        while (i < text.len) {
            // skip separators
            while (i < text.len and !isAlpha(text[i])) : (i += 1) {}
            if (i >= text.len) break;

            const start = i;
            while (i < text.len and isAlpha(text[i])) : (i += 1) {}

            // Dropping a word on allocation failure would make its neighbours
            // look adjacent and could fabricate a match, so give up instead.
            words.append(self.allocator, text[start..i]) catch return null;
        }

        for (self.bufos.items) |bufo| {
            if (containsPhrase(words.items, bufo.phrase)) {
                return .{
                    .name = bufo.name,
                    .url = bufo.url,
                };
            }
        }
        return null;
    }

    /// Number of registered bufos.
    pub fn count(self: *Matcher) usize {
        return self.bufos.items.len;
    }
};
89
-
90
-
/// Derives the match phrase from a bufo file name: strips a leading "bufo-"
/// and a trailing .gif/.png/.jpg/.jpeg, splits the rest on '-' and lower-cases
/// each non-empty word (e.g. "bufo-let-them-eat-cake.png" -> let, them, eat, cake).
///
/// Caller owns the returned slice and every word in it. On error nothing is leaked.
fn extractPhrase(allocator: Allocator, name: []const u8) ![]const []const u8 {
    const prefix = "bufo-";
    const start: usize = if (mem.startsWith(u8, name, prefix)) prefix.len else 0;

    var end = name.len;
    for ([_][]const u8{ ".gif", ".png", ".jpg", ".jpeg" }) |ext| {
        if (mem.endsWith(u8, name, ext)) {
            end -= ext.len;
            break;
        }
    }

    const slug = name[start..end];

    var words: std.ArrayList([]const u8) = .{};
    errdefer {
        for (words.items) |word| allocator.free(word);
        words.deinit(allocator);
    }

    var iter = mem.splitScalar(u8, slug, '-');
    while (iter.next()) |word| {
        if (word.len == 0) continue;

        const lower = try allocator.alloc(u8, word.len);
        // Not yet in `words`, so the outer errdefer cannot see it: without
        // this a failed append leaked the buffer.
        errdefer allocator.free(lower);
        _ = std.ascii.lowerString(lower, word);

        try words.append(allocator, lower);
    }

    return try words.toOwnedSlice(allocator);
}
127
-
128
-
/// Reports whether `phrase` occurs in `post_words` as a run of consecutive
/// words, compared case-insensitively. An empty phrase never matches.
fn containsPhrase(post_words: []const []const u8, phrase: []const []const u8) bool {
    if (phrase.len == 0) return false;
    if (phrase.len > post_words.len) return false;

    // Slide a phrase-sized window over the post and compare word by word.
    const last_start = post_words.len - phrase.len;
    var start: usize = 0;
    while (start <= last_start) : (start += 1) {
        const window = post_words[start .. start + phrase.len];
        const matched = for (window, phrase) |post_word, phrase_word| {
            if (!eqlIgnoreCase(post_word, phrase_word)) break false;
        } else true;
        if (matched) return true;
    }
    return false;
}
141
-
142
-
/// Compares two byte strings for equality, ignoring ASCII letter case.
fn eqlIgnoreCase(a: []const u8, b: []const u8) bool {
    return std.ascii.eqlIgnoreCase(a, b);
}
149
-
150
-
/// True for the ASCII letters a-z and A-Z; false for every other byte.
fn isAlpha(c: u8) bool {
    return switch (c) {
        'a'...'z', 'A'...'Z' => true,
        else => false,
    };
}
-401
bot/src/stats.zig
-401
bot/src/stats.zig
···
1
-
const std = @import("std");
2
-
const mem = std.mem;
3
-
const json = std.json;
4
-
const fs = std.fs;
5
-
const Allocator = mem.Allocator;
6
-
const Thread = std.Thread;
7
-
const template = @import("stats_template.zig");
8
-
9
-
// Absolute path of the JSON file that `Stats.load` reads at startup and
// `Stats.save` rewrites, so counters survive restarts.
const STATS_PATH = "/data/stats.json";
10
-
11
-
/// Bot counters plus per-bufo match data, persisted as JSON at `STATS_PATH`
/// and rendered as an HTML page for the stats server.
///
/// The plain counters are atomics; `bufo_matches` is guarded by `bufo_mutex`.
pub const Stats = struct {
    allocator: Allocator,
    /// Unix timestamp at which this process created the stats.
    start_time: i64,
    prior_uptime: u64 = 0, // cumulative uptime from previous runs
    posts_checked: std.atomic.Value(u64) = .init(0),
    matches_found: std.atomic.Value(u64) = .init(0),
    posts_created: std.atomic.Value(u64) = .init(0),
    cooldowns_hit: std.atomic.Value(u64) = .init(0),
    errors: std.atomic.Value(u64) = .init(0),
    bufos_loaded: u64 = 0,

    // track per-bufo match counts: name -> {count, url}
    // keys and urls are heap copies owned by this struct
    bufo_matches: std.StringHashMap(BufoMatchData),
    bufo_mutex: Thread.Mutex = .{},

    const BufoMatchData = struct {
        count: u64,
        url: []const u8,
    };

    /// Creates the stats and restores any previously saved values from disk.
    pub fn init(allocator: Allocator) Stats {
        var self = Stats{
            .allocator = allocator,
            .start_time = std.time.timestamp(),
            .bufo_matches = std.StringHashMap(BufoMatchData).init(allocator),
        };
        self.load();
        return self;
    }

    /// Saves one last time, then frees every key/url copy and the map.
    pub fn deinit(self: *Stats) void {
        self.save();
        var iter = self.bufo_matches.iterator();
        while (iter.next()) |entry| {
            self.allocator.free(entry.key_ptr.*);
            self.allocator.free(entry.value_ptr.url);
        }
        self.bufo_matches.deinit();
    }

    /// Returns the non-negative integer stored under `key`, or null if the
    /// key is missing or not an integer. Negative values are clamped to 0.
    fn readCount(object: json.ObjectMap, key: []const u8) ?u64 {
        const value = object.get(key) orelse return null;
        if (value != .integer) return null;
        const clamped: u64 = @intCast(@max(0, value.integer));
        return clamped;
    }

    /// Inserts a new entry with its own copies of `name` and `url`.
    /// Best effort: on allocation failure nothing is inserted or leaked.
    fn insertMatch(self: *Stats, name: []const u8, count: u64, url: []const u8) void {
        const key = self.allocator.dupe(u8, name) catch return;
        const url_copy = self.allocator.dupe(u8, url) catch {
            self.allocator.free(key);
            return;
        };
        self.bufo_matches.put(key, .{ .count = count, .url = url_copy }) catch {
            self.allocator.free(key);
            self.allocator.free(url_copy);
        };
    }

    /// Restores counters and match data from `STATS_PATH`. A missing,
    /// unreadable or malformed file leaves the defaults untouched.
    fn load(self: *Stats) void {
        const file = fs.openFileAbsolute(STATS_PATH, .{}) catch return;
        defer file.close();

        var buf: [64 * 1024]u8 = undefined;
        const len = file.readAll(&buf) catch return;
        if (len == 0) return;

        const parsed = json.parseFromSlice(json.Value, self.allocator, buf[0..len], .{}) catch return;
        defer parsed.deinit();

        // Valid JSON that is not an object cannot be a stats file; reading
        // `.object` on another tag would be illegal behaviour.
        if (parsed.value != .object) return;
        const root = parsed.value.object;

        if (readCount(root, "posts_checked")) |n| self.posts_checked.store(n, .monotonic);
        if (readCount(root, "matches_found")) |n| self.matches_found.store(n, .monotonic);
        if (readCount(root, "posts_created")) |n| self.posts_created.store(n, .monotonic);
        if (readCount(root, "cooldowns_hit")) |n| self.cooldowns_hit.store(n, .monotonic);
        if (readCount(root, "errors")) |n| self.errors.store(n, .monotonic);
        if (readCount(root, "cumulative_uptime")) |n| self.prior_uptime = n;

        // load bufo_matches (or legacy bufo_posts)
        const matches_key: []const u8 = if (root.get("bufo_matches") != null) "bufo_matches" else "bufo_posts";
        if (root.get(matches_key)) |bp| {
            if (bp == .object) {
                var iter = bp.object.iterator();
                while (iter.next()) |entry| {
                    switch (entry.value_ptr.*) {
                        .object => |obj| {
                            // format: {"count": N, "url": "..."}
                            const count = readCount(obj, "count") orelse continue;
                            const url_val = obj.get("url") orelse continue;
                            if (url_val != .string) continue;
                            self.insertMatch(entry.key_ptr.*, count, url_val.string);
                        },
                        .integer => |n| {
                            // legacy format: just integer count - construct URL from name.
                            // The URL is built before anything is allocated, so an
                            // over-long name no longer leaks the key copy.
                            var url_buf: [256]u8 = undefined;
                            const constructed_url = std.fmt.bufPrint(&url_buf, "https://all-the.bufo.zone/{s}", .{entry.key_ptr.*}) catch continue;
                            const count: u64 = @intCast(@max(0, n));
                            self.insertMatch(entry.key_ptr.*, count, constructed_url);
                        },
                        else => {},
                    }
                }
            }
        }

        std.debug.print("loaded stats from {s}\n", .{STATS_PATH});
    }

    /// Writes the current stats to `STATS_PATH`, taking the map mutex.
    pub fn save(self: *Stats) void {
        self.bufo_mutex.lock();
        defer self.bufo_mutex.unlock();
        self.saveUnlocked();
    }

    /// Uptime in seconds across all runs: saved uptime plus this session.
    pub fn totalUptime(self: *Stats) i64 {
        const now = std.time.timestamp();
        const session: i64 = now - self.start_time;
        return @as(i64, @intCast(self.prior_uptime)) + session;
    }

    pub fn incPostsChecked(self: *Stats) void {
        _ = self.posts_checked.fetchAdd(1, .monotonic);
    }

    pub fn incMatchesFound(self: *Stats) void {
        _ = self.matches_found.fetchAdd(1, .monotonic);
    }

    /// Bumps the match count for `bufo_name` (creating the entry on first
    /// sight) and immediately persists the stats.
    pub fn incBufoMatch(self: *Stats, bufo_name: []const u8, bufo_url: []const u8) void {
        self.bufo_mutex.lock();
        defer self.bufo_mutex.unlock();

        if (self.bufo_matches.getPtr(bufo_name)) |data| {
            data.count += 1;
        } else {
            self.insertMatch(bufo_name, 1, bufo_url);
        }
        self.saveUnlocked();
    }

    pub fn incPostsCreated(self: *Stats) void {
        _ = self.posts_created.fetchAdd(1, .monotonic);
    }

    /// Serialises the stats as a single JSON object into `writer`.
    /// NOTE(review): names and urls are written without JSON escaping; a
    /// value containing '"' or '\' would produce an unreadable file.
    fn writeJson(self: *Stats, writer: anytype) !void {
        const now = std.time.timestamp();
        const session_uptime: u64 = @intCast(@max(0, now - self.start_time));
        const total_uptime = self.prior_uptime + session_uptime;

        try writer.writeAll("{");
        try std.fmt.format(writer, "\"posts_checked\":{},", .{self.posts_checked.load(.monotonic)});
        try std.fmt.format(writer, "\"matches_found\":{},", .{self.matches_found.load(.monotonic)});
        try std.fmt.format(writer, "\"posts_created\":{},", .{self.posts_created.load(.monotonic)});
        try std.fmt.format(writer, "\"cooldowns_hit\":{},", .{self.cooldowns_hit.load(.monotonic)});
        try std.fmt.format(writer, "\"errors\":{},", .{self.errors.load(.monotonic)});
        try std.fmt.format(writer, "\"cumulative_uptime\":{},", .{total_uptime});
        try writer.writeAll("\"bufo_matches\":{");

        var first = true;
        var iter = self.bufo_matches.iterator();
        while (iter.next()) |entry| {
            if (!first) try writer.writeAll(",");
            first = false;
            try std.fmt.format(writer, "\"{s}\":{{\"count\":{},\"url\":\"{s}\"}}", .{ entry.key_ptr.*, entry.value_ptr.count, entry.value_ptr.url });
        }

        try writer.writeAll("}}");
    }

    /// Persists the stats; the caller must already hold `bufo_mutex`.
    ///
    /// The JSON is built completely in memory before the file is touched.
    /// Creating the file truncates it, so serialising afterwards meant any
    /// formatting failure (e.g. more than 64 KiB of data) wiped the saved stats.
    fn saveUnlocked(self: *Stats) void {
        var buf: [64 * 1024]u8 = undefined;
        var fbs = std.io.fixedBufferStream(&buf);
        self.writeJson(fbs.writer()) catch |err| {
            std.debug.print("failed to serialize stats: {}\n", .{err});
            return;
        };

        // Deliberately silent: the data directory may simply not exist.
        const file = fs.createFileAbsolute(STATS_PATH, .{}) catch return;
        defer file.close();

        file.writeAll(fbs.getWritten()) catch |err| {
            std.debug.print("failed to write stats: {}\n", .{err});
        };
    }

    pub fn incCooldownsHit(self: *Stats) void {
        _ = self.cooldowns_hit.fetchAdd(1, .monotonic);
    }

    pub fn incErrors(self: *Stats) void {
        _ = self.errors.fetchAdd(1, .monotonic);
    }

    pub fn setBufosLoaded(self: *Stats, count: u64) void {
        self.bufos_loaded = count;
    }

    /// Formats a duration using its two or three largest units
    /// ("1d 2h 3m", "2h 3m 4s", "3m 4s", "4s"). Negative input counts as 0;
    /// returns "?" if `buf` is too small.
    fn formatUptime(seconds: i64, buf: []u8) []const u8 {
        const s: u64 = @intCast(@max(0, seconds));
        const days = s / 86400;
        const hours = (s % 86400) / 3600;
        const mins = (s % 3600) / 60;
        const secs = s % 60;

        if (days > 0) {
            return std.fmt.bufPrint(buf, "{}d {}h {}m", .{ days, hours, mins }) catch "?";
        } else if (hours > 0) {
            return std.fmt.bufPrint(buf, "{}h {}m {}s", .{ hours, mins, secs }) catch "?";
        } else if (mins > 0) {
            return std.fmt.bufPrint(buf, "{}m {}s", .{ mins, secs }) catch "?";
        } else {
            return std.fmt.bufPrint(buf, "{}s", .{secs}) catch "?";
        }
    }

    /// Renders the stats page: counters plus a grid of the 20 most matched
    /// bufos, sized by match count. Caller owns the returned slice.
    /// NOTE(review): names and urls are inserted into HTML attributes without
    /// escaping; confirm the catalogue can never contain quotes or markup.
    pub fn renderHtml(self: *Stats, allocator: Allocator) ![]const u8 {
        const uptime = self.totalUptime();

        var uptime_buf: [64]u8 = undefined;
        const uptime_str = formatUptime(uptime, &uptime_buf);

        const BufoEntry = struct {
            name: []const u8,
            count: u64,
            url: []const u8,

            fn compare(_: void, a: @This(), b: @This()) bool {
                return a.count > b.count;
            }
        };

        // collect top bufos
        var top_bufos: std.ArrayList(BufoEntry) = .{};
        defer top_bufos.deinit(allocator);

        {
            self.bufo_mutex.lock();
            defer self.bufo_mutex.unlock();

            var iter = self.bufo_matches.iterator();
            while (iter.next()) |entry| {
                try top_bufos.append(allocator, .{ .name = entry.key_ptr.*, .count = entry.value_ptr.count, .url = entry.value_ptr.url });
            }
        }

        // sort by count descending
        mem.sort(BufoEntry, top_bufos.items, {}, BufoEntry.compare);

        // build top bufos grid html
        var top_html: std.ArrayList(u8) = .{};
        defer top_html.deinit(allocator);

        const limit = @min(top_bufos.items.len, 20);

        // find max count for scaling
        var max_count: u64 = 1;
        for (top_bufos.items[0..limit]) |entry| {
            if (entry.count > max_count) max_count = entry.count;
        }

        for (top_bufos.items[0..limit]) |entry| {
            // scale size: min 60px, max 160px based on count ratio
            const ratio = @as(f64, @floatFromInt(entry.count)) / @as(f64, @floatFromInt(max_count));
            const size: u32 = @intFromFloat(60.0 + ratio * 100.0);

            // strip extension for display name (same extensions the matcher strips)
            var display_name = entry.name;
            for ([_][]const u8{ ".gif", ".png", ".jpg", ".jpeg" }) |ext| {
                if (mem.endsWith(u8, entry.name, ext)) {
                    display_name = entry.name[0 .. entry.name.len - ext.len];
                    break;
                }
            }

            try std.fmt.format(top_html.writer(allocator),
                \\<div class="bufo-card" style="width:{}px;height:{}px;" title="{s} ({} matches)" data-name="{s}" onclick="showPosts(this)">
                \\<img src="{s}" alt="{s}" loading="lazy">
                \\<span class="bufo-count">{}</span>
                \\</div>
            , .{ size, size, display_name, entry.count, display_name, entry.url, display_name, entry.count });
        }

        const top_section = if (top_bufos.items.len > 0) top_html.items else "<p class=\"no-bufos\">no posts yet</p>";

        // Each counter appears twice: once as data attribute, once as text.
        const html = try std.fmt.allocPrint(allocator, template.html, .{
            uptime,
            uptime_str,
            self.posts_checked.load(.monotonic),
            self.posts_checked.load(.monotonic),
            self.matches_found.load(.monotonic),
            self.matches_found.load(.monotonic),
            self.posts_created.load(.monotonic),
            self.posts_created.load(.monotonic),
            self.cooldowns_hit.load(.monotonic),
            self.cooldowns_hit.load(.monotonic),
            self.errors.load(.monotonic),
            self.errors.load(.monotonic),
            self.bufos_loaded,
            self.bufos_loaded,
            top_section,
        });

        return html;
    }
};
331
-
332
-
/// Minimal HTTP server that answers every connection with the rendered stats
/// page, and periodically persists the stats in the background.
pub const StatsServer = struct {
    /// Used for the rendered HTML of each response.
    allocator: Allocator,
    stats: *Stats,
    port: u16,

    pub fn init(allocator: Allocator, stats: *Stats, port: u16) StatsServer {
        return .{
            .allocator = allocator,
            .stats = stats,
            .port = port,
        };
    }

    /// Thread entry point: starts the save ticker and serves until the
    /// listener fails. Errors are logged, not returned.
    pub fn run(self: *StatsServer) void {
        // spawn periodic save ticker (every 60s). The ticker never exits, so it
        // is detached instead of leaving an unjoined handle behind; a spawn
        // failure only disables periodic saves and is reported.
        if (Thread.spawn(.{}, saveTicker, .{self.stats})) |ticker| {
            ticker.detach();
        } else |err| {
            std.debug.print("failed to start stats save ticker: {}\n", .{err});
        }

        self.serve() catch |err| {
            std.debug.print("stats server error: {}\n", .{err});
        };
    }

    /// Saves the stats once a minute, forever.
    fn saveTicker(s: *Stats) void {
        while (true) {
            std.Thread.sleep(60 * std.time.ns_per_s);
            s.save();
        }
    }

    /// Listens on 0.0.0.0:`port` and handles connections one at a time.
    fn serve(self: *StatsServer) !void {
        const addr = std.net.Address.initIp4(.{ 0, 0, 0, 0 }, self.port);

        var server = try addr.listen(.{ .reuse_address = true });
        defer server.deinit();

        std.debug.print("stats server listening on http://0.0.0.0:{}\n", .{self.port});

        while (true) {
            const conn = server.accept() catch |err| {
                std.debug.print("accept error: {}\n", .{err});
                continue;
            };

            self.handleConnection(conn) catch |err| {
                std.debug.print("connection error: {}\n", .{err});
            };
        }
    }

    /// Sends the stats page as a complete HTTP/1.1 response and closes the
    /// connection. The request itself is read but ignored.
    fn handleConnection(self: *StatsServer, conn: std.net.Server.Connection) !void {
        defer conn.stream.close();

        // read request (we don't really care about it, just serve stats)
        var buf: [1024]u8 = undefined;
        _ = conn.stream.read(&buf) catch 0;

        const html = self.stats.renderHtml(self.allocator) catch |err| {
            std.debug.print("render error: {}\n", .{err});
            return;
        };
        defer self.allocator.free(html);

        // write raw HTTP response
        var response_buf: [128]u8 = undefined;
        const header = std.fmt.bufPrint(&response_buf, "HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nContent-Length: {}\r\nConnection: close\r\n\r\n", .{html.len}) catch return;

        // A single `write` may send only part of the buffer, which would
        // truncate the page below the advertised Content-Length.
        conn.stream.writeAll(header) catch return;
        conn.stream.writeAll(html) catch return;
    }
};
-224
bot/src/stats_template.zig
-224
bot/src/stats_template.zig
···
1
-
// HTML template for stats page
2
-
// format args: uptime_secs, uptime_str, posts_checked (x2), matches_found (x2),
3
-
// posts_created (x2), cooldowns_hit (x2), errors (x2), bufos_loaded (x2), top_section
4
-
5
-
pub const html =
6
-
\\<!DOCTYPE html>
7
-
\\<html>
8
-
\\<head>
9
-
\\<meta charset="utf-8">
10
-
\\<meta name="viewport" content="width=device-width, initial-scale=1">
11
-
\\<title>bufo-bot stats</title>
12
-
\\<style>
13
-
\\ body {{
14
-
\\ font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
15
-
\\ max-width: 600px;
16
-
\\ margin: 40px auto;
17
-
\\ padding: 20px;
18
-
\\ background: #1a1a2e;
19
-
\\ color: #eee;
20
-
\\ font-size: 14px;
21
-
\\ }}
22
-
\\ h1 {{ color: #7bed9f; margin-bottom: 30px; }}
23
-
\\ .stat {{
24
-
\\ display: flex;
25
-
\\ justify-content: space-between;
26
-
\\ padding: 12px 0;
27
-
\\ border-bottom: 1px solid #333;
28
-
\\ }}
29
-
\\ .stat-label {{ color: #aaa; }}
30
-
\\ .stat-value {{ font-weight: bold; }}
31
-
\\ h2 {{ color: #7bed9f; margin-top: 40px; font-size: 1.2em; }}
32
-
\\ .bufo-grid {{
33
-
\\ display: flex;
34
-
\\ flex-wrap: wrap;
35
-
\\ gap: 8px;
36
-
\\ justify-content: flex-start;
37
-
\\ align-items: flex-start;
38
-
\\ margin-top: 16px;
39
-
\\ }}
40
-
\\ .bufo-card {{
41
-
\\ position: relative;
42
-
\\ border-radius: 8px;
43
-
\\ overflow: hidden;
44
-
\\ background: #252542;
45
-
\\ transition: transform 0.2s;
46
-
\\ cursor: pointer;
47
-
\\ }}
48
-
\\ .bufo-card:hover {{
49
-
\\ transform: scale(1.1);
50
-
\\ z-index: 10;
51
-
\\ }}
52
-
\\ .bufo-card img {{
53
-
\\ width: 100%;
54
-
\\ height: 100%;
55
-
\\ object-fit: cover;
56
-
\\ }}
57
-
\\ .bufo-count {{
58
-
\\ position: absolute;
59
-
\\ bottom: 4px;
60
-
\\ right: 4px;
61
-
\\ background: rgba(0,0,0,0.7);
62
-
\\ color: #7bed9f;
63
-
\\ padding: 2px 6px;
64
-
\\ border-radius: 4px;
65
-
\\ font-size: 11px;
66
-
\\ }}
67
-
\\ .no-bufos {{ color: #666; text-align: center; }}
68
-
\\ .footer {{
69
-
\\ margin-top: 40px;
70
-
\\ padding-top: 20px;
71
-
\\ border-top: 1px solid #333;
72
-
\\ color: #666;
73
-
\\ font-size: 0.9em;
74
-
\\ }}
75
-
\\ a {{ color: #7bed9f; }}
76
-
\\ .modal {{
77
-
\\ display: none;
78
-
\\ position: fixed;
79
-
\\ top: 0; left: 0; right: 0; bottom: 0;
80
-
\\ background: rgba(0,0,0,0.8);
81
-
\\ z-index: 100;
82
-
\\ justify-content: center;
83
-
\\ align-items: center;
84
-
\\ }}
85
-
\\ .modal.show {{ display: flex; }}
86
-
\\ .modal-content {{
87
-
\\ background: #252542;
88
-
\\ padding: 20px;
89
-
\\ border-radius: 8px;
90
-
\\ width: 90vw;
91
-
\\ max-width: 600px;
92
-
\\ height: 85vh;
93
-
\\ display: flex;
94
-
\\ flex-direction: column;
95
-
\\ }}
96
-
\\ .modal-content h3 {{ margin-top: 0; color: #7bed9f; }}
97
-
\\ .modal-content .close {{ cursor: pointer; float: right; font-size: 20px; }}
98
-
\\ .modal-content .no-posts {{ color: #666; text-align: center; padding: 20px; }}
99
-
\\ .embed-wrap {{ flex: 1; overflow: hidden; }}
100
-
\\ .embed-wrap iframe {{ border: none; width: 100%; height: 100%; border-radius: 8px; }}
101
-
\\ .nav {{ display: flex; justify-content: space-between; align-items: center; margin-top: 10px; gap: 10px; }}
102
-
\\ .nav button {{ background: #7bed9f; color: #1a1a2e; border: none; padding: 6px 12px; border-radius: 4px; cursor: pointer; }}
103
-
\\ .nav button:disabled {{ opacity: 0.3; cursor: default; }}
104
-
\\ .nav span {{ color: #aaa; font-size: 12px; }}
105
-
\\</style>
106
-
\\</head>
107
-
\\<body>
108
-
\\<h1>bufo-bot stats</h1>
109
-
\\
110
-
\\<div class="stat">
111
-
\\ <span class="stat-label">uptime</span>
112
-
\\ <span class="stat-value" id="uptime" data-seconds="{}">{s}</span>
113
-
\\</div>
114
-
\\<div class="stat">
115
-
\\ <span class="stat-label">posts checked</span>
116
-
\\ <span class="stat-value" data-num="{}">{}</span>
117
-
\\</div>
118
-
\\<div class="stat">
119
-
\\ <span class="stat-label">matches found</span>
120
-
\\ <span class="stat-value" data-num="{}">{}</span>
121
-
\\</div>
122
-
\\<div class="stat">
123
-
\\ <span class="stat-label">bufos posted</span>
124
-
\\ <span class="stat-value" data-num="{}">{}</span>
125
-
\\</div>
126
-
\\<div class="stat">
127
-
\\ <span class="stat-label">cooldowns hit</span>
128
-
\\ <span class="stat-value" data-num="{}">{}</span>
129
-
\\</div>
130
-
\\<div class="stat">
131
-
\\ <span class="stat-label">errors</span>
132
-
\\ <span class="stat-value" data-num="{}">{}</span>
133
-
\\</div>
134
-
\\<div class="stat">
135
-
\\ <span class="stat-label">bufos available</span>
136
-
\\ <span class="stat-value" data-num="{}">{}</span>
137
-
\\</div>
138
-
\\
139
-
\\<h2>top bufos</h2>
140
-
\\<div class="bufo-grid">
141
-
\\{s}
142
-
\\</div>
143
-
\\
144
-
\\<div class="footer">
145
-
\\ <a href="https://find-bufo.com">find-bufo.com</a> |
146
-
\\ <a href="https://bsky.app/profile/find-bufo.com">@find-bufo.com</a>
147
-
\\</div>
148
-
\\<div id="modal" class="modal" onclick="if(event.target===this)closeModal()">
149
-
\\ <div class="modal-content">
150
-
\\ <span class="close" onclick="closeModal()">×</span>
151
-
\\ <h3 id="modal-title">posts</h3>
152
-
\\ <div id="embed-wrap" class="embed-wrap"></div>
153
-
\\ <div id="nav" class="nav" style="display:none">
154
-
\\ <button onclick="showEmbed(-1)">←</button>
155
-
\\ <span id="nav-info"></span>
156
-
\\ <button onclick="showEmbed(1)">→</button>
157
-
\\ </div>
158
-
\\ </div>
159
-
\\</div>
160
-
\\<script>
161
-
\\(function() {{
162
-
\\ document.querySelectorAll('[data-num]').forEach(el => {{
163
-
\\ el.textContent = parseInt(el.dataset.num).toLocaleString();
164
-
\\ }});
165
-
\\ const uptimeEl = document.getElementById('uptime');
166
-
\\ let secs = parseInt(uptimeEl.dataset.seconds);
167
-
\\ function fmt(s) {{
168
-
\\ const d = Math.floor(s / 86400);
169
-
\\ const h = Math.floor((s % 86400) / 3600);
170
-
\\ const m = Math.floor((s % 3600) / 60);
171
-
\\ const sec = s % 60;
172
-
\\ if (d > 0) return d + 'd ' + h + 'h ' + m + 'm';
173
-
\\ if (h > 0) return h + 'h ' + m + 'm ' + sec + 's';
174
-
\\ if (m > 0) return m + 'm ' + sec + 's';
175
-
\\ return sec + 's';
176
-
\\ }}
177
-
\\ setInterval(() => {{ secs++; uptimeEl.textContent = fmt(secs); }}, 1000);
178
-
\\}})();
179
-
\\let posts = [], idx = 0;
180
-
\\async function showPosts(el) {{
181
-
\\ const name = el.dataset.name;
182
-
\\ document.getElementById('modal-title').textContent = name;
183
-
\\ document.getElementById('embed-wrap').innerHTML = '<p class="no-posts">loading...</p>';
184
-
\\ document.getElementById('nav').style.display = 'none';
185
-
\\ document.getElementById('modal').classList.add('show');
186
-
\\ try {{
187
-
\\ const r = await fetch('https://public.api.bsky.app/xrpc/app.bsky.feed.getAuthorFeed?actor=find-bufo.com&limit=100');
188
-
\\ const data = await r.json();
189
-
\\ const search = name.replace('bufo-','').replace(/-/g,' ');
190
-
\\ posts = data.feed.filter(p => {{
191
-
\\ const embed = p.post.embed;
192
-
\\ if (!embed) return false;
193
-
\\ const img = embed.images?.[0] || embed.media?.images?.[0];
194
-
\\ if (img?.alt?.includes(search)) return true;
195
-
\\ if (embed.alt?.includes(search)) return true;
196
-
\\ if (embed.media?.alt?.includes(search)) return true;
197
-
\\ return false;
198
-
\\ }});
199
-
\\ idx = 0;
200
-
\\ if (posts.length === 0) {{
201
-
\\ document.getElementById('embed-wrap').innerHTML = '<p class="no-posts">no posts found</p>';
202
-
\\ }} else {{
203
-
\\ showEmbed(0);
204
-
\\ }}
205
-
\\ }} catch(e) {{
206
-
\\ document.getElementById('embed-wrap').innerHTML = '<p class="no-posts">failed to load</p>';
207
-
\\ }}
208
-
\\}}
209
-
\\function showEmbed(d) {{
210
-
\\ idx = Math.max(0, Math.min(posts.length - 1, idx + d));
211
-
\\ const uri = posts[idx].post.uri.replace('at://','');
212
-
\\ document.getElementById('embed-wrap').innerHTML = '<iframe src="https://embed.bsky.app/embed/' + uri + '"></iframe>';
213
-
\\ document.getElementById('nav').style.display = 'flex';
214
-
\\ document.getElementById('nav-info').textContent = (idx + 1) + ' of ' + posts.length;
215
-
\\ document.querySelectorAll('.nav button')[0].disabled = idx === 0;
216
-
\\ document.querySelectorAll('.nav button')[1].disabled = idx === posts.length - 1;
217
-
\\}}
218
-
\\function closeModal() {{
219
-
\\ document.getElementById('modal').classList.remove('show');
220
-
\\}}
221
-
\\</script>
222
-
\\</body>
223
-
\\</html>
224
-
;
-573
docs/zig-atproto-sdk-wishlist.md
-573
docs/zig-atproto-sdk-wishlist.md
···
1
-
# zig atproto sdk wishlist
2
-
3
-
a pie-in-the-sky wishlist for what a zig AT protocol sdk could provide, based on building [bufo-bot](../bot) - a bluesky firehose bot that quote-posts matching images.
4
-
5
-
---
6
-
7
-
## 1. typed lexicon schemas
8
-
9
-
the single biggest pain point: everything is `json.Value` with manual field extraction.
10
-
11
-
### what we have now
12
-
13
-
```zig
14
-
const parsed = json.parseFromSlice(json.Value, allocator, response.items, .{});
15
-
const root = parsed.value.object;
16
-
const jwt_val = root.get("accessJwt") orelse return error.NoJwt;
17
-
if (jwt_val != .string) return error.NoJwt;
18
-
self.access_jwt = try self.allocator.dupe(u8, jwt_val.string);
19
-
```
20
-
21
-
this pattern repeats hundreds of times. it's verbose, error-prone, and provides zero compile-time safety.
22
-
23
-
### what we want
24
-
25
-
```zig
26
-
const atproto = @import("atproto");
27
-
28
-
// codegen from lexicon json schemas
29
-
const session = try atproto.server.createSession(allocator, .{
30
-
.identifier = handle,
31
-
.password = app_password,
32
-
});
33
-
// session.accessJwt is already []const u8
34
-
// session.did is already []const u8
35
-
// session.handle is already []const u8
36
-
```
37
-
38
-
ideally:
39
-
- generate zig structs from lexicon json files at build time (build.zig integration)
40
-
- full type safety - if a field is optional in the lexicon, it's `?T` in zig
41
-
- proper union types for lexicon unions (e.g., embed types)
42
-
- automatic serialization/deserialization
43
-
44
-
### lexicon unions are especially painful
45
-
46
-
```zig
47
-
// current: manual $type dispatch
48
-
const embed_type = record.object.get("$type") orelse return error.NoType;
49
-
if (mem.eql(u8, embed_type.string, "app.bsky.embed.images")) {
50
-
// handle images...
51
-
} else if (mem.eql(u8, embed_type.string, "app.bsky.embed.video")) {
52
-
// handle video...
53
-
} else if (mem.eql(u8, embed_type.string, "app.bsky.embed.record")) {
54
-
// handle quote...
55
-
} else if (mem.eql(u8, embed_type.string, "app.bsky.embed.recordWithMedia")) {
56
-
// handle quote with media...
57
-
}
58
-
59
-
// wanted: tagged union
60
-
switch (record.embed) {
61
-
.images => |imgs| { ... },
62
-
.video => |vid| { ... },
63
-
.record => |quote| { ... },
64
-
.recordWithMedia => |rwm| { ... },
65
-
}
66
-
```
67
-
68
-
---
69
-
70
-
## 2. session management
71
-
72
-
authentication is surprisingly complex and we had to handle it all manually.
73
-
74
-
### what we had to build
75
-
76
-
- login with identifier + app password
77
-
- store access JWT and refresh JWT
78
-
- detect `ExpiredToken` errors in response bodies
79
-
- re-login on expiration (we just re-login, didn't implement refresh)
80
-
- resolve DID to PDS host via plc.directory lookup
81
-
- get service auth tokens for video upload
82
-
83
-
### what we want
84
-
85
-
```zig
86
-
const atproto = @import("atproto");
87
-
88
-
var agent = try atproto.Agent.init(allocator, .{
89
-
.service = "https://bsky.social",
90
-
});
91
-
92
-
// login with automatic token refresh
93
-
try agent.login(handle, app_password);
94
-
95
-
// agent automatically:
96
-
// - refreshes tokens before expiration
97
-
// - retries on ExpiredToken errors
98
-
// - resolves DID -> PDS host
99
-
// - handles service auth for video.bsky.app
100
-
101
-
// just use it, auth is handled
102
-
const blob = try agent.uploadBlob(data, "image/png");
103
-
```
104
-
105
-
### service auth is particularly gnarly
106
-
107
-
for video uploads, you need:
108
-
1. get a service auth token scoped to `did:web:video.bsky.app` with lexicon `com.atproto.repo.uploadBlob`
109
-
2. use that token (not your session token) for the upload
110
-
3. the endpoint is different (`video.bsky.app` not `bsky.social`)
111
-
112
-
we had to figure this out from reading other implementations. an sdk should abstract this entirely.
113
-
114
-
---
115
-
116
-
## 3. blob and media handling
117
-
118
-
uploading media requires too much manual work.
119
-
120
-
### current pain
121
-
122
-
```zig
123
-
// upload blob, get back raw json string
124
-
const blob_json = try client.uploadBlob(data, content_type);
125
-
// later, interpolate that json string into another json blob
126
-
try body_buf.print(allocator,
127
-
\\{{"image":{s},"alt":"{s}"}}
128
-
, .{ blob_json, alt_text });
129
-
```
130
-
131
-
we're passing around json strings and interpolating them. this is fragile.
132
-
133
-
### what we want
134
-
135
-
```zig
136
-
// upload returns a typed BlobRef
137
-
const blob = try agent.uploadBlob(data, .{ .mime_type = "image/png" });
138
-
139
-
// use it directly in a struct
140
-
const post = atproto.feed.Post{
141
-
.text = "",
142
-
.embed = .{ .images = .{
143
-
.images = &[_]atproto.embed.Image{
144
-
.{ .image = blob, .alt = "a bufo" },
145
-
},
146
-
}},
147
-
};
148
-
try agent.createRecord("app.bsky.feed.post", post);
149
-
```
150
-
151
-
### video upload is even worse
152
-
153
-
```zig
154
-
// current: manual job polling
155
-
const job_id = try client.uploadVideo(data, filename);
156
-
var attempts: u32 = 0;
157
-
while (attempts < 60) : (attempts += 1) {
158
-
// poll job status
159
-
// check for JOB_STATE_COMPLETED or JOB_STATE_FAILED
160
-
// sleep 1 second between polls
161
-
}
162
-
163
-
// wanted: one call that handles the async nature
164
-
const video_blob = try agent.uploadVideo(data, .{
165
-
.filename = "bufo.gif",
166
-
.mime_type = "image/gif",
167
-
// sdk handles polling internally
168
-
});
169
-
```
170
-
171
-
---
172
-
173
-
## 4. AT-URI utilities
174
-
175
-
we parse AT-URIs by hand with string splitting.
176
-
177
-
```zig
178
-
// current
179
-
var parts = mem.splitScalar(u8, uri[5..], '/'); // skip "at://"
180
-
const did = parts.next() orelse return error.InvalidUri;
181
-
_ = parts.next(); // skip collection
182
-
const rkey = parts.next() orelse return error.InvalidUri;
183
-
184
-
// wanted
185
-
const parsed = atproto.AtUri.parse(uri);
186
-
// parsed.repo (the DID)
187
-
// parsed.collection
188
-
// parsed.rkey
189
-
```
190
-
191
-
also want:
192
-
- `AtUri.format()` to construct URIs
193
-
- validation (is this a valid DID? valid rkey?)
194
-
- CID parsing/validation
195
-
196
-
---
197
-
198
-
## 5. jetstream / firehose client
199
-
200
-
we used a separate websocket library and manually parsed jetstream messages.
201
-
202
-
### current
203
-
204
-
```zig
205
-
const websocket = @import("websocket"); // third party
206
-
207
-
// manual connection with exponential backoff
208
-
// manual message parsing
209
-
// manual event dispatch
210
-
```
211
-
212
-
### what we want
213
-
214
-
```zig
215
-
const atproto = @import("atproto");
216
-
217
-
var jetstream = atproto.Jetstream.init(allocator, .{
218
-
.endpoint = "jetstream2.us-east.bsky.network",
219
-
.collections = &[_][]const u8{"app.bsky.feed.post"},
220
-
});
221
-
222
-
// typed events!
223
-
while (try jetstream.next()) |event| {
224
-
switch (event) {
225
-
.commit => |commit| {
226
-
switch (commit.operation) {
227
-
.create => |record| {
228
-
// record is already typed based on collection
229
-
if (commit.collection == .feed_post) {
230
-
const post: atproto.feed.Post = record;
231
-
std.debug.print("new post: {s}\n", .{post.text});
232
-
}
233
-
},
234
-
.delete => { ... },
235
-
}
236
-
},
237
-
.identity => |identity| { ... },
238
-
.account => |account| { ... },
239
-
}
240
-
}
241
-
```
242
-
243
-
bonus points:
244
-
- automatic reconnection with configurable backoff
245
-
- cursor support for resuming from a position
246
-
- filtering (dids, collections) built-in
247
-
- automatic decompression if using zstd streams
248
-
249
-
---
250
-
251
-
## 6. record operations
252
-
253
-
CRUD for records is manual json construction.
254
-
255
-
### current
256
-
257
-
```zig
258
-
var body_buf: std.ArrayList(u8) = .{};
259
-
try body_buf.print(allocator,
260
-
\\{{"repo":"{s}","collection":"app.bsky.feed.post","record":{{...}}}}
261
-
, .{ did, ... });
262
-
263
-
const result = client.fetch(.{
264
-
.location = .{ .url = "https://bsky.social/xrpc/com.atproto.repo.createRecord" },
265
-
.method = .POST,
266
-
.headers = .{ .content_type = .{ .override = "application/json" }, ... },
267
-
.payload = body_buf.items,
268
-
...
269
-
});
270
-
```
271
-
272
-
### what we want
273
-
274
-
```zig
275
-
// create
276
-
const result = try agent.createRecord("app.bsky.feed.post", .{
277
-
.text = "hello world",
278
-
.createdAt = atproto.Datetime.now(),
279
-
});
280
-
// result.uri, result.cid are typed
281
-
282
-
// read
283
-
const record = try agent.getRecord(atproto.feed.Post, uri);
284
-
285
-
// delete
286
-
try agent.deleteRecord(uri);
287
-
288
-
// list
289
-
var iter = agent.listRecords("app.bsky.feed.post", .{ .limit = 50 });
290
-
while (try iter.next()) |record| { ... }
291
-
```
292
-
293
-
---
294
-
295
-
## 7. rich text / facets
296
-
297
-
we avoided facets entirely because they're complex. an sdk should make them easy.
298
-
299
-
### what we want
300
-
301
-
```zig
302
-
const rt = atproto.RichText.init(allocator);
303
-
try rt.append("check out ");
304
-
try rt.appendLink("this repo", "https://github.com/...");
305
-
try rt.append(" by ");
306
-
try rt.appendMention("@someone.bsky.social");
307
-
try rt.append(" ");
308
-
try rt.appendTag("zig");
309
-
310
-
const post = atproto.feed.Post{
311
-
.text = rt.text(),
312
-
.facets = rt.facets(),
313
-
};
314
-
```
315
-
316
-
the sdk should:
317
-
- handle unicode byte offsets correctly (this is notoriously tricky)
318
-
- auto-detect links/mentions/tags in plain text
319
-
- validate handles resolve to real DIDs
320
-
321
-
---
322
-
323
-
## 8. rate limiting and retries
324
-
325
-
we have no rate limiting. when we hit limits, we just fail.
326
-
327
-
### what we want
328
-
329
-
```zig
330
-
var agent = atproto.Agent.init(allocator, .{
331
-
.rate_limit = .{
332
-
.strategy = .wait, // or .error
333
-
.max_retries = 3,
334
-
},
335
-
});
336
-
337
-
// agent automatically:
338
-
// - respects rate limit headers
339
-
// - waits and retries on 429
340
-
// - exponential backoff on transient errors
341
-
```
342
-
343
-
---
344
-
345
-
## 9. pagination helpers
346
-
347
-
listing records or searching requires manual cursor handling.
348
-
349
-
```zig
350
-
// current: manual
351
-
var cursor: ?[]const u8 = null;
352
-
while (true) {
353
-
const response = try fetch(cursor);
354
-
for (response.records) |record| { ... }
355
-
cursor = response.cursor orelse break;
356
-
}
357
-
358
-
// wanted: iterator
359
-
var iter = agent.listRecords("app.bsky.feed.post", .{});
360
-
while (try iter.next()) |record| {
361
-
// handles pagination transparently
362
-
}
363
-
364
-
// or collect all
365
-
const all_records = try iter.collect(); // fetches all pages
366
-
```
367
-
368
-
---
369
-
370
-
## 10. did resolution
371
-
372
-
we manually hit plc.directory to resolve DIDs.
373
-
374
-
```zig
375
-
// current
376
-
var url_buf: [256]u8 = undefined;
377
-
const url = std.fmt.bufPrint(&url_buf, "https://plc.directory/{s}", .{did});
378
-
// fetch, parse, find service endpoint...
379
-
380
-
// wanted
381
-
const doc = try atproto.resolveDid(did);
382
-
// doc.pds - the PDS endpoint
383
-
// doc.handle - verified handle
384
-
// doc.signingKey, doc.rotationKeys, etc.
385
-
```
386
-
387
-
should support:
388
-
- did:plc via plc.directory
389
-
- did:web via .well-known
390
-
- caching with TTL
391
-
392
-
---
393
-
394
-
## 11. build.zig integration
395
-
396
-
### lexicon codegen
397
-
398
-
```zig
399
-
// build.zig
400
-
const atproto = @import("atproto");
401
-
402
-
pub fn build(b: *std.Build) void {
403
-
// generate zig types from lexicon schemas
404
-
const lexicons = atproto.addLexiconCodegen(b, .{
405
-
.lexicon_dirs = &.{"lexicons/"},
406
-
// or fetch from network
407
-
.fetch_lexicons = &.{
408
-
"app.bsky.feed.*",
409
-
"app.bsky.actor.*",
410
-
"com.atproto.repo.*",
411
-
},
412
-
});
413
-
414
-
exe.root_module.addImport("lexicons", lexicons);
415
-
}
416
-
```
417
-
418
-
### bundled CA certs
419
-
420
-
TLS in zig requires CA certs. would be nice if the sdk bundled mozilla's CA bundle or made it easy to configure.
421
-
422
-
---
423
-
424
-
## 12. testing utilities
425
-
426
-
### mocks
427
-
428
-
```zig
429
-
const atproto = @import("atproto");
430
-
431
-
test "bot responds to matching posts" {
432
-
var mock = atproto.testing.MockAgent.init(allocator);
433
-
defer mock.deinit();
434
-
435
-
// set up expected calls
436
-
mock.expectCreateRecord("app.bsky.feed.post", .{
437
-
.text = "",
438
-
// ...
439
-
});
440
-
441
-
// run test code
442
-
try handlePost(&mock, test_post);
443
-
444
-
// verify
445
-
try mock.verify();
446
-
}
447
-
```
448
-
449
-
### jetstream replay
450
-
451
-
```zig
452
-
// replay recorded jetstream events for testing
453
-
var replay = atproto.testing.JetstreamReplay.init("testdata/events.jsonl");
454
-
while (try replay.next()) |event| {
455
-
try handleEvent(event);
456
-
}
457
-
```
458
-
459
-
---
460
-
461
-
## 13. logging / observability
462
-
463
-
### structured logging
464
-
465
-
```zig
466
-
var agent = atproto.Agent.init(allocator, .{
467
-
.logger = myLogger, // compatible with std.log or custom
468
-
});
469
-
470
-
// logs requests, responses, retries, rate limits
471
-
```
472
-
473
-
### metrics
474
-
475
-
```zig
476
-
var agent = atproto.Agent.init(allocator, .{
477
-
.metrics = .{
478
-
.requests_total = &my_counter,
479
-
.request_duration = &my_histogram,
480
-
.rate_limit_waits = &my_counter,
481
-
},
482
-
});
483
-
```
484
-
485
-
---
486
-
487
-
## 14. error handling
488
-
489
-
### typed errors with context
490
-
491
-
```zig
492
-
// current: generic errors
493
-
error.PostFailed
494
-
495
-
// wanted: rich errors
496
-
atproto.Error.RateLimit => |e| {
497
-
std.debug.print("rate limited, reset at {}\n", .{e.reset_at});
498
-
},
499
-
atproto.Error.InvalidRecord => |e| {
500
-
std.debug.print("validation failed: {s}\n", .{e.message});
501
-
},
502
-
atproto.Error.ExpiredToken => {
503
-
// sdk should handle this automatically, but if not...
504
-
},
505
-
```
506
-
507
-
---
508
-
509
-
## 15. moderation / labels
510
-
511
-
we didn't need this for bufo-bot, but a complete sdk should support:
512
-
513
-
```zig
514
-
// applying labels
515
-
try agent.createLabels(.{
516
-
.src = agent.did,
517
-
.uri = post_uri,
518
-
.val = "spam",
519
-
});
520
-
521
-
// reading labels on content
522
-
const labels = try agent.getLabels(uri);
523
-
for (labels) |label| {
524
-
if (mem.eql(u8, label.val, "nsfw")) {
525
-
// handle...
526
-
}
527
-
}
528
-
```
529
-
530
-
---
531
-
532
-
## 16. feed generators and custom feeds
533
-
534
-
```zig
535
-
// serving a feed generator
536
-
var server = atproto.FeedGenerator.init(allocator, .{
537
-
.did = my_feed_did,
538
-
.hostname = "feed.example.com",
539
-
});
540
-
541
-
server.addFeed("trending-bufos", struct {
542
-
fn getFeed(ctx: *Context, params: GetFeedParams) !GetFeedResponse {
543
-
// return skeleton
544
-
}
545
-
}.getFeed);
546
-
547
-
try server.listen(8080);
548
-
```
549
-
550
-
---
551
-
552
-
## summary
553
-
554
-
the core theme: **let us write application logic, not protocol plumbing**.
555
-
556
-
right now building an atproto app in zig means:
557
-
- manual json construction/parsing everywhere
558
-
- hand-rolling authentication flows
559
-
- string interpolation for record creation
560
-
- manual http request management
561
-
- third-party websocket libraries for firehose
562
-
- no compile-time safety for lexicon types
563
-
564
-
a good sdk would give us:
565
-
- typed lexicon schemas (codegen)
566
-
- managed sessions with automatic refresh
567
-
- high-level record CRUD
568
-
- built-in jetstream client with typed events
569
-
- utilities for rich text, AT-URIs, DIDs
570
-
- rate limiting and retry logic
571
-
- testing helpers
572
-
573
-
the dream is writing a bot like bufo-bot in ~100 lines instead of ~1000.
-1
justfile
-1
justfile
-153
scripts/add_one_bufo.py
-153
scripts/add_one_bufo.py
···
1
-
#!/usr/bin/env python3
2
-
# /// script
3
-
# requires-python = ">=3.11"
4
-
# dependencies = [
5
-
# "httpx",
6
-
# "python-dotenv",
7
-
# "pillow",
8
-
# ]
9
-
# ///
10
-
"""
11
-
Add a single bufo to turbopuffer.
12
-
Usage: uv run scripts/add_one_bufo.py <path_to_image>
13
-
"""
14
-
15
-
import asyncio
16
-
import base64
17
-
import hashlib
18
-
import os
19
-
import sys
20
-
from io import BytesIO
21
-
from pathlib import Path
22
-
23
-
import httpx
24
-
from PIL import Image
25
-
from dotenv import load_dotenv
26
-
27
-
load_dotenv(Path(__file__).parent.parent / ".env")
28
-
29
-
30
-
async def embed_image(client: httpx.AsyncClient, image_path: Path, api_key: str) -> list[float] | None:
31
-
"""Generate embedding for an image using Voyage AI"""
32
-
try:
33
-
image = Image.open(image_path)
34
-
is_animated = hasattr(image, 'n_frames') and image.n_frames > 1
35
-
filename_text = image_path.stem.replace("-", " ").replace("_", " ")
36
-
37
-
content = [{"type": "text", "text": filename_text}]
38
-
39
-
if is_animated:
40
-
num_frames = image.n_frames
41
-
max_frames = min(5, num_frames)
42
-
frame_indices = [int(i * (num_frames - 1) / (max_frames - 1)) for i in range(max_frames)]
43
-
for frame_idx in frame_indices:
44
-
image.seek(frame_idx)
45
-
buffered = BytesIO()
46
-
image.convert("RGB").save(buffered, format="WEBP", lossless=True)
47
-
img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
48
-
content.append({
49
-
"type": "image_base64",
50
-
"image_base64": f"data:image/webp;base64,{img_base64}",
51
-
})
52
-
else:
53
-
buffered = BytesIO()
54
-
image.convert("RGB").save(buffered, format="WEBP", lossless=True)
55
-
img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
56
-
content.append({
57
-
"type": "image_base64",
58
-
"image_base64": f"data:image/webp;base64,{img_base64}",
59
-
})
60
-
61
-
response = await client.post(
62
-
"https://api.voyageai.com/v1/multimodalembeddings",
63
-
headers={
64
-
"Authorization": f"Bearer {api_key}",
65
-
"Content-Type": "application/json",
66
-
},
67
-
json={
68
-
"inputs": [{"content": content}],
69
-
"model": "voyage-multimodal-3",
70
-
"input_type": "document",
71
-
},
72
-
timeout=60.0,
73
-
)
74
-
response.raise_for_status()
75
-
result = response.json()
76
-
return result["data"][0]["embedding"]
77
-
except Exception as e:
78
-
print(f"error embedding {image_path.name}: {e}")
79
-
return None
80
-
81
-
82
-
async def upload_to_turbopuffer(filename: str, embedding: list[float], api_key: str, namespace: str):
83
-
"""Upload single embedding to turbopuffer"""
84
-
file_hash = hashlib.sha256(filename.encode()).hexdigest()[:16]
85
-
name = filename.rsplit(".", 1)[0]
86
-
url = f"https://find-bufo.com/static/{filename}"
87
-
88
-
async with httpx.AsyncClient() as client:
89
-
response = await client.post(
90
-
f"https://api.turbopuffer.com/v1/vectors/{namespace}",
91
-
headers={
92
-
"Authorization": f"Bearer {api_key}",
93
-
"Content-Type": "application/json",
94
-
},
95
-
json={
96
-
"ids": [file_hash],
97
-
"vectors": [embedding],
98
-
"distance_metric": "cosine_distance",
99
-
"attributes": {
100
-
"url": [url],
101
-
"name": [name],
102
-
"filename": [filename],
103
-
},
104
-
"schema": {
105
-
"name": {"type": "string", "full_text_search": True},
106
-
"filename": {"type": "string", "full_text_search": True},
107
-
},
108
-
},
109
-
timeout=30.0,
110
-
)
111
-
if response.status_code != 200:
112
-
print(f"turbopuffer error: {response.text}")
113
-
response.raise_for_status()
114
-
115
-
print(f"uploaded {filename} to turbopuffer")
116
-
117
-
118
-
async def main():
119
-
if len(sys.argv) < 2:
120
-
print("usage: uv run scripts/add_one_bufo.py <path_to_image>")
121
-
sys.exit(1)
122
-
123
-
image_path = Path(sys.argv[1])
124
-
if not image_path.exists():
125
-
print(f"file not found: {image_path}")
126
-
sys.exit(1)
127
-
128
-
voyage_api_key = os.getenv("VOYAGE_API_TOKEN")
129
-
if not voyage_api_key:
130
-
print("VOYAGE_API_TOKEN not set")
131
-
sys.exit(1)
132
-
133
-
tpuf_api_key = os.getenv("TURBOPUFFER_API_KEY")
134
-
if not tpuf_api_key:
135
-
print("TURBOPUFFER_API_KEY not set")
136
-
sys.exit(1)
137
-
138
-
tpuf_namespace = os.getenv("TURBOPUFFER_NAMESPACE", "bufos")
139
-
140
-
print(f"adding {image_path.name}...")
141
-
142
-
async with httpx.AsyncClient() as client:
143
-
embedding = await embed_image(client, image_path, voyage_api_key)
144
-
if not embedding:
145
-
print("failed to generate embedding")
146
-
sys.exit(1)
147
-
148
-
await upload_to_turbopuffer(image_path.name, embedding, tpuf_api_key, tpuf_namespace)
149
-
print("done!")
150
-
151
-
152
-
if __name__ == "__main__":
153
-
asyncio.run(main())
+20
-35
src/embedding.rs
+20
-35
src/embedding.rs
···
1
-
//! voyage AI embedding implementation
2
-
//!
3
-
//! implements the `Embedder` trait for voyage's multimodal-3 model.
4
-
5
-
use crate::providers::{Embedder, EmbeddingError};
1
+
use anyhow::{Context, Result};
6
2
use reqwest::Client;
7
3
use serde::{Deserialize, Serialize};
8
4
9
-
const VOYAGE_API_URL: &str = "https://api.voyageai.com/v1/multimodalembeddings";
10
-
const VOYAGE_MODEL: &str = "voyage-multimodal-3";
11
-
12
5
#[derive(Debug, Serialize)]
13
-
struct VoyageRequest {
6
+
struct VoyageEmbeddingRequest {
14
7
inputs: Vec<MultimodalInput>,
15
8
model: String,
16
9
#[serde(skip_serializing_if = "Option::is_none")]
···
29
22
}
30
23
31
24
#[derive(Debug, Deserialize)]
32
-
struct VoyageResponse {
25
+
struct VoyageEmbeddingResponse {
33
26
data: Vec<VoyageEmbeddingData>,
34
27
}
35
28
···
38
31
embedding: Vec<f32>,
39
32
}
40
33
41
-
/// voyage AI multimodal embedding client
42
-
///
43
-
/// uses the voyage-multimodal-3 model which produces 1024-dimensional vectors.
44
-
/// designed for early fusion of text and image content.
45
-
#[derive(Clone)]
46
-
pub struct VoyageEmbedder {
34
+
pub struct EmbeddingClient {
47
35
client: Client,
48
36
api_key: String,
49
37
}
50
38
51
-
impl VoyageEmbedder {
39
+
impl EmbeddingClient {
52
40
pub fn new(api_key: String) -> Self {
53
41
Self {
54
42
client: Client::new(),
55
43
api_key,
56
44
}
57
45
}
58
-
}
59
46
60
-
impl Embedder for VoyageEmbedder {
61
-
async fn embed(&self, text: &str) -> Result<Vec<f32>, EmbeddingError> {
62
-
let request = VoyageRequest {
47
+
pub async fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
48
+
let request = VoyageEmbeddingRequest {
63
49
inputs: vec![MultimodalInput {
64
50
content: vec![ContentSegment::Text {
65
51
text: text.to_string(),
66
52
}],
67
53
}],
68
-
model: VOYAGE_MODEL.to_string(),
54
+
model: "voyage-multimodal-3".to_string(),
69
55
input_type: Some("query".to_string()),
70
56
};
71
57
72
58
let response = self
73
59
.client
74
-
.post(VOYAGE_API_URL)
60
+
.post("https://api.voyageai.com/v1/multimodalembeddings")
75
61
.header("Authorization", format!("Bearer {}", self.api_key))
76
62
.json(&request)
77
63
.send()
78
-
.await?;
64
+
.await
65
+
.context("failed to send embedding request")?;
79
66
80
67
if !response.status().is_success() {
81
-
let status = response.status().as_u16();
68
+
let status = response.status();
82
69
let body = response.text().await.unwrap_or_default();
83
-
return Err(EmbeddingError::Api { status, body });
70
+
anyhow::bail!("voyage api error ({}): {}", status, body);
84
71
}
85
72
86
-
let voyage_response: VoyageResponse = response.json().await.map_err(|e| {
87
-
EmbeddingError::Other(anyhow::anyhow!("failed to parse response: {}", e))
88
-
})?;
73
+
let embedding_response: VoyageEmbeddingResponse = response
74
+
.json()
75
+
.await
76
+
.context("failed to parse embedding response")?;
89
77
90
-
voyage_response
78
+
let embedding = embedding_response
91
79
.data
92
80
.into_iter()
93
81
.next()
94
82
.map(|d| d.embedding)
95
-
.ok_or(EmbeddingError::EmptyResponse)
96
-
}
83
+
.context("no embedding returned")?;
97
84
98
-
fn name(&self) -> &'static str {
99
-
"voyage-multimodal-3"
85
+
Ok(embedding)
100
86
}
101
87
}
102
-
-193
src/filter.rs
-193
src/filter.rs
···
1
-
//! composable result filters
2
-
//!
3
-
//! filters are predicates that can be combined to create complex filtering logic.
4
-
5
-
use regex::Regex;
6
-
7
-
/// a single search result that can be filtered
8
-
pub trait Filterable {
9
-
fn name(&self) -> &str;
10
-
}
11
-
12
-
/// a predicate that can accept or reject items
13
-
pub trait Filter<T: Filterable>: Send + Sync {
14
-
/// returns true if the item should be kept
15
-
fn matches(&self, item: &T) -> bool;
16
-
}
17
-
18
-
/// filters out inappropriate content based on a blocklist
19
-
struct BlocklistFilter {
20
-
blocklist: Vec<&'static str>,
21
-
}
22
-
23
-
impl BlocklistFilter {
24
-
fn inappropriate_bufos() -> Self {
25
-
Self {
26
-
blocklist: vec![
27
-
"bufo-juicy",
28
-
"good-news-bufo-offers-suppository",
29
-
"bufo-declines-your-suppository-offer",
30
-
"tsa-bufo-gropes-you",
31
-
],
32
-
}
33
-
}
34
-
}
35
-
36
-
impl<T: Filterable> Filter<T> for BlocklistFilter {
37
-
fn matches(&self, item: &T) -> bool {
38
-
!self.blocklist.iter().any(|blocked| item.name().contains(blocked))
39
-
}
40
-
}
41
-
42
-
/// filters out items matching any of the given regex patterns
43
-
struct ExcludePatternFilter {
44
-
patterns: Vec<Regex>,
45
-
}
46
-
47
-
impl ExcludePatternFilter {
48
-
fn from_comma_separated(pattern_str: &str) -> Self {
49
-
let patterns = pattern_str
50
-
.split(',')
51
-
.map(|p| p.trim())
52
-
.filter(|p| !p.is_empty())
53
-
.filter_map(|p| Regex::new(p).ok())
54
-
.collect();
55
-
56
-
Self { patterns }
57
-
}
58
-
59
-
fn empty() -> Self {
60
-
Self { patterns: vec![] }
61
-
}
62
-
}
63
-
64
-
impl<T: Filterable> Filter<T> for ExcludePatternFilter {
65
-
fn matches(&self, item: &T) -> bool {
66
-
!self.patterns.iter().any(|p| p.is_match(item.name()))
67
-
}
68
-
}
69
-
70
-
/// combined filter that handles family-friendly mode and include/exclude patterns
71
-
pub struct ContentFilter {
72
-
family_friendly: bool,
73
-
blocklist: BlocklistFilter,
74
-
exclude: ExcludePatternFilter,
75
-
include_patterns: Vec<Regex>,
76
-
}
77
-
78
-
impl ContentFilter {
79
-
pub fn new(
80
-
family_friendly: bool,
81
-
exclude_str: Option<&str>,
82
-
include_str: Option<&str>,
83
-
) -> Self {
84
-
let exclude = exclude_str
85
-
.map(ExcludePatternFilter::from_comma_separated)
86
-
.unwrap_or_else(ExcludePatternFilter::empty);
87
-
88
-
let include_patterns: Vec<Regex> = include_str
89
-
.map(|s| {
90
-
s.split(',')
91
-
.map(|p| p.trim())
92
-
.filter(|p| !p.is_empty())
93
-
.filter_map(|p| Regex::new(p).ok())
94
-
.collect()
95
-
})
96
-
.unwrap_or_default();
97
-
98
-
Self {
99
-
family_friendly,
100
-
blocklist: BlocklistFilter::inappropriate_bufos(),
101
-
exclude,
102
-
include_patterns,
103
-
}
104
-
}
105
-
106
-
pub fn exclude_pattern_count(&self) -> usize {
107
-
self.exclude.patterns.len()
108
-
}
109
-
110
-
pub fn exclude_patterns_str(&self) -> String {
111
-
self.exclude
112
-
.patterns
113
-
.iter()
114
-
.map(|r| r.as_str())
115
-
.collect::<Vec<_>>()
116
-
.join(",")
117
-
}
118
-
}
119
-
120
-
impl<T: Filterable> Filter<T> for ContentFilter {
121
-
fn matches(&self, item: &T) -> bool {
122
-
// check family-friendly blocklist
123
-
if self.family_friendly && !self.blocklist.matches(item) {
124
-
return false;
125
-
}
126
-
127
-
// check if explicitly included (overrides exclude)
128
-
let matches_include = self.include_patterns.iter().any(|p| p.is_match(item.name()));
129
-
if matches_include {
130
-
return true;
131
-
}
132
-
133
-
// check exclude patterns
134
-
self.exclude.matches(item)
135
-
}
136
-
}
137
-
138
-
#[cfg(test)]
139
-
mod tests {
140
-
use super::*;
141
-
142
-
struct TestItem {
143
-
name: String,
144
-
}
145
-
146
-
impl Filterable for TestItem {
147
-
fn name(&self) -> &str {
148
-
&self.name
149
-
}
150
-
}
151
-
152
-
#[test]
153
-
fn test_blocklist_filter() {
154
-
let filter = BlocklistFilter::inappropriate_bufos();
155
-
let good = TestItem {
156
-
name: "bufo-happy".into(),
157
-
};
158
-
let bad = TestItem {
159
-
name: "bufo-juicy".into(),
160
-
};
161
-
162
-
assert!(filter.matches(&good));
163
-
assert!(!filter.matches(&bad));
164
-
}
165
-
166
-
#[test]
167
-
fn test_exclude_pattern_filter() {
168
-
let filter = ExcludePatternFilter::from_comma_separated("test, draft");
169
-
let good = TestItem {
170
-
name: "bufo-happy".into(),
171
-
};
172
-
let bad = TestItem {
173
-
name: "bufo-test-mode".into(),
174
-
};
175
-
176
-
assert!(filter.matches(&good));
177
-
assert!(!filter.matches(&bad));
178
-
}
179
-
180
-
#[test]
181
-
fn test_include_overrides_exclude() {
182
-
let filter = ContentFilter::new(false, Some("party"), Some("birthday-party"));
183
-
let excluded = TestItem {
184
-
name: "bufo-party".into(),
185
-
};
186
-
let included = TestItem {
187
-
name: "bufo-birthday-party".into(),
188
-
};
189
-
190
-
assert!(!filter.matches(&excluded));
191
-
assert!(filter.matches(&included));
192
-
}
193
-
}
-3
src/main.rs
-3
src/main.rs
-99
src/providers.rs
-99
src/providers.rs
···
1
-
//! provider abstractions for embedding and vector search backends
2
-
//!
3
-
//! these traits allow swapping implementations (e.g., voyage → openai embeddings)
4
-
//! without changing the search logic.
5
-
//!
6
-
//! ## design notes
7
-
//!
8
-
//! we use `async fn` in traits directly (stabilized in rust 1.75). for this crate's
9
-
//! use case (single-threaded actix-web), the Send bound issue doesn't apply.
10
-
//!
11
-
//! the trait design follows patterns from:
12
-
//! - async-openai's `Config` trait for backend abstraction
13
-
//! - tower's `Service` trait for composability (though simpler here)
14
-
15
-
use std::future::Future;
16
-
use thiserror::Error;
17
-
18
-
/// errors that can occur when generating embeddings
19
-
#[derive(Debug, Error)]
20
-
pub enum EmbeddingError {
21
-
#[error("failed to send request: {0}")]
22
-
Request(#[from] reqwest::Error),
23
-
24
-
#[error("api error ({status}): {body}")]
25
-
Api { status: u16, body: String },
26
-
27
-
#[error("no embedding returned from provider")]
28
-
EmptyResponse,
29
-
30
-
#[error("{0}")]
31
-
Other(#[from] anyhow::Error),
32
-
}
33
-
34
-
/// a provider that can generate embeddings for text
35
-
///
36
-
/// implementations should be cheap to clone (wrap expensive resources in Arc).
37
-
///
38
-
/// # example
39
-
///
40
-
/// ```ignore
41
-
/// let client = VoyageEmbedder::new(api_key);
42
-
/// let embedding = client.embed("hello world").await?;
43
-
/// ```
44
-
pub trait Embedder: Send + Sync {
45
-
/// generate an embedding vector for the given text
46
-
fn embed(&self, text: &str) -> impl Future<Output = Result<Vec<f32>, EmbeddingError>> + Send;
47
-
48
-
/// human-readable name for logging/debugging
49
-
fn name(&self) -> &'static str;
50
-
}
51
-
52
-
/// errors that can occur during vector search
53
-
#[derive(Debug, Error)]
54
-
pub enum VectorSearchError {
55
-
#[error("request failed: {0}")]
56
-
Request(#[from] reqwest::Error),
57
-
58
-
#[error("api error ({status}): {body}")]
59
-
Api { status: u16, body: String },
60
-
61
-
#[error("query too long: {message}")]
62
-
QueryTooLong { message: String },
63
-
64
-
#[error("parse error: {0}")]
65
-
Parse(String),
66
-
67
-
#[error("{0}")]
68
-
Other(#[from] anyhow::Error),
69
-
}
70
-
71
-
/// a single result from a vector search
72
-
#[derive(Debug, Clone)]
73
-
pub struct SearchResult {
74
-
pub id: String,
75
-
/// raw distance/score from the backend (interpretation varies by method)
76
-
pub score: f32,
77
-
/// arbitrary key-value attributes
78
-
pub attributes: std::collections::HashMap<String, String>,
79
-
}
80
-
81
-
/// a provider that can perform vector similarity search
82
-
pub trait VectorStore: Send + Sync {
83
-
/// search by vector embedding (ANN/cosine similarity)
84
-
fn search_by_vector(
85
-
&self,
86
-
embedding: &[f32],
87
-
top_k: usize,
88
-
) -> impl Future<Output = Result<Vec<SearchResult>, VectorSearchError>> + Send;
89
-
90
-
/// search by keyword (BM25 full-text search)
91
-
fn search_by_keyword(
92
-
&self,
93
-
query: &str,
94
-
top_k: usize,
95
-
) -> impl Future<Output = Result<Vec<SearchResult>, VectorSearchError>> + Send;
96
-
97
-
/// human-readable name for logging/debugging
98
-
fn name(&self) -> &'static str;
99
-
}
-164
src/scoring.rs
-164
src/scoring.rs
···
1
-
//! score fusion and normalization for hybrid search
2
-
//!
3
-
//! this module handles the weighted combination of semantic (vector) and
4
-
//! keyword (BM25) search scores.
5
-
//!
6
-
//! ## normalization strategies
7
-
//!
8
-
//! - **cosine distance → similarity**: `1.0 - (distance / 2.0)` maps [0, 2] → [1, 0]
9
-
//! - **BM25 max-scaling**: divide by max score so top result = 1.0
10
-
//!
11
-
//! ## fusion formula
12
-
//!
13
-
//! ```text
14
-
//! score = α * semantic + (1 - α) * keyword
15
-
//! ```
16
-
//!
17
-
//! reference: https://opensourceconnections.com/blog/2023/02/27/hybrid-vigor-winning-at-hybrid-search/
18
-
19
-
use std::collections::HashMap;
20
-
21
-
/// configuration for score fusion
22
-
#[derive(Debug, Clone)]
23
-
pub struct FusionConfig {
24
-
/// weight for semantic scores (0.0 = pure keyword, 1.0 = pure semantic)
25
-
pub alpha: f32,
26
-
/// minimum fused score to include in results (filters noise)
27
-
pub min_score: f32,
28
-
}
29
-
30
-
impl Default for FusionConfig {
31
-
fn default() -> Self {
32
-
Self {
33
-
alpha: 0.7,
34
-
min_score: 0.001,
35
-
}
36
-
}
37
-
}
38
-
39
-
impl FusionConfig {
40
-
pub fn new(alpha: f32) -> Self {
41
-
Self {
42
-
alpha,
43
-
..Default::default()
44
-
}
45
-
}
46
-
}
47
-
48
-
/// normalize cosine distance to similarity score
49
-
///
50
-
/// cosine distance ranges from 0 (identical) to 2 (opposite).
51
-
/// we convert to similarity: 1.0 (identical) to 0.0 (opposite).
52
-
#[inline]
53
-
pub fn cosine_distance_to_similarity(distance: f32) -> f32 {
54
-
1.0 - (distance / 2.0)
55
-
}
56
-
57
-
/// normalize BM25 scores using max-scaling
58
-
///
59
-
/// divides all scores by the maximum score, ensuring:
60
-
/// - top result gets score 1.0
61
-
/// - relative spacing is preserved
62
-
/// - handles edge cases (empty results, identical scores)
63
-
pub fn normalize_bm25_scores(scores: &[(String, f32)]) -> HashMap<String, f32> {
64
-
let max_score = scores
65
-
.iter()
66
-
.map(|(_, s)| *s)
67
-
.fold(f32::NEG_INFINITY, f32::max)
68
-
.max(0.001); // avoid division by zero
69
-
70
-
scores
71
-
.iter()
72
-
.map(|(id, score)| (id.clone(), (score / max_score).min(1.0)))
73
-
.collect()
74
-
}
75
-
76
-
/// fuse semantic and keyword scores using weighted combination
77
-
///
78
-
/// returns items sorted by fused score (descending), filtered by min_score.
79
-
pub fn fuse_scores(
80
-
semantic_scores: &HashMap<String, f32>,
81
-
keyword_scores: &HashMap<String, f32>,
82
-
config: &FusionConfig,
83
-
) -> Vec<(String, f32)> {
84
-
// collect all unique IDs
85
-
let all_ids: std::collections::HashSet<_> = semantic_scores
86
-
.keys()
87
-
.chain(keyword_scores.keys())
88
-
.collect();
89
-
90
-
let mut fused: Vec<(String, f32)> = all_ids
91
-
.into_iter()
92
-
.map(|id| {
93
-
let semantic = semantic_scores.get(id).copied().unwrap_or(0.0);
94
-
let keyword = keyword_scores.get(id).copied().unwrap_or(0.0);
95
-
let score = config.alpha * semantic + (1.0 - config.alpha) * keyword;
96
-
(id.clone(), score)
97
-
})
98
-
.filter(|(_, score)| *score > config.min_score)
99
-
.collect();
100
-
101
-
// sort descending by score
102
-
fused.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
103
-
104
-
fused
105
-
}
106
-
107
-
#[cfg(test)]
108
-
mod tests {
109
-
use super::*;
110
-
111
-
#[test]
112
-
fn test_cosine_distance_to_similarity() {
113
-
assert!((cosine_distance_to_similarity(0.0) - 1.0).abs() < 0.001);
114
-
assert!((cosine_distance_to_similarity(2.0) - 0.0).abs() < 0.001);
115
-
assert!((cosine_distance_to_similarity(1.0) - 0.5).abs() < 0.001);
116
-
}
117
-
118
-
#[test]
119
-
fn test_normalize_bm25_scores() {
120
-
let scores = vec![
121
-
("a".to_string(), 10.0),
122
-
("b".to_string(), 5.0),
123
-
("c".to_string(), 2.5),
124
-
];
125
-
126
-
let normalized = normalize_bm25_scores(&scores);
127
-
128
-
assert!((normalized["a"] - 1.0).abs() < 0.001);
129
-
assert!((normalized["b"] - 0.5).abs() < 0.001);
130
-
assert!((normalized["c"] - 0.25).abs() < 0.001);
131
-
}
132
-
133
-
#[test]
134
-
fn test_fuse_scores_pure_semantic() {
135
-
let mut semantic = HashMap::new();
136
-
semantic.insert("a".to_string(), 0.9);
137
-
semantic.insert("b".to_string(), 0.5);
138
-
139
-
let mut keyword = HashMap::new();
140
-
keyword.insert("a".to_string(), 0.1);
141
-
keyword.insert("c".to_string(), 1.0);
142
-
143
-
let config = FusionConfig::new(1.0); // pure semantic
144
-
let fused = fuse_scores(&semantic, &keyword, &config);
145
-
146
-
assert_eq!(fused[0].0, "a");
147
-
assert!((fused[0].1 - 0.9).abs() < 0.001);
148
-
}
149
-
150
-
#[test]
151
-
fn test_fuse_scores_balanced() {
152
-
let mut semantic = HashMap::new();
153
-
semantic.insert("a".to_string(), 0.8);
154
-
155
-
let mut keyword = HashMap::new();
156
-
keyword.insert("a".to_string(), 0.4);
157
-
158
-
let config = FusionConfig::new(0.5); // balanced
159
-
let fused = fuse_scores(&semantic, &keyword, &config);
160
-
161
-
// 0.5 * 0.8 + 0.5 * 0.4 = 0.6
162
-
assert!((fused[0].1 - 0.6).abs() < 0.001);
163
-
}
164
-
}
+218
-205
src/search.rs
+218
-205
src/search.rs
···
27
27
//! - `α=0.5`: balanced (equal weight to semantic and keyword signals)
28
28
//! - `α=0.0`: pure keyword (best for exact filename searches)
29
29
//!
30
+
//! ## empirical behavior
31
+
//!
32
+
//! query: "happy", top_k=3
33
+
//! - α=1.0: ["proud-bufo-is-excited", "bufo-hehe", "bufo-excited"] (semantic similarity)
34
+
//! - α=0.5: ["bufo-is-happy-youre-happy", ...] (exact match rises to top)
35
+
//! - α=0.0: ["bufo-is-happy-youre-happy" (1.0), others (0.0)] (only exact matches score)
36
+
//!
30
37
//! ## references
31
38
//!
32
39
//! - voyage multimodal embeddings: https://docs.voyageai.com/docs/multimodal-embeddings
···
34
41
//! - weighted fusion: standard approach in modern hybrid search systems (2024)
35
42
36
43
use crate::config::Config;
37
-
use crate::embedding::VoyageEmbedder;
38
-
use crate::filter::{ContentFilter, Filter, Filterable};
39
-
use crate::providers::{Embedder, VectorSearchError, VectorStore};
40
-
use crate::scoring::{cosine_distance_to_similarity, fuse_scores, normalize_bm25_scores, FusionConfig};
41
-
use crate::turbopuffer::TurbopufferStore;
44
+
use crate::embedding::EmbeddingClient;
45
+
use crate::turbopuffer::{QueryRequest, TurbopufferClient, TurbopufferError};
42
46
use actix_web::{web, HttpRequest, HttpResponse, Result as ActixResult};
43
47
use serde::{Deserialize, Serialize};
44
48
use std::collections::hash_map::DefaultHasher;
45
-
use std::collections::HashMap;
46
49
use std::hash::{Hash, Hasher};
47
50
48
51
#[derive(Debug, Deserialize)]
···
57
60
/// family-friendly mode: filters out inappropriate content (default true)
58
61
#[serde(default = "default_family_friendly")]
59
62
pub family_friendly: bool,
60
-
/// comma-separated regex patterns to exclude from results (e.g., "excited,party")
61
-
#[serde(default)]
62
-
pub exclude: Option<String>,
63
-
/// comma-separated regex patterns to include (overrides exclude)
64
-
#[serde(default)]
65
-
pub include: Option<String>,
66
63
}
67
64
68
65
fn default_top_k() -> usize {
···
77
74
true
78
75
}
79
76
77
+
/// blocklist of inappropriate bufos (filtered when family_friendly=true)
78
+
fn get_inappropriate_bufos() -> Vec<&'static str> {
79
+
vec![
80
+
"bufo-juicy",
81
+
"good-news-bufo-offers-suppository",
82
+
"bufo-declines-your-suppository-offer",
83
+
"tsa-bufo-gropes-you",
84
+
]
85
+
}
86
+
80
87
#[derive(Debug, Serialize)]
81
88
pub struct SearchResponse {
82
89
pub results: Vec<BufoResult>,
83
90
}
84
91
85
-
#[derive(Debug, Serialize, Clone)]
92
+
#[derive(Debug, Serialize)]
86
93
pub struct BufoResult {
87
94
pub id: String,
88
95
pub url: String,
89
96
pub name: String,
90
-
pub score: f32,
91
-
}
92
-
93
-
impl Filterable for BufoResult {
94
-
fn name(&self) -> &str {
95
-
&self.name
96
-
}
97
-
}
98
-
99
-
/// errors that can occur during search
100
-
#[derive(Debug, thiserror::Error)]
101
-
pub enum SearchError {
102
-
#[error("embedding error: {0}")]
103
-
Embedding(#[from] crate::providers::EmbeddingError),
104
-
105
-
#[error("vector search error: {0}")]
106
-
VectorSearch(#[from] VectorSearchError),
107
-
}
108
-
109
-
impl SearchError {
110
-
fn into_actix_error(self) -> actix_web::Error {
111
-
match &self {
112
-
SearchError::VectorSearch(VectorSearchError::QueryTooLong { .. }) => {
113
-
actix_web::error::ErrorBadRequest(
114
-
"search query is too long (max 1024 characters for text search). try a shorter query."
115
-
)
116
-
}
117
-
_ => actix_web::error::ErrorInternalServerError(self.to_string()),
118
-
}
119
-
}
97
+
pub score: f32, // normalized 0-1 score for display
120
98
}
121
99
122
100
/// generate etag for caching based on query parameters
123
-
fn generate_etag(
124
-
query: &str,
125
-
top_k: usize,
126
-
alpha: f32,
127
-
family_friendly: bool,
128
-
exclude: &Option<String>,
129
-
include: &Option<String>,
130
-
) -> String {
101
+
fn generate_etag(query: &str, top_k: usize, alpha: f32, family_friendly: bool) -> String {
131
102
let mut hasher = DefaultHasher::new();
132
103
query.hash(&mut hasher);
133
104
top_k.hash(&mut hasher);
105
+
// convert f32 to bits for consistent hashing
134
106
alpha.to_bits().hash(&mut hasher);
135
107
family_friendly.hash(&mut hasher);
136
-
exclude.hash(&mut hasher);
137
-
include.hash(&mut hasher);
138
108
format!("\"{}\"", hasher.finish())
139
109
}
140
110
141
-
/// execute hybrid search using the provided embedder and vector store
142
-
async fn execute_hybrid_search<E: Embedder, V: VectorStore>(
143
-
query: &str,
144
-
top_k: usize,
145
-
fusion_config: &FusionConfig,
146
-
embedder: &E,
147
-
vector_store: &V,
148
-
) -> Result<Vec<(String, f32, HashMap<String, String>)>, SearchError> {
149
-
// fetch extra results to ensure we have enough after filtering
150
-
let search_top_k = top_k * 5;
151
-
let query_owned = query.to_string();
111
+
/// shared search implementation used by both POST and GET handlers
112
+
async fn perform_search(
113
+
query_text: String,
114
+
top_k_val: usize,
115
+
alpha: f32,
116
+
family_friendly: bool,
117
+
config: &Config,
118
+
) -> ActixResult<SearchResponse> {
152
119
153
-
// generate query embedding
154
-
let _embed_span = logfire::span!(
155
-
"embedding.generate",
156
-
query = &query_owned,
157
-
model = embedder.name()
158
-
)
159
-
.entered();
120
+
let _search_span = logfire::span!(
121
+
"bufo_search",
122
+
query = &query_text,
123
+
top_k = top_k_val as i64,
124
+
alpha = alpha as f64,
125
+
family_friendly = family_friendly
126
+
).entered();
160
127
161
-
let query_embedding = embedder.embed(query).await?;
128
+
logfire::info!(
129
+
"search request received",
130
+
query = &query_text,
131
+
top_k = top_k_val as i64,
132
+
alpha = alpha as f64
133
+
);
134
+
135
+
let embedding_client = EmbeddingClient::new(config.voyage_api_key.clone());
136
+
let tpuf_client = TurbopufferClient::new(
137
+
config.turbopuffer_api_key.clone(),
138
+
config.turbopuffer_namespace.clone(),
139
+
);
140
+
141
+
// generate embedding for user query
142
+
let query_embedding = {
143
+
let _span = logfire::span!(
144
+
"voyage.embed_text",
145
+
query = &query_text,
146
+
model = "voyage-3-lite"
147
+
).entered();
148
+
149
+
embedding_client
150
+
.embed_text(&query_text)
151
+
.await
152
+
.map_err(|e| {
153
+
let error_msg = e.to_string();
154
+
logfire::error!(
155
+
"embedding generation failed",
156
+
error = error_msg,
157
+
query = &query_text
158
+
);
159
+
actix_web::error::ErrorInternalServerError(format!(
160
+
"failed to generate embedding: {}",
161
+
e
162
+
))
163
+
})?
164
+
};
162
165
163
166
logfire::info!(
164
167
"embedding generated",
165
-
query = &query_owned,
168
+
query = &query_text,
166
169
embedding_dim = query_embedding.len() as i64
167
170
);
168
171
169
-
// run both searches in sequence (could parallelize with tokio::join! if needed)
170
-
let namespace = vector_store.name().to_string();
172
+
// run vector search (semantic)
173
+
let search_top_k = top_k_val * 2; // get more results for better fusion
174
+
let vector_request = QueryRequest {
175
+
rank_by: vec![
176
+
serde_json::json!("vector"),
177
+
serde_json::json!("ANN"),
178
+
serde_json::json!(query_embedding),
179
+
],
180
+
top_k: search_top_k,
181
+
include_attributes: Some(vec!["url".to_string(), "name".to_string(), "filename".to_string()]),
182
+
};
171
183
184
+
let namespace = config.turbopuffer_namespace.clone();
172
185
let vector_results = {
173
186
let _span = logfire::span!(
174
187
"turbopuffer.vector_search",
175
-
query = &query_owned,
188
+
query = &query_text,
176
189
top_k = search_top_k as i64,
177
190
namespace = &namespace
178
-
)
179
-
.entered();
191
+
).entered();
180
192
181
-
vector_store
182
-
.search_by_vector(&query_embedding, search_top_k)
183
-
.await?
193
+
tpuf_client.query(vector_request).await.map_err(|e| {
194
+
let error_msg = e.to_string();
195
+
logfire::error!(
196
+
"vector search failed",
197
+
error = error_msg,
198
+
query = &query_text,
199
+
top_k = search_top_k as i64
200
+
);
201
+
actix_web::error::ErrorInternalServerError(format!(
202
+
"failed to query turbopuffer (vector): {}",
203
+
e
204
+
))
205
+
})?
184
206
};
185
207
186
208
logfire::info!(
187
209
"vector search completed",
188
-
query = &query_owned,
210
+
query = &query_text,
189
211
results_found = vector_results.len() as i64
190
212
);
191
213
214
+
// run BM25 text search (keyword)
192
215
let bm25_results = {
193
216
let _span = logfire::span!(
194
217
"turbopuffer.bm25_search",
195
-
query = &query_owned,
218
+
query = &query_text,
196
219
top_k = search_top_k as i64,
197
220
namespace = &namespace
198
-
)
199
-
.entered();
221
+
).entered();
200
222
201
-
vector_store.search_by_keyword(query, search_top_k).await?
223
+
tpuf_client.bm25_query(&query_text, search_top_k).await.map_err(|e| {
224
+
let error_msg = e.to_string();
225
+
logfire::error!(
226
+
"bm25 search failed",
227
+
error = error_msg,
228
+
query = &query_text,
229
+
top_k = search_top_k as i64
230
+
);
231
+
232
+
// return appropriate HTTP status based on error type
233
+
match e {
234
+
TurbopufferError::QueryTooLong { .. } => {
235
+
actix_web::error::ErrorBadRequest(
236
+
"search query is too long (max 1024 characters for text search). try a shorter query."
237
+
)
238
+
}
239
+
_ => {
240
+
actix_web::error::ErrorInternalServerError(format!(
241
+
"failed to query turbopuffer (BM25): {}",
242
+
e
243
+
))
244
+
}
245
+
}
246
+
})?
202
247
};
203
248
204
-
// normalize scores
205
-
let semantic_scores: HashMap<String, f32> = vector_results
206
-
.iter()
207
-
.map(|r| (r.id.clone(), cosine_distance_to_similarity(r.score)))
208
-
.collect();
249
+
// weighted fusion: combine vector and BM25 results
250
+
use std::collections::HashMap;
251
+
252
+
// normalize vector scores (cosine distance -> 0-1 similarity)
253
+
let mut semantic_scores: HashMap<String, f32> = HashMap::new();
254
+
for row in &vector_results {
255
+
let score = 1.0 - (row.dist / 2.0);
256
+
semantic_scores.insert(row.id.clone(), score);
257
+
}
209
258
210
-
let bm25_raw: Vec<(String, f32)> = bm25_results
211
-
.iter()
212
-
.map(|r| (r.id.clone(), r.score))
213
-
.collect();
214
-
let keyword_scores = normalize_bm25_scores(&bm25_raw);
259
+
// normalize BM25 scores using max normalization (BM25-max-scaled approach)
260
+
// this preserves relative spacing and handles edge cases (single result, similar scores)
261
+
// reference: https://opensourceconnections.com/blog/2023/02/27/hybrid-vigor-winning-at-hybrid-search/
262
+
let bm25_scores_vec: Vec<f32> = bm25_results.iter().map(|r| r.dist).collect();
263
+
let max_bm25 = bm25_scores_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max).max(0.001); // avoid division by zero
215
264
216
-
let max_bm25 = bm25_raw
217
-
.iter()
218
-
.map(|(_, s)| *s)
219
-
.fold(f32::NEG_INFINITY, f32::max);
265
+
let mut keyword_scores: HashMap<String, f32> = HashMap::new();
266
+
for row in &bm25_results {
267
+
// divide by max to ensure top result gets 1.0, others scale proportionally
268
+
let normalized_score = (row.dist / max_bm25).min(1.0);
269
+
keyword_scores.insert(row.id.clone(), normalized_score);
270
+
}
220
271
221
272
logfire::info!(
222
273
"bm25 search completed",
223
-
query = &query_owned,
274
+
query = &query_text,
224
275
results_found = bm25_results.len() as i64,
225
276
max_bm25 = max_bm25 as f64,
226
-
top_bm25_raw = bm25_raw.first().map(|(_, s)| *s).unwrap_or(0.0) as f64
227
-
);
228
-
229
-
// fuse scores
230
-
let fused = fuse_scores(&semantic_scores, &keyword_scores, fusion_config);
231
-
232
-
logfire::info!(
233
-
"weighted fusion completed",
234
-
total_candidates = (vector_results.len() + bm25_results.len()) as i64,
235
-
alpha = fusion_config.alpha as f64,
236
-
pre_filter_results = fused.len() as i64
277
+
top_bm25_raw = bm25_scores_vec.first().copied().unwrap_or(0.0) as f64,
278
+
top_bm25_normalized = keyword_scores.values().cloned().fold(f32::NEG_INFINITY, f32::max) as f64
237
279
);
238
280
239
-
// collect attributes from both result sets
240
-
let mut all_attributes: HashMap<String, HashMap<String, String>> = HashMap::new();
241
-
for result in vector_results.into_iter().chain(bm25_results.into_iter()) {
242
-
all_attributes
243
-
.entry(result.id.clone())
244
-
.or_insert(result.attributes);
281
+
// collect all unique results and compute weighted fusion scores
282
+
let mut all_results: HashMap<String, crate::turbopuffer::QueryRow> = HashMap::new();
283
+
for row in vector_results.into_iter().chain(bm25_results.into_iter()) {
284
+
all_results.entry(row.id.clone()).or_insert(row);
245
285
}
246
286
247
-
// return fused results with attributes
248
-
Ok(fused
249
-
.into_iter()
250
-
.map(|(id, score)| {
251
-
let attrs = all_attributes.remove(&id).unwrap_or_default();
252
-
(id, score, attrs)
287
+
let mut fused_scores: Vec<(String, f32)> = all_results
288
+
.keys()
289
+
.map(|id| {
290
+
let semantic = semantic_scores.get(id).copied().unwrap_or(0.0);
291
+
let keyword = keyword_scores.get(id).copied().unwrap_or(0.0);
292
+
let fused = alpha * semantic + (1.0 - alpha) * keyword;
293
+
(id.clone(), fused)
253
294
})
254
-
.collect())
255
-
}
295
+
.collect();
256
296
257
-
/// shared search implementation used by both POST and GET handlers
258
-
async fn perform_search(
259
-
query_text: String,
260
-
top_k_val: usize,
261
-
alpha: f32,
262
-
family_friendly: bool,
263
-
exclude: Option<String>,
264
-
include: Option<String>,
265
-
config: &Config,
266
-
) -> ActixResult<SearchResponse> {
267
-
let content_filter = ContentFilter::new(
268
-
family_friendly,
269
-
exclude.as_deref(),
270
-
include.as_deref(),
271
-
);
297
+
// filter out zero-scored results (irrelevant matches from the other search method)
298
+
// this prevents vector-only results from appearing when alpha=0.0 (pure keyword)
299
+
// and keyword-only results from appearing when alpha=1.0 (pure semantic)
300
+
fused_scores.retain(|(_, score)| *score > 0.001);
272
301
273
-
let _search_span = logfire::span!(
274
-
"bufo_search",
275
-
query = &query_text,
276
-
top_k = top_k_val as i64,
277
-
alpha = alpha as f64,
278
-
family_friendly = family_friendly,
279
-
exclude_patterns_count = content_filter.exclude_pattern_count() as i64
280
-
)
281
-
.entered();
302
+
// sort by fused score (descending) and take top_k
303
+
fused_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
304
+
fused_scores.truncate(top_k_val);
282
305
283
306
logfire::info!(
284
-
"search request received",
285
-
query = &query_text,
286
-
top_k = top_k_val as i64,
307
+
"weighted fusion completed",
308
+
total_candidates = all_results.len() as i64,
287
309
alpha = alpha as f64,
288
-
exclude_patterns = &content_filter.exclude_patterns_str()
310
+
final_results = fused_scores.len() as i64
289
311
);
290
312
291
-
// create clients
292
-
let embedder = VoyageEmbedder::new(config.voyage_api_key.clone());
293
-
let vector_store = TurbopufferStore::new(
294
-
config.turbopuffer_api_key.clone(),
295
-
config.turbopuffer_namespace.clone(),
296
-
);
313
+
// convert to bufo results
314
+
let inappropriate_bufos = get_inappropriate_bufos();
315
+
let results: Vec<BufoResult> = fused_scores
316
+
.into_iter()
317
+
.filter_map(|(id, score)| {
318
+
all_results.get(&id).map(|row| {
319
+
let url = row
320
+
.attributes
321
+
.get("url")
322
+
.and_then(|v| v.as_str())
323
+
.unwrap_or("")
324
+
.to_string();
297
325
298
-
let fusion_config = FusionConfig::new(alpha);
326
+
let name = row
327
+
.attributes
328
+
.get("name")
329
+
.and_then(|v| v.as_str())
330
+
.unwrap_or(&row.id)
331
+
.to_string();
299
332
300
-
// execute hybrid search
301
-
let fused_results = execute_hybrid_search(
302
-
&query_text,
303
-
top_k_val,
304
-
&fusion_config,
305
-
&embedder,
306
-
&vector_store,
307
-
)
308
-
.await
309
-
.map_err(|e| e.into_actix_error())?;
310
-
311
-
// convert to BufoResults and apply filtering
312
-
let results: Vec<BufoResult> = fused_results
313
-
.into_iter()
314
-
.map(|(id, score, attrs)| BufoResult {
315
-
id: id.clone(),
316
-
url: attrs.get("url").cloned().unwrap_or_default(),
317
-
name: attrs.get("name").cloned().unwrap_or_else(|| id.clone()),
318
-
score,
333
+
BufoResult {
334
+
id: row.id.clone(),
335
+
url,
336
+
name,
337
+
score,
338
+
}
339
+
})
340
+
})
341
+
.filter(|result| {
342
+
// filter out inappropriate bufos if family_friendly mode is enabled
343
+
if family_friendly {
344
+
!inappropriate_bufos.iter().any(|&blocked| result.name.contains(blocked))
345
+
} else {
346
+
true
347
+
}
319
348
})
320
-
.filter(|result| content_filter.matches(result))
321
-
.take(top_k_val)
322
349
.collect();
323
350
324
351
let results_count = results.len() as i64;
325
-
let top_result_name = results
326
-
.first()
327
-
.map(|r| r.name.clone())
328
-
.unwrap_or_else(|| "none".to_string());
352
+
let top_result_name = results.first().map(|r| r.name.clone()).unwrap_or_else(|| "none".to_string());
329
353
let top_score_val = results.first().map(|r| r.score as f64).unwrap_or(0.0);
330
354
let avg_score_val = if !results.is_empty() {
331
355
results.iter().map(|r| r.score as f64).sum::<f64>() / results.len() as f64
···
355
379
query.top_k,
356
380
query.alpha,
357
381
query.family_friendly,
358
-
query.exclude.clone(),
359
-
query.include.clone(),
360
-
&config,
361
-
)
362
-
.await?;
382
+
&config
383
+
).await?;
363
384
Ok(HttpResponse::Ok().json(response))
364
385
}
365
386
···
369
390
config: web::Data<Config>,
370
391
req: HttpRequest,
371
392
) -> ActixResult<HttpResponse> {
372
-
let etag = generate_etag(
373
-
&query.query,
374
-
query.top_k,
375
-
query.alpha,
376
-
query.family_friendly,
377
-
&query.exclude,
378
-
&query.include,
379
-
);
393
+
// generate etag for caching
394
+
let etag = generate_etag(&query.query, query.top_k, query.alpha, query.family_friendly);
380
395
396
+
// check if client has cached version
381
397
if let Some(if_none_match) = req.headers().get("if-none-match") {
382
398
if if_none_match.to_str().unwrap_or("") == etag {
383
399
return Ok(HttpResponse::NotModified()
···
391
407
query.top_k,
392
408
query.alpha,
393
409
query.family_friendly,
394
-
query.exclude.clone(),
395
-
query.include.clone(),
396
-
&config,
397
-
)
398
-
.await?;
410
+
&config
411
+
).await?;
399
412
400
413
Ok(HttpResponse::Ok()
401
414
.insert_header(("etag", etag.clone()))
402
-
.insert_header(("cache-control", "public, max-age=300"))
415
+
.insert_header(("cache-control", "public, max-age=300")) // cache for 5 minutes
403
416
.json(response))
404
417
}
+92
-100
src/turbopuffer.rs
+92
-100
src/turbopuffer.rs
···
1
-
//! turbopuffer vector database implementation
2
-
//!
3
-
//! implements the `VectorStore` trait for turbopuffer's hybrid search API.
4
-
5
-
use crate::providers::{SearchResult, VectorSearchError, VectorStore};
1
+
use anyhow::{Context, Result};
6
2
use reqwest::Client;
7
3
use serde::{Deserialize, Serialize};
8
-
9
-
const TURBOPUFFER_API_BASE: &str = "https://api.turbopuffer.com/v1/vectors";
10
-
11
-
/// raw response row from turbopuffer API
12
-
#[derive(Debug, Deserialize, Serialize, Clone)]
13
-
pub struct QueryRow {
14
-
pub id: String,
15
-
pub dist: f32,
16
-
pub attributes: serde_json::Map<String, serde_json::Value>,
17
-
}
18
-
19
-
impl From<QueryRow> for SearchResult {
20
-
fn from(row: QueryRow) -> Self {
21
-
let attributes = row
22
-
.attributes
23
-
.iter()
24
-
.filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string())))
25
-
.collect();
4
+
use thiserror::Error;
26
5
27
-
SearchResult {
28
-
id: row.id,
29
-
score: row.dist,
30
-
attributes,
31
-
}
32
-
}
6
+
#[derive(Debug, Error)]
7
+
pub enum TurbopufferError {
8
+
#[error("query too long: {message}")]
9
+
QueryTooLong { message: String },
10
+
#[error("turbopuffer API error: {0}")]
11
+
ApiError(String),
12
+
#[error("request failed: {0}")]
13
+
RequestFailed(#[from] reqwest::Error),
14
+
#[error("{0}")]
15
+
Other(#[from] anyhow::Error),
33
16
}
34
17
35
18
#[derive(Debug, Deserialize)]
36
-
struct ErrorResponse {
19
+
struct TurbopufferErrorResponse {
37
20
error: String,
38
21
#[allow(dead_code)]
39
22
status: String,
40
23
}
41
24
42
-
/// turbopuffer vector database client
43
-
///
44
-
/// supports both ANN vector search and BM25 full-text search.
45
-
#[derive(Clone)]
46
-
pub struct TurbopufferStore {
25
+
#[derive(Debug, Serialize)]
26
+
pub struct QueryRequest {
27
+
pub rank_by: Vec<serde_json::Value>,
28
+
pub top_k: usize,
29
+
#[serde(skip_serializing_if = "Option::is_none")]
30
+
pub include_attributes: Option<Vec<String>>,
31
+
}
32
+
33
+
pub type QueryResponse = Vec<QueryRow>;
34
+
35
+
#[derive(Debug, Deserialize, Serialize, Clone)]
36
+
pub struct QueryRow {
37
+
pub id: String,
38
+
pub dist: f32, // for vector: cosine distance; for BM25: BM25 score
39
+
pub attributes: serde_json::Map<String, serde_json::Value>,
40
+
}
41
+
42
+
pub struct TurbopufferClient {
47
43
client: Client,
48
44
api_key: String,
49
45
namespace: String,
50
46
}
51
47
52
-
impl TurbopufferStore {
48
+
impl TurbopufferClient {
53
49
pub fn new(api_key: String, namespace: String) -> Self {
54
50
Self {
55
51
client: Client::new(),
···
58
54
}
59
55
}
60
56
61
-
fn query_url(&self) -> String {
62
-
format!("{}/{}/query", TURBOPUFFER_API_BASE, self.namespace)
63
-
}
57
+
pub async fn query(&self, request: QueryRequest) -> Result<QueryResponse> {
58
+
let url = format!(
59
+
"https://api.turbopuffer.com/v1/vectors/{}/query",
60
+
self.namespace
61
+
);
64
62
65
-
async fn execute_query(
66
-
&self,
67
-
request: serde_json::Value,
68
-
) -> Result<Vec<QueryRow>, VectorSearchError> {
63
+
let request_json = serde_json::to_string_pretty(&request)?;
64
+
log::debug!("turbopuffer query request: {}", request_json);
65
+
69
66
let response = self
70
67
.client
71
-
.post(self.query_url())
68
+
.post(&url)
72
69
.header("Authorization", format!("Bearer {}", self.api_key))
73
70
.json(&request)
74
71
.send()
75
-
.await?;
72
+
.await
73
+
.context("failed to send query request")?;
76
74
77
75
if !response.status().is_success() {
78
-
let status = response.status().as_u16();
76
+
let status = response.status();
79
77
let body = response.text().await.unwrap_or_default();
80
-
81
-
// check for specific error types
82
-
if let Ok(error_resp) = serde_json::from_str::<ErrorResponse>(&body) {
83
-
if error_resp.error.contains("too long") && error_resp.error.contains("max 1024") {
84
-
return Err(VectorSearchError::QueryTooLong {
85
-
message: error_resp.error,
86
-
});
87
-
}
88
-
}
89
-
90
-
return Err(VectorSearchError::Api { status, body });
78
+
anyhow::bail!("turbopuffer query failed with status {}: {}", status, body);
91
79
}
92
80
93
-
let body = response.text().await.map_err(|e| {
94
-
VectorSearchError::Other(anyhow::anyhow!("failed to read response: {}", e))
95
-
})?;
81
+
let body = response.text().await.context("failed to read response body")?;
96
82
97
83
serde_json::from_str(&body)
98
-
.map_err(|e| VectorSearchError::Parse(format!("failed to parse response: {}", e)))
84
+
.context(format!("failed to parse query response: {}", body))
99
85
}
100
-
}
101
86
102
-
impl VectorStore for TurbopufferStore {
103
-
async fn search_by_vector(
104
-
&self,
105
-
embedding: &[f32],
106
-
top_k: usize,
107
-
) -> Result<Vec<SearchResult>, VectorSearchError> {
87
+
pub async fn bm25_query(&self, query_text: &str, top_k: usize) -> Result<QueryResponse, TurbopufferError> {
88
+
let url = format!(
89
+
"https://api.turbopuffer.com/v1/vectors/{}/query",
90
+
self.namespace
91
+
);
92
+
108
93
let request = serde_json::json!({
109
-
"rank_by": ["vector", "ANN", embedding],
94
+
"rank_by": ["name", "BM25", query_text],
110
95
"top_k": top_k,
111
96
"include_attributes": ["url", "name", "filename"],
112
97
});
113
98
114
-
log::debug!(
115
-
"turbopuffer vector query: {}",
116
-
serde_json::to_string_pretty(&request).unwrap_or_default()
117
-
);
99
+
if let Ok(pretty) = serde_json::to_string_pretty(&request) {
100
+
log::debug!("turbopuffer BM25 query request: {}", pretty);
101
+
}
118
102
119
-
let rows = self.execute_query(request).await?;
120
-
Ok(rows.into_iter().map(SearchResult::from).collect())
121
-
}
103
+
let response = self
104
+
.client
105
+
.post(&url)
106
+
.header("Authorization", format!("Bearer {}", self.api_key))
107
+
.json(&request)
108
+
.send()
109
+
.await?;
122
110
123
-
async fn search_by_keyword(
124
-
&self,
125
-
query: &str,
126
-
top_k: usize,
127
-
) -> Result<Vec<SearchResult>, VectorSearchError> {
128
-
let request = serde_json::json!({
129
-
"rank_by": ["name", "BM25", query],
130
-
"top_k": top_k,
131
-
"include_attributes": ["url", "name", "filename"],
132
-
});
111
+
if !response.status().is_success() {
112
+
let status = response.status();
113
+
let body = response.text().await.unwrap_or_default();
133
114
134
-
log::debug!(
135
-
"turbopuffer BM25 query: {}",
136
-
serde_json::to_string_pretty(&request).unwrap_or_default()
137
-
);
115
+
// try to parse turbopuffer error response
116
+
if let Ok(error_resp) = serde_json::from_str::<TurbopufferErrorResponse>(&body) {
117
+
// check if it's a query length error
118
+
if error_resp.error.contains("too long") && error_resp.error.contains("max 1024") {
119
+
return Err(TurbopufferError::QueryTooLong {
120
+
message: error_resp.error,
121
+
});
122
+
}
123
+
}
138
124
139
-
let rows = self.execute_query(request).await?;
125
+
return Err(TurbopufferError::ApiError(format!(
126
+
"turbopuffer BM25 query failed with status {}: {}",
127
+
status, body
128
+
)));
129
+
}
140
130
141
-
if let Some(first) = rows.first() {
142
-
log::info!(
143
-
"BM25 first result - id: {}, dist: {}, name: {:?}",
131
+
let body = response.text().await
132
+
.map_err(|e| TurbopufferError::Other(anyhow::anyhow!("failed to read response body: {}", e)))?;
133
+
log::debug!("turbopuffer BM25 response: {}", body);
134
+
135
+
let parsed: QueryResponse = serde_json::from_str(&body)
136
+
.map_err(|e| TurbopufferError::Other(anyhow::anyhow!("failed to parse BM25 query response: {}", e)))?;
137
+
138
+
// DEBUG: log first result to see what BM25 returns
139
+
if let Some(first) = parsed.first() {
140
+
log::info!("BM25 first result - id: {}, dist: {}, name: {:?}",
144
141
first.id,
145
142
first.dist,
146
143
first.attributes.get("name")
147
144
);
148
145
}
149
146
150
-
Ok(rows.into_iter().map(SearchResult::from).collect())
151
-
}
152
-
153
-
fn name(&self) -> &'static str {
154
-
"turbopuffer"
147
+
Ok(parsed)
155
148
}
156
149
}
157
-
static/bufo-is-trapped-in-a-cameron-winter-phase.png
static/bufo-is-trapped-in-a-cameron-winter-phase.png
This is a binary file and will not be displayed.
+1
-46
static/index.html
+1
-46
static/index.html
···
5
5
<meta name="viewport" content="width=device-width, initial-scale=1.0">
6
6
<title>find bufo</title>
7
7
<link rel="icon" type="image/png" href="/static/favicon.png">
8
-
<link rel="apple-touch-icon" href="/static/favicon.png">
9
-
<link rel="manifest" href="/static/manifest.json">
10
-
<meta name="theme-color" content="#8ba888">
11
8
<style>
12
9
* {
13
10
margin: 0;
···
211
208
height: 18px;
212
209
cursor: pointer;
213
210
accent-color: #667eea;
214
-
}
215
-
216
-
.option-group a {
217
-
color: #667eea;
218
-
text-decoration: none;
219
-
}
220
-
221
-
.option-group a:hover {
222
-
text-decoration: underline;
223
211
}
224
212
225
213
.sample-queries-container {
···
553
541
id="searchInput"
554
542
placeholder="describe the bufo you seek..."
555
543
autocomplete="off"
556
-
autofocus
557
544
>
558
545
<button id="searchButton">search</button>
559
546
</div>
···
610
597
<span>enabled</span>
611
598
</label>
612
599
</div>
613
-
614
-
<div class="option-group">
615
-
<div class="option-label">
616
-
<span class="option-name">exclude patterns</span>
617
-
</div>
618
-
<div class="option-description">
619
-
comma-separated <a href="https://regex101.com/" target="_blank">regex</a> patterns to exclude (e.g., excited,party)
620
-
<br>
621
-
<span style="color: #999; font-size: 0.9em;">new to regex? <a href="https://claude.ai" target="_blank">claude</a> can write patterns for you</span>
622
-
</div>
623
-
<input
624
-
type="text"
625
-
id="excludeInput"
626
-
placeholder="pattern1,pattern2"
627
-
style="width: 100%; padding: 10px; font-size: 14px;"
628
-
>
629
-
</div>
630
600
</div>
631
601
</div>
632
602
···
659
629
const alphaSlider = document.getElementById('alphaSlider');
660
630
const alphaValue = document.getElementById('alphaValue');
661
631
const familyFriendlyCheckbox = document.getElementById('familyFriendlyCheckbox');
662
-
const excludeInput = document.getElementById('excludeInput');
663
632
664
633
let hasSearched = false;
665
634
···
683
652
684
653
const alpha = parseFloat(alphaSlider.value);
685
654
const familyFriendly = familyFriendlyCheckbox.checked;
686
-
const exclude = excludeInput.value.trim();
687
655
688
656
// hide bufo after first search
689
657
if (!hasSearched) {
···
698
666
params.set('top_k', '20');
699
667
params.set('alpha', alpha.toString());
700
668
params.set('family_friendly', familyFriendly.toString());
701
-
if (exclude) params.set('exclude', exclude);
702
669
const newUrl = `${window.location.pathname}?${params.toString()}`;
703
-
window.history.pushState({ query, alpha, familyFriendly, exclude }, '', newUrl);
670
+
window.history.pushState({ query, alpha, familyFriendly }, '', newUrl);
704
671
}
705
672
706
673
searchButton.disabled = true;
···
715
682
params.set('top_k', '20');
716
683
params.set('alpha', alpha.toString());
717
684
params.set('family_friendly', familyFriendly.toString());
718
-
if (exclude) params.set('exclude', exclude);
719
685
720
686
const response = await fetch(`/api/search?${params.toString()}`, {
721
687
method: 'GET',
···
792
758
if (e.state.familyFriendly !== undefined) {
793
759
familyFriendlyCheckbox.checked = e.state.familyFriendly;
794
760
}
795
-
if (e.state.exclude !== undefined) {
796
-
excludeInput.value = e.state.exclude;
797
-
}
798
761
search(false);
799
762
}
800
763
});
···
805
768
const query = params.get('q');
806
769
const alpha = params.get('alpha');
807
770
const familyFriendly = params.get('family_friendly');
808
-
const exclude = params.get('exclude');
809
771
810
772
if (alpha) {
811
773
alphaSlider.value = alpha;
···
816
778
familyFriendlyCheckbox.checked = familyFriendly === 'true';
817
779
}
818
780
819
-
if (exclude) {
820
-
excludeInput.value = exclude;
821
-
}
822
-
823
781
if (query) {
824
782
searchInput.value = query;
825
783
search(false); // don't update URL since we're already loading from it
826
784
}
827
-
828
-
// ensure focus on the search input
829
-
searchInput.focus();
830
785
});
831
786
832
787
// handle sample query button clicks
-17
static/manifest.json
-17
static/manifest.json
···
1
-
{
2
-
"name": "find bufo",
3
-
"short_name": "find bufo",
4
-
"description": "hybrid search for bufo.zone",
5
-
"start_url": "/",
6
-
"display": "standalone",
7
-
"background_color": "#8ba888",
8
-
"theme_color": "#8ba888",
9
-
"icons": [
10
-
{
11
-
"src": "/static/favicon.png",
12
-
"sizes": "112x112",
13
-
"type": "image/png",
14
-
"purpose": "any maskable"
15
-
}
16
-
]
17
-
}