+4
.gitignore
+4
.gitignore
+1
Cargo.lock
+1
Cargo.lock
+1
Cargo.toml
+1
Cargo.toml
···
24
24
opentelemetry = { version = "0.26", features = ["trace", "metrics"] }
25
25
opentelemetry-instrumentation-actix-web = { version = "0.23", features = ["metrics"] }
26
26
opentelemetry-otlp = { version = "0.26", features = ["trace", "http-proto", "reqwest-client", "reqwest-rustls"] }
27
+
regex = "1.12"
+1
-1
Dockerfile
+1
-1
Dockerfile
+40
-10
README.md
+40
-10
README.md
···
1
1
# find-bufo
2
2
3
-
semantic search for the bufo zone
3
+
hybrid semantic + keyword search for the bufo zone
4
4
5
5
**live at: [find-bufo.fly.dev](https://find-bufo.fly.dev/)**
6
6
7
7
## overview
8
8
9
-
a one-page application for searching through all the bufos from [bufo.zone](https://bufo.zone/) using multi-modal embeddings and vector search.
9
+
a one-page application for searching through all the bufos from [bufo.zone](https://bufo.zone/) using hybrid search that combines:
10
+
- **semantic search** via multimodal embeddings (understands meaning and visual content)
11
+
- **keyword search** via BM25 full-text search (finds exact filename matches)
10
12
11
13
## architecture
12
14
···
36
38
to populate the vector store with bufos:
37
39
38
40
```bash
39
-
uvx scripts/ingest_bufos.py
41
+
just re-index
40
42
```
41
43
42
44
this will:
43
45
1. scrape all bufos from bufo.zone
44
46
2. download them to `data/bufos/`
45
-
3. generate embeddings for each image
47
+
3. generate embeddings for each image with `input_type="document"`
46
48
4. upload to turbopuffer
47
49
48
50
## development
···
63
65
fly launch # first time
64
66
fly secrets set VOYAGE_API_TOKEN=your_token
65
67
fly secrets set TURBOPUFFER_API_KEY=your_key
66
-
fly deploy
68
+
just deploy
67
69
```
68
70
69
71
## usage
70
72
71
73
1. open the app
72
74
2. enter a search query describing the bufo you want
73
-
3. see the top matching bufos with similarity scores
75
+
3. see the top matching bufos with hybrid similarity scores
74
76
4. click any bufo to open it in a new tab
75
77
78
+
### api parameters
79
+
80
+
the search API supports these parameters:
81
+
- `query`: search text (required)
82
+
- `top_k`: number of results (default: 10)
83
+
- `alpha`: fusion weight (default: 0.7)
84
+
- `1.0` = pure semantic (best for conceptual queries like "happy", "apocalyptic")
85
+
- `0.7` = default (balances semantic understanding with exact matches)
86
+
- `0.5` = balanced (equal weight to both signals)
87
+
- `0.0` = pure keyword (best for exact filename searches)
88
+
89
+
example: `/api/search?query=jumping&top_k=5&alpha=0.5`
90
+
76
91
## how it works
77
92
78
-
1. **ingestion**: all bufo images are embedded using voyage ai's multimodal model
79
-
2. **search**: user queries are embedded with the same model
80
-
3. **retrieval**: turbopuffer finds the most similar bufos using cosine distance
81
-
4. **display**: results are shown with similarity scores
93
+
### ingestion
94
+
all bufo images are processed through early fusion multimodal embeddings:
95
+
1. filename text extracted (e.g., "bufo-jumping-on-bed" โ "bufo jumping on bed")
96
+
2. combined with image content in single embedding request
97
+
3. voyage-multimodal-3 creates 1024-dim vectors capturing both text and visual features
98
+
4. uploaded to turbopuffer with BM25-enabled `name` field for keyword search
99
+
100
+
### search
101
+
1. **semantic branch**: query embedded using voyage-multimodal-3 with `input_type="query"`
102
+
2. **keyword branch**: BM25 full-text search against bufo names
103
+
3. **fusion**: weighted combination using `alpha` parameter
104
+
- `score = ฮฑ * semantic + (1-ฮฑ) * keyword`
105
+
- both scores normalized to 0-1 range before fusion
106
+
4. **ranking**: results sorted by fused score, top_k returned
107
+
108
+
### why hybrid?
109
+
- semantic alone: misses exact filename matches (e.g., "happy" might not find "bufo-is-happy")
110
+
- keyword alone: no semantic understanding (e.g., "happy" won't find "excited" or "smiling")
111
+
- hybrid: gets the best of both worlds
+32
bot/Dockerfile
+32
bot/Dockerfile
···
1
+
# build stage
2
+
FROM debian:bookworm-slim AS builder
3
+
4
+
RUN apt-get update && apt-get install -y --no-install-recommends \
5
+
ca-certificates \
6
+
curl \
7
+
xz-utils \
8
+
&& rm -rf /var/lib/apt/lists/*
9
+
10
+
# install zig 0.15.2
11
+
RUN curl -L https://ziglang.org/download/0.15.2/zig-x86_64-linux-0.15.2.tar.xz | tar -xJ -C /usr/local \
12
+
&& ln -s /usr/local/zig-x86_64-linux-0.15.2/zig /usr/local/bin/zig
13
+
14
+
WORKDIR /app
15
+
COPY build.zig build.zig.zon ./
16
+
COPY src ./src
17
+
18
+
RUN zig build -Doptimize=ReleaseSafe
19
+
20
+
# runtime stage
21
+
FROM debian:bookworm-slim
22
+
23
+
RUN apt-get update && apt-get install -y --no-install-recommends \
24
+
ca-certificates \
25
+
&& rm -rf /var/lib/apt/lists/* \
26
+
# prefer IPv4 over IPv6 for outbound connections (IPv6 times out in Fly.io)
27
+
&& echo 'precedence ::ffff:0:0/96 100' >> /etc/gai.conf
28
+
29
+
WORKDIR /app
30
+
COPY --from=builder /app/zig-out/bin/bufo-bot .
31
+
32
+
CMD ["./bufo-bot"]
+54
bot/README.md
+54
bot/README.md
···
1
+
# bufo-bot (zig)
2
+
3
+
bluesky bot that listens to the jetstream firehose and quote-posts matching bufo images.
4
+
5
+
## how it works
6
+
7
+
1. connects to bluesky jetstream (firehose)
8
+
2. for each post, checks if text contains an exact phrase matching a bufo name
9
+
3. if matched, quote-posts with the corresponding bufo image (or posts without quote based on quote_chance)
10
+
11
+
## matching logic
12
+
13
+
- extracts phrase from bufo filename (e.g., `bufo-hop-in-we-re-going` -> `hop in we re going`)
14
+
- requires exact consecutive word match in post text
15
+
- configurable minimum phrase length (default: 4 words)
16
+
17
+
## configuration
18
+
19
+
| env var | default | description |
20
+
|---------|---------|-------------|
21
+
| `BSKY_HANDLE` | required | bluesky handle (e.g., `find-bufo.com`) |
22
+
| `BSKY_APP_PASSWORD` | required | app password from bsky settings |
23
+
| `MIN_PHRASE_WORDS` | `4` | minimum words in phrase to match |
24
+
| `POSTING_ENABLED` | `false` | must be `true` to actually post |
25
+
| `COOLDOWN_MINUTES` | `120` | don't repost same bufo within this time |
26
+
| `QUOTE_CHANCE` | `0.5` | probability of quoting vs just posting with rkey |
27
+
| `EXCLUDE_PATTERNS` | `...` | exclude bufos matching these patterns |
28
+
| `JETSTREAM_ENDPOINT` | `jetstream2.us-east.bsky.network` | jetstream server |
29
+
30
+
## local dev
31
+
32
+
```bash
33
+
# build
34
+
zig build
35
+
36
+
# run locally (dry run by default)
37
+
./zig-out/bin/bufo-bot
38
+
```
39
+
40
+
## deploy
41
+
42
+
```bash
43
+
# set secrets (once)
44
+
fly secrets set BSKY_HANDLE=find-bufo.com BSKY_APP_PASSWORD=xxxx -a bufo-bot
45
+
46
+
# deploy
47
+
fly deploy
48
+
49
+
# enable posting
50
+
fly secrets set POSTING_ENABLED=true -a bufo-bot
51
+
52
+
# check logs
53
+
fly logs -a bufo-bot
54
+
```
+34
bot/build.zig
+34
bot/build.zig
···
1
+
const std = @import("std");
2
+
3
+
pub fn build(b: *std.Build) void {
4
+
const target = b.standardTargetOptions(.{});
5
+
const optimize = b.standardOptimizeOption(.{});
6
+
7
+
const websocket = b.dependency("websocket", .{
8
+
.target = target,
9
+
.optimize = optimize,
10
+
});
11
+
12
+
const exe = b.addExecutable(.{
13
+
.name = "bufo-bot",
14
+
.root_module = b.createModule(.{
15
+
.root_source_file = b.path("src/main.zig"),
16
+
.target = target,
17
+
.optimize = optimize,
18
+
.imports = &.{
19
+
.{ .name = "websocket", .module = websocket.module("websocket") },
20
+
},
21
+
}),
22
+
});
23
+
24
+
b.installArtifact(exe);
25
+
26
+
const run_cmd = b.addRunArtifact(exe);
27
+
run_cmd.step.dependOn(b.getInstallStep());
28
+
if (b.args) |args| {
29
+
run_cmd.addArgs(args);
30
+
}
31
+
32
+
const run_step = b.step("run", "Run the bot");
33
+
run_step.dependOn(&run_cmd.step);
34
+
}
+17
bot/build.zig.zon
+17
bot/build.zig.zon
···
1
+
.{
2
+
.name = .bufo_bot,
3
+
.version = "0.0.1",
4
+
.fingerprint = 0xe143490f82fa96db,
5
+
.minimum_zig_version = "0.15.0",
6
+
.dependencies = .{
7
+
.websocket = .{
8
+
.url = "https://github.com/karlseguin/websocket.zig/archive/refs/heads/master.tar.gz",
9
+
.hash = "websocket-0.1.0-ZPISdRNzAwAGszh62EpRtoQxu8wb1MSMVI6Ow0o2dmyJ",
10
+
},
11
+
},
12
+
.paths = .{
13
+
"build.zig",
14
+
"build.zig.zon",
15
+
"src",
16
+
},
17
+
}
+22
bot/fly.toml
+22
bot/fly.toml
···
1
+
app = "bufo-bot"
2
+
primary_region = "ewr"
3
+
4
+
[build]
5
+
dockerfile = "Dockerfile"
6
+
7
+
[env]
8
+
JETSTREAM_ENDPOINT = "jetstream2.us-east.bsky.network"
9
+
10
+
# worker process - no http service
11
+
[processes]
12
+
worker = "./bufo-bot"
13
+
14
+
[[vm]]
15
+
memory = "256mb"
16
+
cpu_kind = "shared"
17
+
cpus = 1
18
+
19
+
# secrets to set via: fly secrets set KEY=value -a bufo-bot
20
+
# - BSKY_HANDLE (e.g., find-bufo.com)
21
+
# - BSKY_APP_PASSWORD (app password from bsky settings)
22
+
# - POSTING_ENABLED=true (to enable posting, default is false)
+25
bot/justfile
+25
bot/justfile
···
1
+
# bot/justfile
2
+
set shell := ["bash", "-eu", "-o", "pipefail", "-c"]
3
+
4
+
default:
5
+
@just --list
6
+
7
+
# build the bot
8
+
build:
9
+
zig build
10
+
11
+
# run the bot locally
12
+
run:
13
+
zig build run
14
+
15
+
# deploy to fly.io
16
+
deploy:
17
+
fly deploy --wait-timeout 180
18
+
19
+
# check logs
20
+
logs:
21
+
fly logs -a bufo-bot
22
+
23
+
# set secrets (run once)
24
+
secrets HANDLE PASSWORD:
25
+
fly secrets set BSKY_HANDLE={{ HANDLE }} BSKY_APP_PASSWORD={{ PASSWORD }} -a bufo-bot
+257
bot/src/bsky.zig
+257
bot/src/bsky.zig
···
1
+
const std = @import("std");
2
+
const mem = std.mem;
3
+
const json = std.json;
4
+
const http = std.http;
5
+
const Allocator = mem.Allocator;
6
+
const Io = std.Io;
7
+
8
+
pub const BskyClient = struct {
9
+
allocator: Allocator,
10
+
handle: []const u8,
11
+
app_password: []const u8,
12
+
access_jwt: ?[]const u8 = null,
13
+
did: ?[]const u8 = null,
14
+
client: http.Client,
15
+
16
+
pub fn init(allocator: Allocator, handle: []const u8, app_password: []const u8) BskyClient {
17
+
return .{
18
+
.allocator = allocator,
19
+
.handle = handle,
20
+
.app_password = app_password,
21
+
.client = .{ .allocator = allocator },
22
+
};
23
+
}
24
+
25
+
pub fn deinit(self: *BskyClient) void {
26
+
if (self.access_jwt) |jwt| self.allocator.free(jwt);
27
+
if (self.did) |did| self.allocator.free(did);
28
+
self.client.deinit();
29
+
}
30
+
31
+
pub fn login(self: *BskyClient) !void {
32
+
std.debug.print("logging in as {s}...\n", .{self.handle});
33
+
34
+
var body_buf: std.ArrayList(u8) = .{};
35
+
defer body_buf.deinit(self.allocator);
36
+
try body_buf.print(self.allocator, "{{\"identifier\":\"{s}\",\"password\":\"{s}\"}}", .{ self.handle, self.app_password });
37
+
38
+
var aw: Io.Writer.Allocating = .init(self.allocator);
39
+
defer aw.deinit();
40
+
41
+
const result = self.client.fetch(.{
42
+
.location = .{ .url = "https://bsky.social/xrpc/com.atproto.server.createSession" },
43
+
.method = .POST,
44
+
.headers = .{ .content_type = .{ .override = "application/json" } },
45
+
.payload = body_buf.items,
46
+
.response_writer = &aw.writer,
47
+
}) catch |err| {
48
+
std.debug.print("login request failed: {}\n", .{err});
49
+
return err;
50
+
};
51
+
52
+
if (result.status != .ok) {
53
+
std.debug.print("login failed with status: {}\n", .{result.status});
54
+
return error.LoginFailed;
55
+
}
56
+
57
+
const response = aw.toArrayList();
58
+
const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError;
59
+
defer parsed.deinit();
60
+
61
+
const root = parsed.value.object;
62
+
63
+
const jwt_val = root.get("accessJwt") orelse return error.NoJwt;
64
+
if (jwt_val != .string) return error.NoJwt;
65
+
66
+
const did_val = root.get("did") orelse return error.NoDid;
67
+
if (did_val != .string) return error.NoDid;
68
+
69
+
self.access_jwt = try self.allocator.dupe(u8, jwt_val.string);
70
+
self.did = try self.allocator.dupe(u8, did_val.string);
71
+
72
+
std.debug.print("logged in as {s} (did: {s})\n", .{ self.handle, self.did.? });
73
+
}
74
+
75
+
pub fn uploadBlob(self: *BskyClient, data: []const u8, content_type: []const u8) ![]const u8 {
76
+
if (self.access_jwt == null) return error.NotLoggedIn;
77
+
78
+
var auth_buf: [512]u8 = undefined;
79
+
const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong;
80
+
81
+
var aw: Io.Writer.Allocating = .init(self.allocator);
82
+
defer aw.deinit();
83
+
84
+
const result = self.client.fetch(.{
85
+
.location = .{ .url = "https://bsky.social/xrpc/com.atproto.repo.uploadBlob" },
86
+
.method = .POST,
87
+
.headers = .{
88
+
.content_type = .{ .override = content_type },
89
+
.authorization = .{ .override = auth_header },
90
+
},
91
+
.payload = data,
92
+
.response_writer = &aw.writer,
93
+
}) catch |err| {
94
+
std.debug.print("upload blob failed: {}\n", .{err});
95
+
return err;
96
+
};
97
+
98
+
if (result.status != .ok) {
99
+
std.debug.print("upload blob failed with status: {}\n", .{result.status});
100
+
return error.UploadFailed;
101
+
}
102
+
103
+
const response = aw.toArrayList();
104
+
const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError;
105
+
defer parsed.deinit();
106
+
107
+
const root = parsed.value.object;
108
+
const blob = root.get("blob") orelse return error.NoBlobRef;
109
+
if (blob != .object) return error.NoBlobRef;
110
+
111
+
return json.Stringify.valueAlloc(self.allocator, blob, .{}) catch return error.SerializeError;
112
+
}
113
+
114
+
pub fn createQuotePost(self: *BskyClient, quote_uri: []const u8, quote_cid: []const u8, blob_json: []const u8, alt_text: []const u8) !void {
115
+
if (self.access_jwt == null or self.did == null) return error.NotLoggedIn;
116
+
117
+
var body_buf: std.ArrayList(u8) = .{};
118
+
defer body_buf.deinit(self.allocator);
119
+
120
+
try body_buf.print(self.allocator,
121
+
\\{{"repo":"{s}","collection":"app.bsky.feed.post","record":{{"$type":"app.bsky.feed.post","text":"","createdAt":"{s}","embed":{{"$type":"app.bsky.embed.recordWithMedia","record":{{"$type":"app.bsky.embed.record","record":{{"uri":"{s}","cid":"{s}"}}}},"media":{{"$type":"app.bsky.embed.images","images":[{{"image":{s},"alt":"{s}"}}]}}}}}}}}
122
+
, .{ self.did.?, getIsoTimestamp(), quote_uri, quote_cid, blob_json, alt_text });
123
+
124
+
var auth_buf: [512]u8 = undefined;
125
+
const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong;
126
+
127
+
var aw: Io.Writer.Allocating = .init(self.allocator);
128
+
defer aw.deinit();
129
+
130
+
const result = self.client.fetch(.{
131
+
.location = .{ .url = "https://bsky.social/xrpc/com.atproto.repo.createRecord" },
132
+
.method = .POST,
133
+
.headers = .{
134
+
.content_type = .{ .override = "application/json" },
135
+
.authorization = .{ .override = auth_header },
136
+
},
137
+
.payload = body_buf.items,
138
+
.response_writer = &aw.writer,
139
+
}) catch |err| {
140
+
std.debug.print("create post failed: {}\n", .{err});
141
+
return err;
142
+
};
143
+
144
+
if (result.status != .ok) {
145
+
const response = aw.toArrayList();
146
+
std.debug.print("create post failed with status: {} - {s}\n", .{ result.status, response.items });
147
+
return error.PostFailed;
148
+
}
149
+
150
+
std.debug.print("posted successfully!\n", .{});
151
+
}
152
+
153
+
pub fn createSimplePost(self: *BskyClient, text: []const u8, blob_json: []const u8, alt_text: []const u8) !void {
154
+
if (self.access_jwt == null or self.did == null) return error.NotLoggedIn;
155
+
156
+
var body_buf: std.ArrayList(u8) = .{};
157
+
defer body_buf.deinit(self.allocator);
158
+
159
+
try body_buf.print(self.allocator,
160
+
\\{{"repo":"{s}","collection":"app.bsky.feed.post","record":{{"$type":"app.bsky.feed.post","text":"{s}","createdAt":"{s}","embed":{{"$type":"app.bsky.embed.images","images":[{{"image":{s},"alt":"{s}"}}]}}}}}}
161
+
, .{ self.did.?, text, getIsoTimestamp(), blob_json, alt_text });
162
+
163
+
var auth_buf: [512]u8 = undefined;
164
+
const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong;
165
+
166
+
var aw: Io.Writer.Allocating = .init(self.allocator);
167
+
defer aw.deinit();
168
+
169
+
const result = self.client.fetch(.{
170
+
.location = .{ .url = "https://bsky.social/xrpc/com.atproto.repo.createRecord" },
171
+
.method = .POST,
172
+
.headers = .{
173
+
.content_type = .{ .override = "application/json" },
174
+
.authorization = .{ .override = auth_header },
175
+
},
176
+
.payload = body_buf.items,
177
+
.response_writer = &aw.writer,
178
+
}) catch |err| {
179
+
std.debug.print("create post failed: {}\n", .{err});
180
+
return err;
181
+
};
182
+
183
+
if (result.status != .ok) {
184
+
const response = aw.toArrayList();
185
+
std.debug.print("create post failed with status: {} - {s}\n", .{ result.status, response.items });
186
+
return error.PostFailed;
187
+
}
188
+
189
+
std.debug.print("posted successfully!\n", .{});
190
+
}
191
+
192
+
pub fn getPostCid(self: *BskyClient, uri: []const u8) ![]const u8 {
193
+
if (self.access_jwt == null) return error.NotLoggedIn;
194
+
195
+
var parts = mem.splitScalar(u8, uri[5..], '/');
196
+
const did = parts.next() orelse return error.InvalidUri;
197
+
_ = parts.next();
198
+
const rkey = parts.next() orelse return error.InvalidUri;
199
+
200
+
var url_buf: [512]u8 = undefined;
201
+
const url = std.fmt.bufPrint(&url_buf, "https://bsky.social/xrpc/com.atproto.repo.getRecord?repo={s}&collection=app.bsky.feed.post&rkey={s}", .{ did, rkey }) catch return error.UrlTooLong;
202
+
203
+
var auth_buf: [512]u8 = undefined;
204
+
const auth_header = std.fmt.bufPrint(&auth_buf, "Bearer {s}", .{self.access_jwt.?}) catch return error.AuthTooLong;
205
+
206
+
var aw: Io.Writer.Allocating = .init(self.allocator);
207
+
defer aw.deinit();
208
+
209
+
const result = self.client.fetch(.{
210
+
.location = .{ .url = url },
211
+
.method = .GET,
212
+
.headers = .{ .authorization = .{ .override = auth_header } },
213
+
.response_writer = &aw.writer,
214
+
}) catch |err| {
215
+
std.debug.print("get record failed: {}\n", .{err});
216
+
return err;
217
+
};
218
+
219
+
if (result.status != .ok) {
220
+
return error.GetRecordFailed;
221
+
}
222
+
223
+
const response = aw.toArrayList();
224
+
const parsed = json.parseFromSlice(json.Value, self.allocator, response.items, .{}) catch return error.ParseError;
225
+
defer parsed.deinit();
226
+
227
+
const cid_val = parsed.value.object.get("cid") orelse return error.NoCid;
228
+
if (cid_val != .string) return error.NoCid;
229
+
230
+
return try self.allocator.dupe(u8, cid_val.string);
231
+
}
232
+
233
+
pub fn fetchImage(self: *BskyClient, url: []const u8) ![]const u8 {
234
+
var aw: Io.Writer.Allocating = .init(self.allocator);
235
+
errdefer aw.deinit();
236
+
237
+
const result = self.client.fetch(.{
238
+
.location = .{ .url = url },
239
+
.method = .GET,
240
+
.response_writer = &aw.writer,
241
+
}) catch |err| {
242
+
std.debug.print("fetch image failed: {}\n", .{err});
243
+
return err;
244
+
};
245
+
246
+
if (result.status != .ok) {
247
+
aw.deinit();
248
+
return error.FetchFailed;
249
+
}
250
+
251
+
return try aw.toOwnedSlice();
252
+
}
253
+
};
254
+
255
+
fn getIsoTimestamp() []const u8 {
256
+
return "2025-01-01T00:00:00.000Z";
257
+
}
+47
bot/src/config.zig
+47
bot/src/config.zig
···
1
+
const std = @import("std");
2
+
const posix = std.posix;
3
+
4
+
pub const Config = struct {
5
+
bsky_handle: []const u8,
6
+
bsky_app_password: []const u8,
7
+
jetstream_endpoint: []const u8,
8
+
min_phrase_words: u32,
9
+
posting_enabled: bool,
10
+
cooldown_minutes: u32,
11
+
quote_chance: f32,
12
+
exclude_patterns: []const u8,
13
+
14
+
pub fn fromEnv() Config {
15
+
return .{
16
+
.bsky_handle = posix.getenv("BSKY_HANDLE") orelse "find-bufo.com",
17
+
.bsky_app_password = posix.getenv("BSKY_APP_PASSWORD") orelse "",
18
+
.jetstream_endpoint = posix.getenv("JETSTREAM_ENDPOINT") orelse "jetstream2.us-east.bsky.network",
19
+
.min_phrase_words = parseU32(posix.getenv("MIN_PHRASE_WORDS"), 4),
20
+
.posting_enabled = parseBool(posix.getenv("POSTING_ENABLED")),
21
+
.cooldown_minutes = parseU32(posix.getenv("COOLDOWN_MINUTES"), 120),
22
+
.quote_chance = parseF32(posix.getenv("QUOTE_CHANCE"), 0.5),
23
+
.exclude_patterns = posix.getenv("EXCLUDE_PATTERNS") orelse "what-have-you-done,what-have-i-done,sad,crying,cant-take",
24
+
};
25
+
}
26
+
};
27
+
28
+
fn parseU32(str: ?[]const u8, default: u32) u32 {
29
+
if (str) |s| {
30
+
return std.fmt.parseInt(u32, s, 10) catch default;
31
+
}
32
+
return default;
33
+
}
34
+
35
+
fn parseF32(str: ?[]const u8, default: f32) f32 {
36
+
if (str) |s| {
37
+
return std.fmt.parseFloat(f32, s) catch default;
38
+
}
39
+
return default;
40
+
}
41
+
42
+
fn parseBool(str: ?[]const u8) bool {
43
+
if (str) |s| {
44
+
return std.mem.eql(u8, s, "true") or std.mem.eql(u8, s, "1");
45
+
}
46
+
return false;
47
+
}
+147
bot/src/jetstream.zig
+147
bot/src/jetstream.zig
···
1
+
const std = @import("std");
2
+
const mem = std.mem;
3
+
const json = std.json;
4
+
const posix = std.posix;
5
+
const Allocator = mem.Allocator;
6
+
const websocket = @import("websocket");
7
+
8
+
pub const Post = struct {
9
+
uri: []const u8,
10
+
text: []const u8,
11
+
did: []const u8,
12
+
rkey: []const u8,
13
+
};
14
+
15
+
pub const JetstreamClient = struct {
16
+
allocator: Allocator,
17
+
host: []const u8,
18
+
callback: *const fn (Post) void,
19
+
20
+
pub fn init(allocator: Allocator, host: []const u8, callback: *const fn (Post) void) JetstreamClient {
21
+
return .{
22
+
.allocator = allocator,
23
+
.host = host,
24
+
.callback = callback,
25
+
};
26
+
}
27
+
28
+
pub fn run(self: *JetstreamClient) void {
29
+
// exponential backoff: 1s -> 2s -> 4s -> ... -> 60s cap
30
+
var backoff: u64 = 1;
31
+
const max_backoff: u64 = 60;
32
+
33
+
while (true) {
34
+
self.connect() catch |err| {
35
+
std.debug.print("jetstream error: {}, reconnecting in {}s...\n", .{ err, backoff });
36
+
};
37
+
posix.nanosleep(backoff, 0);
38
+
backoff = @min(backoff * 2, max_backoff);
39
+
}
40
+
}
41
+
42
+
fn connect(self: *JetstreamClient) !void {
43
+
const path = "/subscribe?wantedCollections=app.bsky.feed.post";
44
+
45
+
std.debug.print("connecting to wss://{s}{s}\n", .{ self.host, path });
46
+
47
+
var client = websocket.Client.init(self.allocator, .{
48
+
.host = self.host,
49
+
.port = 443,
50
+
.tls = true,
51
+
}) catch |err| {
52
+
std.debug.print("websocket client init failed: {}\n", .{err});
53
+
return err;
54
+
};
55
+
defer client.deinit();
56
+
57
+
var host_header_buf: [256]u8 = undefined;
58
+
const host_header = std.fmt.bufPrint(&host_header_buf, "Host: {s}\r\n", .{self.host}) catch self.host;
59
+
60
+
client.handshake(path, .{ .headers = host_header }) catch |err| {
61
+
std.debug.print("websocket handshake failed: {}\n", .{err});
62
+
return err;
63
+
};
64
+
65
+
std.debug.print("jetstream connected!\n", .{});
66
+
67
+
var handler = Handler{ .allocator = self.allocator, .callback = self.callback };
68
+
client.readLoop(&handler) catch |err| {
69
+
std.debug.print("websocket read loop error: {}\n", .{err});
70
+
return err;
71
+
};
72
+
}
73
+
};
74
+
75
+
const Handler = struct {
76
+
allocator: Allocator,
77
+
callback: *const fn (Post) void,
78
+
msg_count: usize = 0,
79
+
80
+
pub fn serverMessage(self: *Handler, data: []const u8) !void {
81
+
self.msg_count += 1;
82
+
if (self.msg_count % 1000 == 1) {
83
+
std.debug.print("jetstream: processed {} messages\n", .{self.msg_count});
84
+
}
85
+
self.processMessage(data) catch |err| {
86
+
if (err != error.NotAPost) {
87
+
std.debug.print("message processing error: {}\n", .{err});
88
+
}
89
+
};
90
+
}
91
+
92
+
pub fn close(_: *Handler) void {
93
+
std.debug.print("jetstream connection closed\n", .{});
94
+
}
95
+
96
+
fn processMessage(self: *Handler, payload: []const u8) !void {
97
+
// jetstream format:
98
+
// { "did": "...", "kind": "commit", "commit": { "collection": "app.bsky.feed.post", "rkey": "...", "record": { "text": "...", ... } } }
99
+
const parsed = json.parseFromSlice(json.Value, self.allocator, payload, .{}) catch return error.ParseError;
100
+
defer parsed.deinit();
101
+
102
+
const root = parsed.value.object;
103
+
104
+
// check kind
105
+
const kind = root.get("kind") orelse return error.NotAPost;
106
+
if (kind != .string or !mem.eql(u8, kind.string, "commit")) return error.NotAPost;
107
+
108
+
// get did
109
+
const did_val = root.get("did") orelse return error.NotAPost;
110
+
if (did_val != .string) return error.NotAPost;
111
+
112
+
// get commit
113
+
const commit = root.get("commit") orelse return error.NotAPost;
114
+
if (commit != .object) return error.NotAPost;
115
+
116
+
// check collection
117
+
const collection = commit.object.get("collection") orelse return error.NotAPost;
118
+
if (collection != .string or !mem.eql(u8, collection.string, "app.bsky.feed.post")) return error.NotAPost;
119
+
120
+
// check operation (create only)
121
+
const operation = commit.object.get("operation") orelse return error.NotAPost;
122
+
if (operation != .string or !mem.eql(u8, operation.string, "create")) return error.NotAPost;
123
+
124
+
// get rkey
125
+
const rkey_val = commit.object.get("rkey") orelse return error.NotAPost;
126
+
if (rkey_val != .string) return error.NotAPost;
127
+
128
+
// get record
129
+
const record = commit.object.get("record") orelse return error.NotAPost;
130
+
if (record != .object) return error.NotAPost;
131
+
132
+
// get text
133
+
const text_val = record.object.get("text") orelse return error.NotAPost;
134
+
if (text_val != .string) return error.NotAPost;
135
+
136
+
// construct uri
137
+
var uri_buf: [256]u8 = undefined;
138
+
const uri = std.fmt.bufPrint(&uri_buf, "at://{s}/app.bsky.feed.post/{s}", .{ did_val.string, rkey_val.string }) catch return error.UriTooLong;
139
+
140
+
self.callback(.{
141
+
.uri = uri,
142
+
.text = text_val.string,
143
+
.did = did_val.string,
144
+
.rkey = rkey_val.string,
145
+
});
146
+
}
147
+
};
+216
bot/src/main.zig
+216
bot/src/main.zig
···
1
+
const std = @import("std");
2
+
const mem = std.mem;
3
+
const json = std.json;
4
+
const http = std.http;
5
+
const Thread = std.Thread;
6
+
const Allocator = mem.Allocator;
7
+
const config = @import("config.zig");
8
+
const matcher = @import("matcher.zig");
9
+
const jetstream = @import("jetstream.zig");
10
+
const bsky = @import("bsky.zig");
11
+
12
+
var global_state: ?*BotState = null;
13
+
14
+
const BotState = struct {
15
+
allocator: Allocator,
16
+
config: config.Config,
17
+
matcher: matcher.Matcher,
18
+
bsky_client: bsky.BskyClient,
19
+
recent_bufos: std.StringHashMap(i64), // name -> timestamp
20
+
mutex: Thread.Mutex = .{},
21
+
rng: std.Random.DefaultPrng,
22
+
};
23
+
24
+
pub fn main() !void {
25
+
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
26
+
defer _ = gpa.deinit();
27
+
const allocator = gpa.allocator();
28
+
29
+
std.debug.print("starting bufo bot...\n", .{});
30
+
31
+
const cfg = config.Config.fromEnv();
32
+
33
+
// load bufos from API
34
+
var m = matcher.Matcher.init(allocator, cfg.min_phrase_words);
35
+
try loadBufos(allocator, &m, cfg.exclude_patterns);
36
+
std.debug.print("loaded {} bufos with >= {} word phrases\n", .{ m.count(), cfg.min_phrase_words });
37
+
38
+
if (m.count() == 0) {
39
+
std.debug.print("no bufos loaded, exiting\n", .{});
40
+
return;
41
+
}
42
+
43
+
// init bluesky client
44
+
var bsky_client = bsky.BskyClient.init(allocator, cfg.bsky_handle, cfg.bsky_app_password);
45
+
defer bsky_client.deinit();
46
+
47
+
if (cfg.posting_enabled) {
48
+
try bsky_client.login();
49
+
} else {
50
+
std.debug.print("posting disabled, running in dry-run mode\n", .{});
51
+
}
52
+
53
+
// init state
54
+
var state = BotState{
55
+
.allocator = allocator,
56
+
.config = cfg,
57
+
.matcher = m,
58
+
.bsky_client = bsky_client,
59
+
.recent_bufos = std.StringHashMap(i64).init(allocator),
60
+
.rng = std.Random.DefaultPrng.init(@intCast(std.time.timestamp())),
61
+
};
62
+
defer state.recent_bufos.deinit();
63
+
64
+
global_state = &state;
65
+
66
+
// start jetstream consumer
67
+
var js = jetstream.JetstreamClient.init(allocator, cfg.jetstream_endpoint, onPost);
68
+
js.run();
69
+
}
70
+
71
+
fn onPost(post: jetstream.Post) void {
72
+
const state = global_state orelse return;
73
+
74
+
// check for match
75
+
const match = state.matcher.findMatch(post.text) orelse return;
76
+
77
+
std.debug.print("match: '{s}' -> {s}\n", .{ match.phrase, match.name });
78
+
79
+
if (!state.config.posting_enabled) {
80
+
std.debug.print("posting disabled, skipping\n", .{});
81
+
return;
82
+
}
83
+
84
+
state.mutex.lock();
85
+
defer state.mutex.unlock();
86
+
87
+
// check cooldown
88
+
const now = std.time.timestamp();
89
+
const cooldown_secs = @as(i64, @intCast(state.config.cooldown_minutes)) * 60;
90
+
91
+
if (state.recent_bufos.get(match.name)) |last_posted| {
92
+
if (now - last_posted < cooldown_secs) {
93
+
std.debug.print("cooldown: {s} posted recently, skipping\n", .{match.name});
94
+
return;
95
+
}
96
+
}
97
+
98
+
// random quote chance
99
+
const should_quote = state.rng.random().float(f32) < state.config.quote_chance;
100
+
101
+
// fetch bufo image
102
+
const img_data = state.bsky_client.fetchImage(match.url) catch |err| {
103
+
std.debug.print("failed to fetch bufo image: {}\n", .{err});
104
+
return;
105
+
};
106
+
defer state.allocator.free(img_data);
107
+
108
+
// determine content type from URL
109
+
const content_type = if (mem.endsWith(u8, match.url, ".gif"))
110
+
"image/gif"
111
+
else if (mem.endsWith(u8, match.url, ".png"))
112
+
"image/png"
113
+
else
114
+
"image/jpeg";
115
+
116
+
// upload blob
117
+
const blob_json = state.bsky_client.uploadBlob(img_data, content_type) catch |err| {
118
+
std.debug.print("failed to upload blob: {}\n", .{err});
119
+
return;
120
+
};
121
+
defer state.allocator.free(blob_json);
122
+
123
+
// build alt text (name without extension, dashes to spaces)
124
+
var alt_buf: [128]u8 = undefined;
125
+
var alt_len: usize = 0;
126
+
for (match.name) |c| {
127
+
if (c == '-') {
128
+
alt_buf[alt_len] = ' ';
129
+
} else if (c == '.') {
130
+
break; // stop at extension
131
+
} else {
132
+
alt_buf[alt_len] = c;
133
+
}
134
+
alt_len += 1;
135
+
if (alt_len >= alt_buf.len - 1) break;
136
+
}
137
+
const alt_text = alt_buf[0..alt_len];
138
+
139
+
if (should_quote) {
140
+
// get post CID for quote
141
+
const cid = state.bsky_client.getPostCid(post.uri) catch |err| {
142
+
std.debug.print("failed to get post CID: {}\n", .{err});
143
+
return;
144
+
};
145
+
defer state.allocator.free(cid);
146
+
147
+
state.bsky_client.createQuotePost(post.uri, cid, blob_json, alt_text) catch |err| {
148
+
std.debug.print("failed to create quote post: {}\n", .{err});
149
+
return;
150
+
};
151
+
std.debug.print("posted bufo quote: {s} (phrase: {s})\n", .{ match.name, match.phrase });
152
+
} else {
153
+
// post without quote
154
+
var text_buf: [256]u8 = undefined;
155
+
const text = std.fmt.bufPrint(&text_buf, "matched {s} (not quoting to reduce spam)", .{post.rkey}) catch "matched a post";
156
+
157
+
state.bsky_client.createSimplePost(text, blob_json, alt_text) catch |err| {
158
+
std.debug.print("failed to create simple post: {}\n", .{err});
159
+
return;
160
+
};
161
+
std.debug.print("posted bufo (no quote): {s} for {s}\n", .{ match.name, post.rkey });
162
+
}
163
+
164
+
// update cooldown cache
165
+
state.recent_bufos.put(match.name, now) catch {};
166
+
}
167
+
168
+
fn loadBufos(allocator: Allocator, m: *matcher.Matcher, exclude_patterns: []const u8) !void {
169
+
var client = http.Client{ .allocator = allocator };
170
+
defer client.deinit();
171
+
172
+
var url_buf: [512]u8 = undefined;
173
+
const url = std.fmt.bufPrint(&url_buf, "https://find-bufo.com/api/search?query=bufo&top_k=2000&alpha=0&exclude={s}", .{exclude_patterns}) catch return error.UrlTooLong;
174
+
175
+
var aw: std.Io.Writer.Allocating = .init(allocator);
176
+
defer aw.deinit();
177
+
178
+
const result = client.fetch(.{
179
+
.location = .{ .url = url },
180
+
.method = .GET,
181
+
.response_writer = &aw.writer,
182
+
}) catch |err| {
183
+
std.debug.print("failed to fetch bufos: {}\n", .{err});
184
+
return err;
185
+
};
186
+
187
+
if (result.status != .ok) {
188
+
std.debug.print("failed to fetch bufos, status: {}\n", .{result.status});
189
+
return error.FetchFailed;
190
+
}
191
+
192
+
const response_list = aw.toArrayList();
193
+
const response = response_list.items;
194
+
195
+
const parsed = json.parseFromSlice(json.Value, allocator, response, .{}) catch return error.ParseError;
196
+
defer parsed.deinit();
197
+
198
+
const results = parsed.value.object.get("results") orelse return;
199
+
if (results != .array) return;
200
+
201
+
var loaded: usize = 0;
202
+
for (results.array.items) |item| {
203
+
if (item != .object) continue;
204
+
205
+
const name_val = item.object.get("name") orelse continue;
206
+
if (name_val != .string) continue;
207
+
208
+
const url_val = item.object.get("url") orelse continue;
209
+
if (url_val != .string) continue;
210
+
211
+
m.addBufo(name_val.string, url_val.string) catch continue;
212
+
loaded += 1;
213
+
}
214
+
215
+
std.debug.print("loaded {} bufos from API\n", .{loaded});
216
+
}
+164
bot/src/matcher.zig
+164
bot/src/matcher.zig
···
1
+
const std = @import("std");
2
+
const mem = std.mem;
3
+
const Allocator = mem.Allocator;
4
+
5
+
pub const Bufo = struct {
6
+
name: []const u8,
7
+
url: []const u8,
8
+
phrase: []const []const u8,
9
+
};
10
+
11
+
pub const Match = struct {
12
+
name: []const u8,
13
+
url: []const u8,
14
+
phrase: []const u8,
15
+
};
16
+
17
+
pub const Matcher = struct {
18
+
bufos: std.ArrayList(Bufo) = .{},
19
+
allocator: Allocator,
20
+
min_words: u32,
21
+
22
+
pub fn init(allocator: Allocator, min_words: u32) Matcher {
23
+
return .{
24
+
.allocator = allocator,
25
+
.min_words = min_words,
26
+
};
27
+
}
28
+
29
+
pub fn deinit(self: *Matcher) void {
30
+
for (self.bufos.items) |bufo| {
31
+
self.allocator.free(bufo.name);
32
+
self.allocator.free(bufo.url);
33
+
for (bufo.phrase) |word| {
34
+
self.allocator.free(word);
35
+
}
36
+
self.allocator.free(bufo.phrase);
37
+
}
38
+
self.bufos.deinit(self.allocator);
39
+
}
40
+
41
+
pub fn addBufo(self: *Matcher, name: []const u8, url: []const u8) !void {
42
+
const phrase = try extractPhrase(self.allocator, name);
43
+
44
+
if (phrase.len < self.min_words) {
45
+
for (phrase) |word| self.allocator.free(word);
46
+
self.allocator.free(phrase);
47
+
return;
48
+
}
49
+
50
+
try self.bufos.append(self.allocator, .{
51
+
.name = try self.allocator.dupe(u8, name),
52
+
.url = try self.allocator.dupe(u8, url),
53
+
.phrase = phrase,
54
+
});
55
+
}
56
+
57
+
pub fn findMatch(self: *Matcher, text: []const u8) ?Match {
58
+
var words: std.ArrayList([]const u8) = .{};
59
+
defer words.deinit(self.allocator);
60
+
61
+
var i: usize = 0;
62
+
while (i < text.len) {
63
+
while (i < text.len and !isAlpha(text[i])) : (i += 1) {}
64
+
if (i >= text.len) break;
65
+
66
+
const start = i;
67
+
while (i < text.len and isAlpha(text[i])) : (i += 1) {}
68
+
69
+
const word = text[start..i];
70
+
if (word.len > 0) {
71
+
words.append(self.allocator, word) catch continue;
72
+
}
73
+
}
74
+
75
+
for (self.bufos.items) |bufo| {
76
+
if (containsPhrase(words.items, bufo.phrase)) {
77
+
var phrase_buf: [256]u8 = undefined;
78
+
var phrase_len: usize = 0;
79
+
for (bufo.phrase, 0..) |word, j| {
80
+
if (j > 0) {
81
+
phrase_buf[phrase_len] = ' ';
82
+
phrase_len += 1;
83
+
}
84
+
@memcpy(phrase_buf[phrase_len .. phrase_len + word.len], word);
85
+
phrase_len += word.len;
86
+
}
87
+
return .{
88
+
.name = bufo.name,
89
+
.url = bufo.url,
90
+
.phrase = phrase_buf[0..phrase_len],
91
+
};
92
+
}
93
+
}
94
+
return null;
95
+
}
96
+
97
+
pub fn count(self: *Matcher) usize {
98
+
return self.bufos.items.len;
99
+
}
100
+
};
101
+
102
+
fn extractPhrase(allocator: Allocator, name: []const u8) ![]const []const u8 {
103
+
var start: usize = 0;
104
+
if (mem.startsWith(u8, name, "bufo-")) {
105
+
start = 5;
106
+
}
107
+
var end = name.len;
108
+
if (mem.endsWith(u8, name, ".gif")) {
109
+
end -= 4;
110
+
} else if (mem.endsWith(u8, name, ".png")) {
111
+
end -= 4;
112
+
} else if (mem.endsWith(u8, name, ".jpg")) {
113
+
end -= 4;
114
+
} else if (mem.endsWith(u8, name, ".jpeg")) {
115
+
end -= 5;
116
+
}
117
+
118
+
const slug = name[start..end];
119
+
120
+
var words: std.ArrayList([]const u8) = .{};
121
+
errdefer {
122
+
for (words.items) |word| allocator.free(word);
123
+
words.deinit(allocator);
124
+
}
125
+
126
+
var iter = mem.splitScalar(u8, slug, '-');
127
+
while (iter.next()) |word| {
128
+
if (word.len > 0) {
129
+
const lower = try allocator.alloc(u8, word.len);
130
+
for (word, 0..) |c, j| {
131
+
lower[j] = std.ascii.toLower(c);
132
+
}
133
+
try words.append(allocator, lower);
134
+
}
135
+
}
136
+
137
+
return try words.toOwnedSlice(allocator);
138
+
}
139
+
140
+
fn containsPhrase(post_words: []const []const u8, phrase: []const []const u8) bool {
141
+
if (phrase.len == 0 or post_words.len < phrase.len) return false;
142
+
143
+
outer: for (0..post_words.len - phrase.len + 1) |i| {
144
+
for (phrase, 0..) |phrase_word, j| {
145
+
if (!eqlIgnoreCase(post_words[i + j], phrase_word)) {
146
+
continue :outer;
147
+
}
148
+
}
149
+
return true;
150
+
}
151
+
return false;
152
+
}
153
+
154
+
fn eqlIgnoreCase(a: []const u8, b: []const u8) bool {
155
+
if (a.len != b.len) return false;
156
+
for (a, b) |ca, cb| {
157
+
if (std.ascii.toLower(ca) != std.ascii.toLower(cb)) return false;
158
+
}
159
+
return true;
160
+
}
161
+
162
+
fn isAlpha(c: u8) bool {
163
+
return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');
164
+
}
+28
justfile
+28
justfile
···
1
+
# bufo search justfile
2
+
mod bot
3
+
4
+
# re-index all bufos with new embeddings
5
+
re-index:
6
+
@echo "re-indexing all bufos with input_type=document..."
7
+
uv run scripts/ingest_bufos.py
8
+
9
+
# deploy to fly.io
10
+
deploy:
11
+
@echo "deploying to fly.io..."
12
+
fly deploy --wait-timeout 180
13
+
14
+
# run dev server with hot reload
15
+
dev:
16
+
@echo "starting dev server with hot reload..."
17
+
RUST_LOG=info cargo watch -x run -w src -w static
18
+
19
+
# build and run locally
20
+
run:
21
+
@echo "building and running locally..."
22
+
cargo build --release
23
+
./target/release/find-bufo
24
+
25
+
# build release binary
26
+
build:
27
+
@echo "building release binary..."
28
+
cargo build --release
+153
scripts/add_one_bufo.py
+153
scripts/add_one_bufo.py
···
1
+
#!/usr/bin/env python3
2
+
# /// script
3
+
# requires-python = ">=3.11"
4
+
# dependencies = [
5
+
# "httpx",
6
+
# "python-dotenv",
7
+
# "pillow",
8
+
# ]
9
+
# ///
10
+
"""
11
+
Add a single bufo to turbopuffer.
12
+
Usage: uv run scripts/add_one_bufo.py <path_to_image>
13
+
"""
14
+
15
+
import asyncio
16
+
import base64
17
+
import hashlib
18
+
import os
19
+
import sys
20
+
from io import BytesIO
21
+
from pathlib import Path
22
+
23
+
import httpx
24
+
from PIL import Image
25
+
from dotenv import load_dotenv
26
+
27
+
load_dotenv(Path(__file__).parent.parent / ".env")
28
+
29
+
30
+
async def embed_image(client: httpx.AsyncClient, image_path: Path, api_key: str) -> list[float] | None:
31
+
"""Generate embedding for an image using Voyage AI"""
32
+
try:
33
+
image = Image.open(image_path)
34
+
is_animated = hasattr(image, 'n_frames') and image.n_frames > 1
35
+
filename_text = image_path.stem.replace("-", " ").replace("_", " ")
36
+
37
+
content = [{"type": "text", "text": filename_text}]
38
+
39
+
if is_animated:
40
+
num_frames = image.n_frames
41
+
max_frames = min(5, num_frames)
42
+
frame_indices = [int(i * (num_frames - 1) / (max_frames - 1)) for i in range(max_frames)]
43
+
for frame_idx in frame_indices:
44
+
image.seek(frame_idx)
45
+
buffered = BytesIO()
46
+
image.convert("RGB").save(buffered, format="WEBP", lossless=True)
47
+
img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
48
+
content.append({
49
+
"type": "image_base64",
50
+
"image_base64": f"data:image/webp;base64,{img_base64}",
51
+
})
52
+
else:
53
+
buffered = BytesIO()
54
+
image.convert("RGB").save(buffered, format="WEBP", lossless=True)
55
+
img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
56
+
content.append({
57
+
"type": "image_base64",
58
+
"image_base64": f"data:image/webp;base64,{img_base64}",
59
+
})
60
+
61
+
response = await client.post(
62
+
"https://api.voyageai.com/v1/multimodalembeddings",
63
+
headers={
64
+
"Authorization": f"Bearer {api_key}",
65
+
"Content-Type": "application/json",
66
+
},
67
+
json={
68
+
"inputs": [{"content": content}],
69
+
"model": "voyage-multimodal-3",
70
+
"input_type": "document",
71
+
},
72
+
timeout=60.0,
73
+
)
74
+
response.raise_for_status()
75
+
result = response.json()
76
+
return result["data"][0]["embedding"]
77
+
except Exception as e:
78
+
print(f"error embedding {image_path.name}: {e}")
79
+
return None
80
+
81
+
82
+
async def upload_to_turbopuffer(filename: str, embedding: list[float], api_key: str, namespace: str):
83
+
"""Upload single embedding to turbopuffer"""
84
+
file_hash = hashlib.sha256(filename.encode()).hexdigest()[:16]
85
+
name = filename.rsplit(".", 1)[0]
86
+
url = f"https://find-bufo.fly.dev/static/{filename}"
87
+
88
+
async with httpx.AsyncClient() as client:
89
+
response = await client.post(
90
+
f"https://api.turbopuffer.com/v1/vectors/{namespace}",
91
+
headers={
92
+
"Authorization": f"Bearer {api_key}",
93
+
"Content-Type": "application/json",
94
+
},
95
+
json={
96
+
"ids": [file_hash],
97
+
"vectors": [embedding],
98
+
"distance_metric": "cosine_distance",
99
+
"attributes": {
100
+
"url": [url],
101
+
"name": [name],
102
+
"filename": [filename],
103
+
},
104
+
"schema": {
105
+
"name": {"type": "string", "full_text_search": True},
106
+
"filename": {"type": "string", "full_text_search": True},
107
+
},
108
+
},
109
+
timeout=30.0,
110
+
)
111
+
if response.status_code != 200:
112
+
print(f"turbopuffer error: {response.text}")
113
+
response.raise_for_status()
114
+
115
+
print(f"uploaded {filename} to turbopuffer")
116
+
117
+
118
+
async def main():
119
+
if len(sys.argv) < 2:
120
+
print("usage: uv run scripts/add_one_bufo.py <path_to_image>")
121
+
sys.exit(1)
122
+
123
+
image_path = Path(sys.argv[1])
124
+
if not image_path.exists():
125
+
print(f"file not found: {image_path}")
126
+
sys.exit(1)
127
+
128
+
voyage_api_key = os.getenv("VOYAGE_API_TOKEN")
129
+
if not voyage_api_key:
130
+
print("VOYAGE_API_TOKEN not set")
131
+
sys.exit(1)
132
+
133
+
tpuf_api_key = os.getenv("TURBOPUFFER_API_KEY")
134
+
if not tpuf_api_key:
135
+
print("TURBOPUFFER_API_KEY not set")
136
+
sys.exit(1)
137
+
138
+
tpuf_namespace = os.getenv("TURBOPUFFER_NAMESPACE", "bufos")
139
+
140
+
print(f"adding {image_path.name}...")
141
+
142
+
async with httpx.AsyncClient() as client:
143
+
embedding = await embed_image(client, image_path, voyage_api_key)
144
+
if not embedding:
145
+
print("failed to generate embedding")
146
+
sys.exit(1)
147
+
148
+
await upload_to_turbopuffer(image_path.name, embedding, tpuf_api_key, tpuf_namespace)
149
+
print("done!")
150
+
151
+
152
+
if __name__ == "__main__":
153
+
asyncio.run(main())
+15
-5
scripts/ingest_bufos.py
+15
-5
scripts/ingest_bufos.py
···
125
125
# check if this is an animated image
126
126
is_animated = hasattr(image, 'n_frames') and image.n_frames > 1
127
127
128
+
# extract semantic meaning from filename for early fusion
129
+
# convert "bufo-jumping-on-bed.png" -> "bufo jumping on bed"
130
+
filename_text = image_path.stem.replace("-", " ").replace("_", " ")
131
+
132
+
# start content array with filename text for early fusion
133
+
content = [{
134
+
"type": "text",
135
+
"text": filename_text
136
+
}]
137
+
128
138
if is_animated:
129
139
# for animated GIFs, extract multiple keyframes for temporal representation
130
140
num_frames = image.n_frames
···
132
142
max_frames = min(5, num_frames)
133
143
frame_indices = [int(i * (num_frames - 1) / (max_frames - 1)) for i in range(max_frames)]
134
144
135
-
# extract each frame as base64 image
136
-
content = []
145
+
# add each frame to content array
137
146
for frame_idx in frame_indices:
138
147
image.seek(frame_idx)
139
148
buffered = BytesIO()
···
144
153
"image_base64": f"data:image/webp;base64,{img_base64}",
145
154
})
146
155
else:
147
-
# for static images, just send the single image
156
+
# for static images, add single image to content array
148
157
buffered = BytesIO()
149
158
image.convert("RGB").save(buffered, format="WEBP", lossless=True)
150
159
img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
151
-
content = [{
160
+
content.append({
152
161
"type": "image_base64",
153
162
"image_base64": f"data:image/webp;base64,{img_base64}",
154
-
}]
163
+
})
155
164
156
165
response = await client.post(
157
166
"https://api.voyageai.com/v1/multimodalembeddings",
···
162
171
json={
163
172
"inputs": [{"content": content}],
164
173
"model": "voyage-multimodal-3",
174
+
"input_type": "document",
165
175
},
166
176
timeout=60.0,
167
177
)
+38
-34
src/embedding.rs
+38
-34
src/embedding.rs
···
1
-
use anyhow::{Context, Result};
1
+
//! voyage AI embedding implementation
2
+
//!
3
+
//! implements the `Embedder` trait for voyage's multimodal-3 model.
4
+
5
+
use crate::providers::{Embedder, EmbeddingError};
2
6
use reqwest::Client;
3
7
use serde::{Deserialize, Serialize};
8
+
9
+
const VOYAGE_API_URL: &str = "https://api.voyageai.com/v1/multimodalembeddings";
10
+
const VOYAGE_MODEL: &str = "voyage-multimodal-3";
4
11
5
12
#[derive(Debug, Serialize)]
6
-
struct VoyageEmbeddingRequest {
13
+
struct VoyageRequest {
7
14
inputs: Vec<MultimodalInput>,
8
15
model: String,
16
+
#[serde(skip_serializing_if = "Option::is_none")]
17
+
input_type: Option<String>,
9
18
}
10
19
11
20
#[derive(Debug, Serialize)]
···
20
29
}
21
30
22
31
#[derive(Debug, Deserialize)]
23
-
struct VoyageEmbeddingResponse {
32
+
struct VoyageResponse {
24
33
data: Vec<VoyageEmbeddingData>,
25
34
}
26
35
···
29
38
embedding: Vec<f32>,
30
39
}
31
40
32
-
pub struct EmbeddingClient {
41
+
/// voyage AI multimodal embedding client
42
+
///
43
+
/// uses the voyage-multimodal-3 model which produces 1024-dimensional vectors.
44
+
/// designed for early fusion of text and image content.
45
+
#[derive(Clone)]
46
+
pub struct VoyageEmbedder {
33
47
client: Client,
34
48
api_key: String,
35
49
}
36
50
37
-
impl EmbeddingClient {
51
+
impl VoyageEmbedder {
38
52
pub fn new(api_key: String) -> Self {
39
53
Self {
40
54
client: Client::new(),
41
55
api_key,
42
56
}
43
57
}
58
+
}
44
59
45
-
pub async fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
46
-
let request = VoyageEmbeddingRequest {
60
+
impl Embedder for VoyageEmbedder {
61
+
async fn embed(&self, text: &str) -> Result<Vec<f32>, EmbeddingError> {
62
+
let request = VoyageRequest {
47
63
inputs: vec![MultimodalInput {
48
64
content: vec![ContentSegment::Text {
49
65
text: text.to_string(),
50
66
}],
51
67
}],
52
-
model: "voyage-multimodal-3".to_string(),
68
+
model: VOYAGE_MODEL.to_string(),
69
+
input_type: Some("query".to_string()),
53
70
};
54
-
55
-
let json_body = serde_json::to_string(&request)?;
56
-
log::debug!("Sending request body: {}", json_body);
57
71
58
72
let response = self
59
73
.client
60
-
.post("https://api.voyageai.com/v1/multimodalembeddings")
74
+
.post(VOYAGE_API_URL)
61
75
.header("Authorization", format!("Bearer {}", self.api_key))
62
76
.json(&request)
63
77
.send()
64
-
.await
65
-
.context("failed to send embedding request")?;
78
+
.await?;
66
79
67
80
if !response.status().is_success() {
68
-
let status = response.status();
81
+
let status = response.status().as_u16();
69
82
let body = response.text().await.unwrap_or_default();
70
-
anyhow::bail!("voyage api error ({}): {}", status, body);
83
+
return Err(EmbeddingError::Api { status, body });
71
84
}
72
85
73
-
let embedding_response: VoyageEmbeddingResponse = response
74
-
.json()
75
-
.await
76
-
.context("failed to parse embedding response")?;
86
+
let voyage_response: VoyageResponse = response.json().await.map_err(|e| {
87
+
EmbeddingError::Other(anyhow::anyhow!("failed to parse response: {}", e))
88
+
})?;
77
89
78
-
let embedding = embedding_response
90
+
voyage_response
79
91
.data
80
92
.into_iter()
81
93
.next()
82
94
.map(|d| d.embedding)
83
-
.context("no embedding returned")?;
84
-
85
-
log::debug!(
86
-
"Generated embedding for '{}': dimension={}, first 5 values=[{:.4}, {:.4}, {:.4}, {:.4}, {:.4}]",
87
-
text,
88
-
embedding.len(),
89
-
embedding.get(0).unwrap_or(&0.0),
90
-
embedding.get(1).unwrap_or(&0.0),
91
-
embedding.get(2).unwrap_or(&0.0),
92
-
embedding.get(3).unwrap_or(&0.0),
93
-
embedding.get(4).unwrap_or(&0.0)
94
-
);
95
+
.ok_or(EmbeddingError::EmptyResponse)
96
+
}
95
97
96
-
Ok(embedding)
98
+
fn name(&self) -> &'static str {
99
+
"voyage-multimodal-3"
97
100
}
98
101
}
102
+
+193
src/filter.rs
+193
src/filter.rs
···
1
+
//! composable result filters
2
+
//!
3
+
//! filters are predicates that can be combined to create complex filtering logic.
4
+
5
+
use regex::Regex;
6
+
7
+
/// a single search result that can be filtered
8
+
pub trait Filterable {
9
+
fn name(&self) -> &str;
10
+
}
11
+
12
+
/// a predicate that can accept or reject items
13
+
pub trait Filter<T: Filterable>: Send + Sync {
14
+
/// returns true if the item should be kept
15
+
fn matches(&self, item: &T) -> bool;
16
+
}
17
+
18
+
/// filters out inappropriate content based on a blocklist
19
+
struct BlocklistFilter {
20
+
blocklist: Vec<&'static str>,
21
+
}
22
+
23
+
impl BlocklistFilter {
24
+
fn inappropriate_bufos() -> Self {
25
+
Self {
26
+
blocklist: vec![
27
+
"bufo-juicy",
28
+
"good-news-bufo-offers-suppository",
29
+
"bufo-declines-your-suppository-offer",
30
+
"tsa-bufo-gropes-you",
31
+
],
32
+
}
33
+
}
34
+
}
35
+
36
+
impl<T: Filterable> Filter<T> for BlocklistFilter {
37
+
fn matches(&self, item: &T) -> bool {
38
+
!self.blocklist.iter().any(|blocked| item.name().contains(blocked))
39
+
}
40
+
}
41
+
42
+
/// filters out items matching any of the given regex patterns
43
+
struct ExcludePatternFilter {
44
+
patterns: Vec<Regex>,
45
+
}
46
+
47
+
impl ExcludePatternFilter {
48
+
fn from_comma_separated(pattern_str: &str) -> Self {
49
+
let patterns = pattern_str
50
+
.split(',')
51
+
.map(|p| p.trim())
52
+
.filter(|p| !p.is_empty())
53
+
.filter_map(|p| Regex::new(p).ok())
54
+
.collect();
55
+
56
+
Self { patterns }
57
+
}
58
+
59
+
fn empty() -> Self {
60
+
Self { patterns: vec![] }
61
+
}
62
+
}
63
+
64
+
impl<T: Filterable> Filter<T> for ExcludePatternFilter {
65
+
fn matches(&self, item: &T) -> bool {
66
+
!self.patterns.iter().any(|p| p.is_match(item.name()))
67
+
}
68
+
}
69
+
70
+
/// combined filter that handles family-friendly mode and include/exclude patterns
71
+
pub struct ContentFilter {
72
+
family_friendly: bool,
73
+
blocklist: BlocklistFilter,
74
+
exclude: ExcludePatternFilter,
75
+
include_patterns: Vec<Regex>,
76
+
}
77
+
78
+
impl ContentFilter {
79
+
pub fn new(
80
+
family_friendly: bool,
81
+
exclude_str: Option<&str>,
82
+
include_str: Option<&str>,
83
+
) -> Self {
84
+
let exclude = exclude_str
85
+
.map(ExcludePatternFilter::from_comma_separated)
86
+
.unwrap_or_else(ExcludePatternFilter::empty);
87
+
88
+
let include_patterns: Vec<Regex> = include_str
89
+
.map(|s| {
90
+
s.split(',')
91
+
.map(|p| p.trim())
92
+
.filter(|p| !p.is_empty())
93
+
.filter_map(|p| Regex::new(p).ok())
94
+
.collect()
95
+
})
96
+
.unwrap_or_default();
97
+
98
+
Self {
99
+
family_friendly,
100
+
blocklist: BlocklistFilter::inappropriate_bufos(),
101
+
exclude,
102
+
include_patterns,
103
+
}
104
+
}
105
+
106
+
pub fn exclude_pattern_count(&self) -> usize {
107
+
self.exclude.patterns.len()
108
+
}
109
+
110
+
pub fn exclude_patterns_str(&self) -> String {
111
+
self.exclude
112
+
.patterns
113
+
.iter()
114
+
.map(|r| r.as_str())
115
+
.collect::<Vec<_>>()
116
+
.join(",")
117
+
}
118
+
}
119
+
120
+
impl<T: Filterable> Filter<T> for ContentFilter {
121
+
fn matches(&self, item: &T) -> bool {
122
+
// check family-friendly blocklist
123
+
if self.family_friendly && !self.blocklist.matches(item) {
124
+
return false;
125
+
}
126
+
127
+
// check if explicitly included (overrides exclude)
128
+
let matches_include = self.include_patterns.iter().any(|p| p.is_match(item.name()));
129
+
if matches_include {
130
+
return true;
131
+
}
132
+
133
+
// check exclude patterns
134
+
self.exclude.matches(item)
135
+
}
136
+
}
137
+
138
+
#[cfg(test)]
139
+
mod tests {
140
+
use super::*;
141
+
142
+
struct TestItem {
143
+
name: String,
144
+
}
145
+
146
+
impl Filterable for TestItem {
147
+
fn name(&self) -> &str {
148
+
&self.name
149
+
}
150
+
}
151
+
152
+
#[test]
153
+
fn test_blocklist_filter() {
154
+
let filter = BlocklistFilter::inappropriate_bufos();
155
+
let good = TestItem {
156
+
name: "bufo-happy".into(),
157
+
};
158
+
let bad = TestItem {
159
+
name: "bufo-juicy".into(),
160
+
};
161
+
162
+
assert!(filter.matches(&good));
163
+
assert!(!filter.matches(&bad));
164
+
}
165
+
166
+
#[test]
167
+
fn test_exclude_pattern_filter() {
168
+
let filter = ExcludePatternFilter::from_comma_separated("test, draft");
169
+
let good = TestItem {
170
+
name: "bufo-happy".into(),
171
+
};
172
+
let bad = TestItem {
173
+
name: "bufo-test-mode".into(),
174
+
};
175
+
176
+
assert!(filter.matches(&good));
177
+
assert!(!filter.matches(&bad));
178
+
}
179
+
180
+
#[test]
181
+
fn test_include_overrides_exclude() {
182
+
let filter = ContentFilter::new(false, Some("party"), Some("birthday-party"));
183
+
let excluded = TestItem {
184
+
name: "bufo-party".into(),
185
+
};
186
+
let included = TestItem {
187
+
name: "bufo-birthday-party".into(),
188
+
};
189
+
190
+
assert!(!filter.matches(&excluded));
191
+
assert!(filter.matches(&included));
192
+
}
193
+
}
+7
-1
src/main.rs
+7
-1
src/main.rs
···
1
1
mod config;
2
2
mod embedding;
3
+
mod filter;
4
+
mod providers;
5
+
mod scoring;
3
6
mod search;
4
7
mod turbopuffer;
5
8
···
10
13
use anyhow::Result;
11
14
use config::Config;
12
15
use opentelemetry_instrumentation_actix_web::{RequestMetrics, RequestTracing};
16
+
use tracing::level_filters::LevelFilter;
13
17
14
18
async fn index() -> HttpResponse {
15
19
HttpResponse::Ok()
···
21
25
async fn main() -> Result<()> {
22
26
dotenv::dotenv().ok();
23
27
24
-
// initialize logfire
28
+
// initialize logfire with info level filter to exclude trace/debug spans
25
29
let logfire = logfire::configure()
30
+
.with_default_level_filter(LevelFilter::INFO)
26
31
.finish()
27
32
.map_err(|e| anyhow::anyhow!("failed to initialize logfire: {}", e))?;
28
33
···
60
65
web::scope("/api")
61
66
.wrap(Governor::new(&governor_conf))
62
67
.route("/search", web::post().to(search::search))
68
+
.route("/search", web::get().to(search::search_get))
63
69
.route("/health", web::get().to(|| async { HttpResponse::Ok().body("ok") }))
64
70
)
65
71
.service(fs::Files::new("/static", "./static").show_files_listing())
+99
src/providers.rs
+99
src/providers.rs
···
1
+
//! provider abstractions for embedding and vector search backends
2
+
//!
3
+
//! these traits allow swapping implementations (e.g., voyage โ openai embeddings)
4
+
//! without changing the search logic.
5
+
//!
6
+
//! ## design notes
7
+
//!
8
+
//! we use `async fn` in traits directly (stabilized in rust 1.75). for this crate's
9
+
//! use case (single-threaded actix-web), the Send bound issue doesn't apply.
10
+
//!
11
+
//! the trait design follows patterns from:
12
+
//! - async-openai's `Config` trait for backend abstraction
13
+
//! - tower's `Service` trait for composability (though simpler here)
14
+
15
+
use std::future::Future;
16
+
use thiserror::Error;
17
+
18
+
/// errors that can occur when generating embeddings
19
+
#[derive(Debug, Error)]
20
+
pub enum EmbeddingError {
21
+
#[error("failed to send request: {0}")]
22
+
Request(#[from] reqwest::Error),
23
+
24
+
#[error("api error ({status}): {body}")]
25
+
Api { status: u16, body: String },
26
+
27
+
#[error("no embedding returned from provider")]
28
+
EmptyResponse,
29
+
30
+
#[error("{0}")]
31
+
Other(#[from] anyhow::Error),
32
+
}
33
+
34
+
/// a provider that can generate embeddings for text
35
+
///
36
+
/// implementations should be cheap to clone (wrap expensive resources in Arc).
37
+
///
38
+
/// # example
39
+
///
40
+
/// ```ignore
41
+
/// let client = VoyageEmbedder::new(api_key);
42
+
/// let embedding = client.embed("hello world").await?;
43
+
/// ```
44
+
pub trait Embedder: Send + Sync {
45
+
/// generate an embedding vector for the given text
46
+
fn embed(&self, text: &str) -> impl Future<Output = Result<Vec<f32>, EmbeddingError>> + Send;
47
+
48
+
/// human-readable name for logging/debugging
49
+
fn name(&self) -> &'static str;
50
+
}
51
+
52
+
/// errors that can occur during vector search
53
+
#[derive(Debug, Error)]
54
+
pub enum VectorSearchError {
55
+
#[error("request failed: {0}")]
56
+
Request(#[from] reqwest::Error),
57
+
58
+
#[error("api error ({status}): {body}")]
59
+
Api { status: u16, body: String },
60
+
61
+
#[error("query too long: {message}")]
62
+
QueryTooLong { message: String },
63
+
64
+
#[error("parse error: {0}")]
65
+
Parse(String),
66
+
67
+
#[error("{0}")]
68
+
Other(#[from] anyhow::Error),
69
+
}
70
+
71
+
/// a single result from a vector search
72
+
#[derive(Debug, Clone)]
73
+
pub struct SearchResult {
74
+
pub id: String,
75
+
/// raw distance/score from the backend (interpretation varies by method)
76
+
pub score: f32,
77
+
/// arbitrary key-value attributes
78
+
pub attributes: std::collections::HashMap<String, String>,
79
+
}
80
+
81
+
/// a provider that can perform vector similarity search
82
+
pub trait VectorStore: Send + Sync {
83
+
/// search by vector embedding (ANN/cosine similarity)
84
+
fn search_by_vector(
85
+
&self,
86
+
embedding: &[f32],
87
+
top_k: usize,
88
+
) -> impl Future<Output = Result<Vec<SearchResult>, VectorSearchError>> + Send;
89
+
90
+
/// search by keyword (BM25 full-text search)
91
+
fn search_by_keyword(
92
+
&self,
93
+
query: &str,
94
+
top_k: usize,
95
+
) -> impl Future<Output = Result<Vec<SearchResult>, VectorSearchError>> + Send;
96
+
97
+
/// human-readable name for logging/debugging
98
+
fn name(&self) -> &'static str;
99
+
}
+164
src/scoring.rs
+164
src/scoring.rs
···
1
+
//! score fusion and normalization for hybrid search
2
+
//!
3
+
//! this module handles the weighted combination of semantic (vector) and
4
+
//! keyword (BM25) search scores.
5
+
//!
6
+
//! ## normalization strategies
7
+
//!
8
+
//! - **cosine distance โ similarity**: `1.0 - (distance / 2.0)` maps [0, 2] โ [1, 0]
9
+
//! - **BM25 max-scaling**: divide by max score so top result = 1.0
10
+
//!
11
+
//! ## fusion formula
12
+
//!
13
+
//! ```text
14
+
//! score = ฮฑ * semantic + (1 - ฮฑ) * keyword
15
+
//! ```
16
+
//!
17
+
//! reference: https://opensourceconnections.com/blog/2023/02/27/hybrid-vigor-winning-at-hybrid-search/
18
+
19
+
use std::collections::HashMap;
20
+
21
+
/// configuration for score fusion
22
+
#[derive(Debug, Clone)]
23
+
pub struct FusionConfig {
24
+
/// weight for semantic scores (0.0 = pure keyword, 1.0 = pure semantic)
25
+
pub alpha: f32,
26
+
/// minimum fused score to include in results (filters noise)
27
+
pub min_score: f32,
28
+
}
29
+
30
+
impl Default for FusionConfig {
31
+
fn default() -> Self {
32
+
Self {
33
+
alpha: 0.7,
34
+
min_score: 0.001,
35
+
}
36
+
}
37
+
}
38
+
39
+
impl FusionConfig {
40
+
pub fn new(alpha: f32) -> Self {
41
+
Self {
42
+
alpha,
43
+
..Default::default()
44
+
}
45
+
}
46
+
}
47
+
48
+
/// normalize cosine distance to similarity score
49
+
///
50
+
/// cosine distance ranges from 0 (identical) to 2 (opposite).
51
+
/// we convert to similarity: 1.0 (identical) to 0.0 (opposite).
52
+
#[inline]
53
+
pub fn cosine_distance_to_similarity(distance: f32) -> f32 {
54
+
1.0 - (distance / 2.0)
55
+
}
56
+
57
+
/// normalize BM25 scores using max-scaling
58
+
///
59
+
/// divides all scores by the maximum score, ensuring:
60
+
/// - top result gets score 1.0
61
+
/// - relative spacing is preserved
62
+
/// - handles edge cases (empty results, identical scores)
63
+
pub fn normalize_bm25_scores(scores: &[(String, f32)]) -> HashMap<String, f32> {
64
+
let max_score = scores
65
+
.iter()
66
+
.map(|(_, s)| *s)
67
+
.fold(f32::NEG_INFINITY, f32::max)
68
+
.max(0.001); // avoid division by zero
69
+
70
+
scores
71
+
.iter()
72
+
.map(|(id, score)| (id.clone(), (score / max_score).min(1.0)))
73
+
.collect()
74
+
}
75
+
76
+
/// fuse semantic and keyword scores using weighted combination
77
+
///
78
+
/// returns items sorted by fused score (descending), filtered by min_score.
79
+
pub fn fuse_scores(
80
+
semantic_scores: &HashMap<String, f32>,
81
+
keyword_scores: &HashMap<String, f32>,
82
+
config: &FusionConfig,
83
+
) -> Vec<(String, f32)> {
84
+
// collect all unique IDs
85
+
let all_ids: std::collections::HashSet<_> = semantic_scores
86
+
.keys()
87
+
.chain(keyword_scores.keys())
88
+
.collect();
89
+
90
+
let mut fused: Vec<(String, f32)> = all_ids
91
+
.into_iter()
92
+
.map(|id| {
93
+
let semantic = semantic_scores.get(id).copied().unwrap_or(0.0);
94
+
let keyword = keyword_scores.get(id).copied().unwrap_or(0.0);
95
+
let score = config.alpha * semantic + (1.0 - config.alpha) * keyword;
96
+
(id.clone(), score)
97
+
})
98
+
.filter(|(_, score)| *score > config.min_score)
99
+
.collect();
100
+
101
+
// sort descending by score
102
+
fused.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
103
+
104
+
fused
105
+
}
106
+
107
+
#[cfg(test)]
108
+
mod tests {
109
+
use super::*;
110
+
111
+
#[test]
112
+
fn test_cosine_distance_to_similarity() {
113
+
assert!((cosine_distance_to_similarity(0.0) - 1.0).abs() < 0.001);
114
+
assert!((cosine_distance_to_similarity(2.0) - 0.0).abs() < 0.001);
115
+
assert!((cosine_distance_to_similarity(1.0) - 0.5).abs() < 0.001);
116
+
}
117
+
118
+
#[test]
119
+
fn test_normalize_bm25_scores() {
120
+
let scores = vec![
121
+
("a".to_string(), 10.0),
122
+
("b".to_string(), 5.0),
123
+
("c".to_string(), 2.5),
124
+
];
125
+
126
+
let normalized = normalize_bm25_scores(&scores);
127
+
128
+
assert!((normalized["a"] - 1.0).abs() < 0.001);
129
+
assert!((normalized["b"] - 0.5).abs() < 0.001);
130
+
assert!((normalized["c"] - 0.25).abs() < 0.001);
131
+
}
132
+
133
+
#[test]
134
+
fn test_fuse_scores_pure_semantic() {
135
+
let mut semantic = HashMap::new();
136
+
semantic.insert("a".to_string(), 0.9);
137
+
semantic.insert("b".to_string(), 0.5);
138
+
139
+
let mut keyword = HashMap::new();
140
+
keyword.insert("a".to_string(), 0.1);
141
+
keyword.insert("c".to_string(), 1.0);
142
+
143
+
let config = FusionConfig::new(1.0); // pure semantic
144
+
let fused = fuse_scores(&semantic, &keyword, &config);
145
+
146
+
assert_eq!(fused[0].0, "a");
147
+
assert!((fused[0].1 - 0.9).abs() < 0.001);
148
+
}
149
+
150
+
#[test]
151
+
fn test_fuse_scores_balanced() {
152
+
let mut semantic = HashMap::new();
153
+
semantic.insert("a".to_string(), 0.8);
154
+
155
+
let mut keyword = HashMap::new();
156
+
keyword.insert("a".to_string(), 0.4);
157
+
158
+
let config = FusionConfig::new(0.5); // balanced
159
+
let fused = fuse_scores(&semantic, &keyword, &config);
160
+
161
+
// 0.5 * 0.8 + 0.5 * 0.4 = 0.6
162
+
assert!((fused[0].1 - 0.6).abs() < 0.001);
163
+
}
164
+
}
+353
-117
src/search.rs
+353
-117
src/search.rs
···
1
+
//! hybrid search combining semantic embeddings with keyword matching
2
+
//!
3
+
//! this implementation uses weighted fusion to balance semantic understanding with exact matches.
4
+
//!
5
+
//! ## search components
6
+
//!
7
+
//! ### 1. semantic search (vector/ANN)
8
+
//! - voyage AI multimodal-3 embeddings via early fusion:
9
+
//! - filename text (e.g., "bufo-jumping-on-bed" โ "bufo jumping on bed") + image content
10
+
//! - unified transformer encoder creates 1024-dim vectors
11
+
//! - cosine distance similarity against turbopuffer
12
+
//! - **strength**: finds semantically related bufos (e.g., "happy" โ excited, smiling bufos)
13
+
//! - **weakness**: may miss exact filename matches (e.g., "happy" might not surface "bufo-is-happy")
14
+
//!
15
+
//! ### 2. keyword search (BM25)
16
+
//! - full-text search on bufo `name` field (filename without extension)
17
+
//! - BM25 ranking: IDF-weighted term frequency with document length normalization
18
+
//! - **strength**: excellent for exact/partial matches (e.g., "jumping" โ "bufos-jumping-on-the-bed")
19
+
//! - **weakness**: no semantic understanding (e.g., "happy" won't find "excited" or "smiling")
20
+
//!
21
+
//! ### 3. weighted fusion
22
+
//! - formula: `score = ฮฑ * semantic + (1-ฮฑ) * keyword`
23
+
//! - both scores normalized to 0-1 range before fusion
24
+
//! - configurable `alpha` parameter (default 0.7):
25
+
//! - `ฮฑ=1.0`: pure semantic (best for conceptual queries like "apocalyptic", "in a giving mood")
26
+
//! - `ฮฑ=0.7`: default (70% semantic, 30% keyword - balances both strengths)
27
+
//! - `ฮฑ=0.5`: balanced (equal weight to semantic and keyword signals)
28
+
//! - `ฮฑ=0.0`: pure keyword (best for exact filename searches)
29
+
//!
30
+
//! ## references
31
+
//!
32
+
//! - voyage multimodal embeddings: https://docs.voyageai.com/docs/multimodal-embeddings
33
+
//! - turbopuffer BM25: https://turbopuffer.com/docs/fts
34
+
//! - weighted fusion: standard approach in modern hybrid search systems (2024)
35
+
1
36
use crate::config::Config;
2
-
use crate::embedding::EmbeddingClient;
3
-
use crate::turbopuffer::{QueryRequest, TurbopufferClient};
4
-
use actix_web::{web, HttpResponse, Result as ActixResult};
37
+
use crate::embedding::VoyageEmbedder;
38
+
use crate::filter::{ContentFilter, Filter, Filterable};
39
+
use crate::providers::{Embedder, VectorSearchError, VectorStore};
40
+
use crate::scoring::{cosine_distance_to_similarity, fuse_scores, normalize_bm25_scores, FusionConfig};
41
+
use crate::turbopuffer::TurbopufferStore;
42
+
use actix_web::{web, HttpRequest, HttpResponse, Result as ActixResult};
5
43
use serde::{Deserialize, Serialize};
6
-
use tracing::instrument;
44
+
use std::collections::hash_map::DefaultHasher;
45
+
use std::collections::HashMap;
46
+
use std::hash::{Hash, Hasher};
7
47
8
48
#[derive(Debug, Deserialize)]
9
49
pub struct SearchQuery {
10
50
pub query: String,
11
51
#[serde(default = "default_top_k")]
12
52
pub top_k: usize,
53
+
/// alpha parameter for weighted fusion (0.0 = pure keyword, 1.0 = pure semantic)
54
+
/// default 0.7 favors semantic search while still considering exact matches
55
+
#[serde(default = "default_alpha")]
56
+
pub alpha: f32,
57
+
/// family-friendly mode: filters out inappropriate content (default true)
58
+
#[serde(default = "default_family_friendly")]
59
+
pub family_friendly: bool,
60
+
/// comma-separated regex patterns to exclude from results (e.g., "excited,party")
61
+
#[serde(default)]
62
+
pub exclude: Option<String>,
63
+
/// comma-separated regex patterns to include (overrides exclude)
64
+
#[serde(default)]
65
+
pub include: Option<String>,
13
66
}
14
67
15
68
fn default_top_k() -> usize {
16
69
10
17
70
}
18
71
72
+
fn default_alpha() -> f32 {
73
+
0.7
74
+
}
75
+
76
+
fn default_family_friendly() -> bool {
77
+
true
78
+
}
79
+
19
80
#[derive(Debug, Serialize)]
20
81
pub struct SearchResponse {
21
82
pub results: Vec<BufoResult>,
22
83
}
23
84
24
-
#[derive(Debug, Serialize)]
85
+
#[derive(Debug, Serialize, Clone)]
25
86
pub struct BufoResult {
26
87
pub id: String,
27
88
pub url: String,
28
89
pub name: String,
29
-
pub score: f32, // normalized 0-1 score for display
90
+
pub score: f32,
91
+
}
92
+
93
+
impl Filterable for BufoResult {
94
+
fn name(&self) -> &str {
95
+
&self.name
96
+
}
97
+
}
98
+
99
+
/// errors that can occur during search
100
+
#[derive(Debug, thiserror::Error)]
101
+
pub enum SearchError {
102
+
#[error("embedding error: {0}")]
103
+
Embedding(#[from] crate::providers::EmbeddingError),
104
+
105
+
#[error("vector search error: {0}")]
106
+
VectorSearch(#[from] VectorSearchError),
107
+
}
108
+
109
+
impl SearchError {
110
+
fn into_actix_error(self) -> actix_web::Error {
111
+
match &self {
112
+
SearchError::VectorSearch(VectorSearchError::QueryTooLong { .. }) => {
113
+
actix_web::error::ErrorBadRequest(
114
+
"search query is too long (max 1024 characters for text search). try a shorter query."
115
+
)
116
+
}
117
+
_ => actix_web::error::ErrorInternalServerError(self.to_string()),
118
+
}
119
+
}
120
+
}
121
+
122
+
/// generate etag for caching based on query parameters
123
+
fn generate_etag(
124
+
query: &str,
125
+
top_k: usize,
126
+
alpha: f32,
127
+
family_friendly: bool,
128
+
exclude: &Option<String>,
129
+
include: &Option<String>,
130
+
) -> String {
131
+
let mut hasher = DefaultHasher::new();
132
+
query.hash(&mut hasher);
133
+
top_k.hash(&mut hasher);
134
+
alpha.to_bits().hash(&mut hasher);
135
+
family_friendly.hash(&mut hasher);
136
+
exclude.hash(&mut hasher);
137
+
include.hash(&mut hasher);
138
+
format!("\"{}\"", hasher.finish())
30
139
}
31
140
32
-
#[instrument(skip(config), fields(query = %query.query, top_k = query.top_k))]
33
-
pub async fn search(
34
-
query: web::Json<SearchQuery>,
35
-
config: web::Data<Config>,
36
-
) -> ActixResult<HttpResponse> {
37
-
let embedding_client = EmbeddingClient::new(config.voyage_api_key.clone());
38
-
let tpuf_client = TurbopufferClient::new(
39
-
config.turbopuffer_api_key.clone(),
40
-
config.turbopuffer_namespace.clone(),
141
+
/// execute hybrid search using the provided embedder and vector store
142
+
async fn execute_hybrid_search<E: Embedder, V: VectorStore>(
143
+
query: &str,
144
+
top_k: usize,
145
+
fusion_config: &FusionConfig,
146
+
embedder: &E,
147
+
vector_store: &V,
148
+
) -> Result<Vec<(String, f32, HashMap<String, String>)>, SearchError> {
149
+
// fetch extra results to ensure we have enough after filtering
150
+
let search_top_k = top_k * 5;
151
+
let query_owned = query.to_string();
152
+
153
+
// generate query embedding
154
+
let _embed_span = logfire::span!(
155
+
"embedding.generate",
156
+
query = &query_owned,
157
+
model = embedder.name()
158
+
)
159
+
.entered();
160
+
161
+
let query_embedding = embedder.embed(query).await?;
162
+
163
+
logfire::info!(
164
+
"embedding generated",
165
+
query = &query_owned,
166
+
embedding_dim = query_embedding.len() as i64
41
167
);
42
168
43
-
// run vector search
44
-
let query_embedding = {
45
-
let _span = logfire::span!("generate_embedding", query = &query.query);
46
-
embedding_client
47
-
.embed_text(&query.query)
48
-
.await
49
-
.map_err(|e| {
50
-
logfire::error!("failed to generate embedding", error = e.to_string());
51
-
actix_web::error::ErrorInternalServerError(format!(
52
-
"failed to generate embedding: {}",
53
-
e
54
-
))
55
-
})?
56
-
};
169
+
// run both searches in sequence (could parallelize with tokio::join! if needed)
170
+
let namespace = vector_store.name().to_string();
171
+
172
+
let vector_results = {
173
+
let _span = logfire::span!(
174
+
"turbopuffer.vector_search",
175
+
query = &query_owned,
176
+
top_k = search_top_k as i64,
177
+
namespace = &namespace
178
+
)
179
+
.entered();
57
180
58
-
let vector_request = QueryRequest {
59
-
rank_by: vec![
60
-
serde_json::json!("vector"),
61
-
serde_json::json!("ANN"),
62
-
serde_json::json!(query_embedding),
63
-
],
64
-
top_k: query.top_k * 2, // get more results for fusion
65
-
include_attributes: Some(vec!["url".to_string(), "name".to_string(), "filename".to_string()]),
181
+
vector_store
182
+
.search_by_vector(&query_embedding, search_top_k)
183
+
.await?
66
184
};
67
185
68
-
let vector_results = {
69
-
let _span = logfire::span!("vector_search", top_k = query.top_k * 2);
70
-
tpuf_client.query(vector_request).await.map_err(|e| {
71
-
logfire::error!("vector search failed", error = e.to_string());
72
-
actix_web::error::ErrorInternalServerError(format!(
73
-
"failed to query turbopuffer (vector): {}",
74
-
e
75
-
))
76
-
})?
77
-
};
186
+
logfire::info!(
187
+
"vector search completed",
188
+
query = &query_owned,
189
+
results_found = vector_results.len() as i64
190
+
);
78
191
79
-
// run BM25 text search
80
-
let bm25_top_k = query.top_k * 2;
81
192
let bm25_results = {
82
-
let _span = logfire::span!("bm25_search", query = &query.query, top_k = bm25_top_k as i64);
83
-
tpuf_client.bm25_query(&query.query, bm25_top_k).await.map_err(|e| {
84
-
logfire::error!("bm25 search failed", error = e.to_string());
85
-
actix_web::error::ErrorInternalServerError(format!(
86
-
"failed to query turbopuffer (BM25): {}",
87
-
e
88
-
))
89
-
})?
193
+
let _span = logfire::span!(
194
+
"turbopuffer.bm25_search",
195
+
query = &query_owned,
196
+
top_k = search_top_k as i64,
197
+
namespace = &namespace
198
+
)
199
+
.entered();
200
+
201
+
vector_store.search_by_keyword(query, search_top_k).await?
90
202
};
91
203
92
-
// combine results using Reciprocal Rank Fusion (RRF)
93
-
let _span = logfire::span!("reciprocal_rank_fusion",
94
-
vector_results = vector_results.len(),
95
-
bm25_results = bm25_results.len()
204
+
// normalize scores
205
+
let semantic_scores: HashMap<String, f32> = vector_results
206
+
.iter()
207
+
.map(|r| (r.id.clone(), cosine_distance_to_similarity(r.score)))
208
+
.collect();
209
+
210
+
let bm25_raw: Vec<(String, f32)> = bm25_results
211
+
.iter()
212
+
.map(|r| (r.id.clone(), r.score))
213
+
.collect();
214
+
let keyword_scores = normalize_bm25_scores(&bm25_raw);
215
+
216
+
let max_bm25 = bm25_raw
217
+
.iter()
218
+
.map(|(_, s)| *s)
219
+
.fold(f32::NEG_INFINITY, f32::max);
220
+
221
+
logfire::info!(
222
+
"bm25 search completed",
223
+
query = &query_owned,
224
+
results_found = bm25_results.len() as i64,
225
+
max_bm25 = max_bm25 as f64,
226
+
top_bm25_raw = bm25_raw.first().map(|(_, s)| *s).unwrap_or(0.0) as f64
96
227
);
97
228
98
-
use std::collections::HashMap;
99
-
let mut rrf_scores: HashMap<String, f32> = HashMap::new();
100
-
let k = 60.0; // RRF constant
229
+
// fuse scores
230
+
let fused = fuse_scores(&semantic_scores, &keyword_scores, fusion_config);
101
231
102
-
// Add vector search rankings
103
-
for (rank, row) in vector_results.iter().enumerate() {
104
-
let score = 1.0 / (k + (rank as f32) + 1.0);
105
-
*rrf_scores.entry(row.id.clone()).or_insert(0.0) += score;
106
-
}
232
+
logfire::info!(
233
+
"weighted fusion completed",
234
+
total_candidates = (vector_results.len() + bm25_results.len()) as i64,
235
+
alpha = fusion_config.alpha as f64,
236
+
pre_filter_results = fused.len() as i64
237
+
);
107
238
108
-
// Add BM25 search rankings
109
-
for (rank, row) in bm25_results.iter().enumerate() {
110
-
let score = 1.0 / (k + (rank as f32) + 1.0);
111
-
*rrf_scores.entry(row.id.clone()).or_insert(0.0) += score;
239
+
// collect attributes from both result sets
240
+
let mut all_attributes: HashMap<String, HashMap<String, String>> = HashMap::new();
241
+
for result in vector_results.into_iter().chain(bm25_results.into_iter()) {
242
+
all_attributes
243
+
.entry(result.id.clone())
244
+
.or_insert(result.attributes);
112
245
}
113
246
114
-
// Collect all unique results
115
-
let mut all_results: HashMap<String, crate::turbopuffer::QueryRow> = HashMap::new();
116
-
for row in vector_results.into_iter().chain(bm25_results.into_iter()) {
117
-
all_results.entry(row.id.clone()).or_insert(row);
118
-
}
247
+
// return fused results with attributes
248
+
Ok(fused
249
+
.into_iter()
250
+
.map(|(id, score)| {
251
+
let attrs = all_attributes.remove(&id).unwrap_or_default();
252
+
(id, score, attrs)
253
+
})
254
+
.collect())
255
+
}
119
256
120
-
// Sort by RRF score and take top_k
121
-
let mut scored_results: Vec<(String, f32)> = rrf_scores.into_iter().collect();
122
-
scored_results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
123
-
scored_results.truncate(query.top_k);
257
+
/// shared search implementation used by both POST and GET handlers
258
+
async fn perform_search(
259
+
query_text: String,
260
+
top_k_val: usize,
261
+
alpha: f32,
262
+
family_friendly: bool,
263
+
exclude: Option<String>,
264
+
include: Option<String>,
265
+
config: &Config,
266
+
) -> ActixResult<SearchResponse> {
267
+
let content_filter = ContentFilter::new(
268
+
family_friendly,
269
+
exclude.as_deref(),
270
+
include.as_deref(),
271
+
);
272
+
273
+
let _search_span = logfire::span!(
274
+
"bufo_search",
275
+
query = &query_text,
276
+
top_k = top_k_val as i64,
277
+
alpha = alpha as f64,
278
+
family_friendly = family_friendly,
279
+
exclude_patterns_count = content_filter.exclude_pattern_count() as i64
280
+
)
281
+
.entered();
282
+
283
+
logfire::info!(
284
+
"search request received",
285
+
query = &query_text,
286
+
top_k = top_k_val as i64,
287
+
alpha = alpha as f64,
288
+
exclude_patterns = &content_filter.exclude_patterns_str()
289
+
);
124
290
125
-
// Scale RRF scores to 0-1 range
126
-
// RRF scores typically range from ~0.016 (single match, low rank) to ~0.033 (dual match, high rank)
127
-
// Scale by 25x to map good matches near 1.0, poor matches stay low
128
-
let results: Vec<BufoResult> = scored_results
129
-
.into_iter()
130
-
.filter_map(|(id, rrf_score)| {
131
-
all_results.get(&id).map(|row| {
132
-
let url = row
133
-
.attributes
134
-
.get("url")
135
-
.and_then(|v| v.as_str())
136
-
.unwrap_or("")
137
-
.to_string();
291
+
// create clients
292
+
let embedder = VoyageEmbedder::new(config.voyage_api_key.clone());
293
+
let vector_store = TurbopufferStore::new(
294
+
config.turbopuffer_api_key.clone(),
295
+
config.turbopuffer_namespace.clone(),
296
+
);
138
297
139
-
let name = row
140
-
.attributes
141
-
.get("name")
142
-
.and_then(|v| v.as_str())
143
-
.unwrap_or(&row.id)
144
-
.to_string();
298
+
let fusion_config = FusionConfig::new(alpha);
145
299
146
-
// Scale and clamp RRF score to 0-1 range
147
-
// Good matches (appearing high in both searches) will approach 1.0
148
-
// Weak matches will naturally be lower
149
-
let scaled_score = (rrf_score * 25.0).min(1.0);
300
+
// execute hybrid search
301
+
let fused_results = execute_hybrid_search(
302
+
&query_text,
303
+
top_k_val,
304
+
&fusion_config,
305
+
&embedder,
306
+
&vector_store,
307
+
)
308
+
.await
309
+
.map_err(|e| e.into_actix_error())?;
150
310
151
-
BufoResult {
152
-
id: row.id.clone(),
153
-
url,
154
-
name,
155
-
score: scaled_score,
156
-
}
157
-
})
311
+
// convert to BufoResults and apply filtering
312
+
let results: Vec<BufoResult> = fused_results
313
+
.into_iter()
314
+
.map(|(id, score, attrs)| BufoResult {
315
+
id: id.clone(),
316
+
url: attrs.get("url").cloned().unwrap_or_default(),
317
+
name: attrs.get("name").cloned().unwrap_or_else(|| id.clone()),
318
+
score,
158
319
})
320
+
.filter(|result| content_filter.matches(result))
321
+
.take(top_k_val)
159
322
.collect();
160
323
161
-
logfire::info!("search completed",
162
-
query = &query.query,
163
-
results_count = results.len() as i64,
164
-
top_score = results.first().map(|r| r.score as f64).unwrap_or(0.0)
324
+
let results_count = results.len() as i64;
325
+
let top_result_name = results
326
+
.first()
327
+
.map(|r| r.name.clone())
328
+
.unwrap_or_else(|| "none".to_string());
329
+
let top_score_val = results.first().map(|r| r.score as f64).unwrap_or(0.0);
330
+
let avg_score_val = if !results.is_empty() {
331
+
results.iter().map(|r| r.score as f64).sum::<f64>() / results.len() as f64
332
+
} else {
333
+
0.0
334
+
};
335
+
336
+
logfire::info!(
337
+
"search completed successfully",
338
+
query = &query_text,
339
+
results_count = results_count,
340
+
top_result = &top_result_name,
341
+
top_score = top_score_val,
342
+
avg_score = avg_score_val
343
+
);
344
+
345
+
Ok(SearchResponse { results })
346
+
}
347
+
348
+
/// POST /api/search handler (existing API)
349
+
pub async fn search(
350
+
query: web::Json<SearchQuery>,
351
+
config: web::Data<Config>,
352
+
) -> ActixResult<HttpResponse> {
353
+
let response = perform_search(
354
+
query.query.clone(),
355
+
query.top_k,
356
+
query.alpha,
357
+
query.family_friendly,
358
+
query.exclude.clone(),
359
+
query.include.clone(),
360
+
&config,
361
+
)
362
+
.await?;
363
+
Ok(HttpResponse::Ok().json(response))
364
+
}
365
+
366
+
/// GET /api/search handler for shareable URLs
367
+
pub async fn search_get(
368
+
query: web::Query<SearchQuery>,
369
+
config: web::Data<Config>,
370
+
req: HttpRequest,
371
+
) -> ActixResult<HttpResponse> {
372
+
let etag = generate_etag(
373
+
&query.query,
374
+
query.top_k,
375
+
query.alpha,
376
+
query.family_friendly,
377
+
&query.exclude,
378
+
&query.include,
165
379
);
166
380
167
-
Ok(HttpResponse::Ok().json(SearchResponse { results }))
381
+
if let Some(if_none_match) = req.headers().get("if-none-match") {
382
+
if if_none_match.to_str().unwrap_or("") == etag {
383
+
return Ok(HttpResponse::NotModified()
384
+
.insert_header(("etag", etag))
385
+
.finish());
386
+
}
387
+
}
388
+
389
+
let response = perform_search(
390
+
query.query.clone(),
391
+
query.top_k,
392
+
query.alpha,
393
+
query.family_friendly,
394
+
query.exclude.clone(),
395
+
query.include.clone(),
396
+
&config,
397
+
)
398
+
.await?;
399
+
400
+
Ok(HttpResponse::Ok()
401
+
.insert_header(("etag", etag.clone()))
402
+
.insert_header(("cache-control", "public, max-age=300"))
403
+
.json(response))
168
404
}
+105
-51
src/turbopuffer.rs
+105
-51
src/turbopuffer.rs
···
1
-
use anyhow::{Context, Result};
1
+
//! turbopuffer vector database implementation
2
+
//!
3
+
//! implements the `VectorStore` trait for turbopuffer's hybrid search API.
4
+
5
+
use crate::providers::{SearchResult, VectorSearchError, VectorStore};
2
6
use reqwest::Client;
3
7
use serde::{Deserialize, Serialize};
4
8
5
-
#[derive(Debug, Serialize)]
6
-
pub struct QueryRequest {
7
-
pub rank_by: Vec<serde_json::Value>,
8
-
pub top_k: usize,
9
-
#[serde(skip_serializing_if = "Option::is_none")]
10
-
pub include_attributes: Option<Vec<String>>,
11
-
}
12
-
13
-
pub type QueryResponse = Vec<QueryRow>;
9
+
const TURBOPUFFER_API_BASE: &str = "https://api.turbopuffer.com/v1/vectors";
14
10
11
+
/// raw response row from turbopuffer API
15
12
#[derive(Debug, Deserialize, Serialize, Clone)]
16
13
pub struct QueryRow {
17
14
pub id: String,
18
-
pub dist: Option<f32>,
15
+
pub dist: f32,
19
16
pub attributes: serde_json::Map<String, serde_json::Value>,
20
17
}
21
18
22
-
pub struct TurbopufferClient {
19
+
impl From<QueryRow> for SearchResult {
20
+
fn from(row: QueryRow) -> Self {
21
+
let attributes = row
22
+
.attributes
23
+
.iter()
24
+
.filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string())))
25
+
.collect();
26
+
27
+
SearchResult {
28
+
id: row.id,
29
+
score: row.dist,
30
+
attributes,
31
+
}
32
+
}
33
+
}
34
+
35
+
#[derive(Debug, Deserialize)]
36
+
struct ErrorResponse {
37
+
error: String,
38
+
#[allow(dead_code)]
39
+
status: String,
40
+
}
41
+
42
+
/// turbopuffer vector database client
43
+
///
44
+
/// supports both ANN vector search and BM25 full-text search.
45
+
#[derive(Clone)]
46
+
pub struct TurbopufferStore {
23
47
client: Client,
24
48
api_key: String,
25
49
namespace: String,
26
50
}
27
51
28
-
impl TurbopufferClient {
52
+
impl TurbopufferStore {
29
53
pub fn new(api_key: String, namespace: String) -> Self {
30
54
Self {
31
55
client: Client::new(),
···
34
58
}
35
59
}
36
60
37
-
pub async fn query(&self, request: QueryRequest) -> Result<QueryResponse> {
38
-
let url = format!(
39
-
"https://api.turbopuffer.com/v1/vectors/{}/query",
40
-
self.namespace
41
-
);
61
+
fn query_url(&self) -> String {
62
+
format!("{}/{}/query", TURBOPUFFER_API_BASE, self.namespace)
63
+
}
42
64
43
-
let request_json = serde_json::to_string_pretty(&request)?;
44
-
log::debug!("turbopuffer query request: {}", request_json);
45
-
65
+
async fn execute_query(
66
+
&self,
67
+
request: serde_json::Value,
68
+
) -> Result<Vec<QueryRow>, VectorSearchError> {
46
69
let response = self
47
70
.client
48
-
.post(&url)
71
+
.post(self.query_url())
49
72
.header("Authorization", format!("Bearer {}", self.api_key))
50
73
.json(&request)
51
74
.send()
52
-
.await
53
-
.context("failed to send query request")?;
75
+
.await?;
54
76
55
77
if !response.status().is_success() {
56
-
let status = response.status();
78
+
let status = response.status().as_u16();
57
79
let body = response.text().await.unwrap_or_default();
58
-
anyhow::bail!("turbopuffer query failed with status {}: {}", status, body);
80
+
81
+
// check for specific error types
82
+
if let Ok(error_resp) = serde_json::from_str::<ErrorResponse>(&body) {
83
+
if error_resp.error.contains("too long") && error_resp.error.contains("max 1024") {
84
+
return Err(VectorSearchError::QueryTooLong {
85
+
message: error_resp.error,
86
+
});
87
+
}
88
+
}
89
+
90
+
return Err(VectorSearchError::Api { status, body });
59
91
}
60
92
61
-
let body = response.text().await.context("failed to read response body")?;
62
-
log::debug!("turbopuffer response: {}", body);
93
+
let body = response.text().await.map_err(|e| {
94
+
VectorSearchError::Other(anyhow::anyhow!("failed to read response: {}", e))
95
+
})?;
63
96
64
97
serde_json::from_str(&body)
65
-
.context(format!("failed to parse query response: {}", body))
98
+
.map_err(|e| VectorSearchError::Parse(format!("failed to parse response: {}", e)))
66
99
}
100
+
}
67
101
68
-
pub async fn bm25_query(&self, query_text: &str, top_k: usize) -> Result<QueryResponse> {
69
-
let url = format!(
70
-
"https://api.turbopuffer.com/v1/vectors/{}/query",
71
-
self.namespace
102
+
impl VectorStore for TurbopufferStore {
103
+
async fn search_by_vector(
104
+
&self,
105
+
embedding: &[f32],
106
+
top_k: usize,
107
+
) -> Result<Vec<SearchResult>, VectorSearchError> {
108
+
let request = serde_json::json!({
109
+
"rank_by": ["vector", "ANN", embedding],
110
+
"top_k": top_k,
111
+
"include_attributes": ["url", "name", "filename"],
112
+
});
113
+
114
+
log::debug!(
115
+
"turbopuffer vector query: {}",
116
+
serde_json::to_string_pretty(&request).unwrap_or_default()
72
117
);
73
118
119
+
let rows = self.execute_query(request).await?;
120
+
Ok(rows.into_iter().map(SearchResult::from).collect())
121
+
}
122
+
123
+
async fn search_by_keyword(
124
+
&self,
125
+
query: &str,
126
+
top_k: usize,
127
+
) -> Result<Vec<SearchResult>, VectorSearchError> {
74
128
let request = serde_json::json!({
75
-
"rank_by": ["name", "BM25", query_text],
129
+
"rank_by": ["name", "BM25", query],
76
130
"top_k": top_k,
77
131
"include_attributes": ["url", "name", "filename"],
78
132
});
79
133
80
-
log::debug!("turbopuffer BM25 query request: {}", serde_json::to_string_pretty(&request)?);
134
+
log::debug!(
135
+
"turbopuffer BM25 query: {}",
136
+
serde_json::to_string_pretty(&request).unwrap_or_default()
137
+
);
81
138
82
-
let response = self
83
-
.client
84
-
.post(&url)
85
-
.header("Authorization", format!("Bearer {}", self.api_key))
86
-
.json(&request)
87
-
.send()
88
-
.await
89
-
.context("failed to send BM25 query request")?;
139
+
let rows = self.execute_query(request).await?;
90
140
91
-
if !response.status().is_success() {
92
-
let status = response.status();
93
-
let body = response.text().await.unwrap_or_default();
94
-
anyhow::bail!("turbopuffer BM25 query failed with status {}: {}", status, body);
141
+
if let Some(first) = rows.first() {
142
+
log::info!(
143
+
"BM25 first result - id: {}, dist: {}, name: {:?}",
144
+
first.id,
145
+
first.dist,
146
+
first.attributes.get("name")
147
+
);
95
148
}
96
149
97
-
let body = response.text().await.context("failed to read response body")?;
98
-
log::debug!("turbopuffer BM25 response: {}", body);
150
+
Ok(rows.into_iter().map(SearchResult::from).collect())
151
+
}
99
152
100
-
serde_json::from_str(&body)
101
-
.context(format!("failed to parse BM25 query response: {}", body))
153
+
fn name(&self) -> &'static str {
154
+
"turbopuffer"
102
155
}
103
156
}
157
+
static/bufo-is-trapped-in-a-cameron-winter-phase.png
static/bufo-is-trapped-in-a-cameron-winter-phase.png
This is a binary file and will not be displayed.
+43
static/bufo-peek.js
+43
static/bufo-peek.js
···
1
+
// bufo peeking animation logic
2
+
// this file handles the silly but delightful bufo that peeks from random edges
3
+
4
+
(function() {
5
+
const peekingBufo = document.getElementById('peekingBufo');
6
+
let hasSearched = false;
7
+
8
+
// animation cycle duration (must match CSS animation duration)
9
+
const PEEK_CYCLE_MS = 6000;
10
+
11
+
// function to set random bufo position
12
+
function setRandomBufoPosition() {
13
+
const positions = ['top', 'right', 'bottom', 'left'];
14
+
15
+
// remove all position classes
16
+
peekingBufo.classList.remove(
17
+
'peeking-bufo-top',
18
+
'peeking-bufo-right',
19
+
'peeking-bufo-bottom',
20
+
'peeking-bufo-left'
21
+
);
22
+
23
+
// set new random position
24
+
const position = positions[Math.floor(Math.random() * positions.length)];
25
+
peekingBufo.classList.add(`peeking-bufo-${position}`);
26
+
}
27
+
28
+
// set initial position
29
+
setRandomBufoPosition();
30
+
31
+
// move to new position after each peek cycle
32
+
setInterval(() => {
33
+
if (!hasSearched) {
34
+
setRandomBufoPosition();
35
+
}
36
+
}, PEEK_CYCLE_MS);
37
+
38
+
// hide bufo after first search
39
+
window.addEventListener('bufo-hide', () => {
40
+
hasSearched = true;
41
+
peekingBufo.classList.add('hidden');
42
+
});
43
+
})();
static/favicon.gif
static/favicon.gif
This is a binary file and will not be displayed.
static/favicon.png
static/favicon.png
This is a binary file and will not be displayed.
+571
-33
static/index.html
+571
-33
static/index.html
···
5
5
<meta name="viewport" content="width=device-width, initial-scale=1.0">
6
6
<title>find bufo</title>
7
7
<link rel="icon" type="image/png" href="/static/favicon.png">
8
+
<link rel="apple-touch-icon" href="/static/favicon.png">
9
+
<link rel="manifest" href="/static/manifest.json">
10
+
<meta name="theme-color" content="#8ba888">
8
11
<style>
9
12
* {
10
13
margin: 0;
···
13
16
}
14
17
15
18
body {
16
-
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
17
-
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
19
+
font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Code', 'Fira Mono', 'Roboto Mono', monospace;
20
+
background: #8ba888;
18
21
min-height: 100vh;
19
22
display: flex;
20
23
justify-content: center;
21
24
align-items: center;
22
25
padding: 20px;
26
+
position: relative;
27
+
overflow-x: hidden;
23
28
}
24
29
25
30
.container {
···
41
46
text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
42
47
}
43
48
49
+
h1 a {
50
+
color: inherit;
51
+
text-decoration: none;
52
+
cursor: pointer;
53
+
transition: opacity 0.2s;
54
+
}
55
+
56
+
h1 a:hover {
57
+
opacity: 0.8;
58
+
}
59
+
44
60
.subtitle {
45
61
color: rgba(255, 255, 255, 0.9);
46
62
font-size: 1.1em;
47
63
margin-bottom: 15px;
48
64
}
49
65
50
-
.info {
51
-
color: rgba(255, 255, 255, 0.85);
52
-
font-size: 0.9em;
53
-
text-align: center;
54
-
margin-top: 10px;
66
+
.subtitle a {
67
+
color: inherit;
68
+
text-decoration: underline;
69
+
text-decoration-color: rgba(255, 255, 255, 0.4);
70
+
transition: text-decoration-color 0.2s;
55
71
}
56
72
57
-
.info a {
58
-
color: rgba(255, 255, 255, 0.95);
59
-
text-decoration: none;
60
-
border-bottom: 1px solid rgba(255, 255, 255, 0.4);
61
-
transition: border-color 0.2s;
62
-
}
63
-
64
-
.info a:hover {
65
-
border-bottom-color: rgba(255, 255, 255, 0.9);
73
+
.subtitle a:hover {
74
+
text-decoration-color: rgba(255, 255, 255, 0.9);
66
75
}
67
76
68
77
.search-box {
···
70
79
border-radius: 12px;
71
80
padding: 20px;
72
81
box-shadow: 0 10px 40px rgba(0,0,0,0.2);
82
+
margin-bottom: 20px;
83
+
}
84
+
85
+
.search-options {
86
+
margin-top: 15px;
87
+
padding-top: 15px;
88
+
border-top: 1px solid #e0e0e0;
89
+
}
90
+
91
+
.search-options.collapsed {
92
+
display: none;
93
+
}
94
+
95
+
.options-toggle {
96
+
background: rgba(102, 126, 234, 0.1);
97
+
border: 1.5px solid rgba(102, 126, 234, 0.3);
98
+
border-radius: 6px;
99
+
color: #667eea;
100
+
font-size: 0.85em;
101
+
font-family: inherit;
102
+
font-weight: 600;
103
+
cursor: pointer;
104
+
padding: 8px 14px;
105
+
margin-top: 10px;
106
+
transition: all 0.2s;
107
+
display: inline-flex;
108
+
align-items: center;
109
+
gap: 6px;
110
+
}
111
+
112
+
.options-toggle:hover {
113
+
background: rgba(102, 126, 234, 0.15);
114
+
border-color: rgba(102, 126, 234, 0.5);
115
+
transform: translateY(-1px);
116
+
}
117
+
118
+
.options-toggle svg {
119
+
width: 14px;
120
+
height: 14px;
121
+
}
122
+
123
+
.option-group {
124
+
margin-bottom: 15px;
125
+
}
126
+
127
+
.option-label {
128
+
display: flex;
129
+
justify-content: space-between;
130
+
align-items: center;
131
+
margin-bottom: 8px;
132
+
font-size: 0.9em;
133
+
color: #555;
134
+
}
135
+
136
+
.option-name {
137
+
font-weight: 600;
138
+
}
139
+
140
+
.option-name a {
141
+
text-decoration: none;
142
+
color: inherit;
143
+
border-bottom: 1px dotted #999;
144
+
transition: border-color 0.2s, color 0.2s;
145
+
}
146
+
147
+
.option-name a:hover {
148
+
color: #667eea;
149
+
border-bottom-color: #667eea;
150
+
}
151
+
152
+
.option-value {
153
+
color: #667eea;
154
+
font-weight: 700;
155
+
font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Code', monospace;
156
+
}
157
+
158
+
.option-description {
159
+
font-size: 0.8em;
160
+
color: #888;
161
+
margin-bottom: 8px;
162
+
line-height: 1.4;
163
+
}
164
+
165
+
input[type="range"] {
166
+
width: 100%;
167
+
height: 6px;
168
+
border-radius: 3px;
169
+
background: #e0e0e0;
170
+
outline: none;
171
+
-webkit-appearance: none;
172
+
}
173
+
174
+
input[type="range"]::-webkit-slider-thumb {
175
+
-webkit-appearance: none;
176
+
appearance: none;
177
+
width: 18px;
178
+
height: 18px;
179
+
border-radius: 50%;
180
+
background: #667eea;
181
+
cursor: pointer;
182
+
}
183
+
184
+
input[type="range"]::-moz-range-thumb {
185
+
width: 18px;
186
+
height: 18px;
187
+
border-radius: 50%;
188
+
background: #667eea;
189
+
cursor: pointer;
190
+
border: none;
191
+
}
192
+
193
+
.alpha-markers {
194
+
display: flex;
195
+
justify-content: space-between;
196
+
font-size: 0.75em;
197
+
color: #aaa;
198
+
margin-top: 4px;
199
+
}
200
+
201
+
.checkbox-wrapper {
202
+
display: flex;
203
+
align-items: center;
204
+
gap: 10px;
205
+
cursor: pointer;
206
+
user-select: none;
207
+
}
208
+
209
+
input[type="checkbox"] {
210
+
width: 18px;
211
+
height: 18px;
212
+
cursor: pointer;
213
+
accent-color: #667eea;
214
+
}
215
+
216
+
.option-group a {
217
+
color: #667eea;
218
+
text-decoration: none;
219
+
}
220
+
221
+
.option-group a:hover {
222
+
text-decoration: underline;
223
+
}
224
+
225
+
.sample-queries-container {
226
+
text-align: center;
73
227
margin-bottom: 30px;
74
228
}
75
229
230
+
.sample-queries-container.hidden {
231
+
display: none;
232
+
}
233
+
234
+
.sample-queries-label {
235
+
color: rgba(255, 255, 255, 0.7);
236
+
font-size: 0.85em;
237
+
margin-bottom: 10px;
238
+
font-weight: 500;
239
+
}
240
+
241
+
.sample-queries {
242
+
display: flex;
243
+
gap: 8px;
244
+
justify-content: center;
245
+
flex-wrap: wrap;
246
+
}
247
+
248
+
.sample-query-btn {
249
+
padding: 8px 16px;
250
+
background: rgba(255, 255, 255, 0.15);
251
+
backdrop-filter: blur(10px);
252
+
border: 1.5px solid rgba(255, 255, 255, 0.3);
253
+
border-radius: 6px;
254
+
font-size: 0.85em;
255
+
font-family: inherit;
256
+
font-weight: 600;
257
+
cursor: pointer;
258
+
transition: all 0.2s;
259
+
color: white;
260
+
text-shadow: 0 1px 2px rgba(0,0,0,0.2);
261
+
}
262
+
263
+
.sample-query-btn.happy {
264
+
background: rgba(255, 220, 100, 0.25);
265
+
border-color: rgba(255, 220, 100, 0.4);
266
+
}
267
+
268
+
.sample-query-btn.happy:hover {
269
+
background: rgba(255, 220, 100, 0.4);
270
+
border-color: rgba(255, 220, 100, 0.6);
271
+
transform: translateY(-2px);
272
+
}
273
+
274
+
.sample-query-btn.apocalyptic {
275
+
background: rgba(255, 80, 80, 0.25);
276
+
border-color: rgba(255, 80, 80, 0.4);
277
+
}
278
+
279
+
.sample-query-btn.apocalyptic:hover {
280
+
background: rgba(255, 80, 80, 0.4);
281
+
border-color: rgba(255, 80, 80, 0.6);
282
+
transform: translateY(-2px);
283
+
}
284
+
285
+
.sample-query-btn.giving {
286
+
background: rgba(100, 220, 150, 0.25);
287
+
border-color: rgba(100, 220, 150, 0.4);
288
+
}
289
+
290
+
.sample-query-btn.giving:hover {
291
+
background: rgba(100, 220, 150, 0.4);
292
+
border-color: rgba(100, 220, 150, 0.6);
293
+
transform: translateY(-2px);
294
+
}
295
+
296
+
@media (max-width: 600px) {
297
+
.sample-query-btn {
298
+
font-size: 0.8em;
299
+
padding: 6px 12px;
300
+
}
301
+
302
+
.sample-queries {
303
+
gap: 6px;
304
+
}
305
+
}
306
+
76
307
.search-input-wrapper {
77
308
display: flex;
78
309
gap: 10px;
···
86
317
border: 2px solid #e0e0e0;
87
318
border-radius: 8px;
88
319
font-size: 16px;
320
+
font-family: inherit;
89
321
transition: border-color 0.3s;
90
322
}
91
323
···
101
333
border: none;
102
334
border-radius: 8px;
103
335
font-size: 16px;
336
+
font-family: inherit;
104
337
font-weight: 600;
105
338
cursor: pointer;
106
339
transition: background 0.3s;
···
199
432
}
200
433
201
434
.no-results {
435
+
grid-column: 1 / -1;
436
+
display: flex;
437
+
flex-direction: column;
438
+
align-items: center;
439
+
justify-content: center;
202
440
text-align: center;
441
+
padding: 40px 20px;
442
+
margin: 0 auto;
443
+
}
444
+
445
+
.no-results-text {
446
+
font-size: 3em;
447
+
margin-bottom: 50px;
448
+
font-weight: 700;
203
449
color: white;
204
-
font-size: 1.2em;
205
-
padding: 40px;
450
+
text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
451
+
}
452
+
453
+
.no-results-bufo {
454
+
max-width: 600px;
455
+
width: 100%;
456
+
height: auto;
457
+
display: block;
458
+
margin: 0 auto;
459
+
}
460
+
461
+
@media (max-width: 600px) {
462
+
.no-results-text {
463
+
font-size: 2em;
464
+
}
465
+
466
+
.no-results-bufo {
467
+
max-width: 300px;
468
+
}
469
+
}
470
+
471
+
@keyframes peek-in-out {
472
+
0% {
473
+
transform: var(--peek-start);
474
+
}
475
+
20% {
476
+
transform: var(--peek-in);
477
+
}
478
+
80% {
479
+
transform: var(--peek-in);
480
+
}
481
+
100% {
482
+
transform: var(--peek-start);
483
+
}
484
+
}
485
+
486
+
.peeking-bufo {
487
+
position: fixed;
488
+
pointer-events: none;
489
+
z-index: 1000;
490
+
width: 200px;
491
+
height: auto;
492
+
animation: peek-in-out 6s ease-in-out infinite;
493
+
}
494
+
495
+
.peeking-bufo.hidden {
496
+
opacity: 0;
497
+
pointer-events: none;
498
+
animation: none;
499
+
}
500
+
501
+
.peeking-bufo-right {
502
+
right: -200px;
503
+
top: 50%;
504
+
--peek-start: translateY(-50%);
505
+
--peek-in: translateX(-200px) translateY(-50%);
506
+
}
507
+
508
+
.peeking-bufo-bottom {
509
+
bottom: -200px;
510
+
left: 50%;
511
+
--peek-start: translateX(-50%) rotate(90deg);
512
+
--peek-in: translateX(-50%) translateY(-200px) rotate(90deg);
513
+
}
514
+
515
+
.peeking-bufo-left {
516
+
left: -200px;
517
+
top: 50%;
518
+
--peek-start: translateY(-50%) scaleX(-1);
519
+
--peek-in: translateX(200px) translateY(-50%) scaleX(-1);
520
+
}
521
+
522
+
.peeking-bufo-top {
523
+
top: -200px;
524
+
left: 50%;
525
+
--peek-start: translateX(-50%) rotate(-90deg);
526
+
--peek-in: translateX(-50%) translateY(200px) rotate(-90deg);
527
+
}
528
+
529
+
@media (max-width: 1024px) {
530
+
.peeking-bufo {
531
+
width: 150px;
532
+
}
533
+
}
534
+
535
+
@media (max-width: 768px) {
536
+
.peeking-bufo {
537
+
width: 100px;
538
+
}
206
539
}
207
540
</style>
208
541
</head>
209
542
<body>
210
543
<div class="container">
211
544
<div class="header">
212
-
<h1>find bufo</h1>
213
-
<p class="subtitle">semantic search for <a href="https://bufo.zone" target="_blank" style="color: inherit; text-decoration: underline;">bufo.zone</a></p>
214
-
<p class="info">
215
-
<a href="https://turbopuffer.com/docs/hybrid-search" target="_blank">multimodal hybrid search</a> ยท
216
-
<a href="https://git.tangled.sh/zzstoatzz.io/find-bufo" target="_blank">source</a>
217
-
</p>
545
+
<h1><a href="/">find bufo</a></h1>
546
+
<p class="subtitle"><a href="https://tangled.org/@zzstoatzz.io/find-bufo/blob/main/src/search.rs#L1-L41" target="_blank">hybrid search</a> for <a href="https://bufo.zone" target="_blank">bufo.zone</a></p>
218
547
</div>
219
548
220
549
<div class="search-box">
···
224
553
id="searchInput"
225
554
placeholder="describe the bufo you seek..."
226
555
autocomplete="off"
556
+
autofocus
227
557
>
228
558
<button id="searchButton">search</button>
229
559
</div>
560
+
561
+
<button class="options-toggle" id="optionsToggle">
562
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
563
+
<line x1="4" y1="6" x2="20" y2="6"></line>
564
+
<line x1="4" y1="12" x2="20" y2="12"></line>
565
+
<line x1="4" y1="18" x2="20" y2="18"></line>
566
+
<circle cx="7" cy="6" r="2" fill="currentColor"></circle>
567
+
<circle cx="14" cy="12" r="2" fill="currentColor"></circle>
568
+
<circle cx="17" cy="18" r="2" fill="currentColor"></circle>
569
+
</svg>
570
+
<span id="optionsToggleText">search filters</span>
571
+
</button>
572
+
573
+
<div class="search-options collapsed" id="searchOptions">
574
+
<div class="option-group">
575
+
<div class="option-label">
576
+
<span class="option-name">search mode (<a href="https://tangled.org/@zzstoatzz.io/find-bufo/blob/main/src/search.rs#L22" target="_blank">ฮฑ</a>)</span>
577
+
<span class="option-value" id="alphaValue">0.70</span>
578
+
</div>
579
+
<div class="option-description">
580
+
balance between semantic understanding and exact keyword matching
581
+
</div>
582
+
<input
583
+
type="range"
584
+
id="alphaSlider"
585
+
min="0"
586
+
max="1"
587
+
step="0.01"
588
+
value="0.7"
589
+
>
590
+
<div class="alpha-markers">
591
+
<span>keyword</span>
592
+
<span>balanced</span>
593
+
<span>semantic</span>
594
+
</div>
595
+
</div>
596
+
597
+
<div class="option-group">
598
+
<div class="option-label">
599
+
<span class="option-name">family-friendly mode</span>
600
+
</div>
601
+
<div class="option-description">
602
+
filter out inappropriate content
603
+
</div>
604
+
<label class="checkbox-wrapper">
605
+
<input
606
+
type="checkbox"
607
+
id="familyFriendlyCheckbox"
608
+
checked
609
+
>
610
+
<span>enabled</span>
611
+
</label>
612
+
</div>
613
+
614
+
<div class="option-group">
615
+
<div class="option-label">
616
+
<span class="option-name">exclude patterns</span>
617
+
</div>
618
+
<div class="option-description">
619
+
comma-separated <a href="https://regex101.com/" target="_blank">regex</a> patterns to exclude (e.g., excited,party)
620
+
<br>
621
+
<span style="color: #999; font-size: 0.9em;">new to regex? <a href="https://claude.ai" target="_blank">claude</a> can write patterns for you</span>
622
+
</div>
623
+
<input
624
+
type="text"
625
+
id="excludeInput"
626
+
placeholder="pattern1,pattern2"
627
+
style="width: 100%; padding: 10px; font-size: 14px;"
628
+
>
629
+
</div>
630
+
</div>
631
+
</div>
632
+
633
+
<div id="sampleQueriesContainer" class="sample-queries-container">
634
+
<div class="sample-queries-label">try a sample query:</div>
635
+
<div class="sample-queries">
636
+
<button class="sample-query-btn happy" data-query="happy">happy</button>
637
+
<button class="sample-query-btn apocalyptic" data-query="apocalyptic">apocalyptic</button>
638
+
<button class="sample-query-btn giving" data-query="in a giving mood">in a giving mood</button>
639
+
</div>
230
640
</div>
231
641
232
642
<div id="error" class="error" style="display: none;"></div>
···
234
644
<div id="results" class="results"></div>
235
645
</div>
236
646
647
+
<img src="https://all-the.bufo.zone/bufo-just-checking.gif" alt="bufo peeking" class="peeking-bufo" id="peekingBufo">
648
+
649
+
<script src="/static/bufo-peek.js"></script>
237
650
<script>
238
651
const searchInput = document.getElementById('searchInput');
239
652
const searchButton = document.getElementById('searchButton');
240
653
const resultsDiv = document.getElementById('results');
241
654
const loadingDiv = document.getElementById('loading');
242
655
const errorDiv = document.getElementById('error');
656
+
const sampleQueriesContainer = document.getElementById('sampleQueriesContainer');
657
+
const optionsToggle = document.getElementById('optionsToggle');
658
+
const searchOptions = document.getElementById('searchOptions');
659
+
const alphaSlider = document.getElementById('alphaSlider');
660
+
const alphaValue = document.getElementById('alphaValue');
661
+
const familyFriendlyCheckbox = document.getElementById('familyFriendlyCheckbox');
662
+
const excludeInput = document.getElementById('excludeInput');
243
663
244
-
async function search() {
664
+
let hasSearched = false;
665
+
666
+
// toggle search options
667
+
const optionsToggleText = document.getElementById('optionsToggleText');
668
+
optionsToggle.addEventListener('click', () => {
669
+
searchOptions.classList.toggle('collapsed');
670
+
optionsToggleText.textContent = searchOptions.classList.contains('collapsed')
671
+
? 'search filters'
672
+
: 'hide filters';
673
+
});
674
+
675
+
// update alpha value display
676
+
alphaSlider.addEventListener('input', (e) => {
677
+
alphaValue.textContent = parseFloat(e.target.value).toFixed(2);
678
+
});
679
+
680
+
async function search(updateUrl = true) {
245
681
const query = searchInput.value.trim();
246
682
if (!query) return;
247
683
684
+
const alpha = parseFloat(alphaSlider.value);
685
+
const familyFriendly = familyFriendlyCheckbox.checked;
686
+
const exclude = excludeInput.value.trim();
687
+
688
+
// hide bufo after first search
689
+
if (!hasSearched) {
690
+
window.dispatchEvent(new Event('bufo-hide'));
691
+
hasSearched = true;
692
+
}
693
+
694
+
// update url with query parameters for sharing
695
+
if (updateUrl) {
696
+
const params = new URLSearchParams();
697
+
params.set('q', query);
698
+
params.set('top_k', '20');
699
+
params.set('alpha', alpha.toString());
700
+
params.set('family_friendly', familyFriendly.toString());
701
+
if (exclude) params.set('exclude', exclude);
702
+
const newUrl = `${window.location.pathname}?${params.toString()}`;
703
+
window.history.pushState({ query, alpha, familyFriendly, exclude }, '', newUrl);
704
+
}
705
+
248
706
searchButton.disabled = true;
249
707
loadingDiv.style.display = 'block';
250
708
resultsDiv.innerHTML = '';
251
709
errorDiv.style.display = 'none';
710
+
sampleQueriesContainer.classList.add('hidden');
252
711
253
712
try {
254
-
const response = await fetch('/api/search', {
255
-
method: 'POST',
713
+
const params = new URLSearchParams();
714
+
params.set('query', query);
715
+
params.set('top_k', '20');
716
+
params.set('alpha', alpha.toString());
717
+
params.set('family_friendly', familyFriendly.toString());
718
+
if (exclude) params.set('exclude', exclude);
719
+
720
+
const response = await fetch(`/api/search?${params.toString()}`, {
721
+
method: 'GET',
256
722
headers: {
257
-
'Content-Type': 'application/json',
723
+
'Accept': 'application/json',
258
724
},
259
-
body: JSON.stringify({ query, top_k: 20 }),
260
725
});
261
726
262
727
if (!response.ok) {
263
-
throw new Error(`search failed: ${response.statusText}`);
728
+
// try to extract error message from response body
729
+
let errorMessage = response.statusText;
730
+
try {
731
+
const errorText = await response.text();
732
+
// actix-web returns plain text error messages, not JSON
733
+
if (errorText) {
734
+
errorMessage = errorText;
735
+
}
736
+
} catch (e) {
737
+
// if reading body fails, use the status text
738
+
}
739
+
throw new Error(errorMessage);
264
740
}
265
741
266
742
const data = await response.json();
···
276
752
277
753
function displayResults(results) {
278
754
if (results.length === 0) {
279
-
resultsDiv.innerHTML = '<div class="no-results">no bufos found</div>';
755
+
resultsDiv.innerHTML = `
756
+
<div class="no-results">
757
+
<div class="no-results-text">no bufos found</div>
758
+
<img src="https://all-the.bufo.zone/bufo-shrug.png" alt="bufo shrug" class="no-results-bufo">
759
+
</div>
760
+
`;
280
761
return;
281
762
}
282
763
···
295
776
});
296
777
}
297
778
298
-
searchButton.addEventListener('click', search);
779
+
searchButton.addEventListener('click', () => search(true));
299
780
searchInput.addEventListener('keypress', (e) => {
300
-
if (e.key === 'Enter') search();
781
+
if (e.key === 'Enter') search(true);
782
+
});
783
+
784
+
// handle browser back/forward
785
+
window.addEventListener('popstate', (e) => {
786
+
if (e.state && e.state.query) {
787
+
searchInput.value = e.state.query;
788
+
if (e.state.alpha !== undefined) {
789
+
alphaSlider.value = e.state.alpha;
790
+
alphaValue.textContent = parseFloat(e.state.alpha).toFixed(2);
791
+
}
792
+
if (e.state.familyFriendly !== undefined) {
793
+
familyFriendlyCheckbox.checked = e.state.familyFriendly;
794
+
}
795
+
if (e.state.exclude !== undefined) {
796
+
excludeInput.value = e.state.exclude;
797
+
}
798
+
search(false);
799
+
}
800
+
});
801
+
802
+
// auto-execute search if url has query params (for shared links)
803
+
window.addEventListener('DOMContentLoaded', () => {
804
+
const params = new URLSearchParams(window.location.search);
805
+
const query = params.get('q');
806
+
const alpha = params.get('alpha');
807
+
const familyFriendly = params.get('family_friendly');
808
+
const exclude = params.get('exclude');
809
+
810
+
if (alpha) {
811
+
alphaSlider.value = alpha;
812
+
alphaValue.textContent = parseFloat(alpha).toFixed(2);
813
+
}
814
+
815
+
if (familyFriendly !== null) {
816
+
familyFriendlyCheckbox.checked = familyFriendly === 'true';
817
+
}
818
+
819
+
if (exclude) {
820
+
excludeInput.value = exclude;
821
+
}
822
+
823
+
if (query) {
824
+
searchInput.value = query;
825
+
search(false); // don't update URL since we're already loading from it
826
+
}
827
+
828
+
// ensure focus on the search input
829
+
searchInput.focus();
830
+
});
831
+
832
+
// handle sample query button clicks
833
+
document.querySelectorAll('.sample-query-btn').forEach(btn => {
834
+
btn.addEventListener('click', () => {
835
+
const query = btn.getAttribute('data-query');
836
+
searchInput.value = query;
837
+
search(true);
838
+
});
301
839
});
302
840
</script>
303
841
</body>
+17
static/manifest.json
+17
static/manifest.json
···
1
+
{
2
+
"name": "find bufo",
3
+
"short_name": "find bufo",
4
+
"description": "hybrid search for bufo.zone",
5
+
"start_url": "/",
6
+
"display": "standalone",
7
+
"background_color": "#8ba888",
8
+
"theme_color": "#8ba888",
9
+
"icons": [
10
+
{
11
+
"src": "/static/favicon.png",
12
+
"sizes": "112x112",
13
+
"type": "image/png",
14
+
"purpose": "any maskable"
15
+
}
16
+
]
17
+
}