+5
bun.lock
+5
bun.lock
···
3
3
"workspaces": {
4
4
"": {
5
5
"name": "plcbundle",
6
+
"dependencies": {
7
+
"@jmespath-community/jmespath": "^1.3.0",
8
+
},
6
9
"devDependencies": {
7
10
"@types/bun": "^1.3.1",
8
11
},
9
12
},
10
13
},
11
14
"packages": {
15
+
"@jmespath-community/jmespath": ["@jmespath-community/jmespath@1.3.0", "", { "bin": { "jp": "dist/cli.mjs" } }, "sha512-nzOrEdWKNpognj6CT+1Atr7gw0bqUC1KTBRyasBXS9NjFpz+og7LeFZrIQqV81GRcCzKa5H+DNipvv7NQK3GzA=="],
16
+
12
17
"@types/bun": ["@types/bun@1.3.1", "", { "dependencies": { "bun-types": "1.3.1" } }, "sha512-4jNMk2/K9YJtfqwoAa28c8wK+T7nvJFOjxI4h/7sORWcypRNxBpr+TPNaCfVWq70tLCJsqoFwcf0oI0JU/fvMQ=="],
13
18
14
19
"@types/node": ["@types/node@24.9.2", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-uWN8YqxXxqFMX2RqGOrumsKeti4LlmIMIyV0lgut4jx7KQBcBiW6vkDtIBvHnHIquwNfJhk8v2OtmO8zXWHfPA=="],
+23
examples/service-types.ts
+23
examples/service-types.ts
···
1
+
const counts: Record<string, number>= {}
2
+
3
+
export function process({ op }: { op: any }) {
4
+
5
+
if (!op.operation?.services) {
6
+
return
7
+
}
8
+
for (const key of Object.keys(op.operation.services)) {
9
+
if (!counts[key]) {
10
+
counts[key] = 1
11
+
} else {
12
+
counts[key] += 1
13
+
}
14
+
}
15
+
}
16
+
17
+
export function prepare() {
18
+
return { counts }
19
+
}
20
+
21
+
export function finalize(results: any, { aggregate }: any) {
22
+
console.log(Object.fromEntries(Object.entries(aggregate(results.map((r: any) => r.data.counts))).sort((a, b) => a[1] < b[1] ? 1 : -1)))
23
+
}
+1
-1
jsr.json
+1
-1
jsr.json
+5
-2
package.json
+5
-2
package.json
···
1
1
{
2
2
"name": "@atscan/plcbundle-bun",
3
-
"version": "0.9.4",
3
+
"version": "0.9.5",
4
4
"type": "module",
5
5
"description": "Bun library for working with DID PLC bundle archives (plcbundle)",
6
6
"main": "./src/index.ts",
···
16
16
"scripts": {
17
17
"test": "bun test",
18
18
"test:watch": "bun test --watch",
19
-
"test:coverage": "bun test --coverage",
19
+
"tests:coverage": "bun test --coverage",
20
20
"cli": "bun src/cli.ts",
21
21
"publish": "bunx jsr publish"
22
22
},
···
25
25
},
26
26
"publishConfig": {
27
27
"access": "public"
28
+
},
29
+
"dependencies": {
30
+
"@jmespath-community/jmespath": "^1.3.0"
28
31
}
29
32
}
+13
-7
src/cli.ts
+13
-7
src/cli.ts
···
6
6
import { info } from './cmds/info';
7
7
import { verify } from './cmds/verify';
8
8
import { exportCmd } from './cmds/export';
9
+
import { query } from './cmds/query';
9
10
10
11
const commands = {
11
12
clone,
12
13
detect,
13
14
process: processCmd,
15
+
query,
16
+
q: query, // Alias for query
14
17
info,
15
18
verify,
16
19
export: exportCmd,
···
23
26
bun cli <command> [options]
24
27
25
28
COMMANDS:
26
-
clone Clone bundles from a remote repository
27
-
detect Detect and filter operations using a custom function
28
-
process Process operations with a custom function
29
-
info Show index or bundle information
30
-
verify Verify bundle integrity
31
-
export Export operations from bundle
32
-
help Show this help
29
+
clone Clone bundles from a remote repository
30
+
detect Detect and filter operations using a custom function
31
+
process Process operations with a custom function
32
+
query (q) Query operations using JMESPath or simple dot notation
33
+
info Show index or bundle information
34
+
verify Verify bundle integrity
35
+
export Export operations from bundle
36
+
help Show this help
33
37
34
38
Use 'bun cli <command> -h' for command-specific help
35
39
36
40
EXAMPLES:
37
41
bun cli clone --remote https://plcbundle.atscan.net
38
42
bun cli detect ./examples/detect.ts --bundles 1-100
43
+
bun cli q did --simple --bundles 1-1000
44
+
bun cli query 'operation.services.*.endpoint' --bundles 1-100
39
45
bun cli process ./my-processor.ts --threads 4
40
46
bun cli info --dir ./bundles
41
47
bun cli verify --bundle 42
+399
src/cmds/query.ts
+399
src/cmds/query.ts
···
1
+
import { parseArgs } from 'util';
2
+
import { PLCBundle } from '../plcbundle';
3
+
import { search as jmespathSearch } from '@jmespath-community/jmespath';
4
+
5
+
export async function query(args: string[]) {
6
+
if (args.includes('-h') || args.includes('--help')) {
7
+
console.log(`
8
+
query - Query operations using JMESPath or simple dot notation
9
+
10
+
USAGE:
11
+
plcbundle-bun query <expression> [options]
12
+
13
+
ARGUMENTS:
14
+
<expression> Query expression (required)
15
+
16
+
OPTIONS:
17
+
--dir <path> Bundle directory (default: ./)
18
+
--bundles <spec> Bundle selection: number (42) or range (1-50)
19
+
--threads <num> Number of worker threads (default: 0 = auto-detect CPU cores)
20
+
--format <type> Output format: jsonl|count (default: jsonl)
21
+
--limit <num> Limit number of results
22
+
--simple Use fast simple dot notation (10x faster for basic queries)
23
+
--no-progress Disable progress output
24
+
25
+
QUERY MODES:
26
+
27
+
SIMPLE (--simple) - Ultra-fast for basic property access:
28
+
did Direct property (fastest!)
29
+
operation.services.atproto_pds Nested property
30
+
alsoKnownAs[0] Array indexing
31
+
operation.alsoKnownAs[0].name Combined access
32
+
33
+
JMESPATH (default) - Full query power:
34
+
operation.services.*.endpoint Wildcard
35
+
foo[?age > \`30\`] Filtering
36
+
{did: did, handle: alsoKnownAs[0]} Projection
37
+
operation.services[*].endpoint All endpoints
38
+
39
+
EXAMPLES:
40
+
# Ultra-fast simple queries (recommended for basic property access)
41
+
bun cli q 'did' --simple --bundles 1-10000
42
+
bun cli q 'operation.services.atproto_pds.endpoint' --simple
43
+
44
+
# Complex JMESPath queries (default)
45
+
bun cli q 'operation.services.*.endpoint' --bundles 1-100
46
+
bun cli q '[?operation.alsoKnownAs]' --bundles 1-100
47
+
`);
48
+
return;
49
+
}
50
+
51
+
const { values, positionals } = parseArgs({
52
+
args,
53
+
options: {
54
+
dir: { type: 'string', default: './' },
55
+
bundles: { type: 'string' },
56
+
threads: { type: 'string', default: '0' },
57
+
format: { type: 'string', default: 'jsonl' },
58
+
limit: { type: 'string' },
59
+
simple: { type: 'boolean', default: false },
60
+
'no-progress': { type: 'boolean', default: false },
61
+
},
62
+
strict: false,
63
+
allowPositionals: true,
64
+
});
65
+
66
+
const expression = positionals[0];
67
+
if (!expression) {
68
+
console.error('Error: Query expression is required');
69
+
process.exit(1);
70
+
}
71
+
72
+
const dir = (values.dir as string) || './';
73
+
let threads = parseInt((values.threads as string) || '0');
74
+
75
+
if (threads === 0) {
76
+
threads = navigator.hardwareConcurrency || 4;
77
+
}
78
+
79
+
const format = (values.format as string) || 'jsonl';
80
+
const limit = values.limit ? parseInt(values.limit as string) : undefined;
81
+
const useSimple = Boolean(values.simple);
82
+
const noProgress = Boolean(values['no-progress']);
83
+
84
+
const bundle = new PLCBundle(dir);
85
+
const stats = await bundle.getStats();
86
+
87
+
let start: number, end: number;
88
+
89
+
if (values.bundles && typeof values.bundles === 'string') {
90
+
const bundleSpec = values.bundles;
91
+
if (bundleSpec.includes('-')) {
92
+
const [startStr, endStr] = bundleSpec.split('-');
93
+
start = parseInt(startStr.trim());
94
+
end = parseInt(endStr.trim());
95
+
if (isNaN(start) || isNaN(end)) {
96
+
throw new Error(`Invalid bundle range: ${bundleSpec}`);
97
+
}
98
+
} else {
99
+
start = parseInt(bundleSpec);
100
+
end = start;
101
+
if (isNaN(start)) {
102
+
throw new Error(`Invalid bundle number: ${bundleSpec}`);
103
+
}
104
+
}
105
+
} else {
106
+
start = 1;
107
+
end = stats.lastBundle;
108
+
}
109
+
110
+
if (start < 1 || end > stats.lastBundle || start > end) {
111
+
throw new Error(`Invalid bundle range ${start}-${end} (available: 1-${stats.lastBundle})`);
112
+
}
113
+
114
+
const queryType = useSimple ? 'simple' : 'JMESPath';
115
+
const threadInfo = threads > 1 ? ` (${threads} threads)` : '';
116
+
console.error(`Querying bundles ${start}-${end}${threadInfo} with ${queryType}: ${expression}\n`);
117
+
118
+
const startTime = Date.now();
119
+
let matchCount = 0;
120
+
let totalOps = 0;
121
+
let totalBytes = 0;
122
+
const totalBundles = end - start + 1;
123
+
let shouldStop = false;
124
+
125
+
// Compile simple expression and detect fast path
126
+
let queryFn: (op: any) => any;
127
+
128
+
if (useSimple) {
129
+
const compiled = compileSimplePath(expression);
130
+
131
+
// Ultra-fast path for single property access (e.g., "did")
132
+
if (compiled.segments.length === 1 && compiled.segments[0].type === 'property') {
133
+
const prop = compiled.segments[0].value as string;
134
+
queryFn = (op) => op[prop];
135
+
} else {
136
+
// Fast path for dot notation
137
+
queryFn = (op) => querySimplePath(op, compiled);
138
+
}
139
+
} else {
140
+
// JMESPath
141
+
queryFn = (op) => jmespathSearch(op, expression);
142
+
}
143
+
144
+
if (threads === 1) {
145
+
// Single-threaded
146
+
for (let bundleNum = start; bundleNum <= end; bundleNum++) {
147
+
for await (const { op, line } of bundle.streamOperations(bundleNum)) {
148
+
totalOps++;
149
+
totalBytes += line.length;
150
+
151
+
if (!noProgress && totalOps % 5000 === 0) {
152
+
const elapsed = (Date.now() - startTime) / 1000;
153
+
const bundlesCompleted = bundleNum - start + 1;
154
+
const progress = bundlesCompleted / totalBundles;
155
+
const mbPerSec = totalBytes / elapsed / 1e6;
156
+
const eta = bundlesCompleted > 0 ? ((totalBundles - bundlesCompleted) / bundlesCompleted) * elapsed : 0;
157
+
renderProgressBar(elapsed, bundlesCompleted, totalBundles, progress, matchCount, mbPerSec, eta);
158
+
}
159
+
160
+
try {
161
+
const result = queryFn(op);
162
+
163
+
if (result !== null && result !== undefined) {
164
+
matchCount++;
165
+
if (format !== 'count') {
166
+
console.log(JSON.stringify(result));
167
+
}
168
+
if (limit && matchCount >= limit) {
169
+
shouldStop = true;
170
+
break;
171
+
}
172
+
}
173
+
} catch (error) {
174
+
// Skip invalid operations
175
+
}
176
+
}
177
+
if (shouldStop) break;
178
+
}
179
+
} else {
180
+
// Multi-threaded
181
+
const bundlesPerThread = Math.ceil(totalBundles / threads);
182
+
const workerPath = new URL('../worker.ts', import.meta.url).pathname;
183
+
const workers: Worker[] = [];
184
+
const workerStats: Array<{
185
+
totalOps: number;
186
+
totalBytes: number;
187
+
matchCount: number;
188
+
bundlesCompleted: number;
189
+
threadStart: number;
190
+
}> = [];
191
+
192
+
const workerPromises = [];
193
+
194
+
for (let i = 0; i < threads; i++) {
195
+
const threadStart = start + i * bundlesPerThread;
196
+
const threadEnd = Math.min(threadStart + bundlesPerThread - 1, end);
197
+
if (threadStart > end) break;
198
+
199
+
const worker = new Worker(workerPath);
200
+
workers.push(worker);
201
+
workerStats[i] = { totalOps: 0, totalBytes: 0, matchCount: 0, bundlesCompleted: 0, threadStart };
202
+
203
+
const promise = new Promise<any>((resolve) => {
204
+
worker.onmessage = (event) => {
205
+
const msg = event.data;
206
+
207
+
if (msg.type === 'progress') {
208
+
workerStats[i].totalOps = msg.totalOps;
209
+
workerStats[i].totalBytes = msg.totalBytes;
210
+
workerStats[i].matchCount = msg.matchCount;
211
+
workerStats[i].bundlesCompleted = Math.max(0, msg.currentBundle - workerStats[i].threadStart + 1);
212
+
213
+
let aggOps = 0, aggBytes = 0, aggMatches = 0, totalBundlesCompleted = 0;
214
+
for (const ws of workerStats) {
215
+
aggOps += ws.totalOps;
216
+
aggBytes += ws.totalBytes;
217
+
aggMatches += ws.matchCount;
218
+
totalBundlesCompleted += ws.bundlesCompleted;
219
+
}
220
+
221
+
const progress = Math.min(totalBundlesCompleted / totalBundles, 1.0);
222
+
if (!noProgress) {
223
+
const elapsed = (Date.now() - startTime) / 1000;
224
+
const mbPerSec = aggBytes / elapsed / 1e6;
225
+
const eta = totalBundlesCompleted > 0 ? ((totalBundles - totalBundlesCompleted) / totalBundlesCompleted) * elapsed : 0;
226
+
renderProgressBar(elapsed, totalBundlesCompleted, totalBundles, progress, aggMatches, mbPerSec, eta);
227
+
}
228
+
} else if (msg.type === 'match-batch') {
229
+
for (const match of msg.matches) {
230
+
matchCount++;
231
+
if (format !== 'count') {
232
+
console.log(JSON.stringify(match.result));
233
+
}
234
+
if (limit && matchCount >= limit) {
235
+
shouldStop = true;
236
+
workers.forEach(w => w.terminate());
237
+
break;
238
+
}
239
+
}
240
+
} else if (msg.type === 'result') {
241
+
totalOps += msg.totalOps;
242
+
totalBytes += msg.totalBytes;
243
+
resolve(msg);
244
+
}
245
+
};
246
+
});
247
+
248
+
workerPromises.push(promise);
249
+
worker.postMessage({
250
+
dir: bundle['dir'],
251
+
start: threadStart,
252
+
end: threadEnd,
253
+
expression: expression,
254
+
useSimple: useSimple,
255
+
flush: true,
256
+
mode: 'query',
257
+
});
258
+
}
259
+
260
+
await Promise.all(workerPromises);
261
+
workers.forEach(w => w.terminate());
262
+
}
263
+
264
+
const elapsed = (Date.now() - startTime) / 1000;
265
+
266
+
if (!noProgress) {
267
+
const mbPerSec = totalBytes / elapsed / 1e6;
268
+
renderProgressBar(elapsed, totalBundles, totalBundles, 1.0, matchCount, mbPerSec, 0);
269
+
console.error('\n');
270
+
}
271
+
272
+
if (format === 'count') {
273
+
console.log(matchCount);
274
+
}
275
+
276
+
console.error('');
277
+
console.error(`✓ Query complete`);
278
+
console.error(` Total operations: ${totalOps.toLocaleString()}`);
279
+
console.error(` Matches found: ${matchCount.toLocaleString()} (${totalOps > 0 ? ((matchCount/totalOps)*100).toFixed(2) : '0.00'}%)`);
280
+
console.error(` Total bytes: ${(totalBytes / 1e6).toFixed(1)} MB`);
281
+
console.error(` Time elapsed: ${elapsed.toFixed(2)}s`);
282
+
console.error(` Throughput: ${(totalOps / elapsed).toFixed(0)} ops/sec | ${(totalBytes / elapsed / 1e6).toFixed(1)} MB/s`);
283
+
if (threads > 1) {
284
+
console.error(` Threads: ${threads}`);
285
+
}
286
+
}
287
+
288
+
// Simple dot notation parser
289
+
interface SimplePath {
290
+
segments: Array<{ type: 'property' | 'index'; value: string | number }>;
291
+
}
292
+
293
+
function compileSimplePath(expression: string): SimplePath {
294
+
const segments: Array<{ type: 'property' | 'index'; value: string | number }> = [];
295
+
296
+
let current = '';
297
+
let i = 0;
298
+
299
+
while (i < expression.length) {
300
+
const char = expression[i];
301
+
302
+
if (char === '.') {
303
+
if (current) {
304
+
segments.push({ type: 'property', value: current });
305
+
current = '';
306
+
}
307
+
i++;
308
+
} else if (char === '[') {
309
+
if (current) {
310
+
segments.push({ type: 'property', value: current });
311
+
current = '';
312
+
}
313
+
i++;
314
+
let index = '';
315
+
while (i < expression.length && expression[i] !== ']') {
316
+
index += expression[i];
317
+
i++;
318
+
}
319
+
segments.push({ type: 'index', value: parseInt(index) });
320
+
i++;
321
+
} else {
322
+
current += char;
323
+
i++;
324
+
}
325
+
}
326
+
327
+
if (current) {
328
+
segments.push({ type: 'property', value: current });
329
+
}
330
+
331
+
return { segments };
332
+
}
333
+
334
+
function querySimplePath(obj: any, compiled: SimplePath): any {
335
+
let current = obj;
336
+
337
+
for (const segment of compiled.segments) {
338
+
if (current == null) return null;
339
+
340
+
if (segment.type === 'property') {
341
+
current = current[segment.value];
342
+
} else {
343
+
if (Array.isArray(current)) {
344
+
current = current[segment.value as number];
345
+
} else {
346
+
return null;
347
+
}
348
+
}
349
+
}
350
+
351
+
return current;
352
+
}
353
+
354
+
function renderProgressBar(
355
+
elapsed: number,
356
+
current: number,
357
+
total: number,
358
+
progress: number,
359
+
matches: number,
360
+
mbPerSec: number,
361
+
etaSeconds: number
362
+
) {
363
+
const barWidth = 40;
364
+
const filledWidth = Math.floor(progress * barWidth);
365
+
const hours = Math.floor(elapsed / 3600);
366
+
const minutes = Math.floor((elapsed % 3600) / 60);
367
+
const seconds = Math.floor(elapsed % 60);
368
+
const timeStr = `[${hours.toString().padStart(2, '0')}:${minutes.toString().padStart(2, '0')}:${seconds.toString().padStart(2, '0')}]`;
369
+
370
+
let bar = '';
371
+
if (filledWidth === 0) {
372
+
bar = '>' + ' '.repeat(barWidth - 1);
373
+
} else if (filledWidth >= barWidth) {
374
+
bar = '>'.repeat(barWidth);
375
+
} else {
376
+
bar = '>' + '-'.repeat(filledWidth - 1) + ' '.repeat(barWidth - filledWidth);
377
+
}
378
+
379
+
const percent = (progress * 100).toFixed(1);
380
+
let etaStr = '';
381
+
if (etaSeconds > 0 && etaSeconds < 86400) {
382
+
if (etaSeconds < 60) {
383
+
etaStr = `${Math.ceil(etaSeconds)}s`;
384
+
} else if (etaSeconds < 3600) {
385
+
etaStr = `${Math.ceil(etaSeconds / 60)}m`;
386
+
} else {
387
+
const etaHours = Math.floor(etaSeconds / 3600);
388
+
const etaMin = Math.ceil((etaSeconds % 3600) / 60);
389
+
etaStr = `${etaHours}h ${etaMin}m`;
390
+
}
391
+
}
392
+
393
+
const matchesStr = matches >= 1000000 ? `${(matches / 1000000).toFixed(1)}M` : matches >= 1000 ? `${(matches / 1000).toFixed(0)}k` : matches.toString();
394
+
const line = etaStr
395
+
? `${timeStr} ${bar} ${current}/${total} (${percent}%) ${matchesStr} matches | ${mbPerSec.toFixed(1)} MB/s [ETA: ${etaStr}]`
396
+
: `${timeStr} ${bar} ${current}/${total} (${percent}%) ${matchesStr} matches | ${mbPerSec.toFixed(1)} MB/s`;
397
+
398
+
process.stderr.write(`\r${line}\x1b[K`);
399
+
}
+26
-5
src/plcbundle.ts
+26
-5
src/plcbundle.ts
···
309
309
310
310
// Determine mode based on what function is exported
311
311
let mode: 'detect' | 'process' = 'detect';
312
+
let mod: any;
312
313
if (module) {
313
314
try {
314
-
const mod = await import(module);
315
+
mod = await import(module);
315
316
// If module has 'process' function, use process mode
316
317
if (mod.process) {
317
318
mode = 'process';
···
337
338
onMatch
338
339
);
339
340
}
340
-
341
+
341
342
// Load module if provided but single-threaded
342
-
if (module && !callback) {
343
-
const resolvedPath = module;
344
-
const mod = await import(resolvedPath);
343
+
if (mod && !callback) {
345
344
const userFn = mode === 'detect' ? (mod.detect || mod.default) : (mod.process || mod.default);
346
345
347
346
callback = (op, position, bundleNum, line) => {
···
446
445
447
446
// Cleanup
448
447
workers.forEach(w => w.terminate());
448
+
449
+
if (modulePath) {
450
+
const mod = await import(modulePath);
451
+
if (mod.finalize) {
452
+
await mod.finalize(results, { aggregate: this.aggregate });
453
+
}
454
+
}
449
455
450
456
// Aggregate results
451
457
let totalOps = 0;
···
475
481
matches: flush || mode === 'process' ? undefined : allMatches,
476
482
};
477
483
}
484
+
485
+
private aggregate(objects: Array<{ [key: string]: number }>): { [key: string]: number } {
486
+
const aggregatedDict: { [key: string]: number } = {};
487
+
488
+
for (const currentObj of objects) {
489
+
for (const key in currentObj) {
490
+
if (Object.prototype.hasOwnProperty.call(currentObj, key)) {
491
+
aggregatedDict[key] = (aggregatedDict[key] || 0) + currentObj[key];
492
+
}
493
+
}
494
+
}
495
+
496
+
return aggregatedDict;
497
+
}
498
+
478
499
479
500
/**
480
501
* Fast single-threaded processing (optimized)
+141
-63
src/worker.ts
+141
-63
src/worker.ts
···
1
1
/// <reference lib="webworker" />
2
2
3
+
import { search as jmespathSearch } from '@jmespath-community/jmespath';
4
+
3
5
export interface WorkerTask {
4
6
dir: string;
5
7
start: number;
6
8
end: number;
7
-
modulePath: string;
9
+
modulePath?: string;
10
+
expression?: string;
11
+
useSimple?: boolean;
8
12
silent?: boolean;
9
13
flush?: boolean;
10
-
mode?: 'detect' | 'process'; // Add mode parameter
14
+
mode?: 'detect' | 'process' | 'query';
11
15
}
12
16
13
17
export interface WorkerProgress {
14
18
type: 'progress';
19
+
currentBundle: number;
15
20
totalOps: number;
16
21
totalBytes: number;
22
+
matchCount: number;
17
23
}
18
24
19
-
export interface WorkerMatch {
20
-
type: 'match';
21
-
bundle: number;
22
-
position: number;
23
-
cid: string;
24
-
size: number;
25
-
labels: string[];
25
+
export interface WorkerMatchBatch {
26
+
type: 'match-batch';
27
+
matches: Array<{ result: any }>;
26
28
}
27
29
28
30
export interface WorkerResult {
29
31
type: 'result';
30
32
totalOps: number;
31
33
totalBytes: number;
32
-
matches: Array<{
33
-
bundle: number;
34
-
position: number;
35
-
cid: string;
36
-
size: number;
37
-
labels: string[];
38
-
}>;
34
+
matchCount: number;
35
+
data?: any;
39
36
}
40
37
41
38
self.onmessage = async (event: MessageEvent<WorkerTask>) => {
42
-
const { dir, start, end, modulePath, silent, flush, mode = 'detect' } = event.data;
39
+
const {
40
+
dir,
41
+
start,
42
+
end,
43
+
modulePath,
44
+
expression,
45
+
useSimple,
46
+
silent,
47
+
flush,
48
+
mode = 'detect'
49
+
} = event.data;
43
50
44
-
// Override console if silent
45
51
if (silent) {
46
52
globalThis.console = {
47
53
log: () => {},
···
53
59
} as any;
54
60
}
55
61
56
-
// Load the appropriate function based on mode
57
-
const mod = await import(modulePath);
58
-
const userFn = mode === 'detect'
59
-
? (mod.detect || mod.default)
60
-
: (mod.process || mod.default);
62
+
let userFn: any;
63
+
let queryFn: ((op: any) => any) | null = null;
64
+
65
+
if (mode === 'query') {
66
+
// Query mode
67
+
if (useSimple) {
68
+
const compiled = compileSimplePath(expression!);
69
+
70
+
// Ultra-fast path for single property
71
+
if (compiled.segments.length === 1 && compiled.segments[0].type === 'property') {
72
+
const prop = compiled.segments[0].value as string;
73
+
queryFn = (op) => op[prop];
74
+
} else {
75
+
queryFn = (op) => querySimplePath(op, compiled);
76
+
}
77
+
} else {
78
+
queryFn = (op) => jmespathSearch(op, expression!);
79
+
}
80
+
} else {
81
+
// Detect or process mode
82
+
const mod = await import(modulePath!);
83
+
userFn = mode === 'detect' ? (mod.detect || mod.default) : (mod.process || mod.default);
84
+
}
61
85
62
86
let totalOps = 0;
63
87
let totalBytes = 0;
64
-
const matches: any[] = [];
88
+
let matchCount = 0;
89
+
const BATCH_SIZE = 1000;
90
+
let matchBatch: any[] = [];
91
+
92
+
const flushBatch = () => {
93
+
if (matchBatch.length > 0) {
94
+
self.postMessage({ type: 'match-batch', matches: matchBatch } as WorkerMatchBatch);
95
+
matchBatch = [];
96
+
}
97
+
};
65
98
66
99
for (let bundleNum = start; bundleNum <= end; bundleNum++) {
67
100
const bundlePath = `${dir}${bundleNum.toString().padStart(6, '0')}.jsonl.zst`;
···
70
103
const compressed = await Bun.file(bundlePath).arrayBuffer();
71
104
const decompressed = Bun.zstdDecompressSync(compressed);
72
105
const text = new TextDecoder().decode(decompressed);
73
-
74
106
const lines = text.split('\n');
75
107
76
108
for (let position = 0; position < lines.length; position++) {
···
79
111
80
112
totalOps++;
81
113
totalBytes += line.length;
82
-
83
114
const op = JSON.parse(line);
84
115
85
-
if (mode === 'detect') {
86
-
// Detection mode - look for labels
87
-
const labels = userFn({ op });
88
-
89
-
if (labels && labels.length > 0) {
90
-
const match = {
91
-
bundle: bundleNum,
92
-
position,
93
-
cid: op.cid.slice(-4),
94
-
size: line.length,
95
-
labels,
96
-
};
116
+
if (mode === 'query') {
117
+
try {
118
+
const result = queryFn!(op);
97
119
98
-
if (flush) {
99
-
// Send match immediately
100
-
self.postMessage({
101
-
type: 'match',
102
-
...match,
103
-
} as WorkerMatch);
104
-
} else {
105
-
// Buffer matches
106
-
matches.push(match);
120
+
if (result !== null && result !== undefined) {
121
+
matchCount++;
122
+
matchBatch.push({ result });
123
+
if (matchBatch.length >= BATCH_SIZE) flushBatch();
107
124
}
125
+
} catch (error) {}
126
+
} else if (mode === 'detect') {
127
+
const labels = userFn({ op });
128
+
if (labels && labels.length > 0) {
129
+
matchCount++;
130
+
matchBatch.push({ result: { bundle: bundleNum, position, cid: op.cid.slice(-4), labels } });
131
+
if (matchBatch.length >= BATCH_SIZE) flushBatch();
108
132
}
109
133
} else {
110
-
// Process mode - just call the function
134
+
// process mode
111
135
userFn({ op, position, bundle: bundleNum, line });
112
136
}
113
137
114
138
if (totalOps % 10000 === 0) {
115
-
self.postMessage({
116
-
type: 'progress',
117
-
totalOps,
118
-
totalBytes,
119
-
} as WorkerProgress);
139
+
self.postMessage({ type: 'progress', currentBundle: bundleNum, totalOps, totalBytes, matchCount } as WorkerProgress);
120
140
}
121
141
}
122
-
} catch (error) {
123
-
// Continue on error
142
+
143
+
self.postMessage({ type: 'progress', currentBundle: bundleNum, totalOps, totalBytes, matchCount } as WorkerProgress);
144
+
} catch (error) {}
145
+
}
146
+
147
+
flushBatch();
148
+
149
+
let prepareResult: any;
150
+
if (mode !== 'query' && modulePath) {
151
+
const mod = await import(modulePath);
152
+
if (typeof mod.prepare === 'function') {
153
+
try {
154
+
prepareResult = await mod.prepare();
155
+
} catch (error) {}
124
156
}
125
157
}
126
158
127
-
// Send final result
128
-
self.postMessage({
129
-
type: 'result',
130
-
totalOps,
131
-
totalBytes,
132
-
matches: flush ? [] : matches,
133
-
} as WorkerResult);
134
-
};
159
+
self.postMessage({ type: 'result', totalOps, totalBytes, matchCount, data: prepareResult || null } as WorkerResult);
160
+
};
161
+
162
+
interface SimplePath {
163
+
segments: Array<{ type: 'property' | 'index'; value: string | number }>;
164
+
}
165
+
166
+
function compileSimplePath(expression: string): SimplePath {
167
+
const segments: Array<{ type: 'property' | 'index'; value: string | number }> = [];
168
+
let current = '';
169
+
let i = 0;
170
+
171
+
while (i < expression.length) {
172
+
const char = expression[i];
173
+
if (char === '.') {
174
+
if (current) segments.push({ type: 'property', value: current });
175
+
current = '';
176
+
i++;
177
+
} else if (char === '[') {
178
+
if (current) segments.push({ type: 'property', value: current });
179
+
current = '';
180
+
i++;
181
+
let index = '';
182
+
while (i < expression.length && expression[i] !== ']') {
183
+
index += expression[i];
184
+
i++;
185
+
}
186
+
segments.push({ type: 'index', value: parseInt(index) });
187
+
i++;
188
+
} else {
189
+
current += char;
190
+
i++;
191
+
}
192
+
}
193
+
if (current) segments.push({ type: 'property', value: current });
194
+
return { segments };
195
+
}
196
+
197
+
function querySimplePath(obj: any, compiled: SimplePath): any {
198
+
let current = obj;
199
+
for (const segment of compiled.segments) {
200
+
if (current == null) return null;
201
+
if (segment.type === 'property') {
202
+
current = current[segment.value];
203
+
} else {
204
+
if (Array.isArray(current)) {
205
+
current = current[segment.value as number];
206
+
} else {
207
+
return null;
208
+
}
209
+
}
210
+
}
211
+
return current;
212
+
}