⚡ Zero-dependency plcbundle library exclusively for Bun

query cmd

Changed files
+559 -78
src
+5
bun.lock
··· 3 "workspaces": { 4 "": { 5 "name": "plcbundle", 6 "devDependencies": { 7 "@types/bun": "^1.3.1", 8 }, 9 }, 10 }, 11 "packages": { 12 "@types/bun": ["@types/bun@1.3.1", "", { "dependencies": { "bun-types": "1.3.1" } }, "sha512-4jNMk2/K9YJtfqwoAa28c8wK+T7nvJFOjxI4h/7sORWcypRNxBpr+TPNaCfVWq70tLCJsqoFwcf0oI0JU/fvMQ=="], 13 14 "@types/node": ["@types/node@24.9.2", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-uWN8YqxXxqFMX2RqGOrumsKeti4LlmIMIyV0lgut4jx7KQBcBiW6vkDtIBvHnHIquwNfJhk8v2OtmO8zXWHfPA=="],
··· 3 "workspaces": { 4 "": { 5 "name": "plcbundle", 6 + "dependencies": { 7 + "@jmespath-community/jmespath": "^1.3.0", 8 + }, 9 "devDependencies": { 10 "@types/bun": "^1.3.1", 11 }, 12 }, 13 }, 14 "packages": { 15 + "@jmespath-community/jmespath": ["@jmespath-community/jmespath@1.3.0", "", { "bin": { "jp": "dist/cli.mjs" } }, "sha512-nzOrEdWKNpognj6CT+1Atr7gw0bqUC1KTBRyasBXS9NjFpz+og7LeFZrIQqV81GRcCzKa5H+DNipvv7NQK3GzA=="], 16 + 17 "@types/bun": ["@types/bun@1.3.1", "", { "dependencies": { "bun-types": "1.3.1" } }, "sha512-4jNMk2/K9YJtfqwoAa28c8wK+T7nvJFOjxI4h/7sORWcypRNxBpr+TPNaCfVWq70tLCJsqoFwcf0oI0JU/fvMQ=="], 18 19 "@types/node": ["@types/node@24.9.2", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-uWN8YqxXxqFMX2RqGOrumsKeti4LlmIMIyV0lgut4jx7KQBcBiW6vkDtIBvHnHIquwNfJhk8v2OtmO8zXWHfPA=="],
+4 -1
package.json
··· 16 "scripts": { 17 "test": "bun test", 18 "test:watch": "bun test --watch", 19 - "test:coverage": "bun test --coverage", 20 "cli": "bun src/cli.ts", 21 "publish": "bunx jsr publish" 22 }, ··· 25 }, 26 "publishConfig": { 27 "access": "public" 28 } 29 }
··· 16 "scripts": { 17 "test": "bun test", 18 "test:watch": "bun test --watch", 19 + "tests:coverage": "bun test --coverage", 20 "cli": "bun src/cli.ts", 21 "publish": "bunx jsr publish" 22 }, ··· 25 }, 26 "publishConfig": { 27 "access": "public" 28 + }, 29 + "dependencies": { 30 + "@jmespath-community/jmespath": "^1.3.0" 31 } 32 }
+13 -7
src/cli.ts
··· 6 import { info } from './cmds/info'; 7 import { verify } from './cmds/verify'; 8 import { exportCmd } from './cmds/export'; 9 10 const commands = { 11 clone, 12 detect, 13 process: processCmd, 14 info, 15 verify, 16 export: exportCmd, ··· 23 bun cli <command> [options] 24 25 COMMANDS: 26 - clone Clone bundles from a remote repository 27 - detect Detect and filter operations using a custom function 28 - process Process operations with a custom function 29 - info Show index or bundle information 30 - verify Verify bundle integrity 31 - export Export operations from bundle 32 - help Show this help 33 34 Use 'bun cli <command> -h' for command-specific help 35 36 EXAMPLES: 37 bun cli clone --remote https://plcbundle.atscan.net 38 bun cli detect ./examples/detect.ts --bundles 1-100 39 bun cli process ./my-processor.ts --threads 4 40 bun cli info --dir ./bundles 41 bun cli verify --bundle 42
··· 6 import { info } from './cmds/info'; 7 import { verify } from './cmds/verify'; 8 import { exportCmd } from './cmds/export'; 9 + import { query } from './cmds/query'; 10 11 const commands = { 12 clone, 13 detect, 14 process: processCmd, 15 + query, 16 + q: query, // Alias for query 17 info, 18 verify, 19 export: exportCmd, ··· 26 bun cli <command> [options] 27 28 COMMANDS: 29 + clone Clone bundles from a remote repository 30 + detect Detect and filter operations using a custom function 31 + process Process operations with a custom function 32 + query (q) Query operations using JMESPath or simple dot notation 33 + info Show index or bundle information 34 + verify Verify bundle integrity 35 + export Export operations from bundle 36 + help Show this help 37 38 Use 'bun cli <command> -h' for command-specific help 39 40 EXAMPLES: 41 bun cli clone --remote https://plcbundle.atscan.net 42 bun cli detect ./examples/detect.ts --bundles 1-100 43 + bun cli q did --simple --bundles 1-1000 44 + bun cli query 'operation.services.*.endpoint' --bundles 1-100 45 bun cli process ./my-processor.ts --threads 4 46 bun cli info --dir ./bundles 47 bun cli verify --bundle 42
+399
src/cmds/query.ts
···
··· 1 + import { parseArgs } from 'util'; 2 + import { PLCBundle } from '../plcbundle'; 3 + import { search as jmespathSearch } from '@jmespath-community/jmespath'; 4 + 5 + export async function query(args: string[]) { 6 + if (args.includes('-h') || args.includes('--help')) { 7 + console.log(` 8 + query - Query operations using JMESPath or simple dot notation 9 + 10 + USAGE: 11 + plcbundle-bun query <expression> [options] 12 + 13 + ARGUMENTS: 14 + <expression> Query expression (required) 15 + 16 + OPTIONS: 17 + --dir <path> Bundle directory (default: ./) 18 + --bundles <spec> Bundle selection: number (42) or range (1-50) 19 + --threads <num> Number of worker threads (default: 0 = auto-detect CPU cores) 20 + --format <type> Output format: jsonl|count (default: jsonl) 21 + --limit <num> Limit number of results 22 + --simple Use fast simple dot notation (10x faster for basic queries) 23 + --no-progress Disable progress output 24 + 25 + QUERY MODES: 26 + 27 + SIMPLE (--simple) - Ultra-fast for basic property access: 28 + did Direct property (fastest!) 29 + operation.services.atproto_pds Nested property 30 + alsoKnownAs[0] Array indexing 31 + operation.alsoKnownAs[0].name Combined access 32 + 33 + JMESPATH (default) - Full query power: 34 + operation.services.*.endpoint Wildcard 35 + foo[?age > \`30\`] Filtering 36 + {did: did, handle: alsoKnownAs[0]} Projection 37 + operation.services[*].endpoint All endpoints 38 + 39 + EXAMPLES: 40 + # Ultra-fast simple queries (recommended for basic property access) 41 + bun cli q 'did' --simple --bundles 1-10000 42 + bun cli q 'operation.services.atproto_pds.endpoint' --simple 43 + 44 + # Complex JMESPath queries (default) 45 + bun cli q 'operation.services.*.endpoint' --bundles 1-100 46 + bun cli q '[?operation.alsoKnownAs]' --bundles 1-100 47 + `); 48 + return; 49 + } 50 + 51 + const { values, positionals } = parseArgs({ 52 + args, 53 + options: { 54 + dir: { type: 'string', default: './' }, 55 + bundles: { type: 'string' }, 56 + threads: { type: 'string', default: '0' }, 57 + format: { type: 'string', default: 'jsonl' }, 58 + limit: { type: 'string' }, 59 + simple: { type: 'boolean', default: false }, 60 + 'no-progress': { type: 'boolean', default: false }, 61 + }, 62 + strict: false, 63 + allowPositionals: true, 64 + }); 65 + 66 + const expression = positionals[0]; 67 + if (!expression) { 68 + console.error('Error: Query expression is required'); 69 + process.exit(1); 70 + } 71 + 72 + const dir = (values.dir as string) || './'; 73 + let threads = parseInt((values.threads as string) || '0'); 74 + 75 + if (threads === 0) { 76 + threads = navigator.hardwareConcurrency || 4; 77 + } 78 + 79 + const format = (values.format as string) || 'jsonl'; 80 + const limit = values.limit ? parseInt(values.limit as string) : undefined; 81 + const useSimple = Boolean(values.simple); 82 + const noProgress = Boolean(values['no-progress']); 83 + 84 + const bundle = new PLCBundle(dir); 85 + const stats = await bundle.getStats(); 86 + 87 + let start: number, end: number; 88 + 89 + if (values.bundles && typeof values.bundles === 'string') { 90 + const bundleSpec = values.bundles; 91 + if (bundleSpec.includes('-')) { 92 + const [startStr, endStr] = bundleSpec.split('-'); 93 + start = parseInt(startStr.trim()); 94 + end = parseInt(endStr.trim()); 95 + if (isNaN(start) || isNaN(end)) { 96 + throw new Error(`Invalid bundle range: ${bundleSpec}`); 97 + } 98 + } else { 99 + start = parseInt(bundleSpec); 100 + end = start; 101 + if (isNaN(start)) { 102 + throw new Error(`Invalid bundle number: ${bundleSpec}`); 103 + } 104 + } 105 + } else { 106 + start = 1; 107 + end = stats.lastBundle; 108 + } 109 + 110 + if (start < 1 || end > stats.lastBundle || start > end) { 111 + throw new Error(`Invalid bundle range ${start}-${end} (available: 1-${stats.lastBundle})`); 112 + } 113 + 114 + const queryType = useSimple ? 'simple' : 'JMESPath'; 115 + const threadInfo = threads > 1 ? ` (${threads} threads)` : ''; 116 + console.error(`Querying bundles ${start}-${end}${threadInfo} with ${queryType}: ${expression}\n`); 117 + 118 + const startTime = Date.now(); 119 + let matchCount = 0; 120 + let totalOps = 0; 121 + let totalBytes = 0; 122 + const totalBundles = end - start + 1; 123 + let shouldStop = false; 124 + 125 + // Compile simple expression and detect fast path 126 + let queryFn: (op: any) => any; 127 + 128 + if (useSimple) { 129 + const compiled = compileSimplePath(expression); 130 + 131 + // Ultra-fast path for single property access (e.g., "did") 132 + if (compiled.segments.length === 1 && compiled.segments[0].type === 'property') { 133 + const prop = compiled.segments[0].value as string; 134 + queryFn = (op) => op[prop]; 135 + } else { 136 + // Fast path for dot notation 137 + queryFn = (op) => querySimplePath(op, compiled); 138 + } 139 + } else { 140 + // JMESPath 141 + queryFn = (op) => jmespathSearch(op, expression); 142 + } 143 + 144 + if (threads === 1) { 145 + // Single-threaded 146 + for (let bundleNum = start; bundleNum <= end; bundleNum++) { 147 + for await (const { op, line } of bundle.streamOperations(bundleNum)) { 148 + totalOps++; 149 + totalBytes += line.length; 150 + 151 + if (!noProgress && totalOps % 5000 === 0) { 152 + const elapsed = (Date.now() - startTime) / 1000; 153 + const bundlesCompleted = bundleNum - start + 1; 154 + const progress = bundlesCompleted / totalBundles; 155 + const mbPerSec = totalBytes / elapsed / 1e6; 156 + const eta = bundlesCompleted > 0 ? ((totalBundles - bundlesCompleted) / bundlesCompleted) * elapsed : 0; 157 + renderProgressBar(elapsed, bundlesCompleted, totalBundles, progress, matchCount, mbPerSec, eta); 158 + } 159 + 160 + try { 161 + const result = queryFn(op); 162 + 163 + if (result !== null && result !== undefined) { 164 + matchCount++; 165 + if (format !== 'count') { 166 + console.log(JSON.stringify(result)); 167 + } 168 + if (limit && matchCount >= limit) { 169 + shouldStop = true; 170 + break; 171 + } 172 + } 173 + } catch (error) { 174 + // Skip invalid operations 175 + } 176 + } 177 + if (shouldStop) break; 178 + } 179 + } else { 180 + // Multi-threaded 181 + const bundlesPerThread = Math.ceil(totalBundles / threads); 182 + const workerPath = new URL('../worker.ts', import.meta.url).pathname; 183 + const workers: Worker[] = []; 184 + const workerStats: Array<{ 185 + totalOps: number; 186 + totalBytes: number; 187 + matchCount: number; 188 + bundlesCompleted: number; 189 + threadStart: number; 190 + }> = []; 191 + 192 + const workerPromises = []; 193 + 194 + for (let i = 0; i < threads; i++) { 195 + const threadStart = start + i * bundlesPerThread; 196 + const threadEnd = Math.min(threadStart + bundlesPerThread - 1, end); 197 + if (threadStart > end) break; 198 + 199 + const worker = new Worker(workerPath); 200 + workers.push(worker); 201 + workerStats[i] = { totalOps: 0, totalBytes: 0, matchCount: 0, bundlesCompleted: 0, threadStart }; 202 + 203 + const promise = new Promise<any>((resolve) => { 204 + worker.onmessage = (event) => { 205 + const msg = event.data; 206 + 207 + if (msg.type === 'progress') { 208 + workerStats[i].totalOps = msg.totalOps; 209 + workerStats[i].totalBytes = msg.totalBytes; 210 + workerStats[i].matchCount = msg.matchCount; 211 + workerStats[i].bundlesCompleted = Math.max(0, msg.currentBundle - workerStats[i].threadStart + 1); 212 + 213 + let aggOps = 0, aggBytes = 0, aggMatches = 0, totalBundlesCompleted = 0; 214 + for (const ws of workerStats) { 215 + aggOps += ws.totalOps; 216 + aggBytes += ws.totalBytes; 217 + aggMatches += ws.matchCount; 218 + totalBundlesCompleted += ws.bundlesCompleted; 219 + } 220 + 221 + const progress = Math.min(totalBundlesCompleted / totalBundles, 1.0); 222 + if (!noProgress) { 223 + const elapsed = (Date.now() - startTime) / 1000; 224 + const mbPerSec = aggBytes / elapsed / 1e6; 225 + const eta = totalBundlesCompleted > 0 ? ((totalBundles - totalBundlesCompleted) / totalBundlesCompleted) * elapsed : 0; 226 + renderProgressBar(elapsed, totalBundlesCompleted, totalBundles, progress, aggMatches, mbPerSec, eta); 227 + } 228 + } else if (msg.type === 'match-batch') { 229 + for (const match of msg.matches) { 230 + matchCount++; 231 + if (format !== 'count') { 232 + console.log(JSON.stringify(match.result)); 233 + } 234 + if (limit && matchCount >= limit) { 235 + shouldStop = true; 236 + workers.forEach(w => w.terminate()); 237 + break; 238 + } 239 + } 240 + } else if (msg.type === 'result') { 241 + totalOps += msg.totalOps; 242 + totalBytes += msg.totalBytes; 243 + resolve(msg); 244 + } 245 + }; 246 + }); 247 + 248 + workerPromises.push(promise); 249 + worker.postMessage({ 250 + dir: bundle['dir'], 251 + start: threadStart, 252 + end: threadEnd, 253 + expression: expression, 254 + useSimple: useSimple, 255 + flush: true, 256 + mode: 'query', 257 + }); 258 + } 259 + 260 + await Promise.all(workerPromises); 261 + workers.forEach(w => w.terminate()); 262 + } 263 + 264 + const elapsed = (Date.now() - startTime) / 1000; 265 + 266 + if (!noProgress) { 267 + const mbPerSec = totalBytes / elapsed / 1e6; 268 + renderProgressBar(elapsed, totalBundles, totalBundles, 1.0, matchCount, mbPerSec, 0); 269 + console.error('\n'); 270 + } 271 + 272 + if (format === 'count') { 273 + console.log(matchCount); 274 + } 275 + 276 + console.error(''); 277 + console.error(`✓ Query complete`); 278 + console.error(` Total operations: ${totalOps.toLocaleString()}`); 279 + console.error(` Matches found: ${matchCount.toLocaleString()} (${totalOps > 0 ? ((matchCount/totalOps)*100).toFixed(2) : '0.00'}%)`); 280 + console.error(` Total bytes: ${(totalBytes / 1e6).toFixed(1)} MB`); 281 + console.error(` Time elapsed: ${elapsed.toFixed(2)}s`); 282 + console.error(` Throughput: ${(totalOps / elapsed).toFixed(0)} ops/sec | ${(totalBytes / elapsed / 1e6).toFixed(1)} MB/s`); 283 + if (threads > 1) { 284 + console.error(` Threads: ${threads}`); 285 + } 286 + } 287 + 288 + // Simple dot notation parser 289 + interface SimplePath { 290 + segments: Array<{ type: 'property' | 'index'; value: string | number }>; 291 + } 292 + 293 + function compileSimplePath(expression: string): SimplePath { 294 + const segments: Array<{ type: 'property' | 'index'; value: string | number }> = []; 295 + 296 + let current = ''; 297 + let i = 0; 298 + 299 + while (i < expression.length) { 300 + const char = expression[i]; 301 + 302 + if (char === '.') { 303 + if (current) { 304 + segments.push({ type: 'property', value: current }); 305 + current = ''; 306 + } 307 + i++; 308 + } else if (char === '[') { 309 + if (current) { 310 + segments.push({ type: 'property', value: current }); 311 + current = ''; 312 + } 313 + i++; 314 + let index = ''; 315 + while (i < expression.length && expression[i] !== ']') { 316 + index += expression[i]; 317 + i++; 318 + } 319 + segments.push({ type: 'index', value: parseInt(index) }); 320 + i++; 321 + } else { 322 + current += char; 323 + i++; 324 + } 325 + } 326 + 327 + if (current) { 328 + segments.push({ type: 'property', value: current }); 329 + } 330 + 331 + return { segments }; 332 + } 333 + 334 + function querySimplePath(obj: any, compiled: SimplePath): any { 335 + let current = obj; 336 + 337 + for (const segment of compiled.segments) { 338 + if (current == null) return null; 339 + 340 + if (segment.type === 'property') { 341 + current = current[segment.value]; 342 + } else { 343 + if (Array.isArray(current)) { 344 + current = current[segment.value as number]; 345 + } else { 346 + return null; 347 + } 348 + } 349 + } 350 + 351 + return current; 352 + } 353 + 354 + function renderProgressBar( 355 + elapsed: number, 356 + current: number, 357 + total: number, 358 + progress: number, 359 + matches: number, 360 + mbPerSec: number, 361 + etaSeconds: number 362 + ) { 363 + const barWidth = 40; 364 + const filledWidth = Math.floor(progress * barWidth); 365 + const hours = Math.floor(elapsed / 3600); 366 + const minutes = Math.floor((elapsed % 3600) / 60); 367 + const seconds = Math.floor(elapsed % 60); 368 + const timeStr = `[${hours.toString().padStart(2, '0')}:${minutes.toString().padStart(2, '0')}:${seconds.toString().padStart(2, '0')}]`; 369 + 370 + let bar = ''; 371 + if (filledWidth === 0) { 372 + bar = '>' + ' '.repeat(barWidth - 1); 373 + } else if (filledWidth >= barWidth) { 374 + bar = '>'.repeat(barWidth); 375 + } else { 376 + bar = '>' + '-'.repeat(filledWidth - 1) + ' '.repeat(barWidth - filledWidth); 377 + } 378 + 379 + const percent = (progress * 100).toFixed(1); 380 + let etaStr = ''; 381 + if (etaSeconds > 0 && etaSeconds < 86400) { 382 + if (etaSeconds < 60) { 383 + etaStr = `${Math.ceil(etaSeconds)}s`; 384 + } else if (etaSeconds < 3600) { 385 + etaStr = `${Math.ceil(etaSeconds / 60)}m`; 386 + } else { 387 + const etaHours = Math.floor(etaSeconds / 3600); 388 + const etaMin = Math.ceil((etaSeconds % 3600) / 60); 389 + etaStr = `${etaHours}h ${etaMin}m`; 390 + } 391 + } 392 + 393 + const matchesStr = matches >= 1000000 ? `${(matches / 1000000).toFixed(1)}M` : matches >= 1000 ? `${(matches / 1000).toFixed(0)}k` : matches.toString(); 394 + const line = etaStr 395 + ? `${timeStr} ${bar} ${current}/${total} (${percent}%) ${matchesStr} matches | ${mbPerSec.toFixed(1)} MB/s [ETA: ${etaStr}]` 396 + : `${timeStr} ${bar} ${current}/${total} (${percent}%) ${matchesStr} matches | ${mbPerSec.toFixed(1)} MB/s`; 397 + 398 + process.stderr.write(`\r${line}\x1b[K`); 399 + }
+138 -70
src/worker.ts
··· 1 /// <reference lib="webworker" /> 2 3 export interface WorkerTask { 4 dir: string; 5 start: number; 6 end: number; 7 - modulePath: string; 8 silent?: boolean; 9 flush?: boolean; 10 - mode?: 'detect' | 'process'; // Add mode parameter 11 } 12 13 export interface WorkerProgress { 14 type: 'progress'; 15 totalOps: number; 16 totalBytes: number; 17 } 18 19 - export interface WorkerMatch { 20 - type: 'match'; 21 - bundle: number; 22 - position: number; 23 - cid: string; 24 - size: number; 25 - labels: string[]; 26 } 27 28 export interface WorkerResult { 29 type: 'result'; 30 totalOps: number; 31 totalBytes: number; 32 - matches: Array<{ 33 - bundle: number; 34 - position: number; 35 - cid: string; 36 - size: number; 37 - labels: string[]; 38 - }>; 39 } 40 41 self.onmessage = async (event: MessageEvent<WorkerTask>) => { 42 - const { dir, start, end, modulePath, silent, flush, mode = 'detect' } = event.data; 43 44 - // Override console if silent 45 if (silent) { 46 globalThis.console = { 47 log: () => {}, ··· 53 } as any; 54 } 55 56 - // Load the appropriate function based on mode 57 - const mod = await import(modulePath); 58 - const userFn = mode === 'detect' 59 - ? (mod.detect || mod.default) 60 - : (mod.process || mod.default); 61 62 let totalOps = 0; 63 let totalBytes = 0; 64 - const matches: any[] = []; 65 66 for (let bundleNum = start; bundleNum <= end; bundleNum++) { 67 const bundlePath = `${dir}${bundleNum.toString().padStart(6, '0')}.jsonl.zst`; ··· 70 const compressed = await Bun.file(bundlePath).arrayBuffer(); 71 const decompressed = Bun.zstdDecompressSync(compressed); 72 const text = new TextDecoder().decode(decompressed); 73 - 74 const lines = text.split('\n'); 75 76 for (let position = 0; position < lines.length; position++) { ··· 79 80 totalOps++; 81 totalBytes += line.length; 82 - 83 const op = JSON.parse(line); 84 85 - if (mode === 'detect') { 86 - // Detection mode - look for labels 87 - const labels = userFn({ op }); 88 - 89 - if (labels && labels.length > 0) { 90 - const match = { 91 - bundle: bundleNum, 92 - position, 93 - cid: op.cid.slice(-4), 94 - size: line.length, 95 - labels, 96 - }; 97 98 - if (flush) { 99 - // Send match immediately 100 - self.postMessage({ 101 - type: 'match', 102 - ...match, 103 - } as WorkerMatch); 104 - } else { 105 - // Buffer matches 106 - matches.push(match); 107 } 108 } 109 } else { 110 - // Process mode - just call the function 111 userFn({ op, position, bundle: bundleNum, line }); 112 } 113 114 if (totalOps % 10000 === 0) { 115 - self.postMessage({ 116 - type: 'progress', 117 - totalOps, 118 - totalBytes, 119 - } as WorkerProgress); 120 } 121 } 122 - } catch (error) { 123 - // Continue on error 124 - } 125 } 126 127 let prepareResult: any; 128 - if (typeof mod.prepare === 'function') { 129 - try { 130 - prepareResult = await mod.prepare(); 131 - } catch (error) { 132 - // Silently ignore finalize errors 133 } 134 } 135 136 - // Send final result 137 - self.postMessage({ 138 - type: 'result', 139 - totalOps, 140 - totalBytes, 141 - matches: flush ? [] : matches, 142 - data: prepareResult || null 143 - } as WorkerResult); 144 - };
··· 1 /// <reference lib="webworker" /> 2 3 + import { search as jmespathSearch } from '@jmespath-community/jmespath'; 4 + 5 export interface WorkerTask { 6 dir: string; 7 start: number; 8 end: number; 9 + modulePath?: string; 10 + expression?: string; 11 + useSimple?: boolean; 12 silent?: boolean; 13 flush?: boolean; 14 + mode?: 'detect' | 'process' | 'query'; 15 } 16 17 export interface WorkerProgress { 18 type: 'progress'; 19 + currentBundle: number; 20 totalOps: number; 21 totalBytes: number; 22 + matchCount: number; 23 } 24 25 + export interface WorkerMatchBatch { 26 + type: 'match-batch'; 27 + matches: Array<{ result: any }>; 28 } 29 30 export interface WorkerResult { 31 type: 'result'; 32 totalOps: number; 33 totalBytes: number; 34 + matchCount: number; 35 + data?: any; 36 } 37 38 self.onmessage = async (event: MessageEvent<WorkerTask>) => { 39 + const { 40 + dir, 41 + start, 42 + end, 43 + modulePath, 44 + expression, 45 + useSimple, 46 + silent, 47 + flush, 48 + mode = 'detect' 49 + } = event.data; 50 51 if (silent) { 52 globalThis.console = { 53 log: () => {}, ··· 59 } as any; 60 } 61 62 + let userFn: any; 63 + let queryFn: ((op: any) => any) | null = null; 64 + 65 + if (mode === 'query') { 66 + // Query mode 67 + if (useSimple) { 68 + const compiled = compileSimplePath(expression!); 69 + 70 + // Ultra-fast path for single property 71 + if (compiled.segments.length === 1 && compiled.segments[0].type === 'property') { 72 + const prop = compiled.segments[0].value as string; 73 + queryFn = (op) => op[prop]; 74 + } else { 75 + queryFn = (op) => querySimplePath(op, compiled); 76 + } 77 + } else { 78 + queryFn = (op) => jmespathSearch(op, expression!); 79 + } 80 + } else { 81 + // Detect or process mode 82 + const mod = await import(modulePath!); 83 + userFn = mode === 'detect' ? (mod.detect || mod.default) : (mod.process || mod.default); 84 + } 85 86 let totalOps = 0; 87 let totalBytes = 0; 88 + let matchCount = 0; 89 + const BATCH_SIZE = 1000; 90 + let matchBatch: any[] = []; 91 + 92 + const flushBatch = () => { 93 + if (matchBatch.length > 0) { 94 + self.postMessage({ type: 'match-batch', matches: matchBatch } as WorkerMatchBatch); 95 + matchBatch = []; 96 + } 97 + }; 98 99 for (let bundleNum = start; bundleNum <= end; bundleNum++) { 100 const bundlePath = `${dir}${bundleNum.toString().padStart(6, '0')}.jsonl.zst`; ··· 103 const compressed = await Bun.file(bundlePath).arrayBuffer(); 104 const decompressed = Bun.zstdDecompressSync(compressed); 105 const text = new TextDecoder().decode(decompressed); 106 const lines = text.split('\n'); 107 108 for (let position = 0; position < lines.length; position++) { ··· 111 112 totalOps++; 113 totalBytes += line.length; 114 const op = JSON.parse(line); 115 116 + if (mode === 'query') { 117 + try { 118 + const result = queryFn!(op); 119 120 + if (result !== null && result !== undefined) { 121 + matchCount++; 122 + matchBatch.push({ result }); 123 + if (matchBatch.length >= BATCH_SIZE) flushBatch(); 124 } 125 + } catch (error) {} 126 + } else if (mode === 'detect') { 127 + const labels = userFn({ op }); 128 + if (labels && labels.length > 0) { 129 + matchCount++; 130 + matchBatch.push({ result: { bundle: bundleNum, position, cid: op.cid.slice(-4), labels } }); 131 + if (matchBatch.length >= BATCH_SIZE) flushBatch(); 132 } 133 } else { 134 + // process mode 135 userFn({ op, position, bundle: bundleNum, line }); 136 } 137 138 if (totalOps % 10000 === 0) { 139 + self.postMessage({ type: 'progress', currentBundle: bundleNum, totalOps, totalBytes, matchCount } as WorkerProgress); 140 } 141 } 142 + 143 + self.postMessage({ type: 'progress', currentBundle: bundleNum, totalOps, totalBytes, matchCount } as WorkerProgress); 144 + } catch (error) {} 145 } 146 + 147 + flushBatch(); 148 149 let prepareResult: any; 150 + if (mode !== 'query' && modulePath) { 151 + const mod = await import(modulePath); 152 + if (typeof mod.prepare === 'function') { 153 + try { 154 + prepareResult = await mod.prepare(); 155 + } catch (error) {} 156 } 157 } 158 159 + self.postMessage({ type: 'result', totalOps, totalBytes, matchCount, data: prepareResult || null } as WorkerResult); 160 + }; 161 + 162 + interface SimplePath { 163 + segments: Array<{ type: 'property' | 'index'; value: string | number }>; 164 + } 165 + 166 + function compileSimplePath(expression: string): SimplePath { 167 + const segments: Array<{ type: 'property' | 'index'; value: string | number }> = []; 168 + let current = ''; 169 + let i = 0; 170 + 171 + while (i < expression.length) { 172 + const char = expression[i]; 173 + if (char === '.') { 174 + if (current) segments.push({ type: 'property', value: current }); 175 + current = ''; 176 + i++; 177 + } else if (char === '[') { 178 + if (current) segments.push({ type: 'property', value: current }); 179 + current = ''; 180 + i++; 181 + let index = ''; 182 + while (i < expression.length && expression[i] !== ']') { 183 + index += expression[i]; 184 + i++; 185 + } 186 + segments.push({ type: 'index', value: parseInt(index) }); 187 + i++; 188 + } else { 189 + current += char; 190 + i++; 191 + } 192 + } 193 + if (current) segments.push({ type: 'property', value: current }); 194 + return { segments }; 195 + } 196 + 197 + function querySimplePath(obj: any, compiled: SimplePath): any { 198 + let current = obj; 199 + for (const segment of compiled.segments) { 200 + if (current == null) return null; 201 + if (segment.type === 'property') { 202 + current = current[segment.value]; 203 + } else { 204 + if (Array.isArray(current)) { 205 + current = current[segment.value as number]; 206 + } else { 207 + return null; 208 + } 209 + } 210 + } 211 + return current; 212 + }