source dump of claude code
at main 370 lines 12 kB view raw
1/** 2 * Pure-TypeScript port of vendor/file-index-src (Rust NAPI module). 3 * 4 * The native module wraps nucleo (https://github.com/helix-editor/nucleo) for 5 * high-performance fuzzy file searching. This port reimplements the same API 6 * and scoring behavior without native dependencies. 7 * 8 * Key API: 9 * new FileIndex() 10 * .loadFromFileList(fileList: string[]): void — dedupe + index paths 11 * .search(query: string, limit: number): SearchResult[] 12 * 13 * Score semantics: lower = better. Score is position-in-results / result-count, 14 * so the best match is 0.0. Paths containing "test" get a 1.05× penalty (capped 15 * at 1.0) so non-test files rank slightly higher. 16 */ 17 18export type SearchResult = { 19 path: string 20 score: number 21} 22 23// nucleo-style scoring constants (approximating fzf-v2 / nucleo bonuses) 24const SCORE_MATCH = 16 25const BONUS_BOUNDARY = 8 26const BONUS_CAMEL = 6 27const BONUS_CONSECUTIVE = 4 28const BONUS_FIRST_CHAR = 8 29const PENALTY_GAP_START = 3 30const PENALTY_GAP_EXTENSION = 1 31 32const TOP_LEVEL_CACHE_LIMIT = 100 33const MAX_QUERY_LEN = 64 34// Yield to event loop after this many ms of sync work. Chunk sizes are 35// time-based (not count-based) so slow machines get smaller chunks and 36// stay responsive — 5k paths is ~2ms on M-series but could be 15ms+ on 37// older Windows hardware. 38const CHUNK_MS = 4 39 40// Reusable buffer: records where each needle char matched during the indexOf scan 41const posBuf = new Int32Array(MAX_QUERY_LEN) 42 43export class FileIndex { 44 private paths: string[] = [] 45 private lowerPaths: string[] = [] 46 private charBits: Int32Array = new Int32Array(0) 47 private pathLens: Uint16Array = new Uint16Array(0) 48 private topLevelCache: SearchResult[] | null = null 49 // During async build, tracks how many paths have bitmap/lowerPath filled. 50 // search() uses this to search the ready prefix while build continues. 51 private readyCount = 0 52 53 /** 54 * Load paths from an array of strings. 55 * This is the main way to populate the index — ripgrep collects files, we just search them. 56 * Automatically deduplicates paths. 57 */ 58 loadFromFileList(fileList: string[]): void { 59 // Deduplicate and filter empty strings (matches Rust HashSet behavior) 60 const seen = new Set<string>() 61 const paths: string[] = [] 62 for (const line of fileList) { 63 if (line.length > 0 && !seen.has(line)) { 64 seen.add(line) 65 paths.push(line) 66 } 67 } 68 69 this.buildIndex(paths) 70 } 71 72 /** 73 * Async variant: yields to the event loop every ~8–12k paths so large 74 * indexes (270k+ files) don't block the main thread for >10ms at a time. 75 * Identical result to loadFromFileList. 76 * 77 * Returns { queryable, done }: 78 * - queryable: resolves as soon as the first chunk is indexed (search 79 * returns partial results). For a 270k-path list this is ~5–10ms of 80 * sync work after the paths array is available. 81 * - done: resolves when the entire index is built. 82 */ 83 loadFromFileListAsync(fileList: string[]): { 84 queryable: Promise<void> 85 done: Promise<void> 86 } { 87 let markQueryable: () => void = () => {} 88 const queryable = new Promise<void>(resolve => { 89 markQueryable = resolve 90 }) 91 const done = this.buildAsync(fileList, markQueryable) 92 return { queryable, done } 93 } 94 95 private async buildAsync( 96 fileList: string[], 97 markQueryable: () => void, 98 ): Promise<void> { 99 const seen = new Set<string>() 100 const paths: string[] = [] 101 let chunkStart = performance.now() 102 for (let i = 0; i < fileList.length; i++) { 103 const line = fileList[i]! 104 if (line.length > 0 && !seen.has(line)) { 105 seen.add(line) 106 paths.push(line) 107 } 108 // Check every 256 iterations to amortize performance.now() overhead 109 if ((i & 0xff) === 0xff && performance.now() - chunkStart > CHUNK_MS) { 110 await yieldToEventLoop() 111 chunkStart = performance.now() 112 } 113 } 114 115 this.resetArrays(paths) 116 117 chunkStart = performance.now() 118 let firstChunk = true 119 for (let i = 0; i < paths.length; i++) { 120 this.indexPath(i) 121 if ((i & 0xff) === 0xff && performance.now() - chunkStart > CHUNK_MS) { 122 this.readyCount = i + 1 123 if (firstChunk) { 124 markQueryable() 125 firstChunk = false 126 } 127 await yieldToEventLoop() 128 chunkStart = performance.now() 129 } 130 } 131 this.readyCount = paths.length 132 markQueryable() 133 } 134 135 private buildIndex(paths: string[]): void { 136 this.resetArrays(paths) 137 for (let i = 0; i < paths.length; i++) { 138 this.indexPath(i) 139 } 140 this.readyCount = paths.length 141 } 142 143 private resetArrays(paths: string[]): void { 144 const n = paths.length 145 this.paths = paths 146 this.lowerPaths = new Array(n) 147 this.charBits = new Int32Array(n) 148 this.pathLens = new Uint16Array(n) 149 this.readyCount = 0 150 this.topLevelCache = computeTopLevelEntries(paths, TOP_LEVEL_CACHE_LIMIT) 151 } 152 153 // Precompute: lowercase, a–z bitmap, length. Bitmap gives O(1) rejection 154 // of paths missing any needle letter (89% survival for broad queries like 155 // "test" → still a 10%+ free win; 90%+ rejection for rare chars). 156 private indexPath(i: number): void { 157 const lp = this.paths[i]!.toLowerCase() 158 this.lowerPaths[i] = lp 159 const len = lp.length 160 this.pathLens[i] = len 161 let bits = 0 162 for (let j = 0; j < len; j++) { 163 const c = lp.charCodeAt(j) 164 if (c >= 97 && c <= 122) bits |= 1 << (c - 97) 165 } 166 this.charBits[i] = bits 167 } 168 169 /** 170 * Search for files matching the query using fuzzy matching. 171 * Returns top N results sorted by match score. 172 */ 173 search(query: string, limit: number): SearchResult[] { 174 if (limit <= 0) return [] 175 if (query.length === 0) { 176 if (this.topLevelCache) { 177 return this.topLevelCache.slice(0, limit) 178 } 179 return [] 180 } 181 182 // Smart case: lowercase query → case-insensitive; any uppercase → case-sensitive 183 const caseSensitive = query !== query.toLowerCase() 184 const needle = caseSensitive ? query : query.toLowerCase() 185 const nLen = Math.min(needle.length, MAX_QUERY_LEN) 186 const needleChars: string[] = new Array(nLen) 187 let needleBitmap = 0 188 for (let j = 0; j < nLen; j++) { 189 const ch = needle.charAt(j) 190 needleChars[j] = ch 191 const cc = ch.charCodeAt(0) 192 if (cc >= 97 && cc <= 122) needleBitmap |= 1 << (cc - 97) 193 } 194 195 // Upper bound on score assuming every match gets the max boundary bonus. 196 // Used to reject paths whose gap penalties alone make them unable to beat 197 // the current top-k threshold, before the charCodeAt-heavy boundary pass. 198 const scoreCeiling = 199 nLen * (SCORE_MATCH + BONUS_BOUNDARY) + BONUS_FIRST_CHAR + 32 200 201 // Top-k: maintain a sorted-ascending array of the best `limit` matches. 202 // Avoids O(n log n) sort of all matches when we only need `limit` of them. 203 const topK: { path: string; fuzzScore: number }[] = [] 204 let threshold = -Infinity 205 206 const { paths, lowerPaths, charBits, pathLens, readyCount } = this 207 208 outer: for (let i = 0; i < readyCount; i++) { 209 // O(1) bitmap reject: path must contain every letter in the needle 210 if ((charBits[i]! & needleBitmap) !== needleBitmap) continue 211 212 const haystack = caseSensitive ? paths[i]! : lowerPaths[i]! 213 214 // Fused indexOf scan: find positions (SIMD-accelerated in JSC/V8) AND 215 // accumulate gap/consecutive terms inline. The greedy-earliest positions 216 // found here are identical to what the charCodeAt scorer would find, so 217 // we score directly from them — no second scan. 218 let pos = haystack.indexOf(needleChars[0]!) 219 if (pos === -1) continue 220 posBuf[0] = pos 221 let gapPenalty = 0 222 let consecBonus = 0 223 let prev = pos 224 for (let j = 1; j < nLen; j++) { 225 pos = haystack.indexOf(needleChars[j]!, prev + 1) 226 if (pos === -1) continue outer 227 posBuf[j] = pos 228 const gap = pos - prev - 1 229 if (gap === 0) consecBonus += BONUS_CONSECUTIVE 230 else gapPenalty += PENALTY_GAP_START + gap * PENALTY_GAP_EXTENSION 231 prev = pos 232 } 233 234 // Gap-bound reject: if the best-case score (all boundary bonuses) minus 235 // known gap penalties can't beat threshold, skip the boundary pass. 236 if ( 237 topK.length === limit && 238 scoreCeiling + consecBonus - gapPenalty <= threshold 239 ) { 240 continue 241 } 242 243 // Boundary/camelCase scoring: check the char before each match position. 244 const path = paths[i]! 245 const hLen = pathLens[i]! 246 let score = nLen * SCORE_MATCH + consecBonus - gapPenalty 247 score += scoreBonusAt(path, posBuf[0]!, true) 248 for (let j = 1; j < nLen; j++) { 249 score += scoreBonusAt(path, posBuf[j]!, false) 250 } 251 score += Math.max(0, 32 - (hLen >> 2)) 252 253 if (topK.length < limit) { 254 topK.push({ path, fuzzScore: score }) 255 if (topK.length === limit) { 256 topK.sort((a, b) => a.fuzzScore - b.fuzzScore) 257 threshold = topK[0]!.fuzzScore 258 } 259 } else if (score > threshold) { 260 let lo = 0 261 let hi = topK.length 262 while (lo < hi) { 263 const mid = (lo + hi) >> 1 264 if (topK[mid]!.fuzzScore < score) lo = mid + 1 265 else hi = mid 266 } 267 topK.splice(lo, 0, { path, fuzzScore: score }) 268 topK.shift() 269 threshold = topK[0]!.fuzzScore 270 } 271 } 272 273 // topK is ascending; reverse to descending (best first) 274 topK.sort((a, b) => b.fuzzScore - a.fuzzScore) 275 276 const matchCount = topK.length 277 const denom = Math.max(matchCount, 1) 278 const results: SearchResult[] = new Array(matchCount) 279 280 for (let i = 0; i < matchCount; i++) { 281 const path = topK[i]!.path 282 const positionScore = i / denom 283 const finalScore = path.includes('test') 284 ? Math.min(positionScore * 1.05, 1.0) 285 : positionScore 286 results[i] = { path, score: finalScore } 287 } 288 289 return results 290 } 291} 292 293/** 294 * Boundary/camelCase bonus for a match at position `pos` in the original-case 295 * path. `first` enables the start-of-string bonus (only for needle[0]). 296 */ 297function scoreBonusAt(path: string, pos: number, first: boolean): number { 298 if (pos === 0) return first ? BONUS_FIRST_CHAR : 0 299 const prevCh = path.charCodeAt(pos - 1) 300 if (isBoundary(prevCh)) return BONUS_BOUNDARY 301 if (isLower(prevCh) && isUpper(path.charCodeAt(pos))) return BONUS_CAMEL 302 return 0 303} 304 305function isBoundary(code: number): boolean { 306 // / \ - _ . space 307 return ( 308 code === 47 || // / 309 code === 92 || // \ 310 code === 45 || // - 311 code === 95 || // _ 312 code === 46 || // . 313 code === 32 // space 314 ) 315} 316 317function isLower(code: number): boolean { 318 return code >= 97 && code <= 122 319} 320 321function isUpper(code: number): boolean { 322 return code >= 65 && code <= 90 323} 324 325export function yieldToEventLoop(): Promise<void> { 326 return new Promise(resolve => setImmediate(resolve)) 327} 328 329export { CHUNK_MS } 330 331/** 332 * Extract unique top-level path segments, sorted by (length asc, then alpha asc). 333 * Handles both Unix (/) and Windows (\) path separators. 334 * Mirrors FileIndex::compute_top_level_entries in lib.rs. 335 */ 336function computeTopLevelEntries( 337 paths: string[], 338 limit: number, 339): SearchResult[] { 340 const topLevel = new Set<string>() 341 342 for (const p of paths) { 343 // Split on first / or \ separator 344 let end = p.length 345 for (let i = 0; i < p.length; i++) { 346 const c = p.charCodeAt(i) 347 if (c === 47 || c === 92) { 348 end = i 349 break 350 } 351 } 352 const segment = p.slice(0, end) 353 if (segment.length > 0) { 354 topLevel.add(segment) 355 if (topLevel.size >= limit) break 356 } 357 } 358 359 const sorted = Array.from(topLevel) 360 sorted.sort((a, b) => { 361 const lenDiff = a.length - b.length 362 if (lenDiff !== 0) return lenDiff 363 return a < b ? -1 : a > b ? 1 : 0 364 }) 365 366 return sorted.slice(0, limit).map(path => ({ path, score: 0.0 })) 367} 368 369export default FileIndex 370export type { FileIndex as FileIndexType }