native-ts/file-index/index.ts at main · nonbinary.computer/claude-code

nonbinary.computer / claude-code
forked from oppi.li/claude-code
fork atom
source dump of claude code
fork atom
claude-code / native-ts / file-index / index.ts
at main 370 lines 12 kB view raw
wrap content
oppi.li dump from zip 9d ago
63aada3f
  1/**
  2 * Pure-TypeScript port of vendor/file-index-src (Rust NAPI module).
  3 *
  4 * The native module wraps nucleo (https://github.com/helix-editor/nucleo) for
  5 * high-performance fuzzy file searching. This port reimplements the same API
  6 * and scoring behavior without native dependencies.
  7 *
  8 * Key API:
  9 *   new FileIndex()
 10 *   .loadFromFileList(fileList: string[]): void   — dedupe + index paths
 11 *   .search(query: string, limit: number): SearchResult[]
 12 *
 13 * Score semantics: lower = better. Score is position-in-results / result-count,
 14 * so the best match is 0.0. Paths containing "test" get a 1.05× penalty (capped
 15 * at 1.0) so non-test files rank slightly higher.
 16 */
 17
 18export type SearchResult = {
 19  path: string
 20  score: number
 21}
 22
 23// nucleo-style scoring constants (approximating fzf-v2 / nucleo bonuses)
 24const SCORE_MATCH = 16
 25const BONUS_BOUNDARY = 8
 26const BONUS_CAMEL = 6
 27const BONUS_CONSECUTIVE = 4
 28const BONUS_FIRST_CHAR = 8
 29const PENALTY_GAP_START = 3
 30const PENALTY_GAP_EXTENSION = 1
 31
 32const TOP_LEVEL_CACHE_LIMIT = 100
 33const MAX_QUERY_LEN = 64
 34// Yield to event loop after this many ms of sync work. Chunk sizes are
 35// time-based (not count-based) so slow machines get smaller chunks and
 36// stay responsive — 5k paths is ~2ms on M-series but could be 15ms+ on
 37// older Windows hardware.
 38const CHUNK_MS = 4
 39
 40// Reusable buffer: records where each needle char matched during the indexOf scan
 41const posBuf = new Int32Array(MAX_QUERY_LEN)
 42
 43export class FileIndex {
 44  private paths: string[] = []
 45  private lowerPaths: string[] = []
 46  private charBits: Int32Array = new Int32Array(0)
 47  private pathLens: Uint16Array = new Uint16Array(0)
 48  private topLevelCache: SearchResult[] | null = null
 49  // During async build, tracks how many paths have bitmap/lowerPath filled.
 50  // search() uses this to search the ready prefix while build continues.
 51  private readyCount = 0
 52
 53  /**
 54   * Load paths from an array of strings.
 55   * This is the main way to populate the index — ripgrep collects files, we just search them.
 56   * Automatically deduplicates paths.
 57   */
 58  loadFromFileList(fileList: string[]): void {
 59    // Deduplicate and filter empty strings (matches Rust HashSet behavior)
 60    const seen = new Set<string>()
 61    const paths: string[] = []
 62    for (const line of fileList) {
 63      if (line.length > 0 && !seen.has(line)) {
 64        seen.add(line)
 65        paths.push(line)
 66      }
 67    }
 68
 69    this.buildIndex(paths)
 70  }
 71
 72  /**
 73   * Async variant: yields to the event loop every ~8–12k paths so large
 74   * indexes (270k+ files) don't block the main thread for >10ms at a time.
 75   * Identical result to loadFromFileList.
 76   *
 77   * Returns { queryable, done }:
 78   *   - queryable: resolves as soon as the first chunk is indexed (search
 79   *     returns partial results). For a 270k-path list this is ~5–10ms of
 80   *     sync work after the paths array is available.
 81   *   - done: resolves when the entire index is built.
 82   */
 83  loadFromFileListAsync(fileList: string[]): {
 84    queryable: Promise<void>
 85    done: Promise<void>
 86  } {
 87    let markQueryable: () => void = () => {}
 88    const queryable = new Promise<void>(resolve => {
 89      markQueryable = resolve
 90    })
 91    const done = this.buildAsync(fileList, markQueryable)
 92    return { queryable, done }
 93  }
 94
 95  private async buildAsync(
 96    fileList: string[],
 97    markQueryable: () => void,
 98  ): Promise<void> {
 99    const seen = new Set<string>()
100    const paths: string[] = []
101    let chunkStart = performance.now()
102    for (let i = 0; i < fileList.length; i++) {
103      const line = fileList[i]!
104      if (line.length > 0 && !seen.has(line)) {
105        seen.add(line)
106        paths.push(line)
107      }
108      // Check every 256 iterations to amortize performance.now() overhead
109      if ((i & 0xff) === 0xff && performance.now() - chunkStart > CHUNK_MS) {
110        await yieldToEventLoop()
111        chunkStart = performance.now()
112      }
113    }
114
115    this.resetArrays(paths)
116
117    chunkStart = performance.now()
118    let firstChunk = true
119    for (let i = 0; i < paths.length; i++) {
120      this.indexPath(i)
121      if ((i & 0xff) === 0xff && performance.now() - chunkStart > CHUNK_MS) {
122        this.readyCount = i + 1
123        if (firstChunk) {
124          markQueryable()
125          firstChunk = false
126        }
127        await yieldToEventLoop()
128        chunkStart = performance.now()
129      }
130    }
131    this.readyCount = paths.length
132    markQueryable()
133  }
134
135  private buildIndex(paths: string[]): void {
136    this.resetArrays(paths)
137    for (let i = 0; i < paths.length; i++) {
138      this.indexPath(i)
139    }
140    this.readyCount = paths.length
141  }
142
143  private resetArrays(paths: string[]): void {
144    const n = paths.length
145    this.paths = paths
146    this.lowerPaths = new Array(n)
147    this.charBits = new Int32Array(n)
148    this.pathLens = new Uint16Array(n)
149    this.readyCount = 0
150    this.topLevelCache = computeTopLevelEntries(paths, TOP_LEVEL_CACHE_LIMIT)
151  }
152
153  // Precompute: lowercase, a–z bitmap, length. Bitmap gives O(1) rejection
154  // of paths missing any needle letter (89% survival for broad queries like
155  // "test" → still a 10%+ free win; 90%+ rejection for rare chars).
156  private indexPath(i: number): void {
157    const lp = this.paths[i]!.toLowerCase()
158    this.lowerPaths[i] = lp
159    const len = lp.length
160    this.pathLens[i] = len
161    let bits = 0
162    for (let j = 0; j < len; j++) {
163      const c = lp.charCodeAt(j)
164      if (c >= 97 && c <= 122) bits |= 1 << (c - 97)
165    }
166    this.charBits[i] = bits
167  }
168
169  /**
170   * Search for files matching the query using fuzzy matching.
171   * Returns top N results sorted by match score.
172   */
173  search(query: string, limit: number): SearchResult[] {
174    if (limit <= 0) return []
175    if (query.length === 0) {
176      if (this.topLevelCache) {
177        return this.topLevelCache.slice(0, limit)
178      }
179      return []
180    }
181
182    // Smart case: lowercase query → case-insensitive; any uppercase → case-sensitive
183    const caseSensitive = query !== query.toLowerCase()
184    const needle = caseSensitive ? query : query.toLowerCase()
185    const nLen = Math.min(needle.length, MAX_QUERY_LEN)
186    const needleChars: string[] = new Array(nLen)
187    let needleBitmap = 0
188    for (let j = 0; j < nLen; j++) {
189      const ch = needle.charAt(j)
190      needleChars[j] = ch
191      const cc = ch.charCodeAt(0)
192      if (cc >= 97 && cc <= 122) needleBitmap |= 1 << (cc - 97)
193    }
194
195    // Upper bound on score assuming every match gets the max boundary bonus.
196    // Used to reject paths whose gap penalties alone make them unable to beat
197    // the current top-k threshold, before the charCodeAt-heavy boundary pass.
198    const scoreCeiling =
199      nLen * (SCORE_MATCH + BONUS_BOUNDARY) + BONUS_FIRST_CHAR + 32
200
201    // Top-k: maintain a sorted-ascending array of the best `limit` matches.
202    // Avoids O(n log n) sort of all matches when we only need `limit` of them.
203    const topK: { path: string; fuzzScore: number }[] = []
204    let threshold = -Infinity
205
206    const { paths, lowerPaths, charBits, pathLens, readyCount } = this
207
208    outer: for (let i = 0; i < readyCount; i++) {
209      // O(1) bitmap reject: path must contain every letter in the needle
210      if ((charBits[i]! & needleBitmap) !== needleBitmap) continue
211
212      const haystack = caseSensitive ? paths[i]! : lowerPaths[i]!
213
214      // Fused indexOf scan: find positions (SIMD-accelerated in JSC/V8) AND
215      // accumulate gap/consecutive terms inline. The greedy-earliest positions
216      // found here are identical to what the charCodeAt scorer would find, so
217      // we score directly from them — no second scan.
218      let pos = haystack.indexOf(needleChars[0]!)
219      if (pos === -1) continue
220      posBuf[0] = pos
221      let gapPenalty = 0
222      let consecBonus = 0
223      let prev = pos
224      for (let j = 1; j < nLen; j++) {
225        pos = haystack.indexOf(needleChars[j]!, prev + 1)
226        if (pos === -1) continue outer
227        posBuf[j] = pos
228        const gap = pos - prev - 1
229        if (gap === 0) consecBonus += BONUS_CONSECUTIVE
230        else gapPenalty += PENALTY_GAP_START + gap * PENALTY_GAP_EXTENSION
231        prev = pos
232      }
233
234      // Gap-bound reject: if the best-case score (all boundary bonuses) minus
235      // known gap penalties can't beat threshold, skip the boundary pass.
236      if (
237        topK.length === limit &&
238        scoreCeiling + consecBonus - gapPenalty <= threshold
239      ) {
240        continue
241      }
242
243      // Boundary/camelCase scoring: check the char before each match position.
244      const path = paths[i]!
245      const hLen = pathLens[i]!
246      let score = nLen * SCORE_MATCH + consecBonus - gapPenalty
247      score += scoreBonusAt(path, posBuf[0]!, true)
248      for (let j = 1; j < nLen; j++) {
249        score += scoreBonusAt(path, posBuf[j]!, false)
250      }
251      score += Math.max(0, 32 - (hLen >> 2))
252
253      if (topK.length < limit) {
254        topK.push({ path, fuzzScore: score })
255        if (topK.length === limit) {
256          topK.sort((a, b) => a.fuzzScore - b.fuzzScore)
257          threshold = topK[0]!.fuzzScore
258        }
259      } else if (score > threshold) {
260        let lo = 0
261        let hi = topK.length
262        while (lo < hi) {
263          const mid = (lo + hi) >> 1
264          if (topK[mid]!.fuzzScore < score) lo = mid + 1
265          else hi = mid
266        }
267        topK.splice(lo, 0, { path, fuzzScore: score })
268        topK.shift()
269        threshold = topK[0]!.fuzzScore
270      }
271    }
272
273    // topK is ascending; reverse to descending (best first)
274    topK.sort((a, b) => b.fuzzScore - a.fuzzScore)
275
276    const matchCount = topK.length
277    const denom = Math.max(matchCount, 1)
278    const results: SearchResult[] = new Array(matchCount)
279
280    for (let i = 0; i < matchCount; i++) {
281      const path = topK[i]!.path
282      const positionScore = i / denom
283      const finalScore = path.includes('test')
284        ? Math.min(positionScore * 1.05, 1.0)
285        : positionScore
286      results[i] = { path, score: finalScore }
287    }
288
289    return results
290  }
291}
292
293/**
294 * Boundary/camelCase bonus for a match at position `pos` in the original-case
295 * path. `first` enables the start-of-string bonus (only for needle[0]).
296 */
297function scoreBonusAt(path: string, pos: number, first: boolean): number {
298  if (pos === 0) return first ? BONUS_FIRST_CHAR : 0
299  const prevCh = path.charCodeAt(pos - 1)
300  if (isBoundary(prevCh)) return BONUS_BOUNDARY
301  if (isLower(prevCh) && isUpper(path.charCodeAt(pos))) return BONUS_CAMEL
302  return 0
303}
304
305function isBoundary(code: number): boolean {
306  // / \ - _ . space
307  return (
308    code === 47 || // /
309    code === 92 || // \
310    code === 45 || // -
311    code === 95 || // _
312    code === 46 || // .
313    code === 32 // space
314  )
315}
316
317function isLower(code: number): boolean {
318  return code >= 97 && code <= 122
319}
320
321function isUpper(code: number): boolean {
322  return code >= 65 && code <= 90
323}
324
325export function yieldToEventLoop(): Promise<void> {
326  return new Promise(resolve => setImmediate(resolve))
327}
328
329export { CHUNK_MS }
330
331/**
332 * Extract unique top-level path segments, sorted by (length asc, then alpha asc).
333 * Handles both Unix (/) and Windows (\) path separators.
334 * Mirrors FileIndex::compute_top_level_entries in lib.rs.
335 */
336function computeTopLevelEntries(
337  paths: string[],
338  limit: number,
339): SearchResult[] {
340  const topLevel = new Set<string>()
341
342  for (const p of paths) {
343    // Split on first / or \ separator
344    let end = p.length
345    for (let i = 0; i < p.length; i++) {
346      const c = p.charCodeAt(i)
347      if (c === 47 || c === 92) {
348        end = i
349        break
350      }
351    }
352    const segment = p.slice(0, end)
353    if (segment.length > 0) {
354      topLevel.add(segment)
355      if (topLevel.size >= limit) break
356    }
357  }
358
359  const sorted = Array.from(topLevel)
360  sorted.sort((a, b) => {
361    const lenDiff = a.length - b.length
362    if (lenDiff !== 0) return lenDiff
363    return a < b ? -1 : a > b ? 1 : 0
364  })
365
366  return sorted.slice(0, limit).map(path => ({ path, score: 0.0 }))
367}
368
369export default FileIndex
370export type { FileIndex as FileIndexType }