source dump of claude code
at main 793 lines 25 kB view raw
1/** 2 * Portable session storage utilities. 3 * 4 * Pure Node.js — no internal dependencies on logging, experiments, or feature 5 * flags. Shared between the CLI (src/utils/sessionStorage.ts) and the VS Code 6 * extension (packages/claude-vscode/src/common-host/sessionStorage.ts). 7 */ 8 9import type { UUID } from 'crypto' 10import { open as fsOpen, readdir, realpath, stat } from 'fs/promises' 11import { join } from 'path' 12import { getClaudeConfigHomeDir } from './envUtils.js' 13import { getWorktreePathsPortable } from './getWorktreePathsPortable.js' 14import { djb2Hash } from './hash.js' 15 16/** Size of the head/tail buffer for lite metadata reads. */ 17export const LITE_READ_BUF_SIZE = 65536 18 19// --------------------------------------------------------------------------- 20// UUID validation 21// --------------------------------------------------------------------------- 22 23const uuidRegex = 24 /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i 25 26export function validateUuid(maybeUuid: unknown): UUID | null { 27 if (typeof maybeUuid !== 'string') return null 28 return uuidRegex.test(maybeUuid) ? (maybeUuid as UUID) : null 29} 30 31// --------------------------------------------------------------------------- 32// JSON string field extraction — no full parse, works on truncated lines 33// --------------------------------------------------------------------------- 34 35/** 36 * Unescape a JSON string value extracted as raw text. 37 * Only allocates a new string when escape sequences are present. 38 */ 39export function unescapeJsonString(raw: string): string { 40 if (!raw.includes('\\')) return raw 41 try { 42 return JSON.parse(`"${raw}"`) 43 } catch { 44 return raw 45 } 46} 47 48/** 49 * Extracts a simple JSON string field value from raw text without full parsing. 50 * Looks for `"key":"value"` or `"key": "value"` patterns. 51 * Returns the first match, or undefined if not found. 
52 */ 53export function extractJsonStringField( 54 text: string, 55 key: string, 56): string | undefined { 57 const patterns = [`"${key}":"`, `"${key}": "`] 58 for (const pattern of patterns) { 59 const idx = text.indexOf(pattern) 60 if (idx < 0) continue 61 62 const valueStart = idx + pattern.length 63 let i = valueStart 64 while (i < text.length) { 65 if (text[i] === '\\') { 66 i += 2 67 continue 68 } 69 if (text[i] === '"') { 70 return unescapeJsonString(text.slice(valueStart, i)) 71 } 72 i++ 73 } 74 } 75 return undefined 76} 77 78/** 79 * Like extractJsonStringField but finds the LAST occurrence. 80 * Useful for fields that are appended (customTitle, tag, etc.). 81 */ 82export function extractLastJsonStringField( 83 text: string, 84 key: string, 85): string | undefined { 86 const patterns = [`"${key}":"`, `"${key}": "`] 87 let lastValue: string | undefined 88 for (const pattern of patterns) { 89 let searchFrom = 0 90 while (true) { 91 const idx = text.indexOf(pattern, searchFrom) 92 if (idx < 0) break 93 94 const valueStart = idx + pattern.length 95 let i = valueStart 96 while (i < text.length) { 97 if (text[i] === '\\') { 98 i += 2 99 continue 100 } 101 if (text[i] === '"') { 102 lastValue = unescapeJsonString(text.slice(valueStart, i)) 103 break 104 } 105 i++ 106 } 107 searchFrom = i + 1 108 } 109 } 110 return lastValue 111} 112 113// --------------------------------------------------------------------------- 114// First prompt extraction from head chunk 115// --------------------------------------------------------------------------- 116 117/** 118 * Pattern matching auto-generated or system messages that should be skipped 119 * when looking for the first meaningful user prompt. Matches anything that 120 * starts with a lowercase XML-like tag (IDE context, hook output, task 121 * notifications, channel messages, etc.) or a synthetic interrupt marker. 
122 */ 123const SKIP_FIRST_PROMPT_PATTERN = 124 /^(?:\s*<[a-z][\w-]*[\s>]|\[Request interrupted by user[^\]]*\])/ 125 126const COMMAND_NAME_RE = /<command-name>(.*?)<\/command-name>/ 127 128/** 129 * Extracts the first meaningful user prompt from a JSONL head chunk. 130 * 131 * Skips tool_result messages, isMeta, isCompactSummary, command-name messages, 132 * and auto-generated patterns (session hooks, tick, IDE metadata, etc.). 133 * Truncates to 200 chars. 134 */ 135export function extractFirstPromptFromHead(head: string): string { 136 let start = 0 137 let commandFallback = '' 138 while (start < head.length) { 139 const newlineIdx = head.indexOf('\n', start) 140 const line = 141 newlineIdx >= 0 ? head.slice(start, newlineIdx) : head.slice(start) 142 start = newlineIdx >= 0 ? newlineIdx + 1 : head.length 143 144 if (!line.includes('"type":"user"') && !line.includes('"type": "user"')) 145 continue 146 if (line.includes('"tool_result"')) continue 147 if (line.includes('"isMeta":true') || line.includes('"isMeta": true')) 148 continue 149 if ( 150 line.includes('"isCompactSummary":true') || 151 line.includes('"isCompactSummary": true') 152 ) 153 continue 154 155 try { 156 const entry = JSON.parse(line) as Record<string, unknown> 157 if (entry.type !== 'user') continue 158 159 const message = entry.message as Record<string, unknown> | undefined 160 if (!message) continue 161 162 const content = message.content 163 const texts: string[] = [] 164 if (typeof content === 'string') { 165 texts.push(content) 166 } else if (Array.isArray(content)) { 167 for (const block of content as Record<string, unknown>[]) { 168 if (block.type === 'text' && typeof block.text === 'string') { 169 texts.push(block.text as string) 170 } 171 } 172 } 173 174 for (const raw of texts) { 175 let result = raw.replace(/\n/g, ' ').trim() 176 if (!result) continue 177 178 // Skip slash-command messages but remember first as fallback 179 const cmdMatch = COMMAND_NAME_RE.exec(result) 180 if (cmdMatch) 
{ 181 if (!commandFallback) commandFallback = cmdMatch[1]! 182 continue 183 } 184 185 // Format bash input with ! prefix before the generic XML skip 186 const bashMatch = /<bash-input>([\s\S]*?)<\/bash-input>/.exec(result) 187 if (bashMatch) return `! ${bashMatch[1]!.trim()}` 188 189 if (SKIP_FIRST_PROMPT_PATTERN.test(result)) continue 190 191 if (result.length > 200) { 192 result = result.slice(0, 200).trim() + '\u2026' 193 } 194 return result 195 } 196 } catch { 197 continue 198 } 199 } 200 if (commandFallback) return commandFallback 201 return '' 202} 203 204// --------------------------------------------------------------------------- 205// File I/O — read head and tail of a file 206// --------------------------------------------------------------------------- 207 208/** 209 * Reads the first and last LITE_READ_BUF_SIZE bytes of a file. 210 * 211 * For small files where head covers tail, `tail === head`. 212 * Accepts a shared Buffer to avoid per-file allocation overhead. 213 * Returns `{ head: '', tail: '' }` on any error. 
214 */ 215export async function readHeadAndTail( 216 filePath: string, 217 fileSize: number, 218 buf: Buffer, 219): Promise<{ head: string; tail: string }> { 220 try { 221 const fh = await fsOpen(filePath, 'r') 222 try { 223 const headResult = await fh.read(buf, 0, LITE_READ_BUF_SIZE, 0) 224 if (headResult.bytesRead === 0) return { head: '', tail: '' } 225 226 const head = buf.toString('utf8', 0, headResult.bytesRead) 227 228 const tailOffset = Math.max(0, fileSize - LITE_READ_BUF_SIZE) 229 let tail = head 230 if (tailOffset > 0) { 231 const tailResult = await fh.read(buf, 0, LITE_READ_BUF_SIZE, tailOffset) 232 tail = buf.toString('utf8', 0, tailResult.bytesRead) 233 } 234 235 return { head, tail } 236 } finally { 237 await fh.close() 238 } 239 } catch { 240 return { head: '', tail: '' } 241 } 242} 243 244export type LiteSessionFile = { 245 mtime: number 246 size: number 247 head: string 248 tail: string 249} 250 251/** 252 * Opens a single session file, stats it, and reads head + tail in one fd. 253 * Allocates its own buffer — safe for concurrent use with Promise.all. 254 * Returns null on any error. 
255 */ 256export async function readSessionLite( 257 filePath: string, 258): Promise<LiteSessionFile | null> { 259 try { 260 const fh = await fsOpen(filePath, 'r') 261 try { 262 const stat = await fh.stat() 263 const buf = Buffer.allocUnsafe(LITE_READ_BUF_SIZE) 264 const headResult = await fh.read(buf, 0, LITE_READ_BUF_SIZE, 0) 265 if (headResult.bytesRead === 0) return null 266 267 const head = buf.toString('utf8', 0, headResult.bytesRead) 268 const tailOffset = Math.max(0, stat.size - LITE_READ_BUF_SIZE) 269 let tail = head 270 if (tailOffset > 0) { 271 const tailResult = await fh.read(buf, 0, LITE_READ_BUF_SIZE, tailOffset) 272 tail = buf.toString('utf8', 0, tailResult.bytesRead) 273 } 274 275 return { mtime: stat.mtime.getTime(), size: stat.size, head, tail } 276 } finally { 277 await fh.close() 278 } 279 } catch { 280 return null 281 } 282} 283 284// --------------------------------------------------------------------------- 285// Path sanitization 286// --------------------------------------------------------------------------- 287 288/** 289 * Maximum length for a single filesystem path component (directory or file name). 290 * Most filesystems (ext4, APFS, NTFS) limit individual components to 255 bytes. 291 * We use 200 to leave room for the hash suffix and separator. 292 */ 293export const MAX_SANITIZED_LENGTH = 200 294 295function simpleHash(str: string): string { 296 return Math.abs(djb2Hash(str)).toString(36) 297} 298 299/** 300 * Makes a string safe for use as a directory or file name. 301 * Replaces all non-alphanumeric characters with hyphens. 302 * This ensures compatibility across all platforms, including Windows 303 * where characters like colons are reserved. 304 * 305 * For deeply nested paths that would exceed filesystem limits (255 bytes), 306 * truncates and appends a hash suffix for uniqueness. 
307 * 308 * @param name - The string to make safe (e.g., '/Users/foo/my-project' or 'plugin:name:server') 309 * @returns A safe name (e.g., '-Users-foo-my-project' or 'plugin-name-server') 310 */ 311export function sanitizePath(name: string): string { 312 const sanitized = name.replace(/[^a-zA-Z0-9]/g, '-') 313 if (sanitized.length <= MAX_SANITIZED_LENGTH) { 314 return sanitized 315 } 316 const hash = 317 typeof Bun !== 'undefined' ? Bun.hash(name).toString(36) : simpleHash(name) 318 return `${sanitized.slice(0, MAX_SANITIZED_LENGTH)}-${hash}` 319} 320 321// --------------------------------------------------------------------------- 322// Project directory discovery (shared by listSessions & getSessionMessages) 323// --------------------------------------------------------------------------- 324 325export function getProjectsDir(): string { 326 return join(getClaudeConfigHomeDir(), 'projects') 327} 328 329export function getProjectDir(projectDir: string): string { 330 return join(getProjectsDir(), sanitizePath(projectDir)) 331} 332 333/** 334 * Resolves a directory path to its canonical form using realpath + NFC 335 * normalization. Falls back to NFC-only if realpath fails (e.g., the 336 * directory doesn't exist yet). Ensures symlinked paths (e.g., 337 * /tmp → /private/tmp on macOS) resolve to the same project directory. 338 */ 339export async function canonicalizePath(dir: string): Promise<string> { 340 try { 341 return (await realpath(dir)).normalize('NFC') 342 } catch { 343 return dir.normalize('NFC') 344 } 345} 346 347/** 348 * Finds the project directory for a given path, tolerating hash mismatches 349 * for long paths (>200 chars). The CLI uses Bun.hash while the SDK under 350 * Node.js uses simpleHash — for paths that exceed MAX_SANITIZED_LENGTH, 351 * these produce different directory suffixes. This function falls back to 352 * prefix-based scanning when the exact match doesn't exist. 
353 */ 354export async function findProjectDir( 355 projectPath: string, 356): Promise<string | undefined> { 357 const exact = getProjectDir(projectPath) 358 try { 359 await readdir(exact) 360 return exact 361 } catch { 362 // Exact match failed — for short paths this means no sessions exist. 363 // For long paths, try prefix matching to handle hash mismatches. 364 const sanitized = sanitizePath(projectPath) 365 if (sanitized.length <= MAX_SANITIZED_LENGTH) { 366 return undefined 367 } 368 const prefix = sanitized.slice(0, MAX_SANITIZED_LENGTH) 369 const projectsDir = getProjectsDir() 370 try { 371 const dirents = await readdir(projectsDir, { withFileTypes: true }) 372 const match = dirents.find( 373 d => d.isDirectory() && d.name.startsWith(prefix + '-'), 374 ) 375 return match ? join(projectsDir, match.name) : undefined 376 } catch { 377 return undefined 378 } 379 } 380} 381 382/** 383 * Resolve a sessionId to its on-disk JSONL file path. 384 * 385 * When `dir` is provided: canonicalize it, look in that project's directory 386 * (with findProjectDir fallback for Bun/Node hash mismatches), then fall back 387 * to sibling git worktrees. `projectPath` in the result is the canonical 388 * user-facing directory the file was found under. 389 * 390 * When `dir` is omitted: scan all project directories under ~/.claude/projects/. 391 * `projectPath` is undefined in this case (no meaningful project path to report). 392 * 393 * Existence is checked by stat (operate-then-catch-ENOENT, no existsSync). 394 * Zero-byte files are treated as not-found so callers continue searching past 395 * a truncated copy to find a valid one in a sibling directory. 396 * 397 * `fileSize` is returned so callers (loadSessionBuffer) don't need to re-stat. 398 * 399 * Shared by getSessionInfoImpl and getSessionMessagesImpl — the caller 400 * invokes its own reader (readSessionLite / loadSessionBuffer) on the 401 * resolved path. 
402 */ 403export async function resolveSessionFilePath( 404 sessionId: string, 405 dir?: string, 406): Promise< 407 | { filePath: string; projectPath: string | undefined; fileSize: number } 408 | undefined 409> { 410 const fileName = `${sessionId}.jsonl` 411 412 if (dir) { 413 const canonical = await canonicalizePath(dir) 414 const projectDir = await findProjectDir(canonical) 415 if (projectDir) { 416 const filePath = join(projectDir, fileName) 417 try { 418 const s = await stat(filePath) 419 if (s.size > 0) 420 return { filePath, projectPath: canonical, fileSize: s.size } 421 } catch { 422 // ENOENT/EACCES — keep searching 423 } 424 } 425 // Worktree fallback — sessions may live under a different worktree root 426 let worktreePaths: string[] 427 try { 428 worktreePaths = await getWorktreePathsPortable(canonical) 429 } catch { 430 worktreePaths = [] 431 } 432 for (const wt of worktreePaths) { 433 if (wt === canonical) continue 434 const wtProjectDir = await findProjectDir(wt) 435 if (!wtProjectDir) continue 436 const filePath = join(wtProjectDir, fileName) 437 try { 438 const s = await stat(filePath) 439 if (s.size > 0) return { filePath, projectPath: wt, fileSize: s.size } 440 } catch { 441 // ENOENT/EACCES — keep searching 442 } 443 } 444 return undefined 445 } 446 447 // No dir — scan all project directories 448 const projectsDir = getProjectsDir() 449 let dirents: string[] 450 try { 451 dirents = await readdir(projectsDir) 452 } catch { 453 return undefined 454 } 455 for (const name of dirents) { 456 const filePath = join(projectsDir, name, fileName) 457 try { 458 const s = await stat(filePath) 459 if (s.size > 0) 460 return { filePath, projectPath: undefined, fileSize: s.size } 461 } catch { 462 // ENOENT/ENOTDIR — not in this project, keep scanning 463 } 464 } 465 return undefined 466} 467 468// --------------------------------------------------------------------------- 469// Compact-boundary chunked read (shared by loadTranscriptFile & SDK 
getSessionMessages) 470// --------------------------------------------------------------------------- 471 472/** Chunk size for the forward transcript reader. 1 MB balances I/O calls vs buffer growth. */ 473const TRANSCRIPT_READ_CHUNK_SIZE = 1024 * 1024 474 475/** 476 * File size below which precompact filtering is skipped. 477 * Large sessions (>5 MB) almost always have compact boundaries — they got big 478 * because of many turns triggering auto-compact. 479 */ 480export const SKIP_PRECOMPACT_THRESHOLD = 5 * 1024 * 1024 481 482/** Marker bytes searched for when locating the boundary. Lazy: allocated on 483 * first use, not at module load. Most sessions never resume. */ 484let _compactBoundaryMarker: Buffer | undefined 485function compactBoundaryMarker(): Buffer { 486 return (_compactBoundaryMarker ??= Buffer.from('"compact_boundary"')) 487} 488 489/** 490 * Confirm a byte-matched line is a real compact_boundary (marker can appear 491 * inside user content) and check for preservedSegment. 492 */ 493function parseBoundaryLine( 494 line: string, 495): { hasPreservedSegment: boolean } | null { 496 try { 497 const parsed = JSON.parse(line) as { 498 type?: string 499 subtype?: string 500 compactMetadata?: { preservedSegment?: unknown } 501 } 502 if (parsed.type !== 'system' || parsed.subtype !== 'compact_boundary') { 503 return null 504 } 505 return { 506 hasPreservedSegment: Boolean(parsed.compactMetadata?.preservedSegment), 507 } 508 } catch { 509 return null 510 } 511} 512 513/** 514 * Single forward chunked read for the --resume load path. Attr-snap lines 515 * are skipped at the fd level; compact boundaries truncate in-stream. Peak 516 * is the output size, not the file size. 517 * 518 * The surviving (last) attr-snap is appended at EOF instead of in-place; 519 * restoreAttributionStateFromSnapshots only reads [length-1] so position 520 * doesn't matter. 
521 */ 522 523type Sink = { buf: Buffer; len: number; cap: number } 524 525function sinkWrite(s: Sink, src: Buffer, start: number, end: number): void { 526 const n = end - start 527 if (n <= 0) return 528 if (s.len + n > s.buf.length) { 529 const grown = Buffer.allocUnsafe( 530 Math.min(Math.max(s.buf.length * 2, s.len + n), s.cap), 531 ) 532 s.buf.copy(grown, 0, 0, s.len) 533 s.buf = grown 534 } 535 src.copy(s.buf, s.len, start, end) 536 s.len += n 537} 538 539function hasPrefix( 540 src: Buffer, 541 prefix: Buffer, 542 at: number, 543 end: number, 544): boolean { 545 return ( 546 end - at >= prefix.length && 547 src.compare(prefix, 0, prefix.length, at, at + prefix.length) === 0 548 ) 549} 550 551const ATTR_SNAP_PREFIX = Buffer.from('{"type":"attribution-snapshot"') 552const SYSTEM_PREFIX = Buffer.from('{"type":"system"') 553const LF = 0x0a 554const LF_BYTE = Buffer.from([LF]) 555const BOUNDARY_SEARCH_BOUND = 256 // marker sits ~28 bytes in; 256 is slack 556 557type LoadState = { 558 out: Sink 559 boundaryStartOffset: number 560 hasPreservedSegment: boolean 561 lastSnapSrc: Buffer | null // most-recent attr-snap, appended at EOF 562 lastSnapLen: number 563 lastSnapBuf: Buffer | undefined 564 bufFileOff: number // file offset of buf[0] 565 carryLen: number 566 carryBuf: Buffer | undefined 567 straddleSnapCarryLen: number // per-chunk; reset by processStraddle 568 straddleSnapTailEnd: number 569} 570 571// Line spanning the chunk seam. 0 = fall through to concat. 572function processStraddle( 573 s: LoadState, 574 chunk: Buffer, 575 bytesRead: number, 576): number { 577 s.straddleSnapCarryLen = 0 578 s.straddleSnapTailEnd = 0 579 if (s.carryLen === 0) return 0 580 const cb = s.carryBuf! 
581 const firstNl = chunk.indexOf(LF) 582 if (firstNl === -1 || firstNl >= bytesRead) return 0 583 const tailEnd = firstNl + 1 584 if (hasPrefix(cb, ATTR_SNAP_PREFIX, 0, s.carryLen)) { 585 s.straddleSnapCarryLen = s.carryLen 586 s.straddleSnapTailEnd = tailEnd 587 s.lastSnapSrc = null 588 } else if (s.carryLen < ATTR_SNAP_PREFIX.length) { 589 return 0 // too short to rule out attr-snap 590 } else { 591 if (hasPrefix(cb, SYSTEM_PREFIX, 0, s.carryLen)) { 592 const hit = parseBoundaryLine( 593 cb.toString('utf-8', 0, s.carryLen) + 594 chunk.toString('utf-8', 0, firstNl), 595 ) 596 if (hit?.hasPreservedSegment) { 597 s.hasPreservedSegment = true 598 } else if (hit) { 599 s.out.len = 0 600 s.boundaryStartOffset = s.bufFileOff 601 s.hasPreservedSegment = false 602 s.lastSnapSrc = null 603 } 604 } 605 sinkWrite(s.out, cb, 0, s.carryLen) 606 sinkWrite(s.out, chunk, 0, tailEnd) 607 } 608 s.bufFileOff += s.carryLen + tailEnd 609 s.carryLen = 0 610 return tailEnd 611} 612 613// Strip attr-snaps, truncate on boundaries. Kept lines write as runs. 
// Strip attr-snaps, truncate on boundaries. Kept lines write as runs.
//
// Walks every complete line in `buf`, copying kept lines to the output sink
// in contiguous runs (one sinkWrite per run rather than per line). Attr-snap
// lines are excluded from the run and their offsets reported back so the
// caller can remember the most recent one. A confirmed compact_boundary line
// resets the sink and restarts output at that line. Bytes after the last
// newline are the trailing partial line; its start is returned as trailStart
// for the caller to carry into the next chunk.
function scanChunkLines(
  s: LoadState,
  buf: Buffer,
  boundaryMarker: Buffer,
): { lastSnapStart: number; lastSnapEnd: number; trailStart: number } {
  // Byte-level pre-scan: one cheap indexOf for the boundary marker; the hit
  // is only confirmed with a JSON parse when it lands near a line start.
  let boundaryAt = buf.indexOf(boundaryMarker)
  let runStart = 0
  let lineStart = 0
  let lastSnapStart = -1
  let lastSnapEnd = -1
  let nl = buf.indexOf(LF)
  while (nl !== -1) {
    const lineEnd = nl + 1
    // Stale hit from an already-consumed line — re-search from this line.
    if (boundaryAt !== -1 && boundaryAt < lineStart) {
      boundaryAt = buf.indexOf(boundaryMarker, lineStart)
    }
    if (hasPrefix(buf, ATTR_SNAP_PREFIX, lineStart, lineEnd)) {
      // Attr-snap: flush the kept run up to it, then skip the line itself.
      sinkWrite(s.out, buf, runStart, lineStart)
      lastSnapStart = lineStart
      lastSnapEnd = lineEnd
      runStart = lineEnd
    } else if (
      boundaryAt >= lineStart &&
      boundaryAt < Math.min(lineStart + BOUNDARY_SEARCH_BOUND, lineEnd)
    ) {
      // Marker near the head of this line — confirm it's a real boundary
      // (the byte pattern can also occur inside user content).
      const hit = parseBoundaryLine(buf.toString('utf-8', lineStart, nl))
      if (hit?.hasPreservedSegment) {
        s.hasPreservedSegment = true // don't truncate; preserved msgs already in output
      } else if (hit) {
        // Hard boundary: drop everything emitted so far and restart the
        // transcript at this line (the boundary line itself is kept).
        s.out.len = 0
        s.boundaryStartOffset = s.bufFileOff + lineStart
        s.hasPreservedSegment = false
        s.lastSnapSrc = null
        lastSnapStart = -1
        s.straddleSnapCarryLen = 0
        runStart = lineStart
      }
      // Advance past this occurrence regardless of confirmation outcome.
      boundaryAt = buf.indexOf(
        boundaryMarker,
        boundaryAt + boundaryMarker.length,
      )
    }
    lineStart = lineEnd
    nl = buf.indexOf(LF, lineStart)
  }
  // Flush the final run of complete lines; [lineStart, buf.length) is the
  // trailing partial line handled by captureCarry.
  sinkWrite(s.out, buf, runStart, lineStart)
  return { lastSnapStart, lastSnapEnd, trailStart: lineStart }
}
// In-buf snap wins over straddle (later in file). carryBuf still valid here.
//
// Copies the most recent attr-snap line (either found whole in `buf` by
// scanChunkLines, or split across the chunk seam by processStraddle) into a
// reusable side buffer so it survives buffer reuse and can be appended at EOF.
function captureSnap(
  s: LoadState,
  buf: Buffer,
  chunk: Buffer,
  lastSnapStart: number,
  lastSnapEnd: number,
): void {
  if (lastSnapStart !== -1) {
    // Snap found whole within this chunk's line scan.
    s.lastSnapLen = lastSnapEnd - lastSnapStart
    if (s.lastSnapBuf === undefined || s.lastSnapLen > s.lastSnapBuf.length) {
      s.lastSnapBuf = Buffer.allocUnsafe(s.lastSnapLen)
    }
    buf.copy(s.lastSnapBuf, 0, lastSnapStart, lastSnapEnd)
    s.lastSnapSrc = s.lastSnapBuf
  } else if (s.straddleSnapCarryLen > 0) {
    // Snap straddled the seam: reassemble carry head + chunk tail.
    s.lastSnapLen = s.straddleSnapCarryLen + s.straddleSnapTailEnd
    if (s.lastSnapBuf === undefined || s.lastSnapLen > s.lastSnapBuf.length) {
      s.lastSnapBuf = Buffer.allocUnsafe(s.lastSnapLen)
    }
    s.carryBuf!.copy(s.lastSnapBuf, 0, 0, s.straddleSnapCarryLen)
    chunk.copy(s.lastSnapBuf, s.straddleSnapCarryLen, 0, s.straddleSnapTailEnd)
    s.lastSnapSrc = s.lastSnapBuf
  }
}

// Save the trailing partial line [trailStart, buf.length) into the reusable
// carry buffer for the next chunk's processStraddle.
function captureCarry(s: LoadState, buf: Buffer, trailStart: number): void {
  s.carryLen = buf.length - trailStart
  if (s.carryLen > 0) {
    if (s.carryBuf === undefined || s.carryLen > s.carryBuf.length) {
      s.carryBuf = Buffer.allocUnsafe(s.carryLen)
    }
    buf.copy(s.carryBuf, 0, trailStart, buf.length)
  }
}

// At EOF: flush any unterminated final line (unless it is itself an
// attr-snap, which becomes the surviving snap), then append the surviving
// attr-snap on its own line.
function finalizeOutput(s: LoadState): void {
  if (s.carryLen > 0) {
    const cb = s.carryBuf!
    if (hasPrefix(cb, ATTR_SNAP_PREFIX, 0, s.carryLen)) {
      // Crash-truncated trailing attr-snap — treat as the last snap.
      s.lastSnapSrc = cb
      s.lastSnapLen = s.carryLen
    } else {
      sinkWrite(s.out, cb, 0, s.carryLen)
    }
  }
  if (s.lastSnapSrc) {
    // Ensure the appended snap starts on a fresh line.
    if (s.out.len > 0 && s.out.buf[s.out.len - 1] !== LF) {
      sinkWrite(s.out, LF_BYTE, 0, 1)
    }
    sinkWrite(s.out, s.lastSnapSrc, 0, s.lastSnapLen)
  }
}

/**
 * Forward chunked read of a session transcript for the load/--resume path.
 *
 * Streams the file in 1 MB chunks, stripping attribution-snapshot lines and
 * truncating at confirmed compact boundaries as it goes, so peak memory is
 * bounded by the surviving output rather than the whole file. The last
 * attr-snap seen is re-appended at EOF (see finalizeOutput).
 *
 * @param filePath - transcript JSONL file to read
 * @param fileSize - size in bytes (caller already stat'ed; also used as the
 *   sink's hard capacity bound, +1 for a possible inserted LF)
 * @returns the file offset of the last hard boundary (0 if none), the
 *   filtered post-boundary bytes, and whether a preservedSegment boundary
 *   was seen (in which case output was NOT truncated at it)
 */
export async function readTranscriptForLoad(
  filePath: string,
  fileSize: number,
): Promise<{
  boundaryStartOffset: number
  postBoundaryBuf: Buffer
  hasPreservedSegment: boolean
}> {
  const boundaryMarker = compactBoundaryMarker()
  const CHUNK_SIZE = TRANSCRIPT_READ_CHUNK_SIZE

  const s: LoadState = {
    out: {
      // Gated callers enter with fileSize > 5MB, so min(fileSize, 8MB) lands
      // in [5, 8]MB; large boundaryless sessions (24-31MB output) take 2
      // grows. Ungated callers (attribution.ts) pass small files too — the
      // min just right-sizes the initial buf, no grows.
      buf: Buffer.allocUnsafe(Math.min(fileSize, 8 * 1024 * 1024)),
      len: 0,
      // +1: finalizeOutput may insert one LF between a non-LF-terminated
      // carry and the reordered last attr-snap (crash-truncated file).
      cap: fileSize + 1,
    },
    boundaryStartOffset: 0,
    hasPreservedSegment: false,
    lastSnapSrc: null,
    lastSnapLen: 0,
    lastSnapBuf: undefined,
    bufFileOff: 0,
    carryLen: 0,
    carryBuf: undefined,
    straddleSnapCarryLen: 0,
    straddleSnapTailEnd: 0,
  }

  const chunk = Buffer.allocUnsafe(CHUNK_SIZE)
  const fd = await fsOpen(filePath, 'r')
  try {
    let filePos = 0
    while (filePos < fileSize) {
      const { bytesRead } = await fd.read(
        chunk,
        0,
        Math.min(CHUNK_SIZE, fileSize - filePos),
        filePos,
      )
      if (bytesRead === 0) break
      filePos += bytesRead

      // Resolve the line straddling the previous chunk's seam first.
      const chunkOff = processStraddle(s, chunk, bytesRead)

      // If a carry remains (processStraddle fell through), concatenate
      // carry + chunk so scanChunkLines sees whole lines; otherwise scan
      // the chunk in place with a zero-copy subarray.
      let buf: Buffer
      if (s.carryLen > 0) {
        const bufLen = s.carryLen + (bytesRead - chunkOff)
        buf = Buffer.allocUnsafe(bufLen)
        s.carryBuf!.copy(buf, 0, 0, s.carryLen)
        chunk.copy(buf, s.carryLen, chunkOff, bytesRead)
      } else {
        buf = chunk.subarray(chunkOff, bytesRead)
      }

      const r = scanChunkLines(s, buf, boundaryMarker)
      // Order matters: captureSnap reads carryBuf/chunk before captureCarry
      // overwrites the carry for the next iteration.
      captureSnap(s, buf, chunk, r.lastSnapStart, r.lastSnapEnd)
      captureCarry(s, buf, r.trailStart)
      s.bufFileOff += r.trailStart
    }
    finalizeOutput(s)
  } finally {
    await fd.close()
  }

  return {
    boundaryStartOffset: s.boundaryStartOffset,
    postBoundaryBuf: s.out.buf.subarray(0, s.out.len),
    hasPreservedSegment: s.hasPreservedSegment,
  }
}