utils/pdf.ts at main · oppi.li/claude-code

oppi.li / claude-code
fork atom
source dump of claude code
fork atom
claude-code / utils / pdf.ts
at main 300 lines 8.1 kB view raw
wrap content
oppi.li dump from zip 2d ago
63aada3f
import { randomUUID } from 'crypto'
import { mkdir, readdir, readFile, rm } from 'fs/promises'
import { join } from 'path'
import {
  PDF_MAX_EXTRACT_SIZE,
  PDF_TARGET_RAW_SIZE,
} from '../constants/apiLimits.js'
import { errorMessage } from './errors.js'
import { execFileNoThrow } from './execFileNoThrow.js'
import { formatFileSize } from './format.js'
import { getFsImplementation } from './fsOperations.js'
import { getToolResultsDir } from './toolResultStorage.js'
 13
 14export type PDFError = {
 15  reason:
 16    | 'empty'
 17    | 'too_large'
 18    | 'password_protected'
 19    | 'corrupted'
 20    | 'unknown'
 21    | 'unavailable'
 22  message: string
 23}
 24
 25export type PDFResult<T> =
 26  | { success: true; data: T }
 27  | { success: false; error: PDFError }
 28
 29/**
 30 * Read a PDF file and return it as base64-encoded data.
 31 * @param filePath Path to the PDF file
 32 * @returns Result containing PDF data or a structured error
 33 */
 34export async function readPDF(filePath: string): Promise<
 35  PDFResult<{
 36    type: 'pdf'
 37    file: {
 38      filePath: string
 39      base64: string
 40      originalSize: number
 41    }
 42  }>
 43> {
 44  try {
 45    const fs = getFsImplementation()
 46    const stats = await fs.stat(filePath)
 47    const originalSize = stats.size
 48
 49    // Check if file is empty
 50    if (originalSize === 0) {
 51      return {
 52        success: false,
 53        error: { reason: 'empty', message: `PDF file is empty: ${filePath}` },
 54      }
 55    }
 56
 57    // Check if PDF exceeds maximum size
 58    // The API has a 32MB total request limit. After base64 encoding (~33% larger),
 59    // a PDF must be under ~20MB raw to leave room for conversation context.
 60    if (originalSize > PDF_TARGET_RAW_SIZE) {
 61      return {
 62        success: false,
 63        error: {
 64          reason: 'too_large',
 65          message: `PDF file exceeds maximum allowed size of ${formatFileSize(PDF_TARGET_RAW_SIZE)}.`,
 66        },
 67      }
 68    }
 69
 70    const fileBuffer = await readFile(filePath)
 71
 72    // Validate PDF magic bytes — reject files that aren't actually PDFs
 73    // (e.g., HTML files renamed to .pdf) before they enter conversation context.
 74    // Once an invalid PDF document block is in the message history, every subsequent
 75    // API call fails with 400 "The PDF specified was not valid" and the session
 76    // becomes unrecoverable without /clear.
 77    const header = fileBuffer.subarray(0, 5).toString('ascii')
 78    if (!header.startsWith('%PDF-')) {
 79      return {
 80        success: false,
 81        error: {
 82          reason: 'corrupted',
 83          message: `File is not a valid PDF (missing %PDF- header): ${filePath}`,
 84        },
 85      }
 86    }
 87
 88    const base64 = fileBuffer.toString('base64')
 89
 90    // Note: We cannot check page count here without parsing the PDF
 91    // The API will enforce the 100-page limit and return an error if exceeded
 92
 93    return {
 94      success: true,
 95      data: {
 96        type: 'pdf',
 97        file: {
 98          filePath,
 99          base64,
100          originalSize,
101        },
102      },
103    }
104  } catch (e: unknown) {
105    return {
106      success: false,
107      error: {
108        reason: 'unknown',
109        message: errorMessage(e),
110      },
111    }
112  }
113}
114
/**
 * Ask `pdfinfo` (from poppler-utils) how many pages a PDF has.
 *
 * The count is taken from the `Pages:` line of pdfinfo's standard output.
 *
 * @param filePath Path to the PDF file
 * @returns The page count, or `null` when pdfinfo exits with a non-zero code
 *   (for example because it is not available) or prints no parsable
 *   `Pages:` line
 */
export async function getPDFPageCount(
  filePath: string,
): Promise<number | null> {
  const result = await execFileNoThrow('pdfinfo', [filePath], {
    timeout: 10_000,
    useCwd: false,
  })
  if (result.code !== 0) {
    return null
  }

  // The capture group is mandatory, so it is only missing when nothing matched.
  const digits = /^Pages:\s+(\d+)/m.exec(result.stdout)?.[1]
  if (digits === undefined) {
    return null
  }

  const pages = Number.parseInt(digits, 10)
  return Number.isNaN(pages) ? null : pages
}
136
/** Details about one completed page-extraction run. */
type PDFExtractedPagesFile = {
  /** Path of the source PDF, as passed in by the caller. */
  filePath: string
  /** Size of the source PDF in bytes, as reported by `stat`. */
  originalSize: number
  /** Number of `.jpg` files found in `outputDir` after extraction. */
  count: number
  /** Directory the page images were written to. */
  outputDir: string
}

/**
 * Payload of a successful `extractPDFPages` call. The `'parts'` tag marks the
 * PDF as delivered in the form of per-page image files instead of inline data.
 */
export type PDFExtractPagesResult = {
  type: 'parts'
  file: PDFExtractedPagesFile
}
146
// Cached outcome of the pdftoppm probe; `undefined` means "not probed yet".
let pdftoppmAvailable: boolean | undefined

/**
 * Forget the cached pdftoppm probe so that the next availability check runs
 * the binary again. Used by tests only.
 */
export function resetPdftoppmCache(): void {
  pdftoppmAvailable = undefined
}

/**
 * Report whether the `pdftoppm` binary (from poppler-utils) can be run.
 *
 * The binary is probed with `pdftoppm -v` the first time the answer is needed;
 * the outcome is then cached for the lifetime of the process (or until
 * {@link resetPdftoppmCache} is called).
 */
export async function isPdftoppmAvailable(): Promise<boolean> {
  if (pdftoppmAvailable === undefined) {
    const probe = await execFileNoThrow('pdftoppm', ['-v'], {
      timeout: 5000,
      useCwd: false,
    })
    // pdftoppm prints version info to stderr and exits 0 (or sometimes 99 on
    // older versions), so any stderr output also counts as "available".
    pdftoppmAvailable = probe.code === 0 || probe.stderr.length > 0
  }
  return pdftoppmAvailable
}
170
/**
 * Render PDF pages to JPEG images using `pdftoppm` (from poppler-utils).
 *
 * Images are written to a fresh `pdf-<uuid>` directory under the tool-results
 * directory using the output prefix `page`. pdftoppm appends `-<number>.jpg`
 * to that prefix for every page (e.g. `page-01.jpg`); the zero-padding width
 * of the number is chosen by pdftoppm.
 * This enables reading large PDFs and works with all API providers.
 *
 * Failures are reported through the result rather than thrown. Whenever the
 * call does not succeed, the output directory is removed again: its path is
 * only handed to the caller on success, so nothing else could clean it up.
 *
 * @param filePath Path to the PDF file
 * @param options Optional page range (1-indexed, inclusive). A `lastPage` of
 *   `Infinity` is treated as "no upper bound"
 * @returns On success, the output directory and the number of images in it;
 *   otherwise a structured error with reason `empty`, `too_large`,
 *   `unavailable`, `password_protected`, `corrupted` or `unknown`
 */
export async function extractPDFPages(
  filePath: string,
  options?: { firstPage?: number; lastPage?: number },
): Promise<PDFResult<PDFExtractPagesResult>> {
  // Set as soon as the output directory is about to exist, so that the
  // `finally` block below knows what to remove on failure.
  let createdDir: string | undefined
  // Flipped only on the success path; everything else discards the directory.
  let keepCreatedDir = false

  try {
    const fs = getFsImplementation()
    const stats = await fs.stat(filePath)
    const originalSize = stats.size

    if (originalSize === 0) {
      return {
        success: false,
        error: { reason: 'empty', message: `PDF file is empty: ${filePath}` },
      }
    }

    if (originalSize > PDF_MAX_EXTRACT_SIZE) {
      return {
        success: false,
        error: {
          reason: 'too_large',
          message: `PDF file exceeds maximum allowed size for text extraction (${formatFileSize(PDF_MAX_EXTRACT_SIZE)}).`,
        },
      }
    }

    const available = await isPdftoppmAvailable()
    if (!available) {
      return {
        success: false,
        error: {
          reason: 'unavailable',
          message:
            'pdftoppm is not installed. Install poppler-utils (e.g. `brew install poppler` or `apt-get install poppler-utils`) to enable PDF page rendering.',
        },
      }
    }

    const outputDir = join(getToolResultsDir(), `pdf-${randomUUID()}`)
    createdDir = outputDir
    await mkdir(outputDir, { recursive: true })

    // pdftoppm produces files like <prefix>-01.jpg, <prefix>-02.jpg, etc.
    const prefix = join(outputDir, 'page')
    const args = ['-jpeg', '-r', '100']
    if (options?.firstPage) {
      args.push('-f', String(options.firstPage))
    }
    if (options?.lastPage && options.lastPage !== Infinity) {
      args.push('-l', String(options.lastPage))
    }
    args.push(filePath, prefix)
    const { code, stderr } = await execFileNoThrow('pdftoppm', args, {
      timeout: 120_000,
      useCwd: false,
    })

    if (code !== 0) {
      // Classify the failure from pdftoppm's stderr text.
      if (/password/i.test(stderr)) {
        return {
          success: false,
          error: {
            reason: 'password_protected',
            message:
              'PDF is password-protected. Please provide an unprotected version.',
          },
        }
      }
      if (/damaged|corrupt|invalid/i.test(stderr)) {
        return {
          success: false,
          error: {
            reason: 'corrupted',
            message: 'PDF file is corrupted or invalid.',
          },
        }
      }
      return {
        success: false,
        error: { reason: 'unknown', message: `pdftoppm failed: ${stderr}` },
      }
    }

    // Only the number of generated images is reported, so no ordering of the
    // directory listing is needed.
    const entries = await readdir(outputDir)
    const count = entries.filter(f => f.endsWith('.jpg')).length

    if (count === 0) {
      return {
        success: false,
        error: {
          reason: 'corrupted',
          message: 'pdftoppm produced no output pages. The PDF may be invalid.',
        },
      }
    }

    keepCreatedDir = true
    return {
      success: true,
      data: {
        type: 'parts',
        file: {
          filePath,
          originalSize,
          outputDir,
          count,
        },
      },
    }
  } catch (e: unknown) {
    return {
      success: false,
      error: {
        reason: 'unknown',
        message: errorMessage(e),
      },
    }
  } finally {
    if (createdDir !== undefined && !keepCreatedDir) {
      try {
        // `force` makes this a no-op if the directory was never created.
        await rm(createdDir, { recursive: true, force: true })
      } catch {
        // Best-effort cleanup: a removal failure must not replace the result
        // that is already being returned.
      }
    }
  }
}