source dump of claude code
at main 300 lines 8.1 kB view raw
import { randomUUID } from 'crypto'
import { mkdir, readdir, readFile, rm } from 'fs/promises'
import { join } from 'path'
import {
  PDF_MAX_EXTRACT_SIZE,
  PDF_TARGET_RAW_SIZE,
} from '../constants/apiLimits.js'
import { errorMessage } from './errors.js'
import { execFileNoThrow } from './execFileNoThrow.js'
import { formatFileSize } from './format.js'
import { getFsImplementation } from './fsOperations.js'
import { getToolResultsDir } from './toolResultStorage.js'

/**
 * Structured failure returned by the PDF helpers in this module.
 * `reason` is a machine-readable category; `message` is human-readable text.
 */
export type PDFError = {
  reason:
    | 'empty'
    | 'too_large'
    | 'password_protected'
    | 'corrupted'
    | 'unknown'
    | 'unavailable'
  message: string
}

/**
 * Result wrapper used instead of throwing: either `success: true` with `data`,
 * or `success: false` with a {@link PDFError}.
 */
export type PDFResult<T> =
  | { success: true; data: T }
  | { success: false; error: PDFError }

/**
 * Read a PDF file and return it as base64-encoded data.
 *
 * Never throws: any exception raised while stat-ing or reading the file is
 * converted into an error result with reason `'unknown'`.
 *
 * @param filePath Path to the PDF file
 * @returns Result containing PDF data or a structured error
 */
export async function readPDF(filePath: string): Promise<
  PDFResult<{
    type: 'pdf'
    file: {
      filePath: string
      base64: string
      originalSize: number
    }
  }>
> {
  try {
    const fs = getFsImplementation()
    const stats = await fs.stat(filePath)
    const originalSize = stats.size

    // Check if file is empty
    if (originalSize === 0) {
      return {
        success: false,
        error: { reason: 'empty', message: `PDF file is empty: ${filePath}` },
      }
    }

    // Check if PDF exceeds maximum size
    // The API has a 32MB total request limit. After base64 encoding (~33% larger),
    // a PDF must be under ~20MB raw to leave room for conversation context.
    if (originalSize > PDF_TARGET_RAW_SIZE) {
      return {
        success: false,
        error: {
          reason: 'too_large',
          message: `PDF file exceeds maximum allowed size of ${formatFileSize(PDF_TARGET_RAW_SIZE)}.`,
        },
      }
    }

    const fileBuffer = await readFile(filePath)

    // Validate PDF magic bytes — reject files that aren't actually PDFs
    // (e.g., HTML files renamed to .pdf) before they enter conversation context.
    // Once an invalid PDF document block is in the message history, every subsequent
    // API call fails with 400 "The PDF specified was not valid" and the session
    // becomes unrecoverable without /clear.
    const header = fileBuffer.subarray(0, 5).toString('ascii')
    if (!header.startsWith('%PDF-')) {
      return {
        success: false,
        error: {
          reason: 'corrupted',
          message: `File is not a valid PDF (missing %PDF- header): ${filePath}`,
        },
      }
    }

    const base64 = fileBuffer.toString('base64')

    // Note: We cannot check page count here without parsing the PDF
    // The API will enforce the 100-page limit and return an error if exceeded

    return {
      success: true,
      data: {
        type: 'pdf',
        file: {
          filePath,
          base64,
          originalSize,
        },
      },
    }
  } catch (e: unknown) {
    return {
      success: false,
      error: {
        reason: 'unknown',
        message: errorMessage(e),
      },
    }
  }
}

/**
 * Get the number of pages in a PDF file using `pdfinfo` (from poppler-utils).
 * Returns `null` if pdfinfo exits with a non-zero code or if no `Pages:` line
 * can be parsed from its output.
 *
 * @param filePath Path to the PDF file
 * @returns The page count, or `null` when it cannot be determined
 */
export async function getPDFPageCount(
  filePath: string,
): Promise<number | null> {
  const { code, stdout } = await execFileNoThrow('pdfinfo', [filePath], {
    timeout: 10_000,
    useCwd: false,
  })
  if (code !== 0) {
    return null
  }
  // The count is reported on a line of its own, e.g. "Pages:          12".
  const digits = /^Pages:\s+(\d+)/m.exec(stdout)?.[1]
  if (digits === undefined) {
    return null
  }
  const count = parseInt(digits, 10)
  return Number.isNaN(count) ? null : count
}

/**
 * Successful result of {@link extractPDFPages}: the rendered page images live
 * in `outputDir`, and `count` is the number of `.jpg` files found there.
 */
export type PDFExtractPagesResult = {
  type: 'parts'
  file: {
    filePath: string
    originalSize: number
    count: number
    outputDir: string
  }
}

// Memoized result of the pdftoppm probe; `undefined` means "not probed yet".
let pdftoppmAvailable: boolean | undefined

/**
 * Reset the pdftoppm availability cache. Used by tests only.
 */
export function resetPdftoppmCache(): void {
  pdftoppmAvailable = undefined
}

/**
 * Check whether the `pdftoppm` binary (from poppler-utils) is available.
 * The result is cached for the lifetime of the process.
 */
export async function isPdftoppmAvailable(): Promise<boolean> {
  if (pdftoppmAvailable !== undefined) return pdftoppmAvailable
  const { code, stderr } = await execFileNoThrow('pdftoppm', ['-v'], {
    timeout: 5000,
    useCwd: false,
  })
  // pdftoppm prints version info to stderr and exits 0 (or sometimes 99 on older versions)
  pdftoppmAvailable = code === 0 || stderr.length > 0
  return pdftoppmAvailable
}

/**
 * Best-effort recursive removal of a directory.
 * Errors are swallowed on purpose: this only runs on failure paths, where the
 * error already being reported to the caller must not be masked.
 */
async function removeDirBestEffort(dir: string): Promise<void> {
  try {
    await rm(dir, { recursive: true, force: true })
  } catch {
    // Intentionally ignored — see function comment.
  }
}

/**
 * Extract PDF pages as JPEG images using pdftoppm.
 * Produces page-01.jpg, page-02.jpg, etc. in an output directory.
 * This enables reading large PDFs and works with all API providers.
 *
 * Never throws: exceptions are converted into an error result with reason
 * `'unknown'`. The output directory is handed to the caller only on success;
 * on every failure path after it has been created it is removed again
 * (best effort), since the error result does not expose its path.
 *
 * @param filePath Path to the PDF file
 * @param options Optional page range (1-indexed, inclusive). Falsy values
 *   (including 0) are treated as "not specified"; a `lastPage` of `Infinity`
 *   means "through the last page".
 * @returns Result describing the output directory and page count, or a
 *   structured error
 */
export async function extractPDFPages(
  filePath: string,
  options?: { firstPage?: number; lastPage?: number },
): Promise<PDFResult<PDFExtractPagesResult>> {
  try {
    const fs = getFsImplementation()
    const stats = await fs.stat(filePath)
    const originalSize = stats.size

    if (originalSize === 0) {
      return {
        success: false,
        error: { reason: 'empty', message: `PDF file is empty: ${filePath}` },
      }
    }

    if (originalSize > PDF_MAX_EXTRACT_SIZE) {
      return {
        success: false,
        error: {
          reason: 'too_large',
          message: `PDF file exceeds maximum allowed size for text extraction (${formatFileSize(PDF_MAX_EXTRACT_SIZE)}).`,
        },
      }
    }

    const available = await isPdftoppmAvailable()
    if (!available) {
      return {
        success: false,
        error: {
          reason: 'unavailable',
          message:
            'pdftoppm is not installed. Install poppler-utils (e.g. `brew install poppler` or `apt-get install poppler-utils`) to enable PDF page rendering.',
        },
      }
    }

    const outputDir = join(getToolResultsDir(), `pdf-${randomUUID()}`)
    await mkdir(outputDir, { recursive: true })

    // Flipped to true only when the directory is returned to the caller.
    let keepOutputDir = false
    try {
      // pdftoppm produces files like <prefix>-01.jpg, <prefix>-02.jpg, etc.
      const prefix = join(outputDir, 'page')
      const args = ['-jpeg', '-r', '100']
      if (options?.firstPage) {
        args.push('-f', String(options.firstPage))
      }
      if (options?.lastPage && options.lastPage !== Infinity) {
        args.push('-l', String(options.lastPage))
      }
      args.push(filePath, prefix)
      const { code, stderr } = await execFileNoThrow('pdftoppm', args, {
        timeout: 120_000,
        useCwd: false,
      })

      if (code !== 0) {
        // Classify the failure from pdftoppm's stderr text.
        if (/password/i.test(stderr)) {
          return {
            success: false,
            error: {
              reason: 'password_protected',
              message:
                'PDF is password-protected. Please provide an unprotected version.',
            },
          }
        }
        if (/damaged|corrupt|invalid/i.test(stderr)) {
          return {
            success: false,
            error: {
              reason: 'corrupted',
              message: 'PDF file is corrupted or invalid.',
            },
          }
        }
        return {
          success: false,
          error: { reason: 'unknown', message: `pdftoppm failed: ${stderr}` },
        }
      }

      // Only the number of generated images is reported, so no sorting is needed.
      const entries = await readdir(outputDir)
      const count = entries.filter(f => f.endsWith('.jpg')).length

      if (count === 0) {
        return {
          success: false,
          error: {
            reason: 'corrupted',
            message: 'pdftoppm produced no output pages. The PDF may be invalid.',
          },
        }
      }

      keepOutputDir = true
      return {
        success: true,
        data: {
          type: 'parts',
          file: {
            filePath,
            originalSize,
            outputDir,
            count,
          },
        },
      }
    } finally {
      if (!keepOutputDir) {
        await removeDirBestEffort(outputDir)
      }
    }
  } catch (e: unknown) {
    return {
      success: false,
      error: {
        reason: 'unknown',
        message: errorMessage(e),
      },
    }
  }
}