/**
 * File extensions treated as binary and therefore skipped by text-based
 * operations. Such files cannot be compared meaningfully as text and tend
 * to be large.
 *
 * Every entry is lowercase and includes the leading dot.
 */
export const BINARY_EXTENSIONS = new Set([
  // Images
  '.png',
  '.jpg',
  '.jpeg',
  '.gif',
  '.bmp',
  '.ico',
  '.webp',
  '.tiff',
  '.tif',
  // Videos
  '.mp4',
  '.mov',
  '.avi',
  '.mkv',
  '.webm',
  '.wmv',
  '.flv',
  '.m4v',
  '.mpeg',
  '.mpg',
  // Audio
  '.mp3',
  '.wav',
  '.ogg',
  '.flac',
  '.aac',
  '.m4a',
  '.wma',
  '.aiff',
  '.opus',
  // Archives
  '.zip',
  '.tar',
  '.gz',
  '.bz2',
  '.7z',
  '.rar',
  '.xz',
  '.z',
  '.tgz',
  '.iso',
  // Executables/binaries
  '.exe',
  '.dll',
  '.so',
  '.dylib',
  '.bin',
  '.o',
  '.a',
  '.obj',
  '.lib',
  '.app',
  '.msi',
  '.deb',
  '.rpm',
  // Documents (PDF is here; FileReadTool excludes it at the call site)
  '.pdf',
  '.doc',
  '.docx',
  '.xls',
  '.xlsx',
  '.ppt',
  '.pptx',
  '.odt',
  '.ods',
  '.odp',
  // Fonts
  '.ttf',
  '.otf',
  '.woff',
  '.woff2',
  '.eot',
  // Bytecode / VM artifacts
  '.pyc',
  '.pyo',
  '.class',
  '.jar',
  '.war',
  '.ear',
  '.node',
  '.wasm',
  '.rlib',
  // Database files
  '.sqlite',
  '.sqlite3',
  '.db',
  '.mdb',
  '.idx',
  // Design / 3D
  '.psd',
  '.ai',
  '.eps',
  '.sketch',
  '.fig',
  '.xd',
  '.blend',
  '.3ds',
  '.max',
  // Flash
  '.swf',
  '.fla',
  // Lock/profiling data
  '.lockb',
  '.dat',
  '.data',
])

/**
 * Report whether a path ends in one of the known binary extensions.
 *
 * The extension is everything from the last `.` in the whole path string
 * (not just the final path segment) to the end, compared case-insensitively.
 *
 * @param filePath - Path or file name to inspect.
 * @returns `true` when the extracted suffix is in {@link BINARY_EXTENSIONS}.
 */
export function hasBinaryExtension(filePath: string): boolean {
  // When the path has no dot, lastIndexOf yields -1 and slice(-1) returns just
  // the final character, which cannot match any dot-prefixed entry.
  const lastDot = filePath.lastIndexOf('.')
  const suffix = filePath.slice(lastDot)
  return BINARY_EXTENSIONS.has(suffix.toLowerCase())
}

/**
 * Upper bound on how many leading bytes are sampled when sniffing content.
 */
const BINARY_CHECK_SIZE = 8192

/**
 * Heuristically decide whether a buffer holds binary data.
 *
 * Only the first BINARY_CHECK_SIZE bytes are inspected (the whole buffer if it
 * is shorter). Any null byte in that sample marks the content as binary;
 * otherwise it is binary when more than 10% of the sampled bytes are control
 * characters other than tab, newline and carriage return.
 *
 * @param buffer - Raw bytes to classify.
 * @returns `true` if the sample looks binary, `false` otherwise.
 */
export function isBinaryContent(buffer: Buffer): boolean {
  const sampleLength = Math.min(buffer.length, BINARY_CHECK_SIZE)
  let controlByteCount = 0

  for (const byte of buffer.subarray(0, sampleLength)) {
    // A null byte is a strong indicator of binary content.
    if (byte === 0) {
      return true
    }

    // Tab (9), newline (10) and carriage return (13) are ordinary text.
    const isTextWhitespace = byte === 9 || byte === 10 || byte === 13

    // Only bytes below 32 are counted; bytes of 127 and above are not.
    if (byte < 32 && !isTextWhitespace) {
      controlByteCount += 1
    }
  }

  // For an empty sample this is 0 / 0 = NaN, and NaN > 0.1 is false.
  const controlRatio = controlByteCount / sampleLength
  return controlRatio > 0.1
}