use eyre::{Context, Result};
use std::sync::Arc;
use zstd::bulk::{Compressor, Decompressor};

const DICT_BYTES: &[u8] = include_bytes!("dicts/post_content_v1.dict");
const DICT_VERSION: u8 = 1; // Changed to u8 (1 byte instead of i16)
pub const COMPRESSION_LEVEL: i32 = 3;
const MAX_DECOMPRESSED_SIZE: usize = 300_000; // 300KB max (Bluesky's current limit)

// Zstd frame constants (for reconstruction)
const ZSTD_MAGIC: &[u8] = &[0x28, 0xb5, 0x2f, 0xfd]; // Little-endian 0xFD2FB528
const DICT_ID_BYTES: &[u8] = &[0x7a, 0xf8, 0xab, 0x69]; // Dictionary ID 1772877946 (little-endian)

/// Post content compression/decompression
#[derive(Clone)]
pub struct PostContentCodec {
    dict: Arc<Vec<u8>>,
    version: u8,
}

impl PostContentCodec {
    /// Create codec (loads embedded dictionary)
    pub fn new() -> Self {
        Self {
            dict: Arc::new(DICT_BYTES.to_vec()),
            version: DICT_VERSION,
        }
    }

    /// Compress post content - aggressively strips frame metadata
    ///
    /// Output format: [dict_version(1 byte)][fhd(1 byte)][FCS(0-2 bytes)][block_data]
    /// Saves: magic(4) + dict_id(4) stripped, version(1) added; FHD and FCS are kept
    /// = net 7 bytes per post
    pub fn compress(&self, content: &str) -> Result<Vec<u8>> {
        let mut compressor = Compressor::with_dictionary(COMPRESSION_LEVEL, &self.dict)
            .context("Failed to create compressor")?;

        let compressed = compressor
            .compress(content.as_bytes())
            .context("Compression failed")?;

        // Verify we got the expected zstd frame
        if compressed.len() < 9 || &compressed[0..4] != ZSTD_MAGIC {
            eyre::bail!("Unexpected zstd frame format");
        }

        // Determine where to strip (magic + FHD + dict_id = 9 bytes)
        let fhd = compressed[4];
        let magic_fhd_dict_size = match fhd {
            0x23 | 0x63 => 9, // Always strip magic(4) + FHD(1) + dict_id(4) = 9 bytes
            _ => eyre::bail!("Unexpected frame header descriptor: 0x{:02x}", fhd),
        };

        // Build new format: [dict_version(1)][fhd(1)][FCS if present][block_data]
        // For FHD=0x23: strip magic+FHD+dict_id (9 bytes), keep block data
        // For FHD=0x63: strip magic+FHD+dict_id (9 bytes), keep FCS + block data
        let mut result = Vec::with_capacity(2 + compressed.len() - magic_fhd_dict_size);
        result.push(self.version);
        result.push(fhd); // Store the FHD byte
        result.extend_from_slice(&compressed[magic_fhd_dict_size..]); // FCS (if present) + block data

        Ok(result)
    }

    /// Decompress post content - reconstructs the full zstd frame
    ///
    /// Input format: [dict_version(1 byte)][fhd(1 byte)][FCS(0-2 bytes)][block_data]
    /// Reconstructs: [magic(4)][FHD(1)][dict_id(4)][FCS(0-2)][block_data]
    pub fn decompress(&self, compressed: &[u8]) -> Result<String> {
        if compressed.len() < 2 {
            eyre::bail!("Compressed data too short");
        }

        // First byte is our dictionary version
        let dict_version = compressed[0];

        // Verify we support this version
        if dict_version != DICT_VERSION {
            eyre::bail!("Unsupported dictionary version: {}", dict_version);
        }

        // Second byte is the FHD (Frame Header Descriptor)
        let fhd = compressed[1];

        // Validate FHD
        if fhd != 0x23 && fhd != 0x63 {
            eyre::bail!("Invalid frame header descriptor: 0x{:02x}", fhd);
        }

        // Data starts at byte 2 (after version and FHD)
        // For FHD=0x23: [block_data]
        // For FHD=0x63: [FCS(2 bytes)][block_data]
        let data = &compressed[2..];

        // Reconstruct the full zstd frame
        let mut frame = Vec::with_capacity(9 + data.len());
        frame.extend_from_slice(ZSTD_MAGIC); // 4 bytes
        frame.push(fhd); // 1 byte
        frame.extend_from_slice(DICT_ID_BYTES); // 4 bytes
        frame.extend_from_slice(data); // FCS (if present) + block data

        // Decompress
        let mut decompressor =
            Decompressor::with_dictionary(&self.dict).context("Failed to create decompressor")?;
        let decompressed = decompressor
            .decompress(&frame, MAX_DECOMPRESSED_SIZE)
            .context("Decompression failed")?;

        String::from_utf8(decompressed).context("Invalid UTF-8 in decompressed content")
    }

    /// Get dictionary version (now u8 instead of i16)
    pub fn version(&self) -> u8 {
        self.version
    }
}

impl Default for PostContentCodec {
    fn default() -> Self {
        Self::new()
    }
}
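// A minimal usage sketch (added for illustration; `example_roundtrip` is not part
// of the original module): compress a post, then round-trip it back to a String.
#[allow(dead_code)]
fn example_roundtrip() -> Result<()> {
    let codec = PostContentCodec::new();
    let packed = codec.compress("hello from the codec")?; // [dict_version][fhd][FCS?][blocks]
    let text = codec.decompress(&packed)?; // the full zstd frame is rebuilt internally
    assert_eq!(text, "hello from the codec");
    Ok(())
}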

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_compress_decompress_roundtrip() {
        let codec = PostContentCodec::new();
        let original = "This is a test post with some content.";

        let compressed = codec.compress(original).unwrap();
        let decompressed = codec.decompress(&compressed).unwrap();

        assert_eq!(original, decompressed);
        assert!(compressed.len() < original.len());
    }

    #[test]
    fn test_aggressive_stripping() {
        let codec = PostContentCodec::new();
        let original = "Test content for aggressive compression";

        let compressed = codec.compress(original).unwrap();

        // Verify format: first byte should be dict version (1)
        assert_eq!(compressed[0], 1, "First byte should be dict version");

        // Second byte should be FHD (0x23 or 0x63)
        assert!(
            compressed[1] == 0x23 || compressed[1] == 0x63,
            "Second byte should be FHD (0x23 or 0x63), got: 0x{:02x}",
            compressed[1]
        );

        // Should NOT contain magic number at start
        if compressed.len() >= 4 {
            assert_ne!(&compressed[0..4], ZSTD_MAGIC, "Should not start with magic");
        }

        // But decompression still works
        let decompressed = codec.decompress(&compressed).unwrap();
        assert_eq!(original, decompressed);
    }

    #[test]
    fn test_unicode() {
        let codec = PostContentCodec::new();
        let original = "Hello 世界! 🌍 Testing émojis.";

        let compressed = codec.compress(original).unwrap();
        let decompressed = codec.decompress(&compressed).unwrap();

        assert_eq!(original, decompressed);
    }

    #[test]
    fn test_version() {
        let codec = PostContentCodec::new();
        assert_eq!(codec.version(), 1);
    }

    #[test]
    fn test_empty_content() {
        let codec = PostContentCodec::new();
        let original = "";

        let compressed = codec.compress(original).unwrap();
        let decompressed = codec.decompress(&compressed).unwrap();

        assert_eq!(original, decompressed);
    }
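
    // Added sketch (not part of the original suite): `decompress` should reject
    // inputs shorter than the two-byte [dict_version][fhd] prefix.
    #[test]
    fn test_rejects_truncated_input() {
        let codec = PostContentCodec::new();
        assert!(codec.decompress(&[]).is_err());
        assert!(codec.decompress(&[DICT_VERSION]).is_err());
    }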

    #[test]
    fn test_typical_post() {
        let codec = PostContentCodec::new();
        let original = "Just posted a cool photo from my trip! Check it out 📸";

        let compressed = codec.compress(original).unwrap();
        let decompressed = codec.decompress(&compressed).unwrap();

        assert_eq!(original, decompressed);
        println!(
            "Original: {} bytes, Compressed: {} bytes, Ratio: {:.2}x",
            original.len(),
            compressed.len(),
            original.len() as f64 / compressed.len() as f64
        );
    }

    #[test]
    fn test_savings_calculation() {
        let codec = PostContentCodec::new();
        let original = "This is a typical post with some text content.";

        // Compress with standard zstd (for comparison)
        let mut compressor_std = Compressor::with_dictionary(3, DICT_BYTES).unwrap();
        let std_compressed = compressor_std.compress(original.as_bytes()).unwrap();

        // Compress with our aggressive method
        let our_compressed = codec.compress(original).unwrap();

        // Calculate savings
        let bytes_saved = std_compressed.len() - our_compressed.len();
        println!("Standard zstd: {} bytes", std_compressed.len());
        println!("Our format: {} bytes", our_compressed.len());
        println!(
            "Savings: {} bytes ({:.1}%)",
            bytes_saved,
            (bytes_saved as f64 / std_compressed.len() as f64) * 100.0
        );

        // Should save at least 7 bytes: magic(4) + dict_id(4) stripped, version(1) added,
        // FHD byte kept
        assert!(bytes_saved >= 7, "Should save at least 7 bytes");

        // Verify decompression works
        let decompressed = codec.decompress(&our_compressed).unwrap();
        assert_eq!(original, decompressed);
    }
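
    // Added sketch (not part of the original suite): corrupting the leading
    // dictionary-version byte should make `decompress` bail out.
    #[test]
    fn test_rejects_unknown_dict_version() {
        let codec = PostContentCodec::new();
        let mut compressed = codec.compress("version check").unwrap();
        compressed[0] = DICT_VERSION.wrapping_add(1); // any unsupported version value
        assert!(codec.decompress(&compressed).is_err());
    }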

    #[test]
    fn test_both_fhd_types() {
        let codec = PostContentCodec::new();

        // Test a post that will use FHD=0x23 (short content, no FCS)
        let short_content = "Short post";
        let compressed_short = codec.compress(short_content).unwrap();
        let decompressed_short = codec.decompress(&compressed_short).unwrap();
        assert_eq!(short_content, decompressed_short);
        println!("FHD for short post: 0x{:02x}", compressed_short[1]);

        // Test a post that will use FHD=0x63 (longer content, with FCS)
        let long_content =
            "This is a much longer post with a lot of content that should trigger FHD=0x63. "
                .repeat(10);
        println!("Original content length: {}", long_content.len());

        // First, compress with raw zstd to see what it produces
        let mut raw_compressor =
            Compressor::with_dictionary(COMPRESSION_LEVEL, DICT_BYTES).unwrap();
        let raw_compressed = raw_compressor.compress(long_content.as_bytes()).unwrap();
        println!("Raw zstd compressed length: {}", raw_compressed.len());
        println!("Raw FHD: 0x{:02x}", raw_compressed[4]);
        if raw_compressed[4] == 0x63 {
            let fcs_bytes = &raw_compressed[9..11];
            let fcs_value = u16::from_le_bytes([fcs_bytes[0], fcs_bytes[1]]);
            println!(
                "Raw FCS bytes: {:02x} {:02x}, value: {}, indicated size: {}",
                fcs_bytes[0],
                fcs_bytes[1],
                fcs_value,
                fcs_value as usize + 256
            );
        }

        // Now compress with our codec
        let compressed_long = codec.compress(&long_content).unwrap();
        println!("Codec FHD: 0x{:02x}", compressed_long[1]);
        println!(
            "Codec compressed length: {}, First 12 bytes: {:02x?}",
            compressed_long.len(),
            &compressed_long[..12.min(compressed_long.len())]
        );

        // Try to decompress
        match codec.decompress(&compressed_long) {
            Ok(decompressed_long) => {
                println!("Decompressed successfully! Length: {}", decompressed_long.len());
                assert_eq!(long_content, decompressed_long);
            }
            Err(e) => {
                println!("Decompression failed: {:?}", e);

                // Try manual decompression to debug
                let mut manual_frame = Vec::new();
                manual_frame.extend_from_slice(ZSTD_MAGIC);
                manual_frame.push(compressed_long[1]); // FHD
                manual_frame.extend_from_slice(DICT_ID_BYTES);
                manual_frame.extend_from_slice(&compressed_long[2..]); // data

                let mut manual_dec = Decompressor::with_dictionary(DICT_BYTES).unwrap();
                match manual_dec.decompress(&manual_frame, MAX_DECOMPRESSED_SIZE) {
                    Ok(d) => println!("Manual decompression worked! Length: {}", d.len()),
                    Err(e2) => println!("Manual decompression also failed: {:?}", e2),
                }

                panic!("Failed to decompress long post: {}", e);
            }
        }
    }
}