// Rust AppView - highly experimental!
// (source-viewer header: at `experiments`, 315 lines, 12 kB as originally published)
1use eyre::{Context, Result}; 2use std::sync::Arc; 3use zstd::bulk::{Compressor, Decompressor}; 4 5const DICT_BYTES: &[u8] = include_bytes!("dicts/post_content_v1.dict"); 6const DICT_VERSION: u8 = 1; // Changed to u8 (1 byte instead of i16) 7pub const COMPRESSION_LEVEL: i32 = 3; 8const MAX_DECOMPRESSED_SIZE: usize = 300_000; // 300KB max (Bluesky's current limit) 9 10// Zstd frame constants (for reconstruction) 11const ZSTD_MAGIC: &[u8] = &[0x28, 0xb5, 0x2f, 0xfd]; // Little-endian 0xFD2FB528 12const DICT_ID_BYTES: &[u8] = &[0x7a, 0xf8, 0xab, 0x69]; // Dictionary ID 1772877946 (little-endian) 13 14/// Post content compression/decompression 15#[derive(Clone)] 16pub struct PostContentCodec { 17 dict: Arc<Vec<u8>>, 18 version: u8, 19} 20 21impl PostContentCodec { 22 /// Create codec (loads embedded dictionary) 23 pub fn new() -> Self { 24 Self { 25 dict: Arc::new(DICT_BYTES.to_vec()), 26 version: DICT_VERSION, 27 } 28 } 29 30 /// Compress post content - aggressively strips all frame metadata 31 /// 32 /// Output format: [dict_version(1 byte)][fhd(1 byte)][compressed_data_blocks] 33 /// Saves: magic(4) + dict_id(4) + FCS(0-2) = 8-10 bytes per post 34 pub fn compress(&self, content: &str) -> Result<Vec<u8>> { 35 let mut compressor = Compressor::with_dictionary(COMPRESSION_LEVEL, &self.dict) 36 .context("Failed to create compressor")?; 37 38 let compressed = compressor 39 .compress(content.as_bytes()) 40 .context("Compression failed")?; 41 42 // Verify we got expected zstd frame 43 if compressed.len() < 9 || &compressed[0..4] != ZSTD_MAGIC { 44 eyre::bail!("Unexpected zstd frame format"); 45 } 46 47 // Determine where to strip (magic+FHD+dict = 9 bytes) 48 let fhd = compressed[4]; 49 let magic_fhd_dict_size = match fhd { 50 0x23 | 0x63 => 9, // Always strip magic(4) + FHD(1) + dict(4) = 9 bytes 51 _ => eyre::bail!("Unexpected frame header descriptor: 0x{:02x}", fhd), 52 }; 53 54 // Build new format: [dict_version(1)][fhd(1)][FCS if present][block_data] 55 // For 
FHD=0x23: strip magic+FHD+dict (9 bytes), keep block data 56 // For FHD=0x63: strip magic+FHD+dict (9 bytes), keep FCS+block data 57 let mut result = Vec::with_capacity(2 + compressed.len() - magic_fhd_dict_size); 58 result.push(self.version); 59 result.push(fhd); // Store the FHD byte 60 result.extend_from_slice(&compressed[magic_fhd_dict_size..]); // Keep FCS (if present) + block data 61 62 Ok(result) 63 } 64 65 /// Decompress post content - reconstructs full zstd frame 66 /// 67 /// Input format: [dict_version(1 byte)][fhd(1 byte)][FCS(0-2 bytes)][block_data] 68 /// Reconstructs: [magic(4)][FHD(1)][dict_id(4)][FCS(0-2)][block_data] 69 pub fn decompress(&self, compressed: &[u8]) -> Result<String> { 70 if compressed.len() < 2 { 71 eyre::bail!("Compressed data too short"); 72 } 73 74 // First byte is our dictionary version 75 let dict_version = compressed[0]; 76 77 // Verify we support this version 78 if dict_version != DICT_VERSION { 79 eyre::bail!("Unsupported dictionary version: {}", dict_version); 80 } 81 82 // Second byte is the FHD (Frame Header Descriptor) 83 let fhd = compressed[1]; 84 85 // Validate FHD 86 if fhd != 0x23 && fhd != 0x63 { 87 eyre::bail!("Invalid frame header descriptor: 0x{:02x}", fhd); 88 } 89 90 // Data starts at byte 2 (after version and FHD) 91 // For FHD=0x23: [block_data] 92 // For FHD=0x63: [FCS(2 bytes)][block_data] 93 let data = &compressed[2..]; 94 95 // Reconstruct full zstd frame 96 let mut frame = Vec::with_capacity(9 + data.len()); 97 frame.extend_from_slice(ZSTD_MAGIC); // 4 bytes 98 frame.push(fhd); // 1 byte 99 frame.extend_from_slice(DICT_ID_BYTES); // 4 bytes 100 frame.extend_from_slice(data); // FCS (if present) + block data 101 102 // Decompress 103 let mut decompressor = 104 Decompressor::with_dictionary(&self.dict).context("Failed to create decompressor")?; 105 106 let decompressed = decompressor 107 .decompress(&frame, MAX_DECOMPRESSED_SIZE) 108 .context("Decompression failed")?; 109 110 
String::from_utf8(decompressed).context("Invalid UTF-8 in decompressed content") 111 } 112 113 /// Get dictionary version (now u8 instead of i16) 114 pub fn version(&self) -> u8 { 115 self.version 116 } 117} 118 119impl Default for PostContentCodec { 120 fn default() -> Self { 121 Self::new() 122 } 123} 124 125#[cfg(test)] 126mod tests { 127 use super::*; 128 129 #[test] 130 fn test_compress_decompress_roundtrip() { 131 let codec = PostContentCodec::new(); 132 133 let original = "This is a test post with some content."; 134 let compressed = codec.compress(original).unwrap(); 135 let decompressed = codec.decompress(&compressed).unwrap(); 136 137 assert_eq!(original, decompressed); 138 assert!(compressed.len() < original.len()); 139 } 140 141 #[test] 142 fn test_aggressive_stripping() { 143 let codec = PostContentCodec::new(); 144 let original = "Test content for aggressive compression"; 145 let compressed = codec.compress(original).unwrap(); 146 147 // Verify format: first byte should be dict version (1) 148 assert_eq!(compressed[0], 1, "First byte should be dict version"); 149 150 // Second byte should be FHD (0x23 or 0x63) 151 assert!( 152 compressed[1] == 0x23 || compressed[1] == 0x63, 153 "Second byte should be FHD (0x23 or 0x63), got: 0x{:02x}", 154 compressed[1] 155 ); 156 157 // Should NOT contain magic number at start 158 if compressed.len() >= 4 { 159 assert_ne!(&compressed[0..4], ZSTD_MAGIC, "Should not start with magic"); 160 } 161 162 // But decompression still works 163 let decompressed = codec.decompress(&compressed).unwrap(); 164 assert_eq!(original, decompressed); 165 } 166 167 #[test] 168 fn test_unicode() { 169 let codec = PostContentCodec::new(); 170 171 let original = "Hello 世界! 
🌍 Testing émojis."; 172 let compressed = codec.compress(original).unwrap(); 173 let decompressed = codec.decompress(&compressed).unwrap(); 174 175 assert_eq!(original, decompressed); 176 } 177 178 #[test] 179 fn test_version() { 180 let codec = PostContentCodec::new(); 181 assert_eq!(codec.version(), 1); 182 } 183 184 #[test] 185 fn test_empty_content() { 186 let codec = PostContentCodec::new(); 187 188 let original = ""; 189 let compressed = codec.compress(original).unwrap(); 190 let decompressed = codec.decompress(&compressed).unwrap(); 191 192 assert_eq!(original, decompressed); 193 } 194 195 #[test] 196 fn test_typical_post() { 197 let codec = PostContentCodec::new(); 198 199 let original = "Just posted a cool photo from my trip! Check it out 📸"; 200 let compressed = codec.compress(original).unwrap(); 201 let decompressed = codec.decompress(&compressed).unwrap(); 202 203 assert_eq!(original, decompressed); 204 println!( 205 "Original: {} bytes, Compressed: {} bytes, Ratio: {:.2}x", 206 original.len(), 207 compressed.len(), 208 original.len() as f64 / compressed.len() as f64 209 ); 210 } 211 212 #[test] 213 fn test_savings_calculation() { 214 let codec = PostContentCodec::new(); 215 let original = "This is a typical post with some text content."; 216 217 // Compress with standard zstd (for comparison) 218 let mut compressor_std = Compressor::with_dictionary(3, DICT_BYTES).unwrap(); 219 let std_compressed = compressor_std.compress(original.as_bytes()).unwrap(); 220 221 // Compress with our aggressive method 222 let our_compressed = codec.compress(original).unwrap(); 223 224 // Calculate savings 225 let bytes_saved = std_compressed.len() - our_compressed.len(); 226 227 println!("Standard zstd: {} bytes", std_compressed.len()); 228 println!("Our format: {} bytes", our_compressed.len()); 229 println!( 230 "Savings: {} bytes ({:.1}%)", 231 bytes_saved, 232 (bytes_saved as f64 / std_compressed.len() as f64) * 100.0 233 ); 234 235 // Should save at least 7 bytes (magic 
+ dict: 4+4-1 = 7, we keep FHD now) 236 assert!(bytes_saved >= 7, "Should save at least 7 bytes"); 237 238 // Verify decompression works 239 let decompressed = codec.decompress(&our_compressed).unwrap(); 240 assert_eq!(original, decompressed); 241 } 242 243 #[test] 244 fn test_both_fhd_types() { 245 let codec = PostContentCodec::new(); 246 247 // Test a post that will use FHD=0x23 (short content, no FCS) 248 let short_content = "Short post"; 249 let compressed_short = codec.compress(short_content).unwrap(); 250 let decompressed_short = codec.decompress(&compressed_short).unwrap(); 251 assert_eq!(short_content, decompressed_short); 252 println!("FHD for short post: 0x{:02x}", compressed_short[1]); 253 254 // Test a post that will use FHD=0x63 (longer content, with FCS) 255 let long_content = 256 "This is a much longer post with a lot of content that should trigger FHD=0x63. " 257 .repeat(10); 258 println!("Original content length: {}", long_content.len()); 259 260 // First, compress with raw zstd to see what it produces 261 let mut raw_compressor = 262 Compressor::with_dictionary(COMPRESSION_LEVEL, DICT_BYTES).unwrap(); 263 let raw_compressed = raw_compressor.compress(long_content.as_bytes()).unwrap(); 264 println!("Raw zstd compressed length: {}", raw_compressed.len()); 265 println!("Raw FHD: 0x{:02x}", raw_compressed[4]); 266 if raw_compressed[4] == 0x63 { 267 let fcs_bytes = &raw_compressed[9..11]; 268 let fcs_value = u16::from_le_bytes([fcs_bytes[0], fcs_bytes[1]]); 269 println!( 270 "Raw FCS bytes: {:02x} {:02x}, value: {}, indicated size: {}", 271 fcs_bytes[0], 272 fcs_bytes[1], 273 fcs_value, 274 fcs_value as usize + 256 275 ); 276 } 277 278 // Now compress with our codec 279 let compressed_long = codec.compress(&long_content).unwrap(); 280 println!("Codec FHD: 0x{:02x}", compressed_long[1]); 281 println!( 282 "Codec compressed length: {}, First 12 bytes: {:02x?}", 283 compressed_long.len(), 284 &compressed_long[..12.min(compressed_long.len())] 285 ); 286 287 
// Try to decompress 288 match codec.decompress(&compressed_long) { 289 Ok(decompressed_long) => { 290 println!( 291 "Decompressed successfully! Length: {}", 292 decompressed_long.len() 293 ); 294 assert_eq!(long_content, decompressed_long); 295 } 296 Err(e) => { 297 println!("Decompression failed: {:?}", e); 298 // Try manual decompression to debug 299 let mut manual_frame = Vec::new(); 300 manual_frame.extend_from_slice(ZSTD_MAGIC); 301 manual_frame.push(compressed_long[1]); // FHD 302 manual_frame.extend_from_slice(DICT_ID_BYTES); 303 manual_frame.extend_from_slice(&compressed_long[2..]); // data 304 305 let mut manual_dec = Decompressor::with_dictionary(DICT_BYTES).unwrap(); 306 match manual_dec.decompress(&manual_frame, MAX_DECOMPRESSED_SIZE) { 307 Ok(d) => println!("Manual decompression worked! Length: {}", d.len()), 308 Err(e2) => println!("Manual decompression also failed: {:?}", e2), 309 } 310 311 panic!("Failed to decompress long post: {}", e); 312 } 313 } 314 } 315}