1use eyre::{Context, Result};
2use std::sync::Arc;
3use zstd::bulk::{Compressor, Decompressor};
4
/// zstd dictionary for post content, embedded in the binary at compile time.
const DICT_BYTES: &[u8] = include_bytes!("dicts/post_content_v1.dict");
/// Dictionary version tag; written as the first byte of every compressed
/// payload and checked again on decompression.
const DICT_VERSION: u8 = 1;
/// zstd compression level used when compressing post content.
pub const COMPRESSION_LEVEL: i32 = 3;
/// Capacity limit handed to the decompressor: 300KB max (Bluesky's current
/// limit, per the original author's note).
const MAX_DECOMPRESSED_SIZE: usize = 300_000;

// Zstd frame constants. These are the header bytes removed from the stored
// payload and put back when the frame is reconstructed for decompression.
/// zstd frame magic number 0xFD2FB528, in little-endian byte order.
const ZSTD_MAGIC: &[u8] = &[0x28, 0xb5, 0x2f, 0xfd];
/// Dictionary ID 1772877946 in little-endian byte order.
/// NOTE(review): presumably the ID embedded in `DICT_BYTES`; re-check this
/// value whenever the dictionary file is regenerated.
const DICT_ID_BYTES: &[u8] = &[0x7a, 0xf8, 0xab, 0x69];
13
/// Post content compression/decompression.
///
/// Cloning is cheap: the dictionary bytes live behind an `Arc`, so clones
/// share one copy of the dictionary.
#[derive(Clone)]
pub struct PostContentCodec {
    /// Raw zstd dictionary bytes used for both compression and decompression.
    dict: Arc<Vec<u8>>,
    /// Dictionary version; written as the first byte of compressed output.
    version: u8,
}
20
21impl PostContentCodec {
22 /// Create codec (loads embedded dictionary)
23 pub fn new() -> Self {
24 Self {
25 dict: Arc::new(DICT_BYTES.to_vec()),
26 version: DICT_VERSION,
27 }
28 }
29
30 /// Compress post content - aggressively strips all frame metadata
31 ///
32 /// Output format: [dict_version(1 byte)][fhd(1 byte)][compressed_data_blocks]
33 /// Saves: magic(4) + dict_id(4) + FCS(0-2) = 8-10 bytes per post
34 pub fn compress(&self, content: &str) -> Result<Vec<u8>> {
35 let mut compressor = Compressor::with_dictionary(COMPRESSION_LEVEL, &self.dict)
36 .context("Failed to create compressor")?;
37
38 let compressed = compressor
39 .compress(content.as_bytes())
40 .context("Compression failed")?;
41
42 // Verify we got expected zstd frame
43 if compressed.len() < 9 || &compressed[0..4] != ZSTD_MAGIC {
44 eyre::bail!("Unexpected zstd frame format");
45 }
46
47 // Determine where to strip (magic+FHD+dict = 9 bytes)
48 let fhd = compressed[4];
49 let magic_fhd_dict_size = match fhd {
50 0x23 | 0x63 => 9, // Always strip magic(4) + FHD(1) + dict(4) = 9 bytes
51 _ => eyre::bail!("Unexpected frame header descriptor: 0x{:02x}", fhd),
52 };
53
54 // Build new format: [dict_version(1)][fhd(1)][FCS if present][block_data]
55 // For FHD=0x23: strip magic+FHD+dict (9 bytes), keep block data
56 // For FHD=0x63: strip magic+FHD+dict (9 bytes), keep FCS+block data
57 let mut result = Vec::with_capacity(2 + compressed.len() - magic_fhd_dict_size);
58 result.push(self.version);
59 result.push(fhd); // Store the FHD byte
60 result.extend_from_slice(&compressed[magic_fhd_dict_size..]); // Keep FCS (if present) + block data
61
62 Ok(result)
63 }
64
65 /// Decompress post content - reconstructs full zstd frame
66 ///
67 /// Input format: [dict_version(1 byte)][fhd(1 byte)][FCS(0-2 bytes)][block_data]
68 /// Reconstructs: [magic(4)][FHD(1)][dict_id(4)][FCS(0-2)][block_data]
69 pub fn decompress(&self, compressed: &[u8]) -> Result<String> {
70 if compressed.len() < 2 {
71 eyre::bail!("Compressed data too short");
72 }
73
74 // First byte is our dictionary version
75 let dict_version = compressed[0];
76
77 // Verify we support this version
78 if dict_version != DICT_VERSION {
79 eyre::bail!("Unsupported dictionary version: {}", dict_version);
80 }
81
82 // Second byte is the FHD (Frame Header Descriptor)
83 let fhd = compressed[1];
84
85 // Validate FHD
86 if fhd != 0x23 && fhd != 0x63 {
87 eyre::bail!("Invalid frame header descriptor: 0x{:02x}", fhd);
88 }
89
90 // Data starts at byte 2 (after version and FHD)
91 // For FHD=0x23: [block_data]
92 // For FHD=0x63: [FCS(2 bytes)][block_data]
93 let data = &compressed[2..];
94
95 // Reconstruct full zstd frame
96 let mut frame = Vec::with_capacity(9 + data.len());
97 frame.extend_from_slice(ZSTD_MAGIC); // 4 bytes
98 frame.push(fhd); // 1 byte
99 frame.extend_from_slice(DICT_ID_BYTES); // 4 bytes
100 frame.extend_from_slice(data); // FCS (if present) + block data
101
102 // Decompress
103 let mut decompressor =
104 Decompressor::with_dictionary(&self.dict).context("Failed to create decompressor")?;
105
106 let decompressed = decompressor
107 .decompress(&frame, MAX_DECOMPRESSED_SIZE)
108 .context("Decompression failed")?;
109
110 String::from_utf8(decompressed).context("Invalid UTF-8 in decompressed content")
111 }
112
    /// Dictionary version this codec writes as the first byte of its
    /// compressed output.
    pub fn version(&self) -> u8 {
        self.version
    }
117}
118
119impl Default for PostContentCodec {
120 fn default() -> Self {
121 Self::new()
122 }
123}
124
125#[cfg(test)]
126mod tests {
127 use super::*;
128
129 #[test]
130 fn test_compress_decompress_roundtrip() {
131 let codec = PostContentCodec::new();
132
133 let original = "This is a test post with some content.";
134 let compressed = codec.compress(original).unwrap();
135 let decompressed = codec.decompress(&compressed).unwrap();
136
137 assert_eq!(original, decompressed);
138 assert!(compressed.len() < original.len());
139 }
140
141 #[test]
142 fn test_aggressive_stripping() {
143 let codec = PostContentCodec::new();
144 let original = "Test content for aggressive compression";
145 let compressed = codec.compress(original).unwrap();
146
147 // Verify format: first byte should be dict version (1)
148 assert_eq!(compressed[0], 1, "First byte should be dict version");
149
150 // Second byte should be FHD (0x23 or 0x63)
151 assert!(
152 compressed[1] == 0x23 || compressed[1] == 0x63,
153 "Second byte should be FHD (0x23 or 0x63), got: 0x{:02x}",
154 compressed[1]
155 );
156
157 // Should NOT contain magic number at start
158 if compressed.len() >= 4 {
159 assert_ne!(&compressed[0..4], ZSTD_MAGIC, "Should not start with magic");
160 }
161
162 // But decompression still works
163 let decompressed = codec.decompress(&compressed).unwrap();
164 assert_eq!(original, decompressed);
165 }
166
167 #[test]
168 fn test_unicode() {
169 let codec = PostContentCodec::new();
170
171 let original = "Hello 世界! 🌍 Testing émojis.";
172 let compressed = codec.compress(original).unwrap();
173 let decompressed = codec.decompress(&compressed).unwrap();
174
175 assert_eq!(original, decompressed);
176 }
177
178 #[test]
179 fn test_version() {
180 let codec = PostContentCodec::new();
181 assert_eq!(codec.version(), 1);
182 }
183
184 #[test]
185 fn test_empty_content() {
186 let codec = PostContentCodec::new();
187
188 let original = "";
189 let compressed = codec.compress(original).unwrap();
190 let decompressed = codec.decompress(&compressed).unwrap();
191
192 assert_eq!(original, decompressed);
193 }
194
195 #[test]
196 fn test_typical_post() {
197 let codec = PostContentCodec::new();
198
199 let original = "Just posted a cool photo from my trip! Check it out 📸";
200 let compressed = codec.compress(original).unwrap();
201 let decompressed = codec.decompress(&compressed).unwrap();
202
203 assert_eq!(original, decompressed);
204 println!(
205 "Original: {} bytes, Compressed: {} bytes, Ratio: {:.2}x",
206 original.len(),
207 compressed.len(),
208 original.len() as f64 / compressed.len() as f64
209 );
210 }
211
212 #[test]
213 fn test_savings_calculation() {
214 let codec = PostContentCodec::new();
215 let original = "This is a typical post with some text content.";
216
217 // Compress with standard zstd (for comparison)
218 let mut compressor_std = Compressor::with_dictionary(3, DICT_BYTES).unwrap();
219 let std_compressed = compressor_std.compress(original.as_bytes()).unwrap();
220
221 // Compress with our aggressive method
222 let our_compressed = codec.compress(original).unwrap();
223
224 // Calculate savings
225 let bytes_saved = std_compressed.len() - our_compressed.len();
226
227 println!("Standard zstd: {} bytes", std_compressed.len());
228 println!("Our format: {} bytes", our_compressed.len());
229 println!(
230 "Savings: {} bytes ({:.1}%)",
231 bytes_saved,
232 (bytes_saved as f64 / std_compressed.len() as f64) * 100.0
233 );
234
235 // Should save at least 7 bytes (magic + dict: 4+4-1 = 7, we keep FHD now)
236 assert!(bytes_saved >= 7, "Should save at least 7 bytes");
237
238 // Verify decompression works
239 let decompressed = codec.decompress(&our_compressed).unwrap();
240 assert_eq!(original, decompressed);
241 }
242
243 #[test]
244 fn test_both_fhd_types() {
245 let codec = PostContentCodec::new();
246
247 // Test a post that will use FHD=0x23 (short content, no FCS)
248 let short_content = "Short post";
249 let compressed_short = codec.compress(short_content).unwrap();
250 let decompressed_short = codec.decompress(&compressed_short).unwrap();
251 assert_eq!(short_content, decompressed_short);
252 println!("FHD for short post: 0x{:02x}", compressed_short[1]);
253
254 // Test a post that will use FHD=0x63 (longer content, with FCS)
255 let long_content =
256 "This is a much longer post with a lot of content that should trigger FHD=0x63. "
257 .repeat(10);
258 println!("Original content length: {}", long_content.len());
259
260 // First, compress with raw zstd to see what it produces
261 let mut raw_compressor =
262 Compressor::with_dictionary(COMPRESSION_LEVEL, DICT_BYTES).unwrap();
263 let raw_compressed = raw_compressor.compress(long_content.as_bytes()).unwrap();
264 println!("Raw zstd compressed length: {}", raw_compressed.len());
265 println!("Raw FHD: 0x{:02x}", raw_compressed[4]);
266 if raw_compressed[4] == 0x63 {
267 let fcs_bytes = &raw_compressed[9..11];
268 let fcs_value = u16::from_le_bytes([fcs_bytes[0], fcs_bytes[1]]);
269 println!(
270 "Raw FCS bytes: {:02x} {:02x}, value: {}, indicated size: {}",
271 fcs_bytes[0],
272 fcs_bytes[1],
273 fcs_value,
274 fcs_value as usize + 256
275 );
276 }
277
278 // Now compress with our codec
279 let compressed_long = codec.compress(&long_content).unwrap();
280 println!("Codec FHD: 0x{:02x}", compressed_long[1]);
281 println!(
282 "Codec compressed length: {}, First 12 bytes: {:02x?}",
283 compressed_long.len(),
284 &compressed_long[..12.min(compressed_long.len())]
285 );
286
287 // Try to decompress
288 match codec.decompress(&compressed_long) {
289 Ok(decompressed_long) => {
290 println!(
291 "Decompressed successfully! Length: {}",
292 decompressed_long.len()
293 );
294 assert_eq!(long_content, decompressed_long);
295 }
296 Err(e) => {
297 println!("Decompression failed: {:?}", e);
298 // Try manual decompression to debug
299 let mut manual_frame = Vec::new();
300 manual_frame.extend_from_slice(ZSTD_MAGIC);
301 manual_frame.push(compressed_long[1]); // FHD
302 manual_frame.extend_from_slice(DICT_ID_BYTES);
303 manual_frame.extend_from_slice(&compressed_long[2..]); // data
304
305 let mut manual_dec = Decompressor::with_dictionary(DICT_BYTES).unwrap();
306 match manual_dec.decompress(&manual_frame, MAX_DECOMPRESSED_SIZE) {
307 Ok(d) => println!("Manual decompression worked! Length: {}", d.len()),
308 Err(e2) => println!("Manual decompression also failed: {:?}", e2),
309 }
310
311 panic!("Failed to decompress long post: {}", e);
312 }
313 }
314 }
315}