we (web engine): Experimental web browser project to understand the limits of Claude
1//! DEFLATE decompressor (RFC 1951).
2//!
3//! Supports all three block types:
4//! - Non-compressed (BTYPE=00)
5//! - Fixed Huffman codes (BTYPE=01)
6//! - Dynamic Huffman codes (BTYPE=10)
7
8use std::fmt;
9
10// ---------------------------------------------------------------------------
11// Error type
12// ---------------------------------------------------------------------------
13
14/// Errors that can occur during DEFLATE decompression.
15#[derive(Debug, Clone, PartialEq, Eq)]
16pub enum DeflateError {
17 /// Unexpected end of input data.
18 UnexpectedEof,
19 /// Invalid block type (BTYPE=11 is reserved).
20 InvalidBlockType(u8),
21 /// Non-compressed block length check failed (LEN != ~NLEN).
22 LenMismatch { len: u16, nlen: u16 },
23 /// Invalid Huffman code encountered.
24 InvalidCode,
25 /// Invalid code length code in dynamic block header.
26 InvalidCodeLengths,
27 /// Back-reference distance exceeds available output.
28 InvalidDistance { distance: usize, available: usize },
29 /// Huffman tree is over-subscribed or otherwise invalid.
30 InvalidHuffmanTree,
31}
32
33impl fmt::Display for DeflateError {
34 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
35 match self {
36 Self::UnexpectedEof => write!(f, "unexpected end of input"),
37 Self::InvalidBlockType(b) => write!(f, "invalid block type: {b}"),
38 Self::LenMismatch { len, nlen } => {
39 write!(
40 f,
41 "non-compressed block length mismatch: LEN={len}, NLEN={nlen}"
42 )
43 }
44 Self::InvalidCode => write!(f, "invalid Huffman code"),
45 Self::InvalidCodeLengths => write!(f, "invalid code length codes"),
46 Self::InvalidDistance {
47 distance,
48 available,
49 } => {
50 write!(
51 f,
52 "invalid back-reference: distance {distance} exceeds available {available}"
53 )
54 }
55 Self::InvalidHuffmanTree => write!(f, "invalid Huffman tree"),
56 }
57 }
58}
59
60pub type Result<T> = std::result::Result<T, DeflateError>;
61
62// ---------------------------------------------------------------------------
63// Bit reader
64// ---------------------------------------------------------------------------
65
66/// Reads bits from a byte slice, LSB-first (as required by DEFLATE).
67struct BitReader<'a> {
68 data: &'a [u8],
69 pos: usize,
70 bit_buf: u32,
71 bits_in_buf: u8,
72}
73
74impl<'a> BitReader<'a> {
75 fn new(data: &'a [u8]) -> Self {
76 Self {
77 data,
78 pos: 0,
79 bit_buf: 0,
80 bits_in_buf: 0,
81 }
82 }
83
84 /// Ensure at least `count` bits are available in the buffer.
85 fn fill(&mut self, count: u8) -> Result<()> {
86 while self.bits_in_buf < count {
87 if self.pos >= self.data.len() {
88 return Err(DeflateError::UnexpectedEof);
89 }
90 self.bit_buf |= (self.data[self.pos] as u32) << self.bits_in_buf;
91 self.pos += 1;
92 self.bits_in_buf += 8;
93 }
94 Ok(())
95 }
96
97 /// Fill the buffer with as many bits as possible, up to `count`.
98 /// Does not error on EOF — just fills what's available.
99 fn fill_available(&mut self, count: u8) {
100 while self.bits_in_buf < count {
101 if self.pos >= self.data.len() {
102 return;
103 }
104 self.bit_buf |= (self.data[self.pos] as u32) << self.bits_in_buf;
105 self.pos += 1;
106 self.bits_in_buf += 8;
107 }
108 }
109
110 /// Peek at the lowest `count` bits without consuming them.
111 fn peek(&self, count: u8) -> u32 {
112 self.bit_buf & ((1u32 << count) - 1)
113 }
114
115 /// Consume `count` bits from the buffer (must have been fill'd first).
116 fn consume(&mut self, count: u8) {
117 self.bit_buf >>= count;
118 self.bits_in_buf -= count;
119 }
120
121 /// Read `count` bits (up to 25) as a u32, LSB first.
122 fn read_bits(&mut self, count: u8) -> Result<u32> {
123 debug_assert!(count <= 25);
124 self.fill(count)?;
125 let val = self.peek(count);
126 self.consume(count);
127 Ok(val)
128 }
129
130 /// Align to the next byte boundary by discarding remaining bits.
131 fn align_to_byte(&mut self) {
132 self.bit_buf = 0;
133 self.bits_in_buf = 0;
134 }
135
136 /// Read a raw byte (after byte-alignment).
137 fn read_byte(&mut self) -> Result<u8> {
138 if self.pos >= self.data.len() {
139 return Err(DeflateError::UnexpectedEof);
140 }
141 let b = self.data[self.pos];
142 self.pos += 1;
143 Ok(b)
144 }
145
146 /// Read `n` raw bytes into a buffer (after byte-alignment).
147 fn read_bytes(&mut self, buf: &mut [u8]) -> Result<()> {
148 let n = buf.len();
149 if self.pos + n > self.data.len() {
150 return Err(DeflateError::UnexpectedEof);
151 }
152 buf.copy_from_slice(&self.data[self.pos..self.pos + n]);
153 self.pos += n;
154 Ok(())
155 }
156}
157
158// ---------------------------------------------------------------------------
159// Huffman tree
160// ---------------------------------------------------------------------------
161
162/// Maximum number of bits in a Huffman code for DEFLATE.
163const MAX_BITS: usize = 15;
164
165/// A Huffman decoding table built from code lengths.
166///
167/// Uses a two-level lookup: a primary table indexed by up to `primary_bits`
168/// bits, and secondary tables for longer codes.
169struct HuffmanTable {
170 /// Lookup table entries. Each entry is either:
171 /// - A leaf: (symbol << 4) | code_length (code_length > 0)
172 /// - A subtable pointer: (subtable_offset << 4) | 0, where offset is into `entries`
173 entries: Vec<u32>,
174 primary_bits: u8,
175}
176
177impl HuffmanTable {
178 /// Build a Huffman table from code lengths.
179 ///
180 /// `code_lengths[i]` is the bit length of symbol `i`. A length of 0 means
181 /// the symbol does not appear.
182 fn from_code_lengths(code_lengths: &[u8], primary_bits: u8) -> Result<Self> {
183 let max_code_len = code_lengths.iter().copied().max().unwrap_or(0) as usize;
184
185 if max_code_len > MAX_BITS {
186 return Err(DeflateError::InvalidHuffmanTree);
187 }
188
189 // Count codes of each length
190 let mut bl_count = [0u32; MAX_BITS + 1];
191 for &len in code_lengths {
192 bl_count[len as usize] += 1;
193 }
194 bl_count[0] = 0;
195
196 // Check for empty tree (all lengths are 0)
197 let total_codes: u32 = bl_count[1..].iter().sum();
198 if total_codes == 0 {
199 // Empty tree — return a table that always fails to decode
200 let size = 1 << primary_bits;
201 return Ok(Self {
202 entries: vec![0; size],
203 primary_bits,
204 });
205 }
206
207 // Compute next_code for each bit length (RFC 1951 §3.2.2)
208 let mut next_code = [0u32; MAX_BITS + 1];
209 let mut code = 0u32;
210 for bits in 1..=max_code_len {
211 code = (code + bl_count[bits - 1]) << 1;
212 next_code[bits] = code;
213 }
214
215 // Assign codes to symbols
216 let mut codes = vec![0u32; code_lengths.len()];
217 let mut lengths = vec![0u8; code_lengths.len()];
218 for (sym, &len) in code_lengths.iter().enumerate() {
219 if len > 0 {
220 codes[sym] = next_code[len as usize];
221 lengths[sym] = len;
222 next_code[len as usize] += 1;
223 }
224 }
225
226 // Build the lookup table
227 let primary_size = 1usize << primary_bits;
228 let mut entries = vec![0u32; primary_size];
229
230 // For codes that fit in primary_bits, fill all matching entries
231 for (sym, (&code_val, &code_len)) in codes.iter().zip(lengths.iter()).enumerate() {
232 if code_len == 0 {
233 continue;
234 }
235 if code_len <= primary_bits {
236 // Reverse the code bits for LSB-first lookup
237 let reversed = reverse_bits(code_val, code_len);
238 let extra_bits = primary_bits - code_len;
239 let base = reversed as usize;
240 // Fill all entries where the extra bits vary
241 for i in 0..(1usize << extra_bits) {
242 let index = base | (i << code_len);
243 entries[index] = ((sym as u32) << 4) | (code_len as u32);
244 }
245 }
246 }
247
248 // For codes longer than primary_bits, build secondary tables
249 if max_code_len as u8 > primary_bits {
250 // Group long codes by their primary_bits prefix
251 type SubtableEntry = (u32, u8, u16);
252 let mut subtables: Vec<(usize, Vec<SubtableEntry>)> = Vec::new();
253
254 for (sym, (&code_val, &code_len)) in codes.iter().zip(lengths.iter()).enumerate() {
255 if code_len == 0 || code_len <= primary_bits {
256 continue;
257 }
258 let reversed = reverse_bits(code_val, code_len);
259 let primary_index = (reversed as usize) & (primary_size - 1);
260 let secondary_bits = reversed >> primary_bits;
261 let secondary_len = code_len - primary_bits;
262
263 // Find or create subtable for this primary index
264 let pos = subtables.iter().position(|(idx, _)| *idx == primary_index);
265 match pos {
266 Some(p) => {
267 subtables[p]
268 .1
269 .push((secondary_bits, secondary_len, sym as u16));
270 }
271 None => {
272 subtables.push((
273 primary_index,
274 vec![(secondary_bits, secondary_len, sym as u16)],
275 ));
276 }
277 }
278 }
279
280 for (primary_index, symbols) in &subtables {
281 let max_secondary = symbols.iter().map(|(_, l, _)| *l).max().unwrap_or(0);
282 let subtable_size = 1usize << max_secondary;
283 let subtable_offset = entries.len();
284
285 entries.resize(entries.len() + subtable_size, 0);
286
287 for &(sec_bits, sec_len, sym) in symbols {
288 let extra = max_secondary - sec_len;
289 let base = sec_bits as usize;
290 for i in 0..(1usize << extra) {
291 let index = subtable_offset + base + (i << sec_len);
292 let total_len = primary_bits + sec_len;
293 entries[index] = ((sym as u32) << 4) | (total_len as u32);
294 }
295 }
296
297 // Store subtable pointer in primary entry
298 // Format: (subtable_offset << 4) | 0, with high bit indicating subtable
299 // We use bit pattern: offset in upper bits, max_secondary in bits [1..4], flag=0 in bit 0
300 // Actually simpler: store (offset << 8) | (max_secondary << 4) with code_len = 0
301 entries[*primary_index] =
302 ((subtable_offset as u32) << 8) | ((max_secondary as u32) << 4);
303 }
304 }
305
306 Ok(Self {
307 entries,
308 primary_bits,
309 })
310 }
311
312 /// Decode one symbol from the bit stream.
313 fn decode(&self, reader: &mut BitReader<'_>) -> Result<u16> {
314 // Fill as many bits as possible up to primary_bits.
315 // Near the end of stream we may have fewer bits than primary_bits,
316 // but enough for the actual code (e.g., 7-bit end-of-block in a 9-bit table).
317 reader.fill_available(self.primary_bits);
318 if reader.bits_in_buf == 0 {
319 return Err(DeflateError::UnexpectedEof);
320 }
321 // Peek only as many bits as we actually have (padded with zeros)
322 let peek_bits = std::cmp::min(self.primary_bits, reader.bits_in_buf);
323 let bits = reader.peek(peek_bits);
324 // If we have fewer bits than primary_bits, the upper bits are implicitly 0,
325 // which means only codes whose upper bits are 0 can match. This works for
326 // end-of-stream where the last code is padded with zeros.
327 let entry = self.entries[bits as usize];
328
329 let code_len = entry & 0xF;
330 if code_len > 0 {
331 if code_len as u8 > reader.bits_in_buf {
332 return Err(DeflateError::UnexpectedEof);
333 }
334 // Direct hit — consume only the actual code length
335 reader.consume(code_len as u8);
336 return Ok((entry >> 4) as u16);
337 }
338
339 // Subtable lookup — consume primary_bits, then peek into subtable
340 reader.consume(self.primary_bits);
341 let subtable_offset = (entry >> 8) as usize;
342 let max_secondary = ((entry >> 4) & 0xF) as u8;
343
344 reader.fill(max_secondary)?;
345 let sec_bits = reader.peek(max_secondary);
346 let sub_entry = self.entries[subtable_offset + sec_bits as usize];
347 let sub_code_len = sub_entry & 0xF;
348
349 if sub_code_len == 0 {
350 return Err(DeflateError::InvalidCode);
351 }
352
353 let actual_secondary = sub_code_len as u8 - self.primary_bits;
354 reader.consume(actual_secondary);
355
356 Ok((sub_entry >> 4) as u16)
357 }
358}
359
360/// Reverse the bottom `len` bits of `code`.
361fn reverse_bits(code: u32, len: u8) -> u32 {
362 let mut result = 0u32;
363 let mut c = code;
364 for _ in 0..len {
365 result = (result << 1) | (c & 1);
366 c >>= 1;
367 }
368 result
369}
370
371// ---------------------------------------------------------------------------
372// Fixed Huffman tables (RFC 1951 §3.2.6)
373// ---------------------------------------------------------------------------
374
375fn fixed_literal_lengths() -> [u8; 288] {
376 let mut lengths = [0u8; 288];
377 lengths[..=143].fill(8);
378 lengths[144..=255].fill(9);
379 lengths[256..=279].fill(7);
380 lengths[280..=287].fill(8);
381 lengths
382}
383
384fn fixed_distance_lengths() -> [u8; 32] {
385 [5u8; 32]
386}
387
388// ---------------------------------------------------------------------------
389// Length and distance base values (RFC 1951 §3.2.5)
390// ---------------------------------------------------------------------------
391
392/// (base_length, extra_bits) for length codes 257..285
393const LENGTH_TABLE: [(u16, u8); 29] = [
394 (3, 0),
395 (4, 0),
396 (5, 0),
397 (6, 0),
398 (7, 0),
399 (8, 0),
400 (9, 0),
401 (10, 0),
402 (11, 1),
403 (13, 1),
404 (15, 1),
405 (17, 1),
406 (19, 2),
407 (23, 2),
408 (27, 2),
409 (31, 2),
410 (35, 3),
411 (43, 3),
412 (51, 3),
413 (59, 3),
414 (67, 4),
415 (83, 4),
416 (99, 4),
417 (115, 4),
418 (131, 5),
419 (163, 5),
420 (195, 5),
421 (227, 5),
422 (258, 0),
423];
424
425/// (base_distance, extra_bits) for distance codes 0..29
426const DISTANCE_TABLE: [(u16, u8); 30] = [
427 (1, 0),
428 (2, 0),
429 (3, 0),
430 (4, 0),
431 (5, 1),
432 (7, 1),
433 (9, 2),
434 (13, 2),
435 (17, 3),
436 (25, 3),
437 (33, 4),
438 (49, 4),
439 (65, 5),
440 (97, 5),
441 (129, 6),
442 (193, 6),
443 (257, 7),
444 (385, 7),
445 (513, 8),
446 (769, 8),
447 (1025, 9),
448 (1537, 9),
449 (2049, 10),
450 (3073, 10),
451 (4097, 11),
452 (6145, 11),
453 (8193, 12),
454 (12289, 12),
455 (16385, 13),
456 (24577, 13),
457];
458
459/// Order in which code length code lengths appear (RFC 1951 §3.2.7).
460const CODE_LENGTH_ORDER: [usize; 19] = [
461 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15,
462];
463
464// ---------------------------------------------------------------------------
465// Decompressor
466// ---------------------------------------------------------------------------
467
468/// Decompress a DEFLATE compressed data stream per RFC 1951.
469pub fn inflate(input: &[u8]) -> Result<Vec<u8>> {
470 let mut reader = BitReader::new(input);
471 let mut output = Vec::new();
472
473 loop {
474 let bfinal = reader.read_bits(1)?;
475 let btype = reader.read_bits(2)? as u8;
476
477 match btype {
478 0 => decode_uncompressed(&mut reader, &mut output)?,
479 1 => decode_fixed(&mut reader, &mut output)?,
480 2 => decode_dynamic(&mut reader, &mut output)?,
481 _ => return Err(DeflateError::InvalidBlockType(btype)),
482 }
483
484 if bfinal == 1 {
485 break;
486 }
487 }
488
489 Ok(output)
490}
491
492/// Decode a non-compressed block (BTYPE=00).
493fn decode_uncompressed(reader: &mut BitReader<'_>, output: &mut Vec<u8>) -> Result<()> {
494 reader.align_to_byte();
495
496 let lo = reader.read_byte()? as u16;
497 let hi = reader.read_byte()? as u16;
498 let len = lo | (hi << 8);
499
500 let nlo = reader.read_byte()? as u16;
501 let nhi = reader.read_byte()? as u16;
502 let nlen = nlo | (nhi << 8);
503
504 if len != !nlen {
505 return Err(DeflateError::LenMismatch { len, nlen });
506 }
507
508 let mut buf = vec![0u8; len as usize];
509 reader.read_bytes(&mut buf)?;
510 output.extend_from_slice(&buf);
511
512 Ok(())
513}
514
515/// Decode a block with fixed Huffman codes (BTYPE=01).
516fn decode_fixed(reader: &mut BitReader<'_>, output: &mut Vec<u8>) -> Result<()> {
517 let lit_lengths = fixed_literal_lengths();
518 let dist_lengths = fixed_distance_lengths();
519
520 let lit_table = HuffmanTable::from_code_lengths(&lit_lengths, 9)?;
521 let dist_table = HuffmanTable::from_code_lengths(&dist_lengths, 5)?;
522
523 decode_compressed(reader, &lit_table, &dist_table, output)
524}
525
526/// Decode a block with dynamic Huffman codes (BTYPE=10).
527fn decode_dynamic(reader: &mut BitReader<'_>, output: &mut Vec<u8>) -> Result<()> {
528 let hlit = reader.read_bits(5)? as usize + 257;
529 let hdist = reader.read_bits(5)? as usize + 1;
530 let hclen = reader.read_bits(4)? as usize + 4;
531
532 // Read code length code lengths
533 let mut cl_lengths = [0u8; 19];
534 for i in 0..hclen {
535 cl_lengths[CODE_LENGTH_ORDER[i]] = reader.read_bits(3)? as u8;
536 }
537
538 let cl_table = HuffmanTable::from_code_lengths(&cl_lengths, 7)?;
539
540 // Decode literal/length and distance code lengths
541 let total = hlit + hdist;
542 let mut combined_lengths = vec![0u8; total];
543 let mut i = 0;
544
545 while i < total {
546 let sym = cl_table.decode(reader)?;
547
548 match sym {
549 0..=15 => {
550 combined_lengths[i] = sym as u8;
551 i += 1;
552 }
553 16 => {
554 // Repeat previous code length 3-6 times
555 if i == 0 {
556 return Err(DeflateError::InvalidCodeLengths);
557 }
558 let repeat = reader.read_bits(2)? as usize + 3;
559 let prev = combined_lengths[i - 1];
560 for _ in 0..repeat {
561 if i >= total {
562 return Err(DeflateError::InvalidCodeLengths);
563 }
564 combined_lengths[i] = prev;
565 i += 1;
566 }
567 }
568 17 => {
569 // Repeat 0 for 3-10 times
570 let repeat = reader.read_bits(3)? as usize + 3;
571 for _ in 0..repeat {
572 if i >= total {
573 return Err(DeflateError::InvalidCodeLengths);
574 }
575 combined_lengths[i] = 0;
576 i += 1;
577 }
578 }
579 18 => {
580 // Repeat 0 for 11-138 times
581 let repeat = reader.read_bits(7)? as usize + 11;
582 for _ in 0..repeat {
583 if i >= total {
584 return Err(DeflateError::InvalidCodeLengths);
585 }
586 combined_lengths[i] = 0;
587 i += 1;
588 }
589 }
590 _ => return Err(DeflateError::InvalidCodeLengths),
591 }
592 }
593
594 let lit_lengths = &combined_lengths[..hlit];
595 let dist_lengths = &combined_lengths[hlit..];
596
597 let lit_table = HuffmanTable::from_code_lengths(lit_lengths, 9)?;
598 let dist_table = HuffmanTable::from_code_lengths(dist_lengths, 6)?;
599
600 decode_compressed(reader, &lit_table, &dist_table, output)
601}
602
603/// Decode compressed data using literal/length and distance Huffman tables.
604fn decode_compressed(
605 reader: &mut BitReader<'_>,
606 lit_table: &HuffmanTable,
607 dist_table: &HuffmanTable,
608 output: &mut Vec<u8>,
609) -> Result<()> {
610 loop {
611 let sym = lit_table.decode(reader)?;
612
613 match sym {
614 0..=255 => {
615 output.push(sym as u8);
616 }
617 256 => {
618 // End of block
619 return Ok(());
620 }
621 257..=285 => {
622 // Length/distance pair
623 let length_index = (sym - 257) as usize;
624 let (base_len, extra_bits) = LENGTH_TABLE[length_index];
625 let length = base_len as usize
626 + if extra_bits > 0 {
627 reader.read_bits(extra_bits)? as usize
628 } else {
629 0
630 };
631
632 let dist_sym = dist_table.decode(reader)?;
633 if dist_sym as usize >= DISTANCE_TABLE.len() {
634 return Err(DeflateError::InvalidCode);
635 }
636 let (base_dist, dist_extra) = DISTANCE_TABLE[dist_sym as usize];
637 let distance = base_dist as usize
638 + if dist_extra > 0 {
639 reader.read_bits(dist_extra)? as usize
640 } else {
641 0
642 };
643
644 if distance > output.len() {
645 return Err(DeflateError::InvalidDistance {
646 distance,
647 available: output.len(),
648 });
649 }
650
651 // Copy from back-reference — byte-by-byte to handle overlapping
652 let start = output.len() - distance;
653 for i in 0..length {
654 let b = output[start + (i % distance)];
655 output.push(b);
656 }
657 }
658 _ => {
659 return Err(DeflateError::InvalidCode);
660 }
661 }
662 }
663}
664
665// ---------------------------------------------------------------------------
666// Tests
667// ---------------------------------------------------------------------------
668
669#[cfg(test)]
670mod tests {
671 use super::*;
672
673 // -- Error Display tests --
674
675 #[test]
676 fn error_display() {
677 assert_eq!(
678 DeflateError::UnexpectedEof.to_string(),
679 "unexpected end of input"
680 );
681 assert_eq!(
682 DeflateError::InvalidBlockType(3).to_string(),
683 "invalid block type: 3"
684 );
685 assert_eq!(
686 DeflateError::LenMismatch { len: 5, nlen: 10 }.to_string(),
687 "non-compressed block length mismatch: LEN=5, NLEN=10"
688 );
689 assert_eq!(
690 DeflateError::InvalidCode.to_string(),
691 "invalid Huffman code"
692 );
693 assert_eq!(
694 DeflateError::InvalidCodeLengths.to_string(),
695 "invalid code length codes"
696 );
697 assert_eq!(
698 DeflateError::InvalidDistance {
699 distance: 100,
700 available: 50
701 }
702 .to_string(),
703 "invalid back-reference: distance 100 exceeds available 50"
704 );
705 assert_eq!(
706 DeflateError::InvalidHuffmanTree.to_string(),
707 "invalid Huffman tree"
708 );
709 }
710
711 // -- BitReader tests --
712
713 #[test]
714 fn bit_reader_single_byte() {
715 let data = [0b10110100u8];
716 let mut reader = BitReader::new(&data);
717 // LSB first: bits are 0,0,1,0,1,1,0,1
718 assert_eq!(reader.read_bits(1).unwrap(), 0);
719 assert_eq!(reader.read_bits(1).unwrap(), 0);
720 assert_eq!(reader.read_bits(1).unwrap(), 1);
721 assert_eq!(reader.read_bits(1).unwrap(), 0);
722 assert_eq!(reader.read_bits(1).unwrap(), 1);
723 assert_eq!(reader.read_bits(1).unwrap(), 1);
724 assert_eq!(reader.read_bits(1).unwrap(), 0);
725 assert_eq!(reader.read_bits(1).unwrap(), 1);
726 }
727
728 #[test]
729 fn bit_reader_multi_bit() {
730 let data = [0b11010011u8];
731 let mut reader = BitReader::new(&data);
732 // Read 3 bits LSB first: bits 0,1,0 → reversed from MSB: 011 → value 3
733 assert_eq!(reader.read_bits(3).unwrap(), 0b011); // lowest 3 bits
734 assert_eq!(reader.read_bits(5).unwrap(), 0b11010); // remaining 5 bits
735 }
736
737 #[test]
738 fn bit_reader_cross_byte() {
739 let data = [0xFF, 0x00];
740 let mut reader = BitReader::new(&data);
741 assert_eq!(reader.read_bits(4).unwrap(), 0xF);
742 assert_eq!(reader.read_bits(8).unwrap(), 0x0F); // 4 bits from first byte + 4 from second
743 }
744
745 #[test]
746 fn bit_reader_eof() {
747 let data = [0x00];
748 let mut reader = BitReader::new(&data);
749 reader.read_bits(8).unwrap();
750 assert!(matches!(
751 reader.read_bits(1),
752 Err(DeflateError::UnexpectedEof)
753 ));
754 }
755
756 #[test]
757 fn bit_reader_align_and_read_bytes() {
758 let data = [0xFF, 0xAB, 0xCD];
759 let mut reader = BitReader::new(&data);
760 reader.read_bits(3).unwrap(); // read some bits
761 reader.align_to_byte();
762 let mut buf = [0u8; 2];
763 reader.read_bytes(&mut buf).unwrap();
764 assert_eq!(buf, [0xAB, 0xCD]);
765 }
766
767 // -- reverse_bits tests --
768
769 #[test]
770 fn reverse_bits_basic() {
771 assert_eq!(reverse_bits(0b110, 3), 0b011);
772 assert_eq!(reverse_bits(0b1010, 4), 0b0101);
773 assert_eq!(reverse_bits(0b1, 1), 0b1);
774 assert_eq!(reverse_bits(0b0, 1), 0b0);
775 assert_eq!(reverse_bits(0b11001, 5), 0b10011);
776 }
777
778 // -- Huffman table tests --
779
780 #[test]
781 fn huffman_fixed_literal_table() {
782 // Verify fixed literal table can be constructed
783 let lengths = fixed_literal_lengths();
784 let table = HuffmanTable::from_code_lengths(&lengths, 9).unwrap();
785 assert!(!table.entries.is_empty());
786 }
787
788 #[test]
789 fn huffman_fixed_distance_table() {
790 let lengths = fixed_distance_lengths();
791 let table = HuffmanTable::from_code_lengths(&lengths, 5).unwrap();
792 assert!(!table.entries.is_empty());
793 }
794
795 #[test]
796 fn huffman_simple_decode() {
797 // Build a simple tree: A=0, B=10, C=11
798 let lengths = [1u8, 2, 2];
799 let table = HuffmanTable::from_code_lengths(&lengths, 2).unwrap();
800
801 // A = code 0 (1 bit), B = code 10 (2 bits), C = code 11 (2 bits)
802 // LSB first: A=0, B=01, C=11
803 // Encode: A, B, C, A = 0 | 01 | 11 | 0 = 0b0_11_01_0 in LSB order
804 // In bytes: 0b01110100 (pad with zeros on MSB side if needed)
805 // Wait, let me recalculate for LSB-first bit reader:
806 // A: bit 0 → 0
807 // B: bits 1-2 → 01 (LSB first, so code 10 reversed = 01)
808 // C: bits 3-4 → 11 (code 11 reversed = 11)
809 // A: bit 5 → 0
810 // Byte: bits 76543210 = 00_0_11_01_0 = 0b00011010 = 0x1A
811 let data = [0x1A];
812 let mut reader = BitReader::new(&data);
813
814 assert_eq!(table.decode(&mut reader).unwrap(), 0); // A
815 assert_eq!(table.decode(&mut reader).unwrap(), 1); // B
816 assert_eq!(table.decode(&mut reader).unwrap(), 2); // C
817 assert_eq!(table.decode(&mut reader).unwrap(), 0); // A
818 }
819
820 // -- Non-compressed block (BTYPE=00) tests --
821
822 #[test]
823 fn inflate_uncompressed_block() {
824 // BFINAL=1, BTYPE=00 (non-compressed)
825 // Bits: 1 (final) 00 (type) → 0b001 = least significant 3 bits
826 // Then aligned: LEN=5, NLEN=~5=0xFFFA
827 let data_to_store = b"Hello";
828 let mut input = Vec::new();
829 // First byte: BFINAL=1, BTYPE=00 → bits 1,0,0 → byte 0x01
830 input.push(0x01);
831 // LEN = 5, little-endian
832 input.push(5);
833 input.push(0);
834 // NLEN = !5 = 0xFFFA, little-endian
835 input.push(0xFA);
836 input.push(0xFF);
837 // Data
838 input.extend_from_slice(data_to_store);
839
840 let result = inflate(&input).unwrap();
841 assert_eq!(result, b"Hello");
842 }
843
844 #[test]
845 fn inflate_uncompressed_empty() {
846 // Non-compressed block with length 0
847 let mut input = Vec::new();
848 input.push(0x01); // BFINAL=1, BTYPE=00
849 input.push(0); // LEN=0
850 input.push(0);
851 input.push(0xFF); // NLEN=0xFFFF = !0
852 input.push(0xFF);
853
854 let result = inflate(&input).unwrap();
855 assert!(result.is_empty());
856 }
857
858 #[test]
859 fn inflate_uncompressed_len_mismatch() {
860 let mut input = Vec::new();
861 input.push(0x01);
862 input.push(5);
863 input.push(0);
864 input.push(0x00); // Wrong NLEN
865 input.push(0x00);
866
867 assert!(matches!(
868 inflate(&input),
869 Err(DeflateError::LenMismatch { .. })
870 ));
871 }
872
873 // -- Multiple non-compressed blocks --
874
875 #[test]
876 fn inflate_two_uncompressed_blocks() {
877 let mut input = Vec::new();
878
879 // Block 1: BFINAL=0, BTYPE=00
880 input.push(0x00);
881 input.push(3);
882 input.push(0);
883 input.push(0xFC);
884 input.push(0xFF);
885 input.extend_from_slice(b"Hel");
886
887 // Block 2: BFINAL=1, BTYPE=00
888 input.push(0x01);
889 input.push(2);
890 input.push(0);
891 input.push(0xFD);
892 input.push(0xFF);
893 input.extend_from_slice(b"lo");
894
895 let result = inflate(&input).unwrap();
896 assert_eq!(result, b"Hello");
897 }
898
899 // -- Fixed Huffman block tests --
900
901 #[test]
902 fn inflate_fixed_huffman_literals_only() {
903 // Use Rust's flate2-equivalent: manually construct a fixed Huffman block
904 // that encodes just literal bytes and end-of-block.
905 //
906 // For simplicity, we test against known compressed output.
907 // The easiest way is to compress with the non-compressed format
908 // and verify, then test with known DEFLATE streams.
909
910 // Use a known compressed representation of empty data:
911 // Fixed block with just end-of-block (symbol 256)
912 // Symbol 256 has code length 7, code = 0000000 in fixed Huffman
913 // Actually, per fixed table: 256-279 have 7-bit codes starting at 0b0000000
914 // Code for 256: 7 bits, value 0b0000000
915 // Reversed for LSB-first: 0b0000000
916 // BFINAL=1, BTYPE=01: bits 1, 0, 1 → 0b101 = 5
917 // Then 7 bits of 0 for symbol 256
918 // Total: 3 + 7 = 10 bits → 2 bytes
919 // Byte 0: bits 0-7: 1,0,1, 0,0,0,0,0 = 0b00000_101 = 0x05
920 // Byte 1: bits 8-9: 0,0 (remaining bits of the code) + padding = 0x00
921 let input = [0x03, 0x00]; // Known empty fixed block
922 let result = inflate(&input).unwrap();
923 assert!(result.is_empty());
924 }
925
926 #[test]
927 fn inflate_fixed_huffman_known_data() {
928 // This is the DEFLATE compressed form of "Hello" using fixed Huffman codes.
929 // Generated by standard deflate: raw deflate of "Hello"
930 // We can verify this by using known test vectors.
931 //
932 // Actually, let's build it manually:
933 // BFINAL=1, BTYPE=01 (fixed)
934 // Then encode 'H'(72), 'e'(101), 'l'(108), 'l'(108), 'o'(111), EOB(256)
935 //
936 // Fixed Huffman codes for literals 0-143: 8-bit codes 00110000..10111111
937 // Code for literal N (0-143): reversed(N + 0x30) in 8 bits
938 //
939 // Let's use a known deflate stream instead. The bytes below are the
940 // raw DEFLATE encoding of "Hello" (verified against RFC 1951).
941 let input = [0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x07, 0x00];
942 let result = inflate(&input).unwrap();
943 assert_eq!(result, b"Hello");
944 }
945
946 // -- Invalid block type --
947
948 #[test]
949 fn inflate_invalid_block_type() {
950 // BFINAL=1, BTYPE=11 (reserved)
951 // Bits: 1, 1, 1 → byte 0x07
952 let input = [0x07];
953 assert!(matches!(
954 inflate(&input),
955 Err(DeflateError::InvalidBlockType(3))
956 ));
957 }
958
959 // -- Empty input --
960
961 #[test]
962 fn inflate_empty_input() {
963 assert!(matches!(inflate(&[]), Err(DeflateError::UnexpectedEof)));
964 }
965
966 // -- Distance validation --
967
968 #[test]
969 fn inflate_distance_table_coverage() {
970 // Verify all distance table entries are valid
971 for (i, &(base, extra)) in DISTANCE_TABLE.iter().enumerate() {
972 assert!(base >= 1, "distance code {i} has base {base} < 1");
973 let max_dist = base as u32 + ((1u32 << extra) - 1);
974 assert!(
975 max_dist <= 32768,
976 "distance code {i} max {max_dist} exceeds 32KB window"
977 );
978 }
979 }
980
981 // -- Length table coverage --
982
983 #[test]
984 fn inflate_length_table_coverage() {
985 // Verify length table entries
986 for (i, &(base, extra)) in LENGTH_TABLE.iter().enumerate() {
987 assert!(base >= 3, "length code {i} has base {base} < 3");
988 if i < 28 {
989 // Codes 257-284 have ranges
990 let max_len = base as u32 + ((1u32 << extra) - 1);
991 assert!(max_len <= 258, "length code {i} max {max_len} exceeds 258");
992 }
993 }
994 }
995
996 // -- Back-reference (LZ77) tests via fixed Huffman --
997
998 #[test]
999 fn inflate_with_back_reference() {
1000 // DEFLATE compressed "abcabcabc" — uses a back-reference
1001 // Raw DEFLATE for "abcabcabc" with fixed Huffman:
1002 let input = [0x4b, 0x4c, 0x4a, 0x4e, 0x04, 0x23, 0x00];
1003 let result = inflate(&input).unwrap();
1004 assert_eq!(result, b"abcabcabc");
1005 }
1006
1007 // -- Dynamic Huffman block tests --
1008
1009 #[test]
1010 fn inflate_dynamic_huffman() {
1011 // A known DEFLATE stream with dynamic Huffman codes.
1012 // This is the raw DEFLATE encoding of "AAAAAAAAAA" (10 A's)
1013 // which benefits from dynamic codes + back-references.
1014 let input = [0x73, 0x74, 0x84, 0x01, 0x00];
1015 let result = inflate(&input).unwrap();
1016 assert_eq!(result, b"AAAAAAAAAA");
1017 }
1018
1019 // -- Fixed Huffman table properties --
1020
1021 #[test]
1022 fn fixed_literal_lengths_correct() {
1023 let lengths = fixed_literal_lengths();
1024 // 0-143: length 8
1025 for i in 0..=143 {
1026 assert_eq!(lengths[i], 8, "literal {i} should have length 8");
1027 }
1028 // 144-255: length 9
1029 for i in 144..=255 {
1030 assert_eq!(lengths[i], 9, "literal {i} should have length 9");
1031 }
1032 // 256-279: length 7
1033 for i in 256..=279 {
1034 assert_eq!(lengths[i], 7, "literal {i} should have length 7");
1035 }
1036 // 280-287: length 8
1037 for i in 280..=287 {
1038 assert_eq!(lengths[i], 8, "literal {i} should have length 8");
1039 }
1040 }
1041
1042 #[test]
1043 fn fixed_distance_lengths_correct() {
1044 let lengths = fixed_distance_lengths();
1045 for (i, &len) in lengths.iter().enumerate() {
1046 assert_eq!(len, 5, "distance {i} should have length 5");
1047 }
1048 }
1049
1050 // -- Round-trip test using non-compressed then verifying inflate --
1051
1052 #[test]
1053 fn inflate_uncompressed_large_block() {
1054 // Test with a larger block (256 bytes)
1055 let payload: Vec<u8> = (0..=255).collect();
1056 let mut input = Vec::new();
1057 input.push(0x01); // BFINAL=1, BTYPE=00
1058 let len = payload.len() as u16;
1059 input.push(len as u8);
1060 input.push((len >> 8) as u8);
1061 let nlen = !len;
1062 input.push(nlen as u8);
1063 input.push((nlen >> 8) as u8);
1064 input.extend_from_slice(&payload);
1065
1066 let result = inflate(&input).unwrap();
1067 assert_eq!(result, payload);
1068 }
1069
1070 // -- Code length order --
1071
1072 #[test]
1073 fn code_length_order_correct() {
1074 // Per RFC 1951 §3.2.7
1075 assert_eq!(
1076 CODE_LENGTH_ORDER,
1077 [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15]
1078 );
1079 }
1080
1081 // -- Overlapping back-reference --
1082
1083 #[test]
1084 fn inflate_overlapping_backref() {
1085 // Test that overlapping back-references work correctly.
1086 // e.g., distance=1, length=5 starting from "A" should produce "AAAAA"
1087 // This is tested indirectly through the DEFLATE stream,
1088 // but let's verify the copy logic directly.
1089 let mut output = vec![b'A'];
1090 let distance = 1usize;
1091 let length = 5usize;
1092 let start = output.len() - distance;
1093 for i in 0..length {
1094 let b = output[start + (i % distance)];
1095 output.push(b);
1096 }
1097 assert_eq!(output, b"AAAAAA");
1098 }
1099
1100 // -- Comprehensive round-trip tests with zlib-generated vectors --
1101
1102 #[test]
1103 fn inflate_single_byte() {
1104 let compressed = [0x4b, 0x04, 0x00];
1105 let result = inflate(&compressed).unwrap();
1106 assert_eq!(result, b"a");
1107 }
1108
1109 #[test]
1110 fn inflate_repeated_char_1000() {
1111 // 1000 'a' characters — heavy LZ77 back-referencing
1112 let compressed = [
1113 0x4b, 0x4c, 0x1c, 0x05, 0xa3, 0x60, 0x14, 0x0c, 0x77, 0x00, 0x00,
1114 ];
1115 let result = inflate(&compressed).unwrap();
1116 assert_eq!(result.len(), 1000);
1117 assert!(result.iter().all(|&b| b == b'a'));
1118 }
1119
1120 #[test]
1121 fn inflate_pangram() {
1122 let compressed = [
1123 0x0b, 0xc9, 0x48, 0x55, 0x28, 0x2c, 0xcd, 0x4c, 0xce, 0x56, 0x48, 0x2a, 0xca, 0x2f,
1124 0xcf, 0x53, 0x48, 0xcb, 0xaf, 0x50, 0xc8, 0x2a, 0xcd, 0x2d, 0x28, 0x56, 0xc8, 0x2f,
1125 0x4b, 0x2d, 0x52, 0x28, 0x01, 0x4a, 0xe7, 0x24, 0x56, 0x55, 0x2a, 0xa4, 0xe4, 0xa7,
1126 0x03, 0x00,
1127 ];
1128 let result = inflate(&compressed).unwrap();
1129 assert_eq!(result, b"The quick brown fox jumps over the lazy dog");
1130 }
1131
1132 #[test]
1133 fn inflate_alphabet_repeated() {
1134 // "abcdefghijklmnopqrstuvwxyz" * 10 = 260 bytes
1135 let compressed = [
1136 0x4b, 0x4c, 0x4a, 0x4e, 0x49, 0x4d, 0x4b, 0xcf, 0xc8, 0xcc, 0xca, 0xce, 0xc9, 0xcd,
1137 0xcb, 0x2f, 0x28, 0x2c, 0x2a, 0x2e, 0x29, 0x2d, 0x2b, 0xaf, 0xa8, 0xac, 0x4a, 0x1c,
1138 0x31, 0x32, 0x00,
1139 ];
1140 let result = inflate(&compressed).unwrap();
1141 let expected: Vec<u8> = b"abcdefghijklmnopqrstuvwxyz".repeat(10);
1142 assert_eq!(result, expected);
1143 }
1144
1145 // -- All byte values via non-compressed block --
1146
1147 #[test]
1148 fn inflate_all_byte_values() {
1149 // Non-compressed block containing all 256 byte values
1150 let payload: Vec<u8> = (0..=255).collect();
1151 // zlib at level 6 uses non-compressed for this data
1152 let compressed = [
1153 0x01, 0x00, 0x01, 0xff, 0xfe, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
1154 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
1155 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24,
1156 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
1157 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40,
1158 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e,
1159 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
1160 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a,
1161 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
1162 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86,
1163 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94,
1164 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2,
1165 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
1166 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe,
1167 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc,
1168 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
1169 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8,
1170 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6,
1171 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
1172 ];
1173 let result = inflate(&compressed).unwrap();
1174 assert_eq!(result, payload);
1175 }
1176
1177 // -- Maximum back-reference distance (32KB window) --
1178
1179 #[test]
1180 fn inflate_max_window_uncompressed() {
1181 // Test near the 32KB window limit with non-compressed blocks
1182 let payload = vec![0xAB; 32768];
1183 // Build as a series of non-compressed blocks (max 65535 bytes each)
1184 let mut input = Vec::new();
1185 input.push(0x01); // BFINAL=1, BTYPE=00
1186 let len = 32768u16;
1187 // 32768 = 0x8000 which fits in u16
1188 input.push(len as u8);
1189 input.push((len >> 8) as u8);
1190 let nlen = !len;
1191 input.push(nlen as u8);
1192 input.push((nlen >> 8) as u8);
1193 input.extend_from_slice(&payload);
1194
1195 let result = inflate(&input).unwrap();
1196 assert_eq!(result.len(), 32768);
1197 assert!(result.iter().all(|&b| b == 0xAB));
1198 }
1199}