we (web engine): Experimental web browser project to understand the limits of Claude
at utf-codecs 1199 lines 41 kB view raw
1//! DEFLATE decompressor (RFC 1951). 2//! 3//! Supports all three block types: 4//! - Non-compressed (BTYPE=00) 5//! - Fixed Huffman codes (BTYPE=01) 6//! - Dynamic Huffman codes (BTYPE=10) 7 8use std::fmt; 9 10// --------------------------------------------------------------------------- 11// Error type 12// --------------------------------------------------------------------------- 13 14/// Errors that can occur during DEFLATE decompression. 15#[derive(Debug, Clone, PartialEq, Eq)] 16pub enum DeflateError { 17 /// Unexpected end of input data. 18 UnexpectedEof, 19 /// Invalid block type (BTYPE=11 is reserved). 20 InvalidBlockType(u8), 21 /// Non-compressed block length check failed (LEN != ~NLEN). 22 LenMismatch { len: u16, nlen: u16 }, 23 /// Invalid Huffman code encountered. 24 InvalidCode, 25 /// Invalid code length code in dynamic block header. 26 InvalidCodeLengths, 27 /// Back-reference distance exceeds available output. 28 InvalidDistance { distance: usize, available: usize }, 29 /// Huffman tree is over-subscribed or otherwise invalid. 30 InvalidHuffmanTree, 31} 32 33impl fmt::Display for DeflateError { 34 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 35 match self { 36 Self::UnexpectedEof => write!(f, "unexpected end of input"), 37 Self::InvalidBlockType(b) => write!(f, "invalid block type: {b}"), 38 Self::LenMismatch { len, nlen } => { 39 write!( 40 f, 41 "non-compressed block length mismatch: LEN={len}, NLEN={nlen}" 42 ) 43 } 44 Self::InvalidCode => write!(f, "invalid Huffman code"), 45 Self::InvalidCodeLengths => write!(f, "invalid code length codes"), 46 Self::InvalidDistance { 47 distance, 48 available, 49 } => { 50 write!( 51 f, 52 "invalid back-reference: distance {distance} exceeds available {available}" 53 ) 54 } 55 Self::InvalidHuffmanTree => write!(f, "invalid Huffman tree"), 56 } 57 } 58} 59 60pub type Result<T> = std::result::Result<T, DeflateError>; 61 62// --------------------------------------------------------------------------- 63// Bit reader 64// --------------------------------------------------------------------------- 65 66/// Reads bits from a byte slice, LSB-first (as required by DEFLATE). 67struct BitReader<'a> { 68 data: &'a [u8], 69 pos: usize, 70 bit_buf: u32, 71 bits_in_buf: u8, 72} 73 74impl<'a> BitReader<'a> { 75 fn new(data: &'a [u8]) -> Self { 76 Self { 77 data, 78 pos: 0, 79 bit_buf: 0, 80 bits_in_buf: 0, 81 } 82 } 83 84 /// Ensure at least `count` bits are available in the buffer. 85 fn fill(&mut self, count: u8) -> Result<()> { 86 while self.bits_in_buf < count { 87 if self.pos >= self.data.len() { 88 return Err(DeflateError::UnexpectedEof); 89 } 90 self.bit_buf |= (self.data[self.pos] as u32) << self.bits_in_buf; 91 self.pos += 1; 92 self.bits_in_buf += 8; 93 } 94 Ok(()) 95 } 96 97 /// Fill the buffer with as many bits as possible, up to `count`. 98 /// Does not error on EOF — just fills what's available. 99 fn fill_available(&mut self, count: u8) { 100 while self.bits_in_buf < count { 101 if self.pos >= self.data.len() { 102 return; 103 } 104 self.bit_buf |= (self.data[self.pos] as u32) << self.bits_in_buf; 105 self.pos += 1; 106 self.bits_in_buf += 8; 107 } 108 } 109 110 /// Peek at the lowest `count` bits without consuming them. 111 fn peek(&self, count: u8) -> u32 { 112 self.bit_buf & ((1u32 << count) - 1) 113 } 114 115 /// Consume `count` bits from the buffer (must have been fill'd first). 116 fn consume(&mut self, count: u8) { 117 self.bit_buf >>= count; 118 self.bits_in_buf -= count; 119 } 120 121 /// Read `count` bits (up to 25) as a u32, LSB first. 122 fn read_bits(&mut self, count: u8) -> Result<u32> { 123 debug_assert!(count <= 25); 124 self.fill(count)?; 125 let val = self.peek(count); 126 self.consume(count); 127 Ok(val) 128 } 129 130 /// Align to the next byte boundary by discarding remaining bits. 131 fn align_to_byte(&mut self) { 132 self.bit_buf = 0; 133 self.bits_in_buf = 0; 134 } 135 136 /// Read a raw byte (after byte-alignment). 137 fn read_byte(&mut self) -> Result<u8> { 138 if self.pos >= self.data.len() { 139 return Err(DeflateError::UnexpectedEof); 140 } 141 let b = self.data[self.pos]; 142 self.pos += 1; 143 Ok(b) 144 } 145 146 /// Read `n` raw bytes into a buffer (after byte-alignment). 147 fn read_bytes(&mut self, buf: &mut [u8]) -> Result<()> { 148 let n = buf.len(); 149 if self.pos + n > self.data.len() { 150 return Err(DeflateError::UnexpectedEof); 151 } 152 buf.copy_from_slice(&self.data[self.pos..self.pos + n]); 153 self.pos += n; 154 Ok(()) 155 } 156} 157 158// --------------------------------------------------------------------------- 159// Huffman tree 160// --------------------------------------------------------------------------- 161 162/// Maximum number of bits in a Huffman code for DEFLATE. 163const MAX_BITS: usize = 15; 164 165/// A Huffman decoding table built from code lengths. 166/// 167/// Uses a two-level lookup: a primary table indexed by up to `primary_bits` 168/// bits, and secondary tables for longer codes. 169struct HuffmanTable { 170 /// Lookup table entries. Each entry is either: 171 /// - A leaf: (symbol << 4) | code_length (code_length > 0) 172 /// - A subtable pointer: (subtable_offset << 4) | 0, where offset is into `entries` 173 entries: Vec<u32>, 174 primary_bits: u8, 175} 176 177impl HuffmanTable { 178 /// Build a Huffman table from code lengths. 179 /// 180 /// `code_lengths[i]` is the bit length of symbol `i`. A length of 0 means 181 /// the symbol does not appear. 182 fn from_code_lengths(code_lengths: &[u8], primary_bits: u8) -> Result<Self> { 183 let max_code_len = code_lengths.iter().copied().max().unwrap_or(0) as usize; 184 185 if max_code_len > MAX_BITS { 186 return Err(DeflateError::InvalidHuffmanTree); 187 } 188 189 // Count codes of each length 190 let mut bl_count = [0u32; MAX_BITS + 1]; 191 for &len in code_lengths { 192 bl_count[len as usize] += 1; 193 } 194 bl_count[0] = 0; 195 196 // Check for empty tree (all lengths are 0) 197 let total_codes: u32 = bl_count[1..].iter().sum(); 198 if total_codes == 0 { 199 // Empty tree — return a table that always fails to decode 200 let size = 1 << primary_bits; 201 return Ok(Self { 202 entries: vec![0; size], 203 primary_bits, 204 }); 205 } 206 207 // Compute next_code for each bit length (RFC 1951 §3.2.2) 208 let mut next_code = [0u32; MAX_BITS + 1]; 209 let mut code = 0u32; 210 for bits in 1..=max_code_len { 211 code = (code + bl_count[bits - 1]) << 1; 212 next_code[bits] = code; 213 } 214 215 // Assign codes to symbols 216 let mut codes = vec![0u32; code_lengths.len()]; 217 let mut lengths = vec![0u8; code_lengths.len()]; 218 for (sym, &len) in code_lengths.iter().enumerate() { 219 if len > 0 { 220 codes[sym] = next_code[len as usize]; 221 lengths[sym] = len; 222 next_code[len as usize] += 1; 223 } 224 } 225 226 // Build the lookup table 227 let primary_size = 1usize << primary_bits; 228 let mut entries = vec![0u32; primary_size]; 229 230 // For codes that fit in primary_bits, fill all matching entries 231 for (sym, (&code_val, &code_len)) in codes.iter().zip(lengths.iter()).enumerate() { 232 if code_len == 0 { 233 continue; 234 } 235 if code_len <= primary_bits { 236 // Reverse the code bits for LSB-first lookup 237 let reversed = reverse_bits(code_val, code_len); 238 let extra_bits = primary_bits - code_len; 239 let base = reversed as usize; 240 // Fill all entries where the extra bits vary 241 for i in 0..(1usize << extra_bits) { 242 let index = base | (i << code_len); 243 entries[index] = ((sym as u32) << 4) | (code_len as u32); 244 } 245 } 246 } 247 248 // For codes longer than primary_bits, build secondary tables 249 if max_code_len as u8 > primary_bits { 250 // Group long codes by their primary_bits prefix 251 type SubtableEntry = (u32, u8, u16); 252 let mut subtables: Vec<(usize, Vec<SubtableEntry>)> = Vec::new(); 253 254 for (sym, (&code_val, &code_len)) in codes.iter().zip(lengths.iter()).enumerate() { 255 if code_len == 0 || code_len <= primary_bits { 256 continue; 257 } 258 let reversed = reverse_bits(code_val, code_len); 259 let primary_index = (reversed as usize) & (primary_size - 1); 260 let secondary_bits = reversed >> primary_bits; 261 let secondary_len = code_len - primary_bits; 262 263 // Find or create subtable for this primary index 264 let pos = subtables.iter().position(|(idx, _)| *idx == primary_index); 265 match pos { 266 Some(p) => { 267 subtables[p] 268 .1 269 .push((secondary_bits, secondary_len, sym as u16)); 270 } 271 None => { 272 subtables.push(( 273 primary_index, 274 vec![(secondary_bits, secondary_len, sym as u16)], 275 )); 276 } 277 } 278 } 279 280 for (primary_index, symbols) in &subtables { 281 let max_secondary = symbols.iter().map(|(_, l, _)| *l).max().unwrap_or(0); 282 let subtable_size = 1usize << max_secondary; 283 let subtable_offset = entries.len(); 284 285 entries.resize(entries.len() + subtable_size, 0); 286 287 for &(sec_bits, sec_len, sym) in symbols { 288 let extra = max_secondary - sec_len; 289 let base = sec_bits as usize; 290 for i in 0..(1usize << extra) { 291 let index = subtable_offset + base + (i << sec_len); 292 let total_len = primary_bits + sec_len; 293 entries[index] = ((sym as u32) << 4) | (total_len as u32); 294 } 295 } 296 297 // Store subtable pointer in primary entry 298 // Format: (subtable_offset << 4) | 0, with high bit indicating subtable 299 // We use bit pattern: offset in upper bits, max_secondary in bits [1..4], flag=0 in bit 0 300 // Actually simpler: store (offset << 8) | (max_secondary << 4) with code_len = 0 301 entries[*primary_index] = 302 ((subtable_offset as u32) << 8) | ((max_secondary as u32) << 4); 303 } 304 } 305 306 Ok(Self { 307 entries, 308 primary_bits, 309 }) 310 } 311 312 /// Decode one symbol from the bit stream. 313 fn decode(&self, reader: &mut BitReader<'_>) -> Result<u16> { 314 // Fill as many bits as possible up to primary_bits. 315 // Near the end of stream we may have fewer bits than primary_bits, 316 // but enough for the actual code (e.g., 7-bit end-of-block in a 9-bit table). 317 reader.fill_available(self.primary_bits); 318 if reader.bits_in_buf == 0 { 319 return Err(DeflateError::UnexpectedEof); 320 } 321 // Peek only as many bits as we actually have (padded with zeros) 322 let peek_bits = std::cmp::min(self.primary_bits, reader.bits_in_buf); 323 let bits = reader.peek(peek_bits); 324 // If we have fewer bits than primary_bits, the upper bits are implicitly 0, 325 // which means only codes whose upper bits are 0 can match. This works for 326 // end-of-stream where the last code is padded with zeros. 327 let entry = self.entries[bits as usize]; 328 329 let code_len = entry & 0xF; 330 if code_len > 0 { 331 if code_len as u8 > reader.bits_in_buf { 332 return Err(DeflateError::UnexpectedEof); 333 } 334 // Direct hit — consume only the actual code length 335 reader.consume(code_len as u8); 336 return Ok((entry >> 4) as u16); 337 } 338 339 // Subtable lookup — consume primary_bits, then peek into subtable 340 reader.consume(self.primary_bits); 341 let subtable_offset = (entry >> 8) as usize; 342 let max_secondary = ((entry >> 4) & 0xF) as u8; 343 344 reader.fill(max_secondary)?; 345 let sec_bits = reader.peek(max_secondary); 346 let sub_entry = self.entries[subtable_offset + sec_bits as usize]; 347 let sub_code_len = sub_entry & 0xF; 348 349 if sub_code_len == 0 { 350 return Err(DeflateError::InvalidCode); 351 } 352 353 let actual_secondary = sub_code_len as u8 - self.primary_bits; 354 reader.consume(actual_secondary); 355 356 Ok((sub_entry >> 4) as u16) 357 } 358} 359 360/// Reverse the bottom `len` bits of `code`. 361fn reverse_bits(code: u32, len: u8) -> u32 { 362 let mut result = 0u32; 363 let mut c = code; 364 for _ in 0..len { 365 result = (result << 1) | (c & 1); 366 c >>= 1; 367 } 368 result 369} 370 371// --------------------------------------------------------------------------- 372// Fixed Huffman tables (RFC 1951 §3.2.6) 373// --------------------------------------------------------------------------- 374 375fn fixed_literal_lengths() -> [u8; 288] { 376 let mut lengths = [0u8; 288]; 377 lengths[..=143].fill(8); 378 lengths[144..=255].fill(9); 379 lengths[256..=279].fill(7); 380 lengths[280..=287].fill(8); 381 lengths 382} 383 384fn fixed_distance_lengths() -> [u8; 32] { 385 [5u8; 32] 386} 387 388// --------------------------------------------------------------------------- 389// Length and distance base values (RFC 1951 §3.2.5) 390// --------------------------------------------------------------------------- 391 392/// (base_length, extra_bits) for length codes 257..285 393const LENGTH_TABLE: [(u16, u8); 29] = [ 394 (3, 0), 395 (4, 0), 396 (5, 0), 397 (6, 0), 398 (7, 0), 399 (8, 0), 400 (9, 0), 401 (10, 0), 402 (11, 1), 403 (13, 1), 404 (15, 1), 405 (17, 1), 406 (19, 2), 407 (23, 2), 408 (27, 2), 409 (31, 2), 410 (35, 3), 411 (43, 3), 412 (51, 3), 413 (59, 3), 414 (67, 4), 415 (83, 4), 416 (99, 4), 417 (115, 4), 418 (131, 5), 419 (163, 5), 420 (195, 5), 421 (227, 5), 422 (258, 0), 423]; 424 425/// (base_distance, extra_bits) for distance codes 0..29 426const DISTANCE_TABLE: [(u16, u8); 30] = [ 427 (1, 0), 428 (2, 0), 429 (3, 0), 430 (4, 0), 431 (5, 1), 432 (7, 1), 433 (9, 2), 434 (13, 2), 435 (17, 3), 436 (25, 3), 437 (33, 4), 438 (49, 4), 439 (65, 5), 440 (97, 5), 441 (129, 6), 442 (193, 6), 443 (257, 7), 444 (385, 7), 445 (513, 8), 446 (769, 8), 447 (1025, 9), 448 (1537, 9), 449 (2049, 10), 450 (3073, 10), 451 (4097, 11), 452 (6145, 11), 453 (8193, 12), 454 (12289, 12), 455 (16385, 13), 456 (24577, 13), 457]; 458 459/// Order in which code length code lengths appear (RFC 1951 §3.2.7). 460const CODE_LENGTH_ORDER: [usize; 19] = [ 461 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15, 462]; 463 464// --------------------------------------------------------------------------- 465// Decompressor 466// --------------------------------------------------------------------------- 467 468/// Decompress a DEFLATE compressed data stream per RFC 1951. 469pub fn inflate(input: &[u8]) -> Result<Vec<u8>> { 470 let mut reader = BitReader::new(input); 471 let mut output = Vec::new(); 472 473 loop { 474 let bfinal = reader.read_bits(1)?; 475 let btype = reader.read_bits(2)? as u8; 476 477 match btype { 478 0 => decode_uncompressed(&mut reader, &mut output)?, 479 1 => decode_fixed(&mut reader, &mut output)?, 480 2 => decode_dynamic(&mut reader, &mut output)?, 481 _ => return Err(DeflateError::InvalidBlockType(btype)), 482 } 483 484 if bfinal == 1 { 485 break; 486 } 487 } 488 489 Ok(output) 490} 491 492/// Decode a non-compressed block (BTYPE=00). 493fn decode_uncompressed(reader: &mut BitReader<'_>, output: &mut Vec<u8>) -> Result<()> { 494 reader.align_to_byte(); 495 496 let lo = reader.read_byte()? as u16; 497 let hi = reader.read_byte()? as u16; 498 let len = lo | (hi << 8); 499 500 let nlo = reader.read_byte()? as u16; 501 let nhi = reader.read_byte()? as u16; 502 let nlen = nlo | (nhi << 8); 503 504 if len != !nlen { 505 return Err(DeflateError::LenMismatch { len, nlen }); 506 } 507 508 let mut buf = vec![0u8; len as usize]; 509 reader.read_bytes(&mut buf)?; 510 output.extend_from_slice(&buf); 511 512 Ok(()) 513} 514 515/// Decode a block with fixed Huffman codes (BTYPE=01). 516fn decode_fixed(reader: &mut BitReader<'_>, output: &mut Vec<u8>) -> Result<()> { 517 let lit_lengths = fixed_literal_lengths(); 518 let dist_lengths = fixed_distance_lengths(); 519 520 let lit_table = HuffmanTable::from_code_lengths(&lit_lengths, 9)?; 521 let dist_table = HuffmanTable::from_code_lengths(&dist_lengths, 5)?; 522 523 decode_compressed(reader, &lit_table, &dist_table, output) 524} 525 526/// Decode a block with dynamic Huffman codes (BTYPE=10). 527fn decode_dynamic(reader: &mut BitReader<'_>, output: &mut Vec<u8>) -> Result<()> { 528 let hlit = reader.read_bits(5)? as usize + 257; 529 let hdist = reader.read_bits(5)? as usize + 1; 530 let hclen = reader.read_bits(4)? as usize + 4; 531 532 // Read code length code lengths 533 let mut cl_lengths = [0u8; 19]; 534 for i in 0..hclen { 535 cl_lengths[CODE_LENGTH_ORDER[i]] = reader.read_bits(3)? as u8; 536 } 537 538 let cl_table = HuffmanTable::from_code_lengths(&cl_lengths, 7)?; 539 540 // Decode literal/length and distance code lengths 541 let total = hlit + hdist; 542 let mut combined_lengths = vec![0u8; total]; 543 let mut i = 0; 544 545 while i < total { 546 let sym = cl_table.decode(reader)?; 547 548 match sym { 549 0..=15 => { 550 combined_lengths[i] = sym as u8; 551 i += 1; 552 } 553 16 => { 554 // Repeat previous code length 3-6 times 555 if i == 0 { 556 return Err(DeflateError::InvalidCodeLengths); 557 } 558 let repeat = reader.read_bits(2)? as usize + 3; 559 let prev = combined_lengths[i - 1]; 560 for _ in 0..repeat { 561 if i >= total { 562 return Err(DeflateError::InvalidCodeLengths); 563 } 564 combined_lengths[i] = prev; 565 i += 1; 566 } 567 } 568 17 => { 569 // Repeat 0 for 3-10 times 570 let repeat = reader.read_bits(3)? as usize + 3; 571 for _ in 0..repeat { 572 if i >= total { 573 return Err(DeflateError::InvalidCodeLengths); 574 } 575 combined_lengths[i] = 0; 576 i += 1; 577 } 578 } 579 18 => { 580 // Repeat 0 for 11-138 times 581 let repeat = reader.read_bits(7)? as usize + 11; 582 for _ in 0..repeat { 583 if i >= total { 584 return Err(DeflateError::InvalidCodeLengths); 585 } 586 combined_lengths[i] = 0; 587 i += 1; 588 } 589 } 590 _ => return Err(DeflateError::InvalidCodeLengths), 591 } 592 } 593 594 let lit_lengths = &combined_lengths[..hlit]; 595 let dist_lengths = &combined_lengths[hlit..]; 596 597 let lit_table = HuffmanTable::from_code_lengths(lit_lengths, 9)?; 598 let dist_table = HuffmanTable::from_code_lengths(dist_lengths, 6)?; 599 600 decode_compressed(reader, &lit_table, &dist_table, output) 601} 602 603/// Decode compressed data using literal/length and distance Huffman tables. 604fn decode_compressed( 605 reader: &mut BitReader<'_>, 606 lit_table: &HuffmanTable, 607 dist_table: &HuffmanTable, 608 output: &mut Vec<u8>, 609) -> Result<()> { 610 loop { 611 let sym = lit_table.decode(reader)?; 612 613 match sym { 614 0..=255 => { 615 output.push(sym as u8); 616 } 617 256 => { 618 // End of block 619 return Ok(()); 620 } 621 257..=285 => { 622 // Length/distance pair 623 let length_index = (sym - 257) as usize; 624 let (base_len, extra_bits) = LENGTH_TABLE[length_index]; 625 let length = base_len as usize 626 + if extra_bits > 0 { 627 reader.read_bits(extra_bits)? as usize 628 } else { 629 0 630 }; 631 632 let dist_sym = dist_table.decode(reader)?; 633 if dist_sym as usize >= DISTANCE_TABLE.len() { 634 return Err(DeflateError::InvalidCode); 635 } 636 let (base_dist, dist_extra) = DISTANCE_TABLE[dist_sym as usize]; 637 let distance = base_dist as usize 638 + if dist_extra > 0 { 639 reader.read_bits(dist_extra)? as usize 640 } else { 641 0 642 }; 643 644 if distance > output.len() { 645 return Err(DeflateError::InvalidDistance { 646 distance, 647 available: output.len(), 648 }); 649 } 650 651 // Copy from back-reference — byte-by-byte to handle overlapping 652 let start = output.len() - distance; 653 for i in 0..length { 654 let b = output[start + (i % distance)]; 655 output.push(b); 656 } 657 } 658 _ => { 659 return Err(DeflateError::InvalidCode); 660 } 661 } 662 } 663} 664 665// --------------------------------------------------------------------------- 666// Tests 667// --------------------------------------------------------------------------- 668 669#[cfg(test)] 670mod tests { 671 use super::*; 672 673 // -- Error Display tests -- 674 675 #[test] 676 fn error_display() { 677 assert_eq!( 678 DeflateError::UnexpectedEof.to_string(), 679 "unexpected end of input" 680 ); 681 assert_eq!( 682 DeflateError::InvalidBlockType(3).to_string(), 683 "invalid block type: 3" 684 ); 685 assert_eq!( 686 DeflateError::LenMismatch { len: 5, nlen: 10 }.to_string(), 687 "non-compressed block length mismatch: LEN=5, NLEN=10" 688 ); 689 assert_eq!( 690 DeflateError::InvalidCode.to_string(), 691 "invalid Huffman code" 692 ); 693 assert_eq!( 694 DeflateError::InvalidCodeLengths.to_string(), 695 "invalid code length codes" 696 ); 697 assert_eq!( 698 DeflateError::InvalidDistance { 699 distance: 100, 700 available: 50 701 } 702 .to_string(), 703 "invalid back-reference: distance 100 exceeds available 50" 704 ); 705 assert_eq!( 706 DeflateError::InvalidHuffmanTree.to_string(), 707 "invalid Huffman tree" 708 ); 709 } 710 711 // -- BitReader tests -- 712 713 #[test] 714 fn bit_reader_single_byte() { 715 let data = [0b10110100u8]; 716 let mut reader = BitReader::new(&data); 717 // LSB first: bits are 0,0,1,0,1,1,0,1 718 assert_eq!(reader.read_bits(1).unwrap(), 0); 719 assert_eq!(reader.read_bits(1).unwrap(), 0); 720 assert_eq!(reader.read_bits(1).unwrap(), 1); 721 assert_eq!(reader.read_bits(1).unwrap(), 0); 722 assert_eq!(reader.read_bits(1).unwrap(), 1); 723 assert_eq!(reader.read_bits(1).unwrap(), 1); 724 assert_eq!(reader.read_bits(1).unwrap(), 0); 725 assert_eq!(reader.read_bits(1).unwrap(), 1); 726 } 727 728 #[test] 729 fn bit_reader_multi_bit() { 730 let data = [0b11010011u8]; 731 let mut reader = BitReader::new(&data); 732 // Read 3 bits LSB first: bits 0,1,0 → reversed from MSB: 011 → value 3 733 assert_eq!(reader.read_bits(3).unwrap(), 0b011); // lowest 3 bits 734 assert_eq!(reader.read_bits(5).unwrap(), 0b11010); // remaining 5 bits 735 } 736 737 #[test] 738 fn bit_reader_cross_byte() { 739 let data = [0xFF, 0x00]; 740 let mut reader = BitReader::new(&data); 741 assert_eq!(reader.read_bits(4).unwrap(), 0xF); 742 assert_eq!(reader.read_bits(8).unwrap(), 0x0F); // 4 bits from first byte + 4 from second 743 } 744 745 #[test] 746 fn bit_reader_eof() { 747 let data = [0x00]; 748 let mut reader = BitReader::new(&data); 749 reader.read_bits(8).unwrap(); 750 assert!(matches!( 751 reader.read_bits(1), 752 Err(DeflateError::UnexpectedEof) 753 )); 754 } 755 756 #[test] 757 fn bit_reader_align_and_read_bytes() { 758 let data = [0xFF, 0xAB, 0xCD]; 759 let mut reader = BitReader::new(&data); 760 reader.read_bits(3).unwrap(); // read some bits 761 reader.align_to_byte(); 762 let mut buf = [0u8; 2]; 763 reader.read_bytes(&mut buf).unwrap(); 764 assert_eq!(buf, [0xAB, 0xCD]); 765 } 766 767 // -- reverse_bits tests -- 768 769 #[test] 770 fn reverse_bits_basic() { 771 assert_eq!(reverse_bits(0b110, 3), 0b011); 772 assert_eq!(reverse_bits(0b1010, 4), 0b0101); 773 assert_eq!(reverse_bits(0b1, 1), 0b1); 774 assert_eq!(reverse_bits(0b0, 1), 0b0); 775 assert_eq!(reverse_bits(0b11001, 5), 0b10011); 776 } 777 778 // -- Huffman table tests -- 779 780 #[test] 781 fn huffman_fixed_literal_table() { 782 // Verify fixed literal table can be constructed 783 let lengths = fixed_literal_lengths(); 784 let table = HuffmanTable::from_code_lengths(&lengths, 9).unwrap(); 785 assert!(!table.entries.is_empty()); 786 } 787 788 #[test] 789 fn huffman_fixed_distance_table() { 790 let lengths = fixed_distance_lengths(); 791 let table = HuffmanTable::from_code_lengths(&lengths, 5).unwrap(); 792 assert!(!table.entries.is_empty()); 793 } 794 795 #[test] 796 fn huffman_simple_decode() { 797 // Build a simple tree: A=0, B=10, C=11 798 let lengths = [1u8, 2, 2]; 799 let table = HuffmanTable::from_code_lengths(&lengths, 2).unwrap(); 800 801 // A = code 0 (1 bit), B = code 10 (2 bits), C = code 11 (2 bits) 802 // LSB first: A=0, B=01, C=11 803 // Encode: A, B, C, A = 0 | 01 | 11 | 0 = 0b0_11_01_0 in LSB order 804 // In bytes: 0b01110100 (pad with zeros on MSB side if needed) 805 // Wait, let me recalculate for LSB-first bit reader: 806 // A: bit 0 → 0 807 // B: bits 1-2 → 01 (LSB first, so code 10 reversed = 01) 808 // C: bits 3-4 → 11 (code 11 reversed = 11) 809 // A: bit 5 → 0 810 // Byte: bits 76543210 = 00_0_11_01_0 = 0b00011010 = 0x1A 811 let data = [0x1A]; 812 let mut reader = BitReader::new(&data); 813 814 assert_eq!(table.decode(&mut reader).unwrap(), 0); // A 815 assert_eq!(table.decode(&mut reader).unwrap(), 1); // B 816 assert_eq!(table.decode(&mut reader).unwrap(), 2); // C 817 assert_eq!(table.decode(&mut reader).unwrap(), 0); // A 818 } 819 820 // -- Non-compressed block (BTYPE=00) tests -- 821 822 #[test] 823 fn inflate_uncompressed_block() { 824 // BFINAL=1, BTYPE=00 (non-compressed) 825 // Bits: 1 (final) 00 (type) → 0b001 = least significant 3 bits 826 // Then aligned: LEN=5, NLEN=~5=0xFFFA 827 let data_to_store = b"Hello"; 828 let mut input = Vec::new(); 829 // First byte: BFINAL=1, BTYPE=00 → bits 1,0,0 → byte 0x01 830 input.push(0x01); 831 // LEN = 5, little-endian 832 input.push(5); 833 input.push(0); 834 // NLEN = !5 = 0xFFFA, little-endian 835 input.push(0xFA); 836 input.push(0xFF); 837 // Data 838 input.extend_from_slice(data_to_store); 839 840 let result = inflate(&input).unwrap(); 841 assert_eq!(result, b"Hello"); 842 } 843 844 #[test] 845 fn inflate_uncompressed_empty() { 846 // Non-compressed block with length 0 847 let mut input = Vec::new(); 848 input.push(0x01); // BFINAL=1, BTYPE=00 849 input.push(0); // LEN=0 850 input.push(0); 851 input.push(0xFF); // NLEN=0xFFFF = !0 852 input.push(0xFF); 853 854 let result = inflate(&input).unwrap(); 855 assert!(result.is_empty()); 856 } 857 858 #[test] 859 fn inflate_uncompressed_len_mismatch() { 860 let mut input = Vec::new(); 861 input.push(0x01); 862 input.push(5); 863 input.push(0); 864 input.push(0x00); // Wrong NLEN 865 input.push(0x00); 866 867 assert!(matches!( 868 inflate(&input), 869 Err(DeflateError::LenMismatch { .. }) 870 )); 871 } 872 873 // -- Multiple non-compressed blocks -- 874 875 #[test] 876 fn inflate_two_uncompressed_blocks() { 877 let mut input = Vec::new(); 878 879 // Block 1: BFINAL=0, BTYPE=00 880 input.push(0x00); 881 input.push(3); 882 input.push(0); 883 input.push(0xFC); 884 input.push(0xFF); 885 input.extend_from_slice(b"Hel"); 886 887 // Block 2: BFINAL=1, BTYPE=00 888 input.push(0x01); 889 input.push(2); 890 input.push(0); 891 input.push(0xFD); 892 input.push(0xFF); 893 input.extend_from_slice(b"lo"); 894 895 let result = inflate(&input).unwrap(); 896 assert_eq!(result, b"Hello"); 897 } 898 899 // -- Fixed Huffman block tests -- 900 901 #[test] 902 fn inflate_fixed_huffman_literals_only() { 903 // Use Rust's flate2-equivalent: manually construct a fixed Huffman block 904 // that encodes just literal bytes and end-of-block. 905 // 906 // For simplicity, we test against known compressed output. 907 // The easiest way is to compress with the non-compressed format 908 // and verify, then test with known DEFLATE streams. 909 910 // Use a known compressed representation of empty data: 911 // Fixed block with just end-of-block (symbol 256) 912 // Symbol 256 has code length 7, code = 0000000 in fixed Huffman 913 // Actually, per fixed table: 256-279 have 7-bit codes starting at 0b0000000 914 // Code for 256: 7 bits, value 0b0000000 915 // Reversed for LSB-first: 0b0000000 916 // BFINAL=1, BTYPE=01: bits 1, 0, 1 → 0b101 = 5 917 // Then 7 bits of 0 for symbol 256 918 // Total: 3 + 7 = 10 bits → 2 bytes 919 // Byte 0: bits 0-7: 1,0,1, 0,0,0,0,0 = 0b00000_101 = 0x05 920 // Byte 1: bits 8-9: 0,0 (remaining bits of the code) + padding = 0x00 921 let input = [0x03, 0x00]; // Known empty fixed block 922 let result = inflate(&input).unwrap(); 923 assert!(result.is_empty()); 924 } 925 926 #[test] 927 fn inflate_fixed_huffman_known_data() { 928 // This is the DEFLATE compressed form of "Hello" using fixed Huffman codes. 929 // Generated by standard deflate: raw deflate of "Hello" 930 // We can verify this by using known test vectors. 931 // 932 // Actually, let's build it manually: 933 // BFINAL=1, BTYPE=01 (fixed) 934 // Then encode 'H'(72), 'e'(101), 'l'(108), 'l'(108), 'o'(111), EOB(256) 935 // 936 // Fixed Huffman codes for literals 0-143: 8-bit codes 00110000..10111111 937 // Code for literal N (0-143): reversed(N + 0x30) in 8 bits 938 // 939 // Let's use a known deflate stream instead. The bytes below are the 940 // raw DEFLATE encoding of "Hello" (verified against RFC 1951). 941 let input = [0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x07, 0x00]; 942 let result = inflate(&input).unwrap(); 943 assert_eq!(result, b"Hello"); 944 } 945 946 // -- Invalid block type -- 947 948 #[test] 949 fn inflate_invalid_block_type() { 950 // BFINAL=1, BTYPE=11 (reserved) 951 // Bits: 1, 1, 1 → byte 0x07 952 let input = [0x07]; 953 assert!(matches!( 954 inflate(&input), 955 Err(DeflateError::InvalidBlockType(3)) 956 )); 957 } 958 959 // -- Empty input -- 960 961 #[test] 962 fn inflate_empty_input() { 963 assert!(matches!(inflate(&[]), Err(DeflateError::UnexpectedEof))); 964 } 965 966 // -- Distance validation -- 967 968 #[test] 969 fn inflate_distance_table_coverage() { 970 // Verify all distance table entries are valid 971 for (i, &(base, extra)) in DISTANCE_TABLE.iter().enumerate() { 972 assert!(base >= 1, "distance code {i} has base {base} < 1"); 973 let max_dist = base as u32 + ((1u32 << extra) - 1); 974 assert!( 975 max_dist <= 32768, 976 "distance code {i} max {max_dist} exceeds 32KB window" 977 ); 978 } 979 } 980 981 // -- Length table coverage -- 982 983 #[test] 984 fn inflate_length_table_coverage() { 985 // Verify length table entries 986 for (i, &(base, extra)) in LENGTH_TABLE.iter().enumerate() { 987 assert!(base >= 3, "length code {i} has base {base} < 3"); 988 if i < 28 { 989 // Codes 257-284 have ranges 990 let max_len = base as u32 + ((1u32 << extra) - 1); 991 assert!(max_len <= 258, "length code {i} max {max_len} exceeds 258"); 992 } 993 } 994 } 995 996 // -- Back-reference (LZ77) tests via fixed Huffman -- 997 998 #[test] 999 fn inflate_with_back_reference() { 1000 // DEFLATE compressed "abcabcabc" — uses a back-reference 1001 // Raw DEFLATE for "abcabcabc" with fixed Huffman: 1002 let input = [0x4b, 0x4c, 0x4a, 0x4e, 0x04, 0x23, 0x00]; 1003 let result = inflate(&input).unwrap(); 1004 assert_eq!(result, b"abcabcabc"); 1005 } 1006 1007 // -- Dynamic Huffman block tests -- 1008 1009 #[test] 1010 fn inflate_dynamic_huffman() { 1011 // A known DEFLATE stream with dynamic Huffman codes. 1012 // This is the raw DEFLATE encoding of "AAAAAAAAAA" (10 A's) 1013 // which benefits from dynamic codes + back-references. 1014 let input = [0x73, 0x74, 0x84, 0x01, 0x00]; 1015 let result = inflate(&input).unwrap(); 1016 assert_eq!(result, b"AAAAAAAAAA"); 1017 } 1018 1019 // -- Fixed Huffman table properties -- 1020 1021 #[test] 1022 fn fixed_literal_lengths_correct() { 1023 let lengths = fixed_literal_lengths(); 1024 // 0-143: length 8 1025 for i in 0..=143 { 1026 assert_eq!(lengths[i], 8, "literal {i} should have length 8"); 1027 } 1028 // 144-255: length 9 1029 for i in 144..=255 { 1030 assert_eq!(lengths[i], 9, "literal {i} should have length 9"); 1031 } 1032 // 256-279: length 7 1033 for i in 256..=279 { 1034 assert_eq!(lengths[i], 7, "literal {i} should have length 7"); 1035 } 1036 // 280-287: length 8 1037 for i in 280..=287 { 1038 assert_eq!(lengths[i], 8, "literal {i} should have length 8"); 1039 } 1040 } 1041 1042 #[test] 1043 fn fixed_distance_lengths_correct() { 1044 let lengths = fixed_distance_lengths(); 1045 for (i, &len) in lengths.iter().enumerate() { 1046 assert_eq!(len, 5, "distance {i} should have length 5"); 1047 } 1048 } 1049 1050 // -- Round-trip test using non-compressed then verifying inflate -- 1051 1052 #[test] 1053 fn inflate_uncompressed_large_block() { 1054 // Test with a larger block (256 bytes) 1055 let payload: Vec<u8> = (0..=255).collect(); 1056 let mut input = Vec::new(); 1057 input.push(0x01); // BFINAL=1, BTYPE=00 1058 let len = payload.len() as u16; 1059 input.push(len as u8); 1060 input.push((len >> 8) as u8); 1061 let nlen = !len; 1062 input.push(nlen as u8); 1063 input.push((nlen >> 8) as u8); 1064 input.extend_from_slice(&payload); 1065 1066 let result = inflate(&input).unwrap(); 1067 assert_eq!(result, payload); 1068 } 1069 1070 // -- Code length order -- 1071 1072 #[test] 1073 fn code_length_order_correct() { 1074 // Per RFC 1951 §3.2.7 1075 assert_eq!( 1076 CODE_LENGTH_ORDER, 1077 [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15] 1078 ); 1079 } 1080 1081 // -- Overlapping back-reference -- 1082 1083 #[test] 1084 fn inflate_overlapping_backref() { 1085 // Test that overlapping back-references work correctly. 1086 // e.g., distance=1, length=5 starting from "A" should produce "AAAAA" 1087 // This is tested indirectly through the DEFLATE stream, 1088 // but let's verify the copy logic directly. 1089 let mut output = vec![b'A']; 1090 let distance = 1usize; 1091 let length = 5usize; 1092 let start = output.len() - distance; 1093 for i in 0..length { 1094 let b = output[start + (i % distance)]; 1095 output.push(b); 1096 } 1097 assert_eq!(output, b"AAAAAA"); 1098 } 1099 1100 // -- Comprehensive round-trip tests with zlib-generated vectors -- 1101 1102 #[test] 1103 fn inflate_single_byte() { 1104 let compressed = [0x4b, 0x04, 0x00]; 1105 let result = inflate(&compressed).unwrap(); 1106 assert_eq!(result, b"a"); 1107 } 1108 1109 #[test] 1110 fn inflate_repeated_char_1000() { 1111 // 1000 'a' characters — heavy LZ77 back-referencing 1112 let compressed = [ 1113 0x4b, 0x4c, 0x1c, 0x05, 0xa3, 0x60, 0x14, 0x0c, 0x77, 0x00, 0x00, 1114 ]; 1115 let result = inflate(&compressed).unwrap(); 1116 assert_eq!(result.len(), 1000); 1117 assert!(result.iter().all(|&b| b == b'a')); 1118 } 1119 1120 #[test] 1121 fn inflate_pangram() { 1122 let compressed = [ 1123 0x0b, 0xc9, 0x48, 0x55, 0x28, 0x2c, 0xcd, 0x4c, 0xce, 0x56, 0x48, 0x2a, 0xca, 0x2f, 1124 0xcf, 0x53, 0x48, 0xcb, 0xaf, 0x50, 0xc8, 0x2a, 0xcd, 0x2d, 0x28, 0x56, 0xc8, 0x2f, 1125 0x4b, 0x2d, 0x52, 0x28, 0x01, 0x4a, 0xe7, 0x24, 0x56, 0x55, 0x2a, 0xa4, 0xe4, 0xa7, 1126 0x03, 0x00, 1127 ]; 1128 let result = inflate(&compressed).unwrap(); 1129 assert_eq!(result, b"The quick brown fox jumps over the lazy dog"); 1130 } 1131 1132 #[test] 1133 fn inflate_alphabet_repeated() { 1134 // "abcdefghijklmnopqrstuvwxyz" * 10 = 260 bytes 1135 let compressed = [ 1136 0x4b, 0x4c, 0x4a, 0x4e, 0x49, 0x4d, 0x4b, 0xcf, 0xc8, 0xcc, 0xca, 0xce, 0xc9, 0xcd, 1137 0xcb, 0x2f, 0x28, 0x2c, 0x2a, 0x2e, 0x29, 0x2d, 0x2b, 0xaf, 0xa8, 0xac, 0x4a, 0x1c, 1138 0x31, 0x32, 0x00, 1139 ]; 1140 let result = inflate(&compressed).unwrap(); 1141 let expected: Vec<u8> = b"abcdefghijklmnopqrstuvwxyz".repeat(10); 1142 assert_eq!(result, expected); 1143 } 1144 1145 // -- All byte values via non-compressed block -- 1146 1147 #[test] 1148 fn inflate_all_byte_values() { 1149 // Non-compressed block containing all 256 byte values 1150 let payload: Vec<u8> = (0..=255).collect(); 1151 // zlib at level 6 uses non-compressed for this data 1152 let compressed = [ 1153 0x01, 0x00, 0x01, 0xff, 0xfe, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 1154 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 1155 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 1156 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 1157 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 1158 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 1159 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 1160 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 1161 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 1162 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 1163 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 1164 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 1165 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 1166 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 1167 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 1168 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 1169 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 1170 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 1171 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 1172 ]; 1173 let result = inflate(&compressed).unwrap(); 1174 assert_eq!(result, payload); 1175 } 1176 1177 // -- Maximum back-reference distance (32KB window) -- 1178 1179 #[test] 1180 fn inflate_max_window_uncompressed() { 1181 // Test near the 32KB window limit with non-compressed blocks 1182 let payload = vec![0xAB; 32768]; 1183 // Build as a series of non-compressed blocks (max 65535 bytes each) 1184 let mut input = Vec::new(); 1185 input.push(0x01); // BFINAL=1, BTYPE=00 1186 let len = 32768u16; 1187 // 32768 = 0x8000 which fits in u16 1188 input.push(len as u8); 1189 input.push((len >> 8) as u8); 1190 let nlen = !len; 1191 input.push(nlen as u8); 1192 input.push((nlen >> 8) as u8); 1193 input.extend_from_slice(&payload); 1194 1195 let result = inflate(&input).unwrap(); 1196 assert_eq!(result.len(), 32768); 1197 assert!(result.iter().all(|&b| b == 0xAB)); 1198 } 1199}