//! UTF-8 decoder and encoder per WHATWG Encoding Standard.

use crate::error::{EncodingError, Result};

/// Strategy the decoder follows when it meets malformed input.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum ErrorMode {
    /// Emit U+FFFD for the malformed sequence and keep decoding.
    Replacement,
    /// Stop at the first malformed sequence and report an error.
    Fatal,
}

/// Decode a byte slice as UTF-8.
///
/// In replacement mode, invalid sequences are replaced with U+FFFD.
/// In fatal mode, the first invalid sequence causes an error.
pub(crate) fn decode_utf8(bytes: &[u8], mode: ErrorMode) -> Result<String> {
    // Strip UTF-8 BOM if present
    let bytes = if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
        &bytes[3..]
    } else {
        bytes
    };

    let mut output = String::with_capacity(bytes.len());
    let mut decoder = Utf8Decoder::new();
    let mut i = 0;

    while i < bytes.len() {
        match decoder.process_byte(bytes[i]) {
            DecoderResult::CodePoint(ch) => {
                output.push(ch);
                i += 1;
            }
            DecoderResult::Error(error_pos) => {
                if mode == ErrorMode::Fatal {
                    return Err(EncodingError::InvalidSequence {
                        encoding: "UTF-8",
                        position: error_pos,
                    });
                }
                output.push('\u{FFFD}');
                i += 1;
            }
            DecoderResult::ErrorPrepend(error_pos) => {
                if mode == ErrorMode::Fatal {
                    return Err(EncodingError::InvalidSequence {
                        encoding: "UTF-8",
                        position: error_pos,
                    });
                }
                output.push('\u{FFFD}');
                // Do NOT advance i — re-process this byte
            }
            DecoderResult::Continue => {
                i += 1;
            }
        }
    }

    // Handle incomplete sequence at end of input
    if decoder.bytes_needed > 0 {
        if mode == ErrorMode::Fatal {
            return Err(EncodingError::InvalidSequence {
                encoding: "UTF-8",
                position: bytes.len().saturating_sub(decoder.bytes_seen as usize),
            });
        }
        output.push('\u{FFFD}');
    }

    Ok(output)
}

/// Produce the UTF-8 byte representation of `text` as an owned vector.
///
/// A Rust `&str` is guaranteed to hold valid UTF-8 already, so no
/// transcoding takes place: the underlying bytes are copied verbatim.
pub(crate) fn encode_utf8(text: &str) -> Vec<u8> {
    let raw = text.as_bytes();
    let mut encoded = Vec::with_capacity(raw.len());
    encoded.extend_from_slice(raw);
    encoded
}

// ---------------------------------------------------------------------------
// Streaming UTF-8 decoder (WHATWG Encoding Standard §8.1.1)
// ---------------------------------------------------------------------------

/// Outcome of feeding a single byte to [`Utf8Decoder::process_byte`].
enum DecoderResult {
    /// The byte completed a code point, which is carried in the variant.
    CodePoint(char),
    /// The byte cannot begin a sequence. The payload is the position of that
    /// byte; the caller should move on to the next byte.
    Error(usize),
    /// The byte is not a valid continuation of the sequence in progress. The
    /// payload is the position where that sequence started; the caller should
    /// feed the same byte again because it was not consumed.
    ErrorPrepend(usize),
    /// The byte was accepted but the sequence is not complete yet.
    Continue,
}

/// State of the byte-at-a-time UTF-8 decoder.
struct Utf8Decoder {
    /// Number of bytes processed so far (a byte handed back for
    /// re-processing is not counted until it is consumed).
    position: usize,
    /// Position of the lead byte of the multi-byte sequence in progress.
    sequence_start: usize,
    /// Code point bits accumulated from the current sequence.
    code_point: u32,
    /// Continuation bytes the current sequence requires; zero means the
    /// decoder is between sequences.
    bytes_needed: u8,
    /// Continuation bytes of the current sequence accepted so far.
    bytes_seen: u8,
    /// Smallest value accepted for the next continuation byte (inclusive).
    lower_boundary: u8,
    /// Largest value accepted for the next continuation byte (inclusive).
    upper_boundary: u8,
}

impl Utf8Decoder {
    fn new() -> Self {
        Self {
            code_point: 0,
            bytes_seen: 0,
            bytes_needed: 0,
            lower_boundary: 0x80,
            upper_boundary: 0xBF,
            sequence_start: 0,
            position: 0,
        }
    }

    fn process_byte(&mut self, byte: u8) -> DecoderResult {
        let pos = self.position;
        self.position += 1;

        if self.bytes_needed == 0 {
            match byte {
                0x00..=0x7F => DecoderResult::CodePoint(byte as char),
                0xC2..=0xDF => {
                    self.bytes_needed = 1;
                    self.code_point = (byte & 0x1F) as u32;
                    self.sequence_start = pos;
                    DecoderResult::Continue
                }
                0xE0 => {
                    self.bytes_needed = 2;
                    self.lower_boundary = 0xA0;
                    self.code_point = (byte & 0x0F) as u32;
                    self.sequence_start = pos;
                    DecoderResult::Continue
                }
                0xE1..=0xEC | 0xEE..=0xEF => {
                    self.bytes_needed = 2;
                    self.code_point = (byte & 0x0F) as u32;
                    self.sequence_start = pos;
                    DecoderResult::Continue
                }
                0xED => {
                    self.bytes_needed = 2;
                    self.upper_boundary = 0x9F;
                    self.code_point = (byte & 0x0F) as u32;
                    self.sequence_start = pos;
                    DecoderResult::Continue
                }
                0xF0 => {
                    self.bytes_needed = 3;
                    self.lower_boundary = 0x90;
                    self.code_point = (byte & 0x07) as u32;
                    self.sequence_start = pos;
                    DecoderResult::Continue
                }
                0xF1..=0xF3 => {
                    self.bytes_needed = 3;
                    self.code_point = (byte & 0x07) as u32;
                    self.sequence_start = pos;
                    DecoderResult::Continue
                }
                0xF4 => {
                    self.bytes_needed = 3;
                    self.upper_boundary = 0x8F;
                    self.code_point = (byte & 0x07) as u32;
                    self.sequence_start = pos;
                    DecoderResult::Continue
                }
                _ => {
                    // 0x80..=0xC1, 0xF5..=0xFF: invalid lead byte
                    DecoderResult::Error(pos)
                }
            }
        } else {
            // Expecting continuation byte
            if byte < self.lower_boundary || byte > self.upper_boundary {
                // Invalid continuation — reset and prepend byte
                let err_pos = self.sequence_start;
                self.reset();
                self.position -= 1; // will be re-processed
                return DecoderResult::ErrorPrepend(err_pos);
            }

            // Valid continuation byte
            self.lower_boundary = 0x80;
            self.upper_boundary = 0xBF;
            self.code_point = (self.code_point << 6) | (byte & 0x3F) as u32;
            self.bytes_seen += 1;

            if self.bytes_seen == self.bytes_needed {
                let cp = self.code_point;
                self.reset();
                // The WHATWG state machine guarantees valid scalar values here,
                // but use fallback for defense-in-depth.
                let ch = char::from_u32(cp).unwrap_or('\u{FFFD}');
                DecoderResult::CodePoint(ch)
            } else {
                DecoderResult::Continue
            }
        }
    }

    fn reset(&mut self) {
        self.code_point = 0;
        self.bytes_seen = 0;
        self.bytes_needed = 0;
        self.lower_boundary = 0x80;
        self.upper_boundary = 0xBF;
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes in replacement mode; the `unwrap` asserts that no error is
    /// returned for the inputs used by these tests.
    fn decode_replace(bytes: &[u8]) -> String {
        decode_utf8(bytes, ErrorMode::Replacement).unwrap()
    }

    /// Decodes in fatal mode, handing the `Result` back for inspection.
    fn decode_fatal(bytes: &[u8]) -> Result<String> {
        decode_utf8(bytes, ErrorMode::Fatal)
    }

    // -- Basic ASCII --

    #[test]
    fn ascii_roundtrip() {
        assert_eq!(decode_replace(b"Hello, world!"), "Hello, world!");
    }

    #[test]
    fn empty_input() {
        assert_eq!(decode_replace(b""), "");
    }

    #[test]
    fn null_byte() {
        // U+0000 is an ordinary one-byte code point and must be preserved
        assert_eq!(decode_replace(&[0x00]), "\0");
    }

    // -- Multi-byte sequences --

    #[test]
    fn two_byte_sequence() {
        // U+00E9 (e with acute) = 0xC3 0xA9
        assert_eq!(decode_replace(&[0xC3, 0xA9]), "\u{00E9}");
    }

    #[test]
    fn three_byte_sequence() {
        // U+4E16 (CJK character) = 0xE4 0xB8 0x96
        assert_eq!(decode_replace(&[0xE4, 0xB8, 0x96]), "\u{4E16}");
    }

    #[test]
    fn four_byte_sequence() {
        // U+1F600 (grinning face) = 0xF0 0x9F 0x98 0x80
        assert_eq!(decode_replace(&[0xF0, 0x9F, 0x98, 0x80]), "\u{1F600}");
    }

    #[test]
    fn mixed_ascii_and_multibyte() {
        // "Caf\u{00E9}" = [0x43, 0x61, 0x66, 0xC3, 0xA9]
        assert_eq!(
            decode_replace(&[0x43, 0x61, 0x66, 0xC3, 0xA9]),
            "Caf\u{00E9}"
        );
    }

    // -- BOM handling --

    #[test]
    fn bom_stripped() {
        // UTF-8 BOM + "A"
        assert_eq!(decode_replace(&[0xEF, 0xBB, 0xBF, 0x41]), "A");
    }

    #[test]
    fn bom_only() {
        // Input consisting solely of the BOM decodes to the empty string
        assert_eq!(decode_replace(&[0xEF, 0xBB, 0xBF]), "");
    }

    // -- Invalid sequences (replacement mode) --

    #[test]
    fn invalid_byte_ff() {
        // 0xFF is rejected as a lead byte
        assert_eq!(decode_replace(&[0xFF]), "\u{FFFD}");
    }

    #[test]
    fn invalid_byte_fe() {
        // 0xFE is rejected as a lead byte
        assert_eq!(decode_replace(&[0xFE]), "\u{FFFD}");
    }

    #[test]
    fn invalid_continuation_byte_standalone() {
        // 0x80 without a lead byte
        assert_eq!(decode_replace(&[0x80]), "\u{FFFD}");
    }

    #[test]
    fn overlong_two_byte() {
        // 0xC0 0xAF is an overlong encoding of U+002F ('/')
        // 0xC0 is always invalid (lead byte rejected), 0xAF is a continuation
        // byte without a lead (also invalid) — both produce U+FFFD
        assert_eq!(decode_replace(&[0xC0, 0xAF]), "\u{FFFD}\u{FFFD}");
    }

    #[test]
    fn truncated_two_byte() {
        // 0xC3 without continuation
        assert_eq!(decode_replace(&[0xC3]), "\u{FFFD}");
    }

    #[test]
    fn truncated_three_byte() {
        // 0xE4 0xB8 without third byte
        assert_eq!(decode_replace(&[0xE4, 0xB8]), "\u{FFFD}");
    }

    #[test]
    fn truncated_four_byte() {
        // 0xF0 0x9F 0x98 without fourth byte
        assert_eq!(decode_replace(&[0xF0, 0x9F, 0x98]), "\u{FFFD}");
    }

    #[test]
    fn surrogate_half_rejected() {
        // U+D800 would encode as 0xED 0xA0 0x80, but surrogates are invalid in UTF-8
        // 0xED with upper_boundary 0x9F rejects 0xA0
        // The abandoned lead byte and each of the two stray continuation
        // bytes yield one U+FFFD apiece
        assert_eq!(
            decode_replace(&[0xED, 0xA0, 0x80]),
            "\u{FFFD}\u{FFFD}\u{FFFD}"
        );
    }

    #[test]
    fn invalid_continuation_mid_sequence() {
        // 0xE4 expects continuation, but 0x41 is ASCII — error + prepend
        assert_eq!(decode_replace(&[0xE4, 0x41]), "\u{FFFD}A");
    }

    #[test]
    fn invalid_between_valid() {
        // Valid 'A', invalid 0xFF, valid 'B'
        assert_eq!(decode_replace(&[0x41, 0xFF, 0x42]), "A\u{FFFD}B");
    }

    #[test]
    fn multiple_errors_in_a_row() {
        // Each invalid byte gets its own replacement character
        assert_eq!(
            decode_replace(&[0xFE, 0xFF, 0xFE]),
            "\u{FFFD}\u{FFFD}\u{FFFD}"
        );
    }

    // -- Fatal mode --

    #[test]
    fn fatal_valid() {
        assert_eq!(decode_fatal(b"Hello").unwrap(), "Hello");
    }

    #[test]
    fn fatal_invalid() {
        // The error carries the offset of the offending byte (0xFF at index 1)
        let err = decode_fatal(&[0x41, 0xFF]).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence {
                encoding: "UTF-8",
                position: 1
            }
        ));
    }

    #[test]
    fn fatal_truncated() {
        // Only the error kind is checked here; the reported position is
        // deliberately left unconstrained by the `..` pattern
        let err = decode_fatal(&[0xC3]).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence {
                encoding: "UTF-8",
                ..
            }
        ));
    }

    // -- Encoder --

    #[test]
    fn encode_ascii() {
        assert_eq!(encode_utf8("Hello"), b"Hello");
    }

    #[test]
    fn encode_multibyte() {
        assert_eq!(encode_utf8("\u{00E9}"), &[0xC3, 0xA9]);
    }

    #[test]
    fn encode_emoji() {
        assert_eq!(encode_utf8("\u{1F600}"), &[0xF0, 0x9F, 0x98, 0x80]);
    }

    #[test]
    fn encode_empty() {
        assert_eq!(encode_utf8(""), b"");
    }

    #[test]
    fn roundtrip() {
        // Encoding then decoding text with 1-, 3- and 4-byte code points
        // must give back the original string
        let original = "Hello \u{4E16}\u{754C} \u{1F600}";
        let encoded = encode_utf8(original);
        let decoded = decode_replace(&encoded);
        assert_eq!(decoded, original);
    }

    // -- Edge cases --

    #[test]
    fn max_two_byte() {
        // U+07FF = 0xDF 0xBF
        assert_eq!(decode_replace(&[0xDF, 0xBF]), "\u{07FF}");
    }

    #[test]
    fn min_three_byte() {
        // U+0800 = 0xE0 0xA0 0x80
        assert_eq!(decode_replace(&[0xE0, 0xA0, 0x80]), "\u{0800}");
    }

    #[test]
    fn max_three_byte() {
        // U+FFFF = 0xEF 0xBF 0xBF
        assert_eq!(decode_replace(&[0xEF, 0xBF, 0xBF]), "\u{FFFF}");
    }

    #[test]
    fn min_four_byte() {
        // U+10000 = 0xF0 0x90 0x80 0x80
        assert_eq!(decode_replace(&[0xF0, 0x90, 0x80, 0x80]), "\u{10000}");
    }

    #[test]
    fn max_unicode() {
        // U+10FFFF = 0xF4 0x8F 0xBF 0xBF
        assert_eq!(decode_replace(&[0xF4, 0x8F, 0xBF, 0xBF]), "\u{10FFFF}");
    }

    #[test]
    fn above_max_unicode_rejected() {
        // 0xF4 0x90 would start U+110000, which is above max
        // 0xF4 has upper_boundary = 0x8F, so 0x90 is rejected
        assert_eq!(
            decode_replace(&[0xF4, 0x90, 0x80, 0x80]),
            "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"
        );
    }

    #[test]
    fn overlong_three_byte_rejected() {
        // 0xE0 requires lower_boundary = 0xA0, so 0xE0 0x80 0x80 is rejected
        assert_eq!(
            decode_replace(&[0xE0, 0x80, 0x80]),
            "\u{FFFD}\u{FFFD}\u{FFFD}"
        );
    }

    #[test]
    fn overlong_four_byte_rejected() {
        // 0xF0 requires lower_boundary = 0x90, so 0xF0 0x80 0x80 0x80 is rejected
        assert_eq!(
            decode_replace(&[0xF0, 0x80, 0x80, 0x80]),
            "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"
        );
    }
}