//! spaCy-compatible tokenizer.
//!
//! a port of the algorithm in spaCy's `tokenizer.pyx`: split on whitespace,
//! then for each chunk iteratively strip prefixes/suffixes, split on infixes,
//! and consult the special-case table. all pattern data comes from the
//! generated tables in `tokenizer_data.zig`.
const std = @import("std");
const data = @import("tokenizer_data.zig");

/// a token is a byte-offset slice into the original text.
pub const Token = struct {
    /// byte offset where the token begins in the original text
    start: u32,
    /// byte offset one past the last byte of the token (exclusive)
    end: u32,

    /// view this token as a slice of `source` (the text it was produced from).
    pub fn text(self: Token, source: []const u8) []const u8 {
        return source[self.start..self.end];
    }
};

/// maximum tokens per document.
pub const MAX_TOKENS = 1024;

/// tokenize text into tokens. returns the number of tokens written.
/// tokens are byte-offset spans into the original text; `out` is filled
/// from index 0 and tokenization stops once it is full.
pub fn tokenize(text: []const u8, out: []Token) u32 {
    if (text.len == 0) return 0;

    var n_tokens: u32 = 0;
    var cursor: usize = 0;
    while (cursor < text.len) {
        // phase 1: advance past any run of whitespace to the next chunk
        while (cursor < text.len and isWhitespace(text[cursor])) : (cursor += 1) {}
        if (cursor >= text.len) break;

        // scan to the end of the non-whitespace chunk
        const begin = cursor;
        while (cursor < text.len and !isWhitespace(text[cursor])) : (cursor += 1) {}

        // phase 2: split this chunk into tokens
        n_tokens = tokenizeChunk(text, begin, cursor, out, n_tokens);
        if (n_tokens >= out.len) return n_tokens;
    }
    return n_tokens;
}

/// tokenize a single whitespace-delimited chunk.
/// text[start..end] is the chunk. writes tokens to out[count..].
fn tokenizeChunk(
    text: []const u8,
    start: usize,
    end: usize,
    out: []Token,
    count_in: u32,
) u32 {
    var count = count_in;
    const chunk = text[start..end];

    // check special cases first: if the whole chunk is a known special form
    // (e.g. a contraction), emit its pre-split sub-tokens and return early.
    if (data.specials.get(chunk)) |special| {
        var offset: u32 = @intCast(start);
        for (0..special.len) |ti| {
            const tok_text = special.tokens[ti];
            // find this token text in the source at the expected position
            const tok_start = findSubstr(text[offset..end], tok_text);
            if (tok_start) |ts| {
                if (count < out.len) {
                    out[count] = .{
                        .start = offset + @as(u32, @intCast(ts)),
                        .end = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len)),
                    };
                    count += 1;
                }
                // advance past the matched sub-token
                offset = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len));
            } else {
                // special token not found at expected position — emit based on length
                if (count < out.len) {
                    out[count] = .{
                        .start = offset,
                        .end = offset + @as(u32, @intCast(tok_text.len)),
                    };
                    count += 1;
                }
                offset += @as(u32, @intCast(tok_text.len));
            }
        }
        return count;
    }

    // split affixes iteratively: [lo, hi) is the not-yet-consumed window.
    var lo: u32 = @intCast(start);
    var hi: u32 = @intCast(end);
    // prefix and suffix stacks (indices into out buffer);
    // prefixes are emitted directly, suffixes are queued and emitted last.
    const prefix_start = count; // where prefixes begin in out
    var suffix_buf: [64]Token = undefined;
    var n_suffixes: u32 = 0;
    var last_len: u32 = 0;
    // loop until the window is empty or a whole pass strips nothing
    // (span length unchanged since the previous iteration).
    while (lo < hi and (hi - lo) != last_len) {
        const span = text[lo..hi];
        const span_len = hi - lo;
        // check if remaining span is a special case — if so, stop stripping
        // and let the middle-emission phase below expand it.
        if (data.specials.get(span) != null) break;
        // check URL match — URLs are kept intact
        if (matchUrl(span) > 0) break;
        last_len = span_len;
        // try prefix
        const pre_len = matchPrefix(span);
        // try suffix on span[pre_len..] but strip from end of full span
        const suf_len = if (pre_len < span_len) matchSuffix(span[pre_len..]) else @as(usize, 0);
        if (pre_len > 0 and suf_len > 0 and (pre_len + suf_len) <= span_len) {
            // both prefix and suffix, non-overlapping
            // check if stripping prefix reveals a special
            const minus_pre = text[lo + @as(u32, @intCast(pre_len)) .. hi];
            if (minus_pre.len > 0 and data.specials.get(minus_pre) != null) {
                // emit prefix, let middle handle the special
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                    count += 1;
                }
                lo += @as(u32, @intCast(pre_len));
                break;
            }
            // check if stripping suffix reveals a special
            const minus_suf = text[lo .. hi - @as(u32, @intCast(suf_len))];
            if (minus_suf.len > 0 and data.specials.get(minus_suf) != null) {
                if (n_suffixes < suffix_buf.len) {
                    suffix_buf[n_suffixes] = .{
                        .start = hi - @as(u32, @intCast(suf_len)),
                        .end = hi,
                    };
                    n_suffixes += 1;
                }
                hi -= @as(u32, @intCast(suf_len));
                break;
            }
            // strip both
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                count += 1;
            }
            if (n_suffixes < suffix_buf.len) {
                suffix_buf[n_suffixes] = .{
                    .start = hi - @as(u32, @intCast(suf_len)),
                    .end = hi,
                };
                n_suffixes += 1;
            }
            lo += @as(u32, @intCast(pre_len));
            hi -= @as(u32, @intCast(suf_len));
        } else if (pre_len > 0) {
            // prefix only
            const minus_pre = text[lo + @as(u32, @intCast(pre_len)) .. hi];
            if (minus_pre.len > 0 and data.specials.get(minus_pre) != null) {
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                    count += 1;
                }
                lo += @as(u32, @intCast(pre_len));
                break;
            }
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                count += 1;
            }
            lo += @as(u32, @intCast(pre_len));
        } else if (suf_len > 0) {
            // suffix only
            const minus_suf = text[lo .. hi - @as(u32, @intCast(suf_len))];
            if (minus_suf.len > 0 and data.specials.get(minus_suf) != null) {
                if (n_suffixes < suffix_buf.len) {
                    suffix_buf[n_suffixes] = .{
                        .start = hi - @as(u32, @intCast(suf_len)),
                        .end = hi,
                    };
                    n_suffixes += 1;
                }
                hi -= @as(u32, @intCast(suf_len));
                break;
            }
            if (n_suffixes < suffix_buf.len) {
                suffix_buf[n_suffixes] = .{
                    .start = hi - @as(u32, @intCast(suf_len)),
                    .end = hi,
                };
                n_suffixes += 1;
            }
            hi -= @as(u32, @intCast(suf_len));
        }
        // else: neither matched, last_len == span_len, loop exits
    }
    // NOTE(review): prefix_start is computed but never used — presumably a
    // leftover from a planned prefix-stack pass; confirm and remove if so.
    _ = prefix_start;

    // emit middle portion
    if (lo < hi) {
        const middle = text[lo..hi];
        // try special cases for the remaining middle
        if (data.specials.get(middle)) |special| {
            var offset: u32 = lo;
            for (0..special.len) |ti| {
                const tok_text = special.tokens[ti];
                const tok_start = findSubstr(text[offset..hi], tok_text);
                if (tok_start) |ts| {
                    if (count < out.len) {
                        out[count] = .{
                            .start = offset + @as(u32, @intCast(ts)),
                            .end = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len)),
                        };
                        count += 1;
                    }
                    offset = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len));
                } else {
                    // sub-token not found in source — fall back to length-based span
                    if (count < out.len) {
                        out[count] = .{ .start = offset, .end = offset + @as(u32, @intCast(tok_text.len)) };
                        count += 1;
                    }
                    offset += @as(u32, @intCast(tok_text.len));
                }
            }
        } else if (matchUrl(middle) > 0) {
            // URL — emit as single token
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = hi };
                count += 1;
            }
        } else {
            // try infix splitting
            var infixes: [64]Infix = undefined;
            const n_infixes = findInfixes(middle, &infixes);
            if (n_infixes == 0) {
                // no infixes — single token
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = hi };
                    count += 1;
                }
            } else {
                // split on infixes; pos tracks the end of the last emitted piece
                var pos: u32 = lo;
                for (infixes[0..n_infixes]) |inf| {
                    const inf_start = lo + @as(u32, @intCast(inf.start));
                    const inf_end = lo + @as(u32, @intCast(inf.end));
                    // skip infixes at position 0 (never split before the first char)
                    if (inf.start == 0) continue;
                    // emit text before infix
                    if (inf_start > pos) {
                        if (count < out.len) {
                            out[count] = .{ .start = pos, .end = inf_start };
                            count += 1;
                        }
                    }
                    // emit infix
                    if (inf_start != inf_end) {
                        if (count < out.len) {
                            out[count] = .{ .start = inf_start, .end = inf_end };
                            count += 1;
                        }
                    }
                    pos = inf_end;
                }
                // emit text after last infix
                if (pos < hi) {
                    if (count < out.len) {
                        out[count] = .{ .start = pos, .end = hi };
                        count += 1;
                    }
                }
            }
        }
    }

    // emit suffixes in reverse order (they were stripped outermost-first,
    // so reversing restores left-to-right text order).
    var si = n_suffixes;
    while (si > 0) {
        si -= 1;
        if (count < out.len) {
            out[count] = suffix_buf[si];
            count += 1;
        }
    }
    return count;
}

// ── pattern matching ──

/// match a prefix at position 0. returns byte length of match, or 0.
/// rule order mirrors the generated prefix table: single chars, multi-char
/// literals, symbol class, dot runs, then digit-conditional literals.
pub fn matchPrefix(text: []const u8) usize {
    if (text.len == 0) return 0;
    const cp = data.decodeUtf8(text) orelse return 0;
    // 1. single-character prefixes (switch on codepoint)
    if (data.isPrefixChar(cp.value)) return cp.len;
    // 2. multi-char literals (longest first)
    for (data.prefix_multi_literals) |lit| {
        if (std.mem.startsWith(u8, text, lit)) return lit.len;
    }
    // 3. symbol class (unicode So/Sc categories)
    if (data.isSymbol(cp.value)) return cp.len;
    // 4. 2+ dots
    if (text.len >= 2 and text[0] == '.' and text[1] == '.') {
        var i: usize = 2;
        while (i < text.len and text[i] == '.') : (i += 1) {}
        return i;
    }
    // 5. literal-unless-digit (e.g., + not followed by digit)
    if (data.isPrefixUnlessDigit(cp.value)) {
        if (cp.len >= text.len) return cp.len;
        const next = data.decodeUtf8(text[cp.len..]);
        if (next == null or !isAsciiDigit(next.?.value)) return cp.len;
    }
    return 0;
}

/// match a suffix at the end of text. returns byte length of suffix, or 0.
pub fn matchSuffix(text: []const u8) usize {
    if (text.len == 0) return 0;
    const last = data.lastCodepoint(text) orelse return 0;
    // 1. single-character suffixes
    if (data.isSuffixChar(last.value)) return last.len;
    // 2. symbol class
    if (data.isSymbol(last.value)) return last.len;
    // 3. multi-char literal suffixes (longest first)
    for (data.suffix_multi_literals) |lit| {
        if (std.mem.endsWith(u8, text, lit)) return lit.len;
    }
    // 4. 2+ dots at end — walk backwards to the start of the dot run
    if (text.len >= 2 and text[text.len - 1] == '.' and text[text.len - 2] == '.') {
        var i: usize = text.len - 2;
        while (i > 0 and text[i - 1] == '.') : (i -= 1) {}
        return text.len - i;
    }
    // 5. lookbehind rules (generated)
    const lb = data.matchSuffixLookbehind(text);
    if (lb > 0) return lb;
    return 0;
}

/// infix match result: byte offsets of the infix within the searched span.
const Infix = struct { start: usize, end: usize };

/// find all infix split points. returns count written.
/// scans codepoint by codepoint; rule numbering follows spaCy's infix list.
fn findInfixes(text: []const u8, out: []Infix) usize {
    var count: usize = 0;
    if (text.len == 0) return 0;
    var i: usize = 0;
    while (i < text.len) {
        const cp = data.decodeUtf8(text[i..]) orelse {
            // invalid UTF-8 byte: skip it rather than abort the scan
            i += 1;
            continue;
        };
        var matched: usize = 0;
        // 1. 2+ dots (infix[0])
        if (text[i] == '.' and i + 1 < text.len and text[i + 1] == '.') {
            var end = i + 2;
            while (end < text.len and text[end] == '.') : (end += 1) {}
            matched = end - i;
        }
        // 2. ellipsis U+2026 (infix[1])
        else if (cp.value == 0x2026) {
            matched = cp.len;
        }
        // 3. symbol class (infix[2])
        else if (data.isSymbol(cp.value)) {
            matched = cp.len;
        }
        // contextual rules require lookbehind/lookahead
        else {
            const prev_cp = if (i > 0) data.lastCodepoint(text[0..i]) else null;
            const next_start = i + cp.len;
            const next_cp = if (next_start < text.len) data.decodeUtf8(text[next_start..]) else null;
            // 4. math ops between digits: (?<=[0-9])[+\-*^](?=[0-9\-]) (infix[3])
            if (prev_cp != null and isAsciiDigit(prev_cp.?.value)) {
                if (cp.value == '+' or cp.value == '-' or cp.value == '*' or cp.value == '^') {
                    if (next_cp != null and (isAsciiDigit(next_cp.?.value) or next_cp.?.value == '-')) {
                        matched = cp.len;
                    }
                }
            }
            // 5. period between lower/punct and upper (infix[4])
            if (matched == 0 and cp.value == '.') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_4_behind(prev_cp.?.value) and data.is_infix_4_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }
            // 6. comma between alpha chars (infix[5])
            if (matched == 0 and cp.value == ',') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_5_behind(prev_cp.?.value) and data.is_infix_5_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }
            // 7. hyphens/dashes between alnum (infix[6])
            // NOTE(review): pairs is_infix_6_behind with is_infix_7_ahead —
            // presumably the generated tables share one lookahead class for
            // rules 6 and 7; confirm against tokenizer_data.zig.
            if (matched == 0 and prev_cp != null and next_cp != null) {
                if (data.is_infix_6_behind(prev_cp.?.value) and data.is_infix_7_ahead(next_cp.?.value)) {
                    // try alternatives: ---, --, —— (U+2014 U+2014), —, –, -, ~
                    if (i + 3 <= text.len and std.mem.eql(u8, text[i..][0..3], "---")) {
                        matched = 3;
                    } else if (i + 2 <= text.len and std.mem.eql(u8, text[i..][0..2], "--")) {
                        matched = 2;
                    } else if (i + 6 <= text.len and std.mem.eql(u8, text[i..][0..6], "\xe2\x80\x94\xe2\x80\x94")) {
                        matched = 6; // ——
                    } else if (cp.value == 0x2014) { // —
                        matched = cp.len;
                    } else if (cp.value == 0x2013) { // –
                        matched = cp.len;
                    } else if (cp.value == '-') {
                        matched = 1;
                    } else if (cp.value == '~') {
                        matched = 1;
                    }
                }
            }
            // 8. separators between alnum (infix[7])
            if (matched == 0 and prev_cp != null and next_cp != null) {
                if (data.is_infix_7_behind(prev_cp.?.value) and data.is_infix_7_ahead(next_cp.?.value)) {
                    if (cp.value == '/' or cp.value == ':' or cp.value == '<' or cp.value == '>' or cp.value == '=') {
                        matched = cp.len;
                    }
                }
            }
        }
        if (matched > 0) {
            if (count < out.len) {
                out[count] = .{ .start = i, .end = i + matched };
                count += 1;
            }
            i += matched;
        } else {
            i += cp.len;
        }
    }
    return count;
}

/// simplified URL matcher. matches scheme://... or domain.tld patterns.
/// returns length of match from start, or 0.
fn matchUrl(text: []const u8) usize {
    if (text.len < 4) return 0;
    // check for scheme://
    var pos: usize = 0;
    if (std.mem.startsWith(u8, text, "http://")) {
        pos = 7;
    } else if (std.mem.startsWith(u8, text, "https://")) {
        pos = 8;
    } else if (std.mem.startsWith(u8, text, "ftp://")) {
        pos = 6;
    } else {
        // try generic scheme:// (scheme chars limited to [A-Za-z0-9+-.], max 20)
        var j: usize = 0;
        while (j < text.len and j < 20) : (j += 1) {
            const c = text[j];
            if (c == ':') {
                if (j + 2 < text.len and text[j + 1] == '/' and text[j + 2] == '/') {
                    pos = j + 3;
                    break;
                } else break;
            }
            if (!isAsciiAlnum(c) and c != '+' and c != '-' and c != '.') break;
        }
        // no scheme — try bare domain: word.word or word@word.word
        if (pos == 0) {
            pos = matchBareDomain(text);
        }
    }
    // NOTE(review): pos >= text.len returns 0, so a bare domain that spans
    // the entire chunk (e.g. "example.com" alone) is rejected here and falls
    // through to infix handling — confirm this is the intended behavior.
    if (pos == 0 or pos >= text.len) return 0;
    // consume until whitespace
    while (pos < text.len and !isWhitespace(text[pos])) : (pos += 1) {}
    return pos;
}

/// match a bare domain like example.com or user@example.com.
/// returns the byte length of the domain-like run from the start, or 0.
fn matchBareDomain(text: []const u8) usize {
    // look for word.word or word@word.word pattern
    var i: usize = 0;
    var has_dot = false;
    var has_at = false;
    var last_was_alnum = false;
    while (i < text.len) {
        const c = text[i];
        if (isAsciiAlnum(c) or c == '-' or c == '_') {
            last_was_alnum = isAsciiAlnum(c);
            i += 1;
        } else if (c == '.' and last_was_alnum and i + 1 < text.len and isAsciiAlnum(text[i + 1])) {
            // dot must be between alphanumerics (no leading/trailing/double dots)
            has_dot = true;
            last_was_alnum = false;
            i += 1;
        } else if (c == '@' and !has_at and last_was_alnum and i + 1 < text.len and isAsciiAlnum(text[i + 1])) {
            // at most one '@', also required to sit between alphanumerics
            has_at = true;
            last_was_alnum = false;
            i += 1;
        } else break;
    }
    // must have at least one dot to be a domain
    if (!has_dot) return 0;
    // check TLD is at least 2 chars and alphabetic (not numeric like 500.00)
    var last_dot: usize = 0;
    var j: usize = 0;
    while (j < i) : (j += 1) {
        if (text[j] == '.') last_dot = j;
    }
    const tld_start = last_dot + 1;
    const tld_len = i - tld_start;
    if (tld_len < 2) return 0;
    // TLD must contain at least one letter
    var has_alpha = false;
    j = tld_start;
    while (j < i) : (j += 1) {
        if ((text[j] >= 'a' and text[j] <= 'z') or (text[j] >= 'A' and text[j] <= 'Z')) {
            has_alpha = true;
            break;
        }
    }
    if (!has_alpha) return 0;
    return i;
}

// ── helpers ──

/// ASCII whitespace test used for chunking (space, tab, LF, CR only).
fn isWhitespace(c: u8) bool {
    return c == ' ' or c == '\t' or c == '\n' or c == '\r';
}

/// ASCII digit test on a decoded codepoint.
fn isAsciiDigit(c: u21) bool {
    return c >= '0' and c <= '9';
}

/// ASCII letter-or-digit test on a raw byte.
fn isAsciiAlnum(c: u8) bool {
    return (c >= '0' and c <= '9') or (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');
}

/// naive substring search; returns index of first occurrence or null.
fn findSubstr(haystack: []const u8, needle: []const u8) ?usize {
    if (needle.len > haystack.len) return null;
    if (needle.len == 0) return 0;
    var i: usize = 0;
    while (i + needle.len <= haystack.len) : (i += 1) {
        if (std.mem.eql(u8, haystack[i..][0..needle.len], needle)) return i;
    }
    return null;
}

// ── tests ──

const testing = std.testing;

test "tokenize basic sentence" {
    var tokens: [64]Token = undefined;
    const n = tokenize("Barack Obama visited Paris.", &tokens);
    const expected = [_][]const u8{ "Barack", "Obama", "visited", "Paris", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text("Barack Obama visited Paris."));
    }
}

test "tokenize contractions" {
    var tokens: [64]Token = undefined;
    const text = "I can't believe it's not butter!";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "I", "ca", "n't", "believe", "it", "'s", "not", "butter", "!" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize currency and punctuation" {
    var tokens: [64]Token = undefined;
    const text = "Apple Inc. is worth $2.5 trillion.";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "Apple", "Inc.", "is", "worth", "$", "2.5", "trillion", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize parentheses" {
    var tokens: [64]Token = undefined;
    const text = "Dr. Smith's office (room 42) is closed.";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "Dr.", "Smith", "'s", "office", "(", "room", "42", ")", "is", "closed", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize hyphenated words" {
    var tokens: [64]Token = undefined;
    const text = "New York-based company";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "New", "York", "-", "based", "company" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize abbreviations" {
    var tokens: [64]Token = undefined;
    const text = "U.S.A. and U.K. are allies.";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "U.S.A.", "and", "U.K.", "are", "allies", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize email" {
    var tokens: [64]Token = undefined;
    const text = "e-mail: test@example.com";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "e", "-", "mail", ":", "test@example.com" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "matchPrefix" {
    // single chars
    try testing.expectEqual(@as(usize, 1), matchPrefix("$100"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("(hello)"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("\"quote"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("!"));
    // multi-char
    try testing.expectEqual(@as(usize, 3), matchPrefix("US$100"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("C$100"));
    // dots
    try testing.expectEqual(@as(usize, 3), matchPrefix("...hello"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("..hello"));
    // no match
    try testing.expectEqual(@as(usize, 0), matchPrefix("hello"));
    try testing.expectEqual(@as(usize, 0), matchPrefix("123"));
}

test "matchSuffix" {
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello."));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello!"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello)"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello,"));
    try testing.expectEqual(@as(usize, 0), matchSuffix("hello"));
}

test "findInfixes" {
    var infixes: [64]Infix = undefined;
    // hyphen between words
    const n1 = findInfixes("York-based", &infixes);
    try testing.expect(n1 > 0);
    try testing.expectEqual(@as(usize, 4), infixes[0].start);
    try testing.expectEqual(@as(usize, 5), infixes[0].end);
}