//! spaCy-compatible tokenizer.
//!
//! a port of the algorithm in spaCy's `tokenizer.pyx`: split on whitespace,
//! then for each chunk iteratively strip prefixes/suffixes, split on infixes,
//! and consult the special-case table. all pattern data comes from the
//! generated tables in `tokenizer_data.zig`.
const std = @import("std");
const data = @import("tokenizer_data.zig");

/// a token is a byte-offset slice into the original text.
pub const Token = struct {
    /// byte offset where the token begins in the original text
    start: u32,
    /// byte offset one past the last byte of the token (exclusive)
    end: u32,

    /// view this token as a slice of `source` (the text it was produced from).
    pub fn text(self: Token, source: []const u8) []const u8 {
        return source[self.start..self.end];
    }
};

/// maximum tokens per document.
pub const MAX_TOKENS = 1024;

/// tokenize text into tokens. returns the number of tokens written.
/// tokens are byte-offset spans into the original text; `out` is filled
/// from index 0 and tokenization stops once it is full.
pub fn tokenize(text: []const u8, out: []Token) u32 {
    if (text.len == 0) return 0;

    var n_tokens: u32 = 0;
    var cursor: usize = 0;
    while (cursor < text.len) {
        // phase 1: advance past any run of whitespace to the next chunk
        while (cursor < text.len and isWhitespace(text[cursor])) : (cursor += 1) {}
        if (cursor >= text.len) break;

        // scan to the end of the non-whitespace chunk
        const begin = cursor;
        while (cursor < text.len and !isWhitespace(text[cursor])) : (cursor += 1) {}

        // phase 2: split this chunk into tokens
        n_tokens = tokenizeChunk(text, begin, cursor, out, n_tokens);
        if (n_tokens >= out.len) return n_tokens;
    }
    return n_tokens;
}

/// tokenize a single whitespace-delimited chunk.
/// text[start..end] is the chunk. writes tokens to out[count..].
fn tokenizeChunk(
    text: []const u8,
    start: usize,
    end: usize,
    out: []Token,
    count_in: u32,
) u32 {
    var count = count_in;
    const chunk = text[start..end];

    // check special cases first: if the whole chunk is a known special form
    // (e.g. a contraction), emit its pre-split sub-tokens and return early.
    if (data.specials.get(chunk)) |special| {
        var offset: u32 = @intCast(start);
        for (0..special.len) |ti| {
            const tok_text = special.tokens[ti];
            // find this token text in the source at the expected position
            const tok_start = findSubstr(text[offset..end], tok_text);
            if (tok_start) |ts| {
                if (count < out.len) {
                    out[count] = .{
                        .start = offset + @as(u32, @intCast(ts)),
                        .end = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len)),
                    };
                    count += 1;
                }
                // advance past the matched sub-token
                offset = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len));
            } else {
                // special token not found at expected position — emit based on length
                if (count < out.len) {
                    out[count] = .{
                        .start = offset,
                        .end = offset + @as(u32, @intCast(tok_text.len)),
                    };
                    count += 1;
                }
                offset += @as(u32, @intCast(tok_text.len));
            }
        }
        return count;
    }

    // split affixes iteratively: [lo, hi) is the not-yet-consumed window.
    var lo: u32 = @intCast(start);
    var hi: u32 = @intCast(end);
    // prefix and suffix stacks (indices into out buffer);
    // prefixes are emitted directly, suffixes are queued and emitted last.
    const prefix_start = count; // where prefixes begin in out
    var suffix_buf: [64]Token = undefined;
    var n_suffixes: u32 = 0;
    var last_len: u32 = 0;
    // loop until the window is empty or a whole pass strips nothing
    // (span length unchanged since the previous iteration).
    while (lo < hi and (hi - lo) != last_len) {
        const span = text[lo..hi];
        const span_len = hi - lo;
        // check if remaining span is a special case — if so, stop stripping
        // and let the middle-emission phase below expand it.
        if (data.specials.get(span) != null) break;
        // check URL match — URLs are kept intact
        if (matchUrl(span) > 0) break;
        last_len = span_len;
        // try prefix
        const pre_len = matchPrefix(span);
        // try suffix on span[pre_len..] but strip from end of full span
        const suf_len = if (pre_len < span_len) matchSuffix(span[pre_len..]) else @as(usize, 0);
        if (pre_len > 0 and suf_len > 0 and (pre_len + suf_len) <= span_len) {
            // both prefix and suffix, non-overlapping
            // check if stripping prefix reveals a special
            const minus_pre = text[lo + @as(u32, @intCast(pre_len)) .. hi];
            if (minus_pre.len > 0 and data.specials.get(minus_pre) != null) {
                // emit prefix, let middle handle the special
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                    count += 1;
                }
                lo += @as(u32, @intCast(pre_len));
                break;
            }
            // check if stripping suffix reveals a special
            const minus_suf = text[lo .. hi - @as(u32, @intCast(suf_len))];
            if (minus_suf.len > 0 and data.specials.get(minus_suf) != null) {
                if (n_suffixes < suffix_buf.len) {
                    suffix_buf[n_suffixes] = .{
                        .start = hi - @as(u32, @intCast(suf_len)),
                        .end = hi,
                    };
                    n_suffixes += 1;
                }
                hi -= @as(u32, @intCast(suf_len));
                break;
            }
            // strip both
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                count += 1;
            }
            if (n_suffixes < suffix_buf.len) {
                suffix_buf[n_suffixes] = .{
                    .start = hi - @as(u32, @intCast(suf_len)),
                    .end = hi,
                };
                n_suffixes += 1;
            }
            lo += @as(u32, @intCast(pre_len));
            hi -= @as(u32, @intCast(suf_len));
        } else if (pre_len > 0) {
            // prefix only
            const minus_pre = text[lo + @as(u32, @intCast(pre_len)) .. hi];
            if (minus_pre.len > 0 and data.specials.get(minus_pre) != null) {
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                    count += 1;
                }
                lo += @as(u32, @intCast(pre_len));
                break;
            }
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = lo + @as(u32, @intCast(pre_len)) };
                count += 1;
            }
            lo += @as(u32, @intCast(pre_len));
        } else if (suf_len > 0) {
            // suffix only
            const minus_suf = text[lo .. hi - @as(u32, @intCast(suf_len))];
            if (minus_suf.len > 0 and data.specials.get(minus_suf) != null) {
                if (n_suffixes < suffix_buf.len) {
                    suffix_buf[n_suffixes] = .{
                        .start = hi - @as(u32, @intCast(suf_len)),
                        .end = hi,
                    };
                    n_suffixes += 1;
                }
                hi -= @as(u32, @intCast(suf_len));
                break;
            }
            if (n_suffixes < suffix_buf.len) {
                suffix_buf[n_suffixes] = .{
                    .start = hi - @as(u32, @intCast(suf_len)),
                    .end = hi,
                };
                n_suffixes += 1;
            }
            hi -= @as(u32, @intCast(suf_len));
        }
        // else: neither matched, last_len == span_len, loop exits
    }
    // NOTE(review): prefix_start is computed but never used — presumably a
    // leftover from a planned prefix-stack pass; confirm and remove if so.
    _ = prefix_start;

    // emit middle portion
    if (lo < hi) {
        const middle = text[lo..hi];
        // try special cases for the remaining middle
        if (data.specials.get(middle)) |special| {
            var offset: u32 = lo;
            for (0..special.len) |ti| {
                const tok_text = special.tokens[ti];
                const tok_start = findSubstr(text[offset..hi], tok_text);
                if (tok_start) |ts| {
                    if (count < out.len) {
                        out[count] = .{
                            .start = offset + @as(u32, @intCast(ts)),
                            .end = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len)),
                        };
                        count += 1;
                    }
                    offset = offset + @as(u32, @intCast(ts)) + @as(u32, @intCast(tok_text.len));
                } else {
                    // sub-token not found in source — fall back to length-based span
                    if (count < out.len) {
                        out[count] = .{ .start = offset, .end = offset + @as(u32, @intCast(tok_text.len)) };
                        count += 1;
                    }
                    offset += @as(u32, @intCast(tok_text.len));
                }
            }
        } else if (matchUrl(middle) > 0) {
            // URL — emit as single token
            if (count < out.len) {
                out[count] = .{ .start = lo, .end = hi };
                count += 1;
            }
        } else {
            // try infix splitting
            var infixes: [64]Infix = undefined;
            const n_infixes = findInfixes(middle, &infixes);
            if (n_infixes == 0) {
                // no infixes — single token
                if (count < out.len) {
                    out[count] = .{ .start = lo, .end = hi };
                    count += 1;
                }
            } else {
                // split on infixes; pos tracks the end of the last emitted piece
                var pos: u32 = lo;
                for (infixes[0..n_infixes]) |inf| {
                    const inf_start = lo + @as(u32, @intCast(inf.start));
                    const inf_end = lo + @as(u32, @intCast(inf.end));
                    // skip infixes at position 0 (never split before the first char)
                    if (inf.start == 0) continue;
                    // emit text before infix
                    if (inf_start > pos) {
                        if (count < out.len) {
                            out[count] = .{ .start = pos, .end = inf_start };
                            count += 1;
                        }
                    }
                    // emit infix
                    if (inf_start != inf_end) {
                        if (count < out.len) {
                            out[count] = .{ .start = inf_start, .end = inf_end };
                            count += 1;
                        }
                    }
                    pos = inf_end;
                }
                // emit text after last infix
                if (pos < hi) {
                    if (count < out.len) {
                        out[count] = .{ .start = pos, .end = hi };
                        count += 1;
                    }
                }
            }
        }
    }

    // emit suffixes in reverse order (they were stripped outermost-first,
    // so reversing restores left-to-right text order).
    var si = n_suffixes;
    while (si > 0) {
        si -= 1;
        if (count < out.len) {
            out[count] = suffix_buf[si];
            count += 1;
        }
    }
    return count;
}

// ── pattern matching ──

/// match a prefix at position 0. returns byte length of match, or 0.
/// rule order mirrors the generated prefix table: single chars, multi-char
/// literals, symbol class, dot runs, then digit-conditional literals.
pub fn matchPrefix(text: []const u8) usize {
    if (text.len == 0) return 0;
    const cp = data.decodeUtf8(text) orelse return 0;
    // 1. single-character prefixes (switch on codepoint)
    if (data.isPrefixChar(cp.value)) return cp.len;
    // 2. multi-char literals (longest first)
    for (data.prefix_multi_literals) |lit| {
        if (std.mem.startsWith(u8, text, lit)) return lit.len;
    }
    // 3. symbol class (unicode So/Sc categories)
    if (data.isSymbol(cp.value)) return cp.len;
    // 4. 2+ dots
    if (text.len >= 2 and text[0] == '.' and text[1] == '.') {
        var i: usize = 2;
        while (i < text.len and text[i] == '.') : (i += 1) {}
        return i;
    }
    // 5. literal-unless-digit (e.g., + not followed by digit)
    if (data.isPrefixUnlessDigit(cp.value)) {
        if (cp.len >= text.len) return cp.len;
        const next = data.decodeUtf8(text[cp.len..]);
        if (next == null or !isAsciiDigit(next.?.value)) return cp.len;
    }
    return 0;
}

/// match a suffix at the end of text. returns byte length of suffix, or 0.
pub fn matchSuffix(text: []const u8) usize {
    if (text.len == 0) return 0;
    const last = data.lastCodepoint(text) orelse return 0;
    // 1. single-character suffixes
    if (data.isSuffixChar(last.value)) return last.len;
    // 2. symbol class
    if (data.isSymbol(last.value)) return last.len;
    // 3. multi-char literal suffixes (longest first)
    for (data.suffix_multi_literals) |lit| {
        if (std.mem.endsWith(u8, text, lit)) return lit.len;
    }
    // 4. 2+ dots at end — walk backwards to the start of the dot run
    if (text.len >= 2 and text[text.len - 1] == '.' and text[text.len - 2] == '.') {
        var i: usize = text.len - 2;
        while (i > 0 and text[i - 1] == '.') : (i -= 1) {}
        return text.len - i;
    }
    // 5. lookbehind rules (generated)
    const lb = data.matchSuffixLookbehind(text);
    if (lb > 0) return lb;
    return 0;
}

/// infix match result: byte offsets of the infix within the searched span.
const Infix = struct { start: usize, end: usize };

/// find all infix split points. returns count written.
/// scans codepoint by codepoint; rule numbering follows spaCy's infix list.
fn findInfixes(text: []const u8, out: []Infix) usize {
    var count: usize = 0;
    if (text.len == 0) return 0;
    var i: usize = 0;
    while (i < text.len) {
        const cp = data.decodeUtf8(text[i..]) orelse {
            // invalid UTF-8 byte: skip it rather than abort the scan
            i += 1;
            continue;
        };
        var matched: usize = 0;
        // 1. 2+ dots (infix[0])
        if (text[i] == '.' and i + 1 < text.len and text[i + 1] == '.') {
            var end = i + 2;
            while (end < text.len and text[end] == '.') : (end += 1) {}
            matched = end - i;
        }
        // 2. ellipsis U+2026 (infix[1])
        else if (cp.value == 0x2026) {
            matched = cp.len;
        }
        // 3. symbol class (infix[2])
        else if (data.isSymbol(cp.value)) {
            matched = cp.len;
        }
        // contextual rules require lookbehind/lookahead
        else {
            const prev_cp = if (i > 0) data.lastCodepoint(text[0..i]) else null;
            const next_start = i + cp.len;
            const next_cp = if (next_start < text.len) data.decodeUtf8(text[next_start..]) else null;
            // 4. math ops between digits: (?<=[0-9])[+\-*^](?=[0-9\-]) (infix[3])
            if (prev_cp != null and isAsciiDigit(prev_cp.?.value)) {
                if (cp.value == '+' or cp.value == '-' or cp.value == '*' or cp.value == '^') {
                    if (next_cp != null and (isAsciiDigit(next_cp.?.value) or next_cp.?.value == '-')) {
                        matched = cp.len;
                    }
                }
            }
            // 5. period between lower/punct and upper (infix[4])
            if (matched == 0 and cp.value == '.') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_4_behind(prev_cp.?.value) and data.is_infix_4_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }
            // 6. comma between alpha chars (infix[5])
            if (matched == 0 and cp.value == ',') {
                if (prev_cp != null and next_cp != null) {
                    if (data.is_infix_5_behind(prev_cp.?.value) and data.is_infix_5_ahead(next_cp.?.value)) {
                        matched = 1;
                    }
                }
            }
            // 7. hyphens/dashes between alnum (infix[6])
            // NOTE(review): pairs is_infix_6_behind with is_infix_7_ahead —
            // presumably the generated tables share one lookahead class for
            // rules 6 and 7; confirm against tokenizer_data.zig.
            if (matched == 0 and prev_cp != null and next_cp != null) {
                if (data.is_infix_6_behind(prev_cp.?.value) and data.is_infix_7_ahead(next_cp.?.value)) {
                    // try alternatives: ---, --, —— (U+2014 U+2014), —, –, -, ~
                    if (i + 3 <= text.len and std.mem.eql(u8, text[i..][0..3], "---")) {
                        matched = 3;
                    } else if (i + 2 <= text.len and std.mem.eql(u8, text[i..][0..2], "--")) {
                        matched = 2;
                    } else if (i + 6 <= text.len and std.mem.eql(u8, text[i..][0..6], "\xe2\x80\x94\xe2\x80\x94")) {
                        matched = 6; // ——
                    } else if (cp.value == 0x2014) { // —
                        matched = cp.len;
                    } else if (cp.value == 0x2013) { // –
                        matched = cp.len;
                    } else if (cp.value == '-') {
                        matched = 1;
                    } else if (cp.value == '~') {
                        matched = 1;
                    }
                }
            }
            // 8. separators between alnum (infix[7])
            if (matched == 0 and prev_cp != null and next_cp != null) {
                if (data.is_infix_7_behind(prev_cp.?.value) and data.is_infix_7_ahead(next_cp.?.value)) {
                    if (cp.value == '/' or cp.value == ':' or cp.value == '<' or cp.value == '>' or cp.value == '=') {
                        matched = cp.len;
                    }
                }
            }
        }
        if (matched > 0) {
            if (count < out.len) {
                out[count] = .{ .start = i, .end = i + matched };
                count += 1;
            }
            i += matched;
        } else {
            i += cp.len;
        }
    }
    return count;
}

/// simplified URL matcher. matches scheme://... or domain.tld patterns.
/// returns length of match from start, or 0.
fn matchUrl(text: []const u8) usize {
    if (text.len < 4) return 0;
    // check for scheme://
    var pos: usize = 0;
    if (std.mem.startsWith(u8, text, "http://")) {
        pos = 7;
    } else if (std.mem.startsWith(u8, text, "https://")) {
        pos = 8;
    } else if (std.mem.startsWith(u8, text, "ftp://")) {
        pos = 6;
    } else {
        // try generic scheme:// (scheme chars limited to [A-Za-z0-9+-.], max 20)
        var j: usize = 0;
        while (j < text.len and j < 20) : (j += 1) {
            const c = text[j];
            if (c == ':') {
                if (j + 2 < text.len and text[j + 1] == '/' and text[j + 2] == '/') {
                    pos = j + 3;
                    break;
                } else break;
            }
            if (!isAsciiAlnum(c) and c != '+' and c != '-' and c != '.') break;
        }
        // no scheme — try bare domain: word.word or word@word.word
        if (pos == 0) {
            pos = matchBareDomain(text);
        }
    }
    // NOTE(review): pos >= text.len returns 0, so a bare domain that spans
    // the entire chunk (e.g. "example.com" alone) is rejected here and falls
    // through to infix handling — confirm this is the intended behavior.
    if (pos == 0 or pos >= text.len) return 0;
    // consume until whitespace
    while (pos < text.len and !isWhitespace(text[pos])) : (pos += 1) {}
    return pos;
}

/// match a bare domain like example.com or user@example.com.
/// returns the byte length of the domain-like run from the start, or 0.
fn matchBareDomain(text: []const u8) usize {
    // look for word.word or word@word.word pattern
    var i: usize = 0;
    var has_dot = false;
    var has_at = false;
    var last_was_alnum = false;
    while (i < text.len) {
        const c = text[i];
        if (isAsciiAlnum(c) or c == '-' or c == '_') {
            last_was_alnum = isAsciiAlnum(c);
            i += 1;
        } else if (c == '.' and last_was_alnum and i + 1 < text.len and isAsciiAlnum(text[i + 1])) {
            // dot must be between alphanumerics (no leading/trailing/double dots)
            has_dot = true;
            last_was_alnum = false;
            i += 1;
        } else if (c == '@' and !has_at and last_was_alnum and i + 1 < text.len and isAsciiAlnum(text[i + 1])) {
            // at most one '@', also required to sit between alphanumerics
            has_at = true;
            last_was_alnum = false;
            i += 1;
        } else break;
    }
    // must have at least one dot to be a domain
    if (!has_dot) return 0;
    // check TLD is at least 2 chars and alphabetic (not numeric like 500.00)
    var last_dot: usize = 0;
    var j: usize = 0;
    while (j < i) : (j += 1) {
        if (text[j] == '.') last_dot = j;
    }
    const tld_start = last_dot + 1;
    const tld_len = i - tld_start;
    if (tld_len < 2) return 0;
    // TLD must contain at least one letter
    var has_alpha = false;
    j = tld_start;
    while (j < i) : (j += 1) {
        if ((text[j] >= 'a' and text[j] <= 'z') or (text[j] >= 'A' and text[j] <= 'Z')) {
            has_alpha = true;
            break;
        }
    }
    if (!has_alpha) return 0;
    return i;
}

// ── helpers ──

/// ASCII whitespace test used for chunking (space, tab, LF, CR only).
fn isWhitespace(c: u8) bool {
    return c == ' ' or c == '\t' or c == '\n' or c == '\r';
}

/// ASCII digit test on a decoded codepoint.
fn isAsciiDigit(c: u21) bool {
    return c >= '0' and c <= '9';
}

/// ASCII letter-or-digit test on a raw byte.
fn isAsciiAlnum(c: u8) bool {
    return (c >= '0' and c <= '9') or (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z');
}

/// naive substring search; returns index of first occurrence or null.
fn findSubstr(haystack: []const u8, needle: []const u8) ?usize {
    if (needle.len > haystack.len) return null;
    if (needle.len == 0) return 0;
    var i: usize = 0;
    while (i + needle.len <= haystack.len) : (i += 1) {
        if (std.mem.eql(u8, haystack[i..][0..needle.len], needle)) return i;
    }
    return null;
}

// ── tests ──

const testing = std.testing;

test "tokenize basic sentence" {
    var tokens: [64]Token = undefined;
    const n = tokenize("Barack Obama visited Paris.", &tokens);
    const expected = [_][]const u8{ "Barack", "Obama", "visited", "Paris", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text("Barack Obama visited Paris."));
    }
}

test "tokenize contractions" {
    var tokens: [64]Token = undefined;
    const text = "I can't believe it's not butter!";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "I", "ca", "n't", "believe", "it", "'s", "not", "butter", "!" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize currency and punctuation" {
    var tokens: [64]Token = undefined;
    const text = "Apple Inc. is worth $2.5 trillion.";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "Apple", "Inc.", "is", "worth", "$", "2.5", "trillion", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize parentheses" {
    var tokens: [64]Token = undefined;
    const text = "Dr. Smith's office (room 42) is closed.";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "Dr.", "Smith", "'s", "office", "(", "room", "42", ")", "is", "closed", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize hyphenated words" {
    var tokens: [64]Token = undefined;
    const text = "New York-based company";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "New", "York", "-", "based", "company" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize abbreviations" {
    var tokens: [64]Token = undefined;
    const text = "U.S.A. and U.K. are allies.";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "U.S.A.", "and", "U.K.", "are", "allies", "." };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "tokenize email" {
    var tokens: [64]Token = undefined;
    const text = "e-mail: test@example.com";
    const n = tokenize(text, &tokens);
    const expected = [_][]const u8{ "e", "-", "mail", ":", "test@example.com" };
    try testing.expectEqual(@as(u32, expected.len), n);
    for (expected, 0..) |exp, i| {
        try testing.expectEqualStrings(exp, tokens[i].text(text));
    }
}

test "matchPrefix" {
    // single chars
    try testing.expectEqual(@as(usize, 1), matchPrefix("$100"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("(hello)"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("\"quote"));
    try testing.expectEqual(@as(usize, 1), matchPrefix("!"));
    // multi-char
    try testing.expectEqual(@as(usize, 3), matchPrefix("US$100"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("C$100"));
    // dots
    try testing.expectEqual(@as(usize, 3), matchPrefix("...hello"));
    try testing.expectEqual(@as(usize, 2), matchPrefix("..hello"));
    // no match
    try testing.expectEqual(@as(usize, 0), matchPrefix("hello"));
    try testing.expectEqual(@as(usize, 0), matchPrefix("123"));
}

test "matchSuffix" {
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello."));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello!"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello)"));
    try testing.expectEqual(@as(usize, 1), matchSuffix("hello,"));
    try testing.expectEqual(@as(usize, 0), matchSuffix("hello"));
}

test "findInfixes" {
    var infixes: [64]Infix = undefined;
    // hyphen between words
    const n1 = findInfixes("York-based", &infixes);
    try testing.expect(n1 > 0);
    try testing.expectEqual(@as(usize, 4), infixes[0].start);
    try testing.expectEqual(@as(usize, 5), infixes[0].end);
}